feat: Add execution time to both task and testing feature (#1031)

* feat: Add execution time to both task and testing feature * feat: Remove unused functions * feat: change test_crew to evaluate_crew to avoid issues with testing libs * feat: fix tests
2026-01-10 00:28:31 +00:00 · 2024-07-29 23:17:07 -03:00
parent de6b597eff
commit d824db82a3
7 changed files with 62 additions and 33 deletions
--- a/src/crewai/cli/cli.py
+++ b/src/crewai/cli/cli.py
@@ -6,9 +6,9 @@ from crewai.memory.storage.kickoff_task_outputs_storage import (
 )

 from .create_crew import create_crew
+from .evaluate_crew import evaluate_crew
 from .replay_from_task import replay_task_command
 from .reset_memories_command import reset_memories_command
-from .test_crew import test_crew
 from .train_crew import train_crew


@@ -144,7 +144,7 @@ def reset_memories(long, short, entities, kickoff_outputs, all):
 def test(n_iterations: int, model: str):
    """Test the crew and evaluate the results."""
    click.echo(f"Testing the crew for {n_iterations} iterations with model {model}")
-    test_crew(n_iterations, model)
+    evaluate_crew(n_iterations, model)


 if __name__ == "__main__":
--- a/src/crewai/cli/evaluate_crew.py
+++ b/src/crewai/cli/evaluate_crew.py
@@ -1,13 +1,11 @@
 import subprocess
+
 import click
-import pytest
-
-pytest.skip(allow_module_level=True)


-def test_crew(n_iterations: int, model: str) -> None:
+def evaluate_crew(n_iterations: int, model: str) -> None:
    """
-    Test the crew by running a command in the Poetry environment.
+    Test and Evaluate the crew by running a command in the Poetry environment.

    Args:
        n_iterations (int): The number of iterations to test the crew.
--- a/src/crewai/task.py
+++ b/src/crewai/task.py
@@ -1,3 +1,4 @@
+import datetime
 import json
 import os
 import threading
@@ -107,6 +108,7 @@ class Task(BaseModel):
    _original_description: str | None = None
    _original_expected_output: str | None = None
    _thread: threading.Thread | None = None
+    _execution_time: float | None = None

    def __init__(__pydantic_self__, **data):
        config = data.pop("config", {})
@@ -120,6 +122,12 @@ class Task(BaseModel):
                "may_not_set_field", "This field is not to be set by the user.", {}
            )

+    def _set_start_execution_time(self) -> float:
+        return datetime.datetime.now().timestamp()
+
+    def _set_end_execution_time(self, start_time: float) -> None:
+        self._execution_time = datetime.datetime.now().timestamp() - start_time
+
    @field_validator("output_file")
    @classmethod
    def output_file_validation(cls, value: str) -> str:
@@ -216,6 +224,7 @@ class Task(BaseModel):
                f"The task '{self.description}' has no agent assigned, therefore it can't be executed directly and should be executed in a Crew using a specific process that support that, like hierarchical."
            )

+        start_time = self._set_start_execution_time()
        self._execution_span = self._telemetry.task_started(crew=agent.crew, task=self)

        self.prompt_context = context
@@ -239,6 +248,7 @@ class Task(BaseModel):
        )
        self.output = task_output

+        self._set_end_execution_time(start_time)
        if self.callback:
            self.callback(self.output)

@@ -250,7 +260,9 @@ class Task(BaseModel):
            content = (
                json_output
                if json_output
-                else pydantic_output.model_dump_json() if pydantic_output else result
+                else pydantic_output.model_dump_json()
+                if pydantic_output
+                else result
            )
            self._save_file(content)

--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -28,6 +28,7 @@ class CrewEvaluator:
    """

    tasks_scores: defaultdict = defaultdict(list)
+    run_execution_times: defaultdict = defaultdict(list)
    iteration: int = 0

    def __init__(self, crew, openai_model_name: str):
@@ -40,9 +41,6 @@ class CrewEvaluator:
        for task in self.crew.tasks:
            task.callback = self.evaluate

-    def set_iteration(self, iteration: int) -> None:
-        self.iteration = iteration
-
    def _evaluator_agent(self):
        return Agent(
            role="Task Execution Evaluator",
@@ -71,6 +69,9 @@ class CrewEvaluator:
            output_pydantic=TaskEvaluationPydanticOutput,
        )

+    def set_iteration(self, iteration: int) -> None:
+        self.iteration = iteration
+
    def print_crew_evaluation_result(self) -> None:
        """
        Prints the evaluation result of the crew in a table.
@@ -119,6 +120,16 @@ class CrewEvaluator:
        ]
        table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")

+        run_exec_times = [
+            int(sum(tasks_exec_times))
+            for _, tasks_exec_times in self.run_execution_times.items()
+        ]
+        execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))
+        table.add_row(
+            "Execution Time (s)",
+            *map(str, run_exec_times),
+            f"{execution_time_avg}",
+        )
        # Display the table in the terminal
        console = Console()
        console.print(table)
@@ -145,5 +156,8 @@ class CrewEvaluator:

        if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
            self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
+            self.run_execution_times[self.iteration].append(
+                current_task._execution_time
+            )
        else:
            raise ValueError("Evaluation result is not in the expected format")