From 18d867aa4722527f30a25d63b55fcde338d5b733 Mon Sep 17 00:00:00 2001
From: Eduardo Chiarotti
Date: Mon, 29 Jul 2024 23:17:07 -0300
Subject: [PATCH] feat: Add execution time to both task and testing feature
 (#1031)

* feat: Add execution time to both task and testing feature

* feat: Remove unused functions

* feat: change test_crew to evaluate_crew to avoid issues with testing libs

* feat: fix tests
---
 src/crewai/cli/cli.py                         |  4 +--
 .../cli/{test_crew.py => evaluate_crew.py}    |  8 +++---
 src/crewai/task.py                            | 14 +++++++++-
 .../evaluators/crew_evaluator_handler.py      | 20 +++++++++++---
 tests/cli/cli_test.py                         | 18 ++++++-------
 tests/cli/test_crew_test.py                   | 26 +++++++++----------
 .../evaluators/test_crew_evaluator_handler.py |  5 ++++
 7 files changed, 62 insertions(+), 33 deletions(-)
 rename src/crewai/cli/{test_crew.py => evaluate_crew.py} (82%)

diff --git a/src/crewai/cli/cli.py b/src/crewai/cli/cli.py
index c9f03f3fb..52d2bc75c 100644
--- a/src/crewai/cli/cli.py
+++ b/src/crewai/cli/cli.py
@@ -6,9 +6,9 @@ from crewai.memory.storage.kickoff_task_outputs_storage import (
 )
 
 from .create_crew import create_crew
+from .evaluate_crew import evaluate_crew
 from .replay_from_task import replay_task_command
 from .reset_memories_command import reset_memories_command
-from .test_crew import test_crew
 from .train_crew import train_crew
 
 
@@ -144,7 +144,7 @@ def reset_memories(long, short, entities, kickoff_outputs, all):
 def test(n_iterations: int, model: str):
     """Test the crew and evaluate the results."""
     click.echo(f"Testing the crew for {n_iterations} iterations with model {model}")
-    test_crew(n_iterations, model)
+    evaluate_crew(n_iterations, model)
 
 
 if __name__ == "__main__":
diff --git a/src/crewai/cli/test_crew.py b/src/crewai/cli/evaluate_crew.py
similarity index 82%
rename from src/crewai/cli/test_crew.py
rename to src/crewai/cli/evaluate_crew.py
index b95669e55..30abda380 100644
--- a/src/crewai/cli/test_crew.py
+++ b/src/crewai/cli/evaluate_crew.py
@@ -1,13 +1,11 @@
 import subprocess
+
 import click
-import pytest
-
-pytest.skip(allow_module_level=True)
 
 
-def test_crew(n_iterations: int, model: str) -> None:
+def evaluate_crew(n_iterations: int, model: str) -> None:
     """
-    Test the crew by running a command in the Poetry environment.
+    Test and Evaluate the crew by running a command in the Poetry environment.
 
     Args:
         n_iterations (int): The number of iterations to test the crew.
diff --git a/src/crewai/task.py b/src/crewai/task.py
index 14c1e30e5..3e693a498 100644
--- a/src/crewai/task.py
+++ b/src/crewai/task.py
@@ -1,3 +1,4 @@
+import datetime
 import json
 import os
 import threading
@@ -107,6 +108,7 @@ class Task(BaseModel):
     _original_description: str | None = None
     _original_expected_output: str | None = None
     _thread: threading.Thread | None = None
+    _execution_time: float | None = None
 
     def __init__(__pydantic_self__, **data):
         config = data.pop("config", {})
@@ -120,6 +122,12 @@ class Task(BaseModel):
                 "may_not_set_field", "This field is not to be set by the user.", {}
             )
 
+    def _set_start_execution_time(self) -> float:
+        return datetime.datetime.now().timestamp()
+
+    def _set_end_execution_time(self, start_time: float) -> None:
+        self._execution_time = datetime.datetime.now().timestamp() - start_time
+
     @field_validator("output_file")
     @classmethod
     def output_file_validation(cls, value: str) -> str:
@@ -216,6 +224,7 @@ class Task(BaseModel):
                 f"The task '{self.description}' has no agent assigned, therefore it can't be executed directly and should be executed in a Crew using a specific process that support that, like hierarchical."
             )
 
+        start_time = self._set_start_execution_time()
         self._execution_span = self._telemetry.task_started(crew=agent.crew, task=self)
 
         self.prompt_context = context
@@ -239,6 +248,7 @@ class Task(BaseModel):
         )
 
         self.output = task_output
+        self._set_end_execution_time(start_time)
 
         if self.callback:
             self.callback(self.output)
@@ -250,7 +260,9 @@ class Task(BaseModel):
             content = (
                 json_output
                 if json_output
-                else pydantic_output.model_dump_json() if pydantic_output else result
+                else pydantic_output.model_dump_json()
+                if pydantic_output
+                else result
             )
             self._save_file(content)
 
diff --git a/src/crewai/utilities/evaluators/crew_evaluator_handler.py b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
index 3f1abb8b8..fbc5d341e 100644
--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -28,6 +28,7 @@ class CrewEvaluator:
     """
 
     tasks_scores: defaultdict = defaultdict(list)
+    run_execution_times: defaultdict = defaultdict(list)
     iteration: int = 0
 
     def __init__(self, crew, openai_model_name: str):
@@ -40,9 +41,6 @@ class CrewEvaluator:
         for task in self.crew.tasks:
             task.callback = self.evaluate
 
-    def set_iteration(self, iteration: int) -> None:
-        self.iteration = iteration
-
     def _evaluator_agent(self):
         return Agent(
             role="Task Execution Evaluator",
@@ -71,6 +69,9 @@ class CrewEvaluator:
             output_pydantic=TaskEvaluationPydanticOutput,
         )
 
+    def set_iteration(self, iteration: int) -> None:
+        self.iteration = iteration
+
     def print_crew_evaluation_result(self) -> None:
         """
         Prints the evaluation result of the crew in a table.
@@ -119,6 +120,16 @@ class CrewEvaluator:
         ]
 
         table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")
+        run_exec_times = [
+            int(sum(tasks_exec_times))
+            for _, tasks_exec_times in self.run_execution_times.items()
+        ]
+        execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))
+        table.add_row(
+            "Execution Time (s)",
+            *map(str, run_exec_times),
+            f"{execution_time_avg}",
+        )
         # Display the table in the terminal
         console = Console()
         console.print(table)
@@ -145,5 +156,8 @@ class CrewEvaluator:
 
         if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
             self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
+            self.run_execution_times[self.iteration].append(
+                current_task._execution_time
+            )
         else:
             raise ValueError("Evaluation result is not in the expected format")
diff --git a/tests/cli/cli_test.py b/tests/cli/cli_test.py
index 504975dc7..509b9193a 100644
--- a/tests/cli/cli_test.py
+++ b/tests/cli/cli_test.py
@@ -135,29 +135,29 @@ def test_version_command_with_tools(runner):
     )
 
 
-@mock.patch("crewai.cli.cli.test_crew")
-def test_test_default_iterations(test_crew, runner):
+@mock.patch("crewai.cli.cli.evaluate_crew")
+def test_test_default_iterations(evaluate_crew, runner):
     result = runner.invoke(test)
 
-    test_crew.assert_called_once_with(3, "gpt-4o-mini")
+    evaluate_crew.assert_called_once_with(3, "gpt-4o-mini")
     assert result.exit_code == 0
     assert "Testing the crew for 3 iterations with model gpt-4o-mini" in result.output
 
 
-@mock.patch("crewai.cli.cli.test_crew")
-def test_test_custom_iterations(test_crew, runner):
+@mock.patch("crewai.cli.cli.evaluate_crew")
+def test_test_custom_iterations(evaluate_crew, runner):
     result = runner.invoke(test, ["--n_iterations", "5", "--model", "gpt-4o"])
 
-    test_crew.assert_called_once_with(5, "gpt-4o")
+    evaluate_crew.assert_called_once_with(5, "gpt-4o")
     assert result.exit_code == 0
     assert "Testing the crew for 5 iterations with model gpt-4o" in result.output
 
 
-@mock.patch("crewai.cli.cli.test_crew")
-def test_test_invalid_string_iterations(test_crew, runner):
+@mock.patch("crewai.cli.cli.evaluate_crew")
+def test_test_invalid_string_iterations(evaluate_crew, runner):
     result = runner.invoke(test, ["--n_iterations", "invalid"])
 
-    test_crew.assert_not_called()
+    evaluate_crew.assert_not_called()
     assert result.exit_code == 2
     assert (
         "Usage: test [OPTIONS]\nTry 'test --help' for help.\n\nError: Invalid value for '-n' / '--n_iterations': 'invalid' is not a valid integer.\n"
diff --git a/tests/cli/test_crew_test.py b/tests/cli/test_crew_test.py
index 90649710a..578e413bc 100644
--- a/tests/cli/test_crew_test.py
+++ b/tests/cli/test_crew_test.py
@@ -3,7 +3,7 @@ from unittest import mock
 
 import pytest
 
-from crewai.cli import test_crew
+from crewai.cli import evaluate_crew
 
 
 @pytest.mark.parametrize(
     "n_iterations,model",
     [
         (1, "gpt-4o"),
         (5, "gpt-3.5-turbo"),
         (10, "gpt-4"),
     ],
 )
-@mock.patch("crewai.cli.test_crew.subprocess.run")
+@mock.patch("crewai.cli.evaluate_crew.subprocess.run")
 def test_crew_success(mock_subprocess_run, n_iterations, model):
     """Test the crew function for successful execution."""
     mock_subprocess_run.return_value = subprocess.CompletedProcess(
         args=f"poetry run test {n_iterations} {model}", returncode=0
     )
-    result = test_crew.test_crew(n_iterations, model)
+    result = evaluate_crew.evaluate_crew(n_iterations, model)
 
     mock_subprocess_run.assert_called_once_with(
         ["poetry", "run", "test", str(n_iterations), model],
         capture_output=False,
         text=True,
         check=True,
     )
     assert result is None
 
 
-@mock.patch("crewai.cli.test_crew.click")
+@mock.patch("crewai.cli.evaluate_crew.click")
 def test_test_crew_zero_iterations(click):
-    test_crew.test_crew(0, "gpt-4o")
+    evaluate_crew.evaluate_crew(0, "gpt-4o")
     click.echo.assert_called_once_with(
         "An unexpected error occurred: The number of iterations must be a positive integer.",
         err=True,
     )
 
 
-@mock.patch("crewai.cli.test_crew.click")
+@mock.patch("crewai.cli.evaluate_crew.click")
 def test_test_crew_negative_iterations(click):
-    test_crew.test_crew(-2, "gpt-4o")
+    evaluate_crew.evaluate_crew(-2, "gpt-4o")
     click.echo.assert_called_once_with(
         "An unexpected error occurred: The number of iterations must be a positive integer.",
         err=True,
     )
 
 
-@mock.patch("crewai.cli.test_crew.click")
-@mock.patch("crewai.cli.test_crew.subprocess.run")
+@mock.patch("crewai.cli.evaluate_crew.click")
+@mock.patch("crewai.cli.evaluate_crew.subprocess.run")
 def test_test_crew_called_process_error(mock_subprocess_run, click):
     n_iterations = 5
     mock_subprocess_run.side_effect = subprocess.CalledProcessError(
@@ -59,7 +59,7 @@ def test_test_crew_called_process_error(mock_subprocess_run, click):
         output="Error",
         stderr="Some error occurred",
     )
-    test_crew.test_crew(n_iterations, "gpt-4o")
+    evaluate_crew.evaluate_crew(n_iterations, "gpt-4o")
 
     mock_subprocess_run.assert_called_once_with(
         ["poetry", "run", "test", "5", "gpt-4o"],
@@ -78,13 +78,13 @@ def test_test_crew_called_process_error(mock_subprocess_run, click):
     )
 
 
-@mock.patch("crewai.cli.test_crew.click")
-@mock.patch("crewai.cli.test_crew.subprocess.run")
+@mock.patch("crewai.cli.evaluate_crew.click")
+@mock.patch("crewai.cli.evaluate_crew.subprocess.run")
 def test_test_crew_unexpected_exception(mock_subprocess_run, click):
     # Arrange
     n_iterations = 5
     mock_subprocess_run.side_effect = Exception("Unexpected error")
-    test_crew.test_crew(n_iterations, "gpt-4o")
+    evaluate_crew.evaluate_crew(n_iterations, "gpt-4o")
 
     mock_subprocess_run.assert_called_once_with(
         ["poetry", "run", "test", "5", "gpt-4o"],
diff --git a/tests/utilities/evaluators/test_crew_evaluator_handler.py b/tests/utilities/evaluators/test_crew_evaluator_handler.py
index 39fa35c44..30fb7bf76 100644
--- a/tests/utilities/evaluators/test_crew_evaluator_handler.py
+++ b/tests/utilities/evaluators/test_crew_evaluator_handler.py
@@ -84,6 +84,10 @@ class TestCrewEvaluator:
             1: [10, 9, 8],
             2: [9, 8, 7],
         }
+        crew_planner.run_execution_times = {
+            1: [24, 45, 66],
+            2: [55, 33, 67],
+        }
 
         crew_planner.print_crew_evaluation_result()
 
@@ -98,6 +102,7 @@ class TestCrewEvaluator:
                 mock.call().add_row("Task 2", "9", "8", "8.5"),
                 mock.call().add_row("Task 3", "8", "7", "7.5"),
                 mock.call().add_row("Crew", "9.0", "8.0", "8.5"),
+                mock.call().add_row("Execution Time (s)", "135", "155", "145"),
             ]
         )
        console.assert_has_calls([mock.call(), mock.call().print(table())])
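
Reviewer note (illustrative only, not part of the patch): the new "Execution Time (s)" row is built from run_execution_times, which the evaluate callback fills with each task's _execution_time per test iteration. A minimal standalone sketch of that aggregation, reusing the fixture values from the updated handler test:

    # Sketch only: mirrors the aggregation the patch adds to print_crew_evaluation_result.
    from collections import defaultdict

    run_execution_times = defaultdict(list)
    run_execution_times[1].extend([24, 45, 66])  # iteration 1: per-task times in seconds
    run_execution_times[2].extend([55, 33, 67])  # iteration 2: per-task times in seconds

    # One total per run, then the average across runs, both truncated to int.
    run_exec_times = [int(sum(times)) for _, times in run_execution_times.items()]
    execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))

    print(run_exec_times, execution_time_avg)  # [135, 155] 145 -> the "Execution Time (s)" row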