diff --git a/src/crewai/crew.py b/src/crewai/crew.py
index c1b80aa7c..f67161e2c 100644
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -967,15 +967,18 @@ class Crew(BaseModel):
 
         return total_usage_metrics
 
-    def test(
-        self, n_iterations: int, model: str, inputs: Optional[Dict[str, Any]] = None
+    async def test(
+        self,
+        n_iterations: int,
+        openai_model_name: str,
+        inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
         """Test and evaluate the Crew with the given inputs for n iterations."""
-        evaluator = CrewEvaluator(self, model)
+        evaluator = CrewEvaluator(self, openai_model_name)
 
         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
-            self.kickoff(inputs=inputs)
+            await self.kickoff_async(inputs=inputs)
 
         evaluator.print_crew_evaluation_result()
 
diff --git a/src/crewai/utilities/evaluators/crew_evaluator_handler.py b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
index 858b05650..3f1abb8b8 100644
--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -22,7 +22,7 @@ class CrewEvaluator:
 
     Attributes:
         crew (Crew): The crew of agents to evaluate.
-        model (str): The model to use for evaluating the performance of the agents (for now ONLY OpenAI accepted).
+        openai_model_name (str): The model to use for evaluating the performance of the agents (for now ONLY OpenAI accepted).
         tasks_scores (defaultdict): A dictionary to store the scores of the agents for each task.
         iteration (int): The current iteration of the evaluation.
     """
@@ -30,9 +30,9 @@ class CrewEvaluator:
     tasks_scores: defaultdict = defaultdict(list)
     iteration: int = 0
 
-    def __init__(self, crew, model: str):
+    def __init__(self, crew, openai_model_name: str):
         self.crew = crew
-        self.model = model
+        self.openai_model_name = openai_model_name
         self._setup_for_evaluating()
 
     def _setup_for_evaluating(self) -> None:
@@ -51,7 +51,7 @@ class CrewEvaluator:
             ),
             backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
             verbose=False,
-            llm=ChatOpenAI(model=self.model),
+            llm=ChatOpenAI(model=self.openai_model_name),
         )
 
     def _evaluation_task(
diff --git a/tests/utilities/evaluators/test_crew_evaluator_handler.py b/tests/utilities/evaluators/test_crew_evaluator_handler.py
index 76ef5f6be..39fa35c44 100644
--- a/tests/utilities/evaluators/test_crew_evaluator_handler.py
+++ b/tests/utilities/evaluators/test_crew_evaluator_handler.py
@@ -8,6 +8,7 @@ from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.utilities.evaluators.crew_evaluator_handler import (
     CrewEvaluator,
+    TaskEvaluationPydanticOutput,
 )
 
 
@@ -22,7 +23,7 @@ class TestCrewEvaluator:
         )
         crew = Crew(agents=[agent], tasks=[task])
 
-        return CrewEvaluator(crew, model="gpt-4o-mini")
+        return CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
 
     def test_setup_for_evaluating(self, crew_planner):
         crew_planner._setup_for_evaluating()
@@ -107,6 +108,6 @@ class TestCrewEvaluator:
         )
 
         with mock.patch.object(Task, "execute_sync") as execute:
-            execute().pydantic.quality = 9.5
+            execute().pydantic = TaskEvaluationPydanticOutput(quality=9.5)
             crew_planner.evaluate(task_output)
             assert crew_planner.tasks_scores[0] == [9.5]
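Because `Crew.test` becomes a coroutine in this diff (each iteration now awaits `kickoff_async`), synchronous call sites need to be updated to await it. Below is a minimal sketch of the new call pattern; the agent and task definitions are illustrative placeholders, not part of this change, and only the `test()` signature is taken from the diff.

```python
import asyncio

from crewai import Agent, Crew, Task

# Placeholder crew used only to exercise the updated test() signature.
researcher = Agent(
    role="Researcher",
    goal="Summarize {topic} accurately",
    backstory="Illustrative agent for demonstrating Crew.test().",
)
summary_task = Task(
    description="Write a one-paragraph summary of {topic}.",
    expected_output="A short paragraph.",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[summary_task])

# test() is now async, so it must be awaited or driven by an event loop.
asyncio.run(
    crew.test(
        n_iterations=2,
        openai_model_name="gpt-4o-mini",
        inputs={"topic": "crew evaluation"},
    )
)
```

The renamed `openai_model_name` parameter is passed straight through to `CrewEvaluator`, which uses it for its `ChatOpenAI` judge agent, so the rename makes the OpenAI-only restriction explicit at the call site.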