diff --git a/src/crewai/crew.py b/src/crewai/crew.py
index d488783ea..eaf9ff3e8 100644
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -4,6 +4,7 @@ import uuid
 import warnings
 from concurrent.futures import Future
 from hashlib import md5
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+from crewai.llm import LLM
 from pydantic import (
@@ -1075,7 +1076,7 @@ class Crew(BaseModel):
     def test(
         self,
         n_iterations: int,
-        openai_model_name: Optional[str] = None,
+        llm: Union[str, LLM],
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
         """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
@@ -1085,9 +1086,9 @@ class Crew(BaseModel):
             test_crew,
             n_iterations,
             inputs,
-            openai_model_name,  # type: ignore[arg-type]
-        )  # type: ignore[arg-type]
-        evaluator = CrewEvaluator(test_crew, openai_model_name)  # type: ignore[arg-type]
+            llm.model if isinstance(llm, LLM) else llm,
+        )
+        evaluator = CrewEvaluator(test_crew, llm)
 
         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
diff --git a/src/crewai/utilities/evaluators/crew_evaluator_handler.py b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
index 3387d91b3..21fc010a5 100644
--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -1,3 +1,6 @@
 from collections import defaultdict
+from typing import Union
 
+from crewai.llm import LLM
+
 from pydantic import BaseModel, Field
@@ -32,9 +35,9 @@ class CrewEvaluator:
     run_execution_times: defaultdict = defaultdict(list)
     iteration: int = 0
 
-    def __init__(self, crew, openai_model_name: str):
+    def __init__(self, crew, llm: Union[str, LLM]):
         self.crew = crew
-        self.openai_model_name = openai_model_name
+        self.llm = LLM(model=llm) if isinstance(llm, str) else llm
         self._telemetry = Telemetry()
         self._setup_for_evaluating()
 
@@ -51,7 +54,7 @@ class CrewEvaluator:
             ),
             backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
             verbose=False,
-            llm=self.openai_model_name,
+            llm=self.llm,
         )
 
     def _evaluation_task(
@@ -181,7 +184,7 @@ class CrewEvaluator:
             self.crew,
             evaluation_result.pydantic.quality,
             current_task._execution_time,
-            self.openai_model_name,
+            self.llm.model,
         )
         self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
         self.run_execution_times[self.iteration].append(
diff --git a/tests/crew_test.py b/tests/crew_test.py
index 2003ddada..eb523801d 100644
--- a/tests/crew_test.py
+++ b/tests/crew_test.py
@@ -10,6 +10,7 @@ import instructor
 import pydantic_core
 import pytest
 
 from crewai.agent import Agent
 from crewai.agents.cache import CacheHandler
 from crewai.crew import Crew
+from crewai.llm import LLM
@@ -2828,7 +2829,7 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
     copy_mock.return_value = crew
     n_iterations = 2
 
-    crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
+    crew.test(n_iterations, llm="gpt-4o-mini", inputs={"topic": "AI"})
 
     # Ensure kickoff is called on the copied crew
     kickoff_mock.assert_has_calls(
@@ -2844,6 +2845,36 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
         ]
     )
 
 
+@mock.patch("crewai.crew.CrewEvaluator")
+@mock.patch("crewai.crew.Crew.copy")
+@mock.patch("crewai.crew.Crew.kickoff")
+def test_crew_testing_with_custom_llm(kickoff_mock, copy_mock, crew_evaluator):
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        agent=researcher,
+    )
+    crew = Crew(agents=[researcher], tasks=[task])
+    copy_mock.return_value = crew
+    custom_llm = LLM(model="gpt-4")
+
+    crew.test(2, llm=custom_llm, inputs={"topic": "AI"})
+
+    # Ensure kickoff is called once per iteration on the copied crew
+    kickoff_mock.assert_has_calls(
+        [mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
+    )
+    # Ensure the evaluator is constructed with the LLM instance unchanged
+    crew_evaluator.assert_has_calls(
+        [
+            mock.call(crew, custom_llm),
+            mock.call().set_iteration(1),
+            mock.call().set_iteration(2),
+            mock.call().print_crew_evaluation_result(),
+        ]
+    )
+
+
 @pytest.mark.vcr(filter_headers=["authorization"])
 def test_hierarchical_verbose_manager_agent():
@@ -3125,4 +3156,4 @@ def test_multimodal_agent_live_image_analysis():
     # Verify we got a meaningful response
     assert isinstance(result.raw, str)
     assert len(result.raw) > 100  # Expecting a detailed analysis
-    assert "error" not in result.raw.lower()  # No error messages in response
\ No newline at end of file
+    assert "error" not in result.raw.lower()  # No error messages in response
diff --git a/tests/utilities/evaluators/test_crew_evaluator_handler.py b/tests/utilities/evaluators/test_crew_evaluator_handler.py
index 649c25998..14ffd33cb 100644
--- a/tests/utilities/evaluators/test_crew_evaluator_handler.py
+++ b/tests/utilities/evaluators/test_crew_evaluator_handler.py
@@ -2,6 +2,7 @@ from unittest import mock
 
 import pytest
 
 from crewai.agent import Agent
 from crewai.crew import Crew
+from crewai.llm import LLM
 from crewai.task import Task
@@ -23,7 +24,7 @@ class TestCrewEvaluator:
         )
         crew = Crew(agents=[agent], tasks=[task])
 
-        return CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
+        return CrewEvaluator(crew, llm="gpt-4o-mini")
 
     def test_setup_for_evaluating(self, crew_planner):
         crew_planner._setup_for_evaluating()
@@ -47,6 +48,20 @@ class TestCrewEvaluator:
         assert agent.verbose is False
         assert agent.llm.model == "gpt-4o-mini"
 
+    def test_evaluator_with_custom_llm(self, crew_planner):
+        # An LLM instance passed to CrewEvaluator is used as-is
+        custom_llm = LLM(model="gpt-4")
+        evaluator = CrewEvaluator(crew_planner.crew, custom_llm)
+        agent = evaluator._evaluator_agent()
+        assert agent.llm == custom_llm
+
+    def test_evaluator_with_string_llm(self, crew_planner):
+        # A model-name string is normalized into an LLM instance
+        evaluator = CrewEvaluator(crew_planner.crew, "gpt-4")
+        agent = evaluator._evaluator_agent()
+        assert isinstance(agent.llm, LLM)
+        assert agent.llm.model == "gpt-4"
+
     def test_evaluation_task(self, crew_planner):
         evaluator_agent = Agent(
             role="Evaluator Agent",
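
Usage sketch: with this patch applied, Crew.test accepts either a model-name string or a pre-configured LLM instance. The snippet below illustrates both call styles against the API shown in the diff; the agent, task, and model names are placeholder values for illustration, not requirements of the API.

    from crewai.agent import Agent
    from crewai.crew import Crew
    from crewai.llm import LLM
    from crewai.task import Task

    # Placeholder crew; any agents/tasks work the same way.
    researcher = Agent(
        role="Researcher",
        goal="Research {topic}",
        backstory="An experienced researcher.",
    )
    task = Task(
        description="Summarize {topic}",
        expected_output="A short summary",
        agent=researcher,
    )
    crew = Crew(agents=[researcher], tasks=[task])

    # Passing a plain model name: CrewEvaluator wraps it as LLM(model="gpt-4o-mini").
    crew.test(2, llm="gpt-4o-mini", inputs={"topic": "AI"})

    # Passing an LLM instance: the evaluator agent uses it as-is.
    crew.test(2, llm=LLM(model="gpt-4"), inputs={"topic": "AI"})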