From a7b050f52f757d40c19d2a4379008814ee26d4ae Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sun, 9 Feb 2025 20:51:16 +0000
Subject: [PATCH] fix: enable any llm to run test functionality

This change enables the Crew.test() method to work with any LLM
implementation, not just OpenAI models. It maintains backward
compatibility with the openai_model_name parameter while adding support
for custom LLMs.

Fixes #2067
Fixes #2071

Co-Authored-By: Joe Moura
---
 src/crewai/crew.py                             | 10 +--
 .../evaluators/crew_evaluator_handler.py       | 32 ++++++++--
 tests/crew_test.py                             | 62 ++++++++++++++++++-
 .../evaluators/test_crew_evaluator_handler.py  |  4 +-
 4 files changed, 95 insertions(+), 13 deletions(-)

diff --git a/src/crewai/crew.py b/src/crewai/crew.py
index d488783ea..77eb7fe8c 100644
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -1075,19 +1075,21 @@ class Crew(BaseModel):
     def test(
         self,
         n_iterations: int,
-        openai_model_name: Optional[str] = None,
+        llm: Optional[Union[str, InstanceOf[LLM], Any]] = None,
+        openai_model_name: Optional[str] = None,  # Kept for backward compatibility
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
+        """Test and evaluate the Crew with the given inputs for n iterations."""
         test_crew = self.copy()
+        test_llm = llm if llm is not None else openai_model_name
 
         self._test_execution_span = test_crew._telemetry.test_execution_span(
             test_crew,
             n_iterations,
             inputs,
-            openai_model_name,  # type: ignore[arg-type]
+            test_llm,  # type: ignore[arg-type]
         )  # type: ignore[arg-type]
-        evaluator = CrewEvaluator(test_crew, openai_model_name)  # type: ignore[arg-type]
+        evaluator = CrewEvaluator(test_crew, test_llm)  # type: ignore[arg-type]
 
         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
diff --git a/src/crewai/utilities/evaluators/crew_evaluator_handler.py b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
index 3387d91b3..5ce2ed4ca 100644
--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -1,11 +1,14 @@
+import os
 from collections import defaultdict
+from typing import Any, Dict, List, Optional, Union
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, InstanceOf
 from rich.box import HEAVY_EDGE
 from rich.console import Console
 from rich.table import Table
 
 from crewai.agent import Agent
+from crewai.llm import LLM
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
@@ -32,12 +35,31 @@ class CrewEvaluator:
     run_execution_times: defaultdict = defaultdict(list)
     iteration: int = 0
 
-    def __init__(self, crew, openai_model_name: str):
+    def __init__(self, crew, llm: Union[str, InstanceOf[LLM], Any]):
         self.crew = crew
-        self.openai_model_name = openai_model_name
+        self.llm = llm
         self._telemetry = Telemetry()
+        self._setup_llm()
         self._setup_for_evaluating()
 
+    def _setup_llm(self):
+        """Set up the LLM following the Agent class pattern."""
+        if isinstance(self.llm, str):
+            self.llm = LLM(model=self.llm)
+        elif isinstance(self.llm, LLM):
+            pass
+        elif self.llm is None:
+            model_name = os.environ.get("OPENAI_MODEL_NAME") or "gpt-4"
+            self.llm = LLM(model=model_name)
+        else:
+            llm_params = {
+                "model": getattr(self.llm, "model_name", None)
+                or getattr(self.llm, "deployment_name", None)
+                or str(self.llm),
+            }
+            self.llm = LLM(**llm_params)
+
     def _setup_for_evaluating(self) -> None:
         """Sets up the crew for evaluating."""
         for task in self.crew.tasks:
@@ -51,7 +73,7 @@
             ),
             backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
             verbose=False,
-            llm=self.openai_model_name,
+            llm=self.llm,
         )
 
     def _evaluation_task(
@@ -181,7 +203,7 @@
                 self.crew,
                 evaluation_result.pydantic.quality,
                 current_task._execution_time,
-                self.openai_model_name,
+                self.llm,
             )
             self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
             self.run_execution_times[self.iteration].append(
diff --git a/tests/crew_test.py b/tests/crew_test.py
index 2003ddada..831e1df05 100644
--- a/tests/crew_test.py
+++ b/tests/crew_test.py
@@ -24,6 +24,36 @@ from crewai.types.usage_metrics import UsageMetrics
 from crewai.utilities import Logger
 from crewai.utilities.rpm_controller import RPMController
 from crewai.utilities.task_output_storage_handler import TaskOutputStorageHandler
+from crewai.llm import LLM
+
+class MockLLM(LLM):
+    """Mock LLM for testing."""
+    def __init__(self):
+        super().__init__(model="gpt-4")  # Use a known model name
+
+    def chat_completion(self, messages, tools=None, tool_choice=None, **kwargs):
+        # Mock a proper response that matches the expected format
+        if tools and any('output' in tool.get('function', {}).get('name', '') for tool in tools):
+            return {
+                "choices": [{
+                    "message": {
+                        "content": None,
+                        "role": "assistant",
+                        "function_call": {
+                            "name": "output",
+                            "arguments": '{"quality": 8.5}'
+                        }
+                    }
+                }]
+            }
+        return {
+            "choices": [{
+                "message": {
+                    "content": "Mock LLM Response",
+                    "role": "assistant"
+                }
+            }]
+        }
 
 ceo = Agent(
     role="CEO",
@@ -47,6 +77,34 @@ writer = Agent(
 )
 
 
+def test_crew_test_with_custom_llm():
+    """Test that Crew.test() works with a custom LLM implementation."""
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        agent=researcher,
+    )
+    crew = Crew(agents=[researcher], tasks=[task])
+
+    # Test with custom LLM
+    custom_llm = MockLLM()
+    crew.test(n_iterations=1, llm=custom_llm)
+    # No assertion needed as we just verify it runs without errors
+
+def test_crew_test_backward_compatibility():
+    """Test that Crew.test() maintains backward compatibility with openai_model_name."""
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        agent=researcher,
+    )
+    crew = Crew(agents=[researcher], tasks=[task])
+
+    # Test with openai_model_name
+    crew.test(n_iterations=1, openai_model_name="gpt-4")
+    # No assertion needed as we just verify it runs without errors
+
+
 def test_crew_config_conditional_requirement():
     with pytest.raises(ValueError):
         Crew(process=Process.sequential)
@@ -1123,7 +1181,7 @@ def test_kickoff_for_each_empty_input():
     assert results == []
 
 
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr(filter_headers=["authorization"])
 def test_kickoff_for_each_invalid_input():
     """Tests if kickoff_for_each raises TypeError for invalid input types."""
 
@@ -3125,4 +3183,4 @@ def test_multimodal_agent_live_image_analysis():
     # Verify we got a meaningful response
     assert isinstance(result.raw, str)
     assert len(result.raw) > 100  # Expecting a detailed analysis
-    assert "error" not in result.raw.lower()  # No error messages in response
\ No newline at end of file
+    assert "error" not in result.raw.lower()  # No error messages in response
diff --git a/tests/utilities/evaluators/test_crew_evaluator_handler.py b/tests/utilities/evaluators/test_crew_evaluator_handler.py
index 649c25998..dcfab39ad 100644
--- a/tests/utilities/evaluators/test_crew_evaluator_handler.py
+++ b/tests/utilities/evaluators/test_crew_evaluator_handler.py
@@ -23,7 +23,7 @@ class TestCrewEvaluator:
         )
         crew = Crew(agents=[agent], tasks=[task])
 
-        return CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
+        return CrewEvaluator(crew, llm="openai/gpt-4o-mini")
 
     def test_setup_for_evaluating(self, crew_planner):
         crew_planner._setup_for_evaluating()
@@ -45,7 +45,7 @@
             == "Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed"
         )
        assert agent.verbose is False
-        assert agent.llm.model == "gpt-4o-mini"
+        assert agent.llm.model == "openai/gpt-4o-mini"
 
     def test_evaluation_task(self, crew_planner):
         evaluator_agent = Agent(
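Usage sketch (illustrative, not part of the patch): with the Crew.test() signature introduced above, callers should be able to pass a model-name string, an LLM instance, the legacy openai_model_name keyword, or nothing at all. The agent, task, and model names below are placeholder assumptions, not code from this repository.

from crewai import Agent, Crew, Task
from crewai.llm import LLM

# Placeholder crew; any existing crew is tested the same way.
researcher = Agent(
    role="Researcher",
    goal="Summarize a topic",
    backstory="An experienced analyst.",
)
task = Task(
    description="Summarize recent AI agent news",
    expected_output="A short summary",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[task])

# 1) Model-name string: CrewEvaluator._setup_llm() wraps it in LLM(model=...).
crew.test(n_iterations=2, llm="anthropic/claude-3-5-sonnet-20241022")

# 2) LLM instance: used as-is by the evaluator agent.
crew.test(n_iterations=2, llm=LLM(model="gpt-4o", temperature=0))

# 3) Legacy keyword still works; llm takes precedence when both are given.
crew.test(n_iterations=2, openai_model_name="gpt-4o-mini")

# 4) No LLM given: falls back to the OPENAI_MODEL_NAME env var, then "gpt-4".
crew.test(n_iterations=2)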