fix: enable any LLM to run test functionality

- Update CrewEvaluator to accept any LLM type
- Modify Crew.test() to accept a custom LLM (instance or model-name string) while keeping openai_model_name working for backward compatibility
- Add test coverage for both the new llm parameter and the backward-compatible path

Fixes #2072

Co-Authored-By: Joe Moura <joao@crewai.com>
Devin AI
2025-02-09 21:05:18 +00:00
parent d6d98ee969
commit 22d886be11
4 changed files with 104 additions and 36 deletions
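
Usage after this change, as a hedged sketch derived from the signatures in the diffs below (the agent and task setup is illustrative, not part of the commit):

from crewai import Agent, Crew, Task
from crewai.llm import LLM

# Illustrative fixtures; any valid agent/task works here.
researcher = Agent(role="Researcher", goal="Research topics", backstory="Experienced analyst")
task = Task(description="Summarize a topic", expected_output="A short summary", agent=researcher)
crew = Crew(agents=[researcher], tasks=[task])

# New: evaluation can use any LLM instance, or a plain model-name string.
crew.test(n_iterations=2, llm=LLM(model="gpt-4o"))
crew.test(n_iterations=2, llm="gpt-4o-mini")

# Old keyword still works, but is now deprecated in favor of `llm`.
crew.test(n_iterations=2, openai_model_name="gpt-4o-mini")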

View File

@@ -20,6 +20,9 @@ from pydantic import (
 )
 from pydantic_core import PydanticCustomError
+from typing import Union
+
+from crewai.llm import LLM
 from crewai.agent import Agent
 from crewai.agents.agent_builder.base_agent import BaseAgent
 from crewai.agents.cache import CacheHandler
@@ -1148,19 +1151,28 @@ class Crew(BaseModel):
     def test(
         self,
         n_iterations: int,
+        llm: Optional[Union[str, LLM]] = None,
         openai_model_name: Optional[str] = None,
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
+        """Test and evaluate the Crew with the given inputs for n iterations.
+
+        Args:
+            n_iterations: Number of test iterations to run
+            llm: LLM instance or model name to use for evaluation
+            openai_model_name: (Deprecated) OpenAI model name to use for evaluation
+            inputs: Optional inputs for the crew
+        """
         test_crew = self.copy()
+        test_llm = llm or openai_model_name
+
         self._test_execution_span = test_crew._telemetry.test_execution_span(
             test_crew,
             n_iterations,
             inputs,
-            openai_model_name,  # type: ignore[arg-type]
+            test_llm,  # type: ignore[arg-type]
         )  # type: ignore[arg-type]
-        evaluator = CrewEvaluator(test_crew, openai_model_name)  # type: ignore[arg-type]
+        evaluator = CrewEvaluator(test_crew, test_llm)

         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
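
Because the fallback is a bare `or`, an explicit `llm` argument always wins and `openai_model_name` is consulted only when `llm` is falsy. A minimal illustration of the resolution order used above:

# test_llm = llm or openai_model_name
test_llm = LLM(model="gpt-4o") or "gpt-4o-mini"  # -> the LLM instance (explicit llm wins)
test_llm = None or "gpt-4o-mini"                 # -> "gpt-4o-mini" (deprecated keyword path)
test_llm = None or None                          # -> None (evaluator gets no LLM; agent default applies)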

View File

@@ -1,3 +1,6 @@
+from typing import Union
+
+from crewai.llm import LLM
 from collections import defaultdict

 from pydantic import BaseModel, Field
@@ -32,9 +35,15 @@ class CrewEvaluator:
     run_execution_times: defaultdict = defaultdict(list)
     iteration: int = 0

-    def __init__(self, crew, openai_model_name: str):
+    def __init__(self, crew, llm: Union[str, LLM, None] = None):
+        """Initialize the CrewEvaluator.
+
+        Args:
+            crew: The crew to evaluate
+            llm: LLM instance or model name to use for evaluation
+        """
         self.crew = crew
-        self.openai_model_name = openai_model_name
+        self._llm = llm if isinstance(llm, LLM) else LLM(model=llm) if llm else None
         self._telemetry = Telemetry()
         self._setup_for_evaluating()
@@ -51,7 +60,7 @@
             ),
             backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
             verbose=False,
-            llm=self.openai_model_name,
+            llm=self._llm,
         )

     def _evaluation_task(
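
The one-line coercion in __init__ chains two conditional expressions; an expanded, behaviorally equivalent sketch (the helper name is hypothetical, not in the commit):

def _normalize_llm(llm):
    # Equivalent to: llm if isinstance(llm, LLM) else LLM(model=llm) if llm else None
    if isinstance(llm, LLM):
        return llm               # already an LLM instance: use as-is
    if llm:
        return LLM(model=llm)    # non-empty model-name string: wrap it
    return None                  # nothing supplied: evaluator agent uses its default LLM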

View File

@@ -10,6 +10,8 @@ import instructor
 import pydantic_core
 import pytest

+from crewai.llm import LLM
+from crewai.utilities.evaluators.crew_evaluator_handler import TaskEvaluationPydanticOutput
 from crewai.agent import Agent
 from crewai.agents.cache import CacheHandler
 from crewai.crew import Crew
@@ -3305,39 +3307,63 @@ def test_conditional_should_execute():
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
@mock.patch.object(Crew, "copy")
@mock.patch.object(Crew, "kickoff")
def test_crew_test_with_custom_llm(mock_kickoff, mock_copy, mock_evaluator):
"""Test that Crew.test() works with a custom LLM implementation."""
task = Task(description="Test task", expected_output="Test output", agent=researcher)
crew = Crew(agents=[researcher], tasks=[task])
mock_copy.return_value = crew
mock_evaluator.return_value = mock.MagicMock()
llm = LLM(model="gpt-4")
crew.test(n_iterations=1, llm=llm)
# Verify CrewEvaluator was called with the LLM instance
mock_evaluator.assert_called_once()
args = mock_evaluator.call_args[0]
assert args[1] == llm
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch.object(Crew, "copy")
@mock.patch.object(Crew, "kickoff")
def test_crew_test_backward_compatibility(mock_kickoff, mock_copy, mock_evaluator):
"""Test that Crew.test() maintains backward compatibility with openai_model_name."""
task = Task(description="Test task", expected_output="Test output", agent=researcher)
crew = Crew(agents=[researcher], tasks=[task])
mock_copy.return_value = crew
mock_evaluator.return_value = mock.MagicMock()
crew.test(n_iterations=1, openai_model_name="gpt-4")
# Verify CrewEvaluator was called with the model name
mock_evaluator.assert_called_once()
args = mock_evaluator.call_args[0]
assert args[1] == "gpt-4"
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch.object(Crew, "copy")
@mock.patch.object(Crew, "kickoff")
def test_crew_testing_function(mock_kickoff, mock_copy, mock_evaluator):
"""Test that Crew.test() works with basic functionality."""
task = Task(
description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
expected_output="5 bullet points with a paragraph for each idea.",
description="Test task",
expected_output="Test output",
agent=researcher,
)
crew = Crew(
agents=[researcher],
tasks=[task],
)
# Create a mock for the copied crew
copy_mock.return_value = crew
n_iterations = 2
crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
# Ensure kickoff is called on the copied crew
kickoff_mock.assert_has_calls(
[mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
)
crew_evaluator.assert_has_calls(
[
mock.call(crew, "gpt-4o-mini"),
mock.call().set_iteration(1),
mock.call().set_iteration(2),
mock.call().print_crew_evaluation_result(),
]
)
crew = Crew(agents=[researcher], tasks=[task])
mock_copy.return_value = crew
mock_evaluator.return_value = mock.MagicMock()
crew.test(n_iterations=1)
# Verify CrewEvaluator was called with None as llm (default behavior)
mock_evaluator.assert_called_once()
args = mock_evaluator.call_args[0]
assert args[1] is None
# Verify kickoff was called
mock_kickoff.assert_called_once()
@pytest.mark.vcr(filter_headers=["authorization"])
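
The assertions in these tests lean on unittest.mock call inspection: call_args[0] is the tuple of positional arguments from the most recent call, so args[1] is whatever Crew.test() passed to CrewEvaluator as its second argument. A self-contained sketch of the pattern:

from unittest import mock

evaluator = mock.MagicMock()
evaluator("the-crew", "gpt-4")      # simulates CrewEvaluator(test_crew, test_llm)
args, kwargs = evaluator.call_args  # (("the-crew", "gpt-4"), {})
assert args[1] == "gpt-4"           # same pattern as the tests above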

View File

@@ -2,6 +2,7 @@ from unittest import mock
 import pytest

+from crewai.llm import LLM
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.task import Task
@@ -140,3 +141,23 @@ class InternalCrewEvaluator:
         execute().pydantic = TaskEvaluationPydanticOutput(quality=9.5)
         crew_planner.evaluate(task_output)
         assert crew_planner.tasks_scores[0] == [9.5]
+
+    def test_crew_evaluator_with_llm_instance(self):
+        """Test that CrewEvaluator works with an LLM instance."""
+        agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
+        task = Task(description="Task 1", expected_output="Output 1", agent=agent)
+        crew = Crew(agents=[agent], tasks=[task])
+        llm = LLM(model="gpt-4")
+        evaluator = CrewEvaluator(crew, llm)
+
+        assert evaluator._llm == llm
+
+    def test_crew_evaluator_with_model_name(self):
+        """Test that CrewEvaluator works with a model name string."""
+        agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
+        task = Task(description="Task 1", expected_output="Output 1", agent=agent)
+        crew = Crew(agents=[agent], tasks=[task])
+        evaluator = CrewEvaluator(crew, "gpt-4")
+
+        assert isinstance(evaluator._llm, LLM)
+        assert evaluator._llm.model == "gpt-4"