feat: enable custom LLM support for Crew.test()

- Added a new llm parameter to Crew.test() that accepts a model name string or an LLM instance (see the usage sketch below)
- Maintained backward compatibility with the openai_model_name parameter
- Updated CrewEvaluator to handle any LLM implementation
- Added comprehensive test coverage

Fixes #2081

Co-Authored-By: Joe Moura <joao@crewai.com>
Author: Devin AI
Date: 2025-02-09 23:25:02 +00:00
Parent: d6d98ee969
Commit: f838909220
3 changed files with 100 additions and 9 deletions
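
For reference, a minimal usage sketch of the new parameter (the crew object and model names below are illustrative, not part of this commit):

from crewai.llm import LLM

# `crew` is assumed to be an already-configured Crew instance
crew.test(n_iterations=2, llm="gpt-4o-mini")             # pass a model name string
crew.test(n_iterations=2, llm=LLM(model="gpt-4o-mini"))  # or a configured LLM instance
crew.test(n_iterations=2, openai_model_name="gpt-4o")    # deprecated keyword, still accepted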


@@ -1148,19 +1148,31 @@ class Crew(BaseModel):
     def test(
         self,
         n_iterations: int,
+        llm: Optional[Union[str, LLM]] = None,
         openai_model_name: Optional[str] = None,
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
+        """Test and evaluate the Crew with the given inputs for n iterations concurrently.
+
+        Args:
+            n_iterations: Number of test iterations to run
+            llm: LLM instance or model name string to use for evaluation
+            openai_model_name: (Deprecated) OpenAI model name string (kept for backward compatibility)
+            inputs: Optional dictionary of inputs to pass to each test iteration
+        """
         test_crew = self.copy()
+        model = llm or openai_model_name
+        if model is None:
+            raise ValueError("Either llm or openai_model_name must be provided")
 
         self._test_execution_span = test_crew._telemetry.test_execution_span(
             test_crew,
             n_iterations,
             inputs,
-            openai_model_name,  # type: ignore[arg-type]
-        )  # type: ignore[arg-type]
+            str(model) if isinstance(model, LLM) else model,
+        )
-        evaluator = CrewEvaluator(test_crew, openai_model_name)  # type: ignore[arg-type]
+        evaluator = CrewEvaluator(test_crew, model)
 
         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
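
Read in isolation, the resolution logic added above behaves as follows (a standalone sketch with a hypothetical helper name, not code from this commit):

def resolve_evaluation_model(llm=None, openai_model_name=None):
    # An explicit llm argument (string or LLM instance) takes precedence;
    # otherwise fall back to the deprecated openai_model_name keyword.
    model = llm or openai_model_name
    if model is None:
        # Neither keyword was supplied: fail fast, mirroring Crew.test()
        raise ValueError("Either llm or openai_model_name must be provided")
    return model

resolve_evaluation_model(llm="gpt-4o")                              # -> "gpt-4o"
resolve_evaluation_model(openai_model_name="gpt-4")                 # -> "gpt-4"
resolve_evaluation_model(llm="gpt-4o", openai_model_name="gpt-4")   # -> "gpt-4o" (llm wins)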


@@ -1,4 +1,5 @@
 from collections import defaultdict
+from typing import Union
 
 from pydantic import BaseModel, Field
 from rich.box import HEAVY_EDGE
@@ -6,6 +7,7 @@ from rich.console import Console
 from rich.table import Table
 
 from crewai.agent import Agent
+from crewai.llm import LLM
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
@@ -32,9 +34,15 @@ class CrewEvaluator:
     run_execution_times: defaultdict = defaultdict(list)
     iteration: int = 0
 
-    def __init__(self, crew, openai_model_name: str):
+    def __init__(self, crew, llm: Union[str, LLM]):
+        """Initialize the CrewEvaluator.
+
+        Args:
+            crew: The crew to evaluate
+            llm: LLM instance or model name string to use for evaluation
+        """
         self.crew = crew
-        self.openai_model_name = openai_model_name
+        self.llm = llm if isinstance(llm, LLM) else LLM(model=llm)
         self._telemetry = Telemetry()
         self._setup_for_evaluating()
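
With the normalization in __init__, callers can hand the evaluator either form and it always ends up holding an LLM instance (a sketch assuming a crew object named my_crew already exists; the module path is taken from the test imports in this commit):

from crewai.llm import LLM
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

evaluator_from_name = CrewEvaluator(my_crew, "gpt-4o-mini")       # string is wrapped as LLM(model=...)
evaluator_from_inst = CrewEvaluator(my_crew, LLM(model="gpt-4"))  # instance is stored as-is
assert isinstance(evaluator_from_name.llm, LLM)
assert isinstance(evaluator_from_inst.llm, LLM)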
@@ -51,7 +59,7 @@ class CrewEvaluator:
             ),
             backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
             verbose=False,
-            llm=self.openai_model_name,
+            llm=self.llm,
         )
 
     def _evaluation_task(
@@ -95,9 +103,20 @@ class CrewEvaluator:
         │ Execution Time (s) │ 42    │ 79    │ 52    │ 57         │                              │
         └────────────────────┴───────┴───────┴───────┴────────────┴──────────────────────────────┘
         """
+        # Handle empty task scores
+        if not self.tasks_scores:
+            return
+
+        task_scores_list = list(zip(*self.tasks_scores.values()))
+        if not task_scores_list:
+            return
+
         task_averages = [
-            sum(scores) / len(scores) for scores in zip(*self.tasks_scores.values())
+            sum(scores) / len(scores) for scores in task_scores_list
         ]
+        if not task_averages:
+            return
+
         crew_average = sum(task_averages) / len(task_averages)
 
         table = Table(title="Tasks Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
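
The early returns above guard against a division by zero: with no recorded scores the comprehension produces an empty list, and the crew_average line would then divide by len([]) == 0. A plain-Python illustration of the failure mode the checks prevent (independent of crewai):

from collections import defaultdict

tasks_scores = defaultdict(list)                  # no iterations recorded yet
rows = list(zip(*tasks_scores.values()))          # zip() over nothing -> []
task_averages = [sum(s) / len(s) for s in rows]   # empty list, no error yet
# Without the guards the next line would raise ZeroDivisionError:
# crew_average = sum(task_averages) / len(task_averages)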
@@ -177,11 +196,12 @@ class CrewEvaluator:
         evaluation_result = evaluation_task.execute_sync()
 
         if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
+            model_name = str(self.llm) if isinstance(self.llm, LLM) else self.llm
             self._test_result_span = self._telemetry.individual_test_result_span(
                 self.crew,
                 evaluation_result.pydantic.quality,
                 current_task.execution_duration,
-                self.openai_model_name,
+                model_name,
             )
             self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
             self.run_execution_times[self.iteration].append(


@@ -2,6 +2,7 @@
 import hashlib
 import json
+from collections import defaultdict
 from concurrent.futures import Future
 from unittest import mock
 from unittest.mock import MagicMock, patch
@@ -15,6 +16,7 @@ from crewai.agents.cache import CacheHandler
 from crewai.crew import Crew
 from crewai.crews.crew_output import CrewOutput
 from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
+from crewai.llm import LLM
 from crewai.memory.contextual.contextual_memory import ContextualMemory
 from crewai.process import Process
 from crewai.project import crew
@@ -26,6 +28,13 @@ from crewai.types.usage_metrics import UsageMetrics
 from crewai.utilities import Logger
 from crewai.utilities.rpm_controller import RPMController
 from crewai.utilities.task_output_storage_handler import TaskOutputStorageHandler
+from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator
+
+
+@pytest.fixture
+def crew_evaluator():
+    evaluator = mock.MagicMock(spec=CrewEvaluator)
+    evaluator.print_crew_evaluation_result = mock.MagicMock()
+    return evaluator
 
 
 ceo = Agent(
     role="CEO",
@@ -3339,6 +3348,56 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
         ]
     )
 
 
+@mock.patch("crewai.crew.CrewEvaluator")
+@mock.patch("crewai.crew.Crew.copy")
+@mock.patch("crewai.crew.Crew.kickoff")
+def test_crew_testing_with_llm_instance(kickoff_mock, copy_mock, evaluator_mock):
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        agent=researcher,
+    )
+    crew = Crew(agents=[researcher], tasks=[task])
+    llm = LLM(model="gpt-4")
+
+    # Create a mock for the copied crew
+    copy_mock.return_value = crew
+
+    # Create a mock evaluator instance with required methods
+    mock_evaluator = mock.MagicMock()
+    mock_evaluator.set_iteration = mock.MagicMock()
+    mock_evaluator.evaluate = mock.MagicMock()
+    mock_evaluator.print_crew_evaluation_result = mock.MagicMock()
+
+    # Set up the mock class to track constructor calls and return our mock instance
+    evaluator_mock.side_effect = lambda crew_arg, model_arg: mock_evaluator
+
+    # Run the test
+    crew.test(n_iterations=2, llm=llm)
+
+    # Verify the evaluator was used correctly
+    kickoff_mock.assert_has_calls([
+        mock.call(inputs=None),
+        mock.call(inputs=None)
+    ])
+
+    # Verify CrewEvaluator was instantiated with the LLM instance
+    evaluator_mock.assert_called_once_with(crew, llm)
+
+    # Verify print_crew_evaluation_result was called
+    mock_evaluator.print_crew_evaluation_result.assert_called_once()
+
+
+def test_crew_testing_with_missing_model():
+    crew = Crew(agents=[researcher], tasks=[Task(
+        description="Test task",
+        expected_output="Test output",
+        agent=researcher,
+    )])
+
+    with pytest.raises(ValueError, match="Either llm or openai_model_name must be provided"):
+        crew.test(n_iterations=2)
+
+
 @pytest.mark.vcr(filter_headers=["authorization"])
 def test_hierarchical_verbose_manager_agent():