diff --git a/src/crewai/crew.py b/src/crewai/crew.py
index eaf9ff3e8..da856b42b 100644
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -1079,7 +1079,24 @@ class Crew(BaseModel):
         llm: Union[str, LLM],
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
+        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures.
+
+        Args:
+            n_iterations: Number of test iterations to run
+            llm: Language model to use for evaluation. Can be either a model name string (e.g. "gpt-4")
+                or an LLM instance for custom implementations
+            inputs: Optional dictionary of input values to use for task execution
+
+        Example:
+            ```python
+            # Using model name string
+            crew.test(n_iterations=3, llm="gpt-4")
+
+            # Using custom LLM implementation
+            custom_llm = LLM(model="custom-model")
+            crew.test(n_iterations=3, llm=custom_llm)
+            ```
+        """
         test_crew = self.copy()
 
         self._test_execution_span = test_crew._telemetry.test_execution_span(
diff --git a/src/crewai/utilities/evaluators/crew_evaluator_handler.py b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
index 21fc010a5..3b3691c11 100644
--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -1,13 +1,16 @@
-from typing import Union
-
-from crewai.llm import LLM
 from collections import defaultdict
+from typing import Any, Dict, List, Optional, TypeVar, Union
+from typing import DefaultDict  # Separate import to avoid circular imports
 
 from pydantic import BaseModel, Field
 from rich.box import HEAVY_EDGE
 from rich.console import Console
 from rich.table import Table
 
+from crewai.llm import LLM
+
+T = TypeVar('T', bound=LLM)
+
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
@@ -31,14 +34,47 @@ class CrewEvaluator:
         iteration (int): The current iteration of the evaluation.
     """
 
-    tasks_scores: defaultdict = defaultdict(list)
-    run_execution_times: defaultdict = defaultdict(list)
+    _tasks_scores: DefaultDict[int, List[float]] = Field(
+        default_factory=lambda: defaultdict(list))
+    _run_execution_times: DefaultDict[int, List[float]] = Field(
+        default_factory=lambda: defaultdict(list))
     iteration: int = 0
 
-    def __init__(self, crew, llm: Union[str, LLM]):
+    @property
+    def tasks_scores(self) -> DefaultDict[int, List[float]]:
+        return self._tasks_scores
+
+    @tasks_scores.setter
+    def tasks_scores(self, value: Dict[int, List[float]]) -> None:
+        self._tasks_scores = defaultdict(list, value)
+
+    @property
+    def run_execution_times(self) -> DefaultDict[int, List[float]]:
+        return self._run_execution_times
+
+    @run_execution_times.setter
+    def run_execution_times(self, value: Dict[int, List[float]]) -> None:
+        self._run_execution_times = defaultdict(list, value)
+
+    def __init__(self, crew, llm: Union[str, T]):
+        """Initialize the CrewEvaluator.
+
+        Args:
+            crew: The Crew instance to evaluate
+            llm: Language model to use for evaluation. Can be either a model name string
+                or an LLM instance for custom implementations
+
+        Raises:
+            ValueError: If llm is None or invalid
+        """
+        if not llm:
+            raise ValueError("Invalid LLM configuration")
+
         self.crew = crew
         self.llm = LLM(model=llm) if isinstance(llm, str) else llm
         self._telemetry = Telemetry()
+        self._tasks_scores = defaultdict(list)
+        self._run_execution_times = defaultdict(list)
         self._setup_for_evaluating()
 
     def _setup_for_evaluating(self) -> None:
@@ -184,11 +220,19 @@
                 self.crew,
                 evaluation_result.pydantic.quality,
                 current_task._execution_time,
-                str(self.llm) if isinstance(self.llm, LLM) else self.llm,
+                self._get_llm_identifier(),
             )
-            self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
-            self.run_execution_times[self.iteration].append(
+            self._tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
+            self._run_execution_times[self.iteration].append(
                 current_task._execution_time
             )
         else:
             raise ValueError("Evaluation result is not in the expected format")
+
+    def _get_llm_identifier(self) -> str:
+        """Get a string identifier for the LLM instance.
+
+        Returns:
+            String representation of the LLM for telemetry
+        """
+        return str(self.llm) if isinstance(self.llm, LLM) else self.llm
diff --git a/tests/utilities/evaluators/test_crew_evaluator_handler.py b/tests/utilities/evaluators/test_crew_evaluator_handler.py
index 14ffd33cb..0aeec963f 100644
--- a/tests/utilities/evaluators/test_crew_evaluator_handler.py
+++ b/tests/utilities/evaluators/test_crew_evaluator_handler.py
@@ -48,11 +48,18 @@ class TestCrewEvaluator:
         assert agent.verbose is False
         assert agent.llm.model == "gpt-4o-mini"
 
-    def test_evaluator_with_custom_llm(self, crew_planner):
-        custom_llm = LLM(model="gpt-4")
-        evaluator = CrewEvaluator(crew_planner.crew, custom_llm)
+    @pytest.mark.parametrize("llm_input,expected_model", [
+        (LLM(model="gpt-4"), "gpt-4"),
+        ("gpt-4", "gpt-4"),
+    ])
+    def test_evaluator_with_llm_types(self, crew_planner, llm_input, expected_model):
+        evaluator = CrewEvaluator(crew_planner.crew, llm_input)
         agent = evaluator._evaluator_agent()
-        assert agent.llm == custom_llm
+        assert agent.llm.model == expected_model
+
+    def test_evaluator_with_invalid_llm(self, crew_planner):
+        with pytest.raises(ValueError, match="Invalid LLM configuration"):
+            CrewEvaluator(crew_planner.crew, None)
 
     def test_evaluator_with_string_llm(self, crew_planner):
         evaluator = CrewEvaluator(crew_planner.crew, "gpt-4")
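For reviewers, a minimal usage sketch of the behaviour this diff introduces, assuming an already-configured `Crew` instance named `crew` (the variable name and setup are illustrative, not part of the diff):

```python
from crewai.llm import LLM
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

# crew = ...  # an already-configured Crew instance (assumed, not shown here)

# Passing a model name string: the evaluator wraps it in an LLM internally
evaluator = CrewEvaluator(crew, "gpt-4")

# Passing a pre-built LLM instance: it is used as-is
custom_llm = LLM(model="gpt-4")
evaluator = CrewEvaluator(crew, custom_llm)

# Passing None (or any other falsy value) now fails fast
try:
    CrewEvaluator(crew, None)
except ValueError as exc:
    print(exc)  # "Invalid LLM configuration"
```

Either accepted path leaves `self.llm` holding an `LLM` instance (strings are wrapped via `LLM(model=llm)`), which is what `_get_llm_identifier()` relies on when reporting to telemetry.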