feat: improve llm handling and error validation

Co-Authored-By: Joe Moura <joao@crewai.com>
Author: Devin AI
Date: 2025-02-09 21:46:23 +00:00
parent a097d933f6
commit 5e528416ec
2 changed files with 83 additions and 12 deletions


@@ -1081,7 +1081,26 @@ class Crew(BaseModel):
         openai_model_name: Optional[str] = None,  # For backward compatibility
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
+        """Test and evaluate the Crew with the given inputs for n iterations.
+
+        This method runs tests to evaluate the performance of the crew using the specified
+        language model. It supports both string model names and LLM instances for flexibility.
+
+        Args:
+            n_iterations: Number of test iterations to run
+            llm: Language model configuration (preferred). Can be:
+                - A string model name (e.g., "gpt-4")
+                - An LLM instance
+                - Any object with model_name or deployment_name attributes
+            openai_model_name: Legacy parameter for backward compatibility.
+                Deprecated: Will be removed in future versions. Use `llm` instead.
+            inputs: Optional dictionary of inputs to be used during testing
+
+        Note:
+            The `openai_model_name` parameter is deprecated and will be removed in
+            future versions. Use the more flexible `llm` parameter instead, which
+            supports any LLM implementation.
+        """
         test_crew = self.copy()
         # For backward compatibility, convert openai_model_name to llm
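As a reference for the new contract, a minimal usage sketch of the three documented `llm` forms (`my_crew` and `azure_client` are illustrative stand-ins, and `LLM` is assumed to be importable from crewai):

    # Sketch only - call forms implied by the docstring above:
    my_crew.test(n_iterations=3, llm="gpt-4", inputs={"topic": "AI"})  # string model name
    my_crew.test(n_iterations=3, llm=LLM(model="gpt-4"))               # LLM instance
    my_crew.test(n_iterations=3, llm=azure_client)                     # object with model_name/deployment_name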


@@ -1,8 +1,13 @@
-from collections import defaultdict
-
-from typing import Any, Union
-
-from pydantic import BaseModel, Field, InstanceOf
+from collections.abc import Callable
+from typing import Any, Dict, List, Union, Annotated, DefaultDict
+
+from pydantic import (
+    BaseModel,
+    Field,
+    InstanceOf,
+    PrivateAttr,
+    model_validator,
+)
 from rich.box import HEAVY_EDGE
 from rich.console import Console
 from rich.table import Table
@@ -20,7 +25,7 @@ class TaskEvaluationPydanticOutput(BaseModel):
     )


-class CrewEvaluator:
+class CrewEvaluator(BaseModel):
     """
     A class to evaluate the performance of the agents in the crew based on the tasks they have performed.
@@ -31,16 +36,63 @@ class CrewEvaluator:
         iteration (int): The current iteration of the evaluation.
     """

-    tasks_scores: defaultdict = defaultdict(list)
-    run_execution_times: defaultdict = defaultdict(list)
-    iteration: int = 0
+    crew: Any = Field(description="The crew of agents to evaluate.")
+    llm: Union[str, InstanceOf[LLM], Any] = Field(
+        description="Language model that will run the evaluation."
+    )
+    tasks_scores: DefaultDict[int, List[float]] = Field(
+        default_factory=lambda: DefaultDict(list),
+        description="Dictionary to store the scores of the agents for each task."
+    )
+    run_execution_times: DefaultDict[int, List[int]] = Field(
+        default_factory=lambda: DefaultDict(list),
+        description="Dictionary to store execution times for each run."
+    )
+    iteration: int = Field(
+        default=0,
+        description="Current iteration of the evaluation."
+    )
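A subtlety in the new field defaults: the factories call typing.DefaultDict(list) rather than the usual collections.defaultdict(list). Calling the typing alias forwards to its origin class, so it still builds a real defaultdict at runtime; a quick standalone check of that assumption:

    from collections import defaultdict
    from typing import DefaultDict

    d = DefaultDict(list)              # forwards to collections.defaultdict
    assert isinstance(d, defaultdict)  # a real defaultdict instance
    d[0].append(9.5)                   # missing keys get a fresh list
    assert d[0] == [9.5]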
+
+    @model_validator(mode="after")
+    def validate_llm(self):
+        """Validates that the LLM is properly configured."""
+        if not self.llm:
+            raise ValueError("LLM configuration is required")
+        return self
+
+    _telemetry: Telemetry = PrivateAttr(default_factory=Telemetry)

     def __init__(self, crew, llm: Union[str, InstanceOf[LLM], Any]):
-        self.crew = crew
-        self.llm = llm if isinstance(llm, LLM) else LLM(model=llm)
-        self._telemetry = Telemetry()
+        # Initialize Pydantic model with validated fields
+        super().__init__(crew=crew, llm=llm)
         self._setup_for_evaluating()
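Because CrewEvaluator now subclasses BaseModel, super().__init__ routes construction through Pydantic, so the before- and after-validators run on every instantiation. A minimal sketch of the intended effect (`my_crew` and the model name are illustrative):

    evaluator = CrewEvaluator(my_crew, "gpt-4o-mini")  # string coerced by the before-validator
    assert isinstance(evaluator.llm, LLM)              # validate_llm then sees a configured LLM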
+    @model_validator(mode="before")
+    def init_llm(cls, values):
+        """Initialize LLM before Pydantic validation."""
+        llm = values.get("llm")
+        try:
+            if isinstance(llm, str):
+                values["llm"] = LLM(model=llm)
+            elif isinstance(llm, LLM):
+                values["llm"] = llm
+            else:
+                # For any other type, attempt to extract relevant attributes
+                llm_params = {
+                    "model": getattr(llm, "model_name", None)
+                    or getattr(llm, "deployment_name", None)
+                    or str(llm),
+                    "temperature": getattr(llm, "temperature", None),
+                    "max_tokens": getattr(llm, "max_tokens", None),
+                    "timeout": getattr(llm, "timeout", None),
+                }
+                # Remove None values
+                llm_params = {k: v for k, v in llm_params.items() if v is not None}
+                values["llm"] = LLM(**llm_params)
+        except Exception as e:
+            raise ValueError(f"Invalid LLM configuration: {str(e)}") from e
+        return values
+
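To illustrate the duck-typed fallback branch, a hedged sketch (FakeAzureLLM is invented for this example; the resulting parameters follow the getattr chain above):

    class FakeAzureLLM:
        deployment_name = "prod-gpt4"  # used because model_name is absent
        temperature = 0.2
        max_tokens = 512               # timeout is left unset, so it is dropped

    evaluator = CrewEvaluator(my_crew, FakeAzureLLM())
    # init_llm builds LLM(model="prod-gpt4", temperature=0.2, max_tokens=512)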
     def _setup_for_evaluating(self) -> None:
         """Sets up the crew for evaluating."""
         for task in self.crew.tasks: