From 5e528416ec2b1cf59be4b6f17299734328d69002 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sun, 9 Feb 2025 21:46:23 +0000
Subject: [PATCH] feat: improve llm handling and error validation

Co-Authored-By: Joe Moura
---
 src/crewai/crew.py                                 | 21 +++++-
 .../evaluators/crew_evaluator_handler.py           | 72 ++++++++++++++---
 2 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/src/crewai/crew.py b/src/crewai/crew.py
index 115d82387..2e88118bf 100644
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -1081,7 +1081,26 @@ class Crew(BaseModel):
         openai_model_name: Optional[str] = None,  # For backward compatibility
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
+        """Test and evaluate the Crew with the given inputs for n iterations.
+
+        This method evaluates the performance of the crew using the specified
+        language model. It accepts both string model names and LLM instances.
+
+        Args:
+            n_iterations: Number of test iterations to run.
+            llm: Language model configuration (preferred). Can be:
+                - a string model name (e.g., "gpt-4")
+                - an LLM instance
+                - any object with model_name or deployment_name attributes
+            openai_model_name: Legacy parameter kept for backward compatibility.
+                Deprecated: will be removed in a future version. Use `llm` instead.
+            inputs: Optional dictionary of inputs to use during testing.
+
+        Note:
+            The `openai_model_name` parameter is deprecated and will be removed
+            in a future version. Use the more flexible `llm` parameter instead,
+            which supports any LLM implementation.
+        """
         test_crew = self.copy()
 
         # For backward compatibility, convert openai_model_name to llm
diff --git a/src/crewai/utilities/evaluators/crew_evaluator_handler.py b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
index bc618bc8c..3cc6e76d9 100644
--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -1,8 +1,13 @@
 from collections import defaultdict
+from typing import Any, DefaultDict, List, Union
 
-from typing import Any, Union
-
-from pydantic import BaseModel, Field, InstanceOf
+from pydantic import (
+    BaseModel,
+    Field,
+    InstanceOf,
+    PrivateAttr,
+    model_validator,
+)
 from rich.box import HEAVY_EDGE
 from rich.console import Console
 from rich.table import Table
@@ -20,7 +25,7 @@ class TaskEvaluationPydanticOutput(BaseModel):
     )
 
 
-class CrewEvaluator:
+class CrewEvaluator(BaseModel):
     """
     A class to evaluate the performance of the agents in the crew
     based on the tasks they have performed.
@@ -31,16 +36,63 @@ class CrewEvaluator(BaseModel):
         iteration (int): The current iteration of the evaluation.
     """
 
-    tasks_scores: defaultdict = defaultdict(list)
-    run_execution_times: defaultdict = defaultdict(list)
-    iteration: int = 0
+    crew: Any = Field(description="The crew of agents to evaluate.")
+    llm: Union[str, InstanceOf[LLM], Any] = Field(
+        description="Language model that will run the evaluation."
+    )
+    tasks_scores: DefaultDict[int, List[float]] = Field(
+        default_factory=lambda: defaultdict(list),
+        description="Dictionary to store the scores of the agents for each task.",
+    )
+    run_execution_times: DefaultDict[int, List[int]] = Field(
+        default_factory=lambda: defaultdict(list),
+        description="Dictionary to store execution times for each run.",
+    )
+    iteration: int = Field(
+        default=0,
+        description="Current iteration of the evaluation.",
+    )
+
+    @model_validator(mode="after")
+    def validate_llm(self):
+        """Validate that the LLM is properly configured."""
+        if not self.llm:
+            raise ValueError("LLM configuration is required")
+        return self
+
+    _telemetry: Telemetry = PrivateAttr(default_factory=Telemetry)
+
     def __init__(self, crew, llm: Union[str, InstanceOf[LLM], Any]):
-        self.crew = crew
-        self.llm = llm if isinstance(llm, LLM) else LLM(model=llm)
-        self._telemetry = Telemetry()
+        # Initialize the Pydantic model with validated fields
+        super().__init__(crew=crew, llm=llm)
         self._setup_for_evaluating()
 
+    @model_validator(mode="before")
+    def init_llm(cls, values):
+        """Coerce the llm value into an LLM instance before validation."""
+        llm = values.get("llm")
+        try:
+            if isinstance(llm, str):
+                values["llm"] = LLM(model=llm)
+            elif isinstance(llm, LLM):
+                values["llm"] = llm
+            else:
+                # For any other type, extract the relevant attributes
+                llm_params = {
+                    "model": getattr(llm, "model_name", None)
+                    or getattr(llm, "deployment_name", None)
+                    or str(llm),
+                    "temperature": getattr(llm, "temperature", None),
+                    "max_tokens": getattr(llm, "max_tokens", None),
+                    "timeout": getattr(llm, "timeout", None),
+                }
+                # Drop unset parameters before constructing the LLM
+                llm_params = {k: v for k, v in llm_params.items() if v is not None}
+                values["llm"] = LLM(**llm_params)
+        except Exception as e:
+            raise ValueError(f"Invalid LLM configuration: {e}") from e
+        return values
+
     def _setup_for_evaluating(self) -> None:
         """Sets up the crew for evaluating."""
         for task in self.crew.tasks:
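A minimal usage sketch of the updated test() signature, based on the docstring in this patch. The agent/task definitions are illustrative placeholders, not part of the change:

    from crewai import Agent, Crew, Task, LLM

    # Hypothetical single-agent crew, just enough to call test().
    agent = Agent(role="Researcher", goal="Summarize a topic", backstory="A diligent analyst.")
    task = Task(description="Summarize recent AI news", expected_output="A short summary", agent=agent)
    crew = Crew(agents=[agent], tasks=[task])

    # Preferred: pass a model name string or an LLM instance via `llm`.
    crew.test(n_iterations=2, llm="gpt-4")
    crew.test(n_iterations=2, llm=LLM(model="gpt-4", temperature=0.2))

    # Deprecated, still accepted for backward compatibility:
    crew.test(n_iterations=2, openai_model_name="gpt-4")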
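The init_llm validator also covers duck-typed model objects. A sketch of that fallback path, reusing the crew from the example above; DuckTypedLLM is a hypothetical stand-in for any object exposing model_name or deployment_name (e.g., a LangChain chat model):

    from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

    class DuckTypedLLM:
        model_name = "gpt-4o"  # hypothetical attribute values
        temperature = 0.1      # max_tokens/timeout left unset

    evaluator = CrewEvaluator(crew=crew, llm=DuckTypedLLM())
    # init_llm falls through to the getattr branch and builds
    # LLM(model="gpt-4o", temperature=0.1); None-valued params are dropped,
    # and any failure is re-raised as ValueError("Invalid LLM configuration: ...").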