fix: improve error handling, logging, and test coverage

Co-Authored-By: Joe Moura <joao@crewai.com>
2026-01-09 08:08:32 +00:00 · 2025-02-09 21:33:03 +00:00
parent 81f84cab58
commit 2744af4825
3 changed files with 39 additions and 9 deletions
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -1081,7 +1081,7 @@ class Crew(BaseModel):
        openai_model_name: Optional[Union[str, LLM]] = None,
        inputs: Optional[Dict[str, Any]] = None,
    ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures.
+        """Test and evaluate the Crew with the given inputs for n iterations.

        Args:
            n_iterations: The number of iterations to run the test.
@@ -1089,6 +1089,9 @@ class Crew(BaseModel):
                the performance of the agents. If a string is provided, it will be used to create
                an LLM instance.
            inputs: The inputs to use for the test.
+
+        Raises:
+            ValueError: If openai_model_name is not a string or LLM instance.
        """
        test_crew = self.copy()

--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -4,6 +4,7 @@ from crewai.llm import LLM
 from collections import defaultdict

 from pydantic import BaseModel, Field
+from crewai.utilities.logger import Logger
 from rich.box import HEAVY_EDGE
 from rich.console import Console
 from rich.table import Table
@@ -42,11 +43,22 @@ class CrewEvaluator:
            crew (Crew): The crew to evaluate
            openai_model_name (Union[str, LLM]): Either a model name string or an LLM instance
                to use for evaluation. If a string is provided, it will be used to create an
-                LLM instance.
+                LLM instance with default settings. If an LLM instance is provided, its settings
+                (like temperature) will be preserved.
+
+        Raises:
+            ValueError: If openai_model_name is not a string or LLM instance.
        """
        self.crew = crew
-        self.llm = openai_model_name if isinstance(openai_model_name, LLM) else LLM(model=openai_model_name)
+        if not isinstance(openai_model_name, (str, LLM)):
+            raise ValueError(f"Invalid model type '{type(openai_model_name)}'. Expected str or LLM instance.")
+        self.model_instance = openai_model_name if isinstance(openai_model_name, LLM) else LLM(model=openai_model_name)
        self._telemetry = Telemetry()
+        self._logger = Logger()
+        self._logger.log(
+            "info",
+            f"Initializing CrewEvaluator with model: {openai_model_name if isinstance(openai_model_name, str) else openai_model_name.model}"
+        )
        self._setup_for_evaluating()

    def _setup_for_evaluating(self) -> None:
@@ -62,7 +74,7 @@ class CrewEvaluator:
            ),
            backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
            verbose=False,
-            llm=self.llm,
+            llm=self.model_instance,
        )

    def _evaluation_task(
@@ -192,7 +204,11 @@ class CrewEvaluator:
                self.crew,
                evaluation_result.pydantic.quality,
                current_task._execution_time,
-                self.llm.model,
+                self.model_instance.model,
+            )
+            self._logger.log(
+                "info",
+                f"Task evaluation completed with quality score: {evaluation_result.pydantic.quality}"
            )
            self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
            self.run_execution_times[self.iteration].append(
--- a/tests/utilities/evaluators/test_crew_evaluator_handler.py
+++ b/tests/utilities/evaluators/test_crew_evaluator_handler.py
@@ -136,14 +136,25 @@ class TestCrewEvaluator:
        """Test that CrewEvaluator correctly handles custom LLM instances."""
        custom_llm = LLM(model="gpt-4", temperature=0.5)
        evaluator = CrewEvaluator(crew_planner.crew, custom_llm)
-        assert evaluator.llm == custom_llm
-        assert evaluator.llm.temperature == 0.5
+        assert evaluator.model_instance == custom_llm
+        assert evaluator.model_instance.temperature == 0.5
+
+    def test_evaluator_with_invalid_model_type(self, crew_planner):
+        """Test that CrewEvaluator raises ValueError for an invalid model type."""
+        with pytest.raises(ValueError, match="Invalid model type"):
+            CrewEvaluator(crew_planner.crew, 123)
+
+    def test_evaluator_preserves_model_settings(self, crew_planner):
+        """Test that CrewEvaluator preserves the settings of a provided LLM instance."""
+        custom_llm = LLM(model="gpt-4", temperature=0.7)
+        evaluator = CrewEvaluator(crew_planner.crew, custom_llm)
+        assert evaluator.model_instance.temperature == 0.7

    def test_evaluator_with_model_name(self, crew_planner):
        """Test that CrewEvaluator correctly handles string model names."""
        evaluator = CrewEvaluator(crew_planner.crew, "gpt-4")
-        assert isinstance(evaluator.llm, LLM)
-        assert evaluator.llm.model == "gpt-4"
+        assert isinstance(evaluator.model_instance, LLM)
+        assert evaluator.model_instance.model == "gpt-4"

    def test_evaluate(self, crew_planner):
        task_output = TaskOutput(