refactor: enhance LLM validation and error handling

Co-Authored-By: Joe Moura <joao@crewai.com>
Author: Devin AI
Date: 2025-02-09 22:22:45 +00:00
parent fafcd1d27a
commit 2dcaddd29f
3 changed files with 92 additions and 30 deletions

View File

@@ -1079,14 +1079,19 @@ class Crew(BaseModel):
openai_model_name: Optional[str] = None,
inputs: Optional[Dict[str, Any]] = None,
) -> None:
"""Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures.
"""Test and evaluate the Crew with the given inputs for n iterations.
Args:
n_iterations: Number of iterations to run
llm: LLM instance or model name to use for evaluation
openai_model_name: (Deprecated) OpenAI model name for backward compatibility
inputs: Optional inputs for the crew
Raises:
ValueError: If llm parameter is neither a string nor LLM instance
"""
if llm and not isinstance(llm, (str, LLM)):
raise ValueError("llm parameter must be either a string model name or LLM instance")
test_crew = self.copy()
# Handle backward compatibility
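The guard added above is easiest to read in isolation. The following is a minimal, self-contained sketch of the same check, using a stand-in LLM class instead of crewai's real one so it runs on its own; the model names are illustrative, and only the error message mirrors the commit.

from typing import Optional, Union

class LLM:
    """Stand-in for crewai's LLM class, reduced to what the check needs."""
    def __init__(self, model: str):
        self.model = model

def validate_llm(llm: Optional[Union[str, LLM]] = None) -> None:
    # Mirrors the guard added to Crew.test(): only a model-name string or an LLM instance passes.
    if llm and not isinstance(llm, (str, LLM)):
        raise ValueError("llm parameter must be either a string model name or LLM instance")

validate_llm("gpt-4o-mini")             # accepted: string model name (name is illustrative)
validate_llm(LLM(model="gpt-4o-mini"))  # accepted: LLM instance
validate_llm(None)                      # accepted: falls through to the default behaviour
try:
    validate_llm(123)                   # rejected: neither str nor LLM
except ValueError as exc:
    print(exc)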

View File

@@ -1,3 +1,4 @@
import logging
from collections import defaultdict
from typing import Optional, Union
@@ -13,6 +14,16 @@ from crewai.tasks.task_output import TaskOutput
from crewai.telemetry import Telemetry
class CrewEvaluationError(Exception):
"""Raised when there is an error during crew evaluation."""
pass
# Default values for evaluation metrics
DEFAULT_TASK_SCORE = 9.0
DEFAULT_EXECUTION_TIME = 60 # seconds
class TaskEvaluationPydanticOutput(BaseModel):
quality: float = Field(
description="A score from 1 to 10 evaluating on completion, quality, and overall performance from the task_description and task_expected_output to the actual Task Output."
@@ -35,15 +46,23 @@ class CrewEvaluator:
iteration: int = 0
def __init__(self, crew, llm: Optional[Union[str, LLM]] = None):
"""Initialize CrewEvaluator.
Args:
crew: The crew to evaluate
llm: LLM instance or model name for evaluation
"""
self.crew = crew
logging.info(f"Initializing CrewEvaluator with LLM: {llm}")
# Initialize tasks_scores with default values to avoid division by zero
self.tasks_scores = defaultdict(list)
for i in range(1, len(crew.tasks) + 1):
-self.tasks_scores[i] = [9.0] # Default score of 9.0 for each task
+self.tasks_scores[i] = [DEFAULT_TASK_SCORE]
# Initialize run_execution_times with default values
self.run_execution_times = defaultdict(list)
for i in range(1, len(crew.tasks) + 1):
-self.run_execution_times[i] = [60] # Default execution time of 60 seconds
+self.run_execution_times[i] = [DEFAULT_EXECUTION_TIME]
self.llm = llm if isinstance(llm, LLM) else (
LLM(model=llm) if isinstance(llm, str) else None
)
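The assignment above normalizes whatever the caller passed: an LLM instance is kept, a string becomes LLM(model=...), and anything else collapses to None so a default can take over. A standalone sketch of that pattern, again with a stand-in LLM class so the snippet runs outside crewai:

from typing import Optional, Union

class LLM:
    """Stand-in for crewai's LLM class; only the model name matters here."""
    def __init__(self, model: str):
        self.model = model

def resolve_llm(llm: Optional[Union[str, LLM]]) -> Optional[LLM]:
    # Same shape as the ternary in CrewEvaluator.__init__, written out as branches.
    if isinstance(llm, LLM):
        return llm                 # already an LLM instance: use as-is
    if isinstance(llm, str):
        return LLM(model=llm)      # model name: wrap it
    return None                    # nothing usable: let the evaluator fall back to its default

print(resolve_llm("gpt-4o-mini").model)      # gpt-4o-mini (model name is illustrative)
print(resolve_llm(LLM(model="any")).model)   # any
print(resolve_llm(None))                     # None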
@@ -169,7 +188,17 @@ class CrewEvaluator:
console.print(table)
def evaluate(self, task_output: TaskOutput):
"""Evaluates the performance of the agents in the crew based on the tasks they have performed."""
"""Evaluates the performance of the agents in the crew based on the tasks they have performed.
Args:
task_output: The output from the task execution to evaluate
Raises:
CrewEvaluationError: If evaluation fails or produces unexpected results
ValueError: If required inputs are missing or invalid
"""
try:
# Find the matching task
current_task = None
for task in self.crew.tasks:
if task.description == task_output.description:
@@ -181,13 +210,16 @@ class CrewEvaluator:
"Task to evaluate and task output are required for evaluation"
)
# Create and execute evaluation task
evaluator_agent = self._evaluator_agent()
evaluation_task = self._evaluation_task(
evaluator_agent, current_task, task_output.raw
)
logging.info(f"Evaluating task: {current_task.description}")
evaluation_result = evaluation_task.execute_sync()
# Process evaluation results
if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
self._test_result_span = self._telemetry.individual_test_result_span(
self.crew,
@@ -199,5 +231,14 @@ class CrewEvaluator:
self.run_execution_times[self.iteration].append(
current_task._execution_time
)
logging.info(f"Task evaluation completed with score: {evaluation_result.pydantic.quality}")
else:
-raise ValueError("Evaluation result is not in the expected format")
+raise CrewEvaluationError("Evaluation result is not in the expected format")
except ValueError as e:
logging.error(f"Invalid input for task evaluation: {e}")
raise
except Exception as e:
logging.error(f"Error during task evaluation: {e}")
raise CrewEvaluationError(f"Failed to evaluate task: {e}")
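With the new exception, callers can tell an evaluation failure apart from bad input, which still surfaces as ValueError. A sketch of how calling code might react; run_one_evaluation is hypothetical, and only the CrewEvaluationError name and docstring come from the commit.

import logging

class CrewEvaluationError(Exception):
    """Raised when there is an error during crew evaluation."""

def run_one_evaluation() -> float:
    # Hypothetical stand-in for CrewEvaluator.evaluate(); always fails here for demonstration.
    raise CrewEvaluationError("Failed to evaluate task: model returned no structured output")

try:
    score = run_one_evaluation()
except CrewEvaluationError as exc:
    # Evaluation-specific failures can be logged and the run can continue on the seeded defaults,
    # while a ValueError (bad input) would still propagate to the caller untouched.
    logging.error("Skipping this task's score: %s", exc)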

View File

@@ -2848,6 +2848,22 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
)
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_with_invalid_llm(kickoff_mock, copy_mock, crew_evaluator_mock):
"""Test that Crew.test() properly validates LLM input."""
task = Task(
description="Test task",
expected_output="Test output",
agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[task])
with pytest.raises(ValueError, match="llm parameter must be either"):
crew.test(2, llm=123) # Invalid type
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")