refactor: enhance LLM validation and error handling

Co-Authored-By: Joe Moura <joao@crewai.com>
Author: Devin AI
Date: 2025-02-09 22:22:45 +00:00
parent fafcd1d27a
commit 2dcaddd29f
3 changed files with 92 additions and 30 deletions

View File

@@ -1079,14 +1079,19 @@ class Crew(BaseModel):
         openai_model_name: Optional[str] = None,
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures.
+        """Test and evaluate the Crew with the given inputs for n iterations.
 
         Args:
             n_iterations: Number of iterations to run
             llm: LLM instance or model name to use for evaluation
             openai_model_name: (Deprecated) OpenAI model name for backward compatibility
             inputs: Optional inputs for the crew
+
+        Raises:
+            ValueError: If llm parameter is neither a string nor LLM instance
         """
+        if llm and not isinstance(llm, (str, LLM)):
+            raise ValueError("llm parameter must be either a string model name or LLM instance")
         test_crew = self.copy()
 
         # Handle backward compatibility
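
With this hunk, Crew.test() rejects anything that is not a string model name or an LLM instance before any work is done. A minimal usage sketch of the new contract (assuming a configured Crew instance named `crew`, that `LLM` is importable from the `crewai` package, and that the model name is only an example):

    from crewai import LLM

    crew.test(2, llm="gpt-4o")             # accepted: string model name
    crew.test(2, llm=LLM(model="gpt-4o"))  # accepted: LLM instance

    try:
        crew.test(2, llm=123)              # rejected before the crew is copied
    except ValueError as exc:
        print(exc)  # "llm parameter must be either a string model name or LLM instance"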

View File

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from typing import Optional, Union
 
@@ -13,6 +14,16 @@ from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
 
+
+class CrewEvaluationError(Exception):
+    """Raised when there is an error during crew evaluation."""
+    pass
+
+
+# Default values for evaluation metrics
+DEFAULT_TASK_SCORE = 9.0
+DEFAULT_EXECUTION_TIME = 60  # seconds
+
 
 class TaskEvaluationPydanticOutput(BaseModel):
     quality: float = Field(
         description="A score from 1 to 10 evaluating on completion, quality, and overall performance from the task_description and task_expected_output to the actual Task Output."
@@ -35,15 +46,23 @@ class CrewEvaluator:
     iteration: int = 0
 
     def __init__(self, crew, llm: Optional[Union[str, LLM]] = None):
+        """Initialize CrewEvaluator.
+
+        Args:
+            crew: The crew to evaluate
+            llm: LLM instance or model name for evaluation
+        """
         self.crew = crew
+        logging.info(f"Initializing CrewEvaluator with LLM: {llm}")
+
         # Initialize tasks_scores with default values to avoid division by zero
         self.tasks_scores = defaultdict(list)
         for i in range(1, len(crew.tasks) + 1):
-            self.tasks_scores[i] = [9.0]  # Default score of 9.0 for each task
+            self.tasks_scores[i] = [DEFAULT_TASK_SCORE]
         # Initialize run_execution_times with default values
         self.run_execution_times = defaultdict(list)
         for i in range(1, len(crew.tasks) + 1):
-            self.run_execution_times[i] = [60]  # Default execution time of 60 seconds
+            self.run_execution_times[i] = [DEFAULT_EXECUTION_TIME]
         self.llm = llm if isinstance(llm, LLM) else (
             LLM(model=llm) if isinstance(llm, str) else None
         )
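
The constructor now normalizes the llm argument and pre-seeds the score and execution-time maps with the new module-level defaults. A brief sketch of the resulting state (assuming `CrewEvaluator` and `LLM` are importable as in the hunk and `crew` is an existing Crew with two tasks; the commented-out values mirror the defaults above):

    evaluator = CrewEvaluator(crew, llm="gpt-4o")    # wrapped as LLM(model="gpt-4o")
    # CrewEvaluator(crew, llm=LLM(model="gpt-4o"))   # used as-is
    # CrewEvaluator(crew)                            # self.llm stays None

    # Pre-seeded defaults keep later averaging from dividing by zero:
    # evaluator.tasks_scores          -> {1: [9.0], 2: [9.0]}
    # evaluator.run_execution_times   -> {1: [60], 2: [60]}
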
@@ -169,35 +188,57 @@ class CrewEvaluator:
         console.print(table)
 
     def evaluate(self, task_output: TaskOutput):
-        """Evaluates the performance of the agents in the crew based on the tasks they have performed."""
-        current_task = None
-        for task in self.crew.tasks:
-            if task.description == task_output.description:
-                current_task = task
-                break
+        """Evaluates the performance of the agents in the crew based on the tasks they have performed.
+
+        Args:
+            task_output: The output from the task execution to evaluate
+
+        Raises:
+            CrewEvaluationError: If evaluation fails or produces unexpected results
+            ValueError: If required inputs are missing or invalid
+        """
+        try:
+            # Find the matching task
+            current_task = None
+            for task in self.crew.tasks:
+                if task.description == task_output.description:
+                    current_task = task
+                    break
 
-        if not current_task or not task_output:
-            raise ValueError(
-                "Task to evaluate and task output are required for evaluation"
-            )
+            if not current_task or not task_output:
+                raise ValueError(
+                    "Task to evaluate and task output are required for evaluation"
+                )
 
-        evaluator_agent = self._evaluator_agent()
-        evaluation_task = self._evaluation_task(
-            evaluator_agent, current_task, task_output.raw
-        )
+            # Create and execute evaluation task
+            evaluator_agent = self._evaluator_agent()
+            evaluation_task = self._evaluation_task(
+                evaluator_agent, current_task, task_output.raw
+            )
 
-        evaluation_result = evaluation_task.execute_sync()
+            logging.info(f"Evaluating task: {current_task.description}")
+            evaluation_result = evaluation_task.execute_sync()
 
-        if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
-            self._test_result_span = self._telemetry.individual_test_result_span(
-                self.crew,
-                evaluation_result.pydantic.quality,
-                current_task._execution_time,
-                str(self.llm.model if self.llm else None),
-            )
-            self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
-            self.run_execution_times[self.iteration].append(
-                current_task._execution_time
-            )
-        else:
-            raise ValueError("Evaluation result is not in the expected format")
+            # Process evaluation results
+            if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
+                self._test_result_span = self._telemetry.individual_test_result_span(
+                    self.crew,
+                    evaluation_result.pydantic.quality,
+                    current_task._execution_time,
+                    str(self.llm.model if self.llm else None),
+                )
+                self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
+                self.run_execution_times[self.iteration].append(
+                    current_task._execution_time
+                )
+                logging.info(f"Task evaluation completed with score: {evaluation_result.pydantic.quality}")
+            else:
+                raise CrewEvaluationError("Evaluation result is not in the expected format")
+
+        except ValueError as e:
+            logging.error(f"Invalid input for task evaluation: {e}")
+            raise
+
+        except Exception as e:
+            logging.error(f"Error during task evaluation: {e}")
+            raise CrewEvaluationError(f"Failed to evaluate task: {e}")
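
Callers of evaluate() now see two failure modes: invalid inputs are logged and re-raised as ValueError, while every other failure is wrapped in CrewEvaluationError. A hedged sketch of handling both (assuming an `evaluator` and a finished `task_output` already exist):

    import logging

    try:
        evaluator.evaluate(task_output)
    except ValueError:
        # Missing task/output: logged inside evaluate() and re-raised unchanged.
        logging.exception("Evaluation skipped: invalid input")
    except CrewEvaluationError:
        # LLM failures or unexpected result shapes are wrapped in this type.
        logging.exception("Evaluation failed")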

View File

@@ -2848,6 +2848,22 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
     )
 
 
+@mock.patch("crewai.crew.CrewEvaluator")
+@mock.patch("crewai.crew.Crew.copy")
+@mock.patch("crewai.crew.Crew.kickoff")
+def test_crew_testing_with_invalid_llm(kickoff_mock, copy_mock, crew_evaluator_mock):
+    """Test that Crew.test() properly validates LLM input."""
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        agent=researcher,
+    )
+    crew = Crew(agents=[researcher], tasks=[task])
+
+    with pytest.raises(ValueError, match="llm parameter must be either"):
+        crew.test(2, llm=123)  # Invalid type
+
+
 @mock.patch("crewai.crew.CrewEvaluator")
 @mock.patch("crewai.crew.Crew.copy")
 @mock.patch("crewai.crew.Crew.kickoff")