import abc
import enum
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field

from crewai.agent import Agent
from crewai.task import Task
from crewai.llm import BaseLLM
from crewai.utilities.llm_utils import create_llm


class MetricCategory(enum.Enum):
    GOAL_ALIGNMENT = "goal_alignment"
    SEMANTIC_QUALITY = "semantic_quality"
    REASONING_EFFICIENCY = "reasoning_efficiency"
    TOOL_SELECTION = "tool_selection"
    PARAMETER_EXTRACTION = "parameter_extraction"
    TOOL_INVOCATION = "tool_invocation"

    def title(self):
        return self.value.replace('_', ' ').title()


class EvaluationScore(BaseModel):
    score: float | None = Field(
        default=5.0,
        description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
        ge=0.0,
        le=10.0
    )
    feedback: str = Field(
        default="",
        description="Detailed feedback explaining the evaluation score"
    )
    raw_response: str | None = Field(
        default=None,
        description="Raw response from the evaluator (e.g., LLM)"
    )

    def __str__(self) -> str:
        if self.score is None:
            return f"Score: N/A - {self.feedback}"
        return f"Score: {self.score:.1f}/10 - {self.feedback}"


class BaseEvaluator(abc.ABC):
    def __init__(self, llm: BaseLLM | None = None):
        self.llm: BaseLLM | None = create_llm(llm)

    @property
    @abc.abstractmethod
    def metric_category(self) -> MetricCategory:
        pass

    @abc.abstractmethod
    def evaluate(
        self,
        agent: Agent,
        execution_trace: Dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        pass


class AgentEvaluationResult(BaseModel):
    agent_id: str = Field(description="ID of the evaluated agent")
    task_id: str = Field(description="ID of the task that was executed")
    metrics: Dict[MetricCategory, EvaluationScore] = Field(
        default_factory=dict,
        description="Evaluation scores for each metric category"
    )


class AggregationStrategy(Enum):
    SIMPLE_AVERAGE = "simple_average"  # Equal weight to all tasks
    WEIGHTED_BY_COMPLEXITY = "weighted_by_complexity"  # Weight by task complexity
    BEST_PERFORMANCE = "best_performance"  # Use best scores across tasks
    WORST_PERFORMANCE = "worst_performance"  # Use worst scores across tasks


class AgentAggregatedEvaluationResult(BaseModel):
    agent_id: str = Field(
        default="",
        description="ID of the agent"
    )
    agent_role: str = Field(
        default="",
        description="Role of the agent"
    )
    task_count: int = Field(
        default=0,
        description="Number of tasks included in this aggregation"
    )
    aggregation_strategy: AggregationStrategy = Field(
        default=AggregationStrategy.SIMPLE_AVERAGE,
        description="Strategy used for aggregation"
    )
    metrics: Dict[MetricCategory, EvaluationScore] = Field(
        default_factory=dict,
        description="Aggregated metrics across all tasks"
    )
    task_results: List[str] = Field(
        default_factory=list,
        description="IDs of tasks included in this aggregation"
    )
    overall_score: Optional[float] = Field(
        default=None,
        description="Overall score for this agent"
    )

    def __str__(self) -> str:
        result = f"Agent Evaluation: {self.agent_role}\n"
        result += f"Strategy: {self.aggregation_strategy.value}\n"
        result += f"Tasks evaluated: {self.task_count}\n"

        for category, score in self.metrics.items():
            result += f"\n\n- {category.value.upper()}: {score.score}/10\n"
            if score.feedback:
                detailed_feedback = "\n ".join(score.feedback.split('\n'))
                result += f" {detailed_feedback}\n"

        return result
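

# ---------------------------------------------------------------------------
# Illustrative sketch only: a minimal concrete BaseEvaluator subclass showing
# how the abstract interface above is meant to be implemented. The class name,
# prompt text, and score parsing are hypothetical assumptions for illustration,
# not the library's actual evaluators; it also assumes BaseLLM.call accepts a
# plain prompt string.
# ---------------------------------------------------------------------------
class _ExampleGoalAlignmentEvaluator(BaseEvaluator):
    """Sketch: scores how well the agent's final output matches its goal."""

    @property
    def metric_category(self) -> MetricCategory:
        return MetricCategory.GOAL_ALIGNMENT

    def evaluate(
        self,
        agent: Agent,
        execution_trace: Dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        # Without an evaluator LLM there is nothing to judge with, so mark the
        # metric as not applicable rather than inventing a score.
        if self.llm is None:
            return EvaluationScore(score=None, feedback="No evaluator LLM configured")

        prompt = (
            f"Agent goal: {agent.goal}\n"
            f"Final output: {final_output}\n"
            "Rate goal alignment from 0 to 10 and explain briefly."
        )
        raw = str(self.llm.call(prompt))

        # Naive parsing: take the first number in the response as the score,
        # clamped to the 0-10 range expected by EvaluationScore.
        import re
        match = re.search(r"\d+(\.\d+)?", raw)
        score = min(float(match.group()), 10.0) if match else None
        return EvaluationScore(score=score, feedback=raw, raw_response=raw)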