Files
crewAI/src/crewai/experimental/evaluation/base_evaluator.py
Devin AI 3619d4dc50 fix: Replace deprecated typing imports with built-in types
- Replace Dict, List, Set, Tuple with dict, list, set, tuple throughout codebase
- Add missing type annotations to crew_events.py methods
- Add proper type annotations to test_crew_cancellation.py
- Use type: ignore[method-assign] comments for mock assignments
- Maintain backward compatibility while modernizing type hints

This resolves lint and type-checker failures in CI while preserving
the cancellation functionality.

Co-Authored-By: João <joao@crewai.com>
2025-09-04 02:07:02 +00:00
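
For context, a hedged sketch of the kind of change the commit message describes (the function and mock names below are hypothetical and not taken from the actual diff):

# Before: deprecated typing aliases
from typing import Dict, List

def average_scores(results: List[float]) -> Dict[str, float]: ...

# After: built-in generic types (PEP 585)
def average_scores(results: list[float]) -> dict[str, float]: ...

# Mock assignment in a test, with a targeted ignore for the type checker
from unittest.mock import MagicMock
crew.kickoff = MagicMock(return_value=None)  # type: ignore[method-assign]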

126 lines
3.8 KiB
Python

import abc
import enum
from enum import Enum
from typing import Any, Optional

from pydantic import BaseModel, Field

from crewai.agent import Agent
from crewai.task import Task
from crewai.llm import BaseLLM
from crewai.utilities.llm_utils import create_llm


class MetricCategory(enum.Enum):
    GOAL_ALIGNMENT = "goal_alignment"
    SEMANTIC_QUALITY = "semantic_quality"
    REASONING_EFFICIENCY = "reasoning_efficiency"
    TOOL_SELECTION = "tool_selection"
    PARAMETER_EXTRACTION = "parameter_extraction"
    TOOL_INVOCATION = "tool_invocation"

    def title(self):
        return self.value.replace('_', ' ').title()


class EvaluationScore(BaseModel):
    score: float | None = Field(
        default=5.0,
        description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
        ge=0.0,
        le=10.0
    )
    feedback: str = Field(
        default="",
        description="Detailed feedback explaining the evaluation score"
    )
    raw_response: str | None = Field(
        default=None,
        description="Raw response from the evaluator (e.g., LLM)"
    )

    def __str__(self) -> str:
        if self.score is None:
            return f"Score: N/A - {self.feedback}"
        return f"Score: {self.score:.1f}/10 - {self.feedback}"


class BaseEvaluator(abc.ABC):
    def __init__(self, llm: BaseLLM | None = None):
        self.llm: BaseLLM | None = create_llm(llm)

    @property
    @abc.abstractmethod
    def metric_category(self) -> MetricCategory:
        pass

    @abc.abstractmethod
    def evaluate(
        self,
        agent: Agent,
        execution_trace: dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        pass


class AgentEvaluationResult(BaseModel):
    agent_id: str = Field(description="ID of the evaluated agent")
    task_id: str = Field(description="ID of the task that was executed")
    metrics: dict[MetricCategory, EvaluationScore] = Field(
        default_factory=dict,
        description="Evaluation scores for each metric category"
    )


class AggregationStrategy(Enum):
    SIMPLE_AVERAGE = "simple_average"  # Equal weight to all tasks
    WEIGHTED_BY_COMPLEXITY = "weighted_by_complexity"  # Weight by task complexity
    BEST_PERFORMANCE = "best_performance"  # Use best scores across tasks
    WORST_PERFORMANCE = "worst_performance"  # Use worst scores across tasks


class AgentAggregatedEvaluationResult(BaseModel):
    agent_id: str = Field(
        default="",
        description="ID of the agent"
    )
    agent_role: str = Field(
        default="",
        description="Role of the agent"
    )
    task_count: int = Field(
        default=0,
        description="Number of tasks included in this aggregation"
    )
    aggregation_strategy: AggregationStrategy = Field(
        default=AggregationStrategy.SIMPLE_AVERAGE,
        description="Strategy used for aggregation"
    )
    metrics: dict[MetricCategory, EvaluationScore] = Field(
        default_factory=dict,
        description="Aggregated metrics across all tasks"
    )
    task_results: list[str] = Field(
        default_factory=list,
        description="IDs of tasks included in this aggregation"
    )
    overall_score: Optional[float] = Field(
        default=None,
        description="Overall score for this agent"
    )

    def __str__(self) -> str:
        result = f"Agent Evaluation: {self.agent_role}\n"
        result += f"Strategy: {self.aggregation_strategy.value}\n"
        result += f"Tasks evaluated: {self.task_count}\n"

        for category, score in self.metrics.items():
            result += f"\n\n- {category.value.upper()}: {score.score}/10\n"
            if score.feedback:
                detailed_feedback = "\n ".join(score.feedback.split('\n'))
                result += f" {detailed_feedback}\n"

        return result
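
As a hedged usage sketch (this subclass is illustrative and not part of the file), a concrete evaluator only needs to declare its metric category and implement evaluate():

class SemanticQualityEvaluator(BaseEvaluator):
    # Hypothetical subclass: scores output quality for the SEMANTIC_QUALITY metric.
    @property
    def metric_category(self) -> MetricCategory:
        return MetricCategory.SEMANTIC_QUALITY

    def evaluate(
        self,
        agent: Agent,
        execution_trace: dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        # A real implementation would prompt self.llm with the trace and output;
        # this placeholder just returns a neutral score.
        return EvaluationScore(score=5.0, feedback="Placeholder evaluation")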