mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-02 15:52:34 +00:00
Introduce Evaluator Experiment (#3133)
* feat: add exchanged messages in LLMCallCompletedEvent * feat: add GoalAlignment metric for Agent evaluation * feat: add SemanticQuality metric for Agent evaluation * feat: add Tool Metrics for Agent evaluation * feat: add Reasoning Metrics for Agent evaluation, still in progress * feat: add AgentEvaluator class This class will evaluate Agent' results and report to user * fix: do not evaluate Agent by default This is a experimental feature we still need refine it further * test: add Agent eval tests * fix: render all feedback per iteration * style: resolve linter issues * style: fix mypy issues * fix: allow messages be empty on LLMCallCompletedEvent * feat: add Experiment evaluation framework with baseline comparison * fix: reset evaluator for each experiement iteraction * fix: fix track of new test cases * chore: split Experimental evaluation classes * refactor: remove unused method * refactor: isolate Console print in a dedicated class * fix: make crew required to run an experiment * fix: use time-aware to define experiment result * test: add tests for Evaluator Experiment * style: fix linter issues * fix: encode string before hashing * style: resolve linter issues * feat: add experimental folder for beta features (#3141) * test: move tests to experimental folder
This commit is contained in:
125
src/crewai/experimental/evaluation/base_evaluator.py
Normal file
125
src/crewai/experimental/evaluation/base_evaluator.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import abc
|
||||
import enum
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.llm import BaseLLM
|
||||
from crewai.utilities.llm_utils import create_llm
|
||||
|
||||
class MetricCategory(enum.Enum):
|
||||
GOAL_ALIGNMENT = "goal_alignment"
|
||||
SEMANTIC_QUALITY = "semantic_quality"
|
||||
REASONING_EFFICIENCY = "reasoning_efficiency"
|
||||
TOOL_SELECTION = "tool_selection"
|
||||
PARAMETER_EXTRACTION = "parameter_extraction"
|
||||
TOOL_INVOCATION = "tool_invocation"
|
||||
|
||||
def title(self):
|
||||
return self.value.replace('_', ' ').title()
|
||||
|
||||
|
||||
class EvaluationScore(BaseModel):
|
||||
score: float | None = Field(
|
||||
default=5.0,
|
||||
description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
|
||||
ge=0.0,
|
||||
le=10.0
|
||||
)
|
||||
feedback: str = Field(
|
||||
default="",
|
||||
description="Detailed feedback explaining the evaluation score"
|
||||
)
|
||||
raw_response: str | None = Field(
|
||||
default=None,
|
||||
description="Raw response from the evaluator (e.g., LLM)"
|
||||
)
|
||||
|
||||
def __str__(self) -> str:
|
||||
if self.score is None:
|
||||
return f"Score: N/A - {self.feedback}"
|
||||
return f"Score: {self.score:.1f}/10 - {self.feedback}"
|
||||
|
||||
|
||||
class BaseEvaluator(abc.ABC):
|
||||
def __init__(self, llm: BaseLLM | None = None):
|
||||
self.llm: BaseLLM | None = create_llm(llm)
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def metric_category(self) -> MetricCategory:
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: Any,
|
||||
) -> EvaluationScore:
|
||||
pass
|
||||
|
||||
|
||||
class AgentEvaluationResult(BaseModel):
|
||||
agent_id: str = Field(description="ID of the evaluated agent")
|
||||
task_id: str = Field(description="ID of the task that was executed")
|
||||
metrics: Dict[MetricCategory, EvaluationScore] = Field(
|
||||
default_factory=dict,
|
||||
description="Evaluation scores for each metric category"
|
||||
)
|
||||
|
||||
|
||||
class AggregationStrategy(Enum):
|
||||
SIMPLE_AVERAGE = "simple_average" # Equal weight to all tasks
|
||||
WEIGHTED_BY_COMPLEXITY = "weighted_by_complexity" # Weight by task complexity
|
||||
BEST_PERFORMANCE = "best_performance" # Use best scores across tasks
|
||||
WORST_PERFORMANCE = "worst_performance" # Use worst scores across tasks
|
||||
|
||||
|
||||
class AgentAggregatedEvaluationResult(BaseModel):
|
||||
agent_id: str = Field(
|
||||
default="",
|
||||
description="ID of the agent"
|
||||
)
|
||||
agent_role: str = Field(
|
||||
default="",
|
||||
description="Role of the agent"
|
||||
)
|
||||
task_count: int = Field(
|
||||
default=0,
|
||||
description="Number of tasks included in this aggregation"
|
||||
)
|
||||
aggregation_strategy: AggregationStrategy = Field(
|
||||
default=AggregationStrategy.SIMPLE_AVERAGE,
|
||||
description="Strategy used for aggregation"
|
||||
)
|
||||
metrics: Dict[MetricCategory, EvaluationScore] = Field(
|
||||
default_factory=dict,
|
||||
description="Aggregated metrics across all tasks"
|
||||
)
|
||||
task_results: List[str] = Field(
|
||||
default_factory=list,
|
||||
description="IDs of tasks included in this aggregation"
|
||||
)
|
||||
overall_score: Optional[float] = Field(
|
||||
default=None,
|
||||
description="Overall score for this agent"
|
||||
)
|
||||
|
||||
def __str__(self) -> str:
|
||||
result = f"Agent Evaluation: {self.agent_role}\n"
|
||||
result += f"Strategy: {self.aggregation_strategy.value}\n"
|
||||
result += f"Tasks evaluated: {self.task_count}\n"
|
||||
|
||||
for category, score in self.metrics.items():
|
||||
result += f"\n\n- {category.value.upper()}: {score.score}/10\n"
|
||||
|
||||
if score.feedback:
|
||||
detailed_feedback = "\n ".join(score.feedback.split('\n'))
|
||||
result += f" {detailed_feedback}\n"
|
||||
|
||||
return result
|
||||
Reference in New Issue
Block a user