Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-08 15:48:29 +00:00)
* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool Metrics for Agent evaluation
* feat: add Reasoning Metrics for Agent evaluation (still in progress)
* feat: add AgentEvaluator class, which evaluates an Agent's results and reports them to the user
* fix: do not evaluate Agents by default; this is an experimental feature that still needs further refinement
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
179 lines · 6.7 KiB · Python
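"""Agent evaluation utilities: AgentEvaluator scores each agent's task executions with a
configurable set of metric evaluators and aggregates the results across iterations."""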
from collections import defaultdict
from collections.abc import Sequence
from typing import Any

from crewai.agent import Agent
from crewai.crew import Crew
from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
from crewai.task import Task
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter

class AgentEvaluator:
    """Evaluates agents' task executions with a set of metric evaluators.

    Results are collected per iteration so repeated runs of the same crew can be
    compared and aggregated.
    """

    def __init__(
        self,
        evaluators: Sequence[BaseEvaluator] | None = None,
        crew: Crew | None = None,
    ):
        self.crew: Crew | None = crew
        self.evaluators: Sequence[BaseEvaluator] | None = evaluators

        # Map each agent id to the evaluators that should score it.
        self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
        if crew is not None:
            assert crew.agents is not None
            for agent in crew.agents:
                self.agent_evaluators[str(agent.id)] = self.evaluators

        self.callback = create_evaluation_callbacks()
        self.console_formatter = ConsoleFormatter()
        self.display_formatter = EvaluationDisplayFormatter()

        self.iteration = 1
        self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}
    def set_iteration(self, iteration: int) -> None:
        self.iteration = iteration
    def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
        """Evaluate every agent/task pair of the crew for the current iteration."""
        if not self.crew:
            raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")

        if not self.callback:
            raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")

        from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn

        evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)

        # Count how many agent/task pairs will actually be evaluated so the
        # progress bar has an accurate total.
        total_evals = 0
        for agent in self.crew.agents:
            for task in self.crew.tasks:
                if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
                    total_evals += 1

        with Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}[/bold blue]"),
            BarColumn(),
            TextColumn("{task.percentage:.0f}% completed"),
            console=self.console_formatter.console,
        ) as progress:
            eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=total_evals)

            for agent in self.crew.agents:
                evaluators = self.agent_evaluators.get(str(agent.id))
                if not evaluators:
                    continue

                for task in self.crew.tasks:
                    if task.agent and str(task.agent.id) != str(agent.id):
                        continue

                    trace = self.callback.get_trace(str(agent.id), str(task.id))
                    if not trace:
                        self.console_formatter.print(
                            f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]"
                        )
                        progress.update(eval_task, advance=1)
                        continue

                    with crewai_event_bus.scoped_handlers():
                        result = self.evaluate(
                            agent=agent,
                            task=task,
                            execution_trace=trace,
                            final_output=task.output,
                        )

                    evaluation_results[agent.role].append(result)
                    progress.update(eval_task, advance=1)

        self.iterations_results[self.iteration] = evaluation_results
        return evaluation_results
    def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
        if self.iteration in self.iterations_results:
            return self.iterations_results[self.iteration]

        return self.evaluate_current_iteration()

    def display_results_with_iterations(self) -> None:
        self.display_formatter.display_summary_results(self.iterations_results)
    def get_agent_evaluation(
        self,
        strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE,
        include_evaluation_feedback: bool = False,
    ):
        """Aggregate each agent's results for the current iteration and display them."""
        agent_results = {}
        with crewai_event_bus.scoped_handlers():
            task_results = self.get_evaluation_results()
            for agent_role, results in task_results.items():
                if not results:
                    continue

                agent_id = results[0].agent_id

                aggregated_result = self.display_formatter._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=results,
                    strategy=strategy,
                )

                agent_results[agent_role] = aggregated_result

        if self.iteration == max(self.iterations_results.keys()):
            self.display_results_with_iterations()

        if include_evaluation_feedback:
            self.display_evaluation_with_feedback()

        return agent_results
    def display_evaluation_with_feedback(self) -> None:
        self.display_formatter.display_evaluation_with_feedback(self.iterations_results)
    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: dict[str, Any],
        final_output: Any,
    ) -> AgentEvaluationResult:
        """Run every configured evaluator against a single agent/task execution."""
        result = AgentEvaluationResult(
            agent_id=str(agent.id),
            task_id=str(task.id),
        )
        assert self.evaluators is not None
        for evaluator in self.evaluators:
            try:
                score = evaluator.evaluate(
                    agent=agent,
                    task=task,
                    execution_trace=execution_trace,
                    final_output=final_output,
                )
                result.metrics[evaluator.metric_category] = score
            except Exception as e:
                # A failing evaluator should not abort the remaining metrics.
                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {e}")

        return result
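# Note: as exercised in AgentEvaluator.evaluate above, each evaluator is expected to
# expose a `metric_category` attribute and an `evaluate(agent, task, execution_trace,
# final_output)` method; see BaseEvaluator (imported above from crewai.evaluation)
# for the base interface.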
def create_default_evaluator(crew, llm=None) -> AgentEvaluator:
    """Build an AgentEvaluator wired with the full default set of metric evaluators."""
    from crewai.evaluation import (
        GoalAlignmentEvaluator,
        ParameterExtractionEvaluator,
        ReasoningEfficiencyEvaluator,
        SemanticQualityEvaluator,
        ToolInvocationEvaluator,
        ToolSelectionEvaluator,
    )

    evaluators = [
        GoalAlignmentEvaluator(llm=llm),
        SemanticQualityEvaluator(llm=llm),
        ToolSelectionEvaluator(llm=llm),
        ParameterExtractionEvaluator(llm=llm),
        ToolInvocationEvaluator(llm=llm),
        ReasoningEfficiencyEvaluator(llm=llm),
    ]

    return AgentEvaluator(evaluators=evaluators, crew=crew)
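A minimal usage sketch follows. It assumes that `create_default_evaluator` is importable from this module (the path shown is an assumption), that the callbacks created in `AgentEvaluator.__init__` capture execution traces while the crew runs, and that an LLM is configured for crewAI in the environment; the agent and task below are placeholders for illustration only.

from crewai import Agent, Crew, Task
from crewai.evaluation.agent_evaluator import create_default_evaluator  # assumed module path

# Placeholder agent and task purely for illustration.
researcher = Agent(role="Researcher", goal="Summarize a topic", backstory="An experienced analyst")
summary_task = Task(
    description="Summarize the benefits of unit testing",
    expected_output="A short paragraph",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[summary_task])

evaluator = create_default_evaluator(crew=crew)  # wires the six default metric evaluators
crew.kickoff()                                   # run the crew; traces are recorded by the evaluation callbacks
results = evaluator.get_agent_evaluation()       # scores the captured traces and displays a summary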