mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-02 07:42:40 +00:00
Supporting eval single Agent/LiteAgent (#3167)
* refactor: rely on task completion event to evaluate agents * feat: remove Crew dependency to evaluate agent * feat: drop execution_context in AgentEvaluator * chore: drop experimental Agent Eval feature from stable crew.test * feat: support eval LiteAgent * resolve linter issues
This commit is contained in:
@@ -3,32 +3,32 @@ from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
|
||||
|
||||
from typing import Any, Dict
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
|
||||
from collections.abc import Sequence
|
||||
from crewai.crew import Crew
|
||||
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
|
||||
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
|
||||
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
|
||||
from contextlib import contextmanager
|
||||
from crewai.utilities.events.task_events import TaskCompletedEvent
|
||||
from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
|
||||
from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult
|
||||
import threading
|
||||
|
||||
class ExecutionState:
|
||||
def __init__(self):
|
||||
self.traces: dict[str, Any] = {}
|
||||
self.current_agent_id: str | None = None
|
||||
self.current_task_id: str | None = None
|
||||
self.iteration: int = 1
|
||||
self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}
|
||||
self.traces = {}
|
||||
self.current_agent_id = None
|
||||
self.current_task_id = None
|
||||
self.iteration = 1
|
||||
self.iterations_results = {}
|
||||
self.agent_evaluators = {}
|
||||
|
||||
class AgentEvaluator:
|
||||
def __init__(
|
||||
self,
|
||||
agents: list[Agent],
|
||||
evaluators: Sequence[BaseEvaluator] | None = None,
|
||||
crew: Crew | None = None,
|
||||
):
|
||||
self.crew: Crew | None = crew
|
||||
self.agents: list[Agent] = agents
|
||||
self.evaluators: Sequence[BaseEvaluator] | None = evaluators
|
||||
|
||||
self.callback = create_evaluation_callbacks()
|
||||
@@ -37,19 +37,10 @@ class AgentEvaluator:
|
||||
|
||||
self._thread_local: threading.local = threading.local()
|
||||
|
||||
self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
|
||||
if crew is not None:
|
||||
assert crew and crew.agents is not None
|
||||
for agent in crew.agents:
|
||||
self.agent_evaluators[str(agent.id)] = self.evaluators
|
||||
for agent in self.agents:
|
||||
self._execution_state.agent_evaluators[str(agent.id)] = self.evaluators
|
||||
|
||||
@contextmanager
|
||||
def execution_context(self):
|
||||
state = ExecutionState()
|
||||
try:
|
||||
yield state
|
||||
finally:
|
||||
pass
|
||||
self._subscribe_to_events()
|
||||
|
||||
@property
|
||||
def _execution_state(self) -> ExecutionState:
|
||||
@@ -57,81 +48,95 @@ class AgentEvaluator:
|
||||
self._thread_local.execution_state = ExecutionState()
|
||||
return self._thread_local.execution_state
|
||||
|
||||
def _subscribe_to_events(self) -> None:
|
||||
crewai_event_bus.register_handler(TaskCompletedEvent, self._handle_task_completed)
|
||||
crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, self._handle_lite_agent_completed)
|
||||
|
||||
def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
|
||||
assert event.task is not None
|
||||
agent = event.task.agent
|
||||
if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
|
||||
state = ExecutionState()
|
||||
state.current_agent_id = str(agent.id)
|
||||
state.current_task_id = str(event.task.id)
|
||||
|
||||
trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
|
||||
|
||||
if not trace:
|
||||
return
|
||||
|
||||
result = self.evaluate(
|
||||
agent=agent,
|
||||
task=event.task,
|
||||
execution_trace=trace,
|
||||
final_output=event.output,
|
||||
state=state
|
||||
)
|
||||
|
||||
current_iteration = self._execution_state.iteration
|
||||
if current_iteration not in self._execution_state.iterations_results:
|
||||
self._execution_state.iterations_results[current_iteration] = {}
|
||||
|
||||
if agent.role not in self._execution_state.iterations_results[current_iteration]:
|
||||
self._execution_state.iterations_results[current_iteration][agent.role] = []
|
||||
|
||||
self._execution_state.iterations_results[current_iteration][agent.role].append(result)
|
||||
|
||||
def _handle_lite_agent_completed(self, source: object, event: LiteAgentExecutionCompletedEvent) -> None:
|
||||
agent_info = event.agent_info
|
||||
agent_id = str(agent_info["id"])
|
||||
|
||||
if agent_id in self._execution_state.agent_evaluators:
|
||||
state = ExecutionState()
|
||||
state.current_agent_id = agent_id
|
||||
state.current_task_id = "lite_task"
|
||||
|
||||
target_agent = None
|
||||
for agent in self.agents:
|
||||
if str(agent.id) == agent_id:
|
||||
target_agent = agent
|
||||
break
|
||||
|
||||
if not target_agent:
|
||||
return
|
||||
|
||||
trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
|
||||
|
||||
if not trace:
|
||||
return
|
||||
|
||||
result = self.evaluate(
|
||||
agent=target_agent,
|
||||
execution_trace=trace,
|
||||
final_output=event.output,
|
||||
state=state
|
||||
)
|
||||
|
||||
current_iteration = self._execution_state.iteration
|
||||
if current_iteration not in self._execution_state.iterations_results:
|
||||
self._execution_state.iterations_results[current_iteration] = {}
|
||||
|
||||
agent_role = target_agent.role
|
||||
if agent_role not in self._execution_state.iterations_results[current_iteration]:
|
||||
self._execution_state.iterations_results[current_iteration][agent_role] = []
|
||||
|
||||
self._execution_state.iterations_results[current_iteration][agent_role].append(result)
|
||||
|
||||
def set_iteration(self, iteration: int) -> None:
|
||||
self._execution_state.iteration = iteration
|
||||
|
||||
def reset_iterations_results(self) -> None:
|
||||
self._execution_state.iterations_results = {}
|
||||
|
||||
def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
|
||||
if not self.crew:
|
||||
raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
|
||||
|
||||
if not self.callback:
|
||||
raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")
|
||||
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
|
||||
evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)
|
||||
|
||||
total_evals = 0
|
||||
for agent in self.crew.agents:
|
||||
for task in self.crew.tasks:
|
||||
if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
|
||||
total_evals += 1
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[bold blue]{task.description}[/bold blue]"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.percentage:.0f}% completed"),
|
||||
console=self.console_formatter.console
|
||||
) as progress:
|
||||
eval_task = progress.add_task(f"Evaluating agents (iteration {self._execution_state.iteration})...", total=total_evals)
|
||||
|
||||
with self.execution_context() as state:
|
||||
state.iteration = self._execution_state.iteration
|
||||
|
||||
for agent in self.crew.agents:
|
||||
evaluator = self.agent_evaluators.get(str(agent.id))
|
||||
if not evaluator:
|
||||
continue
|
||||
|
||||
for task in self.crew.tasks:
|
||||
if task.agent and str(task.agent.id) != str(agent.id):
|
||||
continue
|
||||
|
||||
trace = self.callback.get_trace(str(agent.id), str(task.id))
|
||||
if not trace:
|
||||
self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
|
||||
progress.update(eval_task, advance=1)
|
||||
continue
|
||||
|
||||
state.current_agent_id = str(agent.id)
|
||||
state.current_task_id = str(task.id)
|
||||
|
||||
with crewai_event_bus.scoped_handlers():
|
||||
result = self.evaluate(
|
||||
agent=agent,
|
||||
task=task,
|
||||
execution_trace=trace,
|
||||
final_output=task.output,
|
||||
state=state
|
||||
)
|
||||
evaluation_results[agent.role].append(result)
|
||||
progress.update(eval_task, advance=1)
|
||||
|
||||
self._execution_state.iterations_results[self._execution_state.iteration] = evaluation_results
|
||||
return evaluation_results
|
||||
|
||||
def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
|
||||
if self._execution_state.iteration in self._execution_state.iterations_results:
|
||||
if self._execution_state.iterations_results and self._execution_state.iteration in self._execution_state.iterations_results:
|
||||
return self._execution_state.iterations_results[self._execution_state.iteration]
|
||||
return self.evaluate_current_iteration()
|
||||
return {}
|
||||
|
||||
def display_results_with_iterations(self) -> None:
|
||||
self.display_formatter.display_summary_results(self._execution_state.iterations_results)
|
||||
|
||||
def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> Dict[str, AgentAggregatedEvaluationResult]:
|
||||
def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = True) -> dict[str, AgentAggregatedEvaluationResult]:
|
||||
agent_results = {}
|
||||
with crewai_event_bus.scoped_handlers():
|
||||
task_results = self.get_evaluation_results()
|
||||
@@ -165,14 +170,14 @@ class AgentEvaluator:
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: dict[str, Any],
|
||||
final_output: Any,
|
||||
state: ExecutionState
|
||||
state: ExecutionState,
|
||||
task: Task | None = None,
|
||||
) -> AgentEvaluationResult:
|
||||
result = AgentEvaluationResult(
|
||||
agent_id=state.current_agent_id or str(agent.id),
|
||||
task_id=state.current_task_id or str(task.id)
|
||||
task_id=state.current_task_id or (str(task.id) if task else "unknown_task")
|
||||
)
|
||||
|
||||
assert self.evaluators is not None
|
||||
@@ -190,7 +195,7 @@ class AgentEvaluator:
|
||||
|
||||
return result
|
||||
|
||||
def create_default_evaluator(crew, llm=None):
|
||||
def create_default_evaluator(agents: list[Agent], llm: None = None):
|
||||
from crewai.experimental.evaluation import (
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
@@ -209,4 +214,4 @@ def create_default_evaluator(crew, llm=None):
|
||||
ReasoningEfficiencyEvaluator(llm=llm),
|
||||
]
|
||||
|
||||
return AgentEvaluator(evaluators=evaluators, crew=crew)
|
||||
return AgentEvaluator(evaluators=evaluators, agents=agents)
|
||||
|
||||
Reference in New Issue
Block a user