Support evaluating a single Agent/LiteAgent (#3167)

* refactor: rely on task completion event to evaluate agents
* feat: remove Crew dependency to evaluate agent
* feat: drop execution_context in AgentEvaluator
* chore: drop experimental Agent Eval feature from stable crew.test
* feat: support eval LiteAgent
* resolve linter issues
2026-05-02 07:42:40 +00:00 · 2025-07-15 10:22:41 -03:00
parent 53f674be60
commit 6ebb6c9b63
16 changed files with 1313 additions and 148 deletions
--- a/src/crewai/experimental/evaluation/agent_evaluator.py
+++ b/src/crewai/experimental/evaluation/agent_evaluator.py
@@ -3,32 +3,32 @@ from crewai.agent import Agent
 from crewai.task import Task
 from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter

-from typing import Any, Dict
-from collections import defaultdict
+from typing import Any
 from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
 from collections.abc import Sequence
-from crewai.crew import Crew
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
-from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
-from contextlib import contextmanager
+from crewai.utilities.events.task_events import TaskCompletedEvent
+from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
+from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult
 import threading

 class ExecutionState:
    def __init__(self):
-        self.traces: dict[str, Any] = {}
-        self.current_agent_id: str | None = None
-        self.current_task_id: str | None = None
-        self.iteration: int = 1
-        self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}
+        self.traces = {}
+        self.current_agent_id = None
+        self.current_task_id = None
+        self.iteration = 1
+        self.iterations_results = {}
+        self.agent_evaluators = {}

 class AgentEvaluator:
    def __init__(
        self,
+        agents: list[Agent],
        evaluators: Sequence[BaseEvaluator] | None = None,
-        crew: Crew | None = None,
    ):
-        self.crew: Crew | None = crew
+        self.agents: list[Agent] = agents
        self.evaluators: Sequence[BaseEvaluator] | None = evaluators

        self.callback = create_evaluation_callbacks()
@@ -37,19 +37,10 @@ class AgentEvaluator:

        self._thread_local: threading.local = threading.local()

-        self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
-        if crew is not None:
-            assert crew and crew.agents is not None
-            for agent in crew.agents:
-                self.agent_evaluators[str(agent.id)] = self.evaluators
+        for agent in self.agents:
+            self._execution_state.agent_evaluators[str(agent.id)] = self.evaluators

-    @contextmanager
-    def execution_context(self):
-        state = ExecutionState()
-        try:
-            yield state
-        finally:
-            pass
+        self._subscribe_to_events()

    @property
    def _execution_state(self) -> ExecutionState:
@@ -57,81 +48,95 @@ class AgentEvaluator:
            self._thread_local.execution_state = ExecutionState()
        return self._thread_local.execution_state

+    def _subscribe_to_events(self) -> None:
+        crewai_event_bus.register_handler(TaskCompletedEvent, self._handle_task_completed)
+        crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, self._handle_lite_agent_completed)
+
+    def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
+        assert event.task is not None
+        agent = event.task.agent
+        if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
+            state = ExecutionState()
+            state.current_agent_id = str(agent.id)
+            state.current_task_id = str(event.task.id)
+
+            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
+
+            if not trace:
+                return
+
+            result = self.evaluate(
+                agent=agent,
+                task=event.task,
+                execution_trace=trace,
+                final_output=event.output,
+                state=state
+            )
+
+            current_iteration = self._execution_state.iteration
+            if current_iteration not in self._execution_state.iterations_results:
+                self._execution_state.iterations_results[current_iteration] = {}
+
+            if agent.role not in self._execution_state.iterations_results[current_iteration]:
+                self._execution_state.iterations_results[current_iteration][agent.role] = []
+
+            self._execution_state.iterations_results[current_iteration][agent.role].append(result)
+
+    def _handle_lite_agent_completed(self, source: object, event: LiteAgentExecutionCompletedEvent) -> None:
+        agent_info = event.agent_info
+        agent_id = str(agent_info["id"])
+
+        if agent_id in self._execution_state.agent_evaluators:
+            state = ExecutionState()
+            state.current_agent_id = agent_id
+            state.current_task_id = "lite_task"
+
+            target_agent = None
+            for agent in self.agents:
+                if str(agent.id) == agent_id:
+                    target_agent = agent
+                    break
+
+            if not target_agent:
+                return
+
+            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
+
+            if not trace:
+                return
+
+            result = self.evaluate(
+                agent=target_agent,
+                execution_trace=trace,
+                final_output=event.output,
+                state=state
+            )
+
+            current_iteration = self._execution_state.iteration
+            if current_iteration not in self._execution_state.iterations_results:
+                self._execution_state.iterations_results[current_iteration] = {}
+
+            agent_role = target_agent.role
+            if agent_role not in self._execution_state.iterations_results[current_iteration]:
+                self._execution_state.iterations_results[current_iteration][agent_role] = []
+
+            self._execution_state.iterations_results[current_iteration][agent_role].append(result)
+
    def set_iteration(self, iteration: int) -> None:
        self._execution_state.iteration = iteration

    def reset_iterations_results(self) -> None:
        self._execution_state.iterations_results = {}

-    def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
-        if not self.crew:
-            raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
-
-        if not self.callback:
-            raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")
-
-        from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
-        evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)
-
-        total_evals = 0
-        for agent in self.crew.agents:
-            for task in self.crew.tasks:
-                if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
-                    total_evals += 1
-
-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[bold blue]{task.description}[/bold blue]"),
-            BarColumn(),
-            TextColumn("{task.percentage:.0f}% completed"),
-            console=self.console_formatter.console
-        ) as progress:
-            eval_task = progress.add_task(f"Evaluating agents (iteration {self._execution_state.iteration})...", total=total_evals)
-
-            with self.execution_context() as state:
-                state.iteration = self._execution_state.iteration
-
-                for agent in self.crew.agents:
-                    evaluator = self.agent_evaluators.get(str(agent.id))
-                    if not evaluator:
-                        continue
-
-                    for task in self.crew.tasks:
-                        if task.agent and str(task.agent.id) != str(agent.id):
-                            continue
-
-                        trace = self.callback.get_trace(str(agent.id), str(task.id))
-                        if not trace:
-                            self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
-                            progress.update(eval_task, advance=1)
-                            continue
-
-                        state.current_agent_id = str(agent.id)
-                        state.current_task_id = str(task.id)
-
-                        with crewai_event_bus.scoped_handlers():
-                            result = self.evaluate(
-                                agent=agent,
-                                task=task,
-                                execution_trace=trace,
-                                final_output=task.output,
-                                state=state
-                            )
-                            evaluation_results[agent.role].append(result)
-                            progress.update(eval_task, advance=1)
-
-        self._execution_state.iterations_results[self._execution_state.iteration] = evaluation_results
-        return evaluation_results
-
    def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
-        if self._execution_state.iteration in self._execution_state.iterations_results:
+        if self._execution_state.iterations_results and self._execution_state.iteration in self._execution_state.iterations_results:
            return self._execution_state.iterations_results[self._execution_state.iteration]
-        return self.evaluate_current_iteration()
+        return {}

    def display_results_with_iterations(self) -> None:
        self.display_formatter.display_summary_results(self._execution_state.iterations_results)

-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> Dict[str, AgentAggregatedEvaluationResult]:
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = True) -> dict[str, AgentAggregatedEvaluationResult]:
        agent_results = {}
        with crewai_event_bus.scoped_handlers():
            task_results = self.get_evaluation_results()
@@ -165,14 +170,14 @@ class AgentEvaluator:
    def evaluate(
        self,
        agent: Agent,
-        task: Task,
        execution_trace: dict[str, Any],
        final_output: Any,
-        state: ExecutionState
+        state: ExecutionState,
+        task: Task | None = None,
    ) -> AgentEvaluationResult:
        result = AgentEvaluationResult(
            agent_id=state.current_agent_id or str(agent.id),
-            task_id=state.current_task_id or str(task.id)
+            task_id=state.current_task_id or (str(task.id) if task else "unknown_task")
        )

        assert self.evaluators is not None
@@ -190,7 +195,7 @@ class AgentEvaluator:

        return result

-def create_default_evaluator(crew, llm=None):
+def create_default_evaluator(agents: list[Agent], llm: None = None):
    from crewai.experimental.evaluation import (
        GoalAlignmentEvaluator,
        SemanticQualityEvaluator,
@@ -209,4 +214,4 @@ def create_default_evaluator(crew, llm=None):
        ReasoningEfficiencyEvaluator(llm=llm),
    ]

-    return AgentEvaluator(evaluators=evaluators, crew=crew)
+    return AgentEvaluator(evaluators=evaluators, agents=agents)