Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-05-02 15:52:34 +00:00
Introduce Evaluator Experiment (#3133)
* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool metrics for Agent evaluation
* feat: add Reasoning metrics for Agent evaluation (still in progress)
* feat: add AgentEvaluator class, which evaluates an Agent's results and reports them to the user
* fix: do not evaluate Agents by default; this is an experimental feature that still needs further refinement
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
* feat: add Experiment evaluation framework with baseline comparison
* fix: reset evaluator for each experiment iteration
* fix: fix tracking of new test cases
* chore: split experimental evaluation classes
* refactor: remove unused method
* refactor: isolate Console printing in a dedicated class
* fix: make crew required to run an experiment
* fix: use time-aware timestamps to define the experiment result
* test: add tests for Evaluator Experiment
* style: fix linter issues
* fix: encode string before hashing
* style: resolve linter issues
* feat: add experimental folder for beta features (#3141)
* test: move tests to experimental folder
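The new metric evaluators plug into the AgentEvaluator introduced in the diff below. As a rough sketch of how they fit together (class and module names are taken from this commit's diff; `crew` stands for an already configured Crew instance and its construction is not shown):

    from crewai.experimental.evaluation import GoalAlignmentEvaluator, SemanticQualityEvaluator
    from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator

    # Evaluate only goal alignment and semantic quality for every agent in `crew`.
    evaluator = AgentEvaluator(
        evaluators=[GoalAlignmentEvaluator(llm=None), SemanticQualityEvaluator(llm=None)],
        crew=crew,
    )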
src/crewai/experimental/evaluation/agent_evaluator.py (new file, 182 lines)
@@ -0,0 +1,182 @@
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
from crewai.agent import Agent
from crewai.task import Task
from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter

from typing import Any, Dict
from collections import defaultdict
from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
from collections.abc import Sequence
from crewai.crew import Crew
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult


class AgentEvaluator:
    def __init__(
        self,
        evaluators: Sequence[BaseEvaluator] | None = None,
        crew: Crew | None = None,
    ):
        self.crew: Crew | None = crew
        self.evaluators: Sequence[BaseEvaluator] | None = evaluators

        self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
        if crew is not None:
            assert crew and crew.agents is not None
            for agent in crew.agents:
                self.agent_evaluators[str(agent.id)] = self.evaluators

        self.callback = create_evaluation_callbacks()
        self.console_formatter = ConsoleFormatter()
        self.display_formatter = EvaluationDisplayFormatter()

        self.iteration = 1
        self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}

    def set_iteration(self, iteration: int) -> None:
        self.iteration = iteration

    def reset_iterations_results(self):
        self.iterations_results = {}

    def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
        if not self.crew:
            raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")

        if not self.callback:
            raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")

        from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn

        evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)

        total_evals = 0
        for agent in self.crew.agents:
            for task in self.crew.tasks:
                if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
                    total_evals += 1

        with Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}[/bold blue]"),
            BarColumn(),
            TextColumn("{task.percentage:.0f}% completed"),
            console=self.console_formatter.console
        ) as progress:
            eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=total_evals)

            for agent in self.crew.agents:
                evaluator = self.agent_evaluators.get(str(agent.id))
                if not evaluator:
                    continue

                for task in self.crew.tasks:
                    if task.agent and str(task.agent.id) != str(agent.id):
                        continue

                    trace = self.callback.get_trace(str(agent.id), str(task.id))
                    if not trace:
                        self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
                        progress.update(eval_task, advance=1)
                        continue

                    with crewai_event_bus.scoped_handlers():
                        result = self.evaluate(
                            agent=agent,
                            task=task,
                            execution_trace=trace,
                            final_output=task.output
                        )
                    evaluation_results[agent.role].append(result)
                    progress.update(eval_task, advance=1)

        self.iterations_results[self.iteration] = evaluation_results
        return evaluation_results

    def get_evaluation_results(self):
        if self.iteration in self.iterations_results:
            return self.iterations_results[self.iteration]

        return self.evaluate_current_iteration()

    def display_results_with_iterations(self):
        self.display_formatter.display_summary_results(self.iterations_results)

    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> Dict[str, AgentAggregatedEvaluationResult]:
        agent_results = {}
        with crewai_event_bus.scoped_handlers():
            task_results = self.get_evaluation_results()
            for agent_role, results in task_results.items():
                if not results:
                    continue

                agent_id = results[0].agent_id

                aggregated_result = self.display_formatter._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=results,
                    strategy=strategy
                )

                agent_results[agent_role] = aggregated_result

        if self.iteration == max(self.iterations_results.keys()):
            self.display_results_with_iterations()

        if include_evaluation_feedback:
            self.display_evaluation_with_feedback()

        return agent_results

    def display_evaluation_with_feedback(self):
        self.display_formatter.display_evaluation_with_feedback(self.iterations_results)

    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any
    ) -> AgentEvaluationResult:
        result = AgentEvaluationResult(
            agent_id=str(agent.id),
            task_id=str(task.id)
        )
        assert self.evaluators is not None
        for evaluator in self.evaluators:
            try:
                score = evaluator.evaluate(
                    agent=agent,
                    task=task,
                    execution_trace=execution_trace,
                    final_output=final_output
                )
                result.metrics[evaluator.metric_category] = score
            except Exception as e:
                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")

        return result


def create_default_evaluator(crew, llm=None):
    from crewai.experimental.evaluation import (
        GoalAlignmentEvaluator,
        SemanticQualityEvaluator,
        ToolSelectionEvaluator,
        ParameterExtractionEvaluator,
        ToolInvocationEvaluator,
        ReasoningEfficiencyEvaluator
    )

    evaluators = [
        GoalAlignmentEvaluator(llm=llm),
        SemanticQualityEvaluator(llm=llm),
        ToolSelectionEvaluator(llm=llm),
        ParameterExtractionEvaluator(llm=llm),
        ToolInvocationEvaluator(llm=llm),
        ReasoningEfficiencyEvaluator(llm=llm),
    ]

    return AgentEvaluator(evaluators=evaluators, crew=crew)
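For orientation, a minimal end-to-end usage sketch built only from the helpers defined in this file; it assumes `crew` is an already configured Crew whose execution traces are captured by the callbacks created in __init__ (that wiring lives outside this file):

    from crewai.experimental.evaluation.agent_evaluator import create_default_evaluator

    # Build an evaluator carrying the full default metric set: goal alignment,
    # semantic quality, tool selection, parameter extraction, tool invocation,
    # and reasoning efficiency.
    evaluator = create_default_evaluator(crew=crew)

    # Run the crew so task outputs and execution traces exist.
    crew.kickoff()

    # Score every agent/task pair recorded for the current iteration, then
    # aggregate per agent and print the summary tables.
    per_task_results = evaluator.evaluate_current_iteration()
    per_agent_results = evaluator.get_agent_evaluation(include_evaluation_feedback=True)

The same evaluator can be reused across runs by calling set_iteration() before each kickoff and reset_iterations_results() when starting a fresh experiment.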