fix: do not evaluate Agent by default
This is an experimental feature; we still need to refine it further.
@@ -1313,6 +1313,7 @@ class Crew(FlowTrackable, BaseModel):
         n_iterations: int,
         eval_llm: Union[str, InstanceOf[BaseLLM]],
         inputs: Optional[Dict[str, Any]] = None,
+        include_agent_eval: Optional[bool] = False
     ) -> None:
         """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
         try:
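Based on the updated signature above, an opt-in call might look like the following sketch; the crew object, model string, and inputs are illustrative placeholders, not part of this commit:

    # Hypothetical usage sketch: agent-level evaluation now runs only when requested.
    crew = Crew(agents=[researcher], tasks=[research_task])  # assumed setup
    crew.test(
        n_iterations=2,
        eval_llm="gpt-4o",          # any model string or BaseLLM instance
        inputs={"topic": "AI"},     # illustrative
        include_agent_eval=True,    # defaults to False after this change
    )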
@@ -1331,13 +1332,29 @@ class Crew(FlowTrackable, BaseModel):
                 ),
             )
             test_crew = self.copy()
 
+            # TODO: Refactor to use a single Evaluator Manager class
             evaluator = CrewEvaluator(test_crew, llm_instance)
 
+            if include_agent_eval:
+                from crewai.evaluation import create_default_evaluator
+                agent_evaluator = create_default_evaluator(crew=test_crew)
+
             for i in range(1, n_iterations + 1):
                 evaluator.set_iteration(i)
+
+                if include_agent_eval:
+                    agent_evaluator.set_iteration(i)
+
                 test_crew.kickoff(inputs=inputs)
 
+                # TODO: Refactor to use ListenerEvents instead of triggering each iteration manually
+                if include_agent_eval:
+                    agent_evaluator.evaluate_current_iteration()
+
             evaluator.print_crew_evaluation_result()
+            if include_agent_eval:
+                agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)
+
             crewai_event_bus.emit(
                 self,
@@ -45,8 +45,6 @@ class AgentEvaluator:
             raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")
 
-        from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
         self.console_formatter.print(f"\n[bold blue]📊 Running agent evaluations for iteration {self.iteration}...[/bold blue]\n")
-
         evaluation_results = defaultdict(list)
 
         total_evals = 0
@@ -101,7 +99,7 @@ class AgentEvaluator:
     def display_results_with_iterations(self):
         self.display_formatter.display_summary_results(self.iterations_results)
 
-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE):
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
         agent_results = {}
         with crewai_event_bus.scoped_handlers():
             task_results = self.get_evaluation_results()
@@ -120,9 +118,11 @@ class AgentEvaluator:
 
             agent_results[agent_role] = aggregated_result
 
-        if len(self.iterations_results) > 1 and self.iteration == max(self.iterations_results.keys()):
+
+        if self.iteration == max(self.iterations_results.keys()):
             self.display_results_with_iterations()
-        elif agent_results:
+
+        if include_evaluation_feedback:
             self.display_evaluation_results(agent_results)
 
         return agent_results
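Given the extended signature, per-agent feedback display becomes opt-in as well; a minimal sketch, assuming an already-populated evaluator (names other than the method and its parameters are illustrative):

    # Hypothetical sketch: aggregate per-agent scores and opt in to the
    # feedback tables added by this commit.
    results = agent_evaluator.get_agent_evaluation(
        strategy=AggregationStrategy.SIMPLE_AVERAGE,  # default aggregation
        include_evaluation_feedback=True,             # prints per-agent feedback tables
    )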
@@ -19,7 +19,7 @@ class EvaluationDisplayFormatter:
         for agent_role, result in agent_results.items():
             self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]\n")
 
-            table = Table(title=f"{agent_role} Evaluation Results", box=ROUNDED)
+            table = Table(box=ROUNDED)
             table.add_column("Metric", style="cyan")
             table.add_column("Score (1-10)", justify="center")
             table.add_column("Feedback", style="green")
@@ -70,12 +70,6 @@ class EvaluationDisplayFormatter:
             self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
             return
 
-        title = Panel(
-            "[bold]Agent Evaluation Summary[/bold]",
-            style="blue",
-            box=ROUNDED
-        )
-        self.console_formatter.print(title, justify="center")
         self.console_formatter.print("\n")
 
         table = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)