From 37cfbe738999cb52db0625c976112e1f36a695f0 Mon Sep 17 00:00:00 2001
From: Lucas Gomide
Date: Wed, 9 Jul 2025 17:25:08 -0300
Subject: [PATCH] fix: do not evaluate Agent by default

This is an experimental feature; we still need to refine it further.
---
 src/crewai/crew.py                          | 17 +++++++++++++++++
 src/crewai/evaluation/agent_evaluator.py    | 10 +++++-----
 src/crewai/evaluation/evaluation_display.py |  8 +-------
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/crewai/crew.py b/src/crewai/crew.py
index e1b7dc04f..89acee505 100644
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -1313,6 +1313,7 @@ class Crew(FlowTrackable, BaseModel):
         n_iterations: int,
         eval_llm: Union[str, InstanceOf[BaseLLM]],
         inputs: Optional[Dict[str, Any]] = None,
+        include_agent_eval: Optional[bool] = False
     ) -> None:
         """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
         try:
@@ -1331,13 +1332,29 @@ class Crew(FlowTrackable, BaseModel):
                 ),
             )
             test_crew = self.copy()
+
+            # TODO: Refactor to use a single Evaluator Manager class
             evaluator = CrewEvaluator(test_crew, llm_instance)
+            if include_agent_eval:
+                from crewai.evaluation import create_default_evaluator
+                agent_evaluator = create_default_evaluator(crew=test_crew)
+

             for i in range(1, n_iterations + 1):
                 evaluator.set_iteration(i)
+
+                if include_agent_eval:
+                    agent_evaluator.set_iteration(i)
+
                 test_crew.kickoff(inputs=inputs)

+                # TODO: Refactor to use ListenerEvents instead of triggering each iteration manually
+                if include_agent_eval:
+                    agent_evaluator.evaluate_current_iteration()
+
             evaluator.print_crew_evaluation_result()
+            if include_agent_eval:
+                agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)

             crewai_event_bus.emit(
                 self,
diff --git a/src/crewai/evaluation/agent_evaluator.py b/src/crewai/evaluation/agent_evaluator.py
index 3627571b0..fa97421fb 100644
--- a/src/crewai/evaluation/agent_evaluator.py
+++ b/src/crewai/evaluation/agent_evaluator.py
@@ -45,8 +45,6 @@ class AgentEvaluator:
             raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")

         from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
-        self.console_formatter.print(f"\n[bold blue]📊 Running agent evaluations for iteration {self.iteration}...[/bold blue]\n")
-
         evaluation_results = defaultdict(list)
         total_evals = 0

@@ -101,7 +99,7 @@ class AgentEvaluator:
     def display_results_with_iterations(self):
         self.display_formatter.display_summary_results(self.iterations_results)

-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE):
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
         agent_results = {}
         with crewai_event_bus.scoped_handlers():
             task_results = self.get_evaluation_results()
@@ -120,9 +118,11 @@

             agent_results[agent_role] = aggregated_result

-        if len(self.iterations_results) > 1 and self.iteration == max(self.iterations_results.keys()):
+
+        if self.iteration == max(self.iterations_results.keys()):
             self.display_results_with_iterations()
-        elif agent_results:
+
+        if include_evaluation_feedback:
             self.display_evaluation_results(agent_results)

         return agent_results
diff --git a/src/crewai/evaluation/evaluation_display.py b/src/crewai/evaluation/evaluation_display.py
index 0b2c21e3f..9be4070db 100644
--- a/src/crewai/evaluation/evaluation_display.py
+++ b/src/crewai/evaluation/evaluation_display.py
@@ -19,7 +19,7 @@ class EvaluationDisplayFormatter:
         for agent_role, result in agent_results.items():
             self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]\n")

-            table = Table(title=f"{agent_role} Evaluation Results", box=ROUNDED)
+            table = Table(box=ROUNDED)
             table.add_column("Metric", style="cyan")
             table.add_column("Score (1-10)", justify="center")
             table.add_column("Feedback", style="green")
@@ -70,12 +70,6 @@
             self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
             return

-        title = Panel(
-            "[bold]Agent Evaluation Summary[/bold]",
-            style="blue",
-            box=ROUNDED
-        )
-        self.console_formatter.print(title, justify="center")
         self.console_formatter.print("\n")

         table = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
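
Usage note (not part of the patch): a minimal sketch of how the new opt-in flag is exercised. The crew setup, model name, and inputs below are illustrative assumptions, not taken from this diff; only the test(..., include_agent_eval=...) signature comes from the patch.

    from crewai import Agent, Crew, Task

    # Illustrative single-agent crew; any existing Crew instance works the same way.
    researcher = Agent(
        role="Researcher",
        goal="Summarize a topic concisely",
        backstory="An analyst who writes short, factual summaries.",
    )
    task = Task(
        description="Summarize the topic: {topic}",
        expected_output="A three-sentence summary.",
        agent=researcher,
    )
    crew = Crew(agents=[researcher], tasks=[task])

    # Default behavior after this patch: only the crew-level CrewEvaluator runs.
    crew.test(n_iterations=2, eval_llm="gpt-4o-mini", inputs={"topic": "LLM evaluation"})

    # Agent-level evaluation is now opt-in via the experimental flag.
    crew.test(
        n_iterations=2,
        eval_llm="gpt-4o-mini",
        inputs={"topic": "LLM evaluation"},
        include_agent_eval=True,
    )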