Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-11 00:58:30 +00:00)
fix: do not evaluate Agent by default
This is an experimental feature; we still need to refine it further.
@@ -1313,6 +1313,7 @@ class Crew(FlowTrackable, BaseModel):
         n_iterations: int,
         eval_llm: Union[str, InstanceOf[BaseLLM]],
         inputs: Optional[Dict[str, Any]] = None,
+        include_agent_eval: Optional[bool] = False
     ) -> None:
         """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
         try:
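For context, a minimal sketch of how a caller opts in after this change. The agent, task, model name, and input values below are illustrative placeholders (not from this commit), and eval_llm is shown as a plain model string; the signature also accepts a BaseLLM instance.

from crewai import Agent, Crew, Task

researcher = Agent(role="Researcher", goal="Summarize a topic", backstory="Placeholder backstory")
summary_task = Task(description="Summarize {topic}", expected_output="A short summary", agent=researcher)
crew = Crew(agents=[researcher], tasks=[summary_task])

# Crew-level evaluation still runs on every test(); agent-level evaluation is now opt-in.
crew.test(
    n_iterations=2,
    eval_llm="gpt-4o-mini",
    inputs={"topic": "LLM evaluation"},
    include_agent_eval=True,
)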
@@ -1331,13 +1332,29 @@ class Crew(FlowTrackable, BaseModel):
                 ),
             )
             test_crew = self.copy()
 
+            # TODO: Refator to use a single Evaluator Manage class
             evaluator = CrewEvaluator(test_crew, llm_instance)
 
+            if include_agent_eval:
+                from crewai.evaluation import create_default_evaluator
+                agent_evaluator = create_default_evaluator(crew=test_crew)
+
             for i in range(1, n_iterations + 1):
                 evaluator.set_iteration(i)
+
+                if include_agent_eval:
+                    agent_evaluator.set_iteration(i)
+
                 test_crew.kickoff(inputs=inputs)
 
+                # TODO: Refactor to use ListenerEvents instead of trigger each iteration manually
+                if include_agent_eval:
+                    agent_evaluator.evaluate_current_iteration()
+
             evaluator.print_crew_evaluation_result()
+            if include_agent_eval:
+                agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)
+
             crewai_event_bus.emit(
                 self,
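Distilled from the hunk above, the wiring for a single iteration when the flag is set looks roughly like this. Sketch only: crew and inputs are assumed to come from the earlier example, and the calls are exactly the ones the loop makes.

from crewai.evaluation import create_default_evaluator

agent_evaluator = create_default_evaluator(crew=crew)  # built once, only when include_agent_eval is True
agent_evaluator.set_iteration(1)                       # keep the evaluator's iteration counter in sync
crew.kickoff(inputs=inputs)                            # run the crew as usual
agent_evaluator.evaluate_current_iteration()           # triggered manually for now (see the TODO above)
agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)  # after the loop: aggregate and print feedback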
@@ -45,8 +45,6 @@ class AgentEvaluator:
             raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")
 
         from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
-        self.console_formatter.print(f"\n[bold blue]📊 Running agent evaluations for iteration {self.iteration}...[/bold blue]\n")
-
         evaluation_results = defaultdict(list)
 
         total_evals = 0
@@ -101,7 +99,7 @@ class AgentEvaluator:
     def display_results_with_iterations(self):
         self.display_formatter.display_summary_results(self.iterations_results)
 
-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE):
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
         agent_results = {}
         with crewai_event_bus.scoped_handlers():
             task_results = self.get_evaluation_results()
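A hedged sketch of what the new keyword changes for callers, assuming agent_evaluator is an AgentEvaluator instance as above. The existing strategy parameter keeps its SIMPLE_AVERAGE default.

# Default: aggregate and return results without printing the per-agent feedback tables.
results = agent_evaluator.get_agent_evaluation()

# Opt back in to the detailed per-agent feedback output.
results = agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)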
@@ -120,9 +118,11 @@ class AgentEvaluator:
 
             agent_results[agent_role] = aggregated_result
 
-        if len(self.iterations_results) > 1 and self.iteration == max(self.iterations_results.keys()):
+
+        if self.iteration == max(self.iterations_results.keys()):
             self.display_results_with_iterations()
-        elif agent_results:
+
+        if include_evaluation_feedback:
             self.display_evaluation_results(agent_results)
 
         return agent_results
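The dropped len(...) > 1 check matters mostly for single-iteration runs; a small illustration with made-up values:

# With only one recorded iteration, the old guard never showed the summary.
iterations_results = {1: ["...results for iteration 1..."]}
iteration = 1

old_guard = len(iterations_results) > 1 and iteration == max(iterations_results.keys())  # False
new_guard = iteration == max(iterations_results.keys())                                  # True

The per-agent feedback branch is now gated on the caller's include_evaluation_feedback flag rather than on agent_results being non-empty.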
@@ -19,7 +19,7 @@ class EvaluationDisplayFormatter:
         for agent_role, result in agent_results.items():
             self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]\n")
 
-            table = Table(title=f"{agent_role} Evaluation Results", box=ROUNDED)
+            table = Table(box=ROUNDED)
             table.add_column("Metric", style="cyan")
             table.add_column("Score (1-10)", justify="center")
             table.add_column("Feedback", style="green")
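Roughly what the per-agent table looks like after this change, using plain rich calls. The row content is invented for illustration; the agent role is already printed as a heading above the table, which is why the table title was dropped as redundant.

from rich.box import ROUNDED
from rich.console import Console
from rich.table import Table

console = Console()
console.print("\n[bold cyan]Agent: Researcher[/bold cyan]\n")

table = Table(box=ROUNDED)  # no title: the role heading above already identifies the agent
table.add_column("Metric", style="cyan")
table.add_column("Score (1-10)", justify="center")
table.add_column("Feedback", style="green")
table.add_row("Goal alignment", "8.0", "Stayed close to the task goal")  # illustrative row
console.print(table)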
@@ -70,12 +70,6 @@ class EvaluationDisplayFormatter:
             self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
             return
 
-        title = Panel(
-            "[bold]Agent Evaluation Summary[/bold]",
-            style="blue",
-            box=ROUNDED
-        )
-        self.console_formatter.print(title, justify="center")
         self.console_formatter.print("\n")
 
         table = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
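And a sketch of the summary output after removing the "Agent Evaluation Summary" Panel: the table title alone now carries the heading. The columns and row below are assumptions, since the hunk only shows the table construction line.

from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table

console = Console()
console.print("\n")

summary = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
summary.add_column("Agent", style="cyan")           # assumed column
summary.add_column("Avg. Score", justify="center")  # assumed column
summary.add_row("Researcher", "8.2")                # illustrative row
console.print(summary)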