from collections import defaultdict
from collections.abc import Sequence
from typing import Any, Dict, List

from rich.box import HEAVY_EDGE, ROUNDED
from rich.table import Table

from crewai.experimental.evaluation import EvaluationScore
from crewai.experimental.evaluation.base_evaluator import (
    AgentAggregatedEvaluationResult,
    AgentEvaluationResult,
    AggregationStrategy,
    MetricCategory,
)
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
from crewai.utilities.llm_utils import create_llm


class EvaluationDisplayFormatter:
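    """Formats and prints agent evaluation results as Rich tables on the console."""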
    def __init__(self):
        self.console_formatter = ConsoleFormatter()

    def display_evaluation_with_feedback(self, iterations_results: Dict[int, Dict[str, List[Any]]]):
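        """Print per-iteration tables of metric scores and evaluator feedback for each agent role."""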
        if not iterations_results:
            self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
            return

        all_agent_roles: set[str] = set()
        for iter_results in iterations_results.values():
            all_agent_roles.update(iter_results.keys())

        for agent_role in sorted(all_agent_roles):
            self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]")

            for iter_num, results in sorted(iterations_results.items()):
                if agent_role not in results or not results[agent_role]:
                    continue

                agent_results = results[agent_role]
                agent_id = agent_results[0].agent_id

                aggregated_result = self._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=agent_results,
                )

                self.console_formatter.print(f"\n[bold]Iteration {iter_num}[/bold]")

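                # One table per iteration: a row per metric plus an overall score row,
                # with scores color-coded (>= 8 green, >= 6 cyan, >= 4 yellow, else red).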
                table = Table(box=ROUNDED)
                table.add_column("Metric", style="cyan")
                table.add_column("Score (1-10)", justify="center")
                table.add_column("Feedback", style="green")

                if aggregated_result.metrics:
                    for metric, evaluation_score in aggregated_result.metrics.items():
                        score = evaluation_score.score

                        if isinstance(score, (int, float)):
                            if score >= 8.0:
                                score_text = f"[green]{score:.1f}[/green]"
                            elif score >= 6.0:
                                score_text = f"[cyan]{score:.1f}[/cyan]"
                            elif score >= 4.0:
                                score_text = f"[yellow]{score:.1f}[/yellow]"
                            else:
                                score_text = f"[red]{score:.1f}[/red]"
                        else:
                            score_text = "[dim]N/A[/dim]"

                        table.add_section()
                        table.add_row(
                            metric.title(),
                            score_text,
                            evaluation_score.feedback or ""
                        )

                if aggregated_result.overall_score is not None:
                    overall_score = aggregated_result.overall_score
                    if overall_score >= 8.0:
                        overall_color = "green"
                    elif overall_score >= 6.0:
                        overall_color = "cyan"
                    elif overall_score >= 4.0:
                        overall_color = "yellow"
                    else:
                        overall_color = "red"

                    table.add_section()
                    table.add_row(
                        "Overall Score",
                        f"[{overall_color}]{overall_score:.1f}[/]",
                        "Overall agent evaluation score"
                    )

                self.console_formatter.print(table)

    def display_summary_results(self, iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]]):
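        """Print a cross-iteration summary table.

        Each agent gets a summary row with per-run averages and an overall average,
        followed by one breakdown row per metric.
        """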
        if not iterations_results:
            self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
            return

        self.console_formatter.print("\n")

        table = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)

        table.add_column("Agent/Metric", style="cyan")

        for iter_num in sorted(iterations_results.keys()):
            run_label = f"Run {iter_num}"
            table.add_column(run_label, justify="center")

        table.add_column("Avg. Total", justify="center")

        all_agent_roles: set[str] = set()
        for results in iterations_results.values():
            all_agent_roles.update(results.keys())

        for agent_role in sorted(all_agent_roles):
            agent_scores_by_iteration = {}
            agent_metrics_by_iteration = {}

            for iter_num, results in sorted(iterations_results.items()):
                if agent_role not in results or not results[agent_role]:
                    continue

                agent_results = results[agent_role]
                agent_id = agent_results[0].agent_id

                aggregated_result = self._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=agent_results,
                    strategy=AggregationStrategy.SIMPLE_AVERAGE
                )

                valid_scores = [score.score for score in aggregated_result.metrics.values()
                                if score.score is not None]
                if valid_scores:
                    avg_score = sum(valid_scores) / len(valid_scores)
                    agent_scores_by_iteration[iter_num] = avg_score

                agent_metrics_by_iteration[iter_num] = aggregated_result.metrics

            if not agent_scores_by_iteration:
                continue

            avg_across_iterations = sum(agent_scores_by_iteration.values()) / len(agent_scores_by_iteration)

            row = [f"[bold]{agent_role}[/bold]"]

            for iter_num in sorted(iterations_results.keys()):
                if iter_num in agent_scores_by_iteration:
                    score = agent_scores_by_iteration[iter_num]
                    if score >= 8.0:
                        color = "green"
                    elif score >= 6.0:
                        color = "cyan"
                    elif score >= 4.0:
                        color = "yellow"
                    else:
                        color = "red"
                    row.append(f"[bold {color}]{score:.1f}[/]")
                else:
                    row.append("-")

            if avg_across_iterations >= 8.0:
                color = "green"
            elif avg_across_iterations >= 6.0:
                color = "cyan"
            elif avg_across_iterations >= 4.0:
                color = "yellow"
            else:
                color = "red"
            row.append(f"[bold {color}]{avg_across_iterations:.1f}[/]")

            table.add_row(*row)

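            # Follow the agent's summary row with one breakdown row per metric category.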
            all_metrics: set[Any] = set()
            for metrics in agent_metrics_by_iteration.values():
                all_metrics.update(metrics.keys())

            for metric in sorted(all_metrics, key=lambda x: x.value):
                metric_scores = []

                row = [f" - {metric.title()}"]

                for iter_num in sorted(iterations_results.keys()):
                    if (iter_num in agent_metrics_by_iteration and
                        metric in agent_metrics_by_iteration[iter_num]):
                        metric_score = agent_metrics_by_iteration[iter_num][metric].score
                        if metric_score is not None:
                            metric_scores.append(metric_score)
                            if metric_score >= 8.0:
                                color = "green"
                            elif metric_score >= 6.0:
                                color = "cyan"
                            elif metric_score >= 4.0:
                                color = "yellow"
                            else:
                                color = "red"
                            row.append(f"[{color}]{metric_score:.1f}[/]")
                        else:
                            row.append("[dim]N/A[/dim]")
                    else:
                        row.append("-")

                if metric_scores:
                    avg = sum(metric_scores) / len(metric_scores)
                    if avg >= 8.0:
                        color = "green"
                    elif avg >= 6.0:
                        color = "cyan"
                    elif avg >= 4.0:
                        color = "yellow"
                    else:
                        color = "red"
                    row.append(f"[{color}]{avg:.1f}[/]")
                else:
                    row.append("-")

                table.add_row(*row)

            table.add_row(*[""] * (len(sorted(iterations_results.keys())) + 2))

        self.console_formatter.print(table)
        self.console_formatter.print("\n")

    def _aggregate_agent_results(
        self,
        agent_id: str,
        agent_role: str,
        results: Sequence[AgentEvaluationResult],
        strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE,
    ) -> AgentAggregatedEvaluationResult:
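        """Aggregate per-task evaluation results for a single agent.

        Scores are averaged within each metric category; when a category has more
        than one feedback entry, the entries are condensed via _summarize_feedbacks.
        """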
        metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(list)

        for result in results:
            for metric_name, evaluation_score in result.metrics.items():
                metrics_by_category[metric_name].append(evaluation_score)

        aggregated_metrics: dict[MetricCategory, EvaluationScore] = {}
        for category, scores in metrics_by_category.items():
            valid_scores = [s.score for s in scores if s.score is not None]
            avg_score = sum(valid_scores) / len(valid_scores) if valid_scores else None

            feedbacks = [s.feedback for s in scores if s.feedback]

            feedback_summary = None
            if feedbacks:
                if len(feedbacks) > 1:
                    feedback_summary = self._summarize_feedbacks(
                        agent_role=agent_role,
                        metric=category.title(),
                        feedbacks=feedbacks,
                        scores=[s.score for s in scores],
                        strategy=strategy
                    )
                else:
                    feedback_summary = feedbacks[0]

            aggregated_metrics[category] = EvaluationScore(
                score=avg_score,
                feedback=feedback_summary
            )

        overall_score = None
        if aggregated_metrics:
            valid_scores = [m.score for m in aggregated_metrics.values() if m.score is not None]
            if valid_scores:
                overall_score = sum(valid_scores) / len(valid_scores)

        return AgentAggregatedEvaluationResult(
            agent_id=agent_id,
            agent_role=agent_role,
            metrics=aggregated_metrics,
            overall_score=overall_score,
            task_count=len(results),
            aggregation_strategy=strategy
        )

    def _summarize_feedbacks(
        self,
        agent_role: str,
        metric: str,
        feedbacks: List[str],
        scores: List[float | None],
        strategy: AggregationStrategy
    ) -> str:
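        """Merge multiple feedback strings for one metric into a single summary.

        A short list of short feedbacks is joined directly; otherwise an LLM is
        asked to synthesize them, with a truncated concatenation as fallback.
        """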
        if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks):
            return "\n\n".join([f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)])

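        # For longer or more numerous feedbacks, ask an LLM to synthesize a summary;
        # fall back to a truncated concatenation if the call fails.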
        try:
            llm = create_llm()

            formatted_feedbacks = []
            for i, (feedback, score) in enumerate(zip(feedbacks, scores)):
                if len(feedback) > 500:
                    feedback = feedback[:500] + "..."
                score_text = f"{score:.1f}" if score is not None else "N/A"
                formatted_feedbacks.append(f"Feedback #{i+1} (Score: {score_text}):\n{feedback}")

            all_feedbacks = "\n\n" + "\n\n---\n\n".join(formatted_feedbacks)

            strategy_guidance = ""
            if strategy == AggregationStrategy.BEST_PERFORMANCE:
                strategy_guidance = "Focus on the highest-scoring aspects and strengths demonstrated."
            elif strategy == AggregationStrategy.WORST_PERFORMANCE:
                strategy_guidance = "Focus on areas that need improvement and common issues across tasks."
            else:
                strategy_guidance = "Provide a balanced analysis of strengths and weaknesses across all tasks."

            prompt = [
                {"role": "system", "content": f"""You are an expert evaluator creating a comprehensive summary of agent performance feedback.
Your job is to synthesize multiple feedback points about the same metric across different tasks.

Create a concise, insightful summary that captures the key patterns and themes from all feedback.
{strategy_guidance}

Your summary should be:
1. Specific and concrete (not vague or general)
2. Focused on actionable insights
3. Highlighting patterns across tasks
4. 150-250 words in length

The summary should be directly usable as final feedback for the agent's performance on this metric."""},
                {"role": "user", "content": f"""I need a synthesized summary of the following feedback for:

Agent Role: {agent_role}
Metric: {metric.title()}

{all_feedbacks}
"""}
            ]
            assert llm is not None
            response = llm.call(prompt)

            return response

        except Exception:
            return "Synthesized from multiple tasks: " + "\n\n".join([f"- {fb[:500]}..." for fb in feedbacks])