mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-03 00:02:36 +00:00
refactor: Move events module to crewai.events (#3425)
refactor(events): relocate events module & update imports - Move events from utilities/ to top-level events/ with types/, listeners/, utils/ structure - Update all source/tests/docs to new import paths - Add backwards compatibility stubs in crewai.utilities.events with deprecation warnings - Restore test mocks and fix related test imports
This commit is contained in:
@@ -1,18 +1,30 @@
|
||||
import threading
|
||||
from typing import Any, Optional
|
||||
|
||||
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
|
||||
from crewai.experimental.evaluation.base_evaluator import (
|
||||
AgentEvaluationResult,
|
||||
AggregationStrategy,
|
||||
)
|
||||
from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
|
||||
from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
|
||||
from crewai.events.types.agent_events import (
|
||||
AgentEvaluationStartedEvent,
|
||||
AgentEvaluationCompletedEvent,
|
||||
AgentEvaluationFailedEvent,
|
||||
)
|
||||
from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
|
||||
from collections.abc import Sequence
|
||||
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
|
||||
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
|
||||
from crewai.utilities.events.task_events import TaskCompletedEvent
|
||||
from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
|
||||
from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, EvaluationScore, MetricCategory
|
||||
from crewai.events.event_bus import crewai_event_bus
|
||||
from crewai.events.utils.console_formatter import ConsoleFormatter
|
||||
from crewai.events.types.task_events import TaskCompletedEvent
|
||||
from crewai.events.types.agent_events import LiteAgentExecutionCompletedEvent
|
||||
from crewai.experimental.evaluation.base_evaluator import (
|
||||
AgentAggregatedEvaluationResult,
|
||||
EvaluationScore,
|
||||
MetricCategory,
|
||||
)
|
||||
|
||||
|
||||
class ExecutionState:
|
||||
current_agent_id: Optional[str] = None
|
||||
@@ -24,6 +36,7 @@ class ExecutionState:
|
||||
self.iterations_results = {}
|
||||
self.agent_evaluators = {}
|
||||
|
||||
|
||||
class AgentEvaluator:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -46,27 +59,45 @@ class AgentEvaluator:
|
||||
|
||||
@property
|
||||
def _execution_state(self) -> ExecutionState:
|
||||
if not hasattr(self._thread_local, 'execution_state'):
|
||||
if not hasattr(self._thread_local, "execution_state"):
|
||||
self._thread_local.execution_state = ExecutionState()
|
||||
return self._thread_local.execution_state
|
||||
|
||||
def _subscribe_to_events(self) -> None:
|
||||
from typing import cast
|
||||
crewai_event_bus.register_handler(TaskCompletedEvent, cast(Any, self._handle_task_completed))
|
||||
crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, cast(Any, self._handle_lite_agent_completed))
|
||||
|
||||
crewai_event_bus.register_handler(
|
||||
TaskCompletedEvent, cast(Any, self._handle_task_completed)
|
||||
)
|
||||
crewai_event_bus.register_handler(
|
||||
LiteAgentExecutionCompletedEvent,
|
||||
cast(Any, self._handle_lite_agent_completed),
|
||||
)
|
||||
|
||||
def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
|
||||
assert event.task is not None
|
||||
agent = event.task.agent
|
||||
if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
|
||||
self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=str(event.task.id))
|
||||
if (
|
||||
agent
|
||||
and str(getattr(agent, "id", "unknown"))
|
||||
in self._execution_state.agent_evaluators
|
||||
):
|
||||
self.emit_evaluation_started_event(
|
||||
agent_role=agent.role,
|
||||
agent_id=str(agent.id),
|
||||
task_id=str(event.task.id),
|
||||
)
|
||||
|
||||
state = ExecutionState()
|
||||
state.current_agent_id = str(agent.id)
|
||||
state.current_task_id = str(event.task.id)
|
||||
|
||||
assert state.current_agent_id is not None and state.current_task_id is not None
|
||||
trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
|
||||
assert (
|
||||
state.current_agent_id is not None and state.current_task_id is not None
|
||||
)
|
||||
trace = self.callback.get_trace(
|
||||
state.current_agent_id, state.current_task_id
|
||||
)
|
||||
|
||||
if not trace:
|
||||
return
|
||||
@@ -76,19 +107,28 @@ class AgentEvaluator:
|
||||
task=event.task,
|
||||
execution_trace=trace,
|
||||
final_output=event.output,
|
||||
state=state
|
||||
state=state,
|
||||
)
|
||||
|
||||
current_iteration = self._execution_state.iteration
|
||||
if current_iteration not in self._execution_state.iterations_results:
|
||||
self._execution_state.iterations_results[current_iteration] = {}
|
||||
|
||||
if agent.role not in self._execution_state.iterations_results[current_iteration]:
|
||||
self._execution_state.iterations_results[current_iteration][agent.role] = []
|
||||
if (
|
||||
agent.role
|
||||
not in self._execution_state.iterations_results[current_iteration]
|
||||
):
|
||||
self._execution_state.iterations_results[current_iteration][
|
||||
agent.role
|
||||
] = []
|
||||
|
||||
self._execution_state.iterations_results[current_iteration][agent.role].append(result)
|
||||
self._execution_state.iterations_results[current_iteration][
|
||||
agent.role
|
||||
].append(result)
|
||||
|
||||
def _handle_lite_agent_completed(self, source: object, event: LiteAgentExecutionCompletedEvent) -> None:
|
||||
def _handle_lite_agent_completed(
|
||||
self, source: object, event: LiteAgentExecutionCompletedEvent
|
||||
) -> None:
|
||||
agent_info = event.agent_info
|
||||
agent_id = str(agent_info["id"])
|
||||
|
||||
@@ -106,8 +146,12 @@ class AgentEvaluator:
|
||||
if not target_agent:
|
||||
return
|
||||
|
||||
assert state.current_agent_id is not None and state.current_task_id is not None
|
||||
trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
|
||||
assert (
|
||||
state.current_agent_id is not None and state.current_task_id is not None
|
||||
)
|
||||
trace = self.callback.get_trace(
|
||||
state.current_agent_id, state.current_task_id
|
||||
)
|
||||
|
||||
if not trace:
|
||||
return
|
||||
@@ -116,7 +160,7 @@ class AgentEvaluator:
|
||||
agent=target_agent,
|
||||
execution_trace=trace,
|
||||
final_output=event.output,
|
||||
state=state
|
||||
state=state,
|
||||
)
|
||||
|
||||
current_iteration = self._execution_state.iteration
|
||||
@@ -124,10 +168,17 @@ class AgentEvaluator:
|
||||
self._execution_state.iterations_results[current_iteration] = {}
|
||||
|
||||
agent_role = target_agent.role
|
||||
if agent_role not in self._execution_state.iterations_results[current_iteration]:
|
||||
self._execution_state.iterations_results[current_iteration][agent_role] = []
|
||||
if (
|
||||
agent_role
|
||||
not in self._execution_state.iterations_results[current_iteration]
|
||||
):
|
||||
self._execution_state.iterations_results[current_iteration][
|
||||
agent_role
|
||||
] = []
|
||||
|
||||
self._execution_state.iterations_results[current_iteration][agent_role].append(result)
|
||||
self._execution_state.iterations_results[current_iteration][
|
||||
agent_role
|
||||
].append(result)
|
||||
|
||||
def set_iteration(self, iteration: int) -> None:
|
||||
self._execution_state.iteration = iteration
|
||||
@@ -136,14 +187,26 @@ class AgentEvaluator:
|
||||
self._execution_state.iterations_results = {}
|
||||
|
||||
def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
|
||||
if self._execution_state.iterations_results and self._execution_state.iteration in self._execution_state.iterations_results:
|
||||
return self._execution_state.iterations_results[self._execution_state.iteration]
|
||||
if (
|
||||
self._execution_state.iterations_results
|
||||
and self._execution_state.iteration
|
||||
in self._execution_state.iterations_results
|
||||
):
|
||||
return self._execution_state.iterations_results[
|
||||
self._execution_state.iteration
|
||||
]
|
||||
return {}
|
||||
|
||||
def display_results_with_iterations(self) -> None:
|
||||
self.display_formatter.display_summary_results(self._execution_state.iterations_results)
|
||||
self.display_formatter.display_summary_results(
|
||||
self._execution_state.iterations_results
|
||||
)
|
||||
|
||||
def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = True) -> dict[str, AgentAggregatedEvaluationResult]:
|
||||
def get_agent_evaluation(
|
||||
self,
|
||||
strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE,
|
||||
include_evaluation_feedback: bool = True,
|
||||
) -> dict[str, AgentAggregatedEvaluationResult]:
|
||||
agent_results = {}
|
||||
with crewai_event_bus.scoped_handlers():
|
||||
task_results = self.get_evaluation_results()
|
||||
@@ -157,13 +220,16 @@ class AgentEvaluator:
|
||||
agent_id=agent_id,
|
||||
agent_role=agent_role,
|
||||
results=results,
|
||||
strategy=strategy
|
||||
strategy=strategy,
|
||||
)
|
||||
|
||||
agent_results[agent_role] = aggregated_result
|
||||
|
||||
|
||||
if self._execution_state.iterations_results and self._execution_state.iteration == max(self._execution_state.iterations_results.keys(), default=0):
|
||||
if (
|
||||
self._execution_state.iterations_results
|
||||
and self._execution_state.iteration
|
||||
== max(self._execution_state.iterations_results.keys(), default=0)
|
||||
):
|
||||
self.display_results_with_iterations()
|
||||
|
||||
if include_evaluation_feedback:
|
||||
@@ -172,7 +238,9 @@ class AgentEvaluator:
|
||||
return agent_results
|
||||
|
||||
def display_evaluation_with_feedback(self) -> None:
|
||||
self.display_formatter.display_evaluation_with_feedback(self._execution_state.iterations_results)
|
||||
self.display_formatter.display_evaluation_with_feedback(
|
||||
self._execution_state.iterations_results
|
||||
)
|
||||
|
||||
def evaluate(
|
||||
self,
|
||||
@@ -184,46 +252,91 @@ class AgentEvaluator:
|
||||
) -> AgentEvaluationResult:
|
||||
result = AgentEvaluationResult(
|
||||
agent_id=state.current_agent_id or str(agent.id),
|
||||
task_id=state.current_task_id or (str(task.id) if task else "unknown_task")
|
||||
task_id=state.current_task_id or (str(task.id) if task else "unknown_task"),
|
||||
)
|
||||
|
||||
assert self.evaluators is not None
|
||||
task_id = str(task.id) if task else None
|
||||
for evaluator in self.evaluators:
|
||||
try:
|
||||
self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id)
|
||||
self.emit_evaluation_started_event(
|
||||
agent_role=agent.role, agent_id=str(agent.id), task_id=task_id
|
||||
)
|
||||
score = evaluator.evaluate(
|
||||
agent=agent,
|
||||
task=task,
|
||||
execution_trace=execution_trace,
|
||||
final_output=final_output
|
||||
final_output=final_output,
|
||||
)
|
||||
result.metrics[evaluator.metric_category] = score
|
||||
self.emit_evaluation_completed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, metric_category=evaluator.metric_category, score=score)
|
||||
self.emit_evaluation_completed_event(
|
||||
agent_role=agent.role,
|
||||
agent_id=str(agent.id),
|
||||
task_id=task_id,
|
||||
metric_category=evaluator.metric_category,
|
||||
score=score,
|
||||
)
|
||||
except Exception as e:
|
||||
self.emit_evaluation_failed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, error=str(e))
|
||||
self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")
|
||||
self.emit_evaluation_failed_event(
|
||||
agent_role=agent.role,
|
||||
agent_id=str(agent.id),
|
||||
task_id=task_id,
|
||||
error=str(e),
|
||||
)
|
||||
self.console_formatter.print(
|
||||
f"Error in {evaluator.metric_category.value} evaluator: {str(e)}"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
def emit_evaluation_started_event(self, agent_role: str, agent_id: str, task_id: str | None = None):
|
||||
def emit_evaluation_started_event(
|
||||
self, agent_role: str, agent_id: str, task_id: str | None = None
|
||||
):
|
||||
crewai_event_bus.emit(
|
||||
self,
|
||||
AgentEvaluationStartedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration)
|
||||
AgentEvaluationStartedEvent(
|
||||
agent_role=agent_role,
|
||||
agent_id=agent_id,
|
||||
task_id=task_id,
|
||||
iteration=self._execution_state.iteration,
|
||||
),
|
||||
)
|
||||
|
||||
def emit_evaluation_completed_event(self, agent_role: str, agent_id: str, task_id: str | None = None, metric_category: MetricCategory | None = None, score: EvaluationScore | None = None):
|
||||
def emit_evaluation_completed_event(
|
||||
self,
|
||||
agent_role: str,
|
||||
agent_id: str,
|
||||
task_id: str | None = None,
|
||||
metric_category: MetricCategory | None = None,
|
||||
score: EvaluationScore | None = None,
|
||||
):
|
||||
crewai_event_bus.emit(
|
||||
self,
|
||||
AgentEvaluationCompletedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, metric_category=metric_category, score=score)
|
||||
AgentEvaluationCompletedEvent(
|
||||
agent_role=agent_role,
|
||||
agent_id=agent_id,
|
||||
task_id=task_id,
|
||||
iteration=self._execution_state.iteration,
|
||||
metric_category=metric_category,
|
||||
score=score,
|
||||
),
|
||||
)
|
||||
|
||||
def emit_evaluation_failed_event(self, agent_role: str, agent_id: str, error: str, task_id: str | None = None):
|
||||
def emit_evaluation_failed_event(
|
||||
self, agent_role: str, agent_id: str, error: str, task_id: str | None = None
|
||||
):
|
||||
crewai_event_bus.emit(
|
||||
self,
|
||||
AgentEvaluationFailedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, error=error)
|
||||
AgentEvaluationFailedEvent(
|
||||
agent_role=agent_role,
|
||||
agent_id=agent_id,
|
||||
task_id=task_id,
|
||||
iteration=self._execution_state.iteration,
|
||||
error=error,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def create_default_evaluator(agents: list[Agent], llm: None = None):
|
||||
from crewai.experimental.evaluation import (
|
||||
GoalAlignmentEvaluator,
|
||||
@@ -231,7 +344,7 @@ def create_default_evaluator(agents: list[Agent], llm: None = None):
|
||||
ToolSelectionEvaluator,
|
||||
ParameterExtractionEvaluator,
|
||||
ToolInvocationEvaluator,
|
||||
ReasoningEfficiencyEvaluator
|
||||
ReasoningEfficiencyEvaluator,
|
||||
)
|
||||
|
||||
evaluators = [
|
||||
|
||||
@@ -3,18 +3,28 @@ from typing import Dict, Any, List
|
||||
from rich.table import Table
|
||||
from rich.box import HEAVY_EDGE, ROUNDED
|
||||
from collections.abc import Sequence
|
||||
from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
|
||||
from crewai.experimental.evaluation.base_evaluator import (
|
||||
AgentAggregatedEvaluationResult,
|
||||
AggregationStrategy,
|
||||
AgentEvaluationResult,
|
||||
MetricCategory,
|
||||
)
|
||||
from crewai.experimental.evaluation import EvaluationScore
|
||||
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
|
||||
from crewai.events.utils.console_formatter import ConsoleFormatter
|
||||
from crewai.utilities.llm_utils import create_llm
|
||||
|
||||
|
||||
class EvaluationDisplayFormatter:
|
||||
def __init__(self):
|
||||
self.console_formatter = ConsoleFormatter()
|
||||
|
||||
def display_evaluation_with_feedback(self, iterations_results: Dict[int, Dict[str, List[Any]]]):
|
||||
def display_evaluation_with_feedback(
|
||||
self, iterations_results: Dict[int, Dict[str, List[Any]]]
|
||||
):
|
||||
if not iterations_results:
|
||||
self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
|
||||
self.console_formatter.print(
|
||||
"[yellow]No evaluation results to display[/yellow]"
|
||||
)
|
||||
return
|
||||
|
||||
all_agent_roles: set[str] = set()
|
||||
@@ -22,7 +32,9 @@ class EvaluationDisplayFormatter:
|
||||
all_agent_roles.update(iter_results.keys())
|
||||
|
||||
for agent_role in sorted(all_agent_roles):
|
||||
self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]")
|
||||
self.console_formatter.print(
|
||||
f"\n[bold cyan]Agent: {agent_role}[/bold cyan]"
|
||||
)
|
||||
|
||||
for iter_num, results in sorted(iterations_results.items()):
|
||||
if agent_role not in results or not results[agent_role]:
|
||||
@@ -62,9 +74,7 @@ class EvaluationDisplayFormatter:
|
||||
|
||||
table.add_section()
|
||||
table.add_row(
|
||||
metric.title(),
|
||||
score_text,
|
||||
evaluation_score.feedback or ""
|
||||
metric.title(), score_text, evaluation_score.feedback or ""
|
||||
)
|
||||
|
||||
if aggregated_result.overall_score is not None:
|
||||
@@ -82,19 +92,26 @@ class EvaluationDisplayFormatter:
|
||||
table.add_row(
|
||||
"Overall Score",
|
||||
f"[{overall_color}]{overall_score:.1f}[/]",
|
||||
"Overall agent evaluation score"
|
||||
"Overall agent evaluation score",
|
||||
)
|
||||
|
||||
self.console_formatter.print(table)
|
||||
|
||||
def display_summary_results(self, iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]]):
|
||||
def display_summary_results(
|
||||
self,
|
||||
iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]],
|
||||
):
|
||||
if not iterations_results:
|
||||
self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
|
||||
self.console_formatter.print(
|
||||
"[yellow]No evaluation results to display[/yellow]"
|
||||
)
|
||||
return
|
||||
|
||||
self.console_formatter.print("\n")
|
||||
|
||||
table = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
|
||||
table = Table(
|
||||
title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE
|
||||
)
|
||||
|
||||
table.add_column("Agent/Metric", style="cyan")
|
||||
|
||||
@@ -123,11 +140,14 @@ class EvaluationDisplayFormatter:
|
||||
agent_id=agent_id,
|
||||
agent_role=agent_role,
|
||||
results=agent_results,
|
||||
strategy=AggregationStrategy.SIMPLE_AVERAGE
|
||||
strategy=AggregationStrategy.SIMPLE_AVERAGE,
|
||||
)
|
||||
|
||||
valid_scores = [score.score for score in aggregated_result.metrics.values()
|
||||
if score.score is not None]
|
||||
valid_scores = [
|
||||
score.score
|
||||
for score in aggregated_result.metrics.values()
|
||||
if score.score is not None
|
||||
]
|
||||
if valid_scores:
|
||||
avg_score = sum(valid_scores) / len(valid_scores)
|
||||
agent_scores_by_iteration[iter_num] = avg_score
|
||||
@@ -137,7 +157,9 @@ class EvaluationDisplayFormatter:
|
||||
if not agent_scores_by_iteration:
|
||||
continue
|
||||
|
||||
avg_across_iterations = sum(agent_scores_by_iteration.values()) / len(agent_scores_by_iteration)
|
||||
avg_across_iterations = sum(agent_scores_by_iteration.values()) / len(
|
||||
agent_scores_by_iteration
|
||||
)
|
||||
|
||||
row = [f"[bold]{agent_role}[/bold]"]
|
||||
|
||||
@@ -178,9 +200,13 @@ class EvaluationDisplayFormatter:
|
||||
row = [f" - {metric.title()}"]
|
||||
|
||||
for iter_num in sorted(iterations_results.keys()):
|
||||
if (iter_num in agent_metrics_by_iteration and
|
||||
metric in agent_metrics_by_iteration[iter_num]):
|
||||
metric_score = agent_metrics_by_iteration[iter_num][metric].score
|
||||
if (
|
||||
iter_num in agent_metrics_by_iteration
|
||||
and metric in agent_metrics_by_iteration[iter_num]
|
||||
):
|
||||
metric_score = agent_metrics_by_iteration[iter_num][
|
||||
metric
|
||||
].score
|
||||
if metric_score is not None:
|
||||
metric_scores.append(metric_score)
|
||||
if metric_score >= 8.0:
|
||||
@@ -225,7 +251,9 @@ class EvaluationDisplayFormatter:
|
||||
results: Sequence[AgentEvaluationResult],
|
||||
strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE,
|
||||
) -> AgentAggregatedEvaluationResult:
|
||||
metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(list)
|
||||
metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(
|
||||
list
|
||||
)
|
||||
|
||||
for result in results:
|
||||
for metric_name, evaluation_score in result.metrics.items():
|
||||
@@ -246,19 +274,20 @@ class EvaluationDisplayFormatter:
|
||||
metric=category.title(),
|
||||
feedbacks=feedbacks,
|
||||
scores=[s.score for s in scores],
|
||||
strategy=strategy
|
||||
strategy=strategy,
|
||||
)
|
||||
else:
|
||||
feedback_summary = feedbacks[0]
|
||||
|
||||
aggregated_metrics[category] = EvaluationScore(
|
||||
score=avg_score,
|
||||
feedback=feedback_summary
|
||||
score=avg_score, feedback=feedback_summary
|
||||
)
|
||||
|
||||
overall_score = None
|
||||
if aggregated_metrics:
|
||||
valid_scores = [m.score for m in aggregated_metrics.values() if m.score is not None]
|
||||
valid_scores = [
|
||||
m.score for m in aggregated_metrics.values() if m.score is not None
|
||||
]
|
||||
if valid_scores:
|
||||
overall_score = sum(valid_scores) / len(valid_scores)
|
||||
|
||||
@@ -268,7 +297,7 @@ class EvaluationDisplayFormatter:
|
||||
metrics=aggregated_metrics,
|
||||
overall_score=overall_score,
|
||||
task_count=len(results),
|
||||
aggregation_strategy=strategy
|
||||
aggregation_strategy=strategy,
|
||||
)
|
||||
|
||||
def _summarize_feedbacks(
|
||||
@@ -277,10 +306,12 @@ class EvaluationDisplayFormatter:
|
||||
metric: str,
|
||||
feedbacks: List[str],
|
||||
scores: List[float | None],
|
||||
strategy: AggregationStrategy
|
||||
strategy: AggregationStrategy,
|
||||
) -> str:
|
||||
if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks):
|
||||
return "\n\n".join([f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)])
|
||||
return "\n\n".join(
|
||||
[f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)]
|
||||
)
|
||||
|
||||
try:
|
||||
llm = create_llm()
|
||||
@@ -290,20 +321,26 @@ class EvaluationDisplayFormatter:
|
||||
if len(feedback) > 500:
|
||||
feedback = feedback[:500] + "..."
|
||||
score_text = f"{score:.1f}" if score is not None else "N/A"
|
||||
formatted_feedbacks.append(f"Feedback #{i+1} (Score: {score_text}):\n{feedback}")
|
||||
formatted_feedbacks.append(
|
||||
f"Feedback #{i+1} (Score: {score_text}):\n{feedback}"
|
||||
)
|
||||
|
||||
all_feedbacks = "\n\n" + "\n\n---\n\n".join(formatted_feedbacks)
|
||||
|
||||
strategy_guidance = ""
|
||||
if strategy == AggregationStrategy.BEST_PERFORMANCE:
|
||||
strategy_guidance = "Focus on the highest-scoring aspects and strengths demonstrated."
|
||||
strategy_guidance = (
|
||||
"Focus on the highest-scoring aspects and strengths demonstrated."
|
||||
)
|
||||
elif strategy == AggregationStrategy.WORST_PERFORMANCE:
|
||||
strategy_guidance = "Focus on areas that need improvement and common issues across tasks."
|
||||
else:
|
||||
strategy_guidance = "Provide a balanced analysis of strengths and weaknesses across all tasks."
|
||||
|
||||
prompt = [
|
||||
{"role": "system", "content": f"""You are an expert evaluator creating a comprehensive summary of agent performance feedback.
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"""You are an expert evaluator creating a comprehensive summary of agent performance feedback.
|
||||
Your job is to synthesize multiple feedback points about the same metric across different tasks.
|
||||
|
||||
Create a concise, insightful summary that captures the key patterns and themes from all feedback.
|
||||
@@ -315,14 +352,18 @@ class EvaluationDisplayFormatter:
|
||||
3. Highlighting patterns across tasks
|
||||
4. 150-250 words in length
|
||||
|
||||
The summary should be directly usable as final feedback for the agent's performance on this metric."""},
|
||||
{"role": "user", "content": f"""I need a synthesized summary of the following feedback for:
|
||||
The summary should be directly usable as final feedback for the agent's performance on this metric.""",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"""I need a synthesized summary of the following feedback for:
|
||||
|
||||
Agent Role: {agent_role}
|
||||
Metric: {metric.title()}
|
||||
|
||||
{all_feedbacks}
|
||||
"""}
|
||||
""",
|
||||
},
|
||||
]
|
||||
assert llm is not None
|
||||
response = llm.call(prompt)
|
||||
@@ -330,4 +371,6 @@ class EvaluationDisplayFormatter:
|
||||
return response
|
||||
|
||||
except Exception:
|
||||
return "Synthesized from multiple tasks: " + "\n\n".join([f"- {fb[:500]}..." for fb in feedbacks])
|
||||
return "Synthesized from multiple tasks: " + "\n\n".join(
|
||||
[f"- {fb[:500]}..." for fb in feedbacks]
|
||||
)
|
||||
|
||||
@@ -5,25 +5,23 @@ from collections.abc import Sequence
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.utilities.events.base_event_listener import BaseEventListener
|
||||
from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus
|
||||
from crewai.utilities.events.agent_events import (
|
||||
from crewai.events.base_event_listener import BaseEventListener
|
||||
from crewai.events.event_bus import CrewAIEventsBus
|
||||
from crewai.events.types.agent_events import (
|
||||
AgentExecutionStartedEvent,
|
||||
AgentExecutionCompletedEvent,
|
||||
LiteAgentExecutionStartedEvent,
|
||||
LiteAgentExecutionCompletedEvent
|
||||
LiteAgentExecutionCompletedEvent,
|
||||
)
|
||||
from crewai.utilities.events.tool_usage_events import (
|
||||
from crewai.events.types.tool_usage_events import (
|
||||
ToolUsageFinishedEvent,
|
||||
ToolUsageErrorEvent,
|
||||
ToolExecutionErrorEvent,
|
||||
ToolSelectionErrorEvent,
|
||||
ToolValidateInputErrorEvent
|
||||
)
|
||||
from crewai.utilities.events.llm_events import (
|
||||
LLMCallStartedEvent,
|
||||
LLMCallCompletedEvent
|
||||
ToolValidateInputErrorEvent,
|
||||
)
|
||||
from crewai.events.types.llm_events import LLMCallStartedEvent, LLMCallCompletedEvent
|
||||
|
||||
|
||||
class EvaluationTraceCallback(BaseEventListener):
|
||||
"""Event listener for collecting execution traces for evaluation.
|
||||
@@ -68,27 +66,49 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
|
||||
@event_bus.on(ToolUsageFinishedEvent)
|
||||
def on_tool_completed(source, event: ToolUsageFinishedEvent):
|
||||
self.on_tool_use(event.tool_name, event.tool_args, event.output, success=True)
|
||||
self.on_tool_use(
|
||||
event.tool_name, event.tool_args, event.output, success=True
|
||||
)
|
||||
|
||||
@event_bus.on(ToolUsageErrorEvent)
|
||||
def on_tool_usage_error(source, event: ToolUsageErrorEvent):
|
||||
self.on_tool_use(event.tool_name, event.tool_args, event.error,
|
||||
success=False, error_type="usage_error")
|
||||
self.on_tool_use(
|
||||
event.tool_name,
|
||||
event.tool_args,
|
||||
event.error,
|
||||
success=False,
|
||||
error_type="usage_error",
|
||||
)
|
||||
|
||||
@event_bus.on(ToolExecutionErrorEvent)
|
||||
def on_tool_execution_error(source, event: ToolExecutionErrorEvent):
|
||||
self.on_tool_use(event.tool_name, event.tool_args, event.error,
|
||||
success=False, error_type="execution_error")
|
||||
self.on_tool_use(
|
||||
event.tool_name,
|
||||
event.tool_args,
|
||||
event.error,
|
||||
success=False,
|
||||
error_type="execution_error",
|
||||
)
|
||||
|
||||
@event_bus.on(ToolSelectionErrorEvent)
|
||||
def on_tool_selection_error(source, event: ToolSelectionErrorEvent):
|
||||
self.on_tool_use(event.tool_name, event.tool_args, event.error,
|
||||
success=False, error_type="selection_error")
|
||||
self.on_tool_use(
|
||||
event.tool_name,
|
||||
event.tool_args,
|
||||
event.error,
|
||||
success=False,
|
||||
error_type="selection_error",
|
||||
)
|
||||
|
||||
@event_bus.on(ToolValidateInputErrorEvent)
|
||||
def on_tool_validate_input_error(source, event: ToolValidateInputErrorEvent):
|
||||
self.on_tool_use(event.tool_name, event.tool_args, event.error,
|
||||
success=False, error_type="validation_error")
|
||||
self.on_tool_use(
|
||||
event.tool_name,
|
||||
event.tool_args,
|
||||
event.error,
|
||||
success=False,
|
||||
error_type="validation_error",
|
||||
)
|
||||
|
||||
@event_bus.on(LLMCallStartedEvent)
|
||||
def on_llm_call_started(source, event: LLMCallStartedEvent):
|
||||
@@ -99,7 +119,7 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
self.on_llm_call_end(event.messages, event.response)
|
||||
|
||||
def on_lite_agent_start(self, agent_info: dict[str, Any]):
|
||||
self.current_agent_id = agent_info['id']
|
||||
self.current_agent_id = agent_info["id"]
|
||||
self.current_task_id = "lite_task"
|
||||
|
||||
trace_key = f"{self.current_agent_id}_{self.current_task_id}"
|
||||
@@ -110,7 +130,7 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
tool_uses=[],
|
||||
llm_calls=[],
|
||||
start_time=datetime.now(),
|
||||
final_output=None
|
||||
final_output=None,
|
||||
)
|
||||
|
||||
def _init_trace(self, trace_key: str, **kwargs: Any):
|
||||
@@ -128,7 +148,7 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
tool_uses=[],
|
||||
llm_calls=[],
|
||||
start_time=datetime.now(),
|
||||
final_output=None
|
||||
final_output=None,
|
||||
)
|
||||
|
||||
def on_agent_finish(self, agent: Agent, task: Task, output: Any):
|
||||
@@ -151,8 +171,14 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
|
||||
self._reset_current()
|
||||
|
||||
def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any,
|
||||
success: bool = True, error_type: str | None = None):
|
||||
def on_tool_use(
|
||||
self,
|
||||
tool_name: str,
|
||||
tool_args: dict[str, Any] | str,
|
||||
result: Any,
|
||||
success: bool = True,
|
||||
error_type: str | None = None,
|
||||
):
|
||||
if not self.current_agent_id or not self.current_task_id:
|
||||
return
|
||||
|
||||
@@ -163,7 +189,7 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
"args": tool_args,
|
||||
"result": result,
|
||||
"success": success,
|
||||
"timestamp": datetime.now()
|
||||
"timestamp": datetime.now(),
|
||||
}
|
||||
|
||||
# Add error information if applicable
|
||||
@@ -173,7 +199,11 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
|
||||
self.traces[trace_key]["tool_uses"].append(tool_use)
|
||||
|
||||
def on_llm_call_start(self, messages: str | Sequence[dict[str, Any]] | None, tools: Sequence[dict[str, Any]] | None = None):
|
||||
def on_llm_call_start(
|
||||
self,
|
||||
messages: str | Sequence[dict[str, Any]] | None,
|
||||
tools: Sequence[dict[str, Any]] | None = None,
|
||||
):
|
||||
if not self.current_agent_id or not self.current_task_id:
|
||||
return
|
||||
|
||||
@@ -186,10 +216,12 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
"tools": tools,
|
||||
"start_time": datetime.now(),
|
||||
"response": None,
|
||||
"end_time": None
|
||||
"end_time": None,
|
||||
}
|
||||
|
||||
def on_llm_call_end(self, messages: str | list[dict[str, Any]] | None, response: Any):
|
||||
def on_llm_call_end(
|
||||
self, messages: str | list[dict[str, Any]] | None, response: Any
|
||||
):
|
||||
if not self.current_agent_id or not self.current_task_id:
|
||||
return
|
||||
|
||||
@@ -213,7 +245,7 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
"response": response,
|
||||
"start_time": start_time,
|
||||
"end_time": current_time,
|
||||
"total_tokens": total_tokens
|
||||
"total_tokens": total_tokens,
|
||||
}
|
||||
|
||||
self.traces[trace_key]["llm_calls"].append(llm_call)
|
||||
@@ -227,7 +259,7 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
|
||||
|
||||
def create_evaluation_callbacks() -> EvaluationTraceCallback:
|
||||
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
|
||||
from crewai.events.event_bus import crewai_event_bus
|
||||
|
||||
callback = EvaluationTraceCallback()
|
||||
callback.setup_listeners(crewai_event_bus)
|
||||
|
||||
Reference in New Issue
Block a user