refactor: Move events module to crewai.events (#3425)

refactor(events): relocate events module & update imports

- Move events from utilities/ to top-level events/ with types/, listeners/, utils/ structure
- Update all source/tests/docs to new import paths
- Add backwards-compatibility stubs in crewai.utilities.events with deprecation warnings
- Restore test mocks and fix related test imports
2026-05-03 00:02:36 +00:00 · 2025-09-02 10:06:42 -04:00
parent 1b1a8fdbf4
commit 878c1a649a
81 changed files with 1094 additions and 751 deletions
--- a/src/crewai/experimental/evaluation/agent_evaluator.py
+++ b/src/crewai/experimental/evaluation/agent_evaluator.py
@@ -1,18 +1,30 @@
 import threading
 from typing import Any, Optional

-from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
+from crewai.experimental.evaluation.base_evaluator import (
+    AgentEvaluationResult,
+    AggregationStrategy,
+)
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
-from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
+from crewai.events.types.agent_events import (
+    AgentEvaluationStartedEvent,
+    AgentEvaluationCompletedEvent,
+    AgentEvaluationFailedEvent,
+)
 from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
 from collections.abc import Sequence
-from crewai.utilities.events.crewai_event_bus import crewai_event_bus
-from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
-from crewai.utilities.events.task_events import TaskCompletedEvent
-from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
-from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, EvaluationScore, MetricCategory
+from crewai.events.event_bus import crewai_event_bus
+from crewai.events.utils.console_formatter import ConsoleFormatter
+from crewai.events.types.task_events import TaskCompletedEvent
+from crewai.events.types.agent_events import LiteAgentExecutionCompletedEvent
+from crewai.experimental.evaluation.base_evaluator import (
+    AgentAggregatedEvaluationResult,
+    EvaluationScore,
+    MetricCategory,
+)
+

 class ExecutionState:
    current_agent_id: Optional[str] = None
@@ -24,6 +36,7 @@ class ExecutionState:
        self.iterations_results = {}
        self.agent_evaluators = {}

+
 class AgentEvaluator:
    def __init__(
        self,
@@ -46,27 +59,45 @@ class AgentEvaluator:

    @property
    def _execution_state(self) -> ExecutionState:
-        if not hasattr(self._thread_local, 'execution_state'):
+        if not hasattr(self._thread_local, "execution_state"):
            self._thread_local.execution_state = ExecutionState()
        return self._thread_local.execution_state

    def _subscribe_to_events(self) -> None:
        from typing import cast
-        crewai_event_bus.register_handler(TaskCompletedEvent, cast(Any, self._handle_task_completed))
-        crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, cast(Any, self._handle_lite_agent_completed))
+
+        crewai_event_bus.register_handler(
+            TaskCompletedEvent, cast(Any, self._handle_task_completed)
+        )
+        crewai_event_bus.register_handler(
+            LiteAgentExecutionCompletedEvent,
+            cast(Any, self._handle_lite_agent_completed),
+        )

    def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
        assert event.task is not None
        agent = event.task.agent
-        if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
-            self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=str(event.task.id))
+        if (
+            agent
+            and str(getattr(agent, "id", "unknown"))
+            in self._execution_state.agent_evaluators
+        ):
+            self.emit_evaluation_started_event(
+                agent_role=agent.role,
+                agent_id=str(agent.id),
+                task_id=str(event.task.id),
+            )

            state = ExecutionState()
            state.current_agent_id = str(agent.id)
            state.current_task_id = str(event.task.id)

-            assert state.current_agent_id is not None and state.current_task_id is not None
-            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
+            assert (
+                state.current_agent_id is not None and state.current_task_id is not None
+            )
+            trace = self.callback.get_trace(
+                state.current_agent_id, state.current_task_id
+            )

            if not trace:
                return
@@ -76,19 +107,28 @@ class AgentEvaluator:
                task=event.task,
                execution_trace=trace,
                final_output=event.output,
-                state=state
+                state=state,
            )

            current_iteration = self._execution_state.iteration
            if current_iteration not in self._execution_state.iterations_results:
                self._execution_state.iterations_results[current_iteration] = {}

-            if agent.role not in self._execution_state.iterations_results[current_iteration]:
-                self._execution_state.iterations_results[current_iteration][agent.role] = []
+            if (
+                agent.role
+                not in self._execution_state.iterations_results[current_iteration]
+            ):
+                self._execution_state.iterations_results[current_iteration][
+                    agent.role
+                ] = []

-            self._execution_state.iterations_results[current_iteration][agent.role].append(result)
+            self._execution_state.iterations_results[current_iteration][
+                agent.role
+            ].append(result)

-    def _handle_lite_agent_completed(self, source: object, event: LiteAgentExecutionCompletedEvent) -> None:
+    def _handle_lite_agent_completed(
+        self, source: object, event: LiteAgentExecutionCompletedEvent
+    ) -> None:
        agent_info = event.agent_info
        agent_id = str(agent_info["id"])

@@ -106,8 +146,12 @@ class AgentEvaluator:
            if not target_agent:
                return

-            assert state.current_agent_id is not None and state.current_task_id is not None
-            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
+            assert (
+                state.current_agent_id is not None and state.current_task_id is not None
+            )
+            trace = self.callback.get_trace(
+                state.current_agent_id, state.current_task_id
+            )

            if not trace:
                return
@@ -116,7 +160,7 @@ class AgentEvaluator:
                agent=target_agent,
                execution_trace=trace,
                final_output=event.output,
-                state=state
+                state=state,
            )

            current_iteration = self._execution_state.iteration
@@ -124,10 +168,17 @@ class AgentEvaluator:
                self._execution_state.iterations_results[current_iteration] = {}

            agent_role = target_agent.role
-            if agent_role not in self._execution_state.iterations_results[current_iteration]:
-                self._execution_state.iterations_results[current_iteration][agent_role] = []
+            if (
+                agent_role
+                not in self._execution_state.iterations_results[current_iteration]
+            ):
+                self._execution_state.iterations_results[current_iteration][
+                    agent_role
+                ] = []

-            self._execution_state.iterations_results[current_iteration][agent_role].append(result)
+            self._execution_state.iterations_results[current_iteration][
+                agent_role
+            ].append(result)

    def set_iteration(self, iteration: int) -> None:
        self._execution_state.iteration = iteration
@@ -136,14 +187,26 @@ class AgentEvaluator:
        self._execution_state.iterations_results = {}

    def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
-        if self._execution_state.iterations_results and self._execution_state.iteration in self._execution_state.iterations_results:
-            return self._execution_state.iterations_results[self._execution_state.iteration]
+        if (
+            self._execution_state.iterations_results
+            and self._execution_state.iteration
+            in self._execution_state.iterations_results
+        ):
+            return self._execution_state.iterations_results[
+                self._execution_state.iteration
+            ]
        return {}

    def display_results_with_iterations(self) -> None:
-        self.display_formatter.display_summary_results(self._execution_state.iterations_results)
+        self.display_formatter.display_summary_results(
+            self._execution_state.iterations_results
+        )

-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = True) -> dict[str, AgentAggregatedEvaluationResult]:
+    def get_agent_evaluation(
+        self,
+        strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE,
+        include_evaluation_feedback: bool = True,
+    ) -> dict[str, AgentAggregatedEvaluationResult]:
        agent_results = {}
        with crewai_event_bus.scoped_handlers():
            task_results = self.get_evaluation_results()
@@ -157,13 +220,16 @@ class AgentEvaluator:
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=results,
-                    strategy=strategy
+                    strategy=strategy,
                )

                agent_results[agent_role] = aggregated_result

-
-            if self._execution_state.iterations_results and self._execution_state.iteration == max(self._execution_state.iterations_results.keys(), default=0):
+            if (
+                self._execution_state.iterations_results
+                and self._execution_state.iteration
+                == max(self._execution_state.iterations_results.keys(), default=0)
+            ):
                self.display_results_with_iterations()

            if include_evaluation_feedback:
@@ -172,7 +238,9 @@ class AgentEvaluator:
        return agent_results

    def display_evaluation_with_feedback(self) -> None:
-        self.display_formatter.display_evaluation_with_feedback(self._execution_state.iterations_results)
+        self.display_formatter.display_evaluation_with_feedback(
+            self._execution_state.iterations_results
+        )

    def evaluate(
        self,
@@ -184,46 +252,91 @@ class AgentEvaluator:
    ) -> AgentEvaluationResult:
        result = AgentEvaluationResult(
            agent_id=state.current_agent_id or str(agent.id),
-            task_id=state.current_task_id or (str(task.id) if task else "unknown_task")
+            task_id=state.current_task_id or (str(task.id) if task else "unknown_task"),
        )

        assert self.evaluators is not None
        task_id = str(task.id) if task else None
        for evaluator in self.evaluators:
            try:
-                self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id)
+                self.emit_evaluation_started_event(
+                    agent_role=agent.role, agent_id=str(agent.id), task_id=task_id
+                )
                score = evaluator.evaluate(
                    agent=agent,
                    task=task,
                    execution_trace=execution_trace,
-                    final_output=final_output
+                    final_output=final_output,
                )
                result.metrics[evaluator.metric_category] = score
-                self.emit_evaluation_completed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, metric_category=evaluator.metric_category, score=score)
+                self.emit_evaluation_completed_event(
+                    agent_role=agent.role,
+                    agent_id=str(agent.id),
+                    task_id=task_id,
+                    metric_category=evaluator.metric_category,
+                    score=score,
+                )
            except Exception as e:
-                self.emit_evaluation_failed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, error=str(e))
-                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")
+                self.emit_evaluation_failed_event(
+                    agent_role=agent.role,
+                    agent_id=str(agent.id),
+                    task_id=task_id,
+                    error=str(e),
+                )
+                self.console_formatter.print(
+                    f"Error in {evaluator.metric_category.value} evaluator: {str(e)}"
+                )

        return result

-    def emit_evaluation_started_event(self, agent_role: str, agent_id: str, task_id: str | None = None):
+    def emit_evaluation_started_event(
+        self, agent_role: str, agent_id: str, task_id: str | None = None
+    ):
        crewai_event_bus.emit(
            self,
-            AgentEvaluationStartedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration)
+            AgentEvaluationStartedEvent(
+                agent_role=agent_role,
+                agent_id=agent_id,
+                task_id=task_id,
+                iteration=self._execution_state.iteration,
+            ),
        )

-    def emit_evaluation_completed_event(self, agent_role: str, agent_id: str, task_id: str | None = None, metric_category: MetricCategory | None = None, score: EvaluationScore | None = None):
+    def emit_evaluation_completed_event(
+        self,
+        agent_role: str,
+        agent_id: str,
+        task_id: str | None = None,
+        metric_category: MetricCategory | None = None,
+        score: EvaluationScore | None = None,
+    ):
        crewai_event_bus.emit(
            self,
-            AgentEvaluationCompletedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, metric_category=metric_category, score=score)
+            AgentEvaluationCompletedEvent(
+                agent_role=agent_role,
+                agent_id=agent_id,
+                task_id=task_id,
+                iteration=self._execution_state.iteration,
+                metric_category=metric_category,
+                score=score,
+            ),
        )

-    def emit_evaluation_failed_event(self, agent_role: str, agent_id: str, error: str, task_id: str | None = None):
+    def emit_evaluation_failed_event(
+        self, agent_role: str, agent_id: str, error: str, task_id: str | None = None
+    ):
        crewai_event_bus.emit(
            self,
-            AgentEvaluationFailedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, error=error)
+            AgentEvaluationFailedEvent(
+                agent_role=agent_role,
+                agent_id=agent_id,
+                task_id=task_id,
+                iteration=self._execution_state.iteration,
+                error=error,
+            ),
        )

+
 def create_default_evaluator(agents: list[Agent], llm: None = None):
    from crewai.experimental.evaluation import (
        GoalAlignmentEvaluator,
@@ -231,7 +344,7 @@ def create_default_evaluator(agents: list[Agent], llm: None = None):
        ToolSelectionEvaluator,
        ParameterExtractionEvaluator,
        ToolInvocationEvaluator,
-        ReasoningEfficiencyEvaluator
+        ReasoningEfficiencyEvaluator,
    )

    evaluators = [
--- a/src/crewai/experimental/evaluation/evaluation_display.py
+++ b/src/crewai/experimental/evaluation/evaluation_display.py
@@ -3,18 +3,28 @@ from typing import Dict, Any, List
 from rich.table import Table
 from rich.box import HEAVY_EDGE, ROUNDED
 from collections.abc import Sequence
-from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
+from crewai.experimental.evaluation.base_evaluator import (
+    AgentAggregatedEvaluationResult,
+    AggregationStrategy,
+    AgentEvaluationResult,
+    MetricCategory,
+)
 from crewai.experimental.evaluation import EvaluationScore
-from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
+from crewai.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm

+
 class EvaluationDisplayFormatter:
    def __init__(self):
        self.console_formatter = ConsoleFormatter()

-    def display_evaluation_with_feedback(self, iterations_results: Dict[int, Dict[str, List[Any]]]):
+    def display_evaluation_with_feedback(
+        self, iterations_results: Dict[int, Dict[str, List[Any]]]
+    ):
        if not iterations_results:
-            self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
+            self.console_formatter.print(
+                "[yellow]No evaluation results to display[/yellow]"
+            )
            return

        all_agent_roles: set[str] = set()
@@ -22,7 +32,9 @@ class EvaluationDisplayFormatter:
            all_agent_roles.update(iter_results.keys())

        for agent_role in sorted(all_agent_roles):
-            self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]")
+            self.console_formatter.print(
+                f"\n[bold cyan]Agent: {agent_role}[/bold cyan]"
+            )

            for iter_num, results in sorted(iterations_results.items()):
                if agent_role not in results or not results[agent_role]:
@@ -62,9 +74,7 @@ class EvaluationDisplayFormatter:

                        table.add_section()
                        table.add_row(
-                            metric.title(),
-                            score_text,
-                            evaluation_score.feedback or ""
+                            metric.title(), score_text, evaluation_score.feedback or ""
                        )

                if aggregated_result.overall_score is not None:
@@ -82,19 +92,26 @@ class EvaluationDisplayFormatter:
                    table.add_row(
                        "Overall Score",
                        f"[{overall_color}]{overall_score:.1f}[/]",
-                        "Overall agent evaluation score"
+                        "Overall agent evaluation score",
                    )

                self.console_formatter.print(table)

-    def display_summary_results(self, iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]]):
+    def display_summary_results(
+        self,
+        iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]],
+    ):
        if not iterations_results:
-            self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
+            self.console_formatter.print(
+                "[yellow]No evaluation results to display[/yellow]"
+            )
            return

        self.console_formatter.print("\n")

-        table = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
+        table = Table(
+            title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE
+        )

        table.add_column("Agent/Metric", style="cyan")

@@ -123,11 +140,14 @@ class EvaluationDisplayFormatter:
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=agent_results,
-                    strategy=AggregationStrategy.SIMPLE_AVERAGE
+                    strategy=AggregationStrategy.SIMPLE_AVERAGE,
                )

-                valid_scores = [score.score for score in aggregated_result.metrics.values()
-                               if score.score is not None]
+                valid_scores = [
+                    score.score
+                    for score in aggregated_result.metrics.values()
+                    if score.score is not None
+                ]
                if valid_scores:
                    avg_score = sum(valid_scores) / len(valid_scores)
                    agent_scores_by_iteration[iter_num] = avg_score
@@ -137,7 +157,9 @@ class EvaluationDisplayFormatter:
            if not agent_scores_by_iteration:
                continue

-            avg_across_iterations = sum(agent_scores_by_iteration.values()) / len(agent_scores_by_iteration)
+            avg_across_iterations = sum(agent_scores_by_iteration.values()) / len(
+                agent_scores_by_iteration
+            )

            row = [f"[bold]{agent_role}[/bold]"]

@@ -178,9 +200,13 @@ class EvaluationDisplayFormatter:
                row = [f"  - {metric.title()}"]

                for iter_num in sorted(iterations_results.keys()):
-                    if (iter_num in agent_metrics_by_iteration and
-                            metric in agent_metrics_by_iteration[iter_num]):
-                        metric_score = agent_metrics_by_iteration[iter_num][metric].score
+                    if (
+                        iter_num in agent_metrics_by_iteration
+                        and metric in agent_metrics_by_iteration[iter_num]
+                    ):
+                        metric_score = agent_metrics_by_iteration[iter_num][
+                            metric
+                        ].score
                        if metric_score is not None:
                            metric_scores.append(metric_score)
                            if metric_score >= 8.0:
@@ -225,7 +251,9 @@ class EvaluationDisplayFormatter:
        results: Sequence[AgentEvaluationResult],
        strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE,
    ) -> AgentAggregatedEvaluationResult:
-        metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(list)
+        metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(
+            list
+        )

        for result in results:
            for metric_name, evaluation_score in result.metrics.items():
@@ -246,19 +274,20 @@ class EvaluationDisplayFormatter:
                        metric=category.title(),
                        feedbacks=feedbacks,
                        scores=[s.score for s in scores],
-                        strategy=strategy
+                        strategy=strategy,
                    )
                else:
                    feedback_summary = feedbacks[0]

            aggregated_metrics[category] = EvaluationScore(
-                score=avg_score,
-                feedback=feedback_summary
+                score=avg_score, feedback=feedback_summary
            )

        overall_score = None
        if aggregated_metrics:
-            valid_scores = [m.score for m in aggregated_metrics.values() if m.score is not None]
+            valid_scores = [
+                m.score for m in aggregated_metrics.values() if m.score is not None
+            ]
            if valid_scores:
                overall_score = sum(valid_scores) / len(valid_scores)

@@ -268,7 +297,7 @@ class EvaluationDisplayFormatter:
            metrics=aggregated_metrics,
            overall_score=overall_score,
            task_count=len(results),
-            aggregation_strategy=strategy
+            aggregation_strategy=strategy,
        )

    def _summarize_feedbacks(
@@ -277,10 +306,12 @@ class EvaluationDisplayFormatter:
        metric: str,
        feedbacks: List[str],
        scores: List[float | None],
-        strategy: AggregationStrategy
+        strategy: AggregationStrategy,
    ) -> str:
        if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks):
-            return "\n\n".join([f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)])
+            return "\n\n".join(
+                [f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)]
+            )

        try:
            llm = create_llm()
@@ -290,20 +321,26 @@ class EvaluationDisplayFormatter:
                if len(feedback) > 500:
                    feedback = feedback[:500] + "..."
                score_text = f"{score:.1f}" if score is not None else "N/A"
-                formatted_feedbacks.append(f"Feedback #{i+1} (Score: {score_text}):\n{feedback}")
+                formatted_feedbacks.append(
+                    f"Feedback #{i+1} (Score: {score_text}):\n{feedback}"
+                )

            all_feedbacks = "\n\n" + "\n\n---\n\n".join(formatted_feedbacks)

            strategy_guidance = ""
            if strategy == AggregationStrategy.BEST_PERFORMANCE:
-                strategy_guidance = "Focus on the highest-scoring aspects and strengths demonstrated."
+                strategy_guidance = (
+                    "Focus on the highest-scoring aspects and strengths demonstrated."
+                )
            elif strategy == AggregationStrategy.WORST_PERFORMANCE:
                strategy_guidance = "Focus on areas that need improvement and common issues across tasks."
            else:
                strategy_guidance = "Provide a balanced analysis of strengths and weaknesses across all tasks."

            prompt = [
-                {"role": "system", "content": f"""You are an expert evaluator creating a comprehensive summary of agent performance feedback.
+                {
+                    "role": "system",
+                    "content": f"""You are an expert evaluator creating a comprehensive summary of agent performance feedback.
                Your job is to synthesize multiple feedback points about the same metric across different tasks.

                Create a concise, insightful summary that captures the key patterns and themes from all feedback.
@@ -315,14 +352,18 @@ class EvaluationDisplayFormatter:
                3. Highlighting patterns across tasks
                4. 150-250 words in length

-                The summary should be directly usable as final feedback for the agent's performance on this metric."""},
-                {"role": "user", "content": f"""I need a synthesized summary of the following feedback for:
+                The summary should be directly usable as final feedback for the agent's performance on this metric.""",
+                },
+                {
+                    "role": "user",
+                    "content": f"""I need a synthesized summary of the following feedback for:

                Agent Role: {agent_role}
                Metric: {metric.title()}

                {all_feedbacks}
-                """}
+                """,
+                },
            ]
            assert llm is not None
            response = llm.call(prompt)
@@ -330,4 +371,6 @@ class EvaluationDisplayFormatter:
            return response

        except Exception:
-            return "Synthesized from multiple tasks: " + "\n\n".join([f"- {fb[:500]}..." for fb in feedbacks])
+            return "Synthesized from multiple tasks: " + "\n\n".join(
+                [f"- {fb[:500]}..." for fb in feedbacks]
+            )
--- a/src/crewai/experimental/evaluation/evaluation_listener.py
+++ b/src/crewai/experimental/evaluation/evaluation_listener.py
@@ -5,25 +5,23 @@ from collections.abc import Sequence

 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.utilities.events.base_event_listener import BaseEventListener
-from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus
-from crewai.utilities.events.agent_events import (
+from crewai.events.base_event_listener import BaseEventListener
+from crewai.events.event_bus import CrewAIEventsBus
+from crewai.events.types.agent_events import (
    AgentExecutionStartedEvent,
    AgentExecutionCompletedEvent,
    LiteAgentExecutionStartedEvent,
-    LiteAgentExecutionCompletedEvent
+    LiteAgentExecutionCompletedEvent,
 )
-from crewai.utilities.events.tool_usage_events import (
+from crewai.events.types.tool_usage_events import (
    ToolUsageFinishedEvent,
    ToolUsageErrorEvent,
    ToolExecutionErrorEvent,
    ToolSelectionErrorEvent,
-    ToolValidateInputErrorEvent
-)
-from crewai.utilities.events.llm_events import (
-    LLMCallStartedEvent,
-    LLMCallCompletedEvent
+    ToolValidateInputErrorEvent,
 )
+from crewai.events.types.llm_events import LLMCallStartedEvent, LLMCallCompletedEvent
+

 class EvaluationTraceCallback(BaseEventListener):
    """Event listener for collecting execution traces for evaluation.
@@ -68,27 +66,49 @@ class EvaluationTraceCallback(BaseEventListener):

        @event_bus.on(ToolUsageFinishedEvent)
        def on_tool_completed(source, event: ToolUsageFinishedEvent):
-            self.on_tool_use(event.tool_name, event.tool_args, event.output, success=True)
+            self.on_tool_use(
+                event.tool_name, event.tool_args, event.output, success=True
+            )

        @event_bus.on(ToolUsageErrorEvent)
        def on_tool_usage_error(source, event: ToolUsageErrorEvent):
-            self.on_tool_use(event.tool_name, event.tool_args, event.error,
-                           success=False, error_type="usage_error")
+            self.on_tool_use(
+                event.tool_name,
+                event.tool_args,
+                event.error,
+                success=False,
+                error_type="usage_error",
+            )

        @event_bus.on(ToolExecutionErrorEvent)
        def on_tool_execution_error(source, event: ToolExecutionErrorEvent):
-            self.on_tool_use(event.tool_name, event.tool_args, event.error,
-                           success=False, error_type="execution_error")
+            self.on_tool_use(
+                event.tool_name,
+                event.tool_args,
+                event.error,
+                success=False,
+                error_type="execution_error",
+            )

        @event_bus.on(ToolSelectionErrorEvent)
        def on_tool_selection_error(source, event: ToolSelectionErrorEvent):
-            self.on_tool_use(event.tool_name, event.tool_args, event.error,
-                           success=False, error_type="selection_error")
+            self.on_tool_use(
+                event.tool_name,
+                event.tool_args,
+                event.error,
+                success=False,
+                error_type="selection_error",
+            )

        @event_bus.on(ToolValidateInputErrorEvent)
        def on_tool_validate_input_error(source, event: ToolValidateInputErrorEvent):
-            self.on_tool_use(event.tool_name, event.tool_args, event.error,
-                           success=False, error_type="validation_error")
+            self.on_tool_use(
+                event.tool_name,
+                event.tool_args,
+                event.error,
+                success=False,
+                error_type="validation_error",
+            )

        @event_bus.on(LLMCallStartedEvent)
        def on_llm_call_started(source, event: LLMCallStartedEvent):
@@ -99,7 +119,7 @@ class EvaluationTraceCallback(BaseEventListener):
            self.on_llm_call_end(event.messages, event.response)

    def on_lite_agent_start(self, agent_info: dict[str, Any]):
-        self.current_agent_id = agent_info['id']
+        self.current_agent_id = agent_info["id"]
        self.current_task_id = "lite_task"

        trace_key = f"{self.current_agent_id}_{self.current_task_id}"
@@ -110,7 +130,7 @@ class EvaluationTraceCallback(BaseEventListener):
            tool_uses=[],
            llm_calls=[],
            start_time=datetime.now(),
-            final_output=None
+            final_output=None,
        )

    def _init_trace(self, trace_key: str, **kwargs: Any):
@@ -128,7 +148,7 @@ class EvaluationTraceCallback(BaseEventListener):
            tool_uses=[],
            llm_calls=[],
            start_time=datetime.now(),
-            final_output=None
+            final_output=None,
        )

    def on_agent_finish(self, agent: Agent, task: Task, output: Any):
@@ -151,8 +171,14 @@ class EvaluationTraceCallback(BaseEventListener):

        self._reset_current()

-    def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any,
-                   success: bool = True, error_type: str | None = None):
+    def on_tool_use(
+        self,
+        tool_name: str,
+        tool_args: dict[str, Any] | str,
+        result: Any,
+        success: bool = True,
+        error_type: str | None = None,
+    ):
        if not self.current_agent_id or not self.current_task_id:
            return

@@ -163,7 +189,7 @@ class EvaluationTraceCallback(BaseEventListener):
                "args": tool_args,
                "result": result,
                "success": success,
-                "timestamp": datetime.now()
+                "timestamp": datetime.now(),
            }

            # Add error information if applicable
@@ -173,7 +199,11 @@ class EvaluationTraceCallback(BaseEventListener):

            self.traces[trace_key]["tool_uses"].append(tool_use)

-    def on_llm_call_start(self, messages: str | Sequence[dict[str, Any]] | None, tools: Sequence[dict[str, Any]] | None = None):
+    def on_llm_call_start(
+        self,
+        messages: str | Sequence[dict[str, Any]] | None,
+        tools: Sequence[dict[str, Any]] | None = None,
+    ):
        if not self.current_agent_id or not self.current_task_id:
            return

@@ -186,10 +216,12 @@ class EvaluationTraceCallback(BaseEventListener):
            "tools": tools,
            "start_time": datetime.now(),
            "response": None,
-            "end_time": None
+            "end_time": None,
        }

-    def on_llm_call_end(self, messages: str | list[dict[str, Any]] | None, response: Any):
+    def on_llm_call_end(
+        self, messages: str | list[dict[str, Any]] | None, response: Any
+    ):
        if not self.current_agent_id or not self.current_task_id:
            return

@@ -213,7 +245,7 @@ class EvaluationTraceCallback(BaseEventListener):
            "response": response,
            "start_time": start_time,
            "end_time": current_time,
-            "total_tokens": total_tokens
+            "total_tokens": total_tokens,
        }

        self.traces[trace_key]["llm_calls"].append(llm_call)
@@ -227,7 +259,7 @@ class EvaluationTraceCallback(BaseEventListener):


 def create_evaluation_callbacks() -> EvaluationTraceCallback:
-    from crewai.utilities.events.crewai_event_bus import crewai_event_bus
+    from crewai.events.event_bus import crewai_event_bus

    callback = EvaluationTraceCallback()
    callback.setup_listeners(crewai_event_bus)