fix: ensure otel span is closed

2026-05-03 00:02:36 +00:00 · 2025-12-05 13:23:26 -05:00
parent 7fff2b654c
commit f2f994612c
67 changed files with 19343 additions and 37515 deletions
--- a/lib/crewai/tests/experimental/evaluation/test_agent_evaluator.py
+++ b/lib/crewai/tests/experimental/evaluation/test_agent_evaluator.py
@@ -56,54 +56,53 @@ class TestAgentEvaluator:

    @pytest.mark.vcr()
    def test_evaluate_current_iteration(self, mock_crew):
-        from crewai.events.types.task_events import TaskCompletedEvent
+        with crewai_event_bus.scoped_handlers():
+            agent_evaluator = AgentEvaluator(
+                agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()]
+            )

-        agent_evaluator = AgentEvaluator(
-            agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()]
-        )
+            evaluation_condition = threading.Condition()
+            evaluation_completed = False

-        evaluation_condition = threading.Condition()
-        evaluation_completed = False
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            async def on_evaluation_completed(source, event):
+                nonlocal evaluation_completed
+                with evaluation_condition:
+                    evaluation_completed = True
+                    evaluation_condition.notify()
+
+            mock_crew.kickoff()

-        @crewai_event_bus.on(AgentEvaluationCompletedEvent)
-        async def on_evaluation_completed(source, event):
-            nonlocal evaluation_completed
            with evaluation_condition:
-                evaluation_completed = True
-                evaluation_condition.notify()
+                assert evaluation_condition.wait_for(
+                    lambda: evaluation_completed, timeout=5
+                ), "Timeout waiting for evaluation completion"

-        mock_crew.kickoff()
+            results = agent_evaluator.get_evaluation_results()

-        with evaluation_condition:
-            assert evaluation_condition.wait_for(
-                lambda: evaluation_completed, timeout=5
-            ), "Timeout waiting for evaluation completion"
+            assert isinstance(results, dict)

-        results = agent_evaluator.get_evaluation_results()
+            (agent,) = mock_crew.agents
+            (task,) = mock_crew.tasks

-        assert isinstance(results, dict)
+            assert len(mock_crew.agents) == 1
+            assert agent.role in results
+            assert len(results[agent.role]) == 1

-        (agent,) = mock_crew.agents
-        (task,) = mock_crew.tasks
+            (result,) = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)

-        assert len(mock_crew.agents) == 1
-        assert agent.role in results
-        assert len(results[agent.role]) == 1
+            assert result.agent_id == str(agent.id)
+            assert result.task_id == str(task.id)

-        (result,) = results[agent.role]
-        assert isinstance(result, AgentEvaluationResult)
+            (goal_alignment,) = result.metrics.values()
+            assert goal_alignment.score == 5.0

-        assert result.agent_id == str(agent.id)
-        assert result.task_id == str(task.id)
+            expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
+            assert expected_feedback in goal_alignment.feedback

-        (goal_alignment,) = result.metrics.values()
-        assert goal_alignment.score == 5.0
-
-        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
-        assert expected_feedback in goal_alignment.feedback
-
-        assert goal_alignment.raw_response is not None
-        assert '"score": 5' in goal_alignment.raw_response
+            assert goal_alignment.raw_response is not None
+            assert '"score": 5' in goal_alignment.raw_response

    def test_create_default_evaluator(self, mock_crew):
        agent_evaluator = create_default_evaluator(agents=mock_crew.agents)
@@ -127,154 +126,156 @@ class TestAgentEvaluator:

    @pytest.mark.vcr()
    def test_eval_specific_agents_from_crew(self, mock_crew):
-        agent = Agent(
-            role="Test Agent Eval",
-            goal="Complete test tasks successfully",
-            backstory="An agent created for testing purposes",
-        )
-        task = Task(
-            description="Test task description",
-            agent=agent,
-            expected_output="Expected test output",
-        )
-        mock_crew.agents.append(agent)
-        mock_crew.tasks.append(task)
+        with crewai_event_bus.scoped_handlers():
+            agent = Agent(
+                role="Test Agent Eval",
+                goal="Complete test tasks successfully",
+                backstory="An agent created for testing purposes",
+            )
+            task = Task(
+                description="Test task description",
+                agent=agent,
+                expected_output="Expected test output",
+            )
+            mock_crew.agents.append(agent)
+            mock_crew.tasks.append(task)

-        events = {}
-        results_condition = threading.Condition()
-        completed_event_received = False
+            events = {}
+            results_condition = threading.Condition()
+            completed_event_received = False

-        agent_evaluator = AgentEvaluator(
-            agents=[agent], evaluators=[GoalAlignmentEvaluator()]
-        )
+            agent_evaluator = AgentEvaluator(
+                agents=[agent], evaluators=[GoalAlignmentEvaluator()]
+            )

-        @crewai_event_bus.on(AgentEvaluationStartedEvent)
-        async def capture_started(source, event):
-            if event.agent_id == str(agent.id):
-                events["started"] = event
+            @crewai_event_bus.on(AgentEvaluationStartedEvent)
+            async def capture_started(source, event):
+                if event.agent_id == str(agent.id):
+                    events["started"] = event

-        @crewai_event_bus.on(AgentEvaluationCompletedEvent)
-        async def capture_completed(source, event):
-            nonlocal completed_event_received
-            if event.agent_id == str(agent.id):
-                events["completed"] = event
-                with results_condition:
-                    completed_event_received = True
-                    results_condition.notify()
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            async def capture_completed(source, event):
+                nonlocal completed_event_received
+                if event.agent_id == str(agent.id):
+                    events["completed"] = event
+                    with results_condition:
+                        completed_event_received = True
+                        results_condition.notify()

-        @crewai_event_bus.on(AgentEvaluationFailedEvent)
-        def capture_failed(source, event):
-            events["failed"] = event
+            @crewai_event_bus.on(AgentEvaluationFailedEvent)
+            def capture_failed(source, event):
+                events["failed"] = event

-        mock_crew.kickoff()
+            mock_crew.kickoff()

-        with results_condition:
-            assert results_condition.wait_for(
-                lambda: completed_event_received, timeout=5
-            ), "Timeout waiting for evaluation completed event"
+            with results_condition:
+                assert results_condition.wait_for(
+                    lambda: completed_event_received, timeout=5
+                ), "Timeout waiting for evaluation completed event"

-        assert events.keys() == {"started", "completed"}
-        assert events["started"].agent_id == str(agent.id)
-        assert events["started"].agent_role == agent.role
-        assert events["started"].task_id == str(task.id)
-        assert events["started"].iteration == 1
+            assert events.keys() == {"started", "completed"}
+            assert events["started"].agent_id == str(agent.id)
+            assert events["started"].agent_role == agent.role
+            assert events["started"].task_id == str(task.id)
+            assert events["started"].iteration == 1

-        assert events["completed"].agent_id == str(agent.id)
-        assert events["completed"].agent_role == agent.role
-        assert events["completed"].task_id == str(task.id)
-        assert events["completed"].iteration == 1
-        assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
-        assert isinstance(events["completed"].score, EvaluationScore)
-        assert events["completed"].score.score == 5.0
+            assert events["completed"].agent_id == str(agent.id)
+            assert events["completed"].agent_role == agent.role
+            assert events["completed"].task_id == str(task.id)
+            assert events["completed"].iteration == 1
+            assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
+            assert isinstance(events["completed"].score, EvaluationScore)
+            assert events["completed"].score.score == 5.0

-        results = agent_evaluator.get_evaluation_results()
+            results = agent_evaluator.get_evaluation_results()

-        assert isinstance(results, dict)
-        assert len(results.keys()) == 1
-        (result,) = results[agent.role]
-        assert isinstance(result, AgentEvaluationResult)
+            assert isinstance(results, dict)
+            assert len(results.keys()) == 1
+            (result,) = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)

-        assert result.agent_id == str(agent.id)
-        assert result.task_id == str(task.id)
+            assert result.agent_id == str(agent.id)
+            assert result.task_id == str(task.id)

-        (goal_alignment,) = result.metrics.values()
-        assert goal_alignment.score == 5.0
+            (goal_alignment,) = result.metrics.values()
+            assert goal_alignment.score == 5.0

-        expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
-        assert expected_feedback in goal_alignment.feedback
+            expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
+            assert expected_feedback in goal_alignment.feedback

-        assert goal_alignment.raw_response is not None
-        assert '"score": 5' in goal_alignment.raw_response
+            assert goal_alignment.raw_response is not None
+            assert '"score": 5' in goal_alignment.raw_response

    @pytest.mark.vcr()
    def test_failed_evaluation(self, mock_crew):
-        (agent,) = mock_crew.agents
-        (task,) = mock_crew.tasks
+        with crewai_event_bus.scoped_handlers():
+            (agent,) = mock_crew.agents
+            (task,) = mock_crew.tasks

-        events: dict[str, AgentEvaluationStartedEvent | AgentEvaluationCompletedEvent | AgentEvaluationFailedEvent] = {}
-        condition = threading.Condition()
+            events: dict[str, AgentEvaluationStartedEvent | AgentEvaluationCompletedEvent | AgentEvaluationFailedEvent] = {}
+            condition = threading.Condition()

-        @crewai_event_bus.on(AgentEvaluationStartedEvent)
-        def capture_started(source, event):
-            with condition:
-                events["started"] = event
-                condition.notify()
+            @crewai_event_bus.on(AgentEvaluationStartedEvent)
+            def capture_started(source, event):
+                with condition:
+                    events["started"] = event
+                    condition.notify()

-        @crewai_event_bus.on(AgentEvaluationCompletedEvent)
-        def capture_completed(source, event):
-            with condition:
-                events["completed"] = event
-                condition.notify()
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            def capture_completed(source, event):
+                with condition:
+                    events["completed"] = event
+                    condition.notify()

-        @crewai_event_bus.on(AgentEvaluationFailedEvent)
-        def capture_failed(source, event):
-            with condition:
-                events["failed"] = event
-                condition.notify()
+            @crewai_event_bus.on(AgentEvaluationFailedEvent)
+            def capture_failed(source, event):
+                with condition:
+                    events["failed"] = event
+                    condition.notify()

-        class FailingEvaluator(BaseEvaluator):
-            metric_category = MetricCategory.GOAL_ALIGNMENT
+            class FailingEvaluator(BaseEvaluator):
+                metric_category = MetricCategory.GOAL_ALIGNMENT

-            def evaluate(self, agent, task, execution_trace, final_output):
-                raise ValueError("Forced evaluation failure")
+                def evaluate(self, agent, task, execution_trace, final_output):
+                    raise ValueError("Forced evaluation failure")

-        agent_evaluator = AgentEvaluator(
-            agents=[agent], evaluators=[FailingEvaluator()]
-        )
-        mock_crew.kickoff()
-
-        with condition:
-            success = condition.wait_for(
-                lambda: "started" in events and "failed" in events,
-                timeout=10,
+            agent_evaluator = AgentEvaluator(
+                agents=[agent], evaluators=[FailingEvaluator()]
            )
-        assert success, "Timeout waiting for evaluation events"
+            mock_crew.kickoff()

-        assert events.keys() == {"started", "failed"}
-        assert events["started"].agent_id == str(agent.id)
-        assert events["started"].agent_role == agent.role
-        assert events["started"].task_id == str(task.id)
-        assert events["started"].iteration == 1
+            with condition:
+                success = condition.wait_for(
+                    lambda: "started" in events and "failed" in events,
+                    timeout=10,
+                )
+            assert success, "Timeout waiting for evaluation events"

-        assert events["failed"].agent_id == str(agent.id)
-        assert events["failed"].agent_role == agent.role
-        assert events["failed"].task_id == str(task.id)
-        assert events["failed"].iteration == 1
-        assert events["failed"].error == "Forced evaluation failure"
+            assert events.keys() == {"started", "failed"}
+            assert events["started"].agent_id == str(agent.id)
+            assert events["started"].agent_role == agent.role
+            assert events["started"].task_id == str(task.id)
+            assert events["started"].iteration == 1

-        # Wait for results to be stored - the event is emitted before storage
-        with condition:
-            success = condition.wait_for(
-                lambda: agent.role in agent_evaluator.get_evaluation_results(),
-                timeout=5,
-            )
-        assert success, "Timeout waiting for evaluation results to be stored"
+            assert events["failed"].agent_id == str(agent.id)
+            assert events["failed"].agent_role == agent.role
+            assert events["failed"].task_id == str(task.id)
+            assert events["failed"].iteration == 1
+            assert events["failed"].error == "Forced evaluation failure"

-        results = agent_evaluator.get_evaluation_results()
-        (result,) = results[agent.role]
-        assert isinstance(result, AgentEvaluationResult)
+            # Wait for results to be stored - the event is emitted before storage
+            with condition:
+                success = condition.wait_for(
+                    lambda: agent.role in agent_evaluator.get_evaluation_results(),
+                    timeout=5,
+                )
+            assert success, "Timeout waiting for evaluation results to be stored"

-        assert result.agent_id == str(agent.id)
-        assert result.task_id == str(task.id)
+            results = agent_evaluator.get_evaluation_results()
+            (result,) = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)

-        assert result.metrics == {}
+            assert result.agent_id == str(agent.id)
+            assert result.task_id == str(task.id)
+
+            assert result.metrics == {}