mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-03 00:02:36 +00:00
fix: ensure otel span is closed
Some checks failed
Notify Downstream / notify-downstream (push) Has been cancelled
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Build uv cache / build-cache (3.10) (push) Has been cancelled
Build uv cache / build-cache (3.11) (push) Has been cancelled
Build uv cache / build-cache (3.12) (push) Has been cancelled
Build uv cache / build-cache (3.13) (push) Has been cancelled
Mark stale issues and pull requests / stale (push) Has been cancelled
Some checks failed
Notify Downstream / notify-downstream (push) Has been cancelled
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Build uv cache / build-cache (3.10) (push) Has been cancelled
Build uv cache / build-cache (3.11) (push) Has been cancelled
Build uv cache / build-cache (3.12) (push) Has been cancelled
Build uv cache / build-cache (3.13) (push) Has been cancelled
Mark stale issues and pull requests / stale (push) Has been cancelled
This commit is contained in:
@@ -56,54 +56,53 @@ class TestAgentEvaluator:
|
||||
|
||||
@pytest.mark.vcr()
|
||||
def test_evaluate_current_iteration(self, mock_crew):
|
||||
from crewai.events.types.task_events import TaskCompletedEvent
|
||||
with crewai_event_bus.scoped_handlers():
|
||||
agent_evaluator = AgentEvaluator(
|
||||
agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()]
|
||||
)
|
||||
|
||||
agent_evaluator = AgentEvaluator(
|
||||
agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()]
|
||||
)
|
||||
evaluation_condition = threading.Condition()
|
||||
evaluation_completed = False
|
||||
|
||||
evaluation_condition = threading.Condition()
|
||||
evaluation_completed = False
|
||||
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
|
||||
async def on_evaluation_completed(source, event):
|
||||
nonlocal evaluation_completed
|
||||
with evaluation_condition:
|
||||
evaluation_completed = True
|
||||
evaluation_condition.notify()
|
||||
|
||||
mock_crew.kickoff()
|
||||
|
||||
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
|
||||
async def on_evaluation_completed(source, event):
|
||||
nonlocal evaluation_completed
|
||||
with evaluation_condition:
|
||||
evaluation_completed = True
|
||||
evaluation_condition.notify()
|
||||
assert evaluation_condition.wait_for(
|
||||
lambda: evaluation_completed, timeout=5
|
||||
), "Timeout waiting for evaluation completion"
|
||||
|
||||
mock_crew.kickoff()
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
|
||||
with evaluation_condition:
|
||||
assert evaluation_condition.wait_for(
|
||||
lambda: evaluation_completed, timeout=5
|
||||
), "Timeout waiting for evaluation completion"
|
||||
assert isinstance(results, dict)
|
||||
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
(agent,) = mock_crew.agents
|
||||
(task,) = mock_crew.tasks
|
||||
|
||||
assert isinstance(results, dict)
|
||||
assert len(mock_crew.agents) == 1
|
||||
assert agent.role in results
|
||||
assert len(results[agent.role]) == 1
|
||||
|
||||
(agent,) = mock_crew.agents
|
||||
(task,) = mock_crew.tasks
|
||||
(result,) = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert len(mock_crew.agents) == 1
|
||||
assert agent.role in results
|
||||
assert len(results[agent.role]) == 1
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
|
||||
(result,) = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
(goal_alignment,) = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
(goal_alignment,) = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
|
||||
def test_create_default_evaluator(self, mock_crew):
|
||||
agent_evaluator = create_default_evaluator(agents=mock_crew.agents)
|
||||
@@ -127,154 +126,156 @@ class TestAgentEvaluator:
|
||||
|
||||
@pytest.mark.vcr()
|
||||
def test_eval_specific_agents_from_crew(self, mock_crew):
|
||||
agent = Agent(
|
||||
role="Test Agent Eval",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
)
|
||||
task = Task(
|
||||
description="Test task description",
|
||||
agent=agent,
|
||||
expected_output="Expected test output",
|
||||
)
|
||||
mock_crew.agents.append(agent)
|
||||
mock_crew.tasks.append(task)
|
||||
with crewai_event_bus.scoped_handlers():
|
||||
agent = Agent(
|
||||
role="Test Agent Eval",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
)
|
||||
task = Task(
|
||||
description="Test task description",
|
||||
agent=agent,
|
||||
expected_output="Expected test output",
|
||||
)
|
||||
mock_crew.agents.append(agent)
|
||||
mock_crew.tasks.append(task)
|
||||
|
||||
events = {}
|
||||
results_condition = threading.Condition()
|
||||
completed_event_received = False
|
||||
events = {}
|
||||
results_condition = threading.Condition()
|
||||
completed_event_received = False
|
||||
|
||||
agent_evaluator = AgentEvaluator(
|
||||
agents=[agent], evaluators=[GoalAlignmentEvaluator()]
|
||||
)
|
||||
agent_evaluator = AgentEvaluator(
|
||||
agents=[agent], evaluators=[GoalAlignmentEvaluator()]
|
||||
)
|
||||
|
||||
@crewai_event_bus.on(AgentEvaluationStartedEvent)
|
||||
async def capture_started(source, event):
|
||||
if event.agent_id == str(agent.id):
|
||||
events["started"] = event
|
||||
@crewai_event_bus.on(AgentEvaluationStartedEvent)
|
||||
async def capture_started(source, event):
|
||||
if event.agent_id == str(agent.id):
|
||||
events["started"] = event
|
||||
|
||||
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
|
||||
async def capture_completed(source, event):
|
||||
nonlocal completed_event_received
|
||||
if event.agent_id == str(agent.id):
|
||||
events["completed"] = event
|
||||
with results_condition:
|
||||
completed_event_received = True
|
||||
results_condition.notify()
|
||||
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
|
||||
async def capture_completed(source, event):
|
||||
nonlocal completed_event_received
|
||||
if event.agent_id == str(agent.id):
|
||||
events["completed"] = event
|
||||
with results_condition:
|
||||
completed_event_received = True
|
||||
results_condition.notify()
|
||||
|
||||
@crewai_event_bus.on(AgentEvaluationFailedEvent)
|
||||
def capture_failed(source, event):
|
||||
events["failed"] = event
|
||||
@crewai_event_bus.on(AgentEvaluationFailedEvent)
|
||||
def capture_failed(source, event):
|
||||
events["failed"] = event
|
||||
|
||||
mock_crew.kickoff()
|
||||
mock_crew.kickoff()
|
||||
|
||||
with results_condition:
|
||||
assert results_condition.wait_for(
|
||||
lambda: completed_event_received, timeout=5
|
||||
), "Timeout waiting for evaluation completed event"
|
||||
with results_condition:
|
||||
assert results_condition.wait_for(
|
||||
lambda: completed_event_received, timeout=5
|
||||
), "Timeout waiting for evaluation completed event"
|
||||
|
||||
assert events.keys() == {"started", "completed"}
|
||||
assert events["started"].agent_id == str(agent.id)
|
||||
assert events["started"].agent_role == agent.role
|
||||
assert events["started"].task_id == str(task.id)
|
||||
assert events["started"].iteration == 1
|
||||
assert events.keys() == {"started", "completed"}
|
||||
assert events["started"].agent_id == str(agent.id)
|
||||
assert events["started"].agent_role == agent.role
|
||||
assert events["started"].task_id == str(task.id)
|
||||
assert events["started"].iteration == 1
|
||||
|
||||
assert events["completed"].agent_id == str(agent.id)
|
||||
assert events["completed"].agent_role == agent.role
|
||||
assert events["completed"].task_id == str(task.id)
|
||||
assert events["completed"].iteration == 1
|
||||
assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
|
||||
assert isinstance(events["completed"].score, EvaluationScore)
|
||||
assert events["completed"].score.score == 5.0
|
||||
assert events["completed"].agent_id == str(agent.id)
|
||||
assert events["completed"].agent_role == agent.role
|
||||
assert events["completed"].task_id == str(task.id)
|
||||
assert events["completed"].iteration == 1
|
||||
assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
|
||||
assert isinstance(events["completed"].score, EvaluationScore)
|
||||
assert events["completed"].score.score == 5.0
|
||||
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
assert len(results.keys()) == 1
|
||||
(result,) = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
assert isinstance(results, dict)
|
||||
assert len(results.keys()) == 1
|
||||
(result,) = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
|
||||
(goal_alignment,) = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
(goal_alignment,) = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
|
||||
@pytest.mark.vcr()
|
||||
def test_failed_evaluation(self, mock_crew):
|
||||
(agent,) = mock_crew.agents
|
||||
(task,) = mock_crew.tasks
|
||||
with crewai_event_bus.scoped_handlers():
|
||||
(agent,) = mock_crew.agents
|
||||
(task,) = mock_crew.tasks
|
||||
|
||||
events: dict[str, AgentEvaluationStartedEvent | AgentEvaluationCompletedEvent | AgentEvaluationFailedEvent] = {}
|
||||
condition = threading.Condition()
|
||||
events: dict[str, AgentEvaluationStartedEvent | AgentEvaluationCompletedEvent | AgentEvaluationFailedEvent] = {}
|
||||
condition = threading.Condition()
|
||||
|
||||
@crewai_event_bus.on(AgentEvaluationStartedEvent)
|
||||
def capture_started(source, event):
|
||||
with condition:
|
||||
events["started"] = event
|
||||
condition.notify()
|
||||
@crewai_event_bus.on(AgentEvaluationStartedEvent)
|
||||
def capture_started(source, event):
|
||||
with condition:
|
||||
events["started"] = event
|
||||
condition.notify()
|
||||
|
||||
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
|
||||
def capture_completed(source, event):
|
||||
with condition:
|
||||
events["completed"] = event
|
||||
condition.notify()
|
||||
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
|
||||
def capture_completed(source, event):
|
||||
with condition:
|
||||
events["completed"] = event
|
||||
condition.notify()
|
||||
|
||||
@crewai_event_bus.on(AgentEvaluationFailedEvent)
|
||||
def capture_failed(source, event):
|
||||
with condition:
|
||||
events["failed"] = event
|
||||
condition.notify()
|
||||
@crewai_event_bus.on(AgentEvaluationFailedEvent)
|
||||
def capture_failed(source, event):
|
||||
with condition:
|
||||
events["failed"] = event
|
||||
condition.notify()
|
||||
|
||||
class FailingEvaluator(BaseEvaluator):
|
||||
metric_category = MetricCategory.GOAL_ALIGNMENT
|
||||
class FailingEvaluator(BaseEvaluator):
|
||||
metric_category = MetricCategory.GOAL_ALIGNMENT
|
||||
|
||||
def evaluate(self, agent, task, execution_trace, final_output):
|
||||
raise ValueError("Forced evaluation failure")
|
||||
def evaluate(self, agent, task, execution_trace, final_output):
|
||||
raise ValueError("Forced evaluation failure")
|
||||
|
||||
agent_evaluator = AgentEvaluator(
|
||||
agents=[agent], evaluators=[FailingEvaluator()]
|
||||
)
|
||||
mock_crew.kickoff()
|
||||
|
||||
with condition:
|
||||
success = condition.wait_for(
|
||||
lambda: "started" in events and "failed" in events,
|
||||
timeout=10,
|
||||
agent_evaluator = AgentEvaluator(
|
||||
agents=[agent], evaluators=[FailingEvaluator()]
|
||||
)
|
||||
assert success, "Timeout waiting for evaluation events"
|
||||
mock_crew.kickoff()
|
||||
|
||||
assert events.keys() == {"started", "failed"}
|
||||
assert events["started"].agent_id == str(agent.id)
|
||||
assert events["started"].agent_role == agent.role
|
||||
assert events["started"].task_id == str(task.id)
|
||||
assert events["started"].iteration == 1
|
||||
with condition:
|
||||
success = condition.wait_for(
|
||||
lambda: "started" in events and "failed" in events,
|
||||
timeout=10,
|
||||
)
|
||||
assert success, "Timeout waiting for evaluation events"
|
||||
|
||||
assert events["failed"].agent_id == str(agent.id)
|
||||
assert events["failed"].agent_role == agent.role
|
||||
assert events["failed"].task_id == str(task.id)
|
||||
assert events["failed"].iteration == 1
|
||||
assert events["failed"].error == "Forced evaluation failure"
|
||||
assert events.keys() == {"started", "failed"}
|
||||
assert events["started"].agent_id == str(agent.id)
|
||||
assert events["started"].agent_role == agent.role
|
||||
assert events["started"].task_id == str(task.id)
|
||||
assert events["started"].iteration == 1
|
||||
|
||||
# Wait for results to be stored - the event is emitted before storage
|
||||
with condition:
|
||||
success = condition.wait_for(
|
||||
lambda: agent.role in agent_evaluator.get_evaluation_results(),
|
||||
timeout=5,
|
||||
)
|
||||
assert success, "Timeout waiting for evaluation results to be stored"
|
||||
assert events["failed"].agent_id == str(agent.id)
|
||||
assert events["failed"].agent_role == agent.role
|
||||
assert events["failed"].task_id == str(task.id)
|
||||
assert events["failed"].iteration == 1
|
||||
assert events["failed"].error == "Forced evaluation failure"
|
||||
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
(result,) = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
# Wait for results to be stored - the event is emitted before storage
|
||||
with condition:
|
||||
success = condition.wait_for(
|
||||
lambda: agent.role in agent_evaluator.get_evaluation_results(),
|
||||
timeout=5,
|
||||
)
|
||||
assert success, "Timeout waiting for evaluation results to be stored"
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
(result,) = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.metrics == {}
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
|
||||
assert result.metrics == {}
|
||||
|
||||
Reference in New Issue
Block a user