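"""Tests for the experimental AgentEvaluator.

Covers iteration tracking, evaluating a single crew iteration, and the
create_default_evaluator factory.
"""
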
import pytest

from crewai.agent import Agent
from crewai.task import Task
from crewai.crew import Crew
from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
from crewai.experimental.evaluation import (
    GoalAlignmentEvaluator,
    SemanticQualityEvaluator,
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator,
    ReasoningEfficiencyEvaluator
)

from crewai.experimental.evaluation import create_default_evaluator


class TestAgentEvaluator:
    @pytest.fixture
    def mock_crew(self):
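        """A single-agent, single-task crew shared by the evaluation tests."""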
        agent = Agent(
            role="Test Agent",
            goal="Complete test tasks successfully",
            backstory="An agent created for testing purposes",
            allow_delegation=False,
            verbose=False
        )

        task = Task(
            description="Test task description",
            agent=agent,
            expected_output="Expected test output"
        )

        crew = Crew(
            agents=[agent],
            tasks=[task]
        )
        return crew

    def test_set_iteration(self):
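        """set_iteration should be reflected in the evaluator's internal execution state."""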
        agent_evaluator = AgentEvaluator()

        agent_evaluator.set_iteration(3)

        assert agent_evaluator._execution_state.iteration == 3

    @pytest.mark.vcr(filter_headers=["authorization"])
    def test_evaluate_current_iteration(self, mock_crew):
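        """A single GoalAlignmentEvaluator run should yield one result per agent.

        The hard-coded score and feedback assertions below match the recorded
        VCR cassette for this test, not live LLM output.
        """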
        agent_evaluator = AgentEvaluator(crew=mock_crew, evaluators=[GoalAlignmentEvaluator()])

        mock_crew.kickoff()

        results = agent_evaluator.evaluate_current_iteration()

        assert isinstance(results, dict)

        agent, = mock_crew.agents
        task, = mock_crew.tasks

        assert len(mock_crew.agents) == 1
        assert agent.role in results
        assert len(results[agent.role]) == 1

        result, = results[agent.role]
        assert isinstance(result, AgentEvaluationResult)

        assert result.agent_id == str(agent.id)
        assert result.task_id == str(task.id)

        goal_alignment, = result.metrics.values()
        assert goal_alignment.score == 5.0

        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
        assert expected_feedback in goal_alignment.feedback

        assert goal_alignment.raw_response is not None
        assert '"score": 5' in goal_alignment.raw_response

    def test_create_default_evaluator(self, mock_crew):
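        """create_default_evaluator should register every built-in evaluator, in order."""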
        agent_evaluator = create_default_evaluator(crew=mock_crew)

        assert isinstance(agent_evaluator, AgentEvaluator)
        assert agent_evaluator.crew == mock_crew

        expected_types = [
            GoalAlignmentEvaluator,
            SemanticQualityEvaluator,
            ToolSelectionEvaluator,
            ParameterExtractionEvaluator,
            ToolInvocationEvaluator,
            ReasoningEfficiencyEvaluator
        ]

        assert len(agent_evaluator.evaluators) == len(expected_types)
        for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
            assert isinstance(evaluator, expected_type)