mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 16:18:30 +00:00
Introduce Evaluator Experiment (#3133)
* feat: add exchanged messages in LLMCallCompletedEvent * feat: add GoalAlignment metric for Agent evaluation * feat: add SemanticQuality metric for Agent evaluation * feat: add Tool Metrics for Agent evaluation * feat: add Reasoning Metrics for Agent evaluation, still in progress * feat: add AgentEvaluator class This class will evaluate Agent' results and report to user * fix: do not evaluate Agent by default This is a experimental feature we still need refine it further * test: add Agent eval tests * fix: render all feedback per iteration * style: resolve linter issues * style: fix mypy issues * fix: allow messages be empty on LLMCallCompletedEvent * feat: add Experiment evaluation framework with baseline comparison * fix: reset evaluator for each experiement iteraction * fix: fix track of new test cases * chore: split Experimental evaluation classes * refactor: remove unused method * refactor: isolate Console print in a dedicated class * fix: make crew required to run an experiment * fix: use time-aware to define experiment result * test: add tests for Evaluator Experiment * style: fix linter issues * fix: encode string before hashing * style: resolve linter issues * feat: add experimental folder for beta features (#3141) * test: move tests to experimental folder
This commit is contained in:
95
tests/experimental/evaluation/test_agent_evaluator.py
Normal file
95
tests/experimental/evaluation/test_agent_evaluator.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import pytest
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.crew import Crew
|
||||
from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
|
||||
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
|
||||
from crewai.experimental.evaluation import (
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
ToolSelectionEvaluator,
|
||||
ParameterExtractionEvaluator,
|
||||
ToolInvocationEvaluator,
|
||||
ReasoningEfficiencyEvaluator
|
||||
)
|
||||
|
||||
from crewai.experimental.evaluation import create_default_evaluator
|
||||
class TestAgentEvaluator:
|
||||
@pytest.fixture
|
||||
def mock_crew(self):
|
||||
agent = Agent(
|
||||
role="Test Agent",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
allow_delegation=False,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
task = Task(
|
||||
description="Test task description",
|
||||
agent=agent,
|
||||
expected_output="Expected test output"
|
||||
)
|
||||
|
||||
crew = Crew(
|
||||
agents=[agent],
|
||||
tasks=[task]
|
||||
)
|
||||
return crew
|
||||
|
||||
def test_set_iteration(self):
|
||||
agent_evaluator = AgentEvaluator()
|
||||
|
||||
agent_evaluator.set_iteration(3)
|
||||
assert agent_evaluator.iteration == 3
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_evaluate_current_iteration(self, mock_crew):
|
||||
agent_evaluator = AgentEvaluator(crew=mock_crew, evaluators=[GoalAlignmentEvaluator()])
|
||||
|
||||
mock_crew.kickoff()
|
||||
|
||||
results = agent_evaluator.evaluate_current_iteration()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
|
||||
agent, = mock_crew.agents
|
||||
task, = mock_crew.tasks
|
||||
|
||||
assert len(mock_crew.agents) == 1
|
||||
assert agent.role in results
|
||||
assert len(results[agent.role]) == 1
|
||||
|
||||
result, = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
|
||||
goal_alignment, = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
|
||||
def test_create_default_evaluator(self, mock_crew):
|
||||
agent_evaluator = create_default_evaluator(crew=mock_crew)
|
||||
assert isinstance(agent_evaluator, AgentEvaluator)
|
||||
assert agent_evaluator.crew == mock_crew
|
||||
|
||||
expected_types = [
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
ToolSelectionEvaluator,
|
||||
ParameterExtractionEvaluator,
|
||||
ToolInvocationEvaluator,
|
||||
ReasoningEfficiencyEvaluator
|
||||
]
|
||||
|
||||
assert len(agent_evaluator.evaluators) == len(expected_types)
|
||||
for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
|
||||
assert isinstance(evaluator, expected_type)
|
||||
Reference in New Issue
Block a user