mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-08 15:48:29 +00:00
* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool Metrics for Agent evaluation
* feat: add Reasoning Metrics for Agent evaluation, still in progress
* feat: add AgentEvaluator class. This class evaluates the Agent's results and reports them to the user
* fix: do not evaluate the Agent by default. This is an experimental feature that still needs further refinement
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
* feat: add Experiment evaluation framework with baseline comparison
* fix: reset evaluator for each experiment iteration
* fix: fix tracking of new test cases
* chore: split Experimental evaluation classes
* refactor: remove unused method
* refactor: isolate Console print in a dedicated class
* fix: make crew required to run an experiment
* fix: use time-aware to define experiment result
* test: add tests for Evaluator Experiment
* style: fix linter issues
* fix: encode string before hashing
* style: resolve linter issues
* feat: add experimental folder for beta features (#3141)
* test: move tests to experimental folder
167 lines
6.2 KiB
Python
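The commits above introduce crewAI's experimental agent-evaluation API, and the test file below exercises its ReasoningEfficiencyEvaluator. As a rough sketch of how the pieces appear to fit together outside of a test (based only on what this file shows; the mock agent/task fixtures presumably come from BaseEvaluationMetricsTest, and the placeholders below are illustrative, not a confirmed public API), a standalone evaluation run might look like this:

from unittest.mock import MagicMock

from crewai.experimental.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator,
)
from crewai.tasks.task_output import TaskOutput

# Placeholder agent/task/output objects; mocks keep the sketch self-contained.
agent = MagicMock()
task = MagicMock()
final_output = MagicMock(spec=TaskOutput)
final_output.raw = "This is the final task output"

# One dict per LLM call observed during execution, mirroring the fixtures below.
execution_trace = {
    "llm_calls": [
        {
            "prompt": "How should I approach this task?",
            "response": "I'll first research the topic, then compile findings.",
            "timestamp": 1626987654,
        },
    ]
}

evaluator = ReasoningEfficiencyEvaluator()  # an `llm=` argument is also accepted
result = evaluator.evaluate(
    agent=agent,
    task=task,
    execution_trace=execution_trace,
    final_output=final_output,
)

# `result` is an EvaluationScore: `score` may be None (e.g. with too few LLM
# calls) and `feedback` carries a human-readable explanation.
print(result.score)
print(result.feedback)

Note that, as the tests below check, evaluate() reports problems such as malformed LLM output or an insufficient trace through the returned feedback rather than by raising.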
import pytest
from unittest.mock import patch, MagicMock
from typing import List, Dict, Any

from crewai.tasks.task_output import TaskOutput
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator,
)
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM
from crewai.experimental.evaluation.base_evaluator import EvaluationScore

class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
    @pytest.fixture
    def mock_output(self):
        output = MagicMock(spec=TaskOutput)
        output.raw = "This is the test output"
        return output

    @pytest.fixture
    def llm_calls(self) -> List[Dict[str, Any]]:
        return [
            {
                "prompt": "How should I approach this task?",
                "response": "I'll first research the topic, then compile findings.",
                "timestamp": 1626987654
            },
            {
                "prompt": "What resources should I use?",
                "response": "I'll use relevant academic papers and reliable websites.",
                "timestamp": 1626987754
            },
            {
                "prompt": "How should I structure the output?",
                "response": "I'll organize information clearly with headings and bullet points.",
                "timestamp": 1626987854
            }
        ]

    def test_insufficient_llm_calls(self, mock_agent, mock_task, mock_output):
        execution_trace = {"llm_calls": []}

        evaluator = ReasoningEfficiencyEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        assert isinstance(result, EvaluationScore)
        assert result.score is None
        assert "Insufficient LLM calls" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "scores": {
                "focus": 8.0,
                "progression": 7.0,
                "decision_quality": 7.5,
                "conciseness": 8.0,
                "loop_avoidance": 9.0
            },
            "overall_score": 7.9,
            "feedback": "The agent demonstrated good reasoning efficiency.",
            "optimization_suggestions": "The agent could improve by being more concise."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with sufficient LLM calls
        execution_trace = {"llm_calls": llm_calls}

        # Mock the _detect_loops method to return a simple result
        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
        evaluator._detect_loops = MagicMock(return_value=(False, []))

        # Evaluate
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        # Assertions
        assert isinstance(result, EvaluationScore)
        assert result.score == 7.9
        assert "The agent demonstrated good reasoning efficiency" in result.feedback
        assert "Reasoning Efficiency Evaluation:" in result.feedback
        assert "• Focus: 8.0/10" in result.feedback

        # Verify LLM was called
        mock_llm.call.assert_called_once()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_parse_error_handling(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        # Setup execution trace
        execution_trace = {"llm_calls": llm_calls}

        # Mock the _detect_loops method
        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
        evaluator._detect_loops = MagicMock(return_value=(False, []))

        # Evaluate
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        # Assertions for error handling
        assert isinstance(result, EvaluationScore)
        assert result.score is None
        assert "Failed to parse reasoning efficiency evaluation" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_loop_detection(self, mock_create_llm, mock_agent, mock_task, mock_output):
        # Setup LLM calls with a repeating pattern
        repetitive_llm_calls = [
            {"prompt": "How to solve?", "response": "I'll try method A", "timestamp": 1000},
            {"prompt": "Let me try method A", "response": "It didn't work", "timestamp": 1100},
            {"prompt": "How to solve?", "response": "I'll try method A again", "timestamp": 1200},
            {"prompt": "Let me try method A", "response": "It didn't work", "timestamp": 1300},
            {"prompt": "How to solve?", "response": "I'll try method A one more time", "timestamp": 1400}
        ]

        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "scores": {
                "focus": 6.0,
                "progression": 3.0,
                "decision_quality": 4.0,
                "conciseness": 6.0,
                "loop_avoidance": 2.0
            },
            "overall_score": 4.2,
            "feedback": "The agent is stuck in a reasoning loop.",
            "optimization_suggestions": "The agent should try different approaches when one fails."
        }
        """
        mock_create_llm.return_value = mock_llm

        execution_trace = {"llm_calls": repetitive_llm_calls}

        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        assert isinstance(result, EvaluationScore)
        assert result.score == 4.2
        assert "• Loop Avoidance: 2.0/10" in result.feedback