Files
crewAI/tests/experimental/evaluation/metrics/test_reasoning_metrics.py
Lucas Gomide 1b6b2b36d9 Introduce Evaluator Experiment (#3133)
* feat: add exchanged messages in LLMCallCompletedEvent

* feat: add GoalAlignment metric for Agent evaluation

* feat: add SemanticQuality metric for Agent evaluation

* feat: add Tool Metrics for Agent evaluation

* feat: add Reasoning Metrics for Agent evaluation, still in progress

* feat: add AgentEvaluator class

This class will evaluate the Agent's results and report them to the user

* fix: do not evaluate Agent by default

This is an experimental feature; we still need to refine it further

* test: add Agent eval tests

* fix: render all feedback per iteration

* style: resolve linter issues

* style: fix mypy issues

* fix: allow messages be empty on LLMCallCompletedEvent

* feat: add Experiment evaluation framework with baseline comparison

* fix: reset evaluator for each experiment iteration

* fix: fix track of new test cases

* chore: split Experimental evaluation classes

* refactor: remove unused method

* refactor: isolate Console print in a dedicated class

* fix: make crew required to run an experiment

* fix: use time-aware to define experiment result

* test: add tests for Evaluator Experiment

* style: fix linter issues

* fix: encode string before hashing

* style: resolve linter issues

* feat: add experimental folder for beta features (#3141)

* test: move tests to experimental folder
2025-07-14 09:06:45 -04:00

167 lines
6.2 KiB
Python

import pytest
from unittest.mock import patch, MagicMock
from typing import List, Dict, Any
from crewai.tasks.task_output import TaskOutput
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
ReasoningEfficiencyEvaluator,
)
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
    """Unit tests for ReasoningEfficiencyEvaluator.

    Covers four scenarios:
      * a trace with too few LLM calls (evaluation is declined),
      * a successful evaluation producing a parsed score and feedback,
      * graceful handling of an unparseable LLM response,
      * loop detection on a repetitive reasoning trace.

    ``mock_agent`` and ``mock_task`` fixtures come from
    BaseEvaluationMetricsTest.
    """

    @pytest.fixture
    def mock_output(self):
        # The evaluator only reads the raw text of the task output.
        output = MagicMock(spec=TaskOutput)
        output.raw = "This is the test output"
        return output

    @pytest.fixture
    def llm_calls(self) -> List[Dict[str, Any]]:
        # A healthy trace: three distinct reasoning steps with timestamps,
        # enough calls for the evaluator to score.
        return [
            {
                "prompt": "How should I approach this task?",
                "response": "I'll first research the topic, then compile findings.",
                "timestamp": 1626987654
            },
            {
                "prompt": "What resources should I use?",
                "response": "I'll use relevant academic papers and reliable websites.",
                "timestamp": 1626987754
            },
            {
                "prompt": "How should I structure the output?",
                "response": "I'll organize information clearly with headings and bullet points.",
                "timestamp": 1626987854
            }
        ]

    def test_insufficient_llm_calls(self, mock_agent, mock_task, mock_output):
        """An empty trace cannot be scored; the evaluator declines gracefully."""
        execution_trace = {"llm_calls": []}

        evaluator = ReasoningEfficiencyEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        assert isinstance(result, EvaluationScore)
        # No score is produced when there is not enough data to evaluate.
        assert result.score is None
        assert "Insufficient LLM calls" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
        """A well-formed JSON response from the LLM yields a parsed score and formatted feedback."""
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "scores": {
                "focus": 8.0,
                "progression": 7.0,
                "decision_quality": 7.5,
                "conciseness": 8.0,
                "loop_avoidance": 9.0
            },
            "overall_score": 7.9,
            "feedback": "The agent demonstrated good reasoning efficiency.",
            "optimization_suggestions": "The agent could improve by being more concise."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with sufficient LLM calls
        execution_trace = {"llm_calls": llm_calls}

        # Mock the _detect_loops method to return a simple result
        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
        evaluator._detect_loops = MagicMock(return_value=(False, []))

        # Evaluate
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        # Assertions: score comes from "overall_score"; feedback embeds the
        # LLM feedback plus a rendered per-dimension breakdown.
        assert isinstance(result, EvaluationScore)
        assert result.score == 7.9
        assert "The agent demonstrated good reasoning efficiency" in result.feedback
        assert "Reasoning Efficiency Evaluation:" in result.feedback
        assert "• Focus: 8.0/10" in result.feedback

        # Verify LLM was called
        mock_llm.call.assert_called_once()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_parse_error_handling(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
        """A non-JSON LLM response must not raise; it yields a None score with explanatory feedback."""
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        # Setup execution trace
        execution_trace = {"llm_calls": llm_calls}

        # Mock the _detect_loops method
        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
        evaluator._detect_loops = MagicMock(return_value=(False, []))

        # Evaluate
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        # Assertions for error handling
        assert isinstance(result, EvaluationScore)
        assert result.score is None
        assert "Failed to parse reasoning efficiency evaluation" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_loop_detection(self, mock_create_llm, mock_agent, mock_task, mock_output):
        """A repetitive trace is scored with a low loop-avoidance dimension."""
        # Setup LLM calls with a repeating pattern (same prompts cycling).
        repetitive_llm_calls = [
            {"prompt": "How to solve?", "response": "I'll try method A", "timestamp": 1000},
            {"prompt": "Let me try method A", "response": "It didn't work", "timestamp": 1100},
            {"prompt": "How to solve?", "response": "I'll try method A again", "timestamp": 1200},
            {"prompt": "Let me try method A", "response": "It didn't work", "timestamp": 1300},
            {"prompt": "How to solve?", "response": "I'll try method A one more time", "timestamp": 1400}
        ]

        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "scores": {
                "focus": 6.0,
                "progression": 3.0,
                "decision_quality": 4.0,
                "conciseness": 6.0,
                "loop_avoidance": 2.0
            },
            "overall_score": 4.2,
            "feedback": "The agent is stuck in a reasoning loop.",
            "optimization_suggestions": "The agent should try different approaches when one fails."
        }
        """
        mock_create_llm.return_value = mock_llm

        execution_trace = {"llm_calls": repetitive_llm_calls}

        # NOTE: _detect_loops is intentionally NOT mocked here, so the real
        # loop detection runs against the repetitive trace.
        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        assert isinstance(result, EvaluationScore)
        assert result.score == 4.2
        assert "• Loop Avoidance: 2.0/10" in result.feedback