Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-08 15:48:29 +00:00)
* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool Metrics for Agent evaluation
* feat: add Reasoning Metrics for Agent evaluation, still in progress
* feat: add AgentEvaluator class. This class evaluates Agent results and reports them to the user
* fix: do not evaluate Agents by default. This is an experimental feature; we still need to refine it further
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
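The tests below exercise the tool metrics through a common evaluator interface. As a minimal sketch of that interface, inferred from these tests rather than from the full AgentEvaluator API (my_agent and my_task are placeholders for a configured Agent and Task):

from crewai.evaluation.metrics.tools_metrics import ToolSelectionEvaluator

# Sketch only: `my_agent` and `my_task` stand in for a configured Agent and Task.
evaluator = ToolSelectionEvaluator()  # an LLM can also be injected: ToolSelectionEvaluator(llm=...)
result = evaluator.evaluate(
    agent=my_agent,
    task=my_task,
    execution_trace={"tool_uses": []},  # records of {"tool", "input", "output", "error"} per call
    final_output="...",
)
print(result.score, result.feedback)  # score may be None when no tools were used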
231 lines
7.6 KiB
Python
from unittest.mock import patch, MagicMock

from crewai.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)
from crewai.utilities.llm_utils import LLM
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tools_available(self, mock_task, mock_agent):
        # Create agent with no tools
        mock_agent.tools = []

        execution_trace = {"tool_uses": []}

        evaluator = ToolSelectionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tools available" in result.feedback.lower()

    def test_tools_available_but_none_used(self, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        execution_trace = {"tool_uses": []}

        evaluator = ToolSelectionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "had tools available but didn't use any" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 8.5,
            "feedback": "The agent made good tool selections."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
            ]
        }

        evaluator = ToolSelectionEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 8.5
        assert "The agent made good tool selections" in result.feedback

        # Verify LLM was called with the expected system + user prompt structure
        mock_llm.call.assert_called_once()
        prompt = mock_llm.call.call_args[0][0]
        assert isinstance(prompt, list)
        assert len(prompt) >= 2
        assert "system" in prompt[0]["role"]
        assert "user" in prompt[1]["role"]

class TestParameterExtractionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tool_uses(self, mock_agent, mock_task):
        execution_trace = {"tool_uses": []}

        evaluator = ParameterExtractionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tool usage" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]

        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 9.0,
            "feedback": "The agent extracted parameters correctly."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {
                    "tool": "search_tool",
                    "input": {"query": "test query"},
                    "output": "search results",
                    "error": None
                },
                {
                    "tool": "calculator",
                    "input": {"expression": "2+2"},
                    "output": "4",
                    "error": None
                }
            ]
        }

        evaluator = ParameterExtractionEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 9.0
        assert "The agent extracted parameters correctly" in result.feedback

class TestToolInvocationEvaluator(BaseEvaluationMetricsTest):
    def test_no_tool_uses(self, mock_agent, mock_task):
        execution_trace = {"tool_uses": []}

        evaluator = ToolInvocationEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tool usage" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 8.0,
            "feedback": "The agent invoked tools correctly."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
            ]
        }

        evaluator = ToolInvocationEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 8.0
        assert "The agent invoked tools correctly" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluation_with_errors(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 5.5,
            "feedback": "The agent had some errors in tool invocation."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses including errors
        execution_trace = {
            "tool_uses": [
                {
                    "tool": "search_tool",
                    "input": {"query": "test query"},
                    "output": "search results",
                    "error": None
                },
                {
                    "tool": "calculator",
                    "input": {"expression": "2+"},
                    "output": None,
                    "error": "Invalid expression"
                }
            ]
        }

        evaluator = ToolInvocationEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 5.5
        assert "The agent had some errors in tool invocation" in result.feedback