Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-10 00:28:31 +00:00
Introducing Agent evaluation (#3130)
* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool Metrics for Agent evaluation
* feat: add Reasoning Metrics for Agent evaluation, still in progress
* feat: add AgentEvaluator class. This class evaluates an Agent's results and reports them to the user
* fix: do not evaluate Agents by default. This is an experimental feature that still needs further refinement
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
tests/evaluation/metrics/test_tools_metrics.py (new file, 230 lines added)
@@ -0,0 +1,230 @@
from unittest.mock import patch, MagicMock

from crewai.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)
from crewai.utilities.llm_utils import LLM
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest


class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tools_available(self, mock_task, mock_agent):
        # Create agent with no tools
        mock_agent.tools = []

        execution_trace = {"tool_uses": []}

        evaluator = ToolSelectionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tools available" in result.feedback.lower()

    def test_tools_available_but_none_used(self, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        execution_trace = {"tool_uses": []}

        evaluator = ToolSelectionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "had tools available but didn't use any" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 8.5,
            "feedback": "The agent made good tool selections."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
            ]
        }

        evaluator = ToolSelectionEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 8.5
        assert "The agent made good tool selections" in result.feedback

        # Verify LLM was called with correct prompt
        mock_llm.call.assert_called_once()
        prompt = mock_llm.call.call_args[0][0]
        assert isinstance(prompt, list)
        assert len(prompt) >= 2
        assert "system" in prompt[0]["role"]
        assert "user" in prompt[1]["role"]


class TestParameterExtractionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tool_uses(self, mock_agent, mock_task):
        execution_trace = {"tool_uses": []}

        evaluator = ParameterExtractionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tool usage" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]

        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 9.0,
            "feedback": "The agent extracted parameters correctly."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {
                    "tool": "search_tool",
                    "input": {"query": "test query"},
                    "output": "search results",
                    "error": None
                },
                {
                    "tool": "calculator",
                    "input": {"expression": "2+2"},
                    "output": "4",
                    "error": None
                }
            ]
        }

        evaluator = ParameterExtractionEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 9.0
        assert "The agent extracted parameters correctly" in result.feedback


class TestToolInvocationEvaluator(BaseEvaluationMetricsTest):
    def test_no_tool_uses(self, mock_agent, mock_task):
        execution_trace = {"tool_uses": []}

        evaluator = ToolInvocationEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tool usage" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 8.0,
            "feedback": "The agent invoked tools correctly."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
            ]
        }

        evaluator = ToolInvocationEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 8.0
        assert "The agent invoked tools correctly" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluation_with_errors(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 5.5,
            "feedback": "The agent had some errors in tool invocation."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses including errors
        execution_trace = {
            "tool_uses": [
                {
                    "tool": "search_tool",
                    "input": {"query": "test query"},
                    "output": "search results",
                    "error": None
                },
                {
                    "tool": "calculator",
                    "input": {"expression": "2+"},
                    "output": None,
                    "error": "Invalid expression"
                }
            ]
        }

        evaluator = ToolInvocationEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 5.5
        assert "The agent had some errors in tool invocation" in result.feedback
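The tests above drive every evaluator through the same public surface: construct the metric with an LLM, call evaluate() with the agent, task, execution trace, and final output, then read score and feedback off the result. For reference, here is a minimal standalone sketch of that flow, mirroring the calls made in the tests; the agent and task are bare MagicMock stand-ins (the real suite uses fixtures from BaseEvaluationMetricsTest whose contents are not shown here), and the JSON reply shape is assumed from the fixtures in this file.

from unittest.mock import MagicMock

from crewai.evaluation.metrics.tools_metrics import ToolSelectionEvaluator
from crewai.utilities.llm_utils import LLM

# Stub LLM that returns the JSON shape the evaluator parses (as in the tests above).
llm = MagicMock(spec=LLM)
llm.call.return_value = '{"overall_score": 7.0, "feedback": "Reasonable tool choices."}'

# Bare stand-ins for crewAI Agent/Task objects; real usage would pass actual instances.
agent = MagicMock()
agent.tools = ["search_tool"]
task = MagicMock()

execution_trace = {
    "tool_uses": [
        {"tool": "search_tool", "input": {"query": "docs"}, "output": "results"},
    ]
}

result = ToolSelectionEvaluator(llm=llm).evaluate(
    agent=agent,
    task=task,
    execution_trace=execution_trace,
    final_output="Final output",
)
print(result.score, result.feedback)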