crewAI/tests/evaluation/metrics/test_tools_metrics.py

from unittest.mock import patch, MagicMock

from crewai.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)
from crewai.utilities.llm_utils import LLM
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tools_available(self, mock_task, mock_agent):
        # Create agent with no tools
        mock_agent.tools = []

        execution_trace = {"tool_uses": []}

        evaluator = ToolSelectionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tools available" in result.feedback.lower()

    def test_tools_available_but_none_used(self, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        execution_trace = {"tool_uses": []}

        evaluator = ToolSelectionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "had tools available but didn't use any" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 8.5,
            "feedback": "The agent made good tool selections."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
            ]
        }

        evaluator = ToolSelectionEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 8.5
        assert "The agent made good tool selections" in result.feedback

        # Verify LLM was called with correct prompt
        mock_llm.call.assert_called_once()
        prompt = mock_llm.call.call_args[0][0]
        assert isinstance(prompt, list)
        assert len(prompt) >= 2
        assert "system" in prompt[0]["role"]
        assert "user" in prompt[1]["role"]


class TestParameterExtractionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tool_uses(self, mock_agent, mock_task):
        execution_trace = {"tool_uses": []}

        evaluator = ParameterExtractionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tool usage" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]

        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 9.0,
            "feedback": "The agent extracted parameters correctly."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {
                    "tool": "search_tool",
                    "input": {"query": "test query"},
                    "output": "search results",
                    "error": None
                },
                {
                    "tool": "calculator",
                    "input": {"expression": "2+2"},
                    "output": "4",
                    "error": None
                }
            ]
        }

        evaluator = ParameterExtractionEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 9.0
        assert "The agent extracted parameters correctly" in result.feedback


class TestToolInvocationEvaluator(BaseEvaluationMetricsTest):
    def test_no_tool_uses(self, mock_agent, mock_task):
        execution_trace = {"tool_uses": []}

        evaluator = ToolInvocationEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tool usage" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 8.0,
            "feedback": "The agent invoked tools correctly."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
            ]
        }

        evaluator = ToolInvocationEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 8.0
        assert "The agent invoked tools correctly" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluation_with_errors(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 5.5,
            "feedback": "The agent had some errors in tool invocation."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses including errors
        execution_trace = {
            "tool_uses": [
                {
                    "tool": "search_tool",
                    "input": {"query": "test query"},
                    "output": "search results",
                    "error": None
                },
                {
                    "tool": "calculator",
                    "input": {"expression": "2+"},
                    "output": None,
                    "error": "Invalid expression"
                }
            ]
        }

        evaluator = ToolInvocationEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 5.5
        assert "The agent had some errors in tool invocation" in result.feedback