Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-10 00:28:31 +00:00
feat: improve ToolMetric in Agent Evaluator when the agent fails to invoke a required tool
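The change short-circuits the evaluation whenever the execution trace contains no tool calls: the evaluator now returns a null score whose feedback distinguishes an agent that had no tools at all from one that had tools but never invoked them, instead of only handling the no-tools case and appending "No tools used" to the summary. A minimal standalone sketch of that branching (hypothetical helper name; the real code returns EvaluationScore objects and reads agent.tools, as the diff below shows):

# Sketch only: mirrors the early-return logic added in this commit.
def tool_selection_feedback(agent_tools: list, execution_trace: dict) -> str | None:
    """Return early-exit feedback when no tool was invoked, else None so the LLM review runs."""
    tool_uses = execution_trace.get("tool_uses", [])
    if not tool_uses:
        if not agent_tools:
            return "Agent had no tools available to use."
        return "Agent had tools available but didn't use any."
    return None

# Example: tools were available but the agent never called one.
assert tool_selection_feedback(["search"], {"tool_uses": []}) == "Agent had tools available but didn't use any."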
@@ -24,33 +24,29 @@ class ToolSelectionEvaluator(BaseEvaluator):
         if task is not None:
             task_context = f"Task description: {task.description}"
 
-        if not agent.tools:
-            return EvaluationScore(
-                score=None,
-                feedback="Agent had no tools available to use."
-            )
-
         tool_uses = execution_trace.get("tool_uses", [])
         tool_count = len(tool_uses)
         unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])
+        available_tools_info = "No tools available"
+
+        if tool_count == 0:
+            if not agent.tools:
+                return EvaluationScore(
+                    score=None,
+                    feedback="Agent had no tools available to use."
+                )
+            else:
+                return EvaluationScore(
+                    score=None,
+                    feedback="Agent had tools available but didn't use any."
+                )
 
-        available_tools_info = ""
         if agent.tools:
+            available_tools_info = ""
             for tool in agent.tools:
                 available_tools_info += f"- {tool.name}: {tool.description}\n"
-        else:
-            available_tools_info = "No tools available"
 
         tool_types_summary = "Tools selected by the agent:\n"
         for tool_type in sorted(unique_tool_types):
             tool_types_summary += f"- {tool_type}\n"
-
-        if tool_count == 0:
-            tool_types_summary += "No tools used"
 
         prompt = [
             {"role": "system", "content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task.
@@ -92,6 +88,7 @@ IMPORTANT:
 """}
         ]
         assert self.llm is not None
+        breakpoint()
         response = self.llm.call(prompt)
 
         try:

@@ -1,3 +1,4 @@
+import pytest
 from unittest.mock import patch, MagicMock
 
 from crewai.experimental.evaluation.metrics.tools_metrics import (
@@ -7,6 +8,26 @@ from crewai.experimental.evaluation.metrics.tools_metrics import (
 )
 from crewai.utilities.llm_utils import LLM
 from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from crewai.tools.base_tool import BaseTool
+
+
+@pytest.fixture
+def tools_available():
+    class MyCustomTool(BaseTool):
+        name: str = "Name of my tool"
+        description: str = "Clear description for what this tool is useful for, your agent will need this information to use it."
+
+        def _run(self, question: str) -> str:
+            return question
+
+    class AnotherCustomTool(BaseTool):
+        name: str = "Another name of my tool"
+        description: str = "Clear description for what this another tool is useful for, your agent will need this information to use it."
+
+        def _run(self, question: str) -> str:
+            return question
+
+    return [MyCustomTool(), AnotherCustomTool()]
 
 class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
     def test_no_tools_available(self, mock_task, mock_agent):
@@ -26,8 +47,8 @@ class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
         assert result.score is None
         assert "no tools available" in result.feedback.lower()
 
-    def test_tools_available_but_none_used(self, mock_agent, mock_task):
-        mock_agent.tools = ["tool1", "tool2"]
+    def test_tools_available_but_none_used(self, mock_agent, mock_task, tools_available):
+        mock_agent.tools = tools_available
         execution_trace = {"tool_uses": []}
 
         evaluator = ToolSelectionEvaluator()
@@ -42,7 +63,8 @@ class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
         assert "had tools available but didn't use any" in result.feedback.lower()
 
     @patch("crewai.utilities.llm_utils.create_llm")
-    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
+    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task, tools_available):
+        mock_agent.tools = tools_available
         # Setup mock LLM response
         mock_llm = MagicMock(spec=LLM)
         mock_llm.call.return_value = """
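The new tools_available fixture builds real BaseTool subclasses rather than plain strings, so agent.tools carries the names and descriptions the evaluator lists in its prompt. As a hedged illustration of the same pattern outside the test suite (the Agent constructor arguments here are assumed from crewAI's documented role/goal/backstory/tools interface and are not part of this commit):

from crewai import Agent
from crewai.tools.base_tool import BaseTool

class SearchTool(BaseTool):
    # Hypothetical tool, shaped like the fixture's MyCustomTool.
    name: str = "Search tool"
    description: str = "Looks up an answer for a question."

    def _run(self, question: str) -> str:
        return f"Results for: {question}"

# Assumed standard Agent construction; the evaluator reads this agent's .tools
# plus the execution trace to decide which feedback branch applies.
agent = Agent(
    role="Researcher",
    goal="Answer questions using the available tools",
    backstory="An agent used to exercise the ToolSelectionEvaluator.",
    tools=[SearchTool()],
)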