diff --git a/src/crewai/experimental/evaluation/metrics/tools_metrics.py b/src/crewai/experimental/evaluation/metrics/tools_metrics.py
index 9ed857aa1..9a70010f2 100644
--- a/src/crewai/experimental/evaluation/metrics/tools_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/tools_metrics.py
@@ -24,33 +24,29 @@ class ToolSelectionEvaluator(BaseEvaluator):
         if task is not None:
             task_context = f"Task description: {task.description}"
 
+        if not agent.tools:
+            return EvaluationScore(
+                score=None,
+                feedback="Agent had no tools available to use."
+            )
+
         tool_uses = execution_trace.get("tool_uses", [])
         tool_count = len(tool_uses)
         unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])
+        available_tools_info = "No tools available"
 
-        if tool_count == 0:
-            if not agent.tools:
-                return EvaluationScore(
-                    score=None,
-                    feedback="Agent had no tools available to use."
-                )
-            else:
-                return EvaluationScore(
-                    score=None,
-                    feedback="Agent had tools available but didn't use any."
-                )
-
-        available_tools_info = ""
         if agent.tools:
+            available_tools_info = ""
            for tool in agent.tools:
                available_tools_info += f"- {tool.name}: {tool.description}\n"
-        else:
-            available_tools_info = "No tools available"
 
         tool_types_summary = "Tools selected by the agent:\n"
         for tool_type in sorted(unique_tool_types):
             tool_types_summary += f"- {tool_type}\n"
 
+        if tool_count == 0:
+            tool_types_summary += "No tools used"
+
         prompt = [
             {"role": "system", "content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task.
 
diff --git a/tests/experimental/evaluation/metrics/test_tools_metrics.py b/tests/experimental/evaluation/metrics/test_tools_metrics.py
index dab3eeee1..2dffe9752 100644
--- a/tests/experimental/evaluation/metrics/test_tools_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_tools_metrics.py
@@ -1,3 +1,4 @@
+import pytest
 from unittest.mock import patch, MagicMock
 
 from crewai.experimental.evaluation.metrics.tools_metrics import (
@@ -7,6 +8,26 @@ from crewai.experimental.evaluation.metrics.tools_metrics import (
 )
 from crewai.utilities.llm_utils import LLM
 from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from crewai.tools.base_tool import BaseTool
+
+@pytest.fixture
+def tools_available():
+    class MyCustomTool(BaseTool):
+        name: str = "Name of my tool"
+        description: str = "Clear description for what this tool is useful for, your agent will need this information to use it."
+
+        def _run(self, question: str) -> str:
+            return question
+
+    class AnotherCustomTool(BaseTool):
+        name: str = "Another name of my tool"
+        description: str = "Clear description for what this other tool is useful for, your agent will need this information to use it."
+
+        def _run(self, question: str) -> str:
+            return question
+
+    return [MyCustomTool(), AnotherCustomTool()]
+
 
 class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
     def test_no_tools_available(self, mock_task, mock_agent):
@@ -26,8 +47,8 @@ class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
         assert result.score is None
         assert "no tools available" in result.feedback.lower()
 
-    def test_tools_available_but_none_used(self, mock_agent, mock_task):
-        mock_agent.tools = ["tool1", "tool2"]
+    def test_tools_available_but_none_used(self, mock_agent, mock_task, tools_available):
+        mock_agent.tools = tools_available
         execution_trace = {"tool_uses": []}
 
         evaluator = ToolSelectionEvaluator()
@@ -42,7 +63,8 @@ class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
         assert "had tools available but didn't use any" in result.feedback.lower()
 
     @patch("crewai.utilities.llm_utils.create_llm")
-    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
+    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task, tools_available):
+        mock_agent.tools = tools_available
         # Setup mock LLM response
         mock_llm = MagicMock(spec=LLM)
         mock_llm.call.return_value = """