Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-10 00:28:31 +00:00
feat: improve ToolMetric in Agent Evaluator when the agent fails to invoke a required tool
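The change short-circuits the evaluation whenever the execution trace contains no tool calls: the evaluator now returns a null score whose feedback distinguishes an agent that had no tools at all from one that had tools but never invoked them, instead of only handling the no-tools case and appending "No tools used" to the summary. A minimal standalone sketch of that branching (hypothetical helper name; the real code returns EvaluationScore objects and reads agent.tools, as the diff below shows):

# Sketch only: mirrors the early-return logic added in this commit.
def tool_selection_feedback(agent_tools: list, execution_trace: dict) -> str | None:
    """Return early-exit feedback when no tool was invoked, else None so the LLM review runs."""
    tool_uses = execution_trace.get("tool_uses", [])
    if not tool_uses:
        if not agent_tools:
            return "Agent had no tools available to use."
        return "Agent had tools available but didn't use any."
    return None

# Example: tools were available but the agent never called one.
assert tool_selection_feedback(["search"], {"tool_uses": []}) == "Agent had tools available but didn't use any."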
@@ -24,33 +24,29 @@ class ToolSelectionEvaluator(BaseEvaluator):
         if task is not None:
             task_context = f"Task description: {task.description}"
 
-        if not agent.tools:
-            return EvaluationScore(
-                score=None,
-                feedback="Agent had no tools available to use."
-            )
-
         tool_uses = execution_trace.get("tool_uses", [])
         tool_count = len(tool_uses)
         unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])
+        available_tools_info = "No tools available"
+
+        if tool_count == 0:
+            if not agent.tools:
+                return EvaluationScore(
+                    score=None,
+                    feedback="Agent had no tools available to use."
+                )
+            else:
+                return EvaluationScore(
+                    score=None,
+                    feedback="Agent had tools available but didn't use any."
+                )
 
-        available_tools_info = ""
         if agent.tools:
+            available_tools_info = ""
             for tool in agent.tools:
                 available_tools_info += f"- {tool.name}: {tool.description}\n"
-        else:
-            available_tools_info = "No tools available"
 
         tool_types_summary = "Tools selected by the agent:\n"
         for tool_type in sorted(unique_tool_types):
             tool_types_summary += f"- {tool_type}\n"
-
-        if tool_count == 0:
-            tool_types_summary += "No tools used"
 
         prompt = [
             {"role": "system", "content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task.
@@ -92,6 +88,7 @@ IMPORTANT:
 """}
         ]
         assert self.llm is not None
+        breakpoint()
         response = self.llm.call(prompt)
 
         try:

@@ -1,3 +1,4 @@
+import pytest
 from unittest.mock import patch, MagicMock
 
 from crewai.experimental.evaluation.metrics.tools_metrics import (
@@ -7,6 +8,26 @@ from crewai.experimental.evaluation.metrics.tools_metrics import (
 )
 from crewai.utilities.llm_utils import LLM
 from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from crewai.tools.base_tool import BaseTool
+
+
+@pytest.fixture
+def tools_available():
+    class MyCustomTool(BaseTool):
+        name: str = "Name of my tool"
+        description: str = "Clear description for what this tool is useful for, your agent will need this information to use it."
+
+        def _run(self, question: str) -> str:
+            return question
+
+    class AnotherCustomTool(BaseTool):
+        name: str = "Another name of my tool"
+        description: str = "Clear description for what this another tool is useful for, your agent will need this information to use it."
+
+        def _run(self, question: str) -> str:
+            return question
+
+    return [MyCustomTool(), AnotherCustomTool()]
 
 class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
     def test_no_tools_available(self, mock_task, mock_agent):
@@ -26,8 +47,8 @@ class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
         assert result.score is None
         assert "no tools available" in result.feedback.lower()
 
-    def test_tools_available_but_none_used(self, mock_agent, mock_task):
-        mock_agent.tools = ["tool1", "tool2"]
+    def test_tools_available_but_none_used(self, mock_agent, mock_task, tools_available):
+        mock_agent.tools = tools_available
         execution_trace = {"tool_uses": []}
 
         evaluator = ToolSelectionEvaluator()
@@ -42,7 +63,8 @@ class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
         assert "had tools available but didn't use any" in result.feedback.lower()
 
     @patch("crewai.utilities.llm_utils.create_llm")
-    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
+    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task, tools_available):
+        mock_agent.tools = tools_available
         # Setup mock LLM response
         mock_llm = MagicMock(spec=LLM)
         mock_llm.call.return_value = """
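The new tools_available fixture builds real BaseTool subclasses rather than plain strings, so agent.tools carries the names and descriptions the evaluator lists in its prompt. As a hedged illustration of the same pattern outside the test suite (the Agent constructor arguments here are assumed from crewAI's documented role/goal/backstory/tools interface and are not part of this commit):

from crewai import Agent
from crewai.tools.base_tool import BaseTool

class SearchTool(BaseTool):
    # Hypothetical tool, shaped like the fixture's MyCustomTool.
    name: str = "Search tool"
    description: str = "Looks up an answer for a question."

    def _run(self, question: str) -> str:
        return f"Results for: {question}"

# Assumed standard Agent construction; the evaluator reads this agent's .tools
# plus the execution trace to decide which feedback branch applies.
agent = Agent(
    role="Researcher",
    goal="Answer questions using the available tools",
    backstory="An agent used to exercise the ToolSelectionEvaluator.",
    tools=[SearchTool()],
)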