mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-08 23:58:34 +00:00)
401 lines · 17 KiB · Python
"""Tool-usage metrics for agent evaluation.

LLM-as-judge evaluators that score how an agent selected its tools, extracted
parameter values for them, and structured the tool invocations themselves.
"""

import json
from typing import Dict, Any

from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.agent import Agent
from crewai.task import Task
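
# Each evaluator below reads tool calls from execution_trace["tool_uses"].
# Inferred from the lookups in this module (no formal schema is enforced
# here), each entry is a dict shaped roughly like:
#
#     {
#         "tool": "web_search",              # tool name (illustrative value)
#         "args": {"query": "..."},          # arguments the agent passed
#         "success": True,                   # False when the call failed
#         "error": False,                    # alternate failure flag, also checked
#         "error_type": "validation_error",  # e.g. validation_error, execution_error
#         "result": "...",                   # tool output, or the error message
#     }
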

class ToolSelectionEvaluator(BaseEvaluator):
    @property
    def metric_category(self) -> MetricCategory:
        return MetricCategory.TOOL_SELECTION

    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
    ) -> EvaluationScore:
        tool_uses = execution_trace.get("tool_uses", [])
        tool_count = len(tool_uses)
        unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])

        if tool_count == 0:
            if not agent.tools:
                return EvaluationScore(
                    score=None,
                    feedback="Agent had no tools available to use."
                )
            else:
                return EvaluationScore(
                    score=None,
                    feedback="Agent had tools available but didn't use any."
                )

        available_tools_info = ""
        if agent.tools:
            for tool in agent.tools:
                available_tools_info += f"- {tool.name}: {tool.description}\n"
        else:
            available_tools_info = "No tools available"

        tool_types_summary = "Tools selected by the agent:\n"
        for tool_type in sorted(unique_tool_types):
            tool_types_summary += f"- {tool_type}\n"

        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task.

You must evaluate based on these 2 criteria:
1. Relevance (0-10): Were the tools chosen directly aligned with the task's goals?
2. Coverage (0-10): Did the agent select ALL appropriate tools from the AVAILABLE tools?

IMPORTANT:
- ONLY consider tools that are listed as available to the agent
- DO NOT suggest tools that aren't in the 'Available tools' list
- DO NOT evaluate the quality or accuracy of tool outputs/results
- DO NOT evaluate how many times each tool was used
- DO NOT evaluate how the agent used the parameters
- DO NOT evaluate whether the agent interpreted the task correctly

Focus ONLY on whether the correct CATEGORIES of tools were selected from what was available.

Return your evaluation as JSON with these fields:
- scores: {"relevance": number, "coverage": number}
- overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on tool selection decisions from available tools)
- improvement_suggestions: string (ONLY suggest better selection from the AVAILABLE tools list, NOT new tools)
"""},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}

Available tools for this agent:
{available_tools_info}

{tool_types_summary}

Based ONLY on the task description and comparing the AVAILABLE tools with those that were selected (listed above), evaluate if the agent selected the appropriate tool types for this task.

IMPORTANT:
- ONLY evaluate selection from tools listed as available
- DO NOT suggest new tools that aren't in the available tools list
- DO NOT evaluate tool usage or results
"""}
        ]
        assert self.llm is not None
        response = self.llm.call(prompt)

        try:
            evaluation_data = extract_json_from_llm_response(response)
            assert evaluation_data is not None

            scores = evaluation_data.get("scores", {})
            relevance = scores.get("relevance", 5.0)
            coverage = scores.get("coverage", 5.0)
            overall_score = float(evaluation_data.get("overall_score", 5.0))

            feedback = "Tool Selection Evaluation:\n"
            feedback += f"• Relevance: {relevance}/10 - Selection of appropriate tool types for the task\n"
            feedback += f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n"
            if "improvement_suggestions" in evaluation_data:
                feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
            else:
                feedback += evaluation_data.get("feedback", "No detailed feedback available.")

            return EvaluationScore(
                score=overall_score,
                feedback=feedback,
                raw_response=response
            )
        except Exception as e:
            return EvaluationScore(
                score=None,
                feedback=f"Error evaluating tool selection: {e}",
                raw_response=response
            )
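
# Illustrative only: a judge response that parses cleanly through
# extract_json_from_llm_response() for ToolSelectionEvaluator would carry a
# payload like the following (values made up for the example):
#
#     {
#         "scores": {"relevance": 8, "coverage": 7},
#         "overall_score": 7.5,
#         "feedback": "The selected search tool matched the task...",
#         "improvement_suggestions": "Also select the scraping tool..."
#     }
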


class ParameterExtractionEvaluator(BaseEvaluator):
    @property
    def metric_category(self) -> MetricCategory:
        return MetricCategory.PARAMETER_EXTRACTION

    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
    ) -> EvaluationScore:
        tool_uses = execution_trace.get("tool_uses", [])
        tool_count = len(tool_uses)

        if tool_count == 0:
            return EvaluationScore(
                score=None,
                feedback="No tool usage detected. Cannot evaluate parameter extraction."
            )

        validation_errors = []
        for tool_use in tool_uses:
            if not tool_use.get("success", True) and tool_use.get("error_type") == "validation_error":
                validation_errors.append({
                    "tool": tool_use.get("tool", "Unknown tool"),
                    "error": tool_use.get("result"),
                    "args": tool_use.get("args", {})
                })

        validation_error_rate = len(validation_errors) / tool_count if tool_count > 0 else 0

        param_samples = []
        for i, tool_use in enumerate(tool_uses[:5]):
            tool_name = tool_use.get("tool", "Unknown tool")
            tool_args = tool_use.get("args", {})
            success = tool_use.get("success", True) and not tool_use.get("error", False)
            error_type = tool_use.get("error_type", "") if not success else ""

            is_validation_error = error_type == "validation_error"

            sample = f"Tool use #{i+1} - {tool_name}:\n"
            sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n"
            sample += f"- Success: {'No' if not success else 'Yes'}"

            if is_validation_error:
                sample += " (PARAMETER VALIDATION ERROR)\n"
                sample += f"- Error: {tool_use.get('result', 'Unknown error')}"
            elif not success:
                sample += f" (Other error: {error_type})\n"

            param_samples.append(sample)

        validation_errors_info = ""
        if validation_errors:
            validation_errors_info = f"\nParameter validation errors detected: {len(validation_errors)} ({validation_error_rate:.1%} of tool uses)\n"
            for i, err in enumerate(validation_errors[:3]):
                tool_name = err.get("tool", "Unknown tool")
                error_msg = err.get("error", "Unknown error")
                args = err.get("args", {})
                validation_errors_info += f"\nValidation Error #{i+1}:\n- Tool: {tool_name}\n- Args: {json.dumps(args, indent=2)}\n- Error: {error_msg}"

            if len(validation_errors) > 3:
                validation_errors_info += f"\n...and {len(validation_errors) - 3} more validation errors."
        param_samples_text = "\n\n".join(param_samples)
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent extracts and formats PARAMETER VALUES for tool calls.

Your job is to evaluate ONLY whether the agent used the correct parameter VALUES, not whether the right tools were selected or how the tools were invoked.

Evaluate parameter extraction based on these criteria:
1. Accuracy (0-10): Are parameter values correctly identified from the context/task?
2. Formatting (0-10): Are values formatted correctly for each tool's requirements?
3. Completeness (0-10): Are all required parameter values provided, with no missing information?

IMPORTANT: DO NOT evaluate:
- Whether the right tool was chosen (that's the ToolSelectionEvaluator's job)
- How the tools were structurally invoked (that's the ToolInvocationEvaluator's job)
- The quality of results from tools

Focus ONLY on the PARAMETER VALUES - whether they were correctly extracted from the context, properly formatted, and complete.

Validation errors are important signals that parameter values weren't properly extracted or formatted.

Return your evaluation as JSON with these fields:
- scores: {"accuracy": number, "formatting": number, "completeness": number}
- overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on parameter value extraction quality)
- improvement_suggestions: string (concrete suggestions for better parameter VALUE extraction)
"""},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}

Parameter extraction examples:
{param_samples_text}
{validation_errors_info}

Evaluate the quality of the agent's parameter extraction for this task.
"""}
        ]

        assert self.llm is not None
        response = self.llm.call(prompt)

        try:
            evaluation_data = extract_json_from_llm_response(response)
            assert evaluation_data is not None

            scores = evaluation_data.get("scores", {})
            accuracy = scores.get("accuracy", 5.0)
            formatting = scores.get("formatting", 5.0)
            completeness = scores.get("completeness", 5.0)

            overall_score = float(evaluation_data.get("overall_score", 5.0))

            feedback = "Parameter Extraction Evaluation:\n"
            feedback += f"• Accuracy: {accuracy}/10 - Correctly identifying required parameters\n"
            feedback += f"• Formatting: {formatting}/10 - Properly formatting parameters for tools\n"
            feedback += f"• Completeness: {completeness}/10 - Including all necessary information\n\n"

            if "improvement_suggestions" in evaluation_data:
                feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
            else:
                feedback += evaluation_data.get("feedback", "No detailed feedback available.")

            return EvaluationScore(
                score=overall_score,
                feedback=feedback,
                raw_response=response
            )
        except Exception as e:
            return EvaluationScore(
                score=None,
                feedback=f"Error evaluating parameter extraction: {e}",
                raw_response=response
            )
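
# As the prompts above and below spell out, the evaluators split
# responsibilities by error_type: validation_error is read as a
# parameter-extraction problem (ParameterExtractionEvaluator, above), while
# execution_error and usage_error are read as invocation/structure problems
# (ToolInvocationEvaluator, below).
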


class ToolInvocationEvaluator(BaseEvaluator):
    @property
    def metric_category(self) -> MetricCategory:
        return MetricCategory.TOOL_INVOCATION

    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
    ) -> EvaluationScore:
        tool_uses = execution_trace.get("tool_uses", [])
        tool_errors = []
        tool_count = len(tool_uses)

        if tool_count == 0:
            return EvaluationScore(
                score=None,
                feedback="No tool usage detected. Cannot evaluate tool invocation."
            )

        for tool_use in tool_uses:
            if not tool_use.get("success", True) or tool_use.get("error", False):
                error_info = {
                    "tool": tool_use.get("tool", "Unknown tool"),
                    "error": tool_use.get("result"),
                    "error_type": tool_use.get("error_type", "unknown_error")
                }
                tool_errors.append(error_info)

        error_rate = len(tool_errors) / tool_count if tool_count > 0 else 0

        error_types = {}
        for error in tool_errors:
            error_type = error.get("error_type", "unknown_error")
            if error_type not in error_types:
                error_types[error_type] = 0
            error_types[error_type] += 1

        invocation_samples = []
        for i, tool_use in enumerate(tool_uses[:5]):
            tool_name = tool_use.get("tool", "Unknown tool")
            tool_args = tool_use.get("args", {})
            success = tool_use.get("success", True) and not tool_use.get("error", False)
            error_type = tool_use.get("error_type", "") if not success else ""
            error_msg = tool_use.get("result", "No error") if not success else "No error"

            sample = f"Tool invocation #{i+1}:\n"
            sample += f"- Tool: {tool_name}\n"
            sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n"
            sample += f"- Success: {'No' if not success else 'Yes'}\n"
            if not success:
                sample += f"- Error type: {error_type}\n"
                sample += f"- Error: {error_msg}"
            invocation_samples.append(sample)

        error_type_summary = ""
        if error_types:
            error_type_summary = "Error type breakdown:\n"
            for error_type, count in error_types.items():
                error_type_summary += f"- {error_type}: {count} occurrences ({(count/tool_count):.1%})\n"

        invocation_samples_text = "\n\n".join(invocation_samples)
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing how correctly an AI agent's tool invocations are STRUCTURED.

Your job is to evaluate ONLY the structural and syntactical aspects of how the agent called tools, NOT which tools were selected or what parameter values were used.

Evaluate the agent's tool invocation based on these criteria:
1. Structure (0-10): Does the tool call follow the expected syntax and format?
2. Error Handling (0-10): Does the agent handle tool errors appropriately?
3. Invocation Patterns (0-10): Are tool calls properly sequenced, batched, or managed?

Error types that indicate invocation issues:
- execution_error: The tool was called correctly but failed during execution
- usage_error: General errors in how the tool was used structurally

IMPORTANT: DO NOT evaluate:
- Whether the right tool was chosen (that's the ToolSelectionEvaluator's job)
- Whether the parameter values are correct (that's the ParameterExtractionEvaluator's job)
- The quality of results from tools

Focus ONLY on HOW tools were invoked - the structure, format, and handling of the invocation process.

Return your evaluation as JSON with these fields:
- scores: {"structure": number, "error_handling": number, "invocation_patterns": number}
- overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on structural aspects of tool invocation)
- improvement_suggestions: string (concrete suggestions for better structuring of tool calls)
"""},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}

Tool invocation examples:
{invocation_samples_text}

Tool error rate: {error_rate:.2%} ({len(tool_errors)} errors out of {tool_count} invocations)
{error_type_summary}

Evaluate the quality of the agent's tool invocation structure during this task.
"""}
        ]

        assert self.llm is not None
        response = self.llm.call(prompt)

        try:
            evaluation_data = extract_json_from_llm_response(response)
            assert evaluation_data is not None
            scores = evaluation_data.get("scores", {})
            structure = scores.get("structure", 5.0)
            error_handling = scores.get("error_handling", 5.0)
            invocation_patterns = scores.get("invocation_patterns", 5.0)

            overall_score = float(evaluation_data.get("overall_score", 5.0))

            feedback = "Tool Invocation Evaluation:\n"
            feedback += f"• Structure: {structure}/10 - Following proper syntax and format\n"
            feedback += f"• Error Handling: {error_handling}/10 - Appropriately handling tool errors\n"
            feedback += f"• Invocation Patterns: {invocation_patterns}/10 - Proper sequencing and management of calls\n\n"

            if "improvement_suggestions" in evaluation_data:
                feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
            else:
                feedback += evaluation_data.get("feedback", "No detailed feedback available.")

            return EvaluationScore(
                score=overall_score,
                feedback=feedback,
                raw_response=response
            )
        except Exception as e:
            return EvaluationScore(
                score=None,
                feedback=f"Error evaluating tool invocation: {e}",
                raw_response=response
            )
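

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library): how one of the
# evaluators above might be driven by hand against a synthetic trace. The
# evaluator constructor and the LLM wrapper used below are assumptions;
# check BaseEvaluator and your crewAI version for the real signatures.
#
#     from crewai import Agent, Task, LLM
#
#     agent = Agent(role="Researcher", goal="...", backstory="...", tools=[...])
#     task = Task(description="Collect recent AI news", expected_output="A summary")
#
#     evaluator = ToolSelectionEvaluator(llm=LLM(model="gpt-4o-mini"))  # assumed ctor
#     execution_trace = {
#         "tool_uses": [
#             {"tool": "web_search", "args": {"query": "AI news"}, "success": True},
#         ]
#     }
#     score = evaluator.evaluate(agent, task, execution_trace, final_output="...")
#     print(score.score)
#     print(score.feedback)
# ---------------------------------------------------------------------------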