Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-09 08:08:32 +00:00
feat: add SemanticQuality metric for Agent evaluation
63 lines  src/crewai/evaluation/metrics/semantic_quality_metrics.py  (new file)
@@ -0,0 +1,63 @@
from typing import Any, Dict

from crewai.agent import Agent
from crewai.task import Task

from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.evaluation.json_parser import extract_json_from_llm_response


class SemanticQualityEvaluator(BaseEvaluator):
    @property
    def metric_category(self) -> MetricCategory:
        return MetricCategory.SEMANTIC_QUALITY

    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
    ) -> EvaluationScore:
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.

Score the semantic quality on a scale from 0-10 where:
- 0: Completely incoherent, confusing, or logically flawed output
- 5: Moderately clear and logical output with some issues
- 10: Exceptionally clear, coherent, and logically sound output

Consider:
1. Is the output well-structured and organized?
2. Is the reasoning logical and well-supported?
3. Is the language clear, precise, and appropriate for the task?
4. Are claims supported by evidence when appropriate?
5. Is the output free from contradictions and logical fallacies?

Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string).
"""},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}

Agent's final output:
{final_output}

Evaluate the semantic quality and reasoning of this output.
"""}
        ]

        response = self.llm.call(prompt)

        try:
            evaluation_data = extract_json_from_llm_response(response)
            # Guard against a missing "score" field so float(None) does not raise.
            score_value = evaluation_data.get("score")
            return EvaluationScore(
                score=float(score_value) if score_value is not None else None,
                feedback=evaluation_data.get("feedback", response),
                raw_response=response
            )
        except Exception as e:
            # Fall back to an unscored result when the LLM response cannot be parsed as JSON.
            return EvaluationScore(
                score=None,
                feedback=f"Failed to parse evaluation ({e}). Raw response: {response}",
                raw_response=response
            )
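For context, a rough usage sketch of the new metric follows. It is not part of the diff: the SemanticQualityEvaluator constructor signature (an `llm` keyword stored on `self.llm` by BaseEvaluator), the `crewai.llm.LLM` judge model chosen here, and the sample Agent/Task values are all assumptions made for illustration.

    # Hypothetical usage sketch -- the evaluator constructor arguments and judge LLM
    # are assumptions, not something this commit defines.
    from crewai import Agent, Task
    from crewai.llm import LLM
    from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator

    agent = Agent(
        role="Research Analyst",
        goal="Summarize findings clearly",
        backstory="An analyst who writes concise, well-structured reports.",
    )
    task = Task(
        description="Summarize the key findings of the attached report.",
        expected_output="A short, well-organized summary.",
        agent=agent,
    )

    # Assumption: BaseEvaluator stores the judge LLM on `self.llm` via its constructor.
    evaluator = SemanticQualityEvaluator(llm=LLM(model="gpt-4o-mini"))

    score = evaluator.evaluate(
        agent=agent,
        task=task,
        execution_trace={},  # the trace is not used by this metric's prompt
        final_output="Revenue grew 12% year over year, driven by ...",
    )
    print(score.score, score.feedback)

Note that `execution_trace` is accepted for interface compatibility but never referenced when building the prompt; only the agent role, task description, and final output reach the judge LLM.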