From 60084af7458924cd92c386b63bf3225a4582b253 Mon Sep 17 00:00:00 2001
From: Lucas Gomide
Date: Wed, 9 Jul 2025 16:27:29 -0300
Subject: [PATCH] feat: add SemanticQuality metric for Agent evaluation

---
 .../metrics/semantic_quality_metrics.py | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 src/crewai/evaluation/metrics/semantic_quality_metrics.py

diff --git a/src/crewai/evaluation/metrics/semantic_quality_metrics.py b/src/crewai/evaluation/metrics/semantic_quality_metrics.py
new file mode 100644
index 000000000..24379b2c7
--- /dev/null
+++ b/src/crewai/evaluation/metrics/semantic_quality_metrics.py
@@ -0,0 +1,70 @@
+from typing import Any, Dict
+
+from crewai.agent import Agent
+from crewai.task import Task
+
+from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.evaluation.json_parser import extract_json_from_llm_response
+
+
+class SemanticQualityEvaluator(BaseEvaluator):
+    """Evaluate the semantic quality of an agent's final output using an LLM judge."""
+
+    @property
+    def metric_category(self) -> MetricCategory:
+        return MetricCategory.SEMANTIC_QUALITY
+
+    def evaluate(
+        self,
+        agent: Agent,
+        task: Task,
+        execution_trace: Dict[str, Any],
+        final_output: Any,
+    ) -> EvaluationScore:
+        # Build an LLM-as-judge prompt: the system message defines the rubric,
+        # the user message carries the agent/task context and the output to score.
+        prompt = [
+            {"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.
+
+Score the semantic quality on a scale from 0-10 where:
+- 0: Completely incoherent, confusing, or logically flawed output
+- 5: Moderately clear and logical output with some issues
+- 10: Exceptionally clear, coherent, and logically sound output
+
+Consider:
+1. Is the output well-structured and organized?
+2. Is the reasoning logical and well-supported?
+3. Is the language clear, precise, and appropriate for the task?
+4. Are claims supported by evidence when appropriate?
+5. Is the output free from contradictions and logical fallacies?
+
+Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string).
+"""},
+            {"role": "user", "content": f"""
+Agent role: {agent.role}
+Task description: {task.description}
+
+Agent's final output:
+{final_output}
+
+Evaluate the semantic quality and reasoning of this output.
+"""}
+        ]
+
+        response = self.llm.call(prompt)
+
+        # Parse the judge's JSON; fall back to a null score if parsing fails.
+        try:
+            evaluation_data = extract_json_from_llm_response(response)
+            raw_score = evaluation_data.get("score")
+            return EvaluationScore(
+                score=float(raw_score) if raw_score is not None else None,
+                feedback=evaluation_data.get("feedback", response),
+                raw_response=response,
+            )
+        except Exception as e:
+            return EvaluationScore(
+                score=None,
+                feedback=f"Failed to parse evaluation ({e}). Raw response: {response}",
+                raw_response=response,
+            )
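
A minimal usage sketch for the new evaluator, not part of the patch. It assumes BaseEvaluator can be constructed with an `llm` argument exposed as `self.llm`; that constructor is not shown in this diff, so the wiring below is hypothetical, and the stubbed LLM only needs a `call` method that returns the judge's JSON string.

from unittest.mock import MagicMock

from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator

# Stub LLM: `call` returns a well-formed JSON evaluation, so no model or network is needed.
fake_llm = MagicMock()
fake_llm.call.return_value = '{"score": 8, "feedback": "Clear, well-structured reasoning."}'

# Hypothetical construction; adjust to BaseEvaluator's real __init__ signature.
evaluator = SemanticQualityEvaluator(llm=fake_llm)

# Lightweight stand-ins for Agent and Task; evaluate() only reads .role and .description.
agent = MagicMock(role="Research Analyst")
task = MagicMock(description="Summarize the quarterly sales report.")

result = evaluator.evaluate(
    agent=agent,
    task=task,
    execution_trace={},
    final_output="Sales grew 12% quarter over quarter, driven by ...",
)

assert result.score == 8.0
assert "Clear" in result.feedback

Because the prompt is an ordinary list of role/content messages and the response is parsed with extract_json_from_llm_response, the evaluator can be exercised this way without running a crew end to end.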