From 60084af7458924cd92c386b63bf3225a4582b253 Mon Sep 17 00:00:00 2001
From: Lucas Gomide
Date: Wed, 9 Jul 2025 16:27:29 -0300
Subject: [PATCH] feat: add SemanticQuality metric for Agent evaluation

---
 .../metrics/semantic_quality_metrics.py | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 src/crewai/evaluation/metrics/semantic_quality_metrics.py

diff --git a/src/crewai/evaluation/metrics/semantic_quality_metrics.py b/src/crewai/evaluation/metrics/semantic_quality_metrics.py
new file mode 100644
index 000000000..24379b2c7
--- /dev/null
+++ b/src/crewai/evaluation/metrics/semantic_quality_metrics.py
@@ -0,0 +1,70 @@
+from typing import Any, Dict
+
+from crewai.agent import Agent
+from crewai.task import Task
+
+from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.evaluation.json_parser import extract_json_from_llm_response
+
+
+class SemanticQualityEvaluator(BaseEvaluator):
+    """Evaluate the semantic quality of an agent's final output using an LLM judge."""
+
+    @property
+    def metric_category(self) -> MetricCategory:
+        return MetricCategory.SEMANTIC_QUALITY
+
+    def evaluate(
+        self,
+        agent: Agent,
+        task: Task,
+        execution_trace: Dict[str, Any],
+        final_output: Any,
+    ) -> EvaluationScore:
+        # Build an LLM-as-judge prompt: the system message defines the rubric,
+        # the user message carries the agent/task context and the output to score.
+        prompt = [
+            {"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.
+
+Score the semantic quality on a scale from 0-10 where:
+- 0: Completely incoherent, confusing, or logically flawed output
+- 5: Moderately clear and logical output with some issues
+- 10: Exceptionally clear, coherent, and logically sound output
+
+Consider:
+1. Is the output well-structured and organized?
+2. Is the reasoning logical and well-supported?
+3. Is the language clear, precise, and appropriate for the task?
+4. Are claims supported by evidence when appropriate?
+5. Is the output free from contradictions and logical fallacies?
+
+Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string).
+"""},
+            {"role": "user", "content": f"""
+Agent role: {agent.role}
+Task description: {task.description}
+
+Agent's final output:
+{final_output}
+
+Evaluate the semantic quality and reasoning of this output.
+"""}
+        ]
+
+        response = self.llm.call(prompt)
+
+        # Parse the judge's JSON; fall back to a null score if parsing fails.
+        try:
+            evaluation_data = extract_json_from_llm_response(response)
+            raw_score = evaluation_data.get("score")
+            return EvaluationScore(
+                score=float(raw_score) if raw_score is not None else None,
+                feedback=evaluation_data.get("feedback", response),
+                raw_response=response,
+            )
+        except Exception as e:
+            return EvaluationScore(
+                score=None,
+                feedback=f"Failed to parse evaluation ({e}). Raw response: {response}",
+                raw_response=response,
+            )
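
A minimal usage sketch for the new evaluator, not part of the patch. It assumes BaseEvaluator can be constructed with an `llm` argument exposed as `self.llm`; that constructor is not shown in this diff, so the wiring below is hypothetical, and the stubbed LLM only needs a `call` method that returns the judge's JSON string.

from unittest.mock import MagicMock

from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator

# Stub LLM: `call` returns a well-formed JSON evaluation, so no model or network is needed.
fake_llm = MagicMock()
fake_llm.call.return_value = '{"score": 8, "feedback": "Clear, well-structured reasoning."}'

# Hypothetical construction; adjust to BaseEvaluator's real __init__ signature.
evaluator = SemanticQualityEvaluator(llm=fake_llm)

# Lightweight stand-ins for Agent and Task; evaluate() only reads .role and .description.
agent = MagicMock(role="Research Analyst")
task = MagicMock(description="Summarize the quarterly sales report.")

result = evaluator.evaluate(
    agent=agent,
    task=task,
    execution_trace={},
    final_output="Sales grew 12% quarter over quarter, driven by ...",
)

assert result.score == 8.0
assert "Clear" in result.feedback

Because the prompt is an ordinary list of role/content messages and the response is parsed with extract_json_from_llm_response, the evaluator can be exercised this way without running a crew end to end.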