Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-29 18:18:13 +00:00
Introducing Agent evaluation (#3130)
* feat: add exchanged messages to LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool metrics for Agent evaluation
* feat: add Reasoning metrics for Agent evaluation (still in progress)
* feat: add AgentEvaluator class, which evaluates an Agent's results and reports them to the user
* fix: do not evaluate Agents by default; this is an experimental feature that still needs further refinement
* test: add Agent evaluation tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
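For orientation, the sketch below shows the general shape of the evaluation flow the commit message describes: a set of named metrics is scored against an agent's output and the results are collected as per-iteration feedback. Everything in it (MetricResult, evaluate_output, the toy goal_alignment function) is an illustrative assumption, not crewAI's actual AgentEvaluator API.

# Illustrative sketch only -- not crewAI's AgentEvaluator API.
# It mirrors the pattern described in the commit: run several named
# metrics over an agent's output and collect the feedback.
from dataclasses import dataclass
from typing import Callable


@dataclass
class MetricResult:
    name: str
    score: float  # 0.0 (poor) to 1.0 (ideal)
    feedback: str


def evaluate_output(
    output: str,
    metrics: dict[str, Callable[[str], tuple[float, str]]],
) -> list[MetricResult]:
    """Score the output with every metric and collect the results."""
    results = []
    for name, metric in metrics.items():
        score, feedback = metric(output)
        results.append(MetricResult(name=name, score=score, feedback=feedback))
    return results


def goal_alignment(output: str) -> tuple[float, str]:
    """Toy stand-in for a goal-alignment check (keyword match only)."""
    if "summary" in output.lower():
        return 1.0, "Output addresses the requested summary."
    return 0.0, "Output does not mention the requested summary."


if __name__ == "__main__":
    for result in evaluate_output(
        "Here is the summary you asked for.",
        {"goal_alignment": goal_alignment},
    ):
        print(f"{result.name}: {result.score:.1f} - {result.feedback}")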
tests/evaluation/metrics/base_evaluation_metrics_test.py (new file, 28 lines added)
@@ -0,0 +1,28 @@
import pytest
from unittest.mock import MagicMock
from crewai.agent import Agent
from crewai.task import Task

class BaseEvaluationMetricsTest:
    @pytest.fixture
    def mock_agent(self):
        agent = MagicMock(spec=Agent)
        agent.id = "test_agent_id"
        agent.role = "Test Agent"
        agent.goal = "Test goal"
        agent.tools = []
        return agent

    @pytest.fixture
    def mock_task(self):
        task = MagicMock(spec=Task)
        task.description = "Test task description"
        task.expected_output = "Test expected output"
        return task

    @pytest.fixture
    def execution_trace(self):
        return {
            "thinking": ["I need to analyze this data carefully"],
            "actions": ["Gathered information", "Analyzed data"]
        }
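In use, a concrete metric test would subclass BaseEvaluationMetricsTest so that pytest's fixture inheritance makes mock_agent, mock_task, and execution_trace available without redefining them. The subclass name and assertions below are a hypothetical sketch; the metric classes added in this commit are not exercised here.

# Hypothetical subclass showing how the shared fixtures are consumed.
class TestGoalAlignmentFixtures(BaseEvaluationMetricsTest):
    def test_shared_fixtures_are_available(self, mock_agent, mock_task, execution_trace):
        assert mock_agent.role == "Test Agent"
        assert mock_task.expected_output == "Test expected output"
        assert execution_trace["thinking"] == ["I need to analyze this data carefully"]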