Introduce Evaluator Experiment (#3133)

* feat: add exchanged messages in LLMCallCompletedEvent * feat: add GoalAlignment metric for Agent evaluation * feat: add SemanticQuality metric for Agent evaluation * feat: add Tool Metrics for Agent evaluation * feat: add Reasoning Metrics for Agent evaluation, still in progress * feat: add AgentEvaluator class. This class evaluates the Agent's results and reports them to the user * fix: do not evaluate Agent by default. This is an experimental feature that we still need to refine further * test: add Agent eval tests * fix: render all feedback per iteration * style: resolve linter issues * style: fix mypy issues * fix: allow messages to be empty on LLMCallCompletedEvent * feat: add Experiment evaluation framework with baseline comparison * fix: reset evaluator for each experiment iteration * fix: fix tracking of new test cases * chore: split Experimental evaluation classes * refactor: remove unused method * refactor: isolate Console print in a dedicated class * fix: make crew required to run an experiment * fix: use time-aware to define experiment result * test: add tests for Evaluator Experiment * style: fix linter issues * fix: encode string before hashing * style: resolve linter issues * feat: add experimental folder for beta features (#3141) * test: move tests to experimental folder
2026-05-01 07:13:00 +00:00 · 2025-07-14 10:06:45 -03:00
parent 3ada4053bd
commit 1b6b2b36d9
27 changed files with 2512 additions and 16 deletions
--- a/tests/experimental/evaluation/__init__.py
+++ b/tests/experimental/evaluation/__init__.py
--- a/tests/experimental/evaluation/metrics/__init__.py
+++ b/tests/experimental/evaluation/metrics/__init__.py
--- a/tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py
+++ b/tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py
@@ -0,0 +1,28 @@
+import pytest
+from unittest.mock import MagicMock
+from crewai.agent import Agent
+from crewai.task import Task
+
+class BaseEvaluationMetricsTest:
+    @pytest.fixture
+    def mock_agent(self):
+        agent = MagicMock(spec=Agent)
+        agent.id = "test_agent_id"
+        agent.role = "Test Agent"
+        agent.goal = "Test goal"
+        agent.tools = []
+        return agent
+
+    @pytest.fixture
+    def mock_task(self):
+        task = MagicMock(spec=Task)
+        task.description = "Test task description"
+        task.expected_output = "Test expected output"
+        return task
+
+    @pytest.fixture
+    def execution_trace(self):
+        return {
+            "thinking": ["I need to analyze this data carefully"],
+            "actions": ["Gathered information", "Analyzed data"]
+        }
--- a/tests/experimental/evaluation/metrics/test_goal_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_goal_metrics.py
@@ -0,0 +1,59 @@
+from unittest.mock import patch, MagicMock
+from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
+from crewai.utilities.llm_utils import LLM
+
+
+class TestGoalAlignmentEvaluator(BaseEvaluationMetricsTest):
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_evaluate_success(self, mock_create_llm, mock_agent, mock_task, execution_trace):
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "score": 8.5,
+            "feedback": "The agent correctly understood the task and produced relevant output."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        evaluator = GoalAlignmentEvaluator(llm=mock_llm)
+
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="This is the final output"
+        )
+
+        assert isinstance(result, EvaluationScore)
+        assert result.score == 8.5
+        assert "correctly understood the task" in result.feedback
+
+        mock_llm.call.assert_called_once()
+        prompt = mock_llm.call.call_args[0][0]
+        assert len(prompt) >= 2
+        assert "system" in prompt[0]["role"]
+        assert "user" in prompt[1]["role"]
+        assert mock_agent.role in prompt[1]["content"]
+        assert mock_task.description in prompt[1]["content"]
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_evaluate_error_handling(self, mock_create_llm, mock_agent, mock_task, execution_trace):
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = "Invalid JSON response"
+        mock_create_llm.return_value = mock_llm
+
+        evaluator = GoalAlignmentEvaluator(llm=mock_llm)
+
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="This is the final output"
+        )
+
+        assert isinstance(result, EvaluationScore)
+        assert result.score is None
+        assert "Failed to parse" in result.feedback
--- a/tests/experimental/evaluation/metrics/test_reasoning_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_reasoning_metrics.py
@@ -0,0 +1,166 @@
+import pytest
+from unittest.mock import patch, MagicMock
+from typing import List, Dict, Any
+
+from crewai.tasks.task_output import TaskOutput
+from crewai.experimental.evaluation.metrics.reasoning_metrics import (
+    ReasoningEfficiencyEvaluator,
+)
+from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from crewai.utilities.llm_utils import LLM
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+
+class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
+    @pytest.fixture
+    def mock_output(self):
+        output = MagicMock(spec=TaskOutput)
+        output.raw = "This is the test output"
+        return output
+
+    @pytest.fixture
+    def llm_calls(self) -> List[Dict[str, Any]]:
+        return [
+            {
+                "prompt": "How should I approach this task?",
+                "response": "I'll first research the topic, then compile findings.",
+                "timestamp": 1626987654
+            },
+            {
+                "prompt": "What resources should I use?",
+                "response": "I'll use relevant academic papers and reliable websites.",
+                "timestamp": 1626987754
+            },
+            {
+                "prompt": "How should I structure the output?",
+                "response": "I'll organize information clearly with headings and bullet points.",
+                "timestamp": 1626987854
+            }
+        ]
+
+    def test_insufficient_llm_calls(self, mock_agent, mock_task, mock_output):
+        execution_trace = {"llm_calls": []}
+
+        evaluator = ReasoningEfficiencyEvaluator()
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output=mock_output
+        )
+
+        assert isinstance(result, EvaluationScore)
+        assert result.score is None
+        assert "Insufficient LLM calls" in result.feedback
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "scores": {
+                "focus": 8.0,
+                "progression": 7.0,
+                "decision_quality": 7.5,
+                "conciseness": 8.0,
+                "loop_avoidance": 9.0
+            },
+            "overall_score": 7.9,
+            "feedback": "The agent demonstrated good reasoning efficiency.",
+            "optimization_suggestions": "The agent could improve by being more concise."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        # Setup execution trace with sufficient LLM calls
+        execution_trace = {"llm_calls": llm_calls}
+
+        # Mock the _detect_loops method to return a simple result
+        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
+        evaluator._detect_loops = MagicMock(return_value=(False, []))
+
+        # Evaluate
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output=mock_output
+        )
+
+        # Assertions
+        assert isinstance(result, EvaluationScore)
+        assert result.score == 7.9
+        assert "The agent demonstrated good reasoning efficiency" in result.feedback
+        assert "Reasoning Efficiency Evaluation:" in result.feedback
+        assert "• Focus: 8.0/10" in result.feedback
+
+        # Verify LLM was called
+        mock_llm.call.assert_called_once()
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_parse_error_handling(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = "Invalid JSON response"
+        mock_create_llm.return_value = mock_llm
+
+        # Setup execution trace
+        execution_trace = {"llm_calls": llm_calls}
+
+        # Mock the _detect_loops method
+        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
+        evaluator._detect_loops = MagicMock(return_value=(False, []))
+
+        # Evaluate
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output=mock_output
+        )
+
+        # Assertions for error handling
+        assert isinstance(result, EvaluationScore)
+        assert result.score is None
+        assert "Failed to parse reasoning efficiency evaluation" in result.feedback
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_loop_detection(self, mock_create_llm, mock_agent, mock_task, mock_output):
+        # Setup LLM calls with a repeating pattern
+        repetitive_llm_calls = [
+            {"prompt": "How to solve?", "response": "I'll try method A", "timestamp": 1000},
+            {"prompt": "Let me try method A", "response": "It didn't work", "timestamp": 1100},
+            {"prompt": "How to solve?", "response": "I'll try method A again", "timestamp": 1200},
+            {"prompt": "Let me try method A", "response": "It didn't work", "timestamp": 1300},
+            {"prompt": "How to solve?", "response": "I'll try method A one more time", "timestamp": 1400}
+        ]
+
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "scores": {
+                "focus": 6.0,
+                "progression": 3.0,
+                "decision_quality": 4.0,
+                "conciseness": 6.0,
+                "loop_avoidance": 2.0
+            },
+            "overall_score": 4.2,
+            "feedback": "The agent is stuck in a reasoning loop.",
+            "optimization_suggestions": "The agent should try different approaches when one fails."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        execution_trace = {"llm_calls": repetitive_llm_calls}
+
+        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
+
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output=mock_output
+        )
+
+        assert isinstance(result, EvaluationScore)
+        assert result.score == 4.2
+        assert "• Loop Avoidance: 2.0/10" in result.feedback
--- a/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py
@@ -0,0 +1,82 @@
+from unittest.mock import patch, MagicMock
+
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
+from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from crewai.utilities.llm_utils import LLM
+
+class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest):
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_evaluate_success(self, mock_create_llm, mock_agent, mock_task, execution_trace):
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "score": 8.5,
+            "feedback": "The output is clear, coherent, and logically structured."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        evaluator = SemanticQualityEvaluator(llm=mock_llm)
+
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="This is a well-structured analysis of the data."
+        )
+
+        assert isinstance(result, EvaluationScore)
+        assert result.score == 8.5
+        assert "clear, coherent" in result.feedback
+
+        mock_llm.call.assert_called_once()
+        prompt = mock_llm.call.call_args[0][0]
+        assert len(prompt) >= 2
+        assert "system" in prompt[0]["role"]
+        assert "user" in prompt[1]["role"]
+        assert mock_agent.role in prompt[1]["content"]
+        assert mock_task.description in prompt[1]["content"]
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_evaluate_with_empty_output(self, mock_create_llm, mock_agent, mock_task, execution_trace):
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "score": 2.0,
+            "feedback": "The output is empty or minimal, lacking substance."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        evaluator = SemanticQualityEvaluator(llm=mock_llm)
+
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output=""
+        )
+
+        assert isinstance(result, EvaluationScore)
+        assert result.score == 2.0
+        assert "empty or minimal" in result.feedback
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_evaluate_error_handling(self, mock_create_llm, mock_agent, mock_task, execution_trace):
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = "Invalid JSON response"
+        mock_create_llm.return_value = mock_llm
+
+        evaluator = SemanticQualityEvaluator(llm=mock_llm)
+
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="This is the output."
+        )
+
+        assert isinstance(result, EvaluationScore)
+        assert result.score is None
+        assert "Failed to parse" in result.feedback
--- a/tests/experimental/evaluation/metrics/test_tools_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_tools_metrics.py
@@ -0,0 +1,230 @@
+from unittest.mock import patch, MagicMock
+
+from crewai.experimental.evaluation.metrics.tools_metrics import (
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator
+)
+from crewai.utilities.llm_utils import LLM
+from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+
+class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
+    def test_no_tools_available(self, mock_task, mock_agent):
+        # Create agent with no tools
+        mock_agent.tools = []
+
+        execution_trace = {"tool_uses": []}
+
+        evaluator = ToolSelectionEvaluator()
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="Final output"
+        )
+
+        assert result.score is None
+        assert "no tools available" in result.feedback.lower()
+
+    def test_tools_available_but_none_used(self, mock_agent, mock_task):
+        mock_agent.tools = ["tool1", "tool2"]
+        execution_trace = {"tool_uses": []}
+
+        evaluator = ToolSelectionEvaluator()
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="Final output"
+        )
+
+        assert result.score is None
+        assert "had tools available but didn't use any" in result.feedback.lower()
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
+        # Setup mock LLM response
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "overall_score": 8.5,
+            "feedback": "The agent made good tool selections."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        # Setup execution trace with tool uses
+        execution_trace = {
+            "tool_uses": [
+                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
+                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
+            ]
+        }
+
+        evaluator = ToolSelectionEvaluator(llm=mock_llm)
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="Final output"
+        )
+
+        assert result.score == 8.5
+        assert "The agent made good tool selections" in result.feedback
+
+        # Verify LLM was called with correct prompt
+        mock_llm.call.assert_called_once()
+        prompt = mock_llm.call.call_args[0][0]
+        assert isinstance(prompt, list)
+        assert len(prompt) >= 2
+        assert "system" in prompt[0]["role"]
+        assert "user" in prompt[1]["role"]
+
+
+class TestParameterExtractionEvaluator(BaseEvaluationMetricsTest):
+    def test_no_tool_uses(self, mock_agent, mock_task):
+        execution_trace = {"tool_uses": []}
+
+        evaluator = ParameterExtractionEvaluator()
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="Final output"
+        )
+
+        assert result.score is None
+        assert "no tool usage" in result.feedback.lower()
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
+        mock_agent.tools = ["tool1", "tool2"]
+
+        # Setup mock LLM response
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "overall_score": 9.0,
+            "feedback": "The agent extracted parameters correctly."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        # Setup execution trace with tool uses
+        execution_trace = {
+            "tool_uses": [
+                {
+                    "tool": "search_tool",
+                    "input": {"query": "test query"},
+                    "output": "search results",
+                    "error": None
+                },
+                {
+                    "tool": "calculator",
+                    "input": {"expression": "2+2"},
+                    "output": "4",
+                    "error": None
+                }
+            ]
+        }
+
+        evaluator = ParameterExtractionEvaluator(llm=mock_llm)
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="Final output"
+        )
+
+        assert result.score == 9.0
+        assert "The agent extracted parameters correctly" in result.feedback
+
+
+class TestToolInvocationEvaluator(BaseEvaluationMetricsTest):
+    def test_no_tool_uses(self, mock_agent, mock_task):
+        execution_trace = {"tool_uses": []}
+
+        evaluator = ToolInvocationEvaluator()
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="Final output"
+        )
+
+        assert result.score is None
+        assert "no tool usage" in result.feedback.lower()
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
+        mock_agent.tools = ["tool1", "tool2"]
+        # Setup mock LLM response
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "overall_score": 8.0,
+            "feedback": "The agent invoked tools correctly."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        # Setup execution trace with tool uses
+        execution_trace = {
+            "tool_uses": [
+                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
+                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
+            ]
+        }
+
+        evaluator = ToolInvocationEvaluator(llm=mock_llm)
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="Final output"
+        )
+
+        assert result.score == 8.0
+        assert "The agent invoked tools correctly" in result.feedback
+
+    @patch("crewai.utilities.llm_utils.create_llm")
+    def test_evaluation_with_errors(self, mock_create_llm, mock_agent, mock_task):
+        mock_agent.tools = ["tool1", "tool2"]
+        # Setup mock LLM response
+        mock_llm = MagicMock(spec=LLM)
+        mock_llm.call.return_value = """
+        {
+            "overall_score": 5.5,
+            "feedback": "The agent had some errors in tool invocation."
+        }
+        """
+        mock_create_llm.return_value = mock_llm
+
+        # Setup execution trace with tool uses including errors
+        execution_trace = {
+            "tool_uses": [
+                {
+                    "tool": "search_tool",
+                    "input": {"query": "test query"},
+                    "output": "search results",
+                    "error": None
+                },
+                {
+                    "tool": "calculator",
+                    "input": {"expression": "2+"},
+                    "output": None,
+                    "error": "Invalid expression"
+                }
+            ]
+        }
+
+        evaluator = ToolInvocationEvaluator(llm=mock_llm)
+        result = evaluator.evaluate(
+            agent=mock_agent,
+            task=mock_task,
+            execution_trace=execution_trace,
+            final_output="Final output"
+        )
+
+        assert result.score == 5.5
+        assert "The agent had some errors in tool invocation" in result.feedback
--- a/tests/experimental/evaluation/test_agent_evaluator.py
+++ b/tests/experimental/evaluation/test_agent_evaluator.py
@@ -0,0 +1,95 @@
+import pytest
+
+from crewai.agent import Agent
+from crewai.task import Task
+from crewai.crew import Crew
+from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
+from crewai.experimental.evaluation import (
+    GoalAlignmentEvaluator,
+    SemanticQualityEvaluator,
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator,
+    ReasoningEfficiencyEvaluator
+)
+
+from crewai.experimental.evaluation import create_default_evaluator
+class TestAgentEvaluator:
+    @pytest.fixture
+    def mock_crew(self):
+        agent = Agent(
+            role="Test Agent",
+            goal="Complete test tasks successfully",
+            backstory="An agent created for testing purposes",
+            allow_delegation=False,
+            verbose=False
+        )
+
+        task = Task(
+            description="Test task description",
+            agent=agent,
+            expected_output="Expected test output"
+        )
+
+        crew = Crew(
+            agents=[agent],
+            tasks=[task]
+        )
+        return crew
+
+    def test_set_iteration(self):
+        agent_evaluator = AgentEvaluator()
+
+        agent_evaluator.set_iteration(3)
+        assert agent_evaluator.iteration == 3
+
+    @pytest.mark.vcr(filter_headers=["authorization"])
+    def test_evaluate_current_iteration(self, mock_crew):
+        agent_evaluator = AgentEvaluator(crew=mock_crew, evaluators=[GoalAlignmentEvaluator()])
+
+        mock_crew.kickoff()
+
+        results = agent_evaluator.evaluate_current_iteration()
+
+        assert isinstance(results, dict)
+
+        agent, = mock_crew.agents
+        task, = mock_crew.tasks
+
+        assert len(mock_crew.agents) == 1
+        assert agent.role in results
+        assert len(results[agent.role]) == 1
+
+        result, = results[agent.role]
+        assert isinstance(result, AgentEvaluationResult)
+
+        assert result.agent_id == str(agent.id)
+        assert result.task_id == str(task.id)
+
+        goal_alignment, = result.metrics.values()
+        assert goal_alignment.score == 5.0
+
+        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
+        assert expected_feedback in goal_alignment.feedback
+
+        assert goal_alignment.raw_response is not None
+        assert '"score": 5' in goal_alignment.raw_response
+
+    def test_create_default_evaluator(self, mock_crew):
+        agent_evaluator = create_default_evaluator(crew=mock_crew)
+        assert isinstance(agent_evaluator, AgentEvaluator)
+        assert agent_evaluator.crew == mock_crew
+
+        expected_types = [
+            GoalAlignmentEvaluator,
+            SemanticQualityEvaluator,
+            ToolSelectionEvaluator,
+            ParameterExtractionEvaluator,
+            ToolInvocationEvaluator,
+            ReasoningEfficiencyEvaluator
+        ]
+
+        assert len(agent_evaluator.evaluators) == len(expected_types)
+        for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
+            assert isinstance(evaluator, expected_type)
--- a/tests/experimental/evaluation/test_experiment_result.py
+++ b/tests/experimental/evaluation/test_experiment_result.py
@@ -0,0 +1,111 @@
+import pytest
+from unittest.mock import MagicMock, patch
+
+from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
+
+
+class TestExperimentResult:
+    @pytest.fixture
+    def mock_results(self):
+        return [
+            ExperimentResult(
+                identifier="test-1",
+                inputs={"query": "What is the capital of France?"},
+                score=10,
+                expected_score=7,
+                passed=True
+            ),
+            ExperimentResult(
+                identifier="test-2",
+                inputs={"query": "Who wrote Hamlet?"},
+                score={"relevance": 9, "factuality": 8},
+                expected_score={"relevance": 7, "factuality": 7},
+                passed=True,
+                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
+            ),
+            ExperimentResult(
+                identifier="test-3",
+                inputs={"query": "Any query"},
+                score={"relevance": 9, "factuality": 8},
+                expected_score={"relevance": 7, "factuality": 7},
+                passed=False,
+                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
+            ),
+            ExperimentResult(
+                identifier="test-4",
+                inputs={"query": "Another query"},
+                score={"relevance": 9, "factuality": 8},
+                expected_score={"relevance": 7, "factuality": 7},
+                passed=True,
+                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
+            ),
+            ExperimentResult(
+                identifier="test-6",
+                inputs={"query": "Yet another query"},
+                score={"relevance": 9, "factuality": 8},
+                expected_score={"relevance": 7, "factuality": 7},
+                passed=True,
+                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
+            )
+        ]
+
+    @patch('os.path.exists', return_value=True)
+    @patch('os.path.getsize', return_value=1)
+    @patch('json.load')
+    @patch('builtins.open', new_callable=MagicMock)
+    def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results):
+        baseline_data = {
+            "timestamp": "2023-01-01T00:00:00+00:00",
+            "results": [
+                {
+                    "identifier": "test-1",
+                    "inputs": {"query": "What is the capital of France?"},
+                    "score": 7,
+                    "expected_score": 7,
+                    "passed": False
+                },
+                {
+                    "identifier": "test-2",
+                    "inputs": {"query": "Who wrote Hamlet?"},
+                    "score": {"relevance": 8, "factuality": 7},
+                    "expected_score": {"relevance": 7, "factuality": 7},
+                    "passed": True
+                },
+                {
+                    "identifier": "test-3",
+                    "inputs": {"query": "Any query"},
+                    "score": {"relevance": 8, "factuality": 7},
+                    "expected_score": {"relevance": 7, "factuality": 7},
+                    "passed": True
+                },
+                {
+                    "identifier": "test-4",
+                    "inputs": {"query": "Another query"},
+                    "score": {"relevance": 8, "factuality": 7},
+                    "expected_score": {"relevance": 7, "factuality": 7},
+                    "passed": True
+                },
+                {
+                    "identifier": "test-5",
+                    "inputs": {"query": "Another query"},
+                    "score": {"relevance": 8, "factuality": 7},
+                    "expected_score": {"relevance": 7, "factuality": 7},
+                    "passed": True
+                }
+            ]
+        }
+
+        mock_json_load.return_value = baseline_data
+
+        results = ExperimentResults(results=mock_results)
+        results.display = MagicMock()
+
+        comparison = results.compare_with_baseline(baseline_filepath="baseline.json")
+
+        assert "baseline_timestamp" in comparison
+        assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00"
+        assert comparison["improved"] == ["test-1"]
+        assert comparison["regressed"] == ["test-3"]
+        assert comparison["unchanged"] == ["test-2", "test-4"]
+        assert comparison["new_tests"] == ["test-6"]
+        assert comparison["missing_tests"] == ["test-5"]
--- a/tests/experimental/evaluation/test_experiment_runner.py
+++ b/tests/experimental/evaluation/test_experiment_runner.py
@@ -0,0 +1,197 @@
+import pytest
+from unittest.mock import MagicMock, patch
+
+from crewai.crew import Crew
+from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
+from crewai.experimental.evaluation.experiment.result import ExperimentResults
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore
+
+
+class TestExperimentRunner:
+    @pytest.fixture
+    def mock_crew(self):
+        return MagicMock(llm=Crew)
+
+    @pytest.fixture
+    def mock_evaluator_results(self):
+        agent_evaluation = AgentAggregatedEvaluationResult(
+            agent_id="Test Agent",
+            agent_role="Test Agent Role",
+            metrics={
+                MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
+                    score=9,
+                    feedback="Test feedback for goal alignment",
+                    raw_response="Test raw response for goal alignment"
+                ),
+                MetricCategory.REASONING_EFFICIENCY: EvaluationScore(
+                    score=None,
+                    feedback="Reasoning efficiency not applicable",
+                    raw_response="Reasoning efficiency not applicable"
+                ),
+                MetricCategory.PARAMETER_EXTRACTION: EvaluationScore(
+                    score=7,
+                    feedback="Test parameter extraction explanation",
+                    raw_response="Test raw output"
+                ),
+                MetricCategory.TOOL_SELECTION: EvaluationScore(
+                    score=8,
+                    feedback="Test tool selection explanation",
+                    raw_response="Test raw output"
+                )
+            }
+        )
+
+        return {"Test Agent": agent_evaluation}
+
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
+    def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
+        dataset = [
+            {
+                "identifier": "test-case-1",
+                "inputs": {"query": "Test query 1"},
+                "expected_score": 8
+            },
+            {
+                "identifier": "test-case-2",
+                "inputs": {"query": "Test query 2"},
+                "expected_score": {"goal_alignment": 7}
+            },
+            {
+                "inputs": {"query": "Test query 3"},
+                "expected_score": {"tool_selection": 9}
+            }
+        ]
+
+        mock_evaluator = MagicMock()
+        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
+        mock_evaluator.reset_iterations_results = MagicMock()
+        mock_create_evaluator.return_value = mock_evaluator
+
+        runner = ExperimentRunner(dataset=dataset)
+
+        results = runner.run(crew=mock_crew)
+
+        assert isinstance(results, ExperimentResults)
+        result_1, result_2, result_3 = results.results
+        assert len(results.results) == 3
+
+        assert result_1.identifier == "test-case-1"
+        assert result_1.inputs == {"query": "Test query 1"}
+        assert result_1.expected_score == 8
+        assert result_1.passed is True
+
+        assert result_2.identifier == "test-case-2"
+        assert result_2.inputs == {"query": "Test query 2"}
+        assert isinstance(result_2.expected_score, dict)
+        assert "goal_alignment" in result_2.expected_score
+        assert result_2.passed is True
+
+        assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5"
+        assert result_3.inputs == {"query": "Test query 3"}
+        assert isinstance(result_3.expected_score, dict)
+        assert "tool_selection" in result_3.expected_score
+        assert result_3.passed is False
+
+        assert mock_crew.kickoff.call_count == 3
+        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"})
+        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"})
+        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"})
+
+        assert mock_evaluator.reset_iterations_results.call_count == 3
+        assert mock_evaluator.get_agent_evaluation.call_count == 3
+
+
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
+    def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
+        dataset = [
+            {
+                "identifier": "test-case-2",
+                "inputs": {"query": "Test query 2"},
+                "expected_score": {"goal_alignment": 7, "unknown_metric": 8}
+            }
+        ]
+
+        mock_evaluator = MagicMock()
+        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
+        mock_evaluator.reset_iterations_results = MagicMock()
+        mock_create_evaluator.return_value = mock_evaluator
+
+        runner = ExperimentRunner(dataset=dataset)
+
+        results = runner.run(crew=mock_crew)
+
+        result, = results.results
+
+        assert result.identifier == "test-case-2"
+        assert result.inputs == {"query": "Test query 2"}
+        assert isinstance(result.expected_score, dict)
+        assert "goal_alignment" in result.expected_score.keys()
+        assert "unknown_metric" in result.expected_score.keys()
+        assert result.passed is True
+
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
+    def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
+        dataset = [
+            {
+                "identifier": "test-case-2",
+                "inputs": {"query": "Test query 2"},
+                "expected_score": {"goal_alignment": 7}
+            }
+        ]
+
+        mock_evaluator = MagicMock()
+        mock_create_evaluator["Test Agent"].metrics = {
+            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
+                    score=9,
+                    feedback="Test feedback for goal alignment",
+                    raw_response="Test raw response for goal alignment"
+                )
+        }
+        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
+        mock_evaluator.reset_iterations_results = MagicMock()
+        mock_create_evaluator.return_value = mock_evaluator
+
+        runner = ExperimentRunner(dataset=dataset)
+
+        results = runner.run(crew=mock_crew)
+        result, = results.results
+
+        assert result.identifier == "test-case-2"
+        assert result.inputs == {"query": "Test query 2"}
+        assert isinstance(result.expected_score, dict)
+        assert "goal_alignment" in result.expected_score.keys()
+        assert result.passed is True
+
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
+    def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
+        dataset = [
+            {
+                "identifier": "test-case-2",
+                "inputs": {"query": "Test query 2"},
+                "expected_score": {"unknown_metric": 7}
+            }
+        ]
+
+        mock_evaluator = MagicMock()
+        mock_create_evaluator["Test Agent"].metrics = {
+            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
+                score=5,
+                feedback="Test feedback for goal alignment",
+                raw_response="Test raw response for goal alignment"
+            )
+        }
+        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
+        mock_evaluator.reset_iterations_results = MagicMock()
+        mock_create_evaluator.return_value = mock_evaluator
+
+        runner = ExperimentRunner(dataset=dataset)
+
+        results = runner.run(crew=mock_crew)
+        result, = results.results
+
+        assert result.identifier == "test-case-2"
+        assert result.inputs == {"query": "Test query 2"}
+        assert isinstance(result.expected_score, dict)
+        assert "unknown_metric" in result.expected_score.keys()
+        assert result.passed is False