feat: restructure project as UV workspace with crewai in lib/

Author: Greyson LaLonde
Date: 2025-09-26 14:29:28 -04:00
parent 74b5c88834
commit daf6f679ff
763 changed files with 1181 additions and 398 deletions
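
Each hunk below deletes a test file from its old location as part of the restructure. For orientation, a minimal sketch of what the resulting UV workspace layout might look like, inferred from the commit title alone (every path except lib/crewai is an assumption):

pyproject.toml            # workspace root, declares the [tool.uv.workspace] members
lib/
    crewai/               # the crewai library as a workspace member (per the commit title)
        pyproject.toml    # package metadata for the crewai member (assumed)
        src/crewai/       # assumed src layout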


@@ -1,29 +0,0 @@
import pytest
from unittest.mock import MagicMock
from crewai.agent import Agent
from crewai.task import Task
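# Shared pytest fixtures for the metric evaluator tests: a mocked Agent, a mocked Task,
# and a small execution trace containing thinking steps and actions.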
class BaseEvaluationMetricsTest:
@pytest.fixture
def mock_agent(self):
agent = MagicMock(spec=Agent)
agent.id = "test_agent_id"
agent.role = "Test Agent"
agent.goal = "Test goal"
agent.tools = []
return agent
@pytest.fixture
def mock_task(self):
task = MagicMock(spec=Task)
task.description = "Test task description"
task.expected_output = "Test expected output"
return task
@pytest.fixture
def execution_trace(self):
return {
"thinking": ["I need to analyze this data carefully"],
"actions": ["Gathered information", "Analyzed data"],
}


@@ -1,65 +0,0 @@
from unittest.mock import patch, MagicMock
from tests.experimental.evaluation.metrics.test_base_evaluation_metrics import (
BaseEvaluationMetricsTest,
)
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.utilities.llm_utils import LLM
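# TestGoalAlignmentEvaluator: a well-formed JSON reply from the mocked LLM is parsed into
# score and feedback; an unparseable reply yields a None score and a "Failed to parse" message.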
class TestGoalAlignmentEvaluator(BaseEvaluationMetricsTest):
@patch("crewai.utilities.llm_utils.create_llm")
def test_evaluate_success(
self, mock_create_llm, mock_agent, mock_task, execution_trace
):
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"score": 8.5,
"feedback": "The agent correctly understood the task and produced relevant output."
}
"""
mock_create_llm.return_value = mock_llm
evaluator = GoalAlignmentEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="This is the final output",
)
assert isinstance(result, EvaluationScore)
assert result.score == 8.5
assert "correctly understood the task" in result.feedback
mock_llm.call.assert_called_once()
prompt = mock_llm.call.call_args[0][0]
assert len(prompt) >= 2
assert "system" in prompt[0]["role"]
assert "user" in prompt[1]["role"]
assert mock_agent.role in prompt[1]["content"]
assert mock_task.description in prompt[1]["content"]
@patch("crewai.utilities.llm_utils.create_llm")
def test_evaluate_error_handling(
self, mock_create_llm, mock_agent, mock_task, execution_trace
):
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = "Invalid JSON response"
mock_create_llm.return_value = mock_llm
evaluator = GoalAlignmentEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="This is the final output",
)
assert isinstance(result, EvaluationScore)
assert result.score is None
assert "Failed to parse" in result.feedback


@@ -1,193 +0,0 @@
import pytest
from unittest.mock import patch, MagicMock
from typing import List, Dict, Any
from crewai.tasks.task_output import TaskOutput
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
ReasoningEfficiencyEvaluator,
)
from tests.experimental.evaluation.metrics.test_base_evaluation_metrics import (
BaseEvaluationMetricsTest,
)
from crewai.utilities.llm_utils import LLM
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
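# TestReasoningEfficiencyEvaluator: covers the insufficient-LLM-calls short-circuit, parsing of
# a valid JSON evaluation into per-dimension scores, graceful handling of invalid JSON, and a
# repetitive call pattern that surfaces a low loop-avoidance score.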
class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
@pytest.fixture
def mock_output(self):
output = MagicMock(spec=TaskOutput)
output.raw = "This is the test output"
return output
@pytest.fixture
def llm_calls(self) -> List[Dict[str, Any]]:
return [
{
"prompt": "How should I approach this task?",
"response": "I'll first research the topic, then compile findings.",
"timestamp": 1626987654,
},
{
"prompt": "What resources should I use?",
"response": "I'll use relevant academic papers and reliable websites.",
"timestamp": 1626987754,
},
{
"prompt": "How should I structure the output?",
"response": "I'll organize information clearly with headings and bullet points.",
"timestamp": 1626987854,
},
]
def test_insufficient_llm_calls(self, mock_agent, mock_task, mock_output):
execution_trace = {"llm_calls": []}
evaluator = ReasoningEfficiencyEvaluator()
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output=mock_output,
)
assert isinstance(result, EvaluationScore)
assert result.score is None
assert "Insufficient LLM calls" in result.feedback
@patch("crewai.utilities.llm_utils.create_llm")
def test_successful_evaluation(
self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls
):
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"scores": {
"focus": 8.0,
"progression": 7.0,
"decision_quality": 7.5,
"conciseness": 8.0,
"loop_avoidance": 9.0
},
"overall_score": 7.9,
"feedback": "The agent demonstrated good reasoning efficiency.",
"optimization_suggestions": "The agent could improve by being more concise."
}
"""
mock_create_llm.return_value = mock_llm
# Setup execution trace with sufficient LLM calls
execution_trace = {"llm_calls": llm_calls}
# Mock the _detect_loops method to return a simple result
evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
evaluator._detect_loops = MagicMock(return_value=(False, []))
# Evaluate
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output=mock_output,
)
# Assertions
assert isinstance(result, EvaluationScore)
assert result.score == 7.9
assert "The agent demonstrated good reasoning efficiency" in result.feedback
assert "Reasoning Efficiency Evaluation:" in result.feedback
assert "• Focus: 8.0/10" in result.feedback
# Verify LLM was called
mock_llm.call.assert_called_once()
@patch("crewai.utilities.llm_utils.create_llm")
def test_parse_error_handling(
self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls
):
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = "Invalid JSON response"
mock_create_llm.return_value = mock_llm
# Setup execution trace
execution_trace = {"llm_calls": llm_calls}
# Mock the _detect_loops method
evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
evaluator._detect_loops = MagicMock(return_value=(False, []))
# Evaluate
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output=mock_output,
)
# Assertions for error handling
assert isinstance(result, EvaluationScore)
assert result.score is None
assert "Failed to parse reasoning efficiency evaluation" in result.feedback
@patch("crewai.utilities.llm_utils.create_llm")
def test_loop_detection(self, mock_create_llm, mock_agent, mock_task, mock_output):
# Setup LLM calls with a repeating pattern
repetitive_llm_calls = [
{
"prompt": "How to solve?",
"response": "I'll try method A",
"timestamp": 1000,
},
{
"prompt": "Let me try method A",
"response": "It didn't work",
"timestamp": 1100,
},
{
"prompt": "How to solve?",
"response": "I'll try method A again",
"timestamp": 1200,
},
{
"prompt": "Let me try method A",
"response": "It didn't work",
"timestamp": 1300,
},
{
"prompt": "How to solve?",
"response": "I'll try method A one more time",
"timestamp": 1400,
},
]
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"scores": {
"focus": 6.0,
"progression": 3.0,
"decision_quality": 4.0,
"conciseness": 6.0,
"loop_avoidance": 2.0
},
"overall_score": 4.2,
"feedback": "The agent is stuck in a reasoning loop.",
"optimization_suggestions": "The agent should try different approaches when one fails."
}
"""
mock_create_llm.return_value = mock_llm
execution_trace = {"llm_calls": repetitive_llm_calls}
evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output=mock_output,
)
assert isinstance(result, EvaluationScore)
assert result.score == 4.2
assert "• Loop Avoidance: 2.0/10" in result.feedback


@@ -1,93 +0,0 @@
from unittest.mock import patch, MagicMock
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
SemanticQualityEvaluator,
)
from tests.experimental.evaluation.metrics.test_base_evaluation_metrics import (
BaseEvaluationMetricsTest,
)
from crewai.utilities.llm_utils import LLM
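# TestSemanticQualityEvaluator: parses a successful LLM response, scores an empty final output,
# and falls back to a None score when the response is not valid JSON.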
class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest):
@patch("crewai.utilities.llm_utils.create_llm")
def test_evaluate_success(
self, mock_create_llm, mock_agent, mock_task, execution_trace
):
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"score": 8.5,
"feedback": "The output is clear, coherent, and logically structured."
}
"""
mock_create_llm.return_value = mock_llm
evaluator = SemanticQualityEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="This is a well-structured analysis of the data.",
)
assert isinstance(result, EvaluationScore)
assert result.score == 8.5
assert "clear, coherent" in result.feedback
mock_llm.call.assert_called_once()
prompt = mock_llm.call.call_args[0][0]
assert len(prompt) >= 2
assert "system" in prompt[0]["role"]
assert "user" in prompt[1]["role"]
assert mock_agent.role in prompt[1]["content"]
assert mock_task.description in prompt[1]["content"]
@patch("crewai.utilities.llm_utils.create_llm")
def test_evaluate_with_empty_output(
self, mock_create_llm, mock_agent, mock_task, execution_trace
):
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"score": 2.0,
"feedback": "The output is empty or minimal, lacking substance."
}
"""
mock_create_llm.return_value = mock_llm
evaluator = SemanticQualityEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="",
)
assert isinstance(result, EvaluationScore)
assert result.score == 2.0
assert "empty or minimal" in result.feedback
@patch("crewai.utilities.llm_utils.create_llm")
def test_evaluate_error_handling(
self, mock_create_llm, mock_agent, mock_task, execution_trace
):
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = "Invalid JSON response"
mock_create_llm.return_value = mock_llm
evaluator = SemanticQualityEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="This is the output.",
)
assert isinstance(result, EvaluationScore)
assert result.score is None
assert "Failed to parse" in result.feedback


@@ -1,241 +0,0 @@
from unittest.mock import patch, MagicMock
from crewai.experimental.evaluation.metrics.tools_metrics import (
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
)
from crewai.utilities.llm_utils import LLM
from tests.experimental.evaluation.metrics.test_base_evaluation_metrics import (
BaseEvaluationMetricsTest,
)
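# Tests for the three tool-usage evaluators. Each class first covers the no-tool / no-usage
# edge cases, then verifies that a mocked LLM response is parsed into an overall score and feedback.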
class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
def test_no_tools_available(self, mock_task, mock_agent):
# Create agent with no tools
mock_agent.tools = []
execution_trace = {"tool_uses": []}
evaluator = ToolSelectionEvaluator()
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="Final output",
)
assert result.score is None
assert "no tools available" in result.feedback.lower()
def test_tools_available_but_none_used(self, mock_agent, mock_task):
mock_agent.tools = ["tool1", "tool2"]
execution_trace = {"tool_uses": []}
evaluator = ToolSelectionEvaluator()
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="Final output",
)
assert result.score is None
assert "had tools available but didn't use any" in result.feedback.lower()
@patch("crewai.utilities.llm_utils.create_llm")
def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
# Setup mock LLM response
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"overall_score": 8.5,
"feedback": "The agent made good tool selections."
}
"""
mock_create_llm.return_value = mock_llm
# Setup execution trace with tool uses
execution_trace = {
"tool_uses": [
{
"tool": "search_tool",
"input": {"query": "test query"},
"output": "search results",
},
{"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"},
]
}
evaluator = ToolSelectionEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="Final output",
)
assert result.score == 8.5
assert "The agent made good tool selections" in result.feedback
# Verify LLM was called with correct prompt
mock_llm.call.assert_called_once()
prompt = mock_llm.call.call_args[0][0]
assert isinstance(prompt, list)
assert len(prompt) >= 2
assert "system" in prompt[0]["role"]
assert "user" in prompt[1]["role"]
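# ParameterExtractionEvaluator: returns no score when the trace contains no tool usage;
# otherwise the mocked LLM's overall_score and feedback are surfaced.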
class TestParameterExtractionEvaluator(BaseEvaluationMetricsTest):
def test_no_tool_uses(self, mock_agent, mock_task):
execution_trace = {"tool_uses": []}
evaluator = ParameterExtractionEvaluator()
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="Final output",
)
assert result.score is None
assert "no tool usage" in result.feedback.lower()
@patch("crewai.utilities.llm_utils.create_llm")
def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
mock_agent.tools = ["tool1", "tool2"]
# Setup mock LLM response
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"overall_score": 9.0,
"feedback": "The agent extracted parameters correctly."
}
"""
mock_create_llm.return_value = mock_llm
# Setup execution trace with tool uses
execution_trace = {
"tool_uses": [
{
"tool": "search_tool",
"input": {"query": "test query"},
"output": "search results",
"error": None,
},
{
"tool": "calculator",
"input": {"expression": "2+2"},
"output": "4",
"error": None,
},
]
}
evaluator = ParameterExtractionEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="Final output",
)
assert result.score == 9.0
assert "The agent extracted parameters correctly" in result.feedback
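# ToolInvocationEvaluator: same pattern as above, plus a case where one tool call failed
# with an error and the evaluation score reflects it.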
class TestToolInvocationEvaluator(BaseEvaluationMetricsTest):
def test_no_tool_uses(self, mock_agent, mock_task):
execution_trace = {"tool_uses": []}
evaluator = ToolInvocationEvaluator()
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="Final output",
)
assert result.score is None
assert "no tool usage" in result.feedback.lower()
@patch("crewai.utilities.llm_utils.create_llm")
def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
mock_agent.tools = ["tool1", "tool2"]
# Setup mock LLM response
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"overall_score": 8.0,
"feedback": "The agent invoked tools correctly."
}
"""
mock_create_llm.return_value = mock_llm
# Setup execution trace with tool uses
execution_trace = {
"tool_uses": [
{
"tool": "search_tool",
"input": {"query": "test query"},
"output": "search results",
},
{"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"},
]
}
evaluator = ToolInvocationEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="Final output",
)
assert result.score == 8.0
assert "The agent invoked tools correctly" in result.feedback
@patch("crewai.utilities.llm_utils.create_llm")
def test_evaluation_with_errors(self, mock_create_llm, mock_agent, mock_task):
mock_agent.tools = ["tool1", "tool2"]
# Setup mock LLM response
mock_llm = MagicMock(spec=LLM)
mock_llm.call.return_value = """
{
"overall_score": 5.5,
"feedback": "The agent had some errors in tool invocation."
}
"""
mock_create_llm.return_value = mock_llm
# Setup execution trace with tool uses including errors
execution_trace = {
"tool_uses": [
{
"tool": "search_tool",
"input": {"query": "test query"},
"output": "search results",
"error": None,
},
{
"tool": "calculator",
"input": {"expression": "2+"},
"output": None,
"error": "Invalid expression",
},
]
}
evaluator = ToolInvocationEvaluator(llm=mock_llm)
result = evaluator.evaluate(
agent=mock_agent,
task=mock_task,
execution_trace=execution_trace,
final_output="Final output",
)
assert result.score == 5.5
assert "The agent had some errors in tool invocation" in result.feedback


@@ -1,290 +0,0 @@
import pytest
from crewai.agent import Agent
from crewai.task import Task
from crewai.crew import Crew
from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
from crewai.experimental.evaluation import (
GoalAlignmentEvaluator,
SemanticQualityEvaluator,
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
ReasoningEfficiencyEvaluator,
MetricCategory,
EvaluationScore,
)
from crewai.events.types.agent_events import (
AgentEvaluationStartedEvent,
AgentEvaluationCompletedEvent,
AgentEvaluationFailedEvent,
)
from crewai.events.event_bus import crewai_event_bus
from crewai.experimental.evaluation import create_default_evaluator
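# Integration-style tests for AgentEvaluator: evaluating a Crew run recorded with VCR, a lite
# agent via agent.kickoff, the default evaluator set from create_default_evaluator, and the
# started/completed/failed evaluation events emitted on the crewai event bus.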
class TestAgentEvaluator:
@pytest.fixture
def mock_crew(self):
agent = Agent(
role="Test Agent",
goal="Complete test tasks successfully",
backstory="An agent created for testing purposes",
allow_delegation=False,
verbose=False,
)
task = Task(
description="Test task description",
agent=agent,
expected_output="Expected test output",
)
crew = Crew(agents=[agent], tasks=[task])
return crew
def test_set_iteration(self):
agent_evaluator = AgentEvaluator(agents=[])
agent_evaluator.set_iteration(3)
assert agent_evaluator._execution_state.iteration == 3
@pytest.mark.vcr(filter_headers=["authorization"])
def test_evaluate_current_iteration(self, mock_crew):
agent_evaluator = AgentEvaluator(
agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()]
)
mock_crew.kickoff()
results = agent_evaluator.get_evaluation_results()
assert isinstance(results, dict)
(agent,) = mock_crew.agents
(task,) = mock_crew.tasks
assert len(mock_crew.agents) == 1
assert agent.role in results
assert len(results[agent.role]) == 1
(result,) = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
assert result.agent_id == str(agent.id)
assert result.task_id == str(task.id)
(goal_alignment,) = result.metrics.values()
assert goal_alignment.score == 5.0
expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
assert expected_feedback in goal_alignment.feedback
assert goal_alignment.raw_response is not None
assert '"score": 5' in goal_alignment.raw_response
def test_create_default_evaluator(self, mock_crew):
agent_evaluator = create_default_evaluator(agents=mock_crew.agents)
assert isinstance(agent_evaluator, AgentEvaluator)
assert agent_evaluator.agents == mock_crew.agents
expected_types = [
GoalAlignmentEvaluator,
SemanticQualityEvaluator,
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
ReasoningEfficiencyEvaluator,
]
assert len(agent_evaluator.evaluators) == len(expected_types)
for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
assert isinstance(evaluator, expected_type)
@pytest.mark.vcr(filter_headers=["authorization"])
def test_eval_lite_agent(self):
agent = Agent(
role="Test Agent",
goal="Complete test tasks successfully",
backstory="An agent created for testing purposes",
)
with crewai_event_bus.scoped_handlers():
events = {}
@crewai_event_bus.on(AgentEvaluationStartedEvent)
def capture_started(source, event):
events["started"] = event
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
def capture_completed(source, event):
events["completed"] = event
@crewai_event_bus.on(AgentEvaluationFailedEvent)
def capture_failed(source, event):
events["failed"] = event
agent_evaluator = AgentEvaluator(
agents=[agent], evaluators=[GoalAlignmentEvaluator()]
)
agent.kickoff(messages="Complete this task successfully")
assert events.keys() == {"started", "completed"}
assert events["started"].agent_id == str(agent.id)
assert events["started"].agent_role == agent.role
assert events["started"].task_id is None
assert events["started"].iteration == 1
assert events["completed"].agent_id == str(agent.id)
assert events["completed"].agent_role == agent.role
assert events["completed"].task_id is None
assert events["completed"].iteration == 1
assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
assert isinstance(events["completed"].score, EvaluationScore)
assert events["completed"].score.score == 2.0
results = agent_evaluator.get_evaluation_results()
assert isinstance(results, dict)
(result,) = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
assert result.agent_id == str(agent.id)
assert result.task_id == "lite_task"
(goal_alignment,) = result.metrics.values()
assert goal_alignment.score == 2.0
expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
assert expected_feedback in goal_alignment.feedback
assert goal_alignment.raw_response is not None
assert '"score": 2' in goal_alignment.raw_response
@pytest.mark.vcr(filter_headers=["authorization"])
def test_eval_specific_agents_from_crew(self, mock_crew):
agent = Agent(
role="Test Agent Eval",
goal="Complete test tasks successfully",
backstory="An agent created for testing purposes",
)
task = Task(
description="Test task description",
agent=agent,
expected_output="Expected test output",
)
mock_crew.agents.append(agent)
mock_crew.tasks.append(task)
with crewai_event_bus.scoped_handlers():
events = {}
@crewai_event_bus.on(AgentEvaluationStartedEvent)
def capture_started(source, event):
events["started"] = event
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
def capture_completed(source, event):
events["completed"] = event
@crewai_event_bus.on(AgentEvaluationFailedEvent)
def capture_failed(source, event):
events["failed"] = event
agent_evaluator = AgentEvaluator(
agents=[agent], evaluators=[GoalAlignmentEvaluator()]
)
mock_crew.kickoff()
assert events.keys() == {"started", "completed"}
assert events["started"].agent_id == str(agent.id)
assert events["started"].agent_role == agent.role
assert events["started"].task_id == str(task.id)
assert events["started"].iteration == 1
assert events["completed"].agent_id == str(agent.id)
assert events["completed"].agent_role == agent.role
assert events["completed"].task_id == str(task.id)
assert events["completed"].iteration == 1
assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
assert isinstance(events["completed"].score, EvaluationScore)
assert events["completed"].score.score == 5.0
results = agent_evaluator.get_evaluation_results()
assert isinstance(results, dict)
assert len(results.keys()) == 1
(result,) = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
assert result.agent_id == str(agent.id)
assert result.task_id == str(task.id)
(goal_alignment,) = result.metrics.values()
assert goal_alignment.score == 5.0
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
assert expected_feedback in goal_alignment.feedback
assert goal_alignment.raw_response is not None
assert '"score": 5' in goal_alignment.raw_response
@pytest.mark.vcr(filter_headers=["authorization"])
def test_failed_evaluation(self, mock_crew):
(agent,) = mock_crew.agents
(task,) = mock_crew.tasks
with crewai_event_bus.scoped_handlers():
events = {}
@crewai_event_bus.on(AgentEvaluationStartedEvent)
def capture_started(source, event):
events["started"] = event
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
def capture_completed(source, event):
events["completed"] = event
@crewai_event_bus.on(AgentEvaluationFailedEvent)
def capture_failed(source, event):
events["failed"] = event
# Create a mock evaluator that will raise an exception
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator
from crewai.experimental.evaluation import MetricCategory
class FailingEvaluator(BaseEvaluator):
metric_category = MetricCategory.GOAL_ALIGNMENT
def evaluate(self, agent, task, execution_trace, final_output):
raise ValueError("Forced evaluation failure")
agent_evaluator = AgentEvaluator(
agents=[agent], evaluators=[FailingEvaluator()]
)
mock_crew.kickoff()
assert events.keys() == {"started", "failed"}
assert events["started"].agent_id == str(agent.id)
assert events["started"].agent_role == agent.role
assert events["started"].task_id == str(task.id)
assert events["started"].iteration == 1
assert events["failed"].agent_id == str(agent.id)
assert events["failed"].agent_role == agent.role
assert events["failed"].task_id == str(task.id)
assert events["failed"].iteration == 1
assert events["failed"].error == "Forced evaluation failure"
results = agent_evaluator.get_evaluation_results()
(result,) = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
assert result.agent_id == str(agent.id)
assert result.task_id == str(task.id)
assert result.metrics == {}


@@ -1,111 +0,0 @@
import pytest
from unittest.mock import MagicMock, patch
from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
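# TestExperimentResult: compare_with_baseline reads a (mocked) baseline JSON file and buckets
# cases into improved, regressed, unchanged, new and missing tests.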
class TestExperimentResult:
@pytest.fixture
def mock_results(self):
return [
ExperimentResult(
identifier="test-1",
inputs={"query": "What is the capital of France?"},
score=10,
expected_score=7,
passed=True
),
ExperimentResult(
identifier="test-2",
inputs={"query": "Who wrote Hamlet?"},
score={"relevance": 9, "factuality": 8},
expected_score={"relevance": 7, "factuality": 7},
passed=True,
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
),
ExperimentResult(
identifier="test-3",
inputs={"query": "Any query"},
score={"relevance": 9, "factuality": 8},
expected_score={"relevance": 7, "factuality": 7},
passed=False,
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
),
ExperimentResult(
identifier="test-4",
inputs={"query": "Another query"},
score={"relevance": 9, "factuality": 8},
expected_score={"relevance": 7, "factuality": 7},
passed=True,
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
),
ExperimentResult(
identifier="test-6",
inputs={"query": "Yet another query"},
score={"relevance": 9, "factuality": 8},
expected_score={"relevance": 7, "factuality": 7},
passed=True,
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
)
]
@patch('os.path.exists', return_value=True)
@patch('os.path.getsize', return_value=1)
@patch('json.load')
@patch('builtins.open', new_callable=MagicMock)
def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results):
baseline_data = {
"timestamp": "2023-01-01T00:00:00+00:00",
"results": [
{
"identifier": "test-1",
"inputs": {"query": "What is the capital of France?"},
"score": 7,
"expected_score": 7,
"passed": False
},
{
"identifier": "test-2",
"inputs": {"query": "Who wrote Hamlet?"},
"score": {"relevance": 8, "factuality": 7},
"expected_score": {"relevance": 7, "factuality": 7},
"passed": True
},
{
"identifier": "test-3",
"inputs": {"query": "Any query"},
"score": {"relevance": 8, "factuality": 7},
"expected_score": {"relevance": 7, "factuality": 7},
"passed": True
},
{
"identifier": "test-4",
"inputs": {"query": "Another query"},
"score": {"relevance": 8, "factuality": 7},
"expected_score": {"relevance": 7, "factuality": 7},
"passed": True
},
{
"identifier": "test-5",
"inputs": {"query": "Another query"},
"score": {"relevance": 8, "factuality": 7},
"expected_score": {"relevance": 7, "factuality": 7},
"passed": True
}
]
}
mock_json_load.return_value = baseline_data
results = ExperimentResults(results=mock_results)
results.display = MagicMock()
comparison = results.compare_with_baseline(baseline_filepath="baseline.json")
assert "baseline_timestamp" in comparison
assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00"
assert comparison["improved"] == ["test-1"]
assert comparison["regressed"] == ["test-3"]
assert comparison["unchanged"] == ["test-2", "test-4"]
assert comparison["new_tests"] == ["test-6"]
assert comparison["missing_tests"] == ["test-5"]


@@ -1,197 +0,0 @@
import pytest
from unittest.mock import MagicMock, patch
from crewai.crew import Crew
from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
from crewai.experimental.evaluation.experiment.result import ExperimentResults
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore
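# TestExperimentRunner: runs a dataset of cases against a mocked crew and evaluator; pass/fail
# comes from comparing aggregated metric scores with each case's expected_score, and a case
# without an identifier gets a generated one.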
class TestExperimentRunner:
@pytest.fixture
def mock_crew(self):
return MagicMock(spec=Crew)
@pytest.fixture
def mock_evaluator_results(self):
agent_evaluation = AgentAggregatedEvaluationResult(
agent_id="Test Agent",
agent_role="Test Agent Role",
metrics={
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
score=9,
feedback="Test feedback for goal alignment",
raw_response="Test raw response for goal alignment"
),
MetricCategory.REASONING_EFFICIENCY: EvaluationScore(
score=None,
feedback="Reasoning efficiency not applicable",
raw_response="Reasoning efficiency not applicable"
),
MetricCategory.PARAMETER_EXTRACTION: EvaluationScore(
score=7,
feedback="Test parameter extraction explanation",
raw_response="Test raw output"
),
MetricCategory.TOOL_SELECTION: EvaluationScore(
score=8,
feedback="Test tool selection explanation",
raw_response="Test raw output"
)
}
)
return {"Test Agent": agent_evaluation}
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
dataset = [
{
"identifier": "test-case-1",
"inputs": {"query": "Test query 1"},
"expected_score": 8
},
{
"identifier": "test-case-2",
"inputs": {"query": "Test query 2"},
"expected_score": {"goal_alignment": 7}
},
{
"inputs": {"query": "Test query 3"},
"expected_score": {"tool_selection": 9}
}
]
mock_evaluator = MagicMock()
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
mock_evaluator.reset_iterations_results = MagicMock()
mock_create_evaluator.return_value = mock_evaluator
runner = ExperimentRunner(dataset=dataset)
results = runner.run(crew=mock_crew)
assert isinstance(results, ExperimentResults)
result_1, result_2, result_3 = results.results
assert len(results.results) == 3
assert result_1.identifier == "test-case-1"
assert result_1.inputs == {"query": "Test query 1"}
assert result_1.expected_score == 8
assert result_1.passed is True
assert result_2.identifier == "test-case-2"
assert result_2.inputs == {"query": "Test query 2"}
assert isinstance(result_2.expected_score, dict)
assert "goal_alignment" in result_2.expected_score
assert result_2.passed is True
assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5"
assert result_3.inputs == {"query": "Test query 3"}
assert isinstance(result_3.expected_score, dict)
assert "tool_selection" in result_3.expected_score
assert result_3.passed is False
assert mock_crew.kickoff.call_count == 3
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"})
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"})
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"})
assert mock_evaluator.reset_iterations_results.call_count == 3
assert mock_evaluator.get_agent_evaluation.call_count == 3
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
dataset = [
{
"identifier": "test-case-2",
"inputs": {"query": "Test query 2"},
"expected_score": {"goal_alignment": 7, "unknown_metric": 8}
}
]
mock_evaluator = MagicMock()
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
mock_evaluator.reset_iterations_results = MagicMock()
mock_create_evaluator.return_value = mock_evaluator
runner = ExperimentRunner(dataset=dataset)
results = runner.run(crew=mock_crew)
result, = results.results
assert result.identifier == "test-case-2"
assert result.inputs == {"query": "Test query 2"}
assert isinstance(result.expected_score, dict)
assert "goal_alignment" in result.expected_score.keys()
assert "unknown_metric" in result.expected_score.keys()
assert result.passed is True
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
dataset = [
{
"identifier": "test-case-2",
"inputs": {"query": "Test query 2"},
"expected_score": {"goal_alignment": 7}
}
]
mock_evaluator = MagicMock()
mock_evaluator_results["Test Agent"].metrics = {
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
score=9,
feedback="Test feedback for goal alignment",
raw_response="Test raw response for goal alignment"
)
}
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
mock_evaluator.reset_iterations_results = MagicMock()
mock_create_evaluator.return_value = mock_evaluator
runner = ExperimentRunner(dataset=dataset)
results = runner.run(crew=mock_crew)
result, = results.results
assert result.identifier == "test-case-2"
assert result.inputs == {"query": "Test query 2"}
assert isinstance(result.expected_score, dict)
assert "goal_alignment" in result.expected_score.keys()
assert result.passed is True
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
dataset = [
{
"identifier": "test-case-2",
"inputs": {"query": "Test query 2"},
"expected_score": {"unknown_metric": 7}
}
]
mock_evaluator = MagicMock()
mock_evaluator_results["Test Agent"].metrics = {
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
score=5,
feedback="Test feedback for goal alignment",
raw_response="Test raw response for goal alignment"
)
}
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
mock_evaluator.reset_iterations_results = MagicMock()
mock_create_evaluator.return_value = mock_evaluator
runner = ExperimentRunner(dataset=dataset)
results = runner.run(crew=mock_crew)
result, = results.results
assert result.identifier == "test-case-2"
assert result.inputs == {"query": "Test query 2"}
assert isinstance(result.expected_score, dict)
assert "unknown_metric" in result.expected_score.keys()
assert result.passed is False