From 1b6b2b36d973f5466d75adc8303e9031c4ac7cf9 Mon Sep 17 00:00:00 2001
From: Lucas Gomide
Date: Mon, 14 Jul 2025 10:06:45 -0300
Subject: [PATCH] Introduce Evaluator Experiment (#3133)

* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool Metrics for Agent evaluation
* feat: add Reasoning Metrics for Agent evaluation, still in progress
* feat: add AgentEvaluator class
  This class will evaluate Agent's results and report them to the user
* fix: do not evaluate Agent by default
  This is an experimental feature; we still need to refine it further
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
* feat: add Experiment evaluation framework with baseline comparison
* fix: reset evaluator for each experiment iteration
* fix: fix tracking of new test cases
* chore: split Experimental evaluation classes
* refactor: remove unused method
* refactor: isolate Console print in a dedicated class
* fix: make crew required to run an experiment
* fix: use a timezone-aware timestamp to define experiment result
* test: add tests for Evaluator Experiment
* style: fix linter issues
* fix: encode string before hashing
* style: resolve linter issues
* feat: add experimental folder for beta features (#3141)
* test: move tests to experimental folder
---
 src/crewai/crew.py                            |   2 +-
 src/crewai/experimental/__init__.py           |  40 ++
 .../experimental/evaluation/__init__.py       |  51 +++
 .../evaluation/agent_evaluator.py             | 182 ++++
 .../experimental/evaluation/base_evaluator.py | 125 ++++++
 .../evaluation/evaluation_display.py          | 341 +++++++++++++++
 .../evaluation/evaluation_listener.py         | 190 +++++++++
 .../evaluation/experiment/__init__.py         |   8 +
 .../evaluation/experiment/result.py           | 122 ++++++
 .../evaluation/experiment/result_display.py   |  70 +++
 .../evaluation/experiment/runner.py           | 117 +++++
 .../experimental/evaluation/json_parser.py    |  30 ++
 .../evaluation/metrics/__init__.py            |  26 ++
 .../evaluation/metrics/goal_metrics.py        |  66 +++
 .../evaluation/metrics/reasoning_metrics.py   | 355 ++++++++++++++++
 .../metrics/semantic_quality_metrics.py       |  65 +++
 .../evaluation/metrics/tools_metrics.py       | 400 ++++++++++++++++++
 .../{ => experimental}/evaluation/__init__.py |   0
 .../evaluation/metrics/__init__.py            |   0
 .../metrics/base_evaluation_metrics_test.py   |   0
 .../evaluation/metrics/test_goal_metrics.py   |   6 +-
 .../metrics/test_reasoning_metrics.py         |   6 +-
 .../metrics/test_semantic_quality_metrics.py  |   6 +-
 .../evaluation/metrics/test_tools_metrics.py  |   4 +-
 .../evaluation/test_agent_evaluator.py        |   8 +-
 .../evaluation/test_experiment_result.py      | 111 +++++
 .../evaluation/test_experiment_runner.py      | 197 +++++++++
 27 files changed, 2512 insertions(+), 16 deletions(-)
 create mode 100644 src/crewai/experimental/__init__.py
 create mode 100644 src/crewai/experimental/evaluation/__init__.py
 create mode 100644 src/crewai/experimental/evaluation/agent_evaluator.py
 create mode 100644 src/crewai/experimental/evaluation/base_evaluator.py
 create mode 100644 src/crewai/experimental/evaluation/evaluation_display.py
 create mode 100644 src/crewai/experimental/evaluation/evaluation_listener.py
 create mode 100644 src/crewai/experimental/evaluation/experiment/__init__.py
 create mode 100644 src/crewai/experimental/evaluation/experiment/result.py
 create mode 100644 src/crewai/experimental/evaluation/experiment/result_display.py
 create mode 100644
src/crewai/experimental/evaluation/experiment/runner.py create mode 100644 src/crewai/experimental/evaluation/json_parser.py create mode 100644 src/crewai/experimental/evaluation/metrics/__init__.py create mode 100644 src/crewai/experimental/evaluation/metrics/goal_metrics.py create mode 100644 src/crewai/experimental/evaluation/metrics/reasoning_metrics.py create mode 100644 src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py create mode 100644 src/crewai/experimental/evaluation/metrics/tools_metrics.py rename tests/{ => experimental}/evaluation/__init__.py (100%) rename tests/{ => experimental}/evaluation/metrics/__init__.py (100%) rename tests/{ => experimental}/evaluation/metrics/base_evaluation_metrics_test.py (100%) rename tests/{ => experimental}/evaluation/metrics/test_goal_metrics.py (88%) rename tests/{ => experimental}/evaluation/metrics/test_reasoning_metrics.py (96%) rename tests/{ => experimental}/evaluation/metrics/test_semantic_quality_metrics.py (90%) rename tests/{ => experimental}/evaluation/metrics/test_tools_metrics.py (97%) rename tests/{ => experimental}/evaluation/test_agent_evaluator.py (91%) create mode 100644 tests/experimental/evaluation/test_experiment_result.py create mode 100644 tests/experimental/evaluation/test_experiment_runner.py diff --git a/src/crewai/crew.py b/src/crewai/crew.py index 89acee505..1f02b9b06 100644 --- a/src/crewai/crew.py +++ b/src/crewai/crew.py @@ -1337,7 +1337,7 @@ class Crew(FlowTrackable, BaseModel): evaluator = CrewEvaluator(test_crew, llm_instance) if include_agent_eval: - from crewai.evaluation import create_default_evaluator + from crewai.experimental.evaluation import create_default_evaluator agent_evaluator = create_default_evaluator(crew=test_crew) for i in range(1, n_iterations + 1): diff --git a/src/crewai/experimental/__init__.py b/src/crewai/experimental/__init__.py new file mode 100644 index 000000000..67eb7847f --- /dev/null +++ b/src/crewai/experimental/__init__.py @@ -0,0 +1,40 @@ +from crewai.experimental.evaluation import ( + BaseEvaluator, + EvaluationScore, + MetricCategory, + AgentEvaluationResult, + SemanticQualityEvaluator, + GoalAlignmentEvaluator, + ReasoningEfficiencyEvaluator, + ToolSelectionEvaluator, + ParameterExtractionEvaluator, + ToolInvocationEvaluator, + EvaluationTraceCallback, + create_evaluation_callbacks, + AgentEvaluator, + create_default_evaluator, + ExperimentRunner, + ExperimentResults, + ExperimentResult, +) + + +__all__ = [ + "BaseEvaluator", + "EvaluationScore", + "MetricCategory", + "AgentEvaluationResult", + "SemanticQualityEvaluator", + "GoalAlignmentEvaluator", + "ReasoningEfficiencyEvaluator", + "ToolSelectionEvaluator", + "ParameterExtractionEvaluator", + "ToolInvocationEvaluator", + "EvaluationTraceCallback", + "create_evaluation_callbacks", + "AgentEvaluator", + "create_default_evaluator", + "ExperimentRunner", + "ExperimentResults", + "ExperimentResult" +] \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/__init__.py b/src/crewai/experimental/evaluation/__init__.py new file mode 100644 index 000000000..024c3aaed --- /dev/null +++ b/src/crewai/experimental/evaluation/__init__.py @@ -0,0 +1,51 @@ +from crewai.experimental.evaluation.base_evaluator import ( + BaseEvaluator, + EvaluationScore, + MetricCategory, + AgentEvaluationResult +) + +from crewai.experimental.evaluation.metrics import ( + SemanticQualityEvaluator, + GoalAlignmentEvaluator, + ReasoningEfficiencyEvaluator, + ToolSelectionEvaluator, + ParameterExtractionEvaluator, + 
ToolInvocationEvaluator +) + +from crewai.experimental.evaluation.evaluation_listener import ( + EvaluationTraceCallback, + create_evaluation_callbacks +) + +from crewai.experimental.evaluation.agent_evaluator import ( + AgentEvaluator, + create_default_evaluator +) + +from crewai.experimental.evaluation.experiment import ( + ExperimentRunner, + ExperimentResults, + ExperimentResult +) + +__all__ = [ + "BaseEvaluator", + "EvaluationScore", + "MetricCategory", + "AgentEvaluationResult", + "SemanticQualityEvaluator", + "GoalAlignmentEvaluator", + "ReasoningEfficiencyEvaluator", + "ToolSelectionEvaluator", + "ParameterExtractionEvaluator", + "ToolInvocationEvaluator", + "EvaluationTraceCallback", + "create_evaluation_callbacks", + "AgentEvaluator", + "create_default_evaluator", + "ExperimentRunner", + "ExperimentResults", + "ExperimentResult" +] diff --git a/src/crewai/experimental/evaluation/agent_evaluator.py b/src/crewai/experimental/evaluation/agent_evaluator.py new file mode 100644 index 000000000..3cf9583dc --- /dev/null +++ b/src/crewai/experimental/evaluation/agent_evaluator.py @@ -0,0 +1,182 @@ +from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy +from crewai.agent import Agent +from crewai.task import Task +from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter + +from typing import Any, Dict +from collections import defaultdict +from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks +from collections.abc import Sequence +from crewai.crew import Crew +from crewai.utilities.events.crewai_event_bus import crewai_event_bus +from crewai.utilities.events.utils.console_formatter import ConsoleFormatter +from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult + +class AgentEvaluator: + def __init__( + self, + evaluators: Sequence[BaseEvaluator] | None = None, + crew: Crew | None = None, + ): + self.crew: Crew | None = crew + self.evaluators: Sequence[BaseEvaluator] | None = evaluators + + self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {} + if crew is not None: + assert crew and crew.agents is not None + for agent in crew.agents: + self.agent_evaluators[str(agent.id)] = self.evaluators + + self.callback = create_evaluation_callbacks() + self.console_formatter = ConsoleFormatter() + self.display_formatter = EvaluationDisplayFormatter() + + self.iteration = 1 + self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {} + + def set_iteration(self, iteration: int) -> None: + self.iteration = iteration + + def reset_iterations_results(self): + self.iterations_results = {} + + def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]: + if not self.crew: + raise ValueError("Cannot evaluate: no crew was provided to the evaluator.") + + if not self.callback: + raise ValueError("Cannot evaluate: no callback was set. 
Use set_callback() method first.") + + from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn + evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list) + + total_evals = 0 + for agent in self.crew.agents: + for task in self.crew.tasks: + if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)): + total_evals += 1 + + with Progress( + SpinnerColumn(), + TextColumn("[bold blue]{task.description}[/bold blue]"), + BarColumn(), + TextColumn("{task.percentage:.0f}% completed"), + console=self.console_formatter.console + ) as progress: + eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=total_evals) + + for agent in self.crew.agents: + evaluator = self.agent_evaluators.get(str(agent.id)) + if not evaluator: + continue + + for task in self.crew.tasks: + + if task.agent and str(task.agent.id) != str(agent.id): + continue + + trace = self.callback.get_trace(str(agent.id), str(task.id)) + if not trace: + self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]") + progress.update(eval_task, advance=1) + continue + + with crewai_event_bus.scoped_handlers(): + result = self.evaluate( + agent=agent, + task=task, + execution_trace=trace, + final_output=task.output + ) + evaluation_results[agent.role].append(result) + progress.update(eval_task, advance=1) + + self.iterations_results[self.iteration] = evaluation_results + return evaluation_results + + def get_evaluation_results(self): + if self.iteration in self.iterations_results: + return self.iterations_results[self.iteration] + + return self.evaluate_current_iteration() + + def display_results_with_iterations(self): + self.display_formatter.display_summary_results(self.iterations_results) + + def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> Dict[str, AgentAggregatedEvaluationResult]: + agent_results = {} + with crewai_event_bus.scoped_handlers(): + task_results = self.get_evaluation_results() + for agent_role, results in task_results.items(): + if not results: + continue + + agent_id = results[0].agent_id + + aggregated_result = self.display_formatter._aggregate_agent_results( + agent_id=agent_id, + agent_role=agent_role, + results=results, + strategy=strategy + ) + + agent_results[agent_role] = aggregated_result + + + if self.iteration == max(self.iterations_results.keys()): + self.display_results_with_iterations() + + if include_evaluation_feedback: + self.display_evaluation_with_feedback() + + return agent_results + + def display_evaluation_with_feedback(self): + self.display_formatter.display_evaluation_with_feedback(self.iterations_results) + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: Any + ) -> AgentEvaluationResult: + result = AgentEvaluationResult( + agent_id=str(agent.id), + task_id=str(task.id) + ) + assert self.evaluators is not None + for evaluator in self.evaluators: + try: + score = evaluator.evaluate( + agent=agent, + task=task, + execution_trace=execution_trace, + final_output=final_output + ) + result.metrics[evaluator.metric_category] = score + except Exception as e: + self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}") + + return result + +def create_default_evaluator(crew, llm=None): + from crewai.experimental.evaluation import ( + 
GoalAlignmentEvaluator, + SemanticQualityEvaluator, + ToolSelectionEvaluator, + ParameterExtractionEvaluator, + ToolInvocationEvaluator, + ReasoningEfficiencyEvaluator + ) + + evaluators = [ + GoalAlignmentEvaluator(llm=llm), + SemanticQualityEvaluator(llm=llm), + ToolSelectionEvaluator(llm=llm), + ParameterExtractionEvaluator(llm=llm), + ToolInvocationEvaluator(llm=llm), + ReasoningEfficiencyEvaluator(llm=llm), + ] + + return AgentEvaluator(evaluators=evaluators, crew=crew) diff --git a/src/crewai/experimental/evaluation/base_evaluator.py b/src/crewai/experimental/evaluation/base_evaluator.py new file mode 100644 index 000000000..b11c61973 --- /dev/null +++ b/src/crewai/experimental/evaluation/base_evaluator.py @@ -0,0 +1,125 @@ +import abc +import enum +from enum import Enum +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + +from crewai.agent import Agent +from crewai.task import Task +from crewai.llm import BaseLLM +from crewai.utilities.llm_utils import create_llm + +class MetricCategory(enum.Enum): + GOAL_ALIGNMENT = "goal_alignment" + SEMANTIC_QUALITY = "semantic_quality" + REASONING_EFFICIENCY = "reasoning_efficiency" + TOOL_SELECTION = "tool_selection" + PARAMETER_EXTRACTION = "parameter_extraction" + TOOL_INVOCATION = "tool_invocation" + + def title(self): + return self.value.replace('_', ' ').title() + + +class EvaluationScore(BaseModel): + score: float | None = Field( + default=5.0, + description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable", + ge=0.0, + le=10.0 + ) + feedback: str = Field( + default="", + description="Detailed feedback explaining the evaluation score" + ) + raw_response: str | None = Field( + default=None, + description="Raw response from the evaluator (e.g., LLM)" + ) + + def __str__(self) -> str: + if self.score is None: + return f"Score: N/A - {self.feedback}" + return f"Score: {self.score:.1f}/10 - {self.feedback}" + + +class BaseEvaluator(abc.ABC): + def __init__(self, llm: BaseLLM | None = None): + self.llm: BaseLLM | None = create_llm(llm) + + @property + @abc.abstractmethod + def metric_category(self) -> MetricCategory: + pass + + @abc.abstractmethod + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: Any, + ) -> EvaluationScore: + pass + + +class AgentEvaluationResult(BaseModel): + agent_id: str = Field(description="ID of the evaluated agent") + task_id: str = Field(description="ID of the task that was executed") + metrics: Dict[MetricCategory, EvaluationScore] = Field( + default_factory=dict, + description="Evaluation scores for each metric category" + ) + + +class AggregationStrategy(Enum): + SIMPLE_AVERAGE = "simple_average" # Equal weight to all tasks + WEIGHTED_BY_COMPLEXITY = "weighted_by_complexity" # Weight by task complexity + BEST_PERFORMANCE = "best_performance" # Use best scores across tasks + WORST_PERFORMANCE = "worst_performance" # Use worst scores across tasks + + +class AgentAggregatedEvaluationResult(BaseModel): + agent_id: str = Field( + default="", + description="ID of the agent" + ) + agent_role: str = Field( + default="", + description="Role of the agent" + ) + task_count: int = Field( + default=0, + description="Number of tasks included in this aggregation" + ) + aggregation_strategy: AggregationStrategy = Field( + default=AggregationStrategy.SIMPLE_AVERAGE, + description="Strategy used for aggregation" + ) + metrics: Dict[MetricCategory, EvaluationScore] = Field( + default_factory=dict, + 
description="Aggregated metrics across all tasks" + ) + task_results: List[str] = Field( + default_factory=list, + description="IDs of tasks included in this aggregation" + ) + overall_score: Optional[float] = Field( + default=None, + description="Overall score for this agent" + ) + + def __str__(self) -> str: + result = f"Agent Evaluation: {self.agent_role}\n" + result += f"Strategy: {self.aggregation_strategy.value}\n" + result += f"Tasks evaluated: {self.task_count}\n" + + for category, score in self.metrics.items(): + result += f"\n\n- {category.value.upper()}: {score.score}/10\n" + + if score.feedback: + detailed_feedback = "\n ".join(score.feedback.split('\n')) + result += f" {detailed_feedback}\n" + + return result \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/evaluation_display.py b/src/crewai/experimental/evaluation/evaluation_display.py new file mode 100644 index 000000000..c25ea6db4 --- /dev/null +++ b/src/crewai/experimental/evaluation/evaluation_display.py @@ -0,0 +1,341 @@ +from collections import defaultdict +from typing import Dict, Any, List +from rich.table import Table +from rich.box import HEAVY_EDGE, ROUNDED +from collections.abc import Sequence +from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory +from crewai.experimental.evaluation import EvaluationScore +from crewai.utilities.events.utils.console_formatter import ConsoleFormatter +from crewai.utilities.llm_utils import create_llm + +class EvaluationDisplayFormatter: + def __init__(self): + self.console_formatter = ConsoleFormatter() + + def display_evaluation_with_feedback(self, iterations_results: Dict[int, Dict[str, List[Any]]]): + if not iterations_results: + self.console_formatter.print("[yellow]No evaluation results to display[/yellow]") + return + + # Get all agent roles across all iterations + all_agent_roles: set[str] = set() + for iter_results in iterations_results.values(): + all_agent_roles.update(iter_results.keys()) + + for agent_role in sorted(all_agent_roles): + self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]") + + # Process each iteration + for iter_num, results in sorted(iterations_results.items()): + if agent_role not in results or not results[agent_role]: + continue + + agent_results = results[agent_role] + agent_id = agent_results[0].agent_id + + # Aggregate results for this agent in this iteration + aggregated_result = self._aggregate_agent_results( + agent_id=agent_id, + agent_role=agent_role, + results=agent_results, + ) + + # Display iteration header + self.console_formatter.print(f"\n[bold]Iteration {iter_num}[/bold]") + + # Create table for this iteration + table = Table(box=ROUNDED) + table.add_column("Metric", style="cyan") + table.add_column("Score (1-10)", justify="center") + table.add_column("Feedback", style="green") + + # Add metrics to table + if aggregated_result.metrics: + for metric, evaluation_score in aggregated_result.metrics.items(): + score = evaluation_score.score + + if isinstance(score, (int, float)): + if score >= 8.0: + score_text = f"[green]{score:.1f}[/green]" + elif score >= 6.0: + score_text = f"[cyan]{score:.1f}[/cyan]" + elif score >= 4.0: + score_text = f"[yellow]{score:.1f}[/yellow]" + else: + score_text = f"[red]{score:.1f}[/red]" + else: + score_text = "[dim]N/A[/dim]" + + table.add_section() + table.add_row( + metric.title(), + score_text, + evaluation_score.feedback or "" + ) + + if 
aggregated_result.overall_score is not None: + overall_score = aggregated_result.overall_score + if overall_score >= 8.0: + overall_color = "green" + elif overall_score >= 6.0: + overall_color = "cyan" + elif overall_score >= 4.0: + overall_color = "yellow" + else: + overall_color = "red" + + table.add_section() + table.add_row( + "Overall Score", + f"[{overall_color}]{overall_score:.1f}[/]", + "Overall agent evaluation score" + ) + + # Print the table for this iteration + self.console_formatter.print(table) + + def display_summary_results(self, iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]]): + if not iterations_results: + self.console_formatter.print("[yellow]No evaluation results to display[/yellow]") + return + + self.console_formatter.print("\n") + + table = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE) + + table.add_column("Agent/Metric", style="cyan") + + for iter_num in sorted(iterations_results.keys()): + run_label = f"Run {iter_num}" + table.add_column(run_label, justify="center") + + table.add_column("Avg. Total", justify="center") + + all_agent_roles: set[str] = set() + for results in iterations_results.values(): + all_agent_roles.update(results.keys()) + + for agent_role in sorted(all_agent_roles): + agent_scores_by_iteration = {} + agent_metrics_by_iteration = {} + + for iter_num, results in sorted(iterations_results.items()): + if agent_role not in results or not results[agent_role]: + continue + + agent_results = results[agent_role] + agent_id = agent_results[0].agent_id + + aggregated_result = self._aggregate_agent_results( + agent_id=agent_id, + agent_role=agent_role, + results=agent_results, + strategy=AggregationStrategy.SIMPLE_AVERAGE + ) + + valid_scores = [score.score for score in aggregated_result.metrics.values() + if score.score is not None] + if valid_scores: + avg_score = sum(valid_scores) / len(valid_scores) + agent_scores_by_iteration[iter_num] = avg_score + + agent_metrics_by_iteration[iter_num] = aggregated_result.metrics + + if not agent_scores_by_iteration: + continue + + avg_across_iterations = sum(agent_scores_by_iteration.values()) / len(agent_scores_by_iteration) + + row = [f"[bold]{agent_role}[/bold]"] + + for iter_num in sorted(iterations_results.keys()): + if iter_num in agent_scores_by_iteration: + score = agent_scores_by_iteration[iter_num] + if score >= 8.0: + color = "green" + elif score >= 6.0: + color = "cyan" + elif score >= 4.0: + color = "yellow" + else: + color = "red" + row.append(f"[bold {color}]{score:.1f}[/]") + else: + row.append("-") + + if avg_across_iterations >= 8.0: + color = "green" + elif avg_across_iterations >= 6.0: + color = "cyan" + elif avg_across_iterations >= 4.0: + color = "yellow" + else: + color = "red" + row.append(f"[bold {color}]{avg_across_iterations:.1f}[/]") + + table.add_row(*row) + + all_metrics: set[Any] = set() + for metrics in agent_metrics_by_iteration.values(): + all_metrics.update(metrics.keys()) + + for metric in sorted(all_metrics, key=lambda x: x.value): + metric_scores = [] + + row = [f" - {metric.title()}"] + + for iter_num in sorted(iterations_results.keys()): + if (iter_num in agent_metrics_by_iteration and + metric in agent_metrics_by_iteration[iter_num]): + metric_score = agent_metrics_by_iteration[iter_num][metric].score + if metric_score is not None: + metric_scores.append(metric_score) + if metric_score >= 8.0: + color = "green" + elif metric_score >= 6.0: + color = "cyan" + elif metric_score >= 4.0: + color = "yellow" + 
else: + color = "red" + row.append(f"[{color}]{metric_score:.1f}[/]") + else: + row.append("[dim]N/A[/dim]") + else: + row.append("-") + + if metric_scores: + avg = sum(metric_scores) / len(metric_scores) + if avg >= 8.0: + color = "green" + elif avg >= 6.0: + color = "cyan" + elif avg >= 4.0: + color = "yellow" + else: + color = "red" + row.append(f"[{color}]{avg:.1f}[/]") + else: + row.append("-") + + table.add_row(*row) + + table.add_row(*[""] * (len(sorted(iterations_results.keys())) + 2)) + + self.console_formatter.print(table) + self.console_formatter.print("\n") + + def _aggregate_agent_results( + self, + agent_id: str, + agent_role: str, + results: Sequence[AgentEvaluationResult], + strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, + ) -> AgentAggregatedEvaluationResult: + metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(list) + + for result in results: + for metric_name, evaluation_score in result.metrics.items(): + metrics_by_category[metric_name].append(evaluation_score) + + aggregated_metrics: dict[MetricCategory, EvaluationScore] = {} + for category, scores in metrics_by_category.items(): + valid_scores = [s.score for s in scores if s.score is not None] + avg_score = sum(valid_scores) / len(valid_scores) if valid_scores else None + + feedbacks = [s.feedback for s in scores if s.feedback] + + feedback_summary = None + if feedbacks: + if len(feedbacks) > 1: + # Use the summarization method for multiple feedbacks + feedback_summary = self._summarize_feedbacks( + agent_role=agent_role, + metric=category.title(), + feedbacks=feedbacks, + scores=[s.score for s in scores], + strategy=strategy + ) + else: + feedback_summary = feedbacks[0] + + aggregated_metrics[category] = EvaluationScore( + score=avg_score, + feedback=feedback_summary + ) + + overall_score = None + if aggregated_metrics: + valid_scores = [m.score for m in aggregated_metrics.values() if m.score is not None] + if valid_scores: + overall_score = sum(valid_scores) / len(valid_scores) + + return AgentAggregatedEvaluationResult( + agent_id=agent_id, + agent_role=agent_role, + metrics=aggregated_metrics, + overall_score=overall_score, + task_count=len(results), + aggregation_strategy=strategy + ) + + def _summarize_feedbacks( + self, + agent_role: str, + metric: str, + feedbacks: List[str], + scores: List[float | None], + strategy: AggregationStrategy + ) -> str: + if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks): + return "\n\n".join([f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)]) + + try: + llm = create_llm() + + formatted_feedbacks = [] + for i, (feedback, score) in enumerate(zip(feedbacks, scores)): + if len(feedback) > 500: + feedback = feedback[:500] + "..." + score_text = f"{score:.1f}" if score is not None else "N/A" + formatted_feedbacks.append(f"Feedback #{i+1} (Score: {score_text}):\n{feedback}") + + all_feedbacks = "\n\n" + "\n\n---\n\n".join(formatted_feedbacks) + + strategy_guidance = "" + if strategy == AggregationStrategy.BEST_PERFORMANCE: + strategy_guidance = "Focus on the highest-scoring aspects and strengths demonstrated." + elif strategy == AggregationStrategy.WORST_PERFORMANCE: + strategy_guidance = "Focus on areas that need improvement and common issues across tasks." + else: # Default/average strategies + strategy_guidance = "Provide a balanced analysis of strengths and weaknesses across all tasks." 
+ + prompt = [ + {"role": "system", "content": f"""You are an expert evaluator creating a comprehensive summary of agent performance feedback. + Your job is to synthesize multiple feedback points about the same metric across different tasks. + + Create a concise, insightful summary that captures the key patterns and themes from all feedback. + {strategy_guidance} + + Your summary should be: + 1. Specific and concrete (not vague or general) + 2. Focused on actionable insights + 3. Highlighting patterns across tasks + 4. 150-250 words in length + + The summary should be directly usable as final feedback for the agent's performance on this metric."""}, + {"role": "user", "content": f"""I need a synthesized summary of the following feedback for: + + Agent Role: {agent_role} + Metric: {metric.title()} + + {all_feedbacks} + """} + ] + assert llm is not None + response = llm.call(prompt) + + return response + + except Exception: + return "Synthesized from multiple tasks: " + "\n\n".join([f"- {fb[:500]}..." for fb in feedbacks]) diff --git a/src/crewai/experimental/evaluation/evaluation_listener.py b/src/crewai/experimental/evaluation/evaluation_listener.py new file mode 100644 index 000000000..8fba03e14 --- /dev/null +++ b/src/crewai/experimental/evaluation/evaluation_listener.py @@ -0,0 +1,190 @@ +from datetime import datetime +from typing import Any, Dict, Optional + +from collections.abc import Sequence + +from crewai.agent import Agent +from crewai.task import Task +from crewai.utilities.events.base_event_listener import BaseEventListener +from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus +from crewai.utilities.events.agent_events import ( + AgentExecutionStartedEvent, + AgentExecutionCompletedEvent +) +from crewai.utilities.events.tool_usage_events import ( + ToolUsageFinishedEvent, + ToolUsageErrorEvent, + ToolExecutionErrorEvent, + ToolSelectionErrorEvent, + ToolValidateInputErrorEvent +) +from crewai.utilities.events.llm_events import ( + LLMCallStartedEvent, + LLMCallCompletedEvent +) + +class EvaluationTraceCallback(BaseEventListener): + """Event listener for collecting execution traces for evaluation. + + This listener attaches to the event bus to collect detailed information + about the execution process, including agent steps, tool uses, knowledge + retrievals, and final output - all for use in agent evaluation. 
+ """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance._initialized = False + return cls._instance + + def __init__(self): + if not hasattr(self, "_initialized") or not self._initialized: + super().__init__() + self.traces = {} + self.current_agent_id = None + self.current_task_id = None + self._initialized = True + + def setup_listeners(self, event_bus: CrewAIEventsBus): + @event_bus.on(AgentExecutionStartedEvent) + def on_agent_started(source, event: AgentExecutionStartedEvent): + self.on_agent_start(event.agent, event.task) + + @event_bus.on(AgentExecutionCompletedEvent) + def on_agent_completed(source, event: AgentExecutionCompletedEvent): + self.on_agent_finish(event.agent, event.task, event.output) + + @event_bus.on(ToolUsageFinishedEvent) + def on_tool_completed(source, event: ToolUsageFinishedEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.output, success=True) + + @event_bus.on(ToolUsageErrorEvent) + def on_tool_usage_error(source, event: ToolUsageErrorEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.error, + success=False, error_type="usage_error") + + @event_bus.on(ToolExecutionErrorEvent) + def on_tool_execution_error(source, event: ToolExecutionErrorEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.error, + success=False, error_type="execution_error") + + @event_bus.on(ToolSelectionErrorEvent) + def on_tool_selection_error(source, event: ToolSelectionErrorEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.error, + success=False, error_type="selection_error") + + @event_bus.on(ToolValidateInputErrorEvent) + def on_tool_validate_input_error(source, event: ToolValidateInputErrorEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.error, + success=False, error_type="validation_error") + + @event_bus.on(LLMCallStartedEvent) + def on_llm_call_started(source, event: LLMCallStartedEvent): + self.on_llm_call_start(event.messages, event.tools) + + @event_bus.on(LLMCallCompletedEvent) + def on_llm_call_completed(source, event: LLMCallCompletedEvent): + self.on_llm_call_end(event.messages, event.response) + + def on_agent_start(self, agent: Agent, task: Task): + self.current_agent_id = agent.id + self.current_task_id = task.id + + trace_key = f"{agent.id}_{task.id}" + self.traces[trace_key] = { + "agent_id": agent.id, + "task_id": task.id, + "tool_uses": [], + "llm_calls": [], + "start_time": datetime.now(), + "final_output": None + } + + def on_agent_finish(self, agent: Agent, task: Task, output: Any): + trace_key = f"{agent.id}_{task.id}" + if trace_key in self.traces: + self.traces[trace_key]["final_output"] = output + self.traces[trace_key]["end_time"] = datetime.now() + + self.current_agent_id = None + self.current_task_id = None + + def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any, + success: bool = True, error_type: str | None = None): + if not self.current_agent_id or not self.current_task_id: + return + + trace_key = f"{self.current_agent_id}_{self.current_task_id}" + if trace_key in self.traces: + tool_use = { + "tool": tool_name, + "args": tool_args, + "result": result, + "success": success, + "timestamp": datetime.now() + } + + # Add error information if applicable + if not success and error_type: + tool_use["error"] = True + tool_use["error_type"] = error_type + + self.traces[trace_key]["tool_uses"].append(tool_use) + + def on_llm_call_start(self, messages: str | Sequence[dict[str, 
Any]] | None, tools: Sequence[dict[str, Any]] | None = None): + if not self.current_agent_id or not self.current_task_id: + return + + trace_key = f"{self.current_agent_id}_{self.current_task_id}" + if trace_key not in self.traces: + return + + self.current_llm_call = { + "messages": messages, + "tools": tools, + "start_time": datetime.now(), + "response": None, + "end_time": None + } + + def on_llm_call_end(self, messages: str | list[dict[str, Any]] | None, response: Any): + if not self.current_agent_id or not self.current_task_id: + return + + trace_key = f"{self.current_agent_id}_{self.current_task_id}" + if trace_key not in self.traces: + return + + total_tokens = 0 + if hasattr(response, "usage") and hasattr(response.usage, "total_tokens"): + total_tokens = response.usage.total_tokens + + current_time = datetime.now() + start_time = None + if hasattr(self, "current_llm_call") and self.current_llm_call: + start_time = self.current_llm_call.get("start_time") + + if not start_time: + start_time = current_time + llm_call = { + "messages": messages, + "response": response, + "start_time": start_time, + "end_time": current_time, + "total_tokens": total_tokens + } + + self.traces[trace_key]["llm_calls"].append(llm_call) + + if hasattr(self, "current_llm_call"): + self.current_llm_call = {} + + def get_trace(self, agent_id: str, task_id: str) -> Optional[Dict[str, Any]]: + trace_key = f"{agent_id}_{task_id}" + return self.traces.get(trace_key) + + +def create_evaluation_callbacks() -> EvaluationTraceCallback: + return EvaluationTraceCallback() \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/experiment/__init__.py b/src/crewai/experimental/evaluation/experiment/__init__.py new file mode 100644 index 000000000..8e4fd8983 --- /dev/null +++ b/src/crewai/experimental/evaluation/experiment/__init__.py @@ -0,0 +1,8 @@ +from crewai.experimental.evaluation.experiment.runner import ExperimentRunner +from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult + +__all__ = [ + "ExperimentRunner", + "ExperimentResults", + "ExperimentResult" +] diff --git a/src/crewai/experimental/evaluation/experiment/result.py b/src/crewai/experimental/evaluation/experiment/result.py new file mode 100644 index 000000000..44b24e95a --- /dev/null +++ b/src/crewai/experimental/evaluation/experiment/result.py @@ -0,0 +1,122 @@ +import json +import os +from datetime import datetime, timezone +from typing import Any +from pydantic import BaseModel + +class ExperimentResult(BaseModel): + identifier: str + inputs: dict[str, Any] + score: int | dict[str, int | float] + expected_score: int | dict[str, int | float] + passed: bool + agent_evaluations: dict[str, Any] | None = None + +class ExperimentResults: + def __init__(self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None): + self.results = results + self.metadata = metadata or {} + self.timestamp = datetime.now(timezone.utc) + + from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay + self.display = ExperimentResultsDisplay() + + def to_json(self, filepath: str | None = None) -> dict[str, Any]: + data = { + "timestamp": self.timestamp.isoformat(), + "metadata": self.metadata, + "results": [r.model_dump(exclude={"agent_evaluations"}) for r in self.results] + } + + if filepath: + with open(filepath, 'w') as f: + json.dump(data, f, indent=2) + self.display.console.print(f"[green]Results saved to {filepath}[/green]") + + return data + + def 
compare_with_baseline(self, baseline_filepath: str, save_current: bool = True, print_summary: bool = False) -> dict[str, Any]: + baseline_runs = [] + + if os.path.exists(baseline_filepath) and os.path.getsize(baseline_filepath) > 0: + try: + with open(baseline_filepath, 'r') as f: + baseline_data = json.load(f) + + if isinstance(baseline_data, dict) and "timestamp" in baseline_data: + baseline_runs = [baseline_data] + elif isinstance(baseline_data, list): + baseline_runs = baseline_data + except (json.JSONDecodeError, FileNotFoundError) as e: + self.display.console.print(f"[yellow]Warning: Could not load baseline file: {str(e)}[/yellow]") + + if not baseline_runs: + if save_current: + current_data = self.to_json() + with open(baseline_filepath, 'w') as f: + json.dump([current_data], f, indent=2) + self.display.console.print(f"[green]Saved current results as new baseline to {baseline_filepath}[/green]") + return {"is_baseline": True, "changes": {}} + + baseline_runs.sort(key=lambda x: x.get("timestamp", ""), reverse=True) + latest_run = baseline_runs[0] + + comparison = self._compare_with_run(latest_run) + + if print_summary: + self.display.comparison_summary(comparison, latest_run["timestamp"]) + + if save_current: + current_data = self.to_json() + baseline_runs.append(current_data) + with open(baseline_filepath, 'w') as f: + json.dump(baseline_runs, f, indent=2) + self.display.console.print(f"[green]Added current results to baseline file {baseline_filepath}[/green]") + + return comparison + + def _compare_with_run(self, baseline_run: dict[str, Any]) -> dict[str, Any]: + baseline_results = baseline_run.get("results", []) + + baseline_lookup = {} + for result in baseline_results: + test_identifier = result.get("identifier") + if test_identifier: + baseline_lookup[test_identifier] = result + + improved = [] + regressed = [] + unchanged = [] + new_tests = [] + + for result in self.results: + test_identifier = result.identifier + if not test_identifier or test_identifier not in baseline_lookup: + new_tests.append(test_identifier) + continue + + baseline_result = baseline_lookup[test_identifier] + baseline_passed = baseline_result.get("passed", False) + if result.passed and not baseline_passed: + improved.append(test_identifier) + elif not result.passed and baseline_passed: + regressed.append(test_identifier) + else: + unchanged.append(test_identifier) + + missing_tests = [] + current_test_identifiers = {result.identifier for result in self.results} + for result in baseline_results: + test_identifier = result.get("identifier") + if test_identifier and test_identifier not in current_test_identifiers: + missing_tests.append(test_identifier) + + return { + "improved": improved, + "regressed": regressed, + "unchanged": unchanged, + "new_tests": new_tests, + "missing_tests": missing_tests, + "total_compared": len(improved) + len(regressed) + len(unchanged), + "baseline_timestamp": baseline_run.get("timestamp", "unknown") + } diff --git a/src/crewai/experimental/evaluation/experiment/result_display.py b/src/crewai/experimental/evaluation/experiment/result_display.py new file mode 100644 index 000000000..79bab2fc8 --- /dev/null +++ b/src/crewai/experimental/evaluation/experiment/result_display.py @@ -0,0 +1,70 @@ +from typing import Dict, Any +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from crewai.experimental.evaluation.experiment.result import ExperimentResults + +class ExperimentResultsDisplay: + def __init__(self): + self.console = Console() + + 
def summary(self, experiment_results: ExperimentResults): + total = len(experiment_results.results) + passed = sum(1 for r in experiment_results.results if r.passed) + + table = Table(title="Experiment Summary") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="green") + + table.add_row("Total Test Cases", str(total)) + table.add_row("Passed", str(passed)) + table.add_row("Failed", str(total - passed)) + table.add_row("Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A") + + self.console.print(table) + + def comparison_summary(self, comparison: Dict[str, Any], baseline_timestamp: str): + self.console.print(Panel(f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]", + expand=False)) + + table = Table(title="Results Comparison") + table.add_column("Metric", style="cyan") + table.add_column("Count", style="white") + table.add_column("Details", style="dim") + + improved = comparison.get("improved", []) + if improved: + details = ", ".join([f"{test_identifier}" for test_identifier in improved[:3]]) + if len(improved) > 3: + details += f" and {len(improved) - 3} more" + table.add_row("✅ Improved", str(len(improved)), details) + else: + table.add_row("✅ Improved", "0", "") + + regressed = comparison.get("regressed", []) + if regressed: + details = ", ".join([f"{test_identifier}" for test_identifier in regressed[:3]]) + if len(regressed) > 3: + details += f" and {len(regressed) - 3} more" + table.add_row("❌ Regressed", str(len(regressed)), details, style="red") + else: + table.add_row("❌ Regressed", "0", "") + + unchanged = comparison.get("unchanged", []) + table.add_row("⏺ Unchanged", str(len(unchanged)), "") + + new_tests = comparison.get("new_tests", []) + if new_tests: + details = ", ".join(new_tests[:3]) + if len(new_tests) > 3: + details += f" and {len(new_tests) - 3} more" + table.add_row("➕ New Tests", str(len(new_tests)), details) + + missing_tests = comparison.get("missing_tests", []) + if missing_tests: + details = ", ".join(missing_tests[:3]) + if len(missing_tests) > 3: + details += f" and {len(missing_tests) - 3} more" + table.add_row("➖ Missing Tests", str(len(missing_tests)), details) + + self.console.print(table) diff --git a/src/crewai/experimental/evaluation/experiment/runner.py b/src/crewai/experimental/evaluation/experiment/runner.py new file mode 100644 index 000000000..b1b653e68 --- /dev/null +++ b/src/crewai/experimental/evaluation/experiment/runner.py @@ -0,0 +1,117 @@ +from collections import defaultdict +from hashlib import md5 +from typing import Any + +from crewai import Crew +from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator +from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay +from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult +from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult + +class ExperimentRunner: + def __init__(self, dataset: list[dict[str, Any]]): + self.dataset = dataset or [] + self.evaluator: AgentEvaluator | None = None + self.display = ExperimentResultsDisplay() + + def run(self, crew: Crew, print_summary: bool = False) -> ExperimentResults: + self.evaluator = create_default_evaluator(crew=crew) + + results = [] + + for test_case in self.dataset: + self.evaluator.reset_iterations_results() + result = self._run_test_case(test_case, crew) + results.append(result) + + experiment_results = ExperimentResults(results) + + if 
print_summary: + self.display.summary(experiment_results) + + return experiment_results + + def _run_test_case(self, test_case: dict[str, Any], crew: Crew) -> ExperimentResult: + inputs = test_case["inputs"] + expected_score = test_case["expected_score"] + identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest() + + try: + self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]") + self.display.console.print("\n") + crew.kickoff(inputs=inputs) + + assert self.evaluator is not None + agent_evaluations = self.evaluator.get_agent_evaluation() + + actual_score = self._extract_scores(agent_evaluations) + + passed = self._assert_scores(expected_score, actual_score) + return ExperimentResult( + identifier=identifier, + inputs=inputs, + score=actual_score, + expected_score=expected_score, + passed=passed, + agent_evaluations=agent_evaluations + ) + + except Exception as e: + self.display.console.print(f"[red]Error running test case: {str(e)}[/red]") + return ExperimentResult( + identifier=identifier, + inputs=inputs, + score=0, + expected_score=expected_score, + passed=False + ) + + def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str, float]: + all_scores: dict[str, list[float]] = defaultdict(list) + for evaluation in agent_evaluations.values(): + for metric_name, score in evaluation.metrics.items(): + if score.score is not None: + all_scores[metric_name.value].append(score.score) + + avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()} + + if len(avg_scores) == 1: + return list(avg_scores.values())[0] + + return avg_scores + + def _assert_scores(self, expected: float | dict[str, float], + actual: float | dict[str, float]) -> bool: + """ + Compare expected and actual scores, and return whether the test case passed. + + The rules for comparison are as follows: + - If both expected and actual scores are single numbers, the actual score must be >= expected. + - If expected is a single number and actual is a dict, compare against the average of actual values. + - If expected is a dict and actual is a single number, actual must be >= all expected values. + - If both are dicts, actual must have matching keys with values >= expected values. 
+ """ + + if isinstance(expected, (int, float)) and isinstance(actual, (int, float)): + return actual >= expected + + if isinstance(expected, dict) and isinstance(actual, (int, float)): + return all(actual >= exp_score for exp_score in expected.values()) + + if isinstance(expected, (int, float)) and isinstance(actual, dict): + if not actual: + return False + avg_score = sum(actual.values()) / len(actual) + return avg_score >= expected + + if isinstance(expected, dict) and isinstance(actual, dict): + if not expected: + return True + matching_keys = set(expected.keys()) & set(actual.keys()) + if not matching_keys: + return False + + # All matching keys must have actual >= expected + return all(actual[key] >= expected[key] for key in matching_keys) + + return False \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/json_parser.py b/src/crewai/experimental/evaluation/json_parser.py new file mode 100644 index 000000000..ce7303cde --- /dev/null +++ b/src/crewai/experimental/evaluation/json_parser.py @@ -0,0 +1,30 @@ +"""Robust JSON parsing utilities for evaluation responses.""" + +import json +import re +from typing import Any + + +def extract_json_from_llm_response(text: str) -> dict[str, Any]: + try: + return json.loads(text) + except json.JSONDecodeError: + pass + + json_patterns = [ + # Standard markdown code blocks with json + r'```json\s*([\s\S]*?)\s*```', + # Code blocks without language specifier + r'```\s*([\s\S]*?)\s*```', + # Inline code with JSON + r'`([{\\[].*[}\]])`', + ] + + for pattern in json_patterns: + matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL) + for match in matches: + try: + return json.loads(match.strip()) + except json.JSONDecodeError: + continue + raise ValueError("No valid JSON found in the response") diff --git a/src/crewai/experimental/evaluation/metrics/__init__.py b/src/crewai/experimental/evaluation/metrics/__init__.py new file mode 100644 index 000000000..1b306587b --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/__init__.py @@ -0,0 +1,26 @@ +from crewai.experimental.evaluation.metrics.reasoning_metrics import ( + ReasoningEfficiencyEvaluator +) + +from crewai.experimental.evaluation.metrics.tools_metrics import ( + ToolSelectionEvaluator, + ParameterExtractionEvaluator, + ToolInvocationEvaluator +) + +from crewai.experimental.evaluation.metrics.goal_metrics import ( + GoalAlignmentEvaluator +) + +from crewai.experimental.evaluation.metrics.semantic_quality_metrics import ( + SemanticQualityEvaluator +) + +__all__ = [ + "ReasoningEfficiencyEvaluator", + "ToolSelectionEvaluator", + "ParameterExtractionEvaluator", + "ToolInvocationEvaluator", + "GoalAlignmentEvaluator", + "SemanticQualityEvaluator" +] \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/metrics/goal_metrics.py b/src/crewai/experimental/evaluation/metrics/goal_metrics.py new file mode 100644 index 000000000..85f0b91aa --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/goal_metrics.py @@ -0,0 +1,66 @@ +from typing import Any, Dict + +from crewai.agent import Agent +from crewai.task import Task + +from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory +from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response + +class GoalAlignmentEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.GOAL_ALIGNMENT + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: 
Dict[str, Any], + final_output: Any, + ) -> EvaluationScore: + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal. + +Score the agent's goal alignment on a scale from 0-10 where: +- 0: Complete misalignment, agent did not understand or attempt the task goal +- 5: Partial alignment, agent attempted the task but missed key requirements +- 10: Perfect alignment, agent fully satisfied all task requirements + +Consider: +1. Did the agent correctly interpret the task goal? +2. Did the final output directly address the requirements? +3. Did the agent focus on relevant aspects of the task? +4. Did the agent provide all requested information or deliverables? + +Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string). +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Agent goal: {agent.goal} +Task description: {task.description} +Expected output: {task.expected_output} + +Agent's final output: +{final_output} + +Evaluate how well the agent's output aligns with the assigned task goal. +"""} + ] + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data: dict[str, Any] = extract_json_from_llm_response(response) + assert evaluation_data is not None + + return EvaluationScore( + score=evaluation_data.get("score", 0), + feedback=evaluation_data.get("feedback", response), + raw_response=response + ) + except Exception: + return EvaluationScore( + score=None, + feedback=f"Failed to parse evaluation. Raw response: {response}", + raw_response=response + ) diff --git a/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py b/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py new file mode 100644 index 000000000..605e5b06c --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py @@ -0,0 +1,355 @@ +"""Agent reasoning efficiency evaluators. + +This module provides evaluator implementations for: +- Reasoning efficiency +- Loop detection +- Thinking-to-action ratio +""" + +import logging +import re +from enum import Enum +from typing import Any, Dict, List, Tuple +import numpy as np +from collections.abc import Sequence + +from crewai.agent import Agent +from crewai.task import Task + +from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory +from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response +from crewai.tasks.task_output import TaskOutput + +class ReasoningPatternType(Enum): + EFFICIENT = "efficient" # Good reasoning flow + LOOP = "loop" # Agent is stuck in a loop + VERBOSE = "verbose" # Agent is unnecessarily verbose + INDECISIVE = "indecisive" # Agent struggles to make decisions + SCATTERED = "scattered" # Agent jumps between topics without focus + + +class ReasoningEfficiencyEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.REASONING_EFFICIENCY + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: TaskOutput, + ) -> EvaluationScore: + llm_calls = execution_trace.get("llm_calls", []) + + if not llm_calls or len(llm_calls) < 2: + return EvaluationScore( + score=None, + feedback="Insufficient LLM calls to evaluate reasoning efficiency." 
+ ) + + total_calls = len(llm_calls) + total_tokens = sum(call.get("total_tokens", 0) for call in llm_calls) + avg_tokens_per_call = total_tokens / total_calls if total_calls > 0 else 0 + time_intervals = [] + has_reliable_timing = True + for i in range(1, len(llm_calls)): + start_time = llm_calls[i-1].get("end_time") + end_time = llm_calls[i].get("start_time") + if start_time and end_time and start_time != end_time: + try: + interval = end_time - start_time + time_intervals.append(interval.total_seconds() if hasattr(interval, 'total_seconds') else 0) + except Exception: + has_reliable_timing = False + else: + has_reliable_timing = False + + loop_detected, loop_details = self._detect_loops(llm_calls) + pattern_analysis = self._analyze_reasoning_patterns(llm_calls) + + efficiency_metrics = { + "total_llm_calls": total_calls, + "total_tokens": total_tokens, + "avg_tokens_per_call": avg_tokens_per_call, + "reasoning_pattern": pattern_analysis["primary_pattern"].value, + "loops_detected": loop_detected, + } + + if has_reliable_timing and time_intervals: + efficiency_metrics["avg_time_between_calls"] = np.mean(time_intervals) + + loop_info = f"Detected {len(loop_details)} potential reasoning loops." if loop_detected else "No significant reasoning loops detected." + + call_samples = self._get_call_samples(llm_calls) + + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process. + +Evaluate the agent's reasoning efficiency across these five key subcategories: + +1. Focus (0-10): How well the agent stays on topic and avoids unnecessary tangents +2. Progression (0-10): How effectively the agent builds on previous thoughts rather than repeating or circling +3. Decision Quality (0-10): How decisively and appropriately the agent makes decisions +4. Conciseness (0-10): How efficiently the agent communicates without unnecessary verbosity +5. Loop Avoidance (0-10): How well the agent avoids getting stuck in repetitive thinking patterns + +For each subcategory, provide a score from 0-10 where: +- 0: Completely inefficient +- 5: Moderately efficient +- 10: Highly efficient + +The overall score should be a weighted average of these subcategories. + +Return your evaluation as JSON with the following structure: +{ + "overall_score": float, + "scores": { + "focus": float, + "progression": float, + "decision_quality": float, + "conciseness": float, + "loop_avoidance": float + }, + "feedback": string (general feedback about overall reasoning efficiency), + "optimization_suggestions": string (concrete suggestions for improving reasoning efficiency), + "detected_patterns": string (describe any inefficient reasoning patterns you observe) +}"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Reasoning efficiency metrics: +- Total LLM calls: {efficiency_metrics["total_llm_calls"]} +- Average tokens per call: {efficiency_metrics["avg_tokens_per_call"]:.1f} +- Primary reasoning pattern: {efficiency_metrics["reasoning_pattern"]} +- {loop_info} +{"- Average time between calls: {:.2f} seconds".format(efficiency_metrics.get("avg_time_between_calls", 0)) if "avg_time_between_calls" in efficiency_metrics else ""} + +Sample of agent reasoning flow (chronological sequence): +{call_samples} + +Agent's final output: +{final_output.raw[:500]}... (truncated) + +Evaluate the reasoning efficiency of this agent based on these interaction patterns. 
+Identify any inefficient reasoning patterns and provide specific suggestions for optimization. +"""} + ] + + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data = extract_json_from_llm_response(response) + + scores = evaluation_data.get("scores", {}) + focus = scores.get("focus", 5.0) + progression = scores.get("progression", 5.0) + decision_quality = scores.get("decision_quality", 5.0) + conciseness = scores.get("conciseness", 5.0) + loop_avoidance = scores.get("loop_avoidance", 5.0) + + overall_score = evaluation_data.get("overall_score", evaluation_data.get("score", 5.0)) + feedback = evaluation_data.get("feedback", "No detailed feedback provided.") + optimization_suggestions = evaluation_data.get("optimization_suggestions", "No specific suggestions provided.") + + detailed_feedback = "Reasoning Efficiency Evaluation:\n" + detailed_feedback += f"• Focus: {focus}/10 - Staying on topic without tangents\n" + detailed_feedback += f"• Progression: {progression}/10 - Building on previous thinking\n" + detailed_feedback += f"• Decision Quality: {decision_quality}/10 - Making appropriate decisions\n" + detailed_feedback += f"• Conciseness: {conciseness}/10 - Communicating efficiently\n" + detailed_feedback += f"• Loop Avoidance: {loop_avoidance}/10 - Avoiding repetitive patterns\n\n" + + detailed_feedback += f"Feedback:\n{feedback}\n\n" + detailed_feedback += f"Optimization Suggestions:\n{optimization_suggestions}" + + return EvaluationScore( + score=float(overall_score), + feedback=detailed_feedback, + raw_response=response + ) + except Exception as e: + logging.warning(f"Failed to parse reasoning efficiency evaluation: {e}") + return EvaluationScore( + score=None, + feedback=f"Failed to parse reasoning efficiency evaluation. Raw response: {response[:200]}...", + raw_response=response + ) + + def _detect_loops(self, llm_calls: List[Dict]) -> Tuple[bool, List[Dict]]: + loop_details = [] + + messages = [] + for call in llm_calls: + content = call.get("response", "") + if isinstance(content, str): + messages.append(content) + elif isinstance(content, list) and len(content) > 0: + # Handle message list format + for msg in content: + if isinstance(msg, dict) and "content" in msg: + messages.append(msg["content"]) + + # Simple n-gram based similarity detection + # For a more robust implementation, consider using embedding-based similarity + for i in range(len(messages) - 2): + for j in range(i + 1, len(messages) - 1): + # Check for repeated patterns (simplistic approach) + # A more sophisticated approach would use semantic similarity + similarity = self._calculate_text_similarity(messages[i], messages[j]) + if similarity > 0.7: # Arbitrary threshold + loop_details.append({ + "first_occurrence": i, + "second_occurrence": j, + "similarity": similarity, + "snippet": messages[i][:100] + "..." 
+ }) + + return len(loop_details) > 0, loop_details + + def _calculate_text_similarity(self, text1: str, text2: str) -> float: + text1 = re.sub(r'\s+', ' ', text1.lower()).strip() + text2 = re.sub(r'\s+', ' ', text2.lower()).strip() + + # Simple Jaccard similarity on word sets + words1 = set(text1.split()) + words2 = set(text2.split()) + + intersection = len(words1.intersection(words2)) + union = len(words1.union(words2)) + + return intersection / union if union > 0 else 0.0 + + def _analyze_reasoning_patterns(self, llm_calls: List[Dict]) -> Dict[str, Any]: + call_lengths = [] + response_times = [] + + for call in llm_calls: + content = call.get("response", "") + if isinstance(content, str): + call_lengths.append(len(content)) + elif isinstance(content, list) and len(content) > 0: + # Handle message list format + total_length = 0 + for msg in content: + if isinstance(msg, dict) and "content" in msg: + total_length += len(msg["content"]) + call_lengths.append(total_length) + + start_time = call.get("start_time") + end_time = call.get("end_time") + if start_time and end_time: + try: + response_times.append(end_time - start_time) + except Exception: + pass + + avg_length = np.mean(call_lengths) if call_lengths else 0 + std_length = np.std(call_lengths) if call_lengths else 0 + length_trend = self._calculate_trend(call_lengths) + + primary_pattern = ReasoningPatternType.EFFICIENT + details = "Agent demonstrates efficient reasoning patterns." + + loop_score = self._calculate_loop_likelihood(call_lengths, response_times) + if loop_score > 0.7: + primary_pattern = ReasoningPatternType.LOOP + details = "Agent appears to be stuck in repetitive thinking patterns." + elif avg_length > 1000 and std_length / avg_length < 0.3: + primary_pattern = ReasoningPatternType.VERBOSE + details = "Agent is consistently verbose across interactions." + elif len(llm_calls) > 10 and length_trend > 0.5: + primary_pattern = ReasoningPatternType.INDECISIVE + details = "Agent shows signs of indecisiveness with increasing message lengths." + elif std_length / avg_length > 0.8: + primary_pattern = ReasoningPatternType.SCATTERED + details = "Agent shows inconsistent reasoning flow with highly variable responses." 
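+        # Illustrative note on how the thresholds above interact (an editorial
+        # example, assuming no usable response-time data; these numbers are not
+        # produced by the test suite): for call lengths [1500, 1100, 1210, 1190]
+        # only one of the two i/i+2 length ratios falls inside 0.85-1.15, so the
+        # loop score is 0.5 and the LOOP branch is skipped; avg_length is 1250
+        # and std_length/avg_length is roughly 0.12, so the agent is classified
+        # as VERBOSE (consistently long responses).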
+ + return { + "primary_pattern": primary_pattern, + "details": details, + "metrics": { + "avg_length": avg_length, + "std_length": std_length, + "length_trend": length_trend, + "loop_score": loop_score + } + } + + def _calculate_trend(self, values: Sequence[float | int]) -> float: + if not values or len(values) < 2: + return 0.0 + + try: + x = np.arange(len(values)) + y = np.array(values) + + # Simple linear regression + slope = np.polyfit(x, y, 1)[0] + + # Normalize slope to -1 to 1 range + max_possible_slope = max(values) - min(values) + if max_possible_slope > 0: + normalized_slope = slope / max_possible_slope + return max(min(normalized_slope, 1.0), -1.0) + return 0.0 + except Exception: + return 0.0 + + def _calculate_loop_likelihood(self, call_lengths: Sequence[float], response_times: Sequence[float]) -> float: + if not call_lengths or len(call_lengths) < 3: + return 0.0 + + indicators = [] + + if len(call_lengths) >= 4: + repeated_lengths = 0 + for i in range(len(call_lengths) - 2): + ratio = call_lengths[i] / call_lengths[i + 2] if call_lengths[i + 2] > 0 else 0 + if 0.85 <= ratio <= 1.15: + repeated_lengths += 1 + + length_repetition_score = repeated_lengths / (len(call_lengths) - 2) + indicators.append(length_repetition_score) + + if response_times and len(response_times) >= 3: + try: + std_time = np.std(response_times) + mean_time = np.mean(response_times) + if mean_time > 0: + time_consistency = 1.0 - (std_time / mean_time) + indicators.append(max(0, time_consistency - 0.3) * 1.5) + except Exception: + pass + + return np.mean(indicators) if indicators else 0.0 + + def _get_call_samples(self, llm_calls: List[Dict]) -> str: + samples = [] + + if len(llm_calls) <= 6: + sample_indices = list(range(len(llm_calls))) + else: + sample_indices = [0, 1, len(llm_calls) // 2 - 1, len(llm_calls) // 2, + len(llm_calls) - 2, len(llm_calls) - 1] + + for idx in sample_indices: + call = llm_calls[idx] + content = call.get("response", "") + + if isinstance(content, str): + sample = content + elif isinstance(content, list) and len(content) > 0: + sample_parts = [] + for msg in content: + if isinstance(msg, dict) and "content" in msg: + sample_parts.append(msg["content"]) + sample = "\n".join(sample_parts) + else: + sample = str(content) + + truncated = sample[:200] + "..." if len(sample) > 200 else sample + samples.append(f"Call {idx + 1}:\n{truncated}\n") + + return "\n".join(samples) diff --git a/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py b/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py new file mode 100644 index 000000000..011618481 --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py @@ -0,0 +1,65 @@ +from typing import Any, Dict + +from crewai.agent import Agent +from crewai.task import Task + +from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory +from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response + +class SemanticQualityEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.SEMANTIC_QUALITY + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: Any, + ) -> EvaluationScore: + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output. 
+ +Score the semantic quality on a scale from 0-10 where: +- 0: Completely incoherent, confusing, or logically flawed output +- 5: Moderately clear and logical output with some issues +- 10: Exceptionally clear, coherent, and logically sound output + +Consider: +1. Is the output well-structured and organized? +2. Is the reasoning logical and well-supported? +3. Is the language clear, precise, and appropriate for the task? +4. Are claims supported by evidence when appropriate? +5. Is the output free from contradictions and logical fallacies? + +Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string). +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Agent's final output: +{final_output} + +Evaluate the semantic quality and reasoning of this output. +"""} + ] + + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data: dict[str, Any] = extract_json_from_llm_response(response) + assert evaluation_data is not None + return EvaluationScore( + score=float(evaluation_data["score"]) if evaluation_data.get("score") is not None else None, + feedback=evaluation_data.get("feedback", response), + raw_response=response + ) + except Exception: + return EvaluationScore( + score=None, + feedback=f"Failed to parse evaluation. Raw response: {response}", + raw_response=response + ) \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/metrics/tools_metrics.py b/src/crewai/experimental/evaluation/metrics/tools_metrics.py new file mode 100644 index 000000000..16b2ec7cd --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/tools_metrics.py @@ -0,0 +1,400 @@ +import json +from typing import Dict, Any + +from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory +from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response +from crewai.agent import Agent +from crewai.task import Task + + +class ToolSelectionEvaluator(BaseEvaluator): + + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.TOOL_SELECTION + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: str, + ) -> EvaluationScore: + tool_uses = execution_trace.get("tool_uses", []) + tool_count = len(tool_uses) + unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses]) + + if tool_count == 0: + if not agent.tools: + return EvaluationScore( + score=None, + feedback="Agent had no tools available to use." + ) + else: + return EvaluationScore( + score=None, + feedback="Agent had tools available but didn't use any." + ) + + available_tools_info = "" + if agent.tools: + for tool in agent.tools: + available_tools_info += f"- {tool.name}: {tool.description}\n" + else: + available_tools_info = "No tools available" + + tool_types_summary = "Tools selected by the agent:\n" + for tool_type in sorted(unique_tool_types): + tool_types_summary += f"- {tool_type}\n" + + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task. + +You must evaluate based on these 2 criteria: +1. Relevance (0-10): Were the tools chosen directly aligned with the task's goals? +2. Coverage (0-10): Did the agent select ALL appropriate tools from the AVAILABLE tools? 
+ +IMPORTANT: +- ONLY consider tools that are listed as available to the agent +- DO NOT suggest tools that aren't in the 'Available tools' list +- DO NOT evaluate the quality or accuracy of tool outputs/results +- DO NOT evaluate how many times each tool was used +- DO NOT evaluate how the agent used the parameters +- DO NOT evaluate whether the agent interpreted the task correctly + +Focus ONLY on whether the correct CATEGORIES of tools were selected from what was available. + +Return your evaluation as JSON with these fields: +- scores: {"relevance": number, "coverage": number} +- overall_score: number (average of all scores, 0-10) +- feedback: string (focused ONLY on tool selection decisions from available tools) +- improvement_suggestions: string (ONLY suggest better selection from the AVAILABLE tools list, NOT new tools) +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Available tools for this agent: +{available_tools_info} + +{tool_types_summary} + +Based ONLY on the task description and comparing the AVAILABLE tools with those that were selected (listed above), evaluate if the agent selected the appropriate tool types for this task. + +IMPORTANT: +- ONLY evaluate selection from tools listed as available +- DO NOT suggest new tools that aren't in the available tools list +- DO NOT evaluate tool usage or results +"""} + ] + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data = extract_json_from_llm_response(response) + assert evaluation_data is not None + + scores = evaluation_data.get("scores", {}) + relevance = scores.get("relevance", 5.0) + coverage = scores.get("coverage", 5.0) + overall_score = float(evaluation_data.get("overall_score", 5.0)) + + feedback = "Tool Selection Evaluation:\n" + feedback += f"• Relevance: {relevance}/10 - Selection of appropriate tool types for the task\n" + feedback += f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n" + if "improvement_suggestions" in evaluation_data: + feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" + else: + feedback += evaluation_data.get("feedback", "No detailed feedback available.") + + return EvaluationScore( + score=overall_score, + feedback=feedback, + raw_response=response + ) + except Exception as e: + return EvaluationScore( + score=None, + feedback=f"Error evaluating tool selection: {e}", + raw_response=response + ) + + +class ParameterExtractionEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.PARAMETER_EXTRACTION + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: str, + ) -> EvaluationScore: + tool_uses = execution_trace.get("tool_uses", []) + tool_count = len(tool_uses) + + if tool_count == 0: + return EvaluationScore( + score=None, + feedback="No tool usage detected. Cannot evaluate parameter extraction." 
+ ) + + validation_errors = [] + for tool_use in tool_uses: + if not tool_use.get("success", True) and tool_use.get("error_type") == "validation_error": + validation_errors.append({ + "tool": tool_use.get("tool", "Unknown tool"), + "error": tool_use.get("result"), + "args": tool_use.get("args", {}) + }) + + validation_error_rate = len(validation_errors) / tool_count if tool_count > 0 else 0 + + param_samples = [] + for i, tool_use in enumerate(tool_uses[:5]): + tool_name = tool_use.get("tool", "Unknown tool") + tool_args = tool_use.get("args", {}) + success = tool_use.get("success", True) and not tool_use.get("error", False) + error_type = tool_use.get("error_type", "") if not success else "" + + is_validation_error = error_type == "validation_error" + + sample = f"Tool use #{i+1} - {tool_name}:\n" + sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n" + sample += f"- Success: {'No' if not success else 'Yes'}" + + if is_validation_error: + sample += " (PARAMETER VALIDATION ERROR)\n" + sample += f"- Error: {tool_use.get('result', 'Unknown error')}" + elif not success: + sample += f" (Other error: {error_type})\n" + + param_samples.append(sample) + + validation_errors_info = "" + if validation_errors: + validation_errors_info = f"\nParameter validation errors detected: {len(validation_errors)} ({validation_error_rate:.1%} of tool uses)\n" + for i, err in enumerate(validation_errors[:3]): + tool_name = err.get("tool", "Unknown tool") + error_msg = err.get("error", "Unknown error") + args = err.get("args", {}) + validation_errors_info += f"\nValidation Error #{i+1}:\n- Tool: {tool_name}\n- Args: {json.dumps(args, indent=2)}\n- Error: {error_msg}" + + if len(validation_errors) > 3: + validation_errors_info += f"\n...and {len(validation_errors) - 3} more validation errors." + param_samples_text = "\n\n".join(param_samples) + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent extracts and formats PARAMETER VALUES for tool calls. + +Your job is to evaluate ONLY whether the agent used the correct parameter VALUES, not whether the right tools were selected or how the tools were invoked. + +Evaluate parameter extraction based on these criteria: +1. Accuracy (0-10): Are parameter values correctly identified from the context/task? +2. Formatting (0-10): Are values formatted correctly for each tool's requirements? +3. Completeness (0-10): Are all required parameter values provided, with no missing information? + +IMPORTANT: DO NOT evaluate: +- Whether the right tool was chosen (that's the ToolSelectionEvaluator's job) +- How the tools were structurally invoked (that's the ToolInvocationEvaluator's job) +- The quality of results from tools + +Focus ONLY on the PARAMETER VALUES - whether they were correctly extracted from the context, properly formatted, and complete. + +Validation errors are important signals that parameter values weren't properly extracted or formatted. 
+ +Return your evaluation as JSON with these fields: +- scores: {"accuracy": number, "formatting": number, "completeness": number} +- overall_score: number (average of all scores, 0-10) +- feedback: string (focused ONLY on parameter value extraction quality) +- improvement_suggestions: string (concrete suggestions for better parameter VALUE extraction) +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Parameter extraction examples: +{param_samples_text} +{validation_errors_info} + +Evaluate the quality of the agent's parameter extraction for this task. +"""} + ] + + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data = extract_json_from_llm_response(response) + assert evaluation_data is not None + + scores = evaluation_data.get("scores", {}) + accuracy = scores.get("accuracy", 5.0) + formatting = scores.get("formatting", 5.0) + completeness = scores.get("completeness", 5.0) + + overall_score = float(evaluation_data.get("overall_score", 5.0)) + + feedback = "Parameter Extraction Evaluation:\n" + feedback += f"• Accuracy: {accuracy}/10 - Correctly identifying required parameters\n" + feedback += f"• Formatting: {formatting}/10 - Properly formatting parameters for tools\n" + feedback += f"• Completeness: {completeness}/10 - Including all necessary information\n\n" + + if "improvement_suggestions" in evaluation_data: + feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" + else: + feedback += evaluation_data.get("feedback", "No detailed feedback available.") + + return EvaluationScore( + score=overall_score, + feedback=feedback, + raw_response=response + ) + except Exception as e: + return EvaluationScore( + score=None, + feedback=f"Error evaluating parameter extraction: {e}", + raw_response=response + ) + + +class ToolInvocationEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.TOOL_INVOCATION + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: str, + ) -> EvaluationScore: + tool_uses = execution_trace.get("tool_uses", []) + tool_errors = [] + tool_count = len(tool_uses) + + if tool_count == 0: + return EvaluationScore( + score=None, + feedback="No tool usage detected. Cannot evaluate tool invocation." 
+ ) + + for tool_use in tool_uses: + if not tool_use.get("success", True) or tool_use.get("error", False): + error_info = { + "tool": tool_use.get("tool", "Unknown tool"), + "error": tool_use.get("result"), + "error_type": tool_use.get("error_type", "unknown_error") + } + tool_errors.append(error_info) + + error_rate = len(tool_errors) / tool_count if tool_count > 0 else 0 + + error_types = {} + for error in tool_errors: + error_type = error.get("error_type", "unknown_error") + if error_type not in error_types: + error_types[error_type] = 0 + error_types[error_type] += 1 + + invocation_samples = [] + for i, tool_use in enumerate(tool_uses[:5]): + tool_name = tool_use.get("tool", "Unknown tool") + tool_args = tool_use.get("args", {}) + success = tool_use.get("success", True) and not tool_use.get("error", False) + error_type = tool_use.get("error_type", "") if not success else "" + error_msg = tool_use.get("result", "No error") if not success else "No error" + + sample = f"Tool invocation #{i+1}:\n" + sample += f"- Tool: {tool_name}\n" + sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n" + sample += f"- Success: {'No' if not success else 'Yes'}\n" + if not success: + sample += f"- Error type: {error_type}\n" + sample += f"- Error: {error_msg}" + invocation_samples.append(sample) + + error_type_summary = "" + if error_types: + error_type_summary = "Error type breakdown:\n" + for error_type, count in error_types.items(): + error_type_summary += f"- {error_type}: {count} occurrences ({(count/tool_count):.1%})\n" + + invocation_samples_text = "\n\n".join(invocation_samples) + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing how correctly an AI agent's tool invocations are STRUCTURED. + +Your job is to evaluate ONLY the structural and syntactical aspects of how the agent called tools, NOT which tools were selected or what parameter values were used. + +Evaluate the agent's tool invocation based on these criteria: +1. Structure (0-10): Does the tool call follow the expected syntax and format? +2. Error Handling (0-10): Does the agent handle tool errors appropriately? +3. Invocation Patterns (0-10): Are tool calls properly sequenced, batched, or managed? + +Error types that indicate invocation issues: +- execution_error: The tool was called correctly but failed during execution +- usage_error: General errors in how the tool was used structurally + +IMPORTANT: DO NOT evaluate: +- Whether the right tool was chosen (that's the ToolSelectionEvaluator's job) +- Whether the parameter values are correct (that's the ParameterExtractionEvaluator's job) +- The quality of results from tools + +Focus ONLY on HOW tools were invoked - the structure, format, and handling of the invocation process. + +Return your evaluation as JSON with these fields: +- scores: {"structure": number, "error_handling": number, "invocation_patterns": number} +- overall_score: number (average of all scores, 0-10) +- feedback: string (focused ONLY on structural aspects of tool invocation) +- improvement_suggestions: string (concrete suggestions for better structuring of tool calls) +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Tool invocation examples: +{invocation_samples_text} + +Tool error rate: {error_rate:.2%} ({len(tool_errors)} errors out of {tool_count} invocations) +{error_type_summary} + +Evaluate the quality of the agent's tool invocation structure during this task. 
+"""} + ] + + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data = extract_json_from_llm_response(response) + assert evaluation_data is not None + scores = evaluation_data.get("scores", {}) + structure = scores.get("structure", 5.0) + error_handling = scores.get("error_handling", 5.0) + invocation_patterns = scores.get("invocation_patterns", 5.0) + + overall_score = float(evaluation_data.get("overall_score", 5.0)) + + feedback = "Tool Invocation Evaluation:\n" + feedback += f"• Structure: {structure}/10 - Following proper syntax and format\n" + feedback += f"• Error Handling: {error_handling}/10 - Appropriately handling tool errors\n" + feedback += f"• Invocation Patterns: {invocation_patterns}/10 - Proper sequencing and management of calls\n\n" + + if "improvement_suggestions" in evaluation_data: + feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" + else: + feedback += evaluation_data.get("feedback", "No detailed feedback available.") + + return EvaluationScore( + score=overall_score, + feedback=feedback, + raw_response=response + ) + except Exception as e: + return EvaluationScore( + score=None, + feedback=f"Error evaluating tool invocation: {e}", + raw_response=response + ) diff --git a/tests/evaluation/__init__.py b/tests/experimental/evaluation/__init__.py similarity index 100% rename from tests/evaluation/__init__.py rename to tests/experimental/evaluation/__init__.py diff --git a/tests/evaluation/metrics/__init__.py b/tests/experimental/evaluation/metrics/__init__.py similarity index 100% rename from tests/evaluation/metrics/__init__.py rename to tests/experimental/evaluation/metrics/__init__.py diff --git a/tests/evaluation/metrics/base_evaluation_metrics_test.py b/tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py similarity index 100% rename from tests/evaluation/metrics/base_evaluation_metrics_test.py rename to tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py diff --git a/tests/evaluation/metrics/test_goal_metrics.py b/tests/experimental/evaluation/metrics/test_goal_metrics.py similarity index 88% rename from tests/evaluation/metrics/test_goal_metrics.py rename to tests/experimental/evaluation/metrics/test_goal_metrics.py index 69ec42d1f..5c10f4301 100644 --- a/tests/evaluation/metrics/test_goal_metrics.py +++ b/tests/experimental/evaluation/metrics/test_goal_metrics.py @@ -1,8 +1,8 @@ from unittest.mock import patch, MagicMock -from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest +from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest -from crewai.evaluation.base_evaluator import EvaluationScore -from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator +from crewai.experimental.evaluation.base_evaluator import EvaluationScore +from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator from crewai.utilities.llm_utils import LLM diff --git a/tests/evaluation/metrics/test_reasoning_metrics.py b/tests/experimental/evaluation/metrics/test_reasoning_metrics.py similarity index 96% rename from tests/evaluation/metrics/test_reasoning_metrics.py rename to tests/experimental/evaluation/metrics/test_reasoning_metrics.py index 5d8015e3b..547046965 100644 --- a/tests/evaluation/metrics/test_reasoning_metrics.py +++ b/tests/experimental/evaluation/metrics/test_reasoning_metrics.py @@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock from 
typing import List, Dict, Any from crewai.tasks.task_output import TaskOutput -from crewai.evaluation.metrics.reasoning_metrics import ( +from crewai.experimental.evaluation.metrics.reasoning_metrics import ( ReasoningEfficiencyEvaluator, ) -from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest +from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest from crewai.utilities.llm_utils import LLM -from crewai.evaluation.base_evaluator import EvaluationScore +from crewai.experimental.evaluation.base_evaluator import EvaluationScore class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest): @pytest.fixture diff --git a/tests/evaluation/metrics/test_semantic_quality_metrics.py b/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py similarity index 90% rename from tests/evaluation/metrics/test_semantic_quality_metrics.py rename to tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py index e5adb198b..4050b9562 100644 --- a/tests/evaluation/metrics/test_semantic_quality_metrics.py +++ b/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py @@ -1,8 +1,8 @@ from unittest.mock import patch, MagicMock -from crewai.evaluation.base_evaluator import EvaluationScore -from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator -from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest +from crewai.experimental.evaluation.base_evaluator import EvaluationScore +from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator +from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest from crewai.utilities.llm_utils import LLM class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest): diff --git a/tests/evaluation/metrics/test_tools_metrics.py b/tests/experimental/evaluation/metrics/test_tools_metrics.py similarity index 97% rename from tests/evaluation/metrics/test_tools_metrics.py rename to tests/experimental/evaluation/metrics/test_tools_metrics.py index ad6e50fdd..dab3eeee1 100644 --- a/tests/evaluation/metrics/test_tools_metrics.py +++ b/tests/experimental/evaluation/metrics/test_tools_metrics.py @@ -1,12 +1,12 @@ from unittest.mock import patch, MagicMock -from crewai.evaluation.metrics.tools_metrics import ( +from crewai.experimental.evaluation.metrics.tools_metrics import ( ToolSelectionEvaluator, ParameterExtractionEvaluator, ToolInvocationEvaluator ) from crewai.utilities.llm_utils import LLM -from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest +from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest class TestToolSelectionEvaluator(BaseEvaluationMetricsTest): def test_no_tools_available(self, mock_task, mock_agent): diff --git a/tests/evaluation/test_agent_evaluator.py b/tests/experimental/evaluation/test_agent_evaluator.py similarity index 91% rename from tests/evaluation/test_agent_evaluator.py rename to tests/experimental/evaluation/test_agent_evaluator.py index f7935e85f..88f493908 100644 --- a/tests/evaluation/test_agent_evaluator.py +++ b/tests/experimental/evaluation/test_agent_evaluator.py @@ -3,9 +3,9 @@ import pytest from crewai.agent import Agent from crewai.task import Task from crewai.crew import Crew -from crewai.evaluation.agent_evaluator import AgentEvaluator -from crewai.evaluation.base_evaluator import 
AgentEvaluationResult -from crewai.evaluation import ( +from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator +from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult +from crewai.experimental.evaluation import ( GoalAlignmentEvaluator, SemanticQualityEvaluator, ToolSelectionEvaluator, @@ -14,7 +14,7 @@ from crewai.evaluation import ( ReasoningEfficiencyEvaluator ) -from crewai.evaluation import create_default_evaluator +from crewai.experimental.evaluation import create_default_evaluator class TestAgentEvaluator: @pytest.fixture def mock_crew(self): diff --git a/tests/experimental/evaluation/test_experiment_result.py b/tests/experimental/evaluation/test_experiment_result.py new file mode 100644 index 000000000..5ba390b48 --- /dev/null +++ b/tests/experimental/evaluation/test_experiment_result.py @@ -0,0 +1,111 @@ +import pytest +from unittest.mock import MagicMock, patch + +from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults + + +class TestExperimentResult: + @pytest.fixture + def mock_results(self): + return [ + ExperimentResult( + identifier="test-1", + inputs={"query": "What is the capital of France?"}, + score=10, + expected_score=7, + passed=True + ), + ExperimentResult( + identifier="test-2", + inputs={"query": "Who wrote Hamlet?"}, + score={"relevance": 9, "factuality": 8}, + expected_score={"relevance": 7, "factuality": 7}, + passed=True, + agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}} + ), + ExperimentResult( + identifier="test-3", + inputs={"query": "Any query"}, + score={"relevance": 9, "factuality": 8}, + expected_score={"relevance": 7, "factuality": 7}, + passed=False, + agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}} + ), + ExperimentResult( + identifier="test-4", + inputs={"query": "Another query"}, + score={"relevance": 9, "factuality": 8}, + expected_score={"relevance": 7, "factuality": 7}, + passed=True, + agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}} + ), + ExperimentResult( + identifier="test-6", + inputs={"query": "Yet another query"}, + score={"relevance": 9, "factuality": 8}, + expected_score={"relevance": 7, "factuality": 7}, + passed=True, + agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}} + ) + ] + + @patch('os.path.exists', return_value=True) + @patch('os.path.getsize', return_value=1) + @patch('json.load') + @patch('builtins.open', new_callable=MagicMock) + def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results): + baseline_data = { + "timestamp": "2023-01-01T00:00:00+00:00", + "results": [ + { + "identifier": "test-1", + "inputs": {"query": "What is the capital of France?"}, + "score": 7, + "expected_score": 7, + "passed": False + }, + { + "identifier": "test-2", + "inputs": {"query": "Who wrote Hamlet?"}, + "score": {"relevance": 8, "factuality": 7}, + "expected_score": {"relevance": 7, "factuality": 7}, + "passed": True + }, + { + "identifier": "test-3", + "inputs": {"query": "Any query"}, + "score": {"relevance": 8, "factuality": 7}, + "expected_score": {"relevance": 7, "factuality": 7}, + "passed": True + }, + { + "identifier": "test-4", + "inputs": {"query": "Another query"}, + "score": {"relevance": 8, "factuality": 7}, + "expected_score": {"relevance": 7, "factuality": 7}, + "passed": True + }, + { + "identifier": "test-5", + "inputs": {"query": "Another query"}, + "score": 
{"relevance": 8, "factuality": 7}, + "expected_score": {"relevance": 7, "factuality": 7}, + "passed": True + } + ] + } + + mock_json_load.return_value = baseline_data + + results = ExperimentResults(results=mock_results) + results.display = MagicMock() + + comparison = results.compare_with_baseline(baseline_filepath="baseline.json") + + assert "baseline_timestamp" in comparison + assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00" + assert comparison["improved"] == ["test-1"] + assert comparison["regressed"] == ["test-3"] + assert comparison["unchanged"] == ["test-2", "test-4"] + assert comparison["new_tests"] == ["test-6"] + assert comparison["missing_tests"] == ["test-5"] \ No newline at end of file diff --git a/tests/experimental/evaluation/test_experiment_runner.py b/tests/experimental/evaluation/test_experiment_runner.py new file mode 100644 index 000000000..58382fa65 --- /dev/null +++ b/tests/experimental/evaluation/test_experiment_runner.py @@ -0,0 +1,197 @@ +import pytest +from unittest.mock import MagicMock, patch + +from crewai.crew import Crew +from crewai.experimental.evaluation.experiment.runner import ExperimentRunner +from crewai.experimental.evaluation.experiment.result import ExperimentResults +from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult +from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore + + +class TestExperimentRunner: + @pytest.fixture + def mock_crew(self): + return MagicMock(llm=Crew) + + @pytest.fixture + def mock_evaluator_results(self): + agent_evaluation = AgentAggregatedEvaluationResult( + agent_id="Test Agent", + agent_role="Test Agent Role", + metrics={ + MetricCategory.GOAL_ALIGNMENT: EvaluationScore( + score=9, + feedback="Test feedback for goal alignment", + raw_response="Test raw response for goal alignment" + ), + MetricCategory.REASONING_EFFICIENCY: EvaluationScore( + score=None, + feedback="Reasoning efficiency not applicable", + raw_response="Reasoning efficiency not applicable" + ), + MetricCategory.PARAMETER_EXTRACTION: EvaluationScore( + score=7, + feedback="Test parameter extraction explanation", + raw_response="Test raw output" + ), + MetricCategory.TOOL_SELECTION: EvaluationScore( + score=8, + feedback="Test tool selection explanation", + raw_response="Test raw output" + ) + } + ) + + return {"Test Agent": agent_evaluation} + + @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator') + def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results): + dataset = [ + { + "identifier": "test-case-1", + "inputs": {"query": "Test query 1"}, + "expected_score": 8 + }, + { + "identifier": "test-case-2", + "inputs": {"query": "Test query 2"}, + "expected_score": {"goal_alignment": 7} + }, + { + "inputs": {"query": "Test query 3"}, + "expected_score": {"tool_selection": 9} + } + ] + + mock_evaluator = MagicMock() + mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results + mock_evaluator.reset_iterations_results = MagicMock() + mock_create_evaluator.return_value = mock_evaluator + + runner = ExperimentRunner(dataset=dataset) + + results = runner.run(crew=mock_crew) + + assert isinstance(results, ExperimentResults) + result_1, result_2, result_3 = results.results + assert len(results.results) == 3 + + assert result_1.identifier == "test-case-1" + assert result_1.inputs == {"query": "Test query 1"} + assert result_1.expected_score == 8 + assert result_1.passed is True + + assert 
result_2.identifier == "test-case-2" + assert result_2.inputs == {"query": "Test query 2"} + assert isinstance(result_2.expected_score, dict) + assert "goal_alignment" in result_2.expected_score + assert result_2.passed is True + + assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5" + assert result_3.inputs == {"query": "Test query 3"} + assert isinstance(result_3.expected_score, dict) + assert "tool_selection" in result_3.expected_score + assert result_3.passed is False + + assert mock_crew.kickoff.call_count == 3 + mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"}) + mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"}) + mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"}) + + assert mock_evaluator.reset_iterations_results.call_count == 3 + assert mock_evaluator.get_agent_evaluation.call_count == 3 + + + @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator') + def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results): + dataset = [ + { + "identifier": "test-case-2", + "inputs": {"query": "Test query 2"}, + "expected_score": {"goal_alignment": 7, "unknown_metric": 8} + } + ] + + mock_evaluator = MagicMock() + mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results + mock_evaluator.reset_iterations_results = MagicMock() + mock_create_evaluator.return_value = mock_evaluator + + runner = ExperimentRunner(dataset=dataset) + + results = runner.run(crew=mock_crew) + + result, = results.results + + assert result.identifier == "test-case-2" + assert result.inputs == {"query": "Test query 2"} + assert isinstance(result.expected_score, dict) + assert "goal_alignment" in result.expected_score.keys() + assert "unknown_metric" in result.expected_score.keys() + assert result.passed is True + + @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator') + def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results): + dataset = [ + { + "identifier": "test-case-2", + "inputs": {"query": "Test query 2"}, + "expected_score": {"goal_alignment": 7} + } + ] + + mock_evaluator = MagicMock() + mock_create_evaluator["Test Agent"].metrics = { + MetricCategory.GOAL_ALIGNMENT: EvaluationScore( + score=9, + feedback="Test feedback for goal alignment", + raw_response="Test raw response for goal alignment" + ) + } + mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results + mock_evaluator.reset_iterations_results = MagicMock() + mock_create_evaluator.return_value = mock_evaluator + + runner = ExperimentRunner(dataset=dataset) + + results = runner.run(crew=mock_crew) + result, = results.results + + assert result.identifier == "test-case-2" + assert result.inputs == {"query": "Test query 2"} + assert isinstance(result.expected_score, dict) + assert "goal_alignment" in result.expected_score.keys() + assert result.passed is True + + @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator') + def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results): + dataset = [ + { + "identifier": "test-case-2", + "inputs": {"query": "Test query 2"}, + "expected_score": {"unknown_metric": 7} + } + ] + + mock_evaluator = MagicMock() + mock_create_evaluator["Test Agent"].metrics = { + MetricCategory.GOAL_ALIGNMENT: EvaluationScore( + score=5, + feedback="Test feedback for goal 
alignment", + raw_response="Test raw response for goal alignment" + ) + } + mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results + mock_evaluator.reset_iterations_results = MagicMock() + mock_create_evaluator.return_value = mock_evaluator + + runner = ExperimentRunner(dataset=dataset) + + results = runner.run(crew=mock_crew) + result, = results.results + + assert result.identifier == "test-case-2" + assert result.inputs == {"query": "Test query 2"} + assert isinstance(result.expected_score, dict) + assert "unknown_metric" in result.expected_score.keys() + assert result.passed is False \ No newline at end of file