fix: regenerate uv.lock to resolve TOML parse errors

- Remove corrupted uv.lock file that had a missing version field
- Regenerate with uv sync to ensure proper dependency resolution

Co-Authored-By: João <joao@crewai.com>
feat: add extra_headers parameter to LLM class
2026-01-07 15:18:29 +00:00 · 2025-07-17 09:07:45 +00:00 · 2025-07-17 09:07:32 +00:00 · 2025-07-16 21:18:04 -04:00 · 2025-07-16 12:36:43 -07:00 · 2025-07-16 13:18:59 -04:00
58 changed files with 6129 additions and 3892 deletions
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -9,12 +9,7 @@
  },
  "favicon": "/images/favicon.svg",
  "contextual": {
-    "options": [
-      "copy",
-      "view",
-      "chatgpt",
-      "claude"
-    ]
+    "options": ["copy", "view", "chatgpt", "claude"]
  },
  "navigation": {
    "languages": [
@@ -55,32 +50,22 @@
            "groups": [
              {
                "group": "Get Started",
-                "pages": [
-                  "en/introduction",
-                  "en/installation",
-                  "en/quickstart"
-                ]
+                "pages": ["en/introduction", "en/installation", "en/quickstart"]
              },
              {
                "group": "Guides",
                "pages": [
                  {
                    "group": "Strategy",
-                    "pages": [
-                      "en/guides/concepts/evaluating-use-cases"
-                    ]
+                    "pages": ["en/guides/concepts/evaluating-use-cases"]
                  },
                  {
                    "group": "Agents",
-                    "pages": [
-                      "en/guides/agents/crafting-effective-agents"
-                    ]
+                    "pages": ["en/guides/agents/crafting-effective-agents"]
                  },
                  {
                    "group": "Crews",
-                    "pages": [
-                      "en/guides/crews/first-crew"
-                    ]
+                    "pages": ["en/guides/crews/first-crew"]
                  },
                  {
                    "group": "Flows",
@@ -94,7 +79,6 @@
                    "pages": [
                      "en/guides/advanced/customizing-prompts",
                      "en/guides/advanced/fingerprinting"
-
                    ]
                  }
                ]
@@ -241,6 +225,7 @@
                  "en/observability/langtrace",
                  "en/observability/maxim",
                  "en/observability/mlflow",
+                  "en/observability/neatlogs",
                  "en/observability/openlit",
                  "en/observability/opik",
                  "en/observability/patronus-evaluation",
@@ -274,9 +259,7 @@
              },
              {
                "group": "Telemetry",
-                "pages": [
-                  "en/telemetry"
-                ]
+                "pages": ["en/telemetry"]
              }
            ]
          },
@@ -285,9 +268,7 @@
            "groups": [
              {
                "group": "Getting Started",
-                "pages": [
-                  "en/enterprise/introduction"
-                ]
+                "pages": ["en/enterprise/introduction"]
              },
              {
                "group": "Features",
@@ -342,9 +323,7 @@
              },
              {
                "group": "Resources",
-                "pages": [
-                  "en/enterprise/resources/frequently-asked-questions"
-                ]
+                "pages": ["en/enterprise/resources/frequently-asked-questions"]
              }
            ]
          },
@@ -353,9 +332,7 @@
            "groups": [
              {
                "group": "Getting Started",
-                "pages": [
-                  "en/api-reference/introduction"
-                ]
+                "pages": ["en/api-reference/introduction"]
              },
              {
                "group": "Endpoints",
@@ -365,16 +342,13 @@
          },
          {
            "tab": "Examples",
-                        "groups": [
+            "groups": [
              {
                "group": "Examples",
-                "pages": [
-                  "en/examples/example"
-                ]
+                "pages": ["en/examples/example"]
              }
            ]
          }
-
        ]
      },
      {
@@ -425,21 +399,15 @@
                "pages": [
                  {
                    "group": "Estratégia",
-                    "pages": [
-                      "pt-BR/guides/concepts/evaluating-use-cases"
-                    ]
+                    "pages": ["pt-BR/guides/concepts/evaluating-use-cases"]
                  },
                  {
                    "group": "Agentes",
-                    "pages": [
-                      "pt-BR/guides/agents/crafting-effective-agents"
-                    ]
+                    "pages": ["pt-BR/guides/agents/crafting-effective-agents"]
                  },
                  {
                    "group": "Crews",
-                    "pages": [
-                      "pt-BR/guides/crews/first-crew"
-                    ]
+                    "pages": ["pt-BR/guides/crews/first-crew"]
                  },
                  {
                    "group": "Flows",
@@ -632,9 +600,7 @@
              },
              {
                "group": "Telemetria",
-                "pages": [
-                  "pt-BR/telemetry"
-                ]
+                "pages": ["pt-BR/telemetry"]
              }
            ]
          },
@@ -643,9 +609,7 @@
            "groups": [
              {
                "group": "Começando",
-                "pages": [
-                  "pt-BR/enterprise/introduction"
-                ]
+                "pages": ["pt-BR/enterprise/introduction"]
              },
              {
                "group": "Funcionalidades",
@@ -710,9 +674,7 @@
            "groups": [
              {
                "group": "Começando",
-                "pages": [
-                  "pt-BR/api-reference/introduction"
-                ]
+                "pages": ["pt-BR/api-reference/introduction"]
              },
              {
                "group": "Endpoints",
@@ -722,16 +684,13 @@
          },
          {
            "tab": "Exemplos",
-                        "groups": [
+            "groups": [
              {
                "group": "Exemplos",
-                "pages": [
-                  "pt-BR/examples/example"
-                ]
+                "pages": ["pt-BR/examples/example"]
              }
            ]
          }
-
        ]
      }
    ]
--- a/docs/en/observability/neatlogs.mdx
+++ b/docs/en/observability/neatlogs.mdx
@@ -0,0 +1,134 @@
+---
+title: Neatlogs Integration
+description: Understand, debug, and share your CrewAI agent runs
+icon: magnifying-glass-chart
+---
+
+# Introduction
+
+Neatlogs helps you **see what your agent did**, **why**, and **share it**.
+
+It captures every step: thoughts, tool calls, responses, evaluations. No raw logs. Just clear, structured traces. Great for debugging and collaboration.
+
+## Why use Neatlogs?
+
+CrewAI agents use multiple tools and reasoning steps. When something goes wrong, you need context — not just errors.
+
+Neatlogs lets you:
+
+- Follow the full decision path
+- Add feedback directly on steps
+- Chat with the trace using an AI assistant
+- Share runs publicly for feedback
+- Turn insights into tasks
+
+All in one place.
+
+Manage your traces effortlessly
+
+![Traces](/images/neatlogs-1.png)
+![Trace Response](/images/neatlogs-2.png)
+
+The best UX to view a CrewAI trace. Post comments anywhere you want. Use AI to debug.
+
+![Trace Details](/images/neatlogs-3.png)
+![AI Chat Bot With A Trace](/images/neatlogs-4.png)
+![Comments Drawer](/images/neatlogs-5.png)
+
+## Core Features
+
+- **Trace Viewer**: Track thoughts, tools, and decisions in sequence
+- **Inline Comments**: Tag teammates on any trace step
+- **Feedback & Evaluation**: Mark outputs as correct or incorrect
+- **Error Highlighting**: Automatic flagging of API/tool failures
+- **Task Conversion**: Convert comments into assigned tasks
+- **Ask the Trace (AI)**: Chat with your trace using the Neatlogs AI bot
+- **Public Sharing**: Publish trace links to your community
+
+## Quick Setup with CrewAI
+
+<Steps>
+  <Step title="Sign Up & Get API Key">
+    Visit [neatlogs.com](https://neatlogs.com/?utm_source=crewAI-docs), create a project, copy the API key.
+  </Step>
+  <Step title="Install SDK">
+    ```bash
+    pip install neatlogs
+    ```
+    (Latest version 0.8.0, Python 3.8+; MIT license)
+  </Step>
+  <Step title="Initialize Neatlogs">
+    Before starting your CrewAI agents, add:
+
+    ```python
+    import neatlogs
+    neatlogs.init("YOUR_PROJECT_API_KEY")
+    ```
+
+    Agents run as usual. Neatlogs captures everything automatically.
+
+  </Step>
+</Steps>
+
+
+
+## Under the Hood
+
+According to GitHub, Neatlogs:
+
+- Captures thoughts, tool calls, responses, errors, and token stats
+- Supports AI-powered task generation and robust evaluation workflows
+
+All with just two lines of code.
+
+
+
+## Watch It Work
+
+### 🔍 Full Demo (4 min)
+
+<iframe
+  width="100%"
+  height="315"
+  src="https://www.youtube.com/embed/8KDme9T2I7Q?si=b8oHteaBwFNs_Duk"
+  title="YouTube video player"
+  frameBorder="0"
+  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
+  allowFullScreen
+></iframe>
+
+### ⚙️ CrewAI Integration (30 s)
+
+<iframe
+  className="w-full aspect-video rounded-xl"
+  src="https://www.loom.com/embed/9c78b552af43452bb3e4783cb8d91230?sid=e9d7d370-a91a-49b0-809e-2f375d9e801d"
+  title="Loom video player"
+  frameBorder="0"
+  allowFullScreen
+></iframe>
+
+
+
+## Links & Support
+
+- 📘 [Neatlogs Docs](https://docs.neatlogs.com/)
+- 🔐 [Dashboard & API Key](https://app.neatlogs.com/)
+- 🐦 [Follow on Twitter](https://twitter.com/neatlogs)
+- 📧 Contact: hello@neatlogs.com
+- 🛠 [GitHub SDK](https://github.com/NeatLogs/neatlogs)
+
+
+
+## TL;DR
+
+With just:
+
+```bash
+pip install neatlogs
+```
+```python
+import neatlogs
+neatlogs.init("YOUR_API_KEY")
+```
+You can now capture, understand, share, and act on your CrewAI agent runs in seconds.
+No setup overhead. Full trace transparency. Full team collaboration.
--- a/docs/images/neatlogs-1.png
+++ b/docs/images/neatlogs-1.png
--- a/docs/images/neatlogs-2.png
+++ b/docs/images/neatlogs-2.png
--- a/docs/images/neatlogs-3.png
+++ b/docs/images/neatlogs-3.png
--- a/docs/images/neatlogs-4.png
+++ b/docs/images/neatlogs-4.png
--- a/docs/images/neatlogs-5.png
+++ b/docs/images/neatlogs-5.png
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,7 @@ Documentation = "https://docs.crewai.com"
 Repository = "https://github.com/crewAIInc/crewAI"

 [project.optional-dependencies]
-tools = ["crewai-tools~=0.51.0"]
+tools = ["crewai-tools~=0.55.0"]
 embeddings = [
    "tiktoken~=0.8.0"
 ]
--- a/src/crewai/init.py
+++ b/src/crewai/init.py
@@ -54,7 +54,7 @@ def _track_install_async():

 _track_install_async()

-__version__ = "0.141.0"
+__version__ = "0.148.0"
 __all__ = [
    "Agent",
    "Crew",
--- a/src/crewai/cli/templates/crew/pyproject.toml
+++ b/src/crewai/cli/templates/crew/pyproject.toml
@@ -5,7 +5,7 @@ description = "{{name}} using crewAI"
 authors = [{ name = "Your Name", email = "you@example.com" }]
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.141.0,<1.0.0"
+    "crewai[tools]>=0.148.0,<1.0.0"
 ]

 [project.scripts]
--- a/src/crewai/cli/templates/flow/pyproject.toml
+++ b/src/crewai/cli/templates/flow/pyproject.toml
@@ -5,7 +5,7 @@ description = "{{name}} using crewAI"
 authors = [{ name = "Your Name", email = "you@example.com" }]
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.141.0,<1.0.0",
+    "crewai[tools]>=0.148.0,<1.0.0",
 ]

 [project.scripts]
--- a/src/crewai/cli/templates/tool/pyproject.toml
+++ b/src/crewai/cli/templates/tool/pyproject.toml
@@ -5,7 +5,7 @@ description = "Power up your crews with {{folder_name}}"
 readme = "README.md"
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.141.0"
+    "crewai[tools]>=0.148.0"
 ]

 [tool.crewai]
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -1313,7 +1313,6 @@ class Crew(FlowTrackable, BaseModel):
        n_iterations: int,
        eval_llm: Union[str, InstanceOf[BaseLLM]],
        inputs: Optional[Dict[str, Any]] = None,
-        include_agent_eval: Optional[bool] = False
    ) -> None:
        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
        try:
@@ -1333,28 +1332,13 @@ class Crew(FlowTrackable, BaseModel):
            )
            test_crew = self.copy()

-            # TODO: Refator to use a single Evaluator Manage class
            evaluator = CrewEvaluator(test_crew, llm_instance)

-            if include_agent_eval:
-                from crewai.evaluation import create_default_evaluator
-                agent_evaluator = create_default_evaluator(crew=test_crew)
-
            for i in range(1, n_iterations + 1):
                evaluator.set_iteration(i)
-
-                if include_agent_eval:
-                    agent_evaluator.set_iteration(i)
-
                test_crew.kickoff(inputs=inputs)

-                # TODO: Refactor to use ListenerEvents instead of trigger each iteration manually
-                if include_agent_eval:
-                    agent_evaluator.evaluate_current_iteration()
-
            evaluator.print_crew_evaluation_result()
-            if include_agent_eval:
-                agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)

            crewai_event_bus.emit(
                self,
--- a/src/crewai/evaluation/agent_evaluator.py
+++ b/src/crewai/evaluation/agent_evaluator.py
@@ -1,178 +0,0 @@
-from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
-from crewai.agent import Agent
-from crewai.task import Task
-from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
-
-from typing import Any, Dict
-from collections import defaultdict
-from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
-from collections.abc import Sequence
-from crewai.crew import Crew
-from crewai.utilities.events.crewai_event_bus import crewai_event_bus
-from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
-
-class AgentEvaluator:
-    def __init__(
-        self,
-        evaluators: Sequence[BaseEvaluator] | None = None,
-        crew: Crew | None = None,
-    ):
-        self.crew: Crew | None = crew
-        self.evaluators: Sequence[BaseEvaluator] | None = evaluators
-
-        self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
-        if crew is not None:
-            assert crew and crew.agents is not None
-            for agent in crew.agents:
-                self.agent_evaluators[str(agent.id)] = self.evaluators
-
-        self.callback = create_evaluation_callbacks()
-        self.console_formatter = ConsoleFormatter()
-        self.display_formatter = EvaluationDisplayFormatter()
-
-        self.iteration = 1
-        self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}
-
-    def set_iteration(self, iteration: int) -> None:
-        self.iteration = iteration
-
-    def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
-        if not self.crew:
-            raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
-
-        if not self.callback:
-            raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")
-
-        from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
-        evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)
-
-        total_evals = 0
-        for agent in self.crew.agents:
-            for task in self.crew.tasks:
-                if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
-                    total_evals += 1
-
-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[bold blue]{task.description}[/bold blue]"),
-            BarColumn(),
-            TextColumn("{task.percentage:.0f}% completed"),
-            console=self.console_formatter.console
-        ) as progress:
-            eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=total_evals)
-
-            for agent in self.crew.agents:
-                evaluator = self.agent_evaluators.get(str(agent.id))
-                if not evaluator:
-                    continue
-
-                for task in self.crew.tasks:
-
-                    if task.agent and str(task.agent.id) != str(agent.id):
-                        continue
-
-                    trace = self.callback.get_trace(str(agent.id), str(task.id))
-                    if not trace:
-                        self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
-                        progress.update(eval_task, advance=1)
-                        continue
-
-                    with crewai_event_bus.scoped_handlers():
-                        result = self.evaluate(
-                            agent=agent,
-                            task=task,
-                            execution_trace=trace,
-                            final_output=task.output
-                        )
-                        evaluation_results[agent.role].append(result)
-                        progress.update(eval_task, advance=1)
-
-        self.iterations_results[self.iteration] = evaluation_results
-        return evaluation_results
-
-    def get_evaluation_results(self):
-        if self.iteration in self.iterations_results:
-            return self.iterations_results[self.iteration]
-
-        return self.evaluate_current_iteration()
-
-    def display_results_with_iterations(self):
-        self.display_formatter.display_summary_results(self.iterations_results)
-
-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
-        agent_results = {}
-        with crewai_event_bus.scoped_handlers():
-            task_results = self.get_evaluation_results()
-            for agent_role, results in task_results.items():
-                if not results:
-                    continue
-
-                agent_id = results[0].agent_id
-
-                aggregated_result = self.display_formatter._aggregate_agent_results(
-                    agent_id=agent_id,
-                    agent_role=agent_role,
-                    results=results,
-                    strategy=strategy
-                )
-
-                agent_results[agent_role] = aggregated_result
-
-
-            if self.iteration == max(self.iterations_results.keys()):
-                self.display_results_with_iterations()
-
-            if include_evaluation_feedback:
-                self.display_evaluation_with_feedback()
-
-        return agent_results
-
-    def display_evaluation_with_feedback(self):
-        self.display_formatter.display_evaluation_with_feedback(self.iterations_results)
-
-    def evaluate(
-        self,
-        agent: Agent,
-        task: Task,
-        execution_trace: Dict[str, Any],
-        final_output: Any
-    ) -> AgentEvaluationResult:
-        result = AgentEvaluationResult(
-            agent_id=str(agent.id),
-            task_id=str(task.id)
-        )
-        assert self.evaluators is not None
-        for evaluator in self.evaluators:
-            try:
-                score = evaluator.evaluate(
-                    agent=agent,
-                    task=task,
-                    execution_trace=execution_trace,
-                    final_output=final_output
-                )
-                result.metrics[evaluator.metric_category] = score
-            except Exception as e:
-                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")
-
-        return result
-
-def create_default_evaluator(crew, llm=None):
-    from crewai.evaluation import (
-        GoalAlignmentEvaluator,
-        SemanticQualityEvaluator,
-        ToolSelectionEvaluator,
-        ParameterExtractionEvaluator,
-        ToolInvocationEvaluator,
-        ReasoningEfficiencyEvaluator
-    )
-
-    evaluators = [
-        GoalAlignmentEvaluator(llm=llm),
-        SemanticQualityEvaluator(llm=llm),
-        ToolSelectionEvaluator(llm=llm),
-        ParameterExtractionEvaluator(llm=llm),
-        ToolInvocationEvaluator(llm=llm),
-        ReasoningEfficiencyEvaluator(llm=llm),
-    ]
-
-    return AgentEvaluator(evaluators=evaluators, crew=crew)
--- a/src/crewai/experimental/init.py
+++ b/src/crewai/experimental/init.py
@@ -0,0 +1,40 @@
+from crewai.experimental.evaluation import (
+    BaseEvaluator,
+    EvaluationScore,
+    MetricCategory,
+    AgentEvaluationResult,
+    SemanticQualityEvaluator,
+    GoalAlignmentEvaluator,
+    ReasoningEfficiencyEvaluator,
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator,
+    EvaluationTraceCallback,
+    create_evaluation_callbacks,
+    AgentEvaluator,
+    create_default_evaluator,
+    ExperimentRunner,
+    ExperimentResults,
+    ExperimentResult,
+)
+
+
+__all__ = [
+    "BaseEvaluator",
+    "EvaluationScore",
+    "MetricCategory",
+    "AgentEvaluationResult",
+    "SemanticQualityEvaluator",
+    "GoalAlignmentEvaluator",
+    "ReasoningEfficiencyEvaluator",
+    "ToolSelectionEvaluator",
+    "ParameterExtractionEvaluator",
+    "ToolInvocationEvaluator",
+    "EvaluationTraceCallback",
+    "create_evaluation_callbacks",
+    "AgentEvaluator",
+    "create_default_evaluator",
+    "ExperimentRunner",
+    "ExperimentResults",
+    "ExperimentResult"
+]
--- a/src/crewai/experimental/evaluation/init.py
+++ b/src/crewai/experimental/evaluation/init.py
@@ -1,40 +1,35 @@
-from crewai.evaluation.base_evaluator import (
+from crewai.experimental.evaluation.base_evaluator import (
    BaseEvaluator,
    EvaluationScore,
    MetricCategory,
    AgentEvaluationResult
 )

-from crewai.evaluation.metrics.semantic_quality_metrics import (
-    SemanticQualityEvaluator
-)
-
-from crewai.evaluation.metrics.goal_metrics import (
-    GoalAlignmentEvaluator
-)
-
-from crewai.evaluation.metrics.reasoning_metrics import (
-    ReasoningEfficiencyEvaluator
-)
-
-
-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics import (
+    SemanticQualityEvaluator,
+    GoalAlignmentEvaluator,
+    ReasoningEfficiencyEvaluator,
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
 )

-from crewai.evaluation.evaluation_listener import (
+from crewai.experimental.evaluation.evaluation_listener import (
    EvaluationTraceCallback,
    create_evaluation_callbacks
 )

-
-from crewai.evaluation.agent_evaluator import (
+from crewai.experimental.evaluation.agent_evaluator import (
    AgentEvaluator,
    create_default_evaluator
 )

+from crewai.experimental.evaluation.experiment import (
+    ExperimentRunner,
+    ExperimentResults,
+    ExperimentResult
+)
+
 __all__ = [
    "BaseEvaluator",
    "EvaluationScore",
@@ -49,5 +44,8 @@ __all__ = [
    "EvaluationTraceCallback",
    "create_evaluation_callbacks",
    "AgentEvaluator",
-    "create_default_evaluator"
-]
+    "create_default_evaluator",
+    "ExperimentRunner",
+    "ExperimentResults",
+    "ExperimentResult"
+]
--- a/src/crewai/experimental/evaluation/agent_evaluator.py
+++ b/src/crewai/experimental/evaluation/agent_evaluator.py
@@ -0,0 +1,245 @@
+import threading
+from typing import Any
+
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
+from crewai.agent import Agent
+from crewai.task import Task
+from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
+from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
+from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
+from collections.abc import Sequence
+from crewai.utilities.events.crewai_event_bus import crewai_event_bus
+from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
+from crewai.utilities.events.task_events import TaskCompletedEvent
+from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
+from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, EvaluationScore, MetricCategory
+
+# Mutable bookkeeping used by AgentEvaluator. One instance is created lazily
+# per thread (see AgentEvaluator._execution_state), and a short-lived extra
+# instance is created per handled event just to carry the agent/task ids.
+class ExecutionState:
+    def __init__(self):
+        # Not read or written by any code visible in this module.
+        self.traces = {}
+        # Ids of the agent/task currently being evaluated; None until set.
+        self.current_agent_id: str | None = None
+        self.current_task_id: str | None = None
+        # Current iteration number; starts at 1, changed via set_iteration().
+        self.iteration = 1
+        # iteration -> agent role -> list of AgentEvaluationResult.
+        self.iterations_results = {}
+        # str(agent.id) -> evaluators registered for that agent.
+        self.agent_evaluators = {}
+
+# Evaluates agents by listening for completion events on the global event bus
+# and scoring each finished execution with every configured evaluator.
+# Results are stored per iteration and per agent role.
+#
+# NOTE(review): all mutable state lives in a threading.local, but the
+# agent -> evaluators registry is filled in only on the thread that runs
+# __init__. A handler invoked on any other thread gets a fresh, empty
+# ExecutionState and therefore skips evaluation without any message. Confirm
+# that events are always emitted on the constructing thread.
+class AgentEvaluator:
+    # agents: the agents whose executions should be evaluated.
+    # evaluators: metric evaluators applied to every one of those agents.
+    # NOTE(review): evaluate() asserts that evaluators is not None, so leaving
+    # the default raises AssertionError once a matching event is handled.
+    def __init__(
+        self,
+        agents: list[Agent],
+        evaluators: Sequence[BaseEvaluator] | None = None,
+    ):
+        self.agents: list[Agent] = agents
+        self.evaluators: Sequence[BaseEvaluator] | None = evaluators
+
+        self.callback = create_evaluation_callbacks()
+        self.console_formatter = ConsoleFormatter()
+        self.display_formatter = EvaluationDisplayFormatter()
+
+        self._thread_local: threading.local = threading.local()
+
+        # Registers every agent in the *current thread's* execution state.
+        for agent in self.agents:
+            self._execution_state.agent_evaluators[str(agent.id)] = self.evaluators
+
+        self._subscribe_to_events()
+
+    # Returns the calling thread's ExecutionState, creating it on first access.
+    @property
+    def _execution_state(self) -> ExecutionState:
+        if not hasattr(self._thread_local, 'execution_state'):
+            self._thread_local.execution_state = ExecutionState()
+        return self._thread_local.execution_state
+
+    # Registers the two completion handlers on the global event bus.
+    # The cast(Any, ...) presumably silences the type checker about passing
+    # bound methods as handlers — TODO confirm.
+    # No matching unregistration exists in this class.
+    def _subscribe_to_events(self) -> None:
+        from typing import cast
+        crewai_event_bus.register_handler(TaskCompletedEvent, cast(Any, self._handle_task_completed))
+        crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, cast(Any, self._handle_lite_agent_completed))
+
+    # Handles a completed task: if the task's agent is registered in the
+    # calling thread's state, evaluates it and records the result under the
+    # current iteration and the agent's role.
+    def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
+        assert event.task is not None
+        agent = event.task.agent
+        if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
+            # NOTE(review): a "started" event is emitted here and again inside
+            # evaluate() for each evaluator. If no trace is found below, this
+            # first event is never followed by a completed/failed event.
+            self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=str(event.task.id))
+
+            # Throwaway state that only carries the ids for this one event; it
+            # is separate from the per-thread state holding the results.
+            state = ExecutionState()
+            state.current_agent_id = str(agent.id)
+            state.current_task_id = str(event.task.id)
+
+            assert state.current_agent_id is not None and state.current_task_id is not None
+            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
+
+            # Without a trace there is nothing to score; skip without reporting.
+            if not trace:
+                return
+
+            result = self.evaluate(
+                agent=agent,
+                task=event.task,
+                execution_trace=trace,
+                final_output=event.output,
+                state=state
+            )
+
+            # Create the iteration and role buckets on demand, then append.
+            current_iteration = self._execution_state.iteration
+            if current_iteration not in self._execution_state.iterations_results:
+                self._execution_state.iterations_results[current_iteration] = {}
+
+            if agent.role not in self._execution_state.iterations_results[current_iteration]:
+                self._execution_state.iterations_results[current_iteration][agent.role] = []
+
+            self._execution_state.iterations_results[current_iteration][agent.role].append(result)
+
+    # Handles a completed lite-agent execution. The event carries only agent
+    # info, so the Agent object is looked up in self.agents by id, and no Task
+    # is passed to evaluate().
+    def _handle_lite_agent_completed(self, source: object, event: LiteAgentExecutionCompletedEvent) -> None:
+        agent_info = event.agent_info
+        agent_id = str(agent_info["id"])
+
+        if agent_id in self._execution_state.agent_evaluators:
+            state = ExecutionState()
+            state.current_agent_id = agent_id
+            # Fixed placeholder id; presumably the trace callback stores
+            # lite-agent traces under this key — verify against the listener.
+            state.current_task_id = "lite_task"
+
+            target_agent = None
+            for agent in self.agents:
+                if str(agent.id) == agent_id:
+                    target_agent = agent
+                    break
+
+            if not target_agent:
+                return
+
+            assert state.current_agent_id is not None and state.current_task_id is not None
+            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
+
+            if not trace:
+                return
+
+            result = self.evaluate(
+                agent=target_agent,
+                execution_trace=trace,
+                final_output=event.output,
+                state=state
+            )
+
+            # Same bucket-and-append logic as in _handle_task_completed.
+            current_iteration = self._execution_state.iteration
+            if current_iteration not in self._execution_state.iterations_results:
+                self._execution_state.iterations_results[current_iteration] = {}
+
+            agent_role = target_agent.role
+            if agent_role not in self._execution_state.iterations_results[current_iteration]:
+                self._execution_state.iterations_results[current_iteration][agent_role] = []
+
+            self._execution_state.iterations_results[current_iteration][agent_role].append(result)
+
+    # Sets the iteration number under which later results are recorded.
+    def set_iteration(self, iteration: int) -> None:
+        self._execution_state.iteration = iteration
+
+    # Discards all recorded results for the calling thread.
+    def reset_iterations_results(self) -> None:
+        self._execution_state.iterations_results = {}
+
+    # Returns the results of the current iteration only (agent role -> list),
+    # or an empty dict when nothing was recorded for it.
+    def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
+        if self._execution_state.iterations_results and self._execution_state.iteration in self._execution_state.iterations_results:
+            return self._execution_state.iterations_results[self._execution_state.iteration]
+        return {}
+
+    # Prints the summary for all recorded iterations.
+    def display_results_with_iterations(self) -> None:
+        self.display_formatter.display_summary_results(self._execution_state.iterations_results)
+
+    # Aggregates the current iteration's results per agent role using
+    # `strategy` and returns agent role -> aggregated result. As a side effect
+    # it prints the summary when the current iteration is the highest one
+    # recorded, and prints detailed feedback when include_evaluation_feedback
+    # is true.
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = True) -> dict[str, AgentAggregatedEvaluationResult]:
+        agent_results = {}
+        # NOTE(review): scoped_handlers() presumably isolates any handlers
+        # registered while aggregating/displaying — confirm against the bus.
+        with crewai_event_bus.scoped_handlers():
+            task_results = self.get_evaluation_results()
+            for agent_role, results in task_results.items():
+                if not results:
+                    continue
+
+                # Every result in a role bucket is taken to share one agent id.
+                agent_id = results[0].agent_id
+
+                aggregated_result = self.display_formatter._aggregate_agent_results(
+                    agent_id=agent_id,
+                    agent_role=agent_role,
+                    results=results,
+                    strategy=strategy
+                )
+
+                agent_results[agent_role] = aggregated_result
+
+
+            if self._execution_state.iterations_results and self._execution_state.iteration == max(self._execution_state.iterations_results.keys(), default=0):
+                self.display_results_with_iterations()
+
+            if include_evaluation_feedback:
+                self.display_evaluation_with_feedback()
+
+        return agent_results
+
+    # Prints per-metric feedback for all recorded iterations.
+    def display_evaluation_with_feedback(self) -> None:
+        self.display_formatter.display_evaluation_with_feedback(self._execution_state.iterations_results)
+
+    # Runs every configured evaluator against one execution and collects the
+    # scores into a single AgentEvaluationResult keyed by metric category.
+    # `state` supplies the agent/task ids stored on the result; `task` may be
+    # None (lite-agent executions). An evaluator that raises is reported via a
+    # failed event and a console message, and its metric is left out of the
+    # result; the remaining evaluators still run.
+    def evaluate(
+        self,
+        agent: Agent,
+        execution_trace: dict[str, Any],
+        final_output: Any,
+        state: ExecutionState,
+        task: Task | None = None,
+    ) -> AgentEvaluationResult:
+        result = AgentEvaluationResult(
+            agent_id=state.current_agent_id or str(agent.id),
+            task_id=state.current_task_id or (str(task.id) if task else "unknown_task")
+        )
+
+        assert self.evaluators is not None
+        # Events carry None as the task id when no Task object exists, even
+        # though the result above falls back to a placeholder string.
+        task_id = str(task.id) if task else None
+        for evaluator in self.evaluators:
+            try:
+                self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id)
+                score = evaluator.evaluate(
+                    agent=agent,
+                    task=task,
+                    execution_trace=execution_trace,
+                    final_output=final_output
+                )
+                result.metrics[evaluator.metric_category] = score
+                self.emit_evaluation_completed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, metric_category=evaluator.metric_category, score=score)
+            except Exception as e:
+                self.emit_evaluation_failed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, error=str(e))
+                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")
+
+        return result
+
+    # The three emit_* helpers publish evaluation lifecycle events on the
+    # global bus, each tagged with the calling thread's current iteration.
+    def emit_evaluation_started_event(self, agent_role: str, agent_id: str, task_id: str | None = None):
+        crewai_event_bus.emit(
+            self,
+            AgentEvaluationStartedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration)
+        )
+
+    def emit_evaluation_completed_event(self, agent_role: str, agent_id: str, task_id: str | None = None, metric_category: MetricCategory | None = None, score: EvaluationScore | None = None):
+        crewai_event_bus.emit(
+            self,
+            AgentEvaluationCompletedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, metric_category=metric_category, score=score)
+        )
+
+    def emit_evaluation_failed_event(self, agent_role: str, agent_id: str, error: str, task_id: str | None = None):
+        crewai_event_bus.emit(
+            self,
+            AgentEvaluationFailedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, error=error)
+        )
+
+def create_default_evaluator(agents: list[Agent], llm: Any | None = None) -> AgentEvaluator:
+    """Build an AgentEvaluator wired with the full default metric set.
+
+    Args:
+        agents: Agents whose executions should be evaluated.
+        llm: Value forwarded unchanged as ``llm=`` to every metric evaluator.
+            ``None`` leaves the choice of model to each evaluator. The
+            parameter was previously annotated as ``None``, which made type
+            checkers reject any call that actually supplied a model.
+
+    Returns:
+        An AgentEvaluator for ``agents`` using, in order, the goal-alignment,
+        semantic-quality, tool-selection, parameter-extraction,
+        tool-invocation and reasoning-efficiency evaluators.
+    """
+    # Imported at call time rather than module load; presumably this avoids
+    # import-cycle problems with the package ``__init__``, which itself
+    # imports this module.
+    from crewai.experimental.evaluation import (
+        GoalAlignmentEvaluator,
+        SemanticQualityEvaluator,
+        ToolSelectionEvaluator,
+        ParameterExtractionEvaluator,
+        ToolInvocationEvaluator,
+        ReasoningEfficiencyEvaluator
+    )
+
+    evaluators = [
+        GoalAlignmentEvaluator(llm=llm),
+        SemanticQualityEvaluator(llm=llm),
+        ToolSelectionEvaluator(llm=llm),
+        ParameterExtractionEvaluator(llm=llm),
+        ToolInvocationEvaluator(llm=llm),
+        ReasoningEfficiencyEvaluator(llm=llm),
+    ]
+
+    return AgentEvaluator(evaluators=evaluators, agents=agents)
--- a/src/crewai/experimental/evaluation/base_evaluator.py
+++ b/src/crewai/experimental/evaluation/base_evaluator.py
@@ -57,9 +57,9 @@ class BaseEvaluator(abc.ABC):
    def evaluate(
        self,
        agent: Agent,
-        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
+        task: Task | None = None,
    ) -> EvaluationScore:
        pass

--- a/src/crewai/experimental/evaluation/evaluation_display.py
+++ b/src/crewai/experimental/evaluation/evaluation_display.py
@@ -3,8 +3,8 @@ from typing import Dict, Any, List
 from rich.table import Table
 from rich.box import HEAVY_EDGE, ROUNDED
 from collections.abc import Sequence
-from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
-from crewai.evaluation import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
+from crewai.experimental.evaluation import EvaluationScore
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm

@@ -17,7 +17,6 @@ class EvaluationDisplayFormatter:
            self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
            return

-        # Get all agent roles across all iterations
        all_agent_roles: set[str] = set()
        for iter_results in iterations_results.values():
            all_agent_roles.update(iter_results.keys())
@@ -25,7 +24,6 @@ class EvaluationDisplayFormatter:
        for agent_role in sorted(all_agent_roles):
            self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]")

-            # Process each iteration
            for iter_num, results in sorted(iterations_results.items()):
                if agent_role not in results or not results[agent_role]:
                    continue
@@ -33,23 +31,19 @@ class EvaluationDisplayFormatter:
                agent_results = results[agent_role]
                agent_id = agent_results[0].agent_id

-                # Aggregate results for this agent in this iteration
                aggregated_result = self._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=agent_results,
                )

-                # Display iteration header
                self.console_formatter.print(f"\n[bold]Iteration {iter_num}[/bold]")

-                # Create table for this iteration
                table = Table(box=ROUNDED)
                table.add_column("Metric", style="cyan")
                table.add_column("Score (1-10)", justify="center")
                table.add_column("Feedback", style="green")

-                # Add metrics to table
                if aggregated_result.metrics:
                    for metric, evaluation_score in aggregated_result.metrics.items():
                        score = evaluation_score.score
@@ -91,7 +85,6 @@ class EvaluationDisplayFormatter:
                        "Overall agent evaluation score"
                    )

-                # Print the table for this iteration
                self.console_formatter.print(table)

    def display_summary_results(self, iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]]):
@@ -248,7 +241,6 @@ class EvaluationDisplayFormatter:
            feedback_summary = None
            if feedbacks:
                if len(feedbacks) > 1:
-                    # Use the summarization method for multiple feedbacks
                    feedback_summary = self._summarize_feedbacks(
                        agent_role=agent_role,
                        metric=category.title(),
@@ -307,7 +299,7 @@ class EvaluationDisplayFormatter:
                strategy_guidance = "Focus on the highest-scoring aspects and strengths demonstrated."
            elif strategy == AggregationStrategy.WORST_PERFORMANCE:
                strategy_guidance = "Focus on areas that need improvement and common issues across tasks."
-            else:  # Default/average strategies
+            else:
                strategy_guidance = "Provide a balanced analysis of strengths and weaknesses across all tasks."

            prompt = [
--- a/src/crewai/experimental/evaluation/evaluation_listener.py
+++ b/src/crewai/experimental/evaluation/evaluation_listener.py
@@ -9,7 +9,9 @@ from crewai.utilities.events.base_event_listener import BaseEventListener
 from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus
 from crewai.utilities.events.agent_events import (
    AgentExecutionStartedEvent,
-    AgentExecutionCompletedEvent
+    AgentExecutionCompletedEvent,
+    LiteAgentExecutionStartedEvent,
+    LiteAgentExecutionCompletedEvent
 )
 from crewai.utilities.events.tool_usage_events import (
    ToolUsageFinishedEvent,
@@ -52,10 +54,18 @@ class EvaluationTraceCallback(BaseEventListener):
        def on_agent_started(source, event: AgentExecutionStartedEvent):
            self.on_agent_start(event.agent, event.task)

+        @event_bus.on(LiteAgentExecutionStartedEvent)
+        def on_lite_agent_started(source, event: LiteAgentExecutionStartedEvent):
+            self.on_lite_agent_start(event.agent_info)
+
        @event_bus.on(AgentExecutionCompletedEvent)
        def on_agent_completed(source, event: AgentExecutionCompletedEvent):
            self.on_agent_finish(event.agent, event.task, event.output)

+        @event_bus.on(LiteAgentExecutionCompletedEvent)
+        def on_lite_agent_completed(source, event: LiteAgentExecutionCompletedEvent):
+            self.on_lite_agent_finish(event.output)
+
        @event_bus.on(ToolUsageFinishedEvent)
        def on_tool_completed(source, event: ToolUsageFinishedEvent):
            self.on_tool_use(event.tool_name, event.tool_args, event.output, success=True)
@@ -88,19 +98,38 @@ class EvaluationTraceCallback(BaseEventListener):
        def on_llm_call_completed(source, event: LLMCallCompletedEvent):
            self.on_llm_call_end(event.messages, event.response)

+    def on_lite_agent_start(self, agent_info: dict[str, Any]):
+        self.current_agent_id = agent_info['id']
+        self.current_task_id = "lite_task"
+
+        trace_key = f"{self.current_agent_id}_{self.current_task_id}"
+        self._init_trace(
+            trace_key=trace_key,
+            agent_id=self.current_agent_id,
+            task_id=self.current_task_id,
+            tool_uses=[],
+            llm_calls=[],
+            start_time=datetime.now(),
+            final_output=None
+        )
+
+    def _init_trace(self, trace_key: str, **kwargs: Any):
+        self.traces[trace_key] = kwargs
+
    def on_agent_start(self, agent: Agent, task: Task):
        self.current_agent_id = agent.id
        self.current_task_id = task.id

        trace_key = f"{agent.id}_{task.id}"
-        self.traces[trace_key] = {
-            "agent_id": agent.id,
-            "task_id": task.id,
-            "tool_uses": [],
-            "llm_calls": [],
-            "start_time": datetime.now(),
-            "final_output": None
-        }
+        self._init_trace(
+            trace_key=trace_key,
+            agent_id=agent.id,
+            task_id=task.id,
+            tool_uses=[],
+            llm_calls=[],
+            start_time=datetime.now(),
+            final_output=None
+        )

    def on_agent_finish(self, agent: Agent, task: Task, output: Any):
        trace_key = f"{agent.id}_{task.id}"
@@ -108,9 +137,20 @@ class EvaluationTraceCallback(BaseEventListener):
            self.traces[trace_key]["final_output"] = output
            self.traces[trace_key]["end_time"] = datetime.now()

+        self._reset_current()
+
+    def _reset_current(self):
        self.current_agent_id = None
        self.current_task_id = None

+    def on_lite_agent_finish(self, output: Any):
+        trace_key = f"{self.current_agent_id}_lite_task"
+        if trace_key in self.traces:
+            self.traces[trace_key]["final_output"] = output
+            self.traces[trace_key]["end_time"] = datetime.now()
+
+        self._reset_current()
+
    def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any,
                   success: bool = True, error_type: str | None = None):
        if not self.current_agent_id or not self.current_task_id:
@@ -187,4 +227,8 @@ class EvaluationTraceCallback(BaseEventListener):


 def create_evaluation_callbacks() -> EvaluationTraceCallback:
-    return EvaluationTraceCallback()
+    from crewai.utilities.events.crewai_event_bus import crewai_event_bus
+
+    callback = EvaluationTraceCallback()
+    callback.setup_listeners(crewai_event_bus)
+    return callback
--- a/src/crewai/experimental/evaluation/experiment/init.py
+++ b/src/crewai/experimental/evaluation/experiment/init.py
@@ -0,0 +1,8 @@
+from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
+from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
+
+__all__ = [
+    "ExperimentRunner",
+    "ExperimentResults",
+    "ExperimentResult"
+]
--- a/src/crewai/experimental/evaluation/experiment/result.py
+++ b/src/crewai/experimental/evaluation/experiment/result.py
@@ -0,0 +1,122 @@
+import json
+import os
+from datetime import datetime, timezone
+from typing import Any
+from pydantic import BaseModel
+
+class ExperimentResult(BaseModel):  # Outcome of one experiment test case.
+    identifier: str
+    inputs: dict[str, Any]
+    score: int | float | dict[str, int | float]  # a single averaged score may be fractional, so float must validate
+    expected_score: int | float | dict[str, int | float]  # same shapes as `score`
+    passed: bool
+    agent_evaluations: dict[str, Any] | None = None  # left out of ExperimentResults.to_json() output
+
+class ExperimentResults:
+    def __init__(self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None):
+        self.results = results
+        self.metadata = metadata or {}
+        self.timestamp = datetime.now(timezone.utc)
+
+        from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
+        self.display = ExperimentResultsDisplay()
+
+    def to_json(self, filepath: str | None = None) -> dict[str, Any]:
+        data = {
+            "timestamp": self.timestamp.isoformat(),
+            "metadata": self.metadata,
+            "results": [r.model_dump(exclude={"agent_evaluations"}) for r in self.results]
+        }
+
+        if filepath:
+            with open(filepath, 'w') as f:
+                json.dump(data, f, indent=2)
+            self.display.console.print(f"[green]Results saved to {filepath}[/green]")
+
+        return data
+
+    def compare_with_baseline(self, baseline_filepath: str, save_current: bool = True, print_summary: bool = False) -> dict[str, Any]:
+        """Compare this run with the newest run stored in ``baseline_filepath``; ``save_current`` also records this run there."""
+        baseline_runs = []
+        if os.path.exists(baseline_filepath) and os.path.getsize(baseline_filepath) > 0:
+            try:
+                with open(baseline_filepath, 'r') as f:
+                    baseline_data = json.load(f)
+                # The file holds either a single run (dict with a timestamp) or a history of runs (list).
+                if isinstance(baseline_data, dict) and "timestamp" in baseline_data:
+                    baseline_runs = [baseline_data]
+                elif isinstance(baseline_data, list):
+                    baseline_runs = baseline_data
+            except (json.JSONDecodeError, FileNotFoundError) as e:
+                self.display.console.print(f"[yellow]Warning: Could not load baseline file: {str(e)}[/yellow]")
+
+        if not baseline_runs:  # nothing to compare against: this run becomes the baseline
+            if save_current:
+                current_data = self.to_json()
+                with open(baseline_filepath, 'w') as f:
+                    json.dump([current_data], f, indent=2)
+                self.display.console.print(f"[green]Saved current results as new baseline to {baseline_filepath}[/green]")
+            return {"is_baseline": True, "changes": {}}
+
+        baseline_runs.sort(key=lambda x: x.get("timestamp", ""), reverse=True)  # newest first (string comparison)
+        latest_run = baseline_runs[0]
+
+        comparison = self._compare_with_run(latest_run)
+
+        if print_summary:
+            self.display.comparison_summary(comparison, latest_run.get("timestamp", "unknown"))  # tolerate runs without a timestamp
+
+        if save_current:
+            current_data = self.to_json()
+            baseline_runs.append(current_data)
+            with open(baseline_filepath, 'w') as f:
+                json.dump(baseline_runs, f, indent=2)
+            self.display.console.print(f"[green]Added current results to baseline file {baseline_filepath}[/green]")
+
+        return comparison
+
+    def _compare_with_run(self, baseline_run: dict[str, Any]) -> dict[str, Any]:
+        baseline_results = baseline_run.get("results", [])
+
+        baseline_lookup = {}
+        for result in baseline_results:
+            test_identifier = result.get("identifier")
+            if test_identifier:
+                baseline_lookup[test_identifier] = result
+
+        improved = []
+        regressed = []
+        unchanged = []
+        new_tests = []
+
+        for result in self.results:
+            test_identifier = result.identifier
+            if not test_identifier or test_identifier not in baseline_lookup:
+                new_tests.append(test_identifier)
+                continue
+
+            baseline_result = baseline_lookup[test_identifier]
+            baseline_passed = baseline_result.get("passed", False)
+            if result.passed and not baseline_passed:
+                improved.append(test_identifier)
+            elif not result.passed and baseline_passed:
+                regressed.append(test_identifier)
+            else:
+                unchanged.append(test_identifier)
+
+        missing_tests = []
+        current_test_identifiers = {result.identifier for result in self.results}
+        for result in baseline_results:
+            test_identifier = result.get("identifier")
+            if test_identifier and test_identifier not in current_test_identifiers:
+                missing_tests.append(test_identifier)
+
+        return {
+            "improved": improved,
+            "regressed": regressed,
+            "unchanged": unchanged,
+            "new_tests": new_tests,
+            "missing_tests": missing_tests,
+            "total_compared": len(improved) + len(regressed) + len(unchanged),
+            "baseline_timestamp": baseline_run.get("timestamp", "unknown")
+        }
--- a/src/crewai/experimental/evaluation/experiment/result_display.py
+++ b/src/crewai/experimental/evaluation/experiment/result_display.py
@@ -0,0 +1,70 @@
+from typing import Dict, Any
+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+from crewai.experimental.evaluation.experiment.result import ExperimentResults
+
+class ExperimentResultsDisplay:
+    def __init__(self):
+        self.console = Console()
+
+    def summary(self, experiment_results: ExperimentResults):
+        total = len(experiment_results.results)
+        passed = sum(1 for r in experiment_results.results if r.passed)
+
+        table = Table(title="Experiment Summary")
+        table.add_column("Metric", style="cyan")
+        table.add_column("Value", style="green")
+
+        table.add_row("Total Test Cases", str(total))
+        table.add_row("Passed", str(passed))
+        table.add_row("Failed", str(total - passed))
+        table.add_row("Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A")
+
+        self.console.print(table)
+
+    def comparison_summary(self, comparison: Dict[str, Any], baseline_timestamp: str):
+        self.console.print(Panel(f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]",
+                                 expand=False))
+
+        table = Table(title="Results Comparison")
+        table.add_column("Metric", style="cyan")
+        table.add_column("Count", style="white")
+        table.add_column("Details", style="dim")
+
+        improved = comparison.get("improved", [])
+        if improved:
+            details = ", ".join([f"{test_identifier}" for test_identifier in improved[:3]])
+            if len(improved) > 3:
+                details += f" and {len(improved) - 3} more"
+            table.add_row("✅ Improved", str(len(improved)), details)
+        else:
+            table.add_row("✅ Improved", "0", "")
+
+        regressed = comparison.get("regressed", [])
+        if regressed:
+            details = ", ".join([f"{test_identifier}" for test_identifier in regressed[:3]])
+            if len(regressed) > 3:
+                details += f" and {len(regressed) - 3} more"
+            table.add_row("❌ Regressed", str(len(regressed)), details, style="red")
+        else:
+            table.add_row("❌ Regressed", "0", "")
+
+        unchanged = comparison.get("unchanged", [])
+        table.add_row("⏺ Unchanged", str(len(unchanged)), "")
+
+        new_tests = comparison.get("new_tests", [])
+        if new_tests:
+            details = ", ".join(new_tests[:3])
+            if len(new_tests) > 3:
+                details += f" and {len(new_tests) - 3} more"
+            table.add_row("➕ New Tests", str(len(new_tests)), details)
+
+        missing_tests = comparison.get("missing_tests", [])
+        if missing_tests:
+            details = ", ".join(missing_tests[:3])
+            if len(missing_tests) > 3:
+                details += f" and {len(missing_tests) - 3} more"
+            table.add_row("➖ Missing Tests", str(len(missing_tests)), details)
+
+        self.console.print(table)
--- a/src/crewai/experimental/evaluation/experiment/runner.py
+++ b/src/crewai/experimental/evaluation/experiment/runner.py
@@ -0,0 +1,125 @@
+from collections import defaultdict
+from hashlib import md5
+from typing import Any
+
+from crewai import Crew, Agent
+from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
+from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
+from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+
+class ExperimentRunner:
+    def __init__(self, dataset: list[dict[str, Any]]):
+        self.dataset = dataset or []
+        self.evaluator: AgentEvaluator | None = None
+        self.display = ExperimentResultsDisplay()
+
+    def run(self, crew: Crew | None = None, agents: list[Agent] | None = None, print_summary: bool = False) -> ExperimentResults:
+        if crew and not agents:
+            agents = crew.agents
+
+        assert agents is not None
+        self.evaluator = create_default_evaluator(agents=agents)
+
+        results = []
+
+        for test_case in self.dataset:
+            self.evaluator.reset_iterations_results()
+            result = self._run_test_case(test_case=test_case, crew=crew, agents=agents)
+            results.append(result)
+
+        experiment_results = ExperimentResults(results)
+
+        if print_summary:
+            self.display.summary(experiment_results)
+
+        return experiment_results
+
+    def _run_test_case(self, test_case: dict[str, Any], agents: list[Agent], crew: Crew | None = None) -> ExperimentResult:
+        inputs = test_case["inputs"]
+        expected_score = test_case["expected_score"]
+        identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()
+
+        try:
+            self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]")
+            self.display.console.print("\n")
+            if crew:
+                crew.kickoff(inputs=inputs)
+            else:
+                for agent in agents:
+                    agent.kickoff(**inputs)
+
+            assert self.evaluator is not None
+            agent_evaluations = self.evaluator.get_agent_evaluation()
+
+            actual_score = self._extract_scores(agent_evaluations)
+
+            passed = self._assert_scores(expected_score, actual_score)
+            return ExperimentResult(
+                identifier=identifier,
+                inputs=inputs,
+                score=actual_score,
+                expected_score=expected_score,
+                passed=passed,
+                agent_evaluations=agent_evaluations
+            )
+
+        except Exception as e:
+            self.display.console.print(f"[red]Error running test case: {str(e)}[/red]")
+            return ExperimentResult(
+                identifier=identifier,
+                inputs=inputs,
+                score=0,
+                expected_score=expected_score,
+                passed=False
+            )
+
+    def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str,  float]:
+        all_scores: dict[str, list[float]] = defaultdict(list)
+        for evaluation in agent_evaluations.values():
+            for metric_name, score in evaluation.metrics.items():
+                if score.score is not None:
+                    all_scores[metric_name.value].append(score.score)
+
+        avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()}
+
+        if len(avg_scores) == 1:
+            return list(avg_scores.values())[0]
+
+        return avg_scores
+
+    def _assert_scores(self, expected: float | dict[str, float],
+                        actual: float | dict[str, float]) -> bool:
+        """
+        Compare expected and actual scores, and return whether the test case passed.
+
+        The rules for comparison are as follows:
+        - If both expected and actual scores are single numbers, the actual score must be >= expected.
+        - If expected is a single number and actual is a dict, the average of actual values must be >= expected (an empty dict fails).
+        - If expected is a dict and actual is a single number, actual must be >= all expected values.
+        - If both are dicts, only keys present in both are compared (each actual >= expected); an empty expected passes, otherwise no shared key fails.
+        """
+
+        if isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
+            return actual >= expected
+
+        if isinstance(expected, dict) and isinstance(actual, (int, float)):
+            return all(actual >= exp_score for exp_score in expected.values())
+
+        if isinstance(expected, (int, float)) and isinstance(actual, dict):
+            if not actual:
+                return False
+            avg_score = sum(actual.values()) / len(actual)
+            return avg_score >= expected
+
+        if isinstance(expected, dict) and isinstance(actual, dict):
+            if not expected:
+                return True
+            matching_keys = set(expected.keys()) & set(actual.keys())
+            if not matching_keys:
+                return False
+
+            # All matching keys must have actual >= expected
+            return all(actual[key] >= expected[key] for key in matching_keys)
+
+        return False  # neither argument combination above matched
--- a/src/crewai/experimental/evaluation/json_parser.py
+++ b/src/crewai/experimental/evaluation/json_parser.py
--- a/src/crewai/experimental/evaluation/metrics/init.py
+++ b/src/crewai/experimental/evaluation/metrics/init.py
@@ -0,0 +1,26 @@
+from crewai.experimental.evaluation.metrics.reasoning_metrics import (
+    ReasoningEfficiencyEvaluator
+)
+
+from crewai.experimental.evaluation.metrics.tools_metrics import (
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator
+)
+
+from crewai.experimental.evaluation.metrics.goal_metrics import (
+    GoalAlignmentEvaluator
+)
+
+from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
+    SemanticQualityEvaluator
+)
+
+__all__ = [
+    "ReasoningEfficiencyEvaluator",
+    "ToolSelectionEvaluator",
+    "ParameterExtractionEvaluator",
+    "ToolInvocationEvaluator",
+    "GoalAlignmentEvaluator",
+    "SemanticQualityEvaluator"
+]
--- a/src/crewai/experimental/evaluation/metrics/goal_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/goal_metrics.py
@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

 class GoalAlignmentEvaluator(BaseEvaluator):
    @property
@@ -14,10 +14,14 @@ class GoalAlignmentEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
-        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
+        task: Task | None = None,
    ) -> EvaluationScore:
+        task_context = ""
+        if task is not None:
+            task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
+
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.

@@ -37,8 +41,7 @@ Return your evaluation as JSON with fields 'score' (number) and 'feedback' (stri
            {"role": "user", "content": f"""
 Agent role: {agent.role}
 Agent goal: {agent.goal}
-Task description: {task.description}
-Expected output: {task.expected_output}
+{task_context}

 Agent's final output:
 {final_output}
--- a/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py
@@ -16,8 +16,8 @@ from collections.abc import Sequence
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.tasks.task_output import TaskOutput

 class ReasoningPatternType(Enum):
@@ -36,10 +36,14 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
-        task: Task,
        execution_trace: Dict[str, Any],
-        final_output: TaskOutput,
+        final_output: TaskOutput | str,
+        task: Task | None = None,
    ) -> EvaluationScore:
+        task_context = ""
+        if task is not None:
+            task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
+
        llm_calls = execution_trace.get("llm_calls", [])

        if not llm_calls or len(llm_calls) < 2:
@@ -83,6 +87,8 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):

        call_samples = self._get_call_samples(llm_calls)

+        final_output = final_output.raw if isinstance(final_output, TaskOutput) else final_output
+
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process.

@@ -117,7 +123,7 @@ Return your evaluation as JSON with the following structure:
 }"""},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-Task description: {task.description}
+{task_context}

 Reasoning efficiency metrics:
 - Total LLM calls: {efficiency_metrics["total_llm_calls"]}
@@ -130,7 +136,7 @@ Sample of agent reasoning flow (chronological sequence):
 {call_samples}

 Agent's final output:
-{final_output.raw[:500]}... (truncated)
+{final_output[:500]}... (truncated)

 Evaluate the reasoning efficiency of this agent based on these interaction patterns.
 Identify any inefficient reasoning patterns and provide specific suggestions for optimization.
--- a/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py
@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

 class SemanticQualityEvaluator(BaseEvaluator):
    @property
@@ -14,10 +14,13 @@ class SemanticQualityEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
-        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
+        task: Task | None = None,
    ) -> EvaluationScore:
+        task_context = ""
+        if task is not None:
+            task_context = f"Task description: {task.description}"
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.

@@ -37,7 +40,7 @@ Return your evaluation as JSON with fields 'score' (number) and 'feedback' (stri
 """},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-Task description: {task.description}
+{task_context}

 Agent's final output:
 {final_output}
--- a/src/crewai/experimental/evaluation/metrics/tools_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/tools_metrics.py
@@ -1,8 +1,8 @@
 import json
 from typing import Dict, Any

-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.agent import Agent
 from crewai.task import Task

@@ -16,10 +16,14 @@ class ToolSelectionEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
-        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
+        task: Task | None = None,
    ) -> EvaluationScore:
+        task_context = ""
+        if task is not None:
+            task_context = f"Task description: {task.description}"
+
        tool_uses = execution_trace.get("tool_uses", [])
        tool_count = len(tool_uses)
        unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])
@@ -72,7 +76,7 @@ Return your evaluation as JSON with these fields:
 """},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-Task description: {task.description}
+{task_context}

 Available tools for this agent:
 {available_tools_info}
@@ -128,10 +132,13 @@ class ParameterExtractionEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
-        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
+        task: Task | None = None,
    ) -> EvaluationScore:
+        task_context = ""
+        if task is not None:
+            task_context = f"Task description: {task.description}"
        tool_uses = execution_trace.get("tool_uses", [])
        tool_count = len(tool_uses)

@@ -212,7 +219,7 @@ Return your evaluation as JSON with these fields:
 """},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-Task description: {task.description}
+{task_context}

 Parameter extraction examples:
 {param_samples_text}
@@ -267,10 +274,13 @@ class ToolInvocationEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
-        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
+        task: Task | None = None,
    ) -> EvaluationScore:
+        task_context = ""
+        if task is not None:
+            task_context = f"Task description: {task.description}"
        tool_uses = execution_trace.get("tool_uses", [])
        tool_errors = []
        tool_count = len(tool_uses)
@@ -352,7 +362,7 @@ Return your evaluation as JSON with these fields:
 """},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-Task description: {task.description}
+{task_context}

 Tool invocation examples:
 {invocation_samples_text}
--- a/src/crewai/experimental/evaluation/testing.py
+++ b/src/crewai/experimental/evaluation/testing.py
@@ -0,0 +1,52 @@
+import inspect
+
+from typing_extensions import Any
+import warnings
+from crewai.experimental.evaluation.experiment import ExperimentResults, ExperimentRunner
+from crewai import Crew, Agent
+
+def assert_experiment_successfully(experiment_results: ExperimentResults, baseline_filepath: str | None = None) -> None:
+    """Fail if any experiment result did not pass, then check for baseline regressions.
+
+    Raises AssertionError listing each failed result's identifier, expected score
+    and actual score. A falsy ``baseline_filepath`` is replaced by the fallback name.
+    """
+    failure_details = "\n".join(
+        f"- {item.identifier}: expected {item.expected_score}, got {item.score}"
+        for item in experiment_results.results if not item.passed
+    )
+    if failure_details:
+        raise AssertionError(f"The following test cases failed:\n{failure_details}")
+
+    # Keep this call inline: the fallback inspects the stack relative to this frame.
+    baseline_filepath = baseline_filepath or _get_baseline_filepath_fallback()
+    comparison = experiment_results.compare_with_baseline(baseline_filepath=baseline_filepath)
+    assert_experiment_no_regression(comparison)
+
+def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) -> None:
+    """Raise on regressed tests; warn (at the caller's location) about missing ones."""
+    regressed = comparison_result.get("regressed", [])
+    if regressed:
+        raise AssertionError(f"Regression detected! The following tests that previously passed now fail: {regressed}")
+
+    missing_tests = comparison_result.get("missing_tests", [])
+    if missing_tests:
+        message = f"Warning: {len(missing_tests)} tests from the baseline are missing in the current run: {missing_tests}"
+        # stacklevel=2 attributes the warning to the caller rather than this helper.
+        warnings.warn(message, UserWarning, stacklevel=2)
+
+def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agents: list[Agent] | None = None, verbose: bool = False) -> ExperimentResults:
+    """Run ``dataset`` through an ``ExperimentRunner``; ``verbose`` is passed as ``print_summary``."""
+    experiment = ExperimentRunner(dataset=dataset)
+    return experiment.run(agents=agents, crew=crew, print_summary=verbose)
+
+def _get_baseline_filepath_fallback() -> str:
+    """Build ``<caller>_results.json`` from the function two frames above this one.
+
+    Falls back to ``experiment_fallback_results.json`` when that frame is unavailable.
+    """
+    frame = inspect.currentframe()
+    for _ in range(2):  # hop: this helper -> its caller -> the caller's caller
+        frame = frame.f_back if frame is not None else None
+    test_func_name = frame.f_code.co_name if frame is not None else "experiment_fallback"
+    return f"{test_func_name}_results.json"
--- a/src/crewai/flow/flow.py
+++ b/src/crewai/flow/flow.py
@@ -446,20 +446,15 @@ class Flow(Generic[T], metaclass=FlowMeta):

    def __init__(
        self,
-        initial_state: Union[Type[T], T, None] = None,
        persistence: Optional[FlowPersistence] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize a new Flow instance.

        Args:
-            initial_state: Initial state for the flow (BaseModel instance or dict)
            persistence: Optional persistence backend for storing flow states
            **kwargs: Additional state values to initialize or override
        """
-        # Set the initial_state for this instance
-        if initial_state is not None:
-            self.initial_state = initial_state
        # Initialize basic instance attributes
        self._methods: Dict[str, Callable] = {}
        self._method_execution_counts: Dict[str, int] = {}
@@ -557,21 +552,25 @@ class Flow(Generic[T], metaclass=FlowMeta):
        # Handle BaseModel instance case
        if isinstance(self.initial_state, BaseModel):
            model = cast(BaseModel, self.initial_state)
-            
-            # Create copy of the BaseModel to avoid mutations
-            if hasattr(model, "model_copy"):
+            if not hasattr(model, "id"):
+                raise ValueError("Flow state model must have an 'id' field")
+
+            # Create new instance with same values to avoid mutations
+            if hasattr(model, "model_dump"):
                # Pydantic v2
-                return cast(T, model.model_copy())
-            elif hasattr(model, "copy"):
+                state_dict = model.model_dump()
+            elif hasattr(model, "dict"):
                # Pydantic v1
-                return cast(T, model.copy())
+                state_dict = model.dict()
            else:
-                # Fallback for other BaseModel implementations - preserve original logic
+                # Fallback for other BaseModel implementations
                state_dict = {
                    k: v for k, v in model.__dict__.items() if not k.startswith("_")
                }
-                model_class = type(model)
-                return cast(T, model_class(**state_dict))
+
+            # Create new instance of the same class
+            model_class = type(model)
+            return cast(T, model_class(**state_dict))
        raise TypeError(
            f"Initial state must be dict or BaseModel, got {type(self.initial_state)}"
        )
@@ -646,26 +645,30 @@ class Flow(Generic[T], metaclass=FlowMeta):
            # For BaseModel states, preserve existing fields unless overridden
            try:
                model = cast(BaseModel, self._state)
-                
-                if hasattr(model, "model_copy"):
-                    # Pydantic v2
-                    self._state = cast(T, model.model_copy(update=inputs))
-                elif hasattr(model, "copy"):
-                    # Pydantic v1
-                    self._state = cast(T, model.copy(update=inputs))
+                # Get current state as dict
+                if hasattr(model, "model_dump"):
+                    current_state = model.model_dump()
+                elif hasattr(model, "dict"):
+                    current_state = model.dict()
                else:
-                    # Fallback for other BaseModel implementations - preserve original logic
                    current_state = {
                        k: v for k, v in model.__dict__.items() if not k.startswith("_")
                    }
-                    new_state = {**current_state, **inputs}
-                    model_class = type(model)
-                    if hasattr(model_class, "model_validate"):
-                        self._state = cast(T, model_class.model_validate(new_state))
-                    elif hasattr(model_class, "parse_obj"):
-                        self._state = cast(T, model_class.parse_obj(new_state))
-                    else:
-                        self._state = cast(T, model_class(**new_state))
+
+                # Create new state with preserved fields and updates
+                new_state = {**current_state, **inputs}
+
+                # Create new instance with merged state
+                model_class = type(model)
+                if hasattr(model_class, "model_validate"):
+                    # Pydantic v2
+                    self._state = cast(T, model_class.model_validate(new_state))
+                elif hasattr(model_class, "parse_obj"):
+                    # Pydantic v1
+                    self._state = cast(T, model_class.parse_obj(new_state))
+                else:
+                    # Fallback for other BaseModel implementations
+                    self._state = cast(T, model_class(**new_state))
            except ValidationError as e:
                raise ValueError(f"Invalid inputs for structured state: {e}") from e
        else:
--- a/src/crewai/lite_agent.py
+++ b/src/crewai/lite_agent.py
@@ -305,6 +305,7 @@ class LiteAgent(FlowTrackable, BaseModel):
        """
        # Create agent info for event emission
        agent_info = {
+            "id": self.id,
            "role": self.role,
            "goal": self.goal,
            "backstory": self.backstory,
--- a/src/crewai/llm.py
+++ b/src/crewai/llm.py
@@ -311,6 +311,7 @@ class LLM(BaseLLM):
        callbacks: List[Any] = [],
        reasoning_effort: Optional[Literal["none", "low", "medium", "high"]] = None,
        stream: bool = False,
+        extra_headers: Optional[Dict[str, str]] = None,
        **kwargs,
    ):
        self.model = model
@@ -337,6 +338,7 @@ class LLM(BaseLLM):
        self.additional_params = kwargs
        self.is_anthropic = self._is_anthropic_model(model)
        self.stream = stream
+        self.extra_headers = extra_headers

        litellm.drop_params = True

@@ -408,6 +410,7 @@ class LLM(BaseLLM):
            "stream": self.stream,
            "tools": tools,
            "reasoning_effort": self.reasoning_effort,
+            "extra_headers": self.extra_headers,
            **self.additional_params,
        }

--- a/src/crewai/task.py
+++ b/src/crewai/task.py
@@ -67,6 +67,7 @@ class Task(BaseModel):
        description: Descriptive text detailing task's purpose and execution.
        expected_output: Clear definition of expected task outcome.
        output_file: File path for storing task output.
+        create_directory: Whether to create the directory for output_file if it doesn't exist.
        output_json: Pydantic model for structuring JSON output.
        output_pydantic: Pydantic model for task output.
        security_config: Security configuration including fingerprinting.
@@ -115,6 +116,10 @@ class Task(BaseModel):
        description="A file path to be used to create a file output.",
        default=None,
    )
+    create_directory: Optional[bool] = Field(
+        description="Whether to create the directory for output_file if it doesn't exist.",
+        default=True,
+    )
    output: Optional[TaskOutput] = Field(
        description="Task output, it's final result after being executed", default=None
    )
@@ -753,8 +758,10 @@ Follow these guidelines:
            resolved_path = Path(self.output_file).expanduser().resolve()
            directory = resolved_path.parent

-            if not directory.exists():
+            if self.create_directory and not directory.exists():
                directory.mkdir(parents=True, exist_ok=True)
+            elif not self.create_directory and not directory.exists():
+                raise RuntimeError(f"Directory {directory} does not exist and create_directory is False")

            with resolved_path.open("w", encoding="utf-8") as file:
                if isinstance(result, dict):
--- a/src/crewai/utilities/events/init.py
+++ b/src/crewai/utilities/events/init.py
@@ -17,6 +17,9 @@ from .agent_events import (
    AgentExecutionStartedEvent,
    AgentExecutionCompletedEvent,
    AgentExecutionErrorEvent,
+    AgentEvaluationStartedEvent,
+    AgentEvaluationCompletedEvent,
+    AgentEvaluationFailedEvent,
 )
 from .task_events import (
    TaskStartedEvent,
@@ -74,6 +77,9 @@ __all__ = [
    "AgentExecutionStartedEvent",
    "AgentExecutionCompletedEvent",
    "AgentExecutionErrorEvent",
+    "AgentEvaluationStartedEvent",
+    "AgentEvaluationCompletedEvent",
+    "AgentEvaluationFailedEvent",
    "TaskStartedEvent",
    "TaskCompletedEvent",
    "TaskFailedEvent",
--- a/src/crewai/utilities/events/agent_events.py
+++ b/src/crewai/utilities/events/agent_events.py
@@ -123,3 +123,28 @@ class AgentLogsExecutionEvent(BaseEvent):
    type: str = "agent_logs_execution"

    model_config = {"arbitrary_types_allowed": True}
+
+# Agent Eval events
+class AgentEvaluationStartedEvent(BaseEvent):  # event whose `type` defaults to "agent_evaluation_started"
+    agent_id: str  # no default declared
+    agent_role: str  # no default declared
+    task_id: str | None = None  # optional; defaults to None
+    iteration: int  # NOTE(review): presumably the evaluation iteration counter — confirm with the emitter
+    type: str = "agent_evaluation_started"
+
+class AgentEvaluationCompletedEvent(BaseEvent):  # event whose `type` defaults to "agent_evaluation_completed"
+    agent_id: str  # no default declared
+    agent_role: str  # no default declared
+    task_id: str | None = None  # optional; defaults to None
+    iteration: int  # NOTE(review): presumably the evaluation iteration counter — confirm with the emitter
+    metric_category: Any  # annotated as Any; NOTE(review): concrete type not visible here — confirm
+    score: Any  # annotated as Any; NOTE(review): concrete type not visible here — confirm
+    type: str = "agent_evaluation_completed"
+
+class AgentEvaluationFailedEvent(BaseEvent):  # event whose `type` defaults to "agent_evaluation_failed"
+    agent_id: str  # no default declared
+    agent_role: str  # no default declared
+    task_id: str | None = None  # optional; defaults to None
+    iteration: int  # NOTE(review): presumably the evaluation iteration counter — confirm with the emitter
+    error: str  # failure description carried as a string
+    type: str = "agent_evaluation_failed"
--- a/src/crewai/utilities/events/event_types.py
+++ b/src/crewai/utilities/events/event_types.py
@@ -4,6 +4,7 @@ from .agent_events import (
    AgentExecutionCompletedEvent,
    AgentExecutionErrorEvent,
    AgentExecutionStartedEvent,
+    LiteAgentExecutionCompletedEvent,
 )
 from .crew_events import (
    CrewKickoffCompletedEvent,
@@ -80,6 +81,7 @@ EventTypes = Union[
    CrewTrainFailedEvent,
    AgentExecutionStartedEvent,
    AgentExecutionCompletedEvent,
+    LiteAgentExecutionCompletedEvent,
    TaskStartedEvent,
    TaskCompletedEvent,
    TaskFailedEvent,
--- a/tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
@@ -0,0 +1,237 @@
+interactions:
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
+      created for testing purposes\nYour personal goal is: Complete test tasks successfully\n\nTo
+      give my best complete final answer to the task respond using the exact following
+      format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
+      answer must be the great and the most complete as possible, it must be outcome
+      described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
+      "content": "Complete this task successfully"}], "model": "gpt-4o-mini", "stop":
+      ["\nObservation:"]}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate, zstd
+      connection:
+      - keep-alive
+      content-length:
+      - '583'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 1.93.0
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.93.0
+      x-stainless-raw-response:
+      - 'true'
+      x-stainless-read-timeout:
+      - '600.0'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.11.12
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: !!binary |
+        H4sIAAAAAAAAAwAAAP//jFNNb9swDL3nVxA6J0U+HKTNbd0woMAOw7Bu6LbCUCXa1iqLgkgnzYr8
+        98FKWqdbB+wiQHx81OMj9TgCUM6qNSjTaDFt9JNL+TZ7N/dfrusPN01NyV6vPk3f/mrl5vLrXI17
+        Bt39RCNPrDNDbfQojsIBNgm1YF91tlrOl+fzxXKWgZYs+p5WR5kUNGldcJP5dF5MpqvJ7PzIbsgZ
+        ZLWG7yMAgMd89jqDxQe1hun4KdIis65RrZ+TAFQi30eUZnYsOogaD6ChIBiy9M8NdXUja7iCQFsw
+        OkDtNgga6l4/6MBbTAA/wnsXtIc3+b6Gjx41I8REG2cRWoStkwakQeCIxlXOgEXRzjNQgvzigwBV
+        OUU038OOOgiIFhr0MdPHoIOFK9g67wEDdwlBCI7OIjgB7oxB5qrzfpeznxRokIZS3wwk5EiB8ey0
+        54RVx7r3PXTenwA6BBLdzy27fXtE9s/+eqpjojv+g6oqFxw3ZULNFHovWSiqjO5HALd5jt2L0aiY
+        qI1SCt1jfu7i4lBODdszgEVxBIVE+yE+KxbjV8qVR79PFkEZbRq0A3XYGt1ZRyfA6KTpv9W8VvvQ
+        uAv1/5QfAGMwCtoyJrTOvOx4SEvYf65/pT2bnAUrxrRxBktxmPpBWKx05w8rr3jHgm1ZuVBjiskd
+        9r6K5aLQy0LjxcKo0X70GwAA//8DAMz2wVUFBAAA
+    headers:
+      CF-RAY:
+      - 95f93ea9af627e0b-GRU
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - gzip
+      Content-Type:
+      - application/json
+      Date:
+      - Tue, 15 Jul 2025 12:25:54 GMT
+      Server:
+      - cloudflare
+      Set-Cookie:
+      - __cf_bm=GRZmZLrjW5ZRHNmUJa4ccrMcy20D1rmeqK6Ptlv0mRY-1752582354-1.0.1.1-xKd_yga48Eedech5TRlThlEpDgsB2whxkWHlCyAGOVMqMcvH1Ju9FdXYbuQ9NdUQcVxPLgiGM35lYhqSLVQiXDyK01dnyp2Gvm560FBN9DY;
+        path=/; expires=Tue, 15-Jul-25 12:55:54 GMT; domain=.api.openai.com; HttpOnly;
+        Secure; SameSite=None
+      - _cfuvid=MYqswpSR7sqr4kGp6qZVkaL7HDYwMiww49PeN9QBP.A-1752582354973-0.0.1.1-604800000;
+        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - crewai-iuxna1
+      openai-processing-ms:
+      - '4047'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-envoy-upstream-service-time:
+      - '4440'
+      x-ratelimit-limit-requests:
+      - '30000'
+      x-ratelimit-limit-tokens:
+      - '150000000'
+      x-ratelimit-remaining-requests:
+      - '29999'
+      x-ratelimit-remaining-tokens:
+      - '149999885'
+      x-ratelimit-reset-requests:
+      - 2ms
+      x-ratelimit-reset-tokens:
+      - 0s
+      x-request-id:
+      - req_5704c0f206a927ddc12aa1a19b612a75
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are an expert evaluator
+      assessing how well an AI agent''s output aligns with its assigned task goal.\n\nScore
+      the agent''s goal alignment on a scale from 0-10 where:\n- 0: Complete misalignment,
+      agent did not understand or attempt the task goal\n- 5: Partial alignment, agent
+      attempted the task but missed key requirements\n- 10: Perfect alignment, agent
+      fully satisfied all task requirements\n\nConsider:\n1. Did the agent correctly
+      interpret the task goal?\n2. Did the final output directly address the requirements?\n3.
+      Did the agent focus on relevant aspects of the task?\n4. Did the agent provide
+      all requested information or deliverables?\n\nReturn your evaluation as JSON
+      with fields ''score'' (number) and ''feedback'' (string).\n"}, {"role": "user",
+      "content": "\nAgent role: Test Agent\nAgent goal: Complete test tasks successfully\n\n\nAgent''s
+      final output:\nPlease provide me with the specific details or context of the
+      task you need help with, and I will ensure to complete it successfully and provide
+      a thorough response.\n\nEvaluate how well the agent''s output aligns with the
+      assigned task goal.\n"}], "model": "gpt-4o-mini", "stop": []}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate, zstd
+      connection:
+      - keep-alive
+      content-length:
+      - '1196'
+      content-type:
+      - application/json
+      cookie:
+      - __cf_bm=GRZmZLrjW5ZRHNmUJa4ccrMcy20D1rmeqK6Ptlv0mRY-1752582354-1.0.1.1-xKd_yga48Eedech5TRlThlEpDgsB2whxkWHlCyAGOVMqMcvH1Ju9FdXYbuQ9NdUQcVxPLgiGM35lYhqSLVQiXDyK01dnyp2Gvm560FBN9DY;
+        _cfuvid=MYqswpSR7sqr4kGp6qZVkaL7HDYwMiww49PeN9QBP.A-1752582354973-0.0.1.1-604800000
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 1.93.0
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.93.0
+      x-stainless-raw-response:
+      - 'true'
+      x-stainless-read-timeout:
+      - '600.0'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.11.12
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: !!binary |
+        H4sIAAAAAAAAA4xUy27bQAy8+yuIPdtGbMdN4FvbSxM0QIsEKNA6MJhdSmK82hWWVFwj8L8XKz/k
+        9AH0ogOHnOFjVq8DAMPOLMDYCtXWjR990O+TT7dfZs/v5OtFy/ef7++mxfu7j83t/cONGeaK+PRM
+        Vo9VYxvrxpNyDHvYJkKlzDq5mk/n19PZfN4BdXTkc1nZ6OgyjmoOPJpeTC9HF1ejyfWhuopsScwC
+        fgwAAF67b+4zOPppFnAxPEZqEsGSzOKUBGBS9DliUIRFMagZ9qCNQSl0rb8uA8DSiI2JlmYB0+E+
+        UBC5J7TrHFuah4oASwoKjh2EqOCojkE0oRIgWE+YoA2OUhZzHEqIBWhFoChrKCP6IWwqthWwgEY4
+        bItASbRLEpDWWhIpWu+3Y7gJooRuCKyAsiYHRUxQx0TgSJG9DIGDY4ua5RA82nVW5cDKqPxCWYhC
+        iSXBhrU69TOGbxV7ysxSxY0Awoa951AGkq69/do67QLZk8vBJsUXdgQYtoBWW/SQSJoYpFPq2Ptp
+        MLjTttC51DFXVIPjRFb9drw0y7A7v0uiohXM3git92cAhhAVs7c6RzwekN3JAz6WTYpP8lupKTiw
+        VKtEKDHke4vGxnTobgDw2HmtfWMf06RYN7rSuKZObjo7eM30Fu/R6yOoUdH38dnkCLzhWx1ud+ZW
+        Y9FW5PrS3trYOo5nwOBs6j+7+Rv3fnIO5f/Q94C11Ci5VZPIsX07cZ+WKP8B/pV22nLXsBFKL2xp
+        pUwpX8JRga3fv0sjW1GqVwWHklKTuHuc+ZKD3eAXAAAA//8DADksFsafBAAA
+    headers:
+      CF-RAY:
+      - 95f93ec73a1c7e0b-GRU
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - gzip
+      Content-Type:
+      - application/json
+      Date:
+      - Tue, 15 Jul 2025 12:25:57 GMT
+      Server:
+      - cloudflare
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - crewai-iuxna1
+      openai-processing-ms:
+      - '1544'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-envoy-upstream-service-time:
+      - '1546'
+      x-ratelimit-limit-requests:
+      - '30000'
+      x-ratelimit-limit-tokens:
+      - '150000000'
+      x-ratelimit-remaining-requests:
+      - '29999'
+      x-ratelimit-remaining-tokens:
+      - '149999732'
+      x-ratelimit-reset-requests:
+      - 2ms
+      x-ratelimit-reset-tokens:
+      - 0s
+      x-request-id:
+      - req_44930ba12ad8d1e3f0beed1d5e3d8b0c
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/tests/cassettes/TestAgentEvaluator.test_eval_specific_agents_from_crew.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_eval_specific_agents_from_crew.yaml
--- a/tests/cassettes/TestAgentEvaluator.test_evaluate_current_iteration.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_evaluate_current_iteration.yaml
@@ -427,4 +427,140 @@ interactions:
    status:
      code: 200
      message: OK
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are an expert evaluator
+      assessing how well an AI agent''s output aligns with its assigned task goal.\n\nScore
+      the agent''s goal alignment on a scale from 0-10 where:\n- 0: Complete misalignment,
+      agent did not understand or attempt the task goal\n- 5: Partial alignment, agent
+      attempted the task but missed key requirements\n- 10: Perfect alignment, agent
+      fully satisfied all task requirements\n\nConsider:\n1. Did the agent correctly
+      interpret the task goal?\n2. Did the final output directly address the requirements?\n3.
+      Did the agent focus on relevant aspects of the task?\n4. Did the agent provide
+      all requested information or deliverables?\n\nReturn your evaluation as JSON
+      with fields ''score'' (number) and ''feedback'' (string).\n"}, {"role": "user",
+      "content": "\nAgent role: Test Agent\nAgent goal: Complete test tasks successfully\nTask
+      description: Test task description\nExpected output: Expected test output\n\nAgent''s
+      final output:\nThe expected test output is a comprehensive document that outlines
+      the specific parameters and criteria that define success for the task at hand.
+      It should include detailed descriptions of the tasks, the goals that need to
+      be achieved, and any specific formatting or structural requirements necessary
+      for the output. Each component of the task must be analyzed and addressed, providing
+      context as well as examples where applicable. Additionally, any tools or methodologies
+      that are relevant to executing the tasks successfully should be outlined, including
+      any potential risks or challenges that may arise during the process. This document
+      serves as a guiding framework to ensure that all aspects of the task are thoroughly
+      considered and executed to meet the high standards expected.\n\nEvaluate how
+      well the agent''s output aligns with the assigned task goal.\n"}], "model":
+      "gpt-4o-mini", "stop": []}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate, zstd
+      connection:
+      - keep-alive
+      content-length:
+      - '1893'
+      content-type:
+      - application/json
+      cookie:
+      - _cfuvid=XwsgBfgvDGlKFQ4LiGYGIARIoSNTiwidqoo9UZcc.XY-1752087999227-0.0.1.1-604800000
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 1.93.0
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.93.0
+      x-stainless-raw-response:
+      - 'true'
+      x-stainless-read-timeout:
+      - '600.0'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.11.12
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: !!binary |
+        H4sIAAAAAAAAAwAAAP//jFRNbxs5DL37VxA6jwPHddrUxxwWi2BRtEAPRevCYCSOh41GUkWOnTTI
+        fy8kf4zT5rCXOfCRT4+P5DxNAAw7swRjO1TbJz+90dvFxy//vX0za7dfr29+3eo/n75++Mh0O/za
+        maZUxLsfZPVYdWFjnzwpx7CHbSZUKqyX767mV/PL2eKqAn105EvZJul0Eac9B57OZ/PFdPZuenl9
+        qO4iWxKzhG8TAICn+i06g6MHs4RZc4z0JIIbMstTEoDJ0ZeIQREWxaCmGUEbg1Ko0p9WAWBlxMZM
+        K7OEq2YfaIncHdr7EluZzx0BbigopBy37MgBgiNF9uTAkdjMqbQOsYVdhwraEdBDIqvkIA6aBgXp
+        4uAdcLB+cNTArmPbAQfHFpUEJPYEQ3CUi2LHYVPoCpOi3EOmnwNn6imoXMC/cUdbyk3FWw7oj8+4
+        SAIhKkgiyy1b9P4RHHneUn4pTEn0WIYC6YDX5866aqDH+yKHFRJm5cqInjeB3AWM7vQsUgzhTFb9
+        48GtUlloSwMkZ4bEDMetOaSg1QH9XldVwSrk2wY4iBLWSs/hmG47zGiVMouylZP7WHkzdRSEtwQu
+        2qH4dhyBjcWKHWsXhzJTEgpVAwagByySirgzRSfLDrtzsTKr8Hy+VJnaQbAsdhi8PwMwhKhYfKzr
+        /P2APJ8W2MdNyvFO/ig1LQeWbp0JJYayrKIxmYo+TwC+10MZXuy+STn2Sdca76k+92ax2POZ8T5H
+        9P31AdSo6Mf4YjFvXuFb71dezk7NWLQdubF0vEscHMczYHLW9d9qXuPed85h83/oR8BaSkpunTI5
+        ti87HtMy/agTfT3t5HIVbITyli2tlSmXSThqcfD7n4qRR1Hq1y2HDeWUuf5ZyiQnz5PfAAAA//8D
+        AEfUP8BcBQAA
+    headers:
+      CF-RAY:
+      - 95f365f1bfc87ded-GRU
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - gzip
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 14 Jul 2025 19:24:07 GMT
+      Server:
+      - cloudflare
+      Set-Cookie:
+      - __cf_bm=PcC3_3T8.MK_WpZlQLdZfwpNv9Pe45AIYmrXOSgJ65E-1752521047-1.0.1.1-eyqwSWfQC7ZV6.JwTsTihK1ZWCrEmxd52CtNcfe.fw1UjjBN9rdTU4G7hRZiNqHQYo4sVZMmgRgqM9k7HRSzN2zln0bKmMiOuSQTZh6xF_I;
+        path=/; expires=Mon, 14-Jul-25 19:54:07 GMT; domain=.api.openai.com; HttpOnly;
+        Secure; SameSite=None
+      - _cfuvid=JvQ1c4qYZefNwOPoVNgAtX8ET7ObU.JKDvGc43LOR6g-1752521047741-0.0.1.1-604800000;
+        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - crewai-iuxna1
+      openai-processing-ms:
+      - '2729'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-envoy-upstream-service-time:
+      - '2789'
+      x-ratelimit-limit-requests:
+      - '30000'
+      x-ratelimit-limit-tokens:
+      - '150000000'
+      x-ratelimit-remaining-requests:
+      - '29999'
+      x-ratelimit-remaining-tokens:
+      - '149999559'
+      x-ratelimit-reset-requests:
+      - 2ms
+      x-ratelimit-reset-tokens:
+      - 0s
+      x-request-id:
+      - req_74f6e8ff49db25dbea3d3525cc149e8e
+    status:
+      code: 200
+      message: OK
 version: 1
--- a/tests/cassettes/TestAgentEvaluator.test_failed_evaluation.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_failed_evaluation.yaml
@@ -0,0 +1,123 @@
+interactions:
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
+      created for testing purposes\nYour personal goal is: Complete test tasks successfully\nTo
+      give my best complete final answer to the task respond using the exact following
+      format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
+      answer must be the great and the most complete as possible, it must be outcome
+      described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
+      "content": "\nCurrent Task: Test task description\n\nThis is the expected criteria
+      for your final answer: Expected test output\nyou MUST return the actual complete
+      content as the final answer, not a summary.\n\nBegin! This is VERY important
+      to you, use the tools available and give your best Final Answer, your job depends
+      on it!\n\nThought:"}], "model": "gpt-4o-mini", "stop": ["\nObservation:"]}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate, zstd
+      connection:
+      - keep-alive
+      content-length:
+      - '879'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 1.93.0
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.93.0
+      x-stainless-raw-response:
+      - 'true'
+      x-stainless-read-timeout:
+      - '600.0'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.11.12
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: !!binary |
+        H4sIAAAAAAAAAwAAAP//jFTBbhtHDL3rK4g5rwRbtaNYt9RoEaNoUaBODm0DgZnh7jKe5WyHXDmO
+        4X8vZiRLcupDLwvsPPLxPQ45jzMAx8GtwfkezQ9jnP9oeLv98N5+vfl9+4v89Mf76+XV7XDz8Yc/
+        r39T15SM9PkLeXvOWvg0jJGMk+xgnwmNCuv56nJ5+XZ1tbqswJACxZLWjTa/SPOBhefLs+XF/Gw1
+        P3+7z+4Te1K3hr9mAACP9Vt0SqCvbg1nzfPJQKrYkVsfggBcTrGcOFRlNRRzzRH0SYykSr8BSffg
+        UaDjLQFCV2QDit5TBvhbfmbBCO/q/xpue1ZgBesJ6OtI3iiAkRqkycbJGrjv2ffgk5S6CqkFhECG
+        HClAIPWZx9Kkgtz3aJVq37vChXoH2qcpBogp3UHkO1rAbU/QViW7Os8hLD5OgQBjBCFfOpEfgKVN
+        ecBSpoFAQxK1jMbSgY+Y2R6aWjJTT6K8JSHVBlACYOgpk3gCS4DyADqS55YpQDdxoMhCuoCbgwKf
+        tpSB0PeAJdaKseKpOsn0z8SZBhJrgESnXERY8S0JRsxWulkoilkKkDJ0JJQx8jcKi13DX3pWyuWm
+        FPDQN8jU7mW3KRfdSaj2r5ZLMEmgXOYg7K5OlcQYI1Cs4vSFavSVmLWnsDgdnEztpFiGV6YYTwAU
+        SVYbXkf20x55OgxpTN2Y02f9LtW1LKz9JhNqkjKQaml0FX2aAXyqyzC9mG835jSMtrF0R7Xc+Zvz
+        HZ877uARvXqzBy0ZxuP58nLVvMK32Q2rnqyT8+h7CsfU4+7hFDidALMT1/9V8xr3zjlL93/oj4D3
+        NBqFzZgpsH/p+BiW6Utd0dfDDl2ugl2ZK/a0MaZcbiJQi1PcPRxOH9Ro2LQsHeUxc309yk3Onmb/
+        AgAA//8DAAbYfvVABQAA
+    headers:
+      CF-RAY:
+      - 95f9c7ffa8331b11-GRU
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - gzip
+      Content-Type:
+      - application/json
+      Date:
+      - Tue, 15 Jul 2025 13:59:38 GMT
+      Server:
+      - cloudflare
+      Set-Cookie:
+      - __cf_bm=J_xe1AP.B5P6D2GVMCesyioeS5E9DnYT34rbwQUefFc-1752587978-1.0.1.1-5Dflk5cAj6YCsOSVbCFWWSpXpw_mXsczIdzWzs2h2OwDL01HQbduE5LAToy67sfjFjHeeO4xRrqPLUQpySy2QqyHXbI_fzX4UAt3.UdwHxU;
+        path=/; expires=Tue, 15-Jul-25 14:29:38 GMT; domain=.api.openai.com; HttpOnly;
+        Secure; SameSite=None
+      - _cfuvid=0rTD8RMpxBQQy42jzmum16_eoRtWNfaZMG_TJkhGS7I-1752587978437-0.0.1.1-604800000;
+        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - crewai-iuxna1
+      openai-processing-ms:
+      - '2623'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-envoy-upstream-service-time:
+      - '2626'
+      x-ratelimit-limit-requests:
+      - '30000'
+      x-ratelimit-limit-tokens:
+      - '150000000'
+      x-ratelimit-remaining-requests:
+      - '29999'
+      x-ratelimit-remaining-tokens:
+      - '149999813'
+      x-ratelimit-reset-requests:
+      - 2ms
+      x-ratelimit-reset-tokens:
+      - 0s
+      x-request-id:
+      - req_ccc347e91010713379c920aa0efd1f4f
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/tests/evaluation/metrics/init.py
+++ b/tests/evaluation/metrics/init.py
--- a/tests/evaluation/test_agent_evaluator.py
+++ b/tests/evaluation/test_agent_evaluator.py
@@ -1,95 +0,0 @@
-import pytest
-
-from crewai.agent import Agent
-from crewai.task import Task
-from crewai.crew import Crew
-from crewai.evaluation.agent_evaluator import AgentEvaluator
-from crewai.evaluation.base_evaluator import AgentEvaluationResult
-from crewai.evaluation import (
-    GoalAlignmentEvaluator,
-    SemanticQualityEvaluator,
-    ToolSelectionEvaluator,
-    ParameterExtractionEvaluator,
-    ToolInvocationEvaluator,
-    ReasoningEfficiencyEvaluator
-)
-
-from crewai.evaluation import create_default_evaluator
-class TestAgentEvaluator:
-    @pytest.fixture
-    def mock_crew(self):
-        agent = Agent(
-            role="Test Agent",
-            goal="Complete test tasks successfully",
-            backstory="An agent created for testing purposes",
-            allow_delegation=False,
-            verbose=False
-        )
-
-        task = Task(
-            description="Test task description",
-            agent=agent,
-            expected_output="Expected test output"
-        )
-
-        crew = Crew(
-            agents=[agent],
-            tasks=[task]
-        )
-        return crew
-
-    def test_set_iteration(self):
-        agent_evaluator = AgentEvaluator()
-
-        agent_evaluator.set_iteration(3)
-        assert agent_evaluator.iteration == 3
-
-    @pytest.mark.vcr(filter_headers=["authorization"])
-    def test_evaluate_current_iteration(self, mock_crew):
-        agent_evaluator = AgentEvaluator(crew=mock_crew, evaluators=[GoalAlignmentEvaluator()])
-
-        mock_crew.kickoff()
-
-        results = agent_evaluator.evaluate_current_iteration()
-
-        assert isinstance(results, dict)
-
-        agent, = mock_crew.agents
-        task, = mock_crew.tasks
-
-        assert len(mock_crew.agents) == 1
-        assert agent.role in results
-        assert len(results[agent.role]) == 1
-
-        result, = results[agent.role]
-        assert isinstance(result, AgentEvaluationResult)
-
-        assert result.agent_id == str(agent.id)
-        assert result.task_id == str(task.id)
-
-        goal_alignment, = result.metrics.values()
-        assert goal_alignment.score == 5.0
-
-        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
-        assert expected_feedback in goal_alignment.feedback
-
-        assert goal_alignment.raw_response is not None
-        assert '"score": 5' in goal_alignment.raw_response
-
-    def test_create_default_evaluator(self, mock_crew):
-        agent_evaluator = create_default_evaluator(crew=mock_crew)
-        assert isinstance(agent_evaluator, AgentEvaluator)
-        assert agent_evaluator.crew == mock_crew
-
-        expected_types = [
-            GoalAlignmentEvaluator,
-            SemanticQualityEvaluator,
-            ToolSelectionEvaluator,
-            ParameterExtractionEvaluator,
-            ToolInvocationEvaluator,
-            ReasoningEfficiencyEvaluator
-        ]
-
-        assert len(agent_evaluator.evaluators) == len(expected_types)
-        for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
-            assert isinstance(evaluator, expected_type)
--- a/tests/experimental/evaluation/__init__.py
+++ b/tests/experimental/evaluation/__init__.py
--- a/tests/experimental/evaluation/metrics/__init__.py
+++ b/tests/experimental/evaluation/metrics/__init__.py
--- a/tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py
+++ b/tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py
--- a/tests/experimental/evaluation/metrics/test_goal_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_goal_metrics.py
@@ -1,8 +1,8 @@
 from unittest.mock import patch, MagicMock
-from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
 from crewai.utilities.llm_utils import LLM


--- a/tests/experimental/evaluation/metrics/test_reasoning_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_reasoning_metrics.py
@@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock
 from typing import List, Dict, Any

 from crewai.tasks.task_output import TaskOutput
-from crewai.evaluation.metrics.reasoning_metrics import (
+from crewai.experimental.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator,
 )
-from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM
-from crewai.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore

 class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
    @pytest.fixture
--- a/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py
@@ -1,8 +1,8 @@
 from unittest.mock import patch, MagicMock

-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
-from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
+from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM

 class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest):
--- a/tests/experimental/evaluation/metrics/test_tools_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_tools_metrics.py
@@ -1,12 +1,12 @@
 from unittest.mock import patch, MagicMock

-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
 )
 from crewai.utilities.llm_utils import LLM
-from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

 class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tools_available(self, mock_task, mock_agent):
--- a/tests/experimental/evaluation/test_agent_evaluator.py
+++ b/tests/experimental/evaluation/test_agent_evaluator.py
@@ -0,0 +1,278 @@
+import pytest
+
+from crewai.agent import Agent
+from crewai.task import Task
+from crewai.crew import Crew
+from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
+from crewai.experimental.evaluation import (
+    GoalAlignmentEvaluator,
+    SemanticQualityEvaluator,
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator,
+    ReasoningEfficiencyEvaluator,
+    MetricCategory,
+    EvaluationScore
+)
+
+from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
+from crewai.utilities.events.crewai_event_bus import crewai_event_bus
+from crewai.experimental.evaluation import create_default_evaluator
+
+class TestAgentEvaluator:
+    @pytest.fixture
+    def mock_crew(self):
+        """Provide a crew made of exactly one agent and one task assigned to it."""
+        test_agent = Agent(
+            role="Test Agent",
+            goal="Complete test tasks successfully",
+            backstory="An agent created for testing purposes",
+            allow_delegation=False,
+            verbose=False,
+        )
+        test_task = Task(
+            description="Test task description",
+            agent=test_agent,
+            expected_output="Expected test output",
+        )
+
+        # Several tests unpack ``crew.agents`` / ``crew.tasks`` as one-element
+        # sequences, so keep this crew at a single agent and a single task.
+        return Crew(agents=[test_agent], tasks=[test_task])
+
+    def test_set_iteration(self):
+        """set_iteration() must store the given number on the evaluator's execution state."""
+        evaluator = AgentEvaluator(agents=[])
+        evaluator.set_iteration(3)
+
+        assert evaluator._execution_state.iteration == 3
+
+    @pytest.mark.vcr(filter_headers=["authorization"])
+    def test_evaluate_current_iteration(self, mock_crew):
+        """Kick off the crew and check the one goal-alignment result reported for its only agent."""
+        # The evaluator is created before kickoff() and only queried afterwards;
+        # the test never invokes an evaluation step itself.
+        agent_evaluator = AgentEvaluator(agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()])
+
+        mock_crew.kickoff()
+
+        results = agent_evaluator.get_evaluation_results()
+
+        assert isinstance(results, dict)
+
+        # One-element unpacking raises ValueError unless there is exactly one item.
+        agent, = mock_crew.agents
+        task, = mock_crew.tasks
+
+        # Results are keyed by agent role and hold one entry per evaluated task.
+        assert len(mock_crew.agents) == 1
+        assert agent.role in results
+        assert len(results[agent.role]) == 1
+
+        result, = results[agent.role]
+        assert isinstance(result, AgentEvaluationResult)
+
+        assert result.agent_id == str(agent.id)
+        assert result.task_id == str(task.id)
+
+        # Only GoalAlignmentEvaluator was configured, so exactly one metric is expected.
+        goal_alignment, = result.metrics.values()
+        assert goal_alignment.score == 5.0
+
+        # NOTE(review): score and feedback text presumably come from the recorded
+        # response replayed by the vcr marker - update them together if re-recorded.
+        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
+        assert expected_feedback in goal_alignment.feedback
+
+        assert goal_alignment.raw_response is not None
+        assert '"score": 5' in goal_alignment.raw_response
+
+    def test_create_default_evaluator(self, mock_crew):
+        """create_default_evaluator() must attach the given agents and the six stock evaluators, in order."""
+        default_evaluator = create_default_evaluator(agents=mock_crew.agents)
+
+        assert isinstance(default_evaluator, AgentEvaluator)
+        assert default_evaluator.agents == mock_crew.agents
+
+        # Order matters: each evaluator is compared with the type at the same position.
+        expected_order = (
+            GoalAlignmentEvaluator,
+            SemanticQualityEvaluator,
+            ToolSelectionEvaluator,
+            ParameterExtractionEvaluator,
+            ToolInvocationEvaluator,
+            ReasoningEfficiencyEvaluator,
+        )
+
+        assert len(default_evaluator.evaluators) == len(expected_order)
+        for built_evaluator, evaluator_cls in zip(default_evaluator.evaluators, expected_order):
+            assert isinstance(built_evaluator, evaluator_cls)
+
+    @pytest.mark.vcr(filter_headers=["authorization"])
+    def test_eval_lite_agent(self):
+        """Evaluate an agent run directly via agent.kickoff(), with no Task or Crew involved."""
+        agent = Agent(
+            role="Test Agent",
+            goal="Complete test tasks successfully",
+            backstory="An agent created for testing purposes",
+        )
+
+        # NOTE(review): handlers registered inside scoped_handlers() are presumably
+        # dropped when the block exits - confirm against crewai_event_bus.
+        with crewai_event_bus.scoped_handlers():
+            # Each handler stores the event under a fixed key, so only the most
+            # recent event of each kind is kept.
+            events = {}
+            @crewai_event_bus.on(AgentEvaluationStartedEvent)
+            def capture_started(source, event):
+                events["started"] = event
+
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            def capture_completed(source, event):
+                events["completed"] = event
+
+            @crewai_event_bus.on(AgentEvaluationFailedEvent)
+            def capture_failed(source, event):
+                events["failed"] = event
+
+            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
+
+            agent.kickoff(messages="Complete this task successfully")
+
+            # A successful evaluation must emit started + completed and never failed.
+            assert events.keys() == {"started", "completed"}
+            assert events["started"].agent_id == str(agent.id)
+            assert events["started"].agent_role == agent.role
+            # There is no Task in this flow, so the events carry no task id.
+            assert events["started"].task_id is None
+            assert events["started"].iteration == 1
+
+            assert events["completed"].agent_id == str(agent.id)
+            assert events["completed"].agent_role == agent.role
+            assert events["completed"].task_id is None
+            assert events["completed"].iteration == 1
+            assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
+            assert isinstance(events["completed"].score, EvaluationScore)
+            assert events["completed"].score.score == 2.0
+
+            results = agent_evaluator.get_evaluation_results()
+
+            assert isinstance(results, dict)
+
+            result, = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)
+
+            assert result.agent_id == str(agent.id)
+            # Unlike the events (task_id None), the stored result uses the
+            # placeholder id "lite_task".
+            assert result.task_id == "lite_task"
+
+            goal_alignment, = result.metrics.values()
+            assert goal_alignment.score == 2.0
+
+            # NOTE(review): score and feedback text presumably come from the recorded
+            # response replayed by the vcr marker - update them together if re-recorded.
+            expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
+            assert expected_feedback in goal_alignment.feedback
+
+            assert goal_alignment.raw_response is not None
+            assert '"score": 2' in goal_alignment.raw_response
+
+    @pytest.mark.vcr(filter_headers=["authorization"])
+    def test_eval_specific_agents_from_crew(self, mock_crew):
+        """Evaluate only one of a crew's two agents and check the other is left out of the results."""
+        agent = Agent(
+            role="Test Agent Eval",
+            goal="Complete test tasks successfully",
+            backstory="An agent created for testing purposes",
+        )
+        task = Task(
+            description="Test task description",
+            agent=agent,
+            expected_output="Expected test output"
+        )
+        # Extend the fixture crew so it now has two agents and two tasks.
+        mock_crew.agents.append(agent)
+        mock_crew.tasks.append(task)
+
+        with crewai_event_bus.scoped_handlers():
+            # Each handler stores the event under a fixed key, so only the most
+            # recent event of each kind is kept.
+            events = {}
+            @crewai_event_bus.on(AgentEvaluationStartedEvent)
+            def capture_started(source, event):
+                events["started"] = event
+
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            def capture_completed(source, event):
+                events["completed"] = event
+
+            @crewai_event_bus.on(AgentEvaluationFailedEvent)
+            def capture_failed(source, event):
+                events["failed"] = event
+
+            # Only the newly added agent is handed to the evaluator; the whole crew runs.
+            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
+            mock_crew.kickoff()
+
+            assert events.keys() == {"started", "completed"}
+            assert events["started"].agent_id == str(agent.id)
+            assert events["started"].agent_role == agent.role
+            assert events["started"].task_id == str(task.id)
+            assert events["started"].iteration == 1
+
+            assert events["completed"].agent_id == str(agent.id)
+            assert events["completed"].agent_role == agent.role
+            assert events["completed"].task_id == str(task.id)
+            assert events["completed"].iteration == 1
+            assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
+            assert isinstance(events["completed"].score, EvaluationScore)
+            assert events["completed"].score.score == 5.0
+
+            results = agent_evaluator.get_evaluation_results()
+
+            assert isinstance(results, dict)
+            # A single key means the fixture's original agent produced no results.
+            assert len(results.keys()) == 1
+            result, = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)
+
+            assert result.agent_id == str(agent.id)
+            assert result.task_id == str(task.id)
+
+            goal_alignment, = result.metrics.values()
+            assert goal_alignment.score == 5.0
+
+            # NOTE(review): score and feedback text presumably come from the recorded
+            # response replayed by the vcr marker - update them together if re-recorded.
+            expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
+            assert expected_feedback in goal_alignment.feedback
+
+            assert goal_alignment.raw_response is not None
+            assert '"score": 5' in goal_alignment.raw_response
+
+
+    @pytest.mark.vcr(filter_headers=["authorization"])
+    def test_failed_evaluation(self, mock_crew):
+        """An evaluator that raises must yield a failed event and a result with no metrics."""
+        # One-element unpacking raises ValueError unless there is exactly one item.
+        agent, = mock_crew.agents
+        task, = mock_crew.tasks
+
+        with crewai_event_bus.scoped_handlers():
+            # Each handler stores the event under a fixed key, so only the most
+            # recent event of each kind is kept.
+            events = {}
+
+            @crewai_event_bus.on(AgentEvaluationStartedEvent)
+            def capture_started(source, event):
+                events["started"] = event
+
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            def capture_completed(source, event):
+                events["completed"] = event
+
+            @crewai_event_bus.on(AgentEvaluationFailedEvent)
+            def capture_failed(source, event):
+                events["failed"] = event
+
+            # Evaluator stub whose evaluate() always raises. MetricCategory is
+            # already imported at module level, so only BaseEvaluator is pulled in here.
+            from crewai.experimental.evaluation.base_evaluator import BaseEvaluator
+
+            class FailingEvaluator(BaseEvaluator):
+                metric_category = MetricCategory.GOAL_ALIGNMENT
+
+                def evaluate(self, agent, task, execution_trace, final_output):
+                    raise ValueError("Forced evaluation failure")
+
+            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[FailingEvaluator()])
+            mock_crew.kickoff()
+
+            # Failure path: started + failed are emitted, completed never is.
+            assert events.keys() == {"started", "failed"}
+            assert events["started"].agent_id == str(agent.id)
+            assert events["started"].agent_role == agent.role
+            assert events["started"].task_id == str(task.id)
+            assert events["started"].iteration == 1
+
+            assert events["failed"].agent_id == str(agent.id)
+            assert events["failed"].agent_role == agent.role
+            assert events["failed"].task_id == str(task.id)
+            assert events["failed"].iteration == 1
+            # The event carries the exception message as a plain string.
+            assert events["failed"].error == "Forced evaluation failure"
+
+            # A result entry is still recorded for the agent/task, just without metrics.
+            results = agent_evaluator.get_evaluation_results()
+            result, = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)
+
+            assert result.agent_id == str(agent.id)
+            assert result.task_id == str(task.id)
+
+            assert result.metrics == {}
--- a/tests/experimental/evaluation/test_experiment_result.py
+++ b/tests/experimental/evaluation/test_experiment_result.py
@@ -0,0 +1,111 @@
+import pytest
+from unittest.mock import MagicMock, patch
+
+from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
+
+
+class TestExperimentResult:
+    @pytest.fixture
+    def mock_results(self):
+        """Current-run results: one scalar-scored case followed by four dict-scored cases."""
+
+        def dict_scored(identifier, query, passed):
+            # Build fresh dicts on every call so no two results share mutable state.
+            return ExperimentResult(
+                identifier=identifier,
+                inputs={"query": query},
+                score={"relevance": 9, "factuality": 8},
+                expected_score={"relevance": 7, "factuality": 7},
+                passed=passed,
+                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}},
+            )
+
+        scalar_case = ExperimentResult(
+            identifier="test-1",
+            inputs={"query": "What is the capital of France?"},
+            score=10,
+            expected_score=7,
+            passed=True,
+        )
+
+        # Identifiers skip "test-5" on purpose: the baseline comparison test
+        # expects it to be reported as missing and "test-6" as new.
+        return [
+            scalar_case,
+            dict_scored("test-2", "Who wrote Hamlet?", True),
+            dict_scored("test-3", "Any query", False),
+            dict_scored("test-4", "Another query", True),
+            dict_scored("test-6", "Yet another query", True),
+        ]
+
+    @patch('os.path.exists', return_value=True)
+    @patch('os.path.getsize', return_value=1)
+    @patch('json.load')
+    @patch('builtins.open', new_callable=MagicMock)
+    def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results):
+        """compare_with_baseline() must sort test ids into improved/regressed/unchanged/new/missing.
+
+        The filesystem and json.load are patched, so the baseline content is
+        whatever ``mock_json_load`` returns.
+        """
+
+        def passing_dict_entry(identifier, query):
+            # Baseline entry for a dict-scored case that passed.
+            return {
+                "identifier": identifier,
+                "inputs": {"query": query},
+                "score": {"relevance": 8, "factuality": 7},
+                "expected_score": {"relevance": 7, "factuality": 7},
+                "passed": True,
+            }
+
+        mock_json_load.return_value = {
+            "timestamp": "2023-01-01T00:00:00+00:00",
+            "results": [
+                {
+                    "identifier": "test-1",
+                    "inputs": {"query": "What is the capital of France?"},
+                    "score": 7,
+                    "expected_score": 7,
+                    "passed": False,
+                },
+                passing_dict_entry("test-2", "Who wrote Hamlet?"),
+                passing_dict_entry("test-3", "Any query"),
+                passing_dict_entry("test-4", "Another query"),
+                passing_dict_entry("test-5", "Another query"),
+            ],
+        }
+
+        current_run = ExperimentResults(results=mock_results)
+        # Replace display with a mock so the comparison does not render anything.
+        current_run.display = MagicMock()
+
+        comparison = current_run.compare_with_baseline(baseline_filepath="baseline.json")
+
+        assert "baseline_timestamp" in comparison
+        assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00"
+        # test-1: failed in baseline, passes now; test-3: passed in baseline, fails now.
+        assert comparison["improved"] == ["test-1"]
+        assert comparison["regressed"] == ["test-3"]
+        assert comparison["unchanged"] == ["test-2", "test-4"]
+        # test-6 exists only in the current run, test-5 only in the baseline.
+        assert comparison["new_tests"] == ["test-6"]
+        assert comparison["missing_tests"] == ["test-5"]
--- a/tests/experimental/evaluation/test_experiment_runner.py
+++ b/tests/experimental/evaluation/test_experiment_runner.py
@@ -0,0 +1,197 @@
+import pytest
+from unittest.mock import MagicMock, patch
+
+from crewai.crew import Crew
+from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
+from crewai.experimental.evaluation.experiment.result import ExperimentResults
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore
+
+
+class TestExperimentRunner:
+    @pytest.fixture
+    def mock_crew(self):
+        """Stand-in crew: a MagicMock whose ``llm`` attribute is set to the Crew class."""
+        # NOTE(review): ``llm=Crew`` only assigns an attribute on the mock; if the
+        # intent was to restrict the mock to Crew's interface, that would be
+        # ``spec=Crew`` - confirm with the author before changing.
+        crew_double = MagicMock(llm=Crew)
+        return crew_double
+
+    @pytest.fixture
+    def mock_evaluator_results(self):
+        """Aggregated evaluation for one agent, keyed by "Test Agent".
+
+        Contains four metrics: goal alignment (9), reasoning efficiency
+        (no score), parameter extraction (7) and tool selection (8).
+        """
+        metrics_by_category = {
+            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
+                score=9,
+                feedback="Test feedback for goal alignment",
+                raw_response="Test raw response for goal alignment",
+            ),
+            # A metric without a numeric score.
+            MetricCategory.REASONING_EFFICIENCY: EvaluationScore(
+                score=None,
+                feedback="Reasoning efficiency not applicable",
+                raw_response="Reasoning efficiency not applicable",
+            ),
+            MetricCategory.PARAMETER_EXTRACTION: EvaluationScore(
+                score=7,
+                feedback="Test parameter extraction explanation",
+                raw_response="Test raw output",
+            ),
+            MetricCategory.TOOL_SELECTION: EvaluationScore(
+                score=8,
+                feedback="Test tool selection explanation",
+                raw_response="Test raw output",
+            ),
+        }
+
+        aggregated = AgentAggregatedEvaluationResult(
+            agent_id="Test Agent",
+            agent_role="Test Agent Role",
+            metrics=metrics_by_category,
+        )
+        return {"Test Agent": aggregated}
+
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
+    def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
+        """Run three dataset cases and check per-case identifier, inputs, expectation and verdict.
+
+        The evaluator factory is patched so every case is judged against the
+        canned ``mock_evaluator_results`` fixture.
+        """
+        dataset = [
+            {
+                "identifier": "test-case-1",
+                "inputs": {"query": "Test query 1"},
+                "expected_score": 8
+            },
+            {
+                "identifier": "test-case-2",
+                "inputs": {"query": "Test query 2"},
+                "expected_score": {"goal_alignment": 7}
+            },
+            {
+                # No identifier: the runner has to supply one itself.
+                "inputs": {"query": "Test query 3"},
+                "expected_score": {"tool_selection": 9}
+            }
+        ]
+
+        mock_evaluator = MagicMock()
+        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
+        mock_evaluator.reset_iterations_results = MagicMock()
+        mock_create_evaluator.return_value = mock_evaluator
+
+        runner = ExperimentRunner(dataset=dataset)
+
+        results = runner.run(crew=mock_crew)
+
+        assert isinstance(results, ExperimentResults)
+        # Check the count before unpacking: the unpack below would otherwise raise
+        # ValueError first and this assertion could never report a mismatch.
+        assert len(results.results) == 3
+        result_1, result_2, result_3 = results.results
+
+        assert result_1.identifier == "test-case-1"
+        assert result_1.inputs == {"query": "Test query 1"}
+        assert result_1.expected_score == 8
+        assert result_1.passed is True
+
+        assert result_2.identifier == "test-case-2"
+        assert result_2.inputs == {"query": "Test query 2"}
+        assert isinstance(result_2.expected_score, dict)
+        assert "goal_alignment" in result_2.expected_score
+        assert result_2.passed is True
+
+        # NOTE(review): generated identifier - presumably a hash derived from the
+        # case contents; confirm against ExperimentRunner if the dataset changes.
+        assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5"
+        assert result_3.inputs == {"query": "Test query 3"}
+        assert isinstance(result_3.expected_score, dict)
+        assert "tool_selection" in result_3.expected_score
+        # The fixture reports tool_selection = 8, below the expected 9.
+        assert result_3.passed is False
+
+        assert mock_crew.kickoff.call_count == 3
+        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"})
+        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"})
+        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"})
+
+        # The evaluator is reset and queried once per dataset case.
+        assert mock_evaluator.reset_iterations_results.call_count == 3
+        assert mock_evaluator.get_agent_evaluation.call_count == 3
+
+
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
+    def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
+        """A case still passes when its expectation names a metric the evaluator does not report."""
+        single_case = {
+            "identifier": "test-case-2",
+            "inputs": {"query": "Test query 2"},
+            "expected_score": {"goal_alignment": 7, "unknown_metric": 8},
+        }
+
+        # Patched factory hands back an evaluator that serves the canned fixture.
+        evaluator_double = MagicMock()
+        evaluator_double.get_agent_evaluation.return_value = mock_evaluator_results
+        evaluator_double.reset_iterations_results = MagicMock()
+        mock_create_evaluator.return_value = evaluator_double
+
+        outcome = ExperimentRunner(dataset=[single_case]).run(crew=mock_crew)
+
+        result, = outcome.results
+
+        assert result.identifier == "test-case-2"
+        assert result.inputs == {"query": "Test query 2"}
+        assert isinstance(result.expected_score, dict)
+        expected_metric_names = result.expected_score.keys()
+        assert "goal_alignment" in expected_metric_names
+        assert "unknown_metric" in expected_metric_names
+        assert result.passed is True
+
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
+    def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
+        """A dict expectation passes when the evaluator reports only the one metric it names."""
+        dataset = [
+            {
+                "identifier": "test-case-2",
+                "inputs": {"query": "Test query 2"},
+                "expected_score": {"goal_alignment": 7}
+            }
+        ]
+
+        # Narrow the canned evaluation to a single metric. This must target the
+        # results fixture: subscripting the patched factory mock would only
+        # configure a throwaway child mock and leave all four metrics in place.
+        mock_evaluator_results["Test Agent"].metrics = {
+            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
+                    score=9,
+                    feedback="Test feedback for goal alignment",
+                    raw_response="Test raw response for goal alignment"
+                )
+        }
+
+        mock_evaluator = MagicMock()
+        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
+        mock_evaluator.reset_iterations_results = MagicMock()
+        mock_create_evaluator.return_value = mock_evaluator
+
+        runner = ExperimentRunner(dataset=dataset)
+
+        results = runner.run(crew=mock_crew)
+        result, = results.results
+
+        assert result.identifier == "test-case-2"
+        assert result.inputs == {"query": "Test query 2"}
+        assert isinstance(result.expected_score, dict)
+        assert "goal_alignment" in result.expected_score.keys()
+        # Reported goal_alignment (9) meets the expected 7.
+        assert result.passed is True
+
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
+    def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
+        """A case fails when the only metric it expects is one the evaluator never reports."""
+        dataset = [
+            {
+                "identifier": "test-case-2",
+                "inputs": {"query": "Test query 2"},
+                "expected_score": {"unknown_metric": 7}
+            }
+        ]
+
+        # Narrow the canned evaluation to goal alignment only. This must target
+        # the results fixture: subscripting the patched factory mock would only
+        # configure a throwaway child mock and leave all four metrics in place.
+        mock_evaluator_results["Test Agent"].metrics = {
+            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
+                score=5,
+                feedback="Test feedback for goal alignment",
+                raw_response="Test raw response for goal alignment"
+            )
+        }
+
+        mock_evaluator = MagicMock()
+        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
+        mock_evaluator.reset_iterations_results = MagicMock()
+        mock_create_evaluator.return_value = mock_evaluator
+
+        runner = ExperimentRunner(dataset=dataset)
+
+        results = runner.run(crew=mock_crew)
+        result, = results.results
+
+        assert result.identifier == "test-case-2"
+        assert result.inputs == {"query": "Test query 2"}
+        assert isinstance(result.expected_score, dict)
+        assert "unknown_metric" in result.expected_score.keys()
+        # None of the expected metrics is available, so the case cannot pass.
+        assert result.passed is False
--- a/tests/llm_test.py
+++ b/tests/llm_test.py
@@ -509,6 +509,85 @@ def test_deepseek_r1_with_open_router():
    assert "Paris" in result


+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_llm_passes_extra_headers():
+    """Test that extra_headers parameter is passed to litellm.completion."""
+    extra_headers = {
+        "X-Custom-Auth": "bearer token123",
+        "X-API-Version": "v1.0"
+    }
+    
+    llm = LLM(
+        model="gpt-4o-mini",
+        extra_headers=extra_headers,
+    )
+
+    messages = [{"role": "user", "content": "Hello, world!"}]
+
+    with patch("litellm.completion") as mocked_completion:
+        mock_message = MagicMock()
+        mock_message.content = "Test response"
+        mock_choice = MagicMock()
+        mock_choice.message = mock_message
+        mock_response = MagicMock()
+        mock_response.choices = [mock_choice]
+        mock_response.usage = {
+            "prompt_tokens": 5,
+            "completion_tokens": 5,
+            "total_tokens": 10,
+        }
+
+        mocked_completion.return_value = mock_response
+
+        result = llm.call(messages)
+
+        mocked_completion.assert_called_once()
+
+        _, kwargs = mocked_completion.call_args
+
+        assert kwargs["extra_headers"] == extra_headers
+
+        assert kwargs["model"] == "gpt-4o-mini"
+        assert kwargs["messages"] == messages
+
+        assert result == "Test response"
+
+
+def test_llm_extra_headers_none_by_default():
+    """Test that extra_headers defaults to None and doesn't break existing functionality."""
+    llm = LLM(model="gpt-4o-mini")
+    
+    messages = [{"role": "user", "content": "Hello, world!"}]
+
+    with patch("litellm.completion") as mocked_completion:
+        mock_message = MagicMock()
+        mock_message.content = "Test response"
+        mock_choice = MagicMock()
+        mock_choice.message = mock_message
+        mock_response = MagicMock()
+        mock_response.choices = [mock_choice]
+        mock_response.usage = {
+            "prompt_tokens": 5,
+            "completion_tokens": 5,
+            "total_tokens": 10,
+        }
+
+        mocked_completion.return_value = mock_response
+
+        result = llm.call(messages)
+
+        mocked_completion.assert_called_once()
+
+        _, kwargs = mocked_completion.call_args
+
+        assert "extra_headers" not in kwargs
+
+        assert kwargs["model"] == "gpt-4o-mini"
+        assert kwargs["messages"] == messages
+
+        assert result == "Test response"
+
+
 def assert_event_count(
    mock_emit,
    expected_completed_tool_call: int = 0,
--- a/tests/task_test.py
+++ b/tests/task_test.py
@@ -1133,6 +1133,119 @@ def test_output_file_validation():
        )


+def test_create_directory_true():
+    """Test that directories are created when create_directory=True."""
+    from pathlib import Path
+    
+    output_path = "test_create_dir/output.txt"
+    
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        output_file=output_path,
+        create_directory=True,
+    )
+    
+    resolved_path = Path(output_path).expanduser().resolve()
+    resolved_dir = resolved_path.parent
+    
+    if resolved_path.exists():
+        resolved_path.unlink()
+    if resolved_dir.exists():
+        import shutil
+        shutil.rmtree(resolved_dir)
+    
+    assert not resolved_dir.exists()
+    
+    task._save_file("test content")
+    
+    assert resolved_dir.exists()
+    assert resolved_path.exists()
+    
+    if resolved_path.exists():
+        resolved_path.unlink()
+    if resolved_dir.exists():
+        import shutil
+        shutil.rmtree(resolved_dir)
+
+
+def test_create_directory_false():
+    """Test that directories are not created when create_directory=False."""
+    from pathlib import Path
+    
+    output_path = "nonexistent_test_dir/output.txt"
+    
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        output_file=output_path,
+        create_directory=False,
+    )
+    
+    resolved_path = Path(output_path).expanduser().resolve()
+    resolved_dir = resolved_path.parent
+    
+    if resolved_dir.exists():
+        import shutil
+        shutil.rmtree(resolved_dir)
+    
+    assert not resolved_dir.exists()
+    
+    with pytest.raises(RuntimeError, match="Directory .* does not exist and create_directory is False"):
+        task._save_file("test content")
+
+
+def test_create_directory_default():
+    """Test that create_directory defaults to True for backward compatibility."""
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        output_file="output.txt",
+    )
+    
+    assert task.create_directory is True
+
+
+def test_create_directory_with_existing_directory():
+    """Test that create_directory=False works when directory already exists."""
+    from pathlib import Path
+    
+    output_path = "existing_test_dir/output.txt"
+    
+    resolved_path = Path(output_path).expanduser().resolve()
+    resolved_dir = resolved_path.parent
+    resolved_dir.mkdir(parents=True, exist_ok=True)
+    
+    task = Task(
+        description="Test task",
+        expected_output="Test output",
+        output_file=output_path,
+        create_directory=False,
+    )
+    
+    task._save_file("test content")
+    assert resolved_path.exists()
+    
+    if resolved_path.exists():
+        resolved_path.unlink()
+    if resolved_dir.exists():
+        import shutil
+        shutil.rmtree(resolved_dir)
+
+
+def test_github_issue_3149_reproduction():
+    """Test that reproduces the exact issue from GitHub issue #3149."""
+    task = Task(
+        description="Test task for issue reproduction",
+        expected_output="Test output",
+        output_file="test_output.txt",
+        create_directory=True,
+    )
+    
+    assert task.create_directory is True
+    assert task.output_file == "test_output.txt"
+
+
@pytest.mark.vcr(filter_headers=["authorization"])
 def test_task_execution_times():
    researcher = Agent(
--- a/tests/test_flow_initial_state_fix.py
+++ b/tests/test_flow_initial_state_fix.py
@@ -1,181 +0,0 @@
-"""Test Flow initial_state BaseModel dict coercion fix for issue #3147"""
-
-from pydantic import BaseModel
-
-from crewai.flow.flow import Flow
-
-
-class StateWithItems(BaseModel):
-    items: list = [1, 2, 3]
-    metadata: dict = {"x": 1}
-
-
-class StateWithKeys(BaseModel):
-    keys: list = ["a", "b", "c"]
-    data: str = "test"
-
-
-class StateWithValues(BaseModel):
-    values: list = [10, 20, 30]
-    name: str = "example"
-
-
-class StateWithGet(BaseModel):
-    get: str = "method_name"
-    config: dict = {"enabled": True}
-
-
-class StateWithPop(BaseModel):
-    pop: int = 42
-    settings: list = ["option1", "option2"]
-
-
-class StateWithUpdate(BaseModel):
-    update: bool = True
-    version: str = "1.0.0"
-
-
-class StateWithClear(BaseModel):
-    clear: str = "action"
-    status: str = "active"
-
-
-def test_flow_initial_state_items_field():
-    """Test that BaseModel with 'items' field preserves structure and doesn't get dict coercion."""
-    flow = Flow(initial_state=StateWithItems())
-    flow.kickoff()
-    
-    assert isinstance(flow.state, StateWithItems)
-    assert not isinstance(flow.state, dict)
-    
-    assert isinstance(flow.state.items, list)
-    assert flow.state.items == [1, 2, 3]
-    assert len(flow.state.items) == 3
-    
-    assert flow.state.metadata == {"x": 1}
-
-
-def test_flow_initial_state_keys_field():
-    """Test that BaseModel with 'keys' field preserves structure."""
-    flow = Flow(initial_state=StateWithKeys())
-    flow.kickoff()
-    
-    assert isinstance(flow.state, StateWithKeys)
-    assert isinstance(flow.state.keys, list)
-    assert flow.state.keys == ["a", "b", "c"]
-    assert len(flow.state.keys) == 3
-    assert flow.state.data == "test"
-
-
-def test_flow_initial_state_values_field():
-    """Test that BaseModel with 'values' field preserves structure."""
-    flow = Flow(initial_state=StateWithValues())
-    flow.kickoff()
-    
-    assert isinstance(flow.state, StateWithValues)
-    assert isinstance(flow.state.values, list)
-    assert flow.state.values == [10, 20, 30]
-    assert len(flow.state.values) == 3
-    assert flow.state.name == "example"
-
-
-def test_flow_initial_state_get_field():
-    """Test that BaseModel with 'get' field preserves structure."""
-    flow = Flow(initial_state=StateWithGet())
-    flow.kickoff()
-    
-    assert isinstance(flow.state, StateWithGet)
-    assert isinstance(flow.state.get, str)
-    assert flow.state.get == "method_name"
-    assert flow.state.config == {"enabled": True}
-
-
-def test_flow_initial_state_pop_field():
-    """Test that BaseModel with 'pop' field preserves structure."""
-    flow = Flow(initial_state=StateWithPop())
-    flow.kickoff()
-    
-    assert isinstance(flow.state, StateWithPop)
-    assert isinstance(flow.state.pop, int)
-    assert flow.state.pop == 42
-    assert flow.state.settings == ["option1", "option2"]
-
-
-def test_flow_initial_state_update_field():
-    """Test that BaseModel with 'update' field preserves structure."""
-    flow = Flow(initial_state=StateWithUpdate())
-    flow.kickoff()
-    
-    assert isinstance(flow.state, StateWithUpdate)
-    assert isinstance(flow.state.update, bool)
-    assert flow.state.update is True
-    assert flow.state.version == "1.0.0"
-
-
-def test_flow_initial_state_clear_field():
-    """Test that BaseModel with 'clear' field preserves structure."""
-    flow = Flow(initial_state=StateWithClear())
-    flow.kickoff()
-    
-    assert isinstance(flow.state, StateWithClear)
-    assert isinstance(flow.state.clear, str)
-    assert flow.state.clear == "action"
-    assert flow.state.status == "active"
-
-
-def test_flow_state_modification_preserves_basemodel():
-    """Test that modifying flow state preserves BaseModel structure."""
-    
-    class ModifiableState(BaseModel):
-        items: list = [1, 2, 3]
-        counter: int = 0
-    
-    class TestFlow(Flow[ModifiableState]):
-        @Flow.start()
-        def modify_state(self):
-            self.state.counter += 1
-            self.state.items.append(4)
-    
-    flow = TestFlow(initial_state=ModifiableState())
-    flow.kickoff()
-    
-    assert isinstance(flow.state, ModifiableState)
-    assert not isinstance(flow.state, dict)
-    
-    assert flow.state.counter == 1
-    assert flow.state.items == [1, 2, 3, 4]
-
-
-def test_flow_with_inputs_preserves_basemodel():
-    """Test that providing inputs to flow preserves BaseModel structure."""
-    
-    class InputState(BaseModel):
-        items: list = []
-        name: str = ""
-    
-    flow = Flow(initial_state=InputState())
-    flow.kickoff(inputs={"name": "test_flow", "items": [5, 6, 7]})
-    
-    assert isinstance(flow.state, InputState)
-    assert not isinstance(flow.state, dict)
-    
-    assert flow.state.name == "test_flow"
-    assert flow.state.items == [5, 6, 7]
-
-
-def test_reproduction_case_from_issue_3147():
-    """Test the exact reproduction case from GitHub issue #3147."""
-    
-    class MyState(BaseModel):
-        items: list = [1, 2, 3]
-        metadata: dict = {"x": 1}
-    
-    flow = Flow(initial_state=MyState())
-    flow.kickoff()
-    
-    assert isinstance(flow.state.items, list)
-    assert len(flow.state.items) == 3
-    assert flow.state.items == [1, 2, 3]
-    
-    assert not callable(flow.state.items)
-    assert str(type(flow.state.items)) != "<class 'builtin_function_or_method'>"
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
Devin AI	cf5f0a3553	fix: regenerate uv.lock to resolve TOML parse errors - Remove corrupted uv.lock file that had missing version field - Regenerate with uv sync to ensure proper dependency resolution Co-Authored-By: João <joao@crewai.com>	2025-07-17 09:07:45 +00:00
Devin AI	bb19998fe7	feat: add extra_headers parameter to LLM class - Add extra_headers parameter to LLM constructor for custom authentication headers - Update _prepare_completion_params to pass extra_headers to LiteLLM - Add comprehensive tests for extra_headers functionality - Ensure backward compatibility with None default value Fixes #3177 Co-Authored-By: João <joao@crewai.com>	2025-07-17 09:07:32 +00:00
Lucas Gomide	bf248d5118	docs: fix neatlogs documentation (#3171 ) Some checks failed Notify Downstream / notify-downstream (push) Has been cancelled Details Mark stale issues and pull requests / stale (push) Has been cancelled Details	2025-07-16 21:18:04 -04:00
Lorenze Jay	2490e8cd46	Update CrewAI version to 0.148.0 in project templates and dependencies (#3172 ) Some checks failed Notify Downstream / notify-downstream (push) Has been cancelled Details * Update CrewAI version to 0.148.0 in project templates and dependencies * Update crewai-tools dependency to version 0.55.0 in pyproject.toml and uv.lock for improved functionality and performance.	2025-07-16 12:36:43 -07:00
Lucas Gomide	9b67e5a15f	Emit events about Agent eval (#3168 ) * feat: emit events abou Agent Eval We are triggering events when an evaluation has started/completed/failed * style: fix type checking issues	2025-07-16 13:18:59 -04:00
Lucas Gomide	6ebb6c9b63	Supporting eval single Agent/LiteAgent (#3167 ) Some checks failed Notify Downstream / notify-downstream (push) Has been cancelled Details Mark stale issues and pull requests / stale (push) Has been cancelled Details * refactor: rely on task completion event to evaluate agents * feat: remove Crew dependency to evaluate agent * feat: drop execution_context in AgentEvaluator * chore: drop experimental Agent Eval feature from stable crew.test * feat: support eval LiteAgent * resolve linter issues	2025-07-15 09:22:41 -04:00
Lucas Gomide	53f674be60	chore: remove evaluation folder (#3159 ) This folder was moved to `experimental` folder	2025-07-15 08:30:20 -04:00
Paras Sakarwal	11717a5213	docs: added integration with neatlogs (#3138 ) Some checks failed Notify Downstream / notify-downstream (push) Has been cancelled Details Mark stale issues and pull requests / stale (push) Has been cancelled Details	2025-07-14 11:08:24 -04:00
Lucas Gomide	b6d699f764	Implement thread-safe AgentEvaluator (#3157 ) Some checks failed Notify Downstream / notify-downstream (push) Has been cancelled Details * refactor: implement thread-safe AgentEvaluator with hybrid state management * chore: remove useless comments	2025-07-14 10:05:42 -04:00
Lucas Gomide	5b15061b87	test: add test helper to assert Agent Experiments (#3156 )	2025-07-14 09:24:49 -04:00
Lucas Gomide	1b6b2b36d9	Introduce Evaluator Experiment (#3133 ) * feat: add exchanged messages in LLMCallCompletedEvent * feat: add GoalAlignment metric for Agent evaluation * feat: add SemanticQuality metric for Agent evaluation * feat: add Tool Metrics for Agent evaluation * feat: add Reasoning Metrics for Agent evaluation, still in progress * feat: add AgentEvaluator class This class will evaluate Agent' results and report to user * fix: do not evaluate Agent by default This is a experimental feature we still need refine it further * test: add Agent eval tests * fix: render all feedback per iteration * style: resolve linter issues * style: fix mypy issues * fix: allow messages be empty on LLMCallCompletedEvent * feat: add Experiment evaluation framework with baseline comparison * fix: reset evaluator for each experiement iteraction * fix: fix track of new test cases * chore: split Experimental evaluation classes * refactor: remove unused method * refactor: isolate Console print in a dedicated class * fix: make crew required to run an experiment * fix: use time-aware to define experiment result * test: add tests for Evaluator Experiment * style: fix linter issues * fix: encode string before hashing * style: resolve linter issues * feat: add experimental folder for beta features (#3141) * test: move tests to experimental folder	2025-07-14 09:06:45 -04:00
devin-ai-integration[bot]	3ada4053bd	Fix #3149 : Add missing create_directory parameter to Task class (#3150 ) * Fix #3149: Add missing create_directory parameter to Task class - Add create_directory field with default value True for backward compatibility - Update _save_file method to respect create_directory parameter - Add comprehensive tests covering all scenarios - Maintain existing behavior when create_directory=True (default) The create_directory parameter was documented but missing from implementation. Users can now control directory creation behavior: - create_directory=True (default): Creates directories if they don't exist - create_directory=False: Raises RuntimeError if directory doesn't exist Fixes issue where users got TypeError when trying to use the documented create_directory parameter. Co-Authored-By: João <joao@crewai.com> * Fix lint: Remove unused import os from test_create_directory_true - Removes F401 lint error: 'os' imported but unused - All lint checks should now pass Co-Authored-By: João <joao@crewai.com> --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: João <joao@crewai.com>	2025-07-14 08:15:41 -04:00