fix: allow messages to be empty on LLMCallCompletedEvent

style: fix mypy issues
style: resolve linter issues
2026-01-05 22:28:29 +00:00 · 2025-07-11 14:05:25 -03:00 · 2025-07-11 13:02:34 -03:00 · 2025-07-11 13:02:34 -03:00 · 2025-07-11 13:02:34 -03:00 · 2025-07-11 13:02:34 -03:00
73 changed files with 3674 additions and 6888 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -26,5 +26,4 @@ test_flow.html
 crewairules.mdc
 plan.md
 conceptual_plan.md
-build_image
-chromadb-*.lock
+build_image
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -9,7 +9,12 @@
  },
  "favicon": "/images/favicon.svg",
  "contextual": {
-    "options": ["copy", "view", "chatgpt", "claude"]
+    "options": [
+      "copy",
+      "view",
+      "chatgpt",
+      "claude"
+    ]
  },
  "navigation": {
    "languages": [
@@ -50,22 +55,32 @@
            "groups": [
              {
                "group": "Get Started",
-                "pages": ["en/introduction", "en/installation", "en/quickstart"]
+                "pages": [
+                  "en/introduction",
+                  "en/installation",
+                  "en/quickstart"
+                ]
              },
              {
                "group": "Guides",
                "pages": [
                  {
                    "group": "Strategy",
-                    "pages": ["en/guides/concepts/evaluating-use-cases"]
+                    "pages": [
+                      "en/guides/concepts/evaluating-use-cases"
+                    ]
                  },
                  {
                    "group": "Agents",
-                    "pages": ["en/guides/agents/crafting-effective-agents"]
+                    "pages": [
+                      "en/guides/agents/crafting-effective-agents"
+                    ]
                  },
                  {
                    "group": "Crews",
-                    "pages": ["en/guides/crews/first-crew"]
+                    "pages": [
+                      "en/guides/crews/first-crew"
+                    ]
                  },
                  {
                    "group": "Flows",
@@ -79,6 +94,7 @@
                    "pages": [
                      "en/guides/advanced/customizing-prompts",
                      "en/guides/advanced/fingerprinting"
+
                    ]
                  }
                ]
@@ -225,7 +241,6 @@
                  "en/observability/langtrace",
                  "en/observability/maxim",
                  "en/observability/mlflow",
-                  "en/observability/neatlogs",
                  "en/observability/openlit",
                  "en/observability/opik",
                  "en/observability/patronus-evaluation",
@@ -259,7 +274,9 @@
              },
              {
                "group": "Telemetry",
-                "pages": ["en/telemetry"]
+                "pages": [
+                  "en/telemetry"
+                ]
              }
            ]
          },
@@ -268,7 +285,9 @@
            "groups": [
              {
                "group": "Getting Started",
-                "pages": ["en/enterprise/introduction"]
+                "pages": [
+                  "en/enterprise/introduction"
+                ]
              },
              {
                "group": "Features",
@@ -323,7 +342,9 @@
              },
              {
                "group": "Resources",
-                "pages": ["en/enterprise/resources/frequently-asked-questions"]
+                "pages": [
+                  "en/enterprise/resources/frequently-asked-questions"
+                ]
              }
            ]
          },
@@ -332,7 +353,9 @@
            "groups": [
              {
                "group": "Getting Started",
-                "pages": ["en/api-reference/introduction"]
+                "pages": [
+                  "en/api-reference/introduction"
+                ]
              },
              {
                "group": "Endpoints",
@@ -342,13 +365,16 @@
          },
          {
            "tab": "Examples",
-            "groups": [
+                        "groups": [
              {
                "group": "Examples",
-                "pages": ["en/examples/example"]
+                "pages": [
+                  "en/examples/example"
+                ]
              }
            ]
          }
+
        ]
      },
      {
@@ -399,15 +425,21 @@
                "pages": [
                  {
                    "group": "Estratégia",
-                    "pages": ["pt-BR/guides/concepts/evaluating-use-cases"]
+                    "pages": [
+                      "pt-BR/guides/concepts/evaluating-use-cases"
+                    ]
                  },
                  {
                    "group": "Agentes",
-                    "pages": ["pt-BR/guides/agents/crafting-effective-agents"]
+                    "pages": [
+                      "pt-BR/guides/agents/crafting-effective-agents"
+                    ]
                  },
                  {
                    "group": "Crews",
-                    "pages": ["pt-BR/guides/crews/first-crew"]
+                    "pages": [
+                      "pt-BR/guides/crews/first-crew"
+                    ]
                  },
                  {
                    "group": "Flows",
@@ -600,7 +632,9 @@
              },
              {
                "group": "Telemetria",
-                "pages": ["pt-BR/telemetry"]
+                "pages": [
+                  "pt-BR/telemetry"
+                ]
              }
            ]
          },
@@ -609,7 +643,9 @@
            "groups": [
              {
                "group": "Começando",
-                "pages": ["pt-BR/enterprise/introduction"]
+                "pages": [
+                  "pt-BR/enterprise/introduction"
+                ]
              },
              {
                "group": "Funcionalidades",
@@ -674,7 +710,9 @@
            "groups": [
              {
                "group": "Começando",
-                "pages": ["pt-BR/api-reference/introduction"]
+                "pages": [
+                  "pt-BR/api-reference/introduction"
+                ]
              },
              {
                "group": "Endpoints",
@@ -684,13 +722,16 @@
          },
          {
            "tab": "Exemplos",
-            "groups": [
+                        "groups": [
              {
                "group": "Exemplos",
-                "pages": ["pt-BR/examples/example"]
+                "pages": [
+                  "pt-BR/examples/example"
+                ]
              }
            ]
          }
+
        ]
      }
    ]
--- a/docs/en/concepts/crews.mdx
+++ b/docs/en/concepts/crews.mdx
@@ -32,7 +32,6 @@ A crew in crewAI represents a collaborative group of agents working together to
 | **Prompt File** _(optional)_          | `prompt_file`          | Path to the prompt JSON file to be used for the crew.                                                                                                                                                                                                     |
 | **Planning** *(optional)*             | `planning`             | Adds planning ability to the Crew. When activated before each Crew iteration, all Crew data is sent to an AgentPlanner that will plan the tasks and this plan will be added to each task description.                                                     |
 | **Planning LLM** *(optional)*         | `planning_llm`         | The language model used by the AgentPlanner in a planning process.                                                                                                                                                                                        |
-| **Knowledge Sources** _(optional)_    | `knowledge_sources`    | Knowledge sources available at the crew level, accessible to all the agents.                                                                                                                                                                                    |

 <Tip>
 **Crew Max RPM**: The `max_rpm` attribute sets the maximum number of requests per minute the crew can perform to avoid rate limits and will override individual agents' `max_rpm` settings if you set it.
--- a/docs/en/concepts/tasks.mdx
+++ b/docs/en/concepts/tasks.mdx
@@ -57,7 +57,6 @@ crew = Crew(
 | **Output JSON** _(optional)_     | `output_json`     | `Optional[Type[BaseModel]]`   | A Pydantic model to structure the JSON output.                                                                       |
 | **Output Pydantic** _(optional)_ | `output_pydantic` | `Optional[Type[BaseModel]]`   | A Pydantic model for task output.                                                                                    |
 | **Callback** _(optional)_        | `callback`        | `Optional[Any]`               | Function/object to be executed after task completion.                                                                |
-| **Guardrail** _(optional)_       | `guardrail`       | `Optional[Union[Callable, str]]` | Function or string description to validate task output before proceeding to next task.                            |

 ## Creating Tasks

@@ -87,7 +86,6 @@ research_task:
  expected_output: >
    A list with 10 bullet points of the most relevant information about {topic}
  agent: researcher
-  guardrail: ensure each bullet contains a minimum of 100 words

 reporting_task:
  description: >
@@ -334,13 +332,9 @@ Task guardrails provide a way to validate and transform task outputs before they
 are passed to the next task. This feature helps ensure data quality and provides
 feedback to agents when their output doesn't meet specific criteria.

-**Guardrails can be defined in two ways:**
-1. **Function-based guardrails**: Python functions that implement custom validation logic
-2. **String-based guardrails**: Natural language descriptions that are automatically converted to LLM-powered validation
+### Using Task Guardrails

-### Function-Based Guardrails
-
-To add a function-based guardrail to a task, provide a validation function through the `guardrail` parameter:
+To add a guardrail to a task, provide a validation function through the `guardrail` parameter:

 ```python Code
 from typing import Tuple, Union, Dict, Any
@@ -378,82 +372,9 @@ blog_task = Task(
   - On success: it returns a tuple of `(bool, Any)`. For example: `(True, validated_result)`
   - On Failure: it returns a tuple of `(bool, str)`. For example: `(False, "Error message explain the failure")`

-### String-Based Guardrails
+### LLMGuardrail

-String-based guardrails allow you to describe validation criteria in natural language. When you provide a string instead of a function, CrewAI automatically converts it to an `LLMGuardrail` that uses an AI agent to validate the task output.
-
-#### Using String Guardrails in Python
-
-```python Code
-from crewai import Task
-
-# Simple string-based guardrail
-blog_task = Task(
-    description="Write a blog post about AI",
-    expected_output="A blog post under 200 words",
-    agent=blog_agent,
-    guardrail="Ensure the blog post is under 200 words and includes practical examples"
-)
-
-# More complex validation criteria
-research_task = Task(
-    description="Research AI trends for 2025",
-    expected_output="A comprehensive research report",
-    agent=research_agent,
-    guardrail="Ensure each finding includes a credible source and is backed by recent data from 2024-2025"
-)
-```
-
-#### Using String Guardrails in YAML
-
-```yaml
-research_task:
-  description: Research the latest AI developments
-  expected_output: A list of 10 bullet points about AI
-  agent: researcher
-  guardrail: ensure each bullet contains a minimum of 100 words
-
-validation_task:
-  description: Validate the research findings
-  expected_output: A validation report
-  agent: validator
-  guardrail: confirm all sources are from reputable publications and published within the last 2 years
-```
-
-#### How String Guardrails Work
-
-When you provide a string guardrail, CrewAI automatically:
-1. Creates an `LLMGuardrail` instance using the string as validation criteria
-2. Uses the task's agent LLM to power the validation
-3. Creates a temporary validation agent that checks the output against your criteria
-4. Returns detailed feedback if validation fails
-
-This approach is ideal when you want to use natural language to describe validation rules without writing custom validation functions.
-
-### LLMGuardrail Class
-
-The `LLMGuardrail` class is the underlying mechanism that powers string-based guardrails. You can also use it directly for more advanced control:
-
-```python Code
-from crewai import Task
-from crewai.tasks.llm_guardrail import LLMGuardrail
-from crewai.llm import LLM
-
-# Create a custom LLMGuardrail with specific LLM
-custom_guardrail = LLMGuardrail(
-    description="Ensure the response contains exactly 5 bullet points with proper citations",
-    llm=LLM(model="gpt-4o-mini")
-)
-
-task = Task(
-    description="Research AI safety measures",
-    expected_output="A detailed analysis with bullet points",
-    agent=research_agent,
-    guardrail=custom_guardrail
-)
-```
-
-**Note**: When you use a string guardrail, CrewAI automatically creates an `LLMGuardrail` instance using your task's agent LLM. Using `LLMGuardrail` directly gives you more control over the validation process and LLM selection.
+The `LLMGuardrail` class offers a robust mechanism for validating task outputs.

 ### Error Handling Best Practices

@@ -877,7 +798,166 @@ While creating and executing tasks, certain validation mechanisms are in place t

 These validations help in maintaining the consistency and reliability of task executions within the crewAI framework.

+## Task Guardrails

+Task guardrails provide a powerful way to validate, transform, or filter task outputs before they are passed to the next task. Guardrails are optional functions that execute before the next task starts, allowing you to ensure that task outputs meet specific requirements or formats.
+
+### Basic Usage
+
+#### Define your own logic to validate
+
+```python Code
+from typing import Tuple, Union
+from crewai import Task
+
+def validate_json_output(result: str) -> Tuple[bool, Union[dict, str]]:
+    """Validate that the output is valid JSON."""
+    try:
+        json_data = json.loads(result)
+        return (True, json_data)
+    except json.JSONDecodeError:
+        return (False, "Output must be valid JSON")
+
+task = Task(
+    description="Generate JSON data",
+    expected_output="Valid JSON object",
+    guardrail=validate_json_output
+)
+```
+
+#### Leverage a no-code approach for validation
+
+```python Code
+from crewai import Task
+
+task = Task(
+    description="Generate JSON data",
+    expected_output="Valid JSON object",
+    guardrail="Ensure the response is a valid JSON object"
+)
+```
+
+#### Using YAML
+
+```yaml
+research_task:
+  ...
+  guardrail: make sure each bullet contains a minimum of 100 words
+  ...
+```
+
+```python Code
+@CrewBase
+class InternalCrew:
+    agents_config = "config/agents.yaml"
+    tasks_config = "config/tasks.yaml"
+
+    ...
+    @task
+    def research_task(self):
+        return Task(config=self.tasks_config["research_task"])  # type: ignore[index]
+    ...
+```
+
+
+#### Use custom models for code generation
+
+```python Code
+from crewai import Task
+from crewai.llm import LLM
+
+task = Task(
+    description="Generate JSON data",
+    expected_output="Valid JSON object",
+    guardrail=LLMGuardrail(
+        description="Ensure the response is a valid JSON object",
+        llm=LLM(model="gpt-4o-mini"),
+    )
+)
+```
+
+### How Guardrails Work
+
+1. **Optional Attribute**: Guardrails are an optional attribute at the task level, allowing you to add validation only where needed.
+2. **Execution Timing**: The guardrail function is executed before the next task starts, ensuring valid data flow between tasks.
+3. **Return Format**: Guardrails must return a tuple of `(success, data)`:
+   - If `success` is `True`, `data` is the validated/transformed result
+   - If `success` is `False`, `data` is the error message
+4. **Result Routing**:
+   - On success (`True`), the result is automatically passed to the next task
+   - On failure (`False`), the error is sent back to the agent to generate a new answer
+
+### Common Use Cases
+
+#### Data Format Validation
+```python Code
+def validate_email_format(result: str) -> Tuple[bool, Union[str, str]]:
+    """Ensure the output contains a valid email address."""
+    import re
+    email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
+    if re.match(email_pattern, result.strip()):
+        return (True, result.strip())
+    return (False, "Output must be a valid email address")
+```
+
+#### Content Filtering
+```python Code
+def filter_sensitive_info(result: str) -> Tuple[bool, Union[str, str]]:
+    """Remove or validate sensitive information."""
+    sensitive_patterns = ['SSN:', 'password:', 'secret:']
+    for pattern in sensitive_patterns:
+        if pattern.lower() in result.lower():
+            return (False, f"Output contains sensitive information ({pattern})")
+    return (True, result)
+```
+
+#### Data Transformation
+```python Code
+def normalize_phone_number(result: str) -> Tuple[bool, Union[str, str]]:
+    """Ensure phone numbers are in a consistent format."""
+    import re
+    digits = re.sub(r'\D', '', result)
+    if len(digits) == 10:
+        formatted = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
+        return (True, formatted)
+    return (False, "Output must be a 10-digit phone number")
+```
+
+### Advanced Features
+
+#### Chaining Multiple Validations
+```python Code
+def chain_validations(*validators):
+    """Chain multiple validators together."""
+    def combined_validator(result):
+        for validator in validators:
+            success, data = validator(result)
+            if not success:
+                return (False, data)
+            result = data
+        return (True, result)
+    return combined_validator
+
+# Usage
+task = Task(
+    description="Get user contact info",
+    expected_output="Email and phone",
+    guardrail=chain_validations(
+        validate_email_format,
+        filter_sensitive_info
+    )
+)
+```
+
+#### Custom Retry Logic
+```python Code
+task = Task(
+    description="Generate data",
+    expected_output="Valid data",
+    guardrail=validate_data,
+    max_retries=5  # Override default retry limit
+)
+```

 ## Creating Directories when Saving Files

--- a/docs/en/mcp/overview.mdx
+++ b/docs/en/mcp/overview.mdx
@@ -172,60 +172,6 @@ def another_agent(self):
    )
 ```

-### Using Multiple MCP Servers with CrewBase
-
-You can configure multiple MCP servers and assign different servers and tools to different agents. Use a dictionary to pass multiple named MCP servers.
-
-```python
-@CrewBase
-class CrewWithMultipleMCP:
-  # ... define your agents and tasks config file ...
-
-  # MCP servers keyed by server name
-  mcp_server_params = {
-      "web_tools": {
-        "url": "http://localhost:8000/mcp",
-        "transport": "streamable-http"
-      },
-      "data_tools": {
-        "url":
-        "http://localhost:8001/sse",
-        "transport": "sse"
-      },
-      "local_tools": StdioServerParameters(
-          command="python3",
-          args=["servers/local_server.py"],
-          env={"UV_PYTHON": "3.12", **os.environ},
-      )
-  }
-
-  @agent
-  def web_researcher(self):
-      # Use tools from specific server
-      return Agent(
-          config=self.agents_config["web_researcher"],
-          tools=self.get_mcp_tools(server="web_tools")
-      )
-
-  @agent
-  def data_analyst(self):
-      # Use specific tools from specific server
-      return Agent(
-          config=self.agents_config["data_analyst"],
-          tools=self.get_mcp_tools("analyze_csv", "create_chart", server="data_tools")
-      )
-
-  @agent
-  def multi_tool_agent(self):
-      # Use tools from all servers
-      return Agent(
-          config=self.agents_config["multi_tool_agent"],
-          tools=self.get_mcp_tools()  # No server specified = all tools
-      )
-
-  # ... rest of your crew setup ...
-```
-
 ## Explore MCP Integrations

 <CardGroup cols={2}>
--- a/docs/en/observability/neatlogs.mdx
+++ b/docs/en/observability/neatlogs.mdx
@@ -1,134 +0,0 @@
----
-title: Neatlogs Integration
-description: Understand, debug, and share your CrewAI agent runs
-icon: magnifying-glass-chart
----
-
-# Introduction
-
-Neatlogs helps you **see what your agent did**, **why**, and **share it**.
-
-It captures every step: thoughts, tool calls, responses, evaluations. No raw logs. Just clear, structured traces. Great for debugging and collaboration.
-
-## Why use Neatlogs?
-
-CrewAI agents use multiple tools and reasoning steps. When something goes wrong, you need context — not just errors.
-
-Neatlogs lets you:
-
-- Follow the full decision path
-- Add feedback directly on steps
-- Chat with the trace using AI assistant
-- Share runs publicly for feedback
-- Turn insights into tasks
-
-All in one place.
-
-Manage your traces effortlessly
-
-![Traces](/images/neatlogs-1.png)
-![Trace Response](/images/neatlogs-2.png)
-
-The best UX to view a CrewAI trace. Post comments anywhere you want. Use AI to debug.
-
-![Trace Details](/images/neatlogs-3.png)
-![Ai Chat Bot With A Trace](/images/neatlogs-4.png)
-![Comments Drawer](/images/neatlogs-5.png)
-
-## Core Features
-
-- **Trace Viewer**: Track thoughts, tools, and decisions in sequence
-- **Inline Comments**: Tag teammates on any trace step
-- **Feedback & Evaluation**: Mark outputs as correct or incorrect
-- **Error Highlighting**: Automatic flagging of API/tool failures
-- **Task Conversion**: Convert comments into assigned tasks
-- **Ask the Trace (AI)**: Chat with your trace using Neatlogs AI bot
-- **Public Sharing**: Publish trace links to your community
-
-## Quick Setup with CrewAI
-
-<Steps>
-  <Step title="Sign Up & Get API Key">
-    Visit [neatlogs.com](https://neatlogs.com/?utm_source=crewAI-docs), create a project, copy the API key.
-  </Step>
-  <Step title="Install SDK">
-    ```bash
-    pip install neatlogs
-    ```
-    (Latest version 0.8.0, Python 3.8+; MIT license)
-  </Step>
-  <Step title="Initialize Neatlogs">
-    Before starting Crew agents, add:
-
-    ```python
-    import neatlogs
-    neatlogs.init("YOUR_PROJECT_API_KEY")
-    ```
-
-    Agents run as usual. Neatlogs captures everything automatically.
-
-  </Step>
-</Steps>
-
-
-
-## Under the Hood
-
-According to GitHub, Neatlogs:
-
-- Captures thoughts, tool calls, responses, errors, and token stats
-- Supports AI-powered task generation and robust evaluation workflows
-
-All with just two lines of code.
-
-
-
-## Watch It Work
-
-### 🔍 Full Demo (4 min)
-
-<iframe
-  width="100%"
-  height="315"
-  src="https://www.youtube.com/embed/8KDme9T2I7Q?si=b8oHteaBwFNs_Duk"
-  title="YouTube video player"
-  frameBorder="0"
-  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
-  allowFullScreen
-></iframe>
-
-### ⚙️ CrewAI Integration (30 s)
-
-<iframe
-  className="w-full aspect-video rounded-xl"
-  src="https://www.loom.com/embed/9c78b552af43452bb3e4783cb8d91230?sid=e9d7d370-a91a-49b0-809e-2f375d9e801d"
-  title="Loom video player"
-  frameBorder="0"
-  allowFullScreen
-></iframe>
-
-
-
-## Links & Support
-
-- 📘 [Neatlogs Docs](https://docs.neatlogs.com/)
-- 🔐 [Dashboard & API Key](https://app.neatlogs.com/)
-- 🐦 [Follow on Twitter](https://twitter.com/neatlogs)
-- 📧 Contact: hello@neatlogs.com
-- 🛠 [GitHub SDK](https://github.com/NeatLogs/neatlogs)
-
-
-
-## TL;DR
-
-With just:
-
-```bash
-pip install neatlogs
-
-import neatlogs
-neatlogs.init("YOUR_API_KEY")
-
-You can now capture, understand, share, and act on your CrewAI agent runs in seconds.
-No setup overhead. Full trace transparency. Full team collaboration.
-```
--- a/docs/images/neatlogs-1.png
+++ b/docs/images/neatlogs-1.png
--- a/docs/images/neatlogs-2.png
+++ b/docs/images/neatlogs-2.png
--- a/docs/images/neatlogs-3.png
+++ b/docs/images/neatlogs-3.png
--- a/docs/images/neatlogs-4.png
+++ b/docs/images/neatlogs-4.png
--- a/docs/images/neatlogs-5.png
+++ b/docs/images/neatlogs-5.png
--- a/docs/pt-BR/concepts/cli.mdx
+++ b/docs/pt-BR/concepts/cli.mdx
@@ -76,7 +76,6 @@ Exemplo:
 crewai train -n 10 -f my_training_data.pkl
 ```

-```python
 # Exemplo de uso programático do comando train
 n_iterations = 2
 inputs = {"topic": "Treinamento CrewAI"}
@@ -90,7 +89,6 @@ try:
    )
 except Exception as e:
    raise Exception(f"Ocorreu um erro ao treinar a crew: {e}")
-```

 ### 4. Replay

--- a/docs/pt-BR/concepts/tasks.mdx
+++ b/docs/pt-BR/concepts/tasks.mdx
@@ -57,7 +57,6 @@ crew = Crew(
 | **Saída JSON** _(opcional)_      | `output_json`     | `Optional[Type[BaseModel]]`  | Um modelo Pydantic para estruturar a saída em JSON.                                                                |
 | **Output Pydantic** _(opcional)_ | `output_pydantic` | `Optional[Type[BaseModel]]`  | Um modelo Pydantic para a saída da tarefa.                                                                         |
 | **Callback** _(opcional)_        | `callback`        | `Optional[Any]`              | Função/objeto a ser executado após a conclusão da tarefa.                                                          |
-| **Guardrail** _(opcional)_       | `guardrail`       | `Optional[Union[Callable, str]]` | Função ou descrição em string para validar a saída da tarefa antes de prosseguir para a próxima tarefa.        |

 ## Criando Tarefas

@@ -87,7 +86,6 @@ research_task:
  expected_output: >
    Uma lista com 10 tópicos em bullet points das informações mais relevantes sobre {topic}
  agent: researcher
-  guardrail: garanta que cada bullet point contenha no mínimo 100 palavras

 reporting_task:
  description: >
@@ -332,13 +330,9 @@ analysis_task = Task(

 Guardrails (trilhas de proteção) de tarefas fornecem uma maneira de validar e transformar as saídas das tarefas antes que elas sejam passadas para a próxima tarefa. Esse recurso assegura a qualidade dos dados e oferece feedback aos agentes quando sua saída não atende a critérios específicos.

-**Guardrails podem ser definidos de duas maneiras:**
-1. **Guardrails baseados em função**: Funções Python que implementam lógica de validação customizada
-2. **Guardrails baseados em string**: Descrições em linguagem natural que são automaticamente convertidas em validação baseada em LLM
+### Usando Guardrails em Tarefas

-### Guardrails Baseados em Função
-
-Para adicionar um guardrail baseado em função a uma tarefa, forneça uma função de validação por meio do parâmetro `guardrail`:
+Para adicionar um guardrail a uma tarefa, forneça uma função de validação por meio do parâmetro `guardrail`:

 ```python Code
 from typing import Tuple, Union, Dict, Any
@@ -376,82 +370,9 @@ blog_task = Task(
   - Em caso de sucesso: retorna uma tupla `(True, resultado_validado)`
   - Em caso de falha: retorna uma tupla `(False, "mensagem de erro explicando a falha")`

-### Guardrails Baseados em String
+### LLMGuardrail

-Guardrails baseados em string permitem que você descreva critérios de validação em linguagem natural. Quando você fornece uma string em vez de uma função, o CrewAI automaticamente a converte em um `LLMGuardrail` que usa um agente de IA para validar a saída da tarefa.
-
-#### Usando Guardrails de String em Python
-
-```python Code
-from crewai import Task
-
-# Guardrail simples baseado em string
-blog_task = Task(
-    description="Escreva um post de blog sobre IA",
-    expected_output="Um post de blog com menos de 200 palavras",
-    agent=blog_agent,
-    guardrail="Garanta que o post do blog tenha menos de 200 palavras e inclua exemplos práticos"
-)
-
-# Critérios de validação mais complexos
-research_task = Task(
-    description="Pesquise tendências de IA para 2025",
-    expected_output="Um relatório abrangente de pesquisa",
-    agent=research_agent,
-    guardrail="Garanta que cada descoberta inclua uma fonte confiável e seja respaldada por dados recentes de 2024-2025"
-)
-```
-
-#### Usando Guardrails de String em YAML
-
-```yaml
-research_task:
-  description: Pesquise os últimos desenvolvimentos em IA
-  expected_output: Uma lista de 10 bullet points sobre IA
-  agent: researcher
-  guardrail: garanta que cada bullet point contenha no mínimo 100 palavras
-
-validation_task:
-  description: Valide os achados da pesquisa
-  expected_output: Um relatório de validação
-  agent: validator
-  guardrail: confirme que todas as fontes são de publicações respeitáveis e publicadas nos últimos 2 anos
-```
-
-#### Como Funcionam os Guardrails de String
-
-Quando você fornece um guardrail de string, o CrewAI automaticamente:
-1. Cria uma instância `LLMGuardrail` usando a string como critério de validação
-2. Usa o LLM do agente da tarefa para alimentar a validação
-3. Cria um agente temporário de validação que verifica a saída contra seus critérios
-4. Retorna feedback detalhado se a validação falhar
-
-Esta abordagem é ideal quando você quer usar linguagem natural para descrever regras de validação sem escrever funções de validação customizadas.
-
-### Classe LLMGuardrail
-
-A classe `LLMGuardrail` é o mecanismo subjacente que alimenta os guardrails baseados em string. Você também pode usá-la diretamente para maior controle avançado:
-
-```python Code
-from crewai import Task
-from crewai.tasks.llm_guardrail import LLMGuardrail
-from crewai.llm import LLM
-
-# Crie um LLMGuardrail customizado com LLM específico
-custom_guardrail = LLMGuardrail(
-    description="Garanta que a resposta contenha exatamente 5 bullet points com citações adequadas",
-    llm=LLM(model="gpt-4o-mini")
-)
-
-task = Task(
-    description="Pesquise medidas de segurança em IA",
-    expected_output="Uma análise detalhada com bullet points",
-    agent=research_agent,
-    guardrail=custom_guardrail
-)
-```
-
-**Nota**: Quando você usa um guardrail de string, o CrewAI automaticamente cria uma instância `LLMGuardrail` usando o LLM do agente da sua tarefa. Usar `LLMGuardrail` diretamente lhe dá mais controle sobre o processo de validação e seleção de LLM.
+A classe `LLMGuardrail` oferece um mecanismo robusto para validação das saídas das tarefas.

 ### Melhores Práticas de Tratamento de Erros

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,6 @@ dependencies = [
    "tomli>=2.0.2",
    "blinker>=1.9.0",
    "json5>=0.10.0",
-    "portalocker==2.7.0",
 ]

 [project.urls]
@@ -48,7 +47,7 @@ Documentation = "https://docs.crewai.com"
 Repository = "https://github.com/crewAIInc/crewAI"

 [project.optional-dependencies]
-tools = ["crewai-tools~=0.55.0"]
+tools = ["crewai-tools~=0.51.0"]
 embeddings = [
    "tiktoken~=0.8.0"
 ]
--- a/src/crewai/__init__.py
+++ b/src/crewai/__init__.py
@@ -54,7 +54,7 @@ def _track_install_async():

 _track_install_async()

-__version__ = "0.148.0"
+__version__ = "0.141.0"
 __all__ = [
    "Agent",
    "Crew",
--- a/src/crewai/agent.py
+++ b/src/crewai/agent.py
@@ -210,6 +210,7 @@ class Agent(BaseAgent):
                        sources=self.knowledge_sources,
                        embedder=self.embedder,
                        collection_name=self.role,
+                        storage=self.knowledge_storage or None,
                    )
                    self.knowledge.add_sources()
        except (TypeError, ValueError) as e:
@@ -340,8 +341,7 @@ class Agent(BaseAgent):
            self.knowledge_config.model_dump() if self.knowledge_config else {}
        )

-
-        if self.knowledge or (self.crew and self.crew.knowledge):
+        if self.knowledge:
            crewai_event_bus.emit(
                self,
                event=KnowledgeRetrievalStartedEvent(
@@ -353,28 +353,25 @@ class Agent(BaseAgent):
                    task_prompt
                )
                if self.knowledge_search_query:
-                    # Quering agent specific knowledge
-                    if self.knowledge:
-                        agent_knowledge_snippets = self.knowledge.query(
-                            [self.knowledge_search_query], **knowledge_config
-                        )
-                        if agent_knowledge_snippets:
-                            self.agent_knowledge_context = extract_knowledge_context(
-                                agent_knowledge_snippets
-                            )
-                            if self.agent_knowledge_context:
-                                task_prompt += self.agent_knowledge_context
-
-                    # Quering crew specific knowledge
-                    knowledge_snippets = self.crew.query_knowledge(
+                    agent_knowledge_snippets = self.knowledge.query(
                        [self.knowledge_search_query], **knowledge_config
                    )
-                    if knowledge_snippets:
-                        self.crew_knowledge_context = extract_knowledge_context(
-                            knowledge_snippets
+                    if agent_knowledge_snippets:
+                        self.agent_knowledge_context = extract_knowledge_context(
+                            agent_knowledge_snippets
                        )
-                        if self.crew_knowledge_context:
-                            task_prompt += self.crew_knowledge_context
+                        if self.agent_knowledge_context:
+                            task_prompt += self.agent_knowledge_context
+                    if self.crew:
+                        knowledge_snippets = self.crew.query_knowledge(
+                            [self.knowledge_search_query], **knowledge_config
+                        )
+                        if knowledge_snippets:
+                            self.crew_knowledge_context = extract_knowledge_context(
+                                knowledge_snippets
+                            )
+                            if self.crew_knowledge_context:
+                                task_prompt += self.crew_knowledge_context

                    crewai_event_bus.emit(
                        self,
--- a/src/crewai/cli/templates/crew/pyproject.toml
+++ b/src/crewai/cli/templates/crew/pyproject.toml
@@ -5,7 +5,7 @@ description = "{{name}} using crewAI"
 authors = [{ name = "Your Name", email = "you@example.com" }]
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.148.0,<1.0.0"
+    "crewai[tools]>=0.141.0,<1.0.0"
 ]

 [project.scripts]
--- a/src/crewai/cli/templates/flow/pyproject.toml
+++ b/src/crewai/cli/templates/flow/pyproject.toml
@@ -5,7 +5,7 @@ description = "{{name}} using crewAI"
 authors = [{ name = "Your Name", email = "you@example.com" }]
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.148.0,<1.0.0",
+    "crewai[tools]>=0.141.0,<1.0.0",
 ]

 [project.scripts]
--- a/src/crewai/cli/templates/tool/pyproject.toml
+++ b/src/crewai/cli/templates/tool/pyproject.toml
@@ -5,7 +5,7 @@ description = "Power up your crews with {{folder_name}}"
 readme = "README.md"
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.148.0"
+    "crewai[tools]>=0.141.0"
 ]

 [tool.crewai]
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -1313,6 +1313,7 @@ class Crew(FlowTrackable, BaseModel):
        n_iterations: int,
        eval_llm: Union[str, InstanceOf[BaseLLM]],
        inputs: Optional[Dict[str, Any]] = None,
+        include_agent_eval: Optional[bool] = False
    ) -> None:
        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
        try:
@@ -1332,13 +1333,28 @@ class Crew(FlowTrackable, BaseModel):
            )
            test_crew = self.copy()

+            # TODO: Refactor to use a single Evaluator Manager class
            evaluator = CrewEvaluator(test_crew, llm_instance)

+            if include_agent_eval:
+                from crewai.evaluation import create_default_evaluator
+                agent_evaluator = create_default_evaluator(crew=test_crew)
+
            for i in range(1, n_iterations + 1):
                evaluator.set_iteration(i)
+
+                if include_agent_eval:
+                    agent_evaluator.set_iteration(i)
+
                test_crew.kickoff(inputs=inputs)

+                # TODO: Refactor to use ListenerEvents instead of triggering each iteration manually
+                if include_agent_eval:
+                    agent_evaluator.evaluate_current_iteration()
+
            evaluator.print_crew_evaluation_result()
+            if include_agent_eval:
+                agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)

            crewai_event_bus.emit(
                self,
--- a/src/crewai/experimental/evaluation/__init__.py
+++ b/src/crewai/experimental/evaluation/__init__.py
@@ -1,35 +1,40 @@
-from crewai.experimental.evaluation.base_evaluator import (
+from crewai.evaluation.base_evaluator import (
    BaseEvaluator,
    EvaluationScore,
    MetricCategory,
    AgentEvaluationResult
 )

-from crewai.experimental.evaluation.metrics import (
-    SemanticQualityEvaluator,
-    GoalAlignmentEvaluator,
-    ReasoningEfficiencyEvaluator,
+from crewai.evaluation.metrics.semantic_quality_metrics import (
+    SemanticQualityEvaluator
+)
+
+from crewai.evaluation.metrics.goal_metrics import (
+    GoalAlignmentEvaluator
+)
+
+from crewai.evaluation.metrics.reasoning_metrics import (
+    ReasoningEfficiencyEvaluator
+)
+
+
+from crewai.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
 )

-from crewai.experimental.evaluation.evaluation_listener import (
+from crewai.evaluation.evaluation_listener import (
    EvaluationTraceCallback,
    create_evaluation_callbacks
 )

-from crewai.experimental.evaluation.agent_evaluator import (
+
+from crewai.evaluation.agent_evaluator import (
    AgentEvaluator,
    create_default_evaluator
 )

-from crewai.experimental.evaluation.experiment import (
-    ExperimentRunner,
-    ExperimentResults,
-    ExperimentResult
-)
-
 __all__ = [
    "BaseEvaluator",
    "EvaluationScore",
@@ -44,8 +49,5 @@ __all__ = [
    "EvaluationTraceCallback",
    "create_evaluation_callbacks",
    "AgentEvaluator",
-    "create_default_evaluator",
-    "ExperimentRunner",
-    "ExperimentResults",
-    "ExperimentResult"
-]
+    "create_default_evaluator"
+]
--- a/src/crewai/evaluation/agent_evaluator.py
+++ b/src/crewai/evaluation/agent_evaluator.py
@@ -0,0 +1,224 @@
+from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
+from crewai.agent import Agent
+from crewai.task import Task
+from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
+
+from typing import Any, Dict
+from collections import defaultdict
+from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
+from collections.abc import Sequence
+from crewai.crew import Crew
+from crewai.utilities.events.crewai_event_bus import crewai_event_bus
+from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
+
+class AgentEvaluator:
+    """Evaluate the agents of a crew with a set of evaluators, one iteration at a time.
+
+    Results are kept in ``iterations_results``, keyed by iteration number and then by agent role.
+    """
+
+    def __init__(
+        self,
+        evaluators: Sequence[BaseEvaluator] | None = None,
+        crew: Crew | None = None,
+    ):
+        """Register the evaluators for every agent of ``crew`` and set up tracing and display helpers.
+
+        Args:
+            evaluators: Evaluators run for each agent/task pair. Agents are skipped by
+                ``evaluate_current_iteration`` when this is ``None`` or empty.
+            crew: Crew whose agents and tasks are evaluated.
+        """
+        self.crew: Crew | None = crew
+        self.evaluators: Sequence[BaseEvaluator] | None = evaluators
+
+        # Every agent shares the same evaluator sequence, looked up by its stringified id.
+        self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
+        if crew is not None:
+            assert crew and crew.agents is not None
+            for agent in crew.agents:
+                self.agent_evaluators[str(agent.id)] = self.evaluators
+
+        self.callback = create_evaluation_callbacks()
+        self.console_formatter = ConsoleFormatter()
+        self.display_formatter = EvaluationDisplayFormatter()
+
+        self.iteration = 1
+        self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}
+
+    def set_iteration(self, iteration: int) -> None:
+        """Set the iteration number under which subsequent results are stored."""
+        self.iteration = iteration
+
+    def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
+        """Evaluate each agent on its tasks for the current iteration and store the results.
+
+        A task is considered for an agent when the task has no agent assigned or when the
+        assigned agent's id matches. Pairs without a recorded trace are reported and skipped.
+
+        Returns:
+            Mapping of agent role to the evaluation results gathered in this iteration.
+
+        Raises:
+            ValueError: If no crew or no callback is available.
+        """
+        if not self.crew:
+            raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
+
+        if not self.callback:
+            raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")
+
+        from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
+        evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)
+
+        # Build the work list once so the progress total always equals the number of
+        # progress updates below; counting separately missed tasks with no assigned agent.
+        pending = [
+            (agent, task)
+            for agent in self.crew.agents
+            if self.agent_evaluators.get(str(agent.id))
+            for task in self.crew.tasks
+            if not task.agent or str(task.agent.id) == str(agent.id)
+        ]
+
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[bold blue]{task.description}[/bold blue]"),
+            BarColumn(),
+            TextColumn("{task.percentage:.0f}% completed"),
+            console=self.console_formatter.console
+        ) as progress:
+            eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=len(pending))
+
+            for agent, task in pending:
+                trace = self.callback.get_trace(str(agent.id), str(task.id))
+                if not trace:
+                    self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
+                    progress.update(eval_task, advance=1)
+                    continue
+
+                with crewai_event_bus.scoped_handlers():
+                    result = self.evaluate(
+                        agent=agent,
+                        task=task,
+                        execution_trace=trace,
+                        final_output=task.output
+                    )
+                    evaluation_results[agent.role].append(result)
+                    progress.update(eval_task, advance=1)
+
+        self.iterations_results[self.iteration] = evaluation_results
+        return evaluation_results
+
+    def get_evaluation_results(self):
+        """Return the current iteration's results, evaluating the iteration first if needed."""
+        if self.iteration in self.iterations_results:
+            return self.iterations_results[self.iteration]
+
+        return self.evaluate_current_iteration()
+
+    def display_results_with_iterations(self):
+        """Delegate to the display formatter's summary view for all stored iterations."""
+        self.display_formatter.display_summary_results(self.iterations_results)
+
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
+        """Aggregate the current iteration's results per agent and display them.
+
+        Args:
+            strategy: Aggregation strategy forwarded to the display formatter.
+            include_evaluation_feedback: When true, also call ``display_evaluation_with_feedback``.
+
+        Returns:
+            Mapping of agent role to its aggregated evaluation result.
+        """
+        agent_results = {}
+        with crewai_event_bus.scoped_handlers():
+            task_results = self.get_evaluation_results()
+            for agent_role, results in task_results.items():
+                if not results:
+                    continue
+
+                agent_id = results[0].agent_id
+
+                aggregated_result = self.display_formatter._aggregate_agent_results(
+                    agent_id=agent_id,
+                    agent_role=agent_role,
+                    results=results,
+                    strategy=strategy
+                )
+
+                agent_results[agent_role] = aggregated_result
+
+
+            # The summary is shown only when the current iteration is the latest one stored.
+            if self.iteration == max(self.iterations_results.keys()):
+                self.display_results_with_iterations()
+
+            if include_evaluation_feedback:
+                self.display_evaluation_with_feedback()
+
+        return agent_results
+
+    def display_evaluation_with_feedback(self):
+        """Delegate to the display formatter's feedback view for all stored iterations."""
+        self.display_formatter.display_evaluation_with_feedback(self.iterations_results)
+
+    def evaluate(
+        self,
+        agent: Agent,
+        task: Task,
+        execution_trace: Dict[str, Any],
+        final_output: Any
+    ) -> AgentEvaluationResult:
+        """Run every configured evaluator on one agent/task execution.
+
+        An evaluator that raises is reported on the console and its metric is left out.
+
+        Returns:
+            Result holding one score per metric category that evaluated successfully.
+        """
+        result = AgentEvaluationResult(
+            agent_id=str(agent.id),
+            task_id=str(task.id)
+        )
+        assert self.evaluators is not None
+        for evaluator in self.evaluators:
+            try:
+                score = evaluator.evaluate(
+                    agent=agent,
+                    task=task,
+                    execution_trace=execution_trace,
+                    final_output=final_output
+                )
+                result.metrics[evaluator.metric_category] = score
+            except Exception as e:
+                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")
+
+        return result
+
+def create_default_evaluator(crew, llm=None):
+    """Build an ``AgentEvaluator`` for ``crew`` with the goal, semantic-quality, tool and reasoning evaluators.
+
+    Args:
+        crew: Crew whose agents will be evaluated.
+        llm: Passed as ``llm`` to every evaluator.
+    """
+    from crewai.evaluation import (
+        GoalAlignmentEvaluator,
+        SemanticQualityEvaluator,
+        ToolSelectionEvaluator,
+        ParameterExtractionEvaluator,
+        ToolInvocationEvaluator,
+        ReasoningEfficiencyEvaluator
+    )
+
+    evaluators = [
+        GoalAlignmentEvaluator(llm=llm),
+        SemanticQualityEvaluator(llm=llm),
+        ToolSelectionEvaluator(llm=llm),
+        ParameterExtractionEvaluator(llm=llm),
+        ToolInvocationEvaluator(llm=llm),
+        ReasoningEfficiencyEvaluator(llm=llm),
+    ]
+
+    return AgentEvaluator(evaluators=evaluators, crew=crew)
--- a/src/crewai/experimental/evaluation/base_evaluator.py
+++ b/src/crewai/experimental/evaluation/base_evaluator.py
@@ -57,9 +57,9 @@ class BaseEvaluator(abc.ABC):
    def evaluate(
        self,
        agent: Agent,
+        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
-        task: Task | None = None,
    ) -> EvaluationScore:
        pass

--- a/src/crewai/experimental/evaluation/evaluation_display.py
+++ b/src/crewai/experimental/evaluation/evaluation_display.py
@@ -3,8 +3,8 @@ from typing import Dict, Any, List
 from rich.table import Table
 from rich.box import HEAVY_EDGE, ROUNDED
 from collections.abc import Sequence
-from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
-from crewai.experimental.evaluation import EvaluationScore
+from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
+from crewai.evaluation import EvaluationScore
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm

@@ -17,6 +17,7 @@ class EvaluationDisplayFormatter:
            self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
            return

+        # Get all agent roles across all iterations
        all_agent_roles: set[str] = set()
        for iter_results in iterations_results.values():
            all_agent_roles.update(iter_results.keys())
@@ -24,6 +25,7 @@ class EvaluationDisplayFormatter:
        for agent_role in sorted(all_agent_roles):
            self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]")

+            # Process each iteration
            for iter_num, results in sorted(iterations_results.items()):
                if agent_role not in results or not results[agent_role]:
                    continue
@@ -31,19 +33,23 @@ class EvaluationDisplayFormatter:
                agent_results = results[agent_role]
                agent_id = agent_results[0].agent_id

+                # Aggregate results for this agent in this iteration
                aggregated_result = self._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=agent_results,
                )

+                # Display iteration header
                self.console_formatter.print(f"\n[bold]Iteration {iter_num}[/bold]")

+                # Create table for this iteration
                table = Table(box=ROUNDED)
                table.add_column("Metric", style="cyan")
                table.add_column("Score (1-10)", justify="center")
                table.add_column("Feedback", style="green")

+                # Add metrics to table
                if aggregated_result.metrics:
                    for metric, evaluation_score in aggregated_result.metrics.items():
                        score = evaluation_score.score
@@ -85,6 +91,7 @@ class EvaluationDisplayFormatter:
                        "Overall agent evaluation score"
                    )

+                # Print the table for this iteration
                self.console_formatter.print(table)

    def display_summary_results(self, iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]]):
@@ -241,6 +248,7 @@ class EvaluationDisplayFormatter:
            feedback_summary = None
            if feedbacks:
                if len(feedbacks) > 1:
+                    # Use the summarization method for multiple feedbacks
                    feedback_summary = self._summarize_feedbacks(
                        agent_role=agent_role,
                        metric=category.title(),
@@ -299,7 +307,7 @@ class EvaluationDisplayFormatter:
                strategy_guidance = "Focus on the highest-scoring aspects and strengths demonstrated."
            elif strategy == AggregationStrategy.WORST_PERFORMANCE:
                strategy_guidance = "Focus on areas that need improvement and common issues across tasks."
-            else:
+            else:  # Default/average strategies
                strategy_guidance = "Provide a balanced analysis of strengths and weaknesses across all tasks."

            prompt = [
--- a/src/crewai/experimental/evaluation/evaluation_listener.py
+++ b/src/crewai/experimental/evaluation/evaluation_listener.py
@@ -9,9 +9,7 @@ from crewai.utilities.events.base_event_listener import BaseEventListener
 from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus
 from crewai.utilities.events.agent_events import (
    AgentExecutionStartedEvent,
-    AgentExecutionCompletedEvent,
-    LiteAgentExecutionStartedEvent,
-    LiteAgentExecutionCompletedEvent
+    AgentExecutionCompletedEvent
 )
 from crewai.utilities.events.tool_usage_events import (
    ToolUsageFinishedEvent,
@@ -54,18 +52,10 @@ class EvaluationTraceCallback(BaseEventListener):
        def on_agent_started(source, event: AgentExecutionStartedEvent):
            self.on_agent_start(event.agent, event.task)

-        @event_bus.on(LiteAgentExecutionStartedEvent)
-        def on_lite_agent_started(source, event: LiteAgentExecutionStartedEvent):
-            self.on_lite_agent_start(event.agent_info)
-
        @event_bus.on(AgentExecutionCompletedEvent)
        def on_agent_completed(source, event: AgentExecutionCompletedEvent):
            self.on_agent_finish(event.agent, event.task, event.output)

-        @event_bus.on(LiteAgentExecutionCompletedEvent)
-        def on_lite_agent_completed(source, event: LiteAgentExecutionCompletedEvent):
-            self.on_lite_agent_finish(event.output)
-
        @event_bus.on(ToolUsageFinishedEvent)
        def on_tool_completed(source, event: ToolUsageFinishedEvent):
            self.on_tool_use(event.tool_name, event.tool_args, event.output, success=True)
@@ -98,38 +88,19 @@ class EvaluationTraceCallback(BaseEventListener):
        def on_llm_call_completed(source, event: LLMCallCompletedEvent):
            self.on_llm_call_end(event.messages, event.response)

-    def on_lite_agent_start(self, agent_info: dict[str, Any]):
-        self.current_agent_id = agent_info['id']
-        self.current_task_id = "lite_task"
-
-        trace_key = f"{self.current_agent_id}_{self.current_task_id}"
-        self._init_trace(
-            trace_key=trace_key,
-            agent_id=self.current_agent_id,
-            task_id=self.current_task_id,
-            tool_uses=[],
-            llm_calls=[],
-            start_time=datetime.now(),
-            final_output=None
-        )
-
-    def _init_trace(self, trace_key: str, **kwargs: Any):
-        self.traces[trace_key] = kwargs
-
    def on_agent_start(self, agent: Agent, task: Task):
        self.current_agent_id = agent.id
        self.current_task_id = task.id

        trace_key = f"{agent.id}_{task.id}"
-        self._init_trace(
-            trace_key=trace_key,
-            agent_id=agent.id,
-            task_id=task.id,
-            tool_uses=[],
-            llm_calls=[],
-            start_time=datetime.now(),
-            final_output=None
-        )
+        self.traces[trace_key] = {
+            "agent_id": agent.id,
+            "task_id": task.id,
+            "tool_uses": [],
+            "llm_calls": [],
+            "start_time": datetime.now(),
+            "final_output": None
+        }

    def on_agent_finish(self, agent: Agent, task: Task, output: Any):
        trace_key = f"{agent.id}_{task.id}"
@@ -137,20 +108,9 @@ class EvaluationTraceCallback(BaseEventListener):
            self.traces[trace_key]["final_output"] = output
            self.traces[trace_key]["end_time"] = datetime.now()

-        self._reset_current()
-
-    def _reset_current(self):
        self.current_agent_id = None
        self.current_task_id = None

-    def on_lite_agent_finish(self, output: Any):
-        trace_key = f"{self.current_agent_id}_lite_task"
-        if trace_key in self.traces:
-            self.traces[trace_key]["final_output"] = output
-            self.traces[trace_key]["end_time"] = datetime.now()
-
-        self._reset_current()
-
    def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any,
                   success: bool = True, error_type: str | None = None):
        if not self.current_agent_id or not self.current_task_id:
@@ -227,8 +187,4 @@ class EvaluationTraceCallback(BaseEventListener):


 def create_evaluation_callbacks() -> EvaluationTraceCallback:
-    from crewai.utilities.events.crewai_event_bus import crewai_event_bus
-
-    callback = EvaluationTraceCallback()
-    callback.setup_listeners(crewai_event_bus)
-    return callback
+    return EvaluationTraceCallback()
--- a/src/crewai/experimental/evaluation/json_parser.py
+++ b/src/crewai/experimental/evaluation/json_parser.py
--- a/src/crewai/evaluation/metrics/__init__.py
+++ b/src/crewai/evaluation/metrics/__init__.py
--- a/src/crewai/experimental/evaluation/metrics/goal_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/goal_metrics.py
@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
+from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.evaluation.json_parser import extract_json_from_llm_response

 class GoalAlignmentEvaluator(BaseEvaluator):
    @property
@@ -14,14 +14,10 @@ class GoalAlignmentEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
+        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
-        task: Task | None = None,
    ) -> EvaluationScore:
-        task_context = ""
-        if task is not None:
-            task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
-
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.

@@ -41,7 +37,8 @@ Return your evaluation as JSON with fields 'score' (number) and 'feedback' (stri
            {"role": "user", "content": f"""
 Agent role: {agent.role}
 Agent goal: {agent.goal}
-{task_context}
+Task description: {task.description}
+Expected output: {task.expected_output}

 Agent's final output:
 {final_output}
--- a/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py
@@ -16,8 +16,8 @@ from collections.abc import Sequence
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
+from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.evaluation.json_parser import extract_json_from_llm_response
 from crewai.tasks.task_output import TaskOutput

 class ReasoningPatternType(Enum):
@@ -36,14 +36,10 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
+        task: Task,
        execution_trace: Dict[str, Any],
-        final_output: TaskOutput | str,
-        task: Task | None = None,
+        final_output: TaskOutput,
    ) -> EvaluationScore:
-        task_context = ""
-        if task is not None:
-            task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
-
        llm_calls = execution_trace.get("llm_calls", [])

        if not llm_calls or len(llm_calls) < 2:
@@ -87,8 +83,6 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):

        call_samples = self._get_call_samples(llm_calls)

-        final_output = final_output.raw if isinstance(final_output, TaskOutput) else final_output
-
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process.

@@ -123,7 +117,7 @@ Return your evaluation as JSON with the following structure:
 }"""},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-{task_context}
+Task description: {task.description}

 Reasoning efficiency metrics:
 - Total LLM calls: {efficiency_metrics["total_llm_calls"]}
@@ -136,7 +130,7 @@ Sample of agent reasoning flow (chronological sequence):
 {call_samples}

 Agent's final output:
-{final_output[:500]}... (truncated)
+{final_output.raw[:500]}... (truncated)

 Evaluate the reasoning efficiency of this agent based on these interaction patterns.
 Identify any inefficient reasoning patterns and provide specific suggestions for optimization.
--- a/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py
@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
+from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.evaluation.json_parser import extract_json_from_llm_response

 class SemanticQualityEvaluator(BaseEvaluator):
    @property
@@ -14,13 +14,10 @@ class SemanticQualityEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
+        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
-        task: Task | None = None,
    ) -> EvaluationScore:
-        task_context = ""
-        if task is not None:
-            task_context = f"Task description: {task.description}"
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.

@@ -40,7 +37,7 @@ Return your evaluation as JSON with fields 'score' (number) and 'feedback' (stri
 """},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-{task_context}
+Task description: {task.description}

 Agent's final output:
 {final_output}
--- a/src/crewai/experimental/evaluation/metrics/tools_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/tools_metrics.py
@@ -1,8 +1,8 @@
 import json
 from typing import Dict, Any

-from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
+from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.evaluation.json_parser import extract_json_from_llm_response
 from crewai.agent import Agent
 from crewai.task import Task

@@ -16,14 +16,10 @@ class ToolSelectionEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
+        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
-        task: Task | None = None,
    ) -> EvaluationScore:
-        task_context = ""
-        if task is not None:
-            task_context = f"Task description: {task.description}"
-
        tool_uses = execution_trace.get("tool_uses", [])
        tool_count = len(tool_uses)
        unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])
@@ -76,7 +72,7 @@ Return your evaluation as JSON with these fields:
 """},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-{task_context}
+Task description: {task.description}

 Available tools for this agent:
 {available_tools_info}
@@ -132,13 +128,10 @@ class ParameterExtractionEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
+        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
-        task: Task | None = None,
    ) -> EvaluationScore:
-        task_context = ""
-        if task is not None:
-            task_context = f"Task description: {task.description}"
        tool_uses = execution_trace.get("tool_uses", [])
        tool_count = len(tool_uses)

@@ -219,7 +212,7 @@ Return your evaluation as JSON with these fields:
 """},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-{task_context}
+Task description: {task.description}

 Parameter extraction examples:
 {param_samples_text}
@@ -274,13 +267,10 @@ class ToolInvocationEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
+        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
-        task: Task | None = None,
    ) -> EvaluationScore:
-        task_context = ""
-        if task is not None:
-            task_context = f"Task description: {task.description}"
        tool_uses = execution_trace.get("tool_uses", [])
        tool_errors = []
        tool_count = len(tool_uses)
@@ -362,7 +352,7 @@ Return your evaluation as JSON with these fields:
 """},
            {"role": "user", "content": f"""
 Agent role: {agent.role}
-{task_context}
+Task description: {task.description}

 Tool invocation examples:
 {invocation_samples_text}
--- a/src/crewai/experimental/__init__.py
+++ b/src/crewai/experimental/__init__.py
@@ -1,40 +0,0 @@
-from crewai.experimental.evaluation import (
-    BaseEvaluator,
-    EvaluationScore,
-    MetricCategory,
-    AgentEvaluationResult,
-    SemanticQualityEvaluator,
-    GoalAlignmentEvaluator,
-    ReasoningEfficiencyEvaluator,
-    ToolSelectionEvaluator,
-    ParameterExtractionEvaluator,
-    ToolInvocationEvaluator,
-    EvaluationTraceCallback,
-    create_evaluation_callbacks,
-    AgentEvaluator,
-    create_default_evaluator,
-    ExperimentRunner,
-    ExperimentResults,
-    ExperimentResult,
-)
-
-
-__all__ = [
-    "BaseEvaluator",
-    "EvaluationScore",
-    "MetricCategory",
-    "AgentEvaluationResult",
-    "SemanticQualityEvaluator",
-    "GoalAlignmentEvaluator",
-    "ReasoningEfficiencyEvaluator",
-    "ToolSelectionEvaluator",
-    "ParameterExtractionEvaluator",
-    "ToolInvocationEvaluator",
-    "EvaluationTraceCallback",
-    "create_evaluation_callbacks",
-    "AgentEvaluator",
-    "create_default_evaluator",
-    "ExperimentRunner",
-    "ExperimentResults",
-    "ExperimentResult"
-]
--- a/src/crewai/experimental/evaluation/agent_evaluator.py
+++ b/src/crewai/experimental/evaluation/agent_evaluator.py
@@ -1,245 +0,0 @@
-import threading
-from typing import Any
-
-from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
-from crewai.agent import Agent
-from crewai.task import Task
-from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
-from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
-from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
-from collections.abc import Sequence
-from crewai.utilities.events.crewai_event_bus import crewai_event_bus
-from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
-from crewai.utilities.events.task_events import TaskCompletedEvent
-from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
-from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, EvaluationScore, MetricCategory
-
-class ExecutionState:
-    def __init__(self):
-        self.traces = {}
-        self.current_agent_id: str | None = None
-        self.current_task_id: str | None = None
-        self.iteration = 1
-        self.iterations_results = {}
-        self.agent_evaluators = {}
-
-class AgentEvaluator:
-    def __init__(
-        self,
-        agents: list[Agent],
-        evaluators: Sequence[BaseEvaluator] | None = None,
-    ):
-        self.agents: list[Agent] = agents
-        self.evaluators: Sequence[BaseEvaluator] | None = evaluators
-
-        self.callback = create_evaluation_callbacks()
-        self.console_formatter = ConsoleFormatter()
-        self.display_formatter = EvaluationDisplayFormatter()
-
-        self._thread_local: threading.local = threading.local()
-
-        for agent in self.agents:
-            self._execution_state.agent_evaluators[str(agent.id)] = self.evaluators
-
-        self._subscribe_to_events()
-
-    @property
-    def _execution_state(self) -> ExecutionState:
-        if not hasattr(self._thread_local, 'execution_state'):
-            self._thread_local.execution_state = ExecutionState()
-        return self._thread_local.execution_state
-
-    def _subscribe_to_events(self) -> None:
-        from typing import cast
-        crewai_event_bus.register_handler(TaskCompletedEvent, cast(Any, self._handle_task_completed))
-        crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, cast(Any, self._handle_lite_agent_completed))
-
-    def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
-        assert event.task is not None
-        agent = event.task.agent
-        if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
-            self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=str(event.task.id))
-
-            state = ExecutionState()
-            state.current_agent_id = str(agent.id)
-            state.current_task_id = str(event.task.id)
-
-            assert state.current_agent_id is not None and state.current_task_id is not None
-            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
-
-            if not trace:
-                return
-
-            result = self.evaluate(
-                agent=agent,
-                task=event.task,
-                execution_trace=trace,
-                final_output=event.output,
-                state=state
-            )
-
-            current_iteration = self._execution_state.iteration
-            if current_iteration not in self._execution_state.iterations_results:
-                self._execution_state.iterations_results[current_iteration] = {}
-
-            if agent.role not in self._execution_state.iterations_results[current_iteration]:
-                self._execution_state.iterations_results[current_iteration][agent.role] = []
-
-            self._execution_state.iterations_results[current_iteration][agent.role].append(result)
-
-    def _handle_lite_agent_completed(self, source: object, event: LiteAgentExecutionCompletedEvent) -> None:
-        agent_info = event.agent_info
-        agent_id = str(agent_info["id"])
-
-        if agent_id in self._execution_state.agent_evaluators:
-            state = ExecutionState()
-            state.current_agent_id = agent_id
-            state.current_task_id = "lite_task"
-
-            target_agent = None
-            for agent in self.agents:
-                if str(agent.id) == agent_id:
-                    target_agent = agent
-                    break
-
-            if not target_agent:
-                return
-
-            assert state.current_agent_id is not None and state.current_task_id is not None
-            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
-
-            if not trace:
-                return
-
-            result = self.evaluate(
-                agent=target_agent,
-                execution_trace=trace,
-                final_output=event.output,
-                state=state
-            )
-
-            current_iteration = self._execution_state.iteration
-            if current_iteration not in self._execution_state.iterations_results:
-                self._execution_state.iterations_results[current_iteration] = {}
-
-            agent_role = target_agent.role
-            if agent_role not in self._execution_state.iterations_results[current_iteration]:
-                self._execution_state.iterations_results[current_iteration][agent_role] = []
-
-            self._execution_state.iterations_results[current_iteration][agent_role].append(result)
-
-    def set_iteration(self, iteration: int) -> None:
-        self._execution_state.iteration = iteration
-
-    def reset_iterations_results(self) -> None:
-        self._execution_state.iterations_results = {}
-
-    def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
-        if self._execution_state.iterations_results and self._execution_state.iteration in self._execution_state.iterations_results:
-            return self._execution_state.iterations_results[self._execution_state.iteration]
-        return {}
-
-    def display_results_with_iterations(self) -> None:
-        self.display_formatter.display_summary_results(self._execution_state.iterations_results)
-
-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = True) -> dict[str, AgentAggregatedEvaluationResult]:
-        agent_results = {}
-        with crewai_event_bus.scoped_handlers():
-            task_results = self.get_evaluation_results()
-            for agent_role, results in task_results.items():
-                if not results:
-                    continue
-
-                agent_id = results[0].agent_id
-
-                aggregated_result = self.display_formatter._aggregate_agent_results(
-                    agent_id=agent_id,
-                    agent_role=agent_role,
-                    results=results,
-                    strategy=strategy
-                )
-
-                agent_results[agent_role] = aggregated_result
-
-
-            if self._execution_state.iterations_results and self._execution_state.iteration == max(self._execution_state.iterations_results.keys(), default=0):
-                self.display_results_with_iterations()
-
-            if include_evaluation_feedback:
-                self.display_evaluation_with_feedback()
-
-        return agent_results
-
-    def display_evaluation_with_feedback(self) -> None:
-        self.display_formatter.display_evaluation_with_feedback(self._execution_state.iterations_results)
-
-    def evaluate(
-        self,
-        agent: Agent,
-        execution_trace: dict[str, Any],
-        final_output: Any,
-        state: ExecutionState,
-        task: Task | None = None,
-    ) -> AgentEvaluationResult:
-        result = AgentEvaluationResult(
-            agent_id=state.current_agent_id or str(agent.id),
-            task_id=state.current_task_id or (str(task.id) if task else "unknown_task")
-        )
-
-        assert self.evaluators is not None
-        task_id = str(task.id) if task else None
-        for evaluator in self.evaluators:
-            try:
-                self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id)
-                score = evaluator.evaluate(
-                    agent=agent,
-                    task=task,
-                    execution_trace=execution_trace,
-                    final_output=final_output
-                )
-                result.metrics[evaluator.metric_category] = score
-                self.emit_evaluation_completed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, metric_category=evaluator.metric_category, score=score)
-            except Exception as e:
-                self.emit_evaluation_failed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, error=str(e))
-                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")
-
-        return result
-
-    def emit_evaluation_started_event(self, agent_role: str, agent_id: str, task_id: str | None = None):
-        crewai_event_bus.emit(
-            self,
-            AgentEvaluationStartedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration)
-        )
-
-    def emit_evaluation_completed_event(self, agent_role: str, agent_id: str, task_id: str | None = None, metric_category: MetricCategory | None = None, score: EvaluationScore | None = None):
-        crewai_event_bus.emit(
-            self,
-            AgentEvaluationCompletedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, metric_category=metric_category, score=score)
-        )
-
-    def emit_evaluation_failed_event(self, agent_role: str, agent_id: str, error: str, task_id: str | None = None):
-        crewai_event_bus.emit(
-            self,
-            AgentEvaluationFailedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, error=error)
-        )
-
-def create_default_evaluator(agents: list[Agent], llm: None = None):
-    from crewai.experimental.evaluation import (
-        GoalAlignmentEvaluator,
-        SemanticQualityEvaluator,
-        ToolSelectionEvaluator,
-        ParameterExtractionEvaluator,
-        ToolInvocationEvaluator,
-        ReasoningEfficiencyEvaluator
-    )
-
-    evaluators = [
-        GoalAlignmentEvaluator(llm=llm),
-        SemanticQualityEvaluator(llm=llm),
-        ToolSelectionEvaluator(llm=llm),
-        ParameterExtractionEvaluator(llm=llm),
-        ToolInvocationEvaluator(llm=llm),
-        ReasoningEfficiencyEvaluator(llm=llm),
-    ]
-
-    return AgentEvaluator(evaluators=evaluators, agents=agents)
--- a/src/crewai/experimental/evaluation/experiment/init.py
+++ b/src/crewai/experimental/evaluation/experiment/init.py
@@ -1,8 +0,0 @@
-from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
-from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
-
-__all__ = [
-    "ExperimentRunner",
-    "ExperimentResults",
-    "ExperimentResult"
-]
--- a/src/crewai/experimental/evaluation/experiment/result.py
+++ b/src/crewai/experimental/evaluation/experiment/result.py
@@ -1,122 +0,0 @@
-import json
-import os
-from datetime import datetime, timezone
-from typing import Any
-from pydantic import BaseModel
-
-class ExperimentResult(BaseModel):
-    identifier: str
-    inputs: dict[str, Any]
-    score: int | dict[str, int | float]
-    expected_score: int | dict[str, int | float]
-    passed: bool
-    agent_evaluations: dict[str, Any] | None = None
-
-class ExperimentResults:
-    def __init__(self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None):
-        self.results = results
-        self.metadata = metadata or {}
-        self.timestamp = datetime.now(timezone.utc)
-
-        from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
-        self.display = ExperimentResultsDisplay()
-
-    def to_json(self, filepath: str | None = None) -> dict[str, Any]:
-        data = {
-            "timestamp": self.timestamp.isoformat(),
-            "metadata": self.metadata,
-            "results": [r.model_dump(exclude={"agent_evaluations"}) for r in self.results]
-        }
-
-        if filepath:
-            with open(filepath, 'w') as f:
-                json.dump(data, f, indent=2)
-            self.display.console.print(f"[green]Results saved to {filepath}[/green]")
-
-        return data
-
-    def compare_with_baseline(self, baseline_filepath: str, save_current: bool = True, print_summary: bool = False) -> dict[str, Any]:
-        baseline_runs = []
-
-        if os.path.exists(baseline_filepath) and os.path.getsize(baseline_filepath) > 0:
-            try:
-                with open(baseline_filepath, 'r') as f:
-                    baseline_data = json.load(f)
-
-                if isinstance(baseline_data, dict) and "timestamp" in baseline_data:
-                    baseline_runs = [baseline_data]
-                elif isinstance(baseline_data, list):
-                    baseline_runs = baseline_data
-            except (json.JSONDecodeError, FileNotFoundError) as e:
-                self.display.console.print(f"[yellow]Warning: Could not load baseline file: {str(e)}[/yellow]")
-
-        if not baseline_runs:
-            if save_current:
-                current_data = self.to_json()
-                with open(baseline_filepath, 'w') as f:
-                    json.dump([current_data], f, indent=2)
-                self.display.console.print(f"[green]Saved current results as new baseline to {baseline_filepath}[/green]")
-            return {"is_baseline": True, "changes": {}}
-
-        baseline_runs.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
-        latest_run = baseline_runs[0]
-
-        comparison = self._compare_with_run(latest_run)
-
-        if print_summary:
-            self.display.comparison_summary(comparison, latest_run["timestamp"])
-
-        if save_current:
-            current_data = self.to_json()
-            baseline_runs.append(current_data)
-            with open(baseline_filepath, 'w') as f:
-                json.dump(baseline_runs, f, indent=2)
-            self.display.console.print(f"[green]Added current results to baseline file {baseline_filepath}[/green]")
-
-        return comparison
-
-    def _compare_with_run(self, baseline_run: dict[str, Any]) -> dict[str, Any]:
-        baseline_results = baseline_run.get("results", [])
-
-        baseline_lookup = {}
-        for result in baseline_results:
-            test_identifier = result.get("identifier")
-            if test_identifier:
-                baseline_lookup[test_identifier] = result
-
-        improved = []
-        regressed = []
-        unchanged = []
-        new_tests = []
-
-        for result in self.results:
-            test_identifier = result.identifier
-            if not test_identifier or test_identifier not in baseline_lookup:
-                new_tests.append(test_identifier)
-                continue
-
-            baseline_result = baseline_lookup[test_identifier]
-            baseline_passed = baseline_result.get("passed", False)
-            if result.passed and not baseline_passed:
-                improved.append(test_identifier)
-            elif not result.passed and baseline_passed:
-                regressed.append(test_identifier)
-            else:
-                unchanged.append(test_identifier)
-
-        missing_tests = []
-        current_test_identifiers = {result.identifier for result in self.results}
-        for result in baseline_results:
-            test_identifier = result.get("identifier")
-            if test_identifier and test_identifier not in current_test_identifiers:
-                missing_tests.append(test_identifier)
-
-        return {
-            "improved": improved,
-            "regressed": regressed,
-            "unchanged": unchanged,
-            "new_tests": new_tests,
-            "missing_tests": missing_tests,
-            "total_compared": len(improved) + len(regressed) + len(unchanged),
-            "baseline_timestamp": baseline_run.get("timestamp", "unknown")
-        }
--- a/src/crewai/experimental/evaluation/experiment/result_display.py
+++ b/src/crewai/experimental/evaluation/experiment/result_display.py
@@ -1,70 +0,0 @@
-from typing import Dict, Any
-from rich.console import Console
-from rich.table import Table
-from rich.panel import Panel
-from crewai.experimental.evaluation.experiment.result import ExperimentResults
-
-class ExperimentResultsDisplay:
-    def __init__(self):
-        self.console = Console()
-
-    def summary(self, experiment_results: ExperimentResults):
-        total = len(experiment_results.results)
-        passed = sum(1 for r in experiment_results.results if r.passed)
-
-        table = Table(title="Experiment Summary")
-        table.add_column("Metric", style="cyan")
-        table.add_column("Value", style="green")
-
-        table.add_row("Total Test Cases", str(total))
-        table.add_row("Passed", str(passed))
-        table.add_row("Failed", str(total - passed))
-        table.add_row("Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A")
-
-        self.console.print(table)
-
-    def comparison_summary(self, comparison: Dict[str, Any], baseline_timestamp: str):
-        self.console.print(Panel(f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]",
-                                 expand=False))
-
-        table = Table(title="Results Comparison")
-        table.add_column("Metric", style="cyan")
-        table.add_column("Count", style="white")
-        table.add_column("Details", style="dim")
-
-        improved = comparison.get("improved", [])
-        if improved:
-            details = ", ".join([f"{test_identifier}" for test_identifier in improved[:3]])
-            if len(improved) > 3:
-                details += f" and {len(improved) - 3} more"
-            table.add_row("✅ Improved", str(len(improved)), details)
-        else:
-            table.add_row("✅ Improved", "0", "")
-
-        regressed = comparison.get("regressed", [])
-        if regressed:
-            details = ", ".join([f"{test_identifier}" for test_identifier in regressed[:3]])
-            if len(regressed) > 3:
-                details += f" and {len(regressed) - 3} more"
-            table.add_row("❌ Regressed", str(len(regressed)), details, style="red")
-        else:
-            table.add_row("❌ Regressed", "0", "")
-
-        unchanged = comparison.get("unchanged", [])
-        table.add_row("⏺ Unchanged", str(len(unchanged)), "")
-
-        new_tests = comparison.get("new_tests", [])
-        if new_tests:
-            details = ", ".join(new_tests[:3])
-            if len(new_tests) > 3:
-                details += f" and {len(new_tests) - 3} more"
-            table.add_row("➕ New Tests", str(len(new_tests)), details)
-
-        missing_tests = comparison.get("missing_tests", [])
-        if missing_tests:
-            details = ", ".join(missing_tests[:3])
-            if len(missing_tests) > 3:
-                details += f" and {len(missing_tests) - 3} more"
-            table.add_row("➖ Missing Tests", str(len(missing_tests)), details)
-
-        self.console.print(table)
--- a/src/crewai/experimental/evaluation/experiment/runner.py
+++ b/src/crewai/experimental/evaluation/experiment/runner.py
@@ -1,125 +0,0 @@
-from collections import defaultdict
-from hashlib import md5
-from typing import Any
-
-from crewai import Crew, Agent
-from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
-from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
-from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
-from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
-
-class ExperimentRunner:
-    def __init__(self, dataset: list[dict[str, Any]]):
-        self.dataset = dataset or []
-        self.evaluator: AgentEvaluator | None = None
-        self.display = ExperimentResultsDisplay()
-
-    def run(self, crew: Crew | None = None, agents: list[Agent] | None = None, print_summary: bool = False) -> ExperimentResults:
-        if crew and not agents:
-            agents = crew.agents
-
-        assert agents is not None
-        self.evaluator = create_default_evaluator(agents=agents)
-
-        results = []
-
-        for test_case in self.dataset:
-            self.evaluator.reset_iterations_results()
-            result = self._run_test_case(test_case=test_case, crew=crew, agents=agents)
-            results.append(result)
-
-        experiment_results = ExperimentResults(results)
-
-        if print_summary:
-            self.display.summary(experiment_results)
-
-        return experiment_results
-
-    def _run_test_case(self, test_case: dict[str, Any], agents: list[Agent], crew: Crew | None = None) -> ExperimentResult:
-        inputs = test_case["inputs"]
-        expected_score = test_case["expected_score"]
-        identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()
-
-        try:
-            self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]")
-            self.display.console.print("\n")
-            if crew:
-                crew.kickoff(inputs=inputs)
-            else:
-                for agent in agents:
-                    agent.kickoff(**inputs)
-
-            assert self.evaluator is not None
-            agent_evaluations = self.evaluator.get_agent_evaluation()
-
-            actual_score = self._extract_scores(agent_evaluations)
-
-            passed = self._assert_scores(expected_score, actual_score)
-            return ExperimentResult(
-                identifier=identifier,
-                inputs=inputs,
-                score=actual_score,
-                expected_score=expected_score,
-                passed=passed,
-                agent_evaluations=agent_evaluations
-            )
-
-        except Exception as e:
-            self.display.console.print(f"[red]Error running test case: {str(e)}[/red]")
-            return ExperimentResult(
-                identifier=identifier,
-                inputs=inputs,
-                score=0,
-                expected_score=expected_score,
-                passed=False
-            )
-
-    def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str,  float]:
-        all_scores: dict[str, list[float]] = defaultdict(list)
-        for evaluation in agent_evaluations.values():
-            for metric_name, score in evaluation.metrics.items():
-                if score.score is not None:
-                    all_scores[metric_name.value].append(score.score)
-
-        avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()}
-
-        if len(avg_scores) == 1:
-            return list(avg_scores.values())[0]
-
-        return avg_scores
-
-    def _assert_scores(self, expected: float | dict[str, float],
-                        actual: float | dict[str, float]) -> bool:
-        """
-        Compare expected and actual scores, and return whether the test case passed.
-
-        The rules for comparison are as follows:
-        - If both expected and actual scores are single numbers, the actual score must be >= expected.
-        - If expected is a single number and actual is a dict, compare against the average of actual values.
-        - If expected is a dict and actual is a single number, actual must be >= all expected values.
-        - If both are dicts, actual must have matching keys with values >= expected values.
-        """
-
-        if isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
-            return actual >= expected
-
-        if isinstance(expected, dict) and isinstance(actual, (int, float)):
-            return all(actual >= exp_score for exp_score in expected.values())
-
-        if isinstance(expected, (int, float)) and isinstance(actual, dict):
-            if not actual:
-                return False
-            avg_score = sum(actual.values()) / len(actual)
-            return avg_score >= expected
-
-        if isinstance(expected, dict) and isinstance(actual, dict):
-            if not expected:
-                return True
-            matching_keys = set(expected.keys()) & set(actual.keys())
-            if not matching_keys:
-                return False
-
-            # All matching keys must have actual >= expected
-            return all(actual[key] >= expected[key] for key in matching_keys)
-
-        return False
--- a/src/crewai/experimental/evaluation/metrics/init.py
+++ b/src/crewai/experimental/evaluation/metrics/init.py
@@ -1,26 +0,0 @@
-from crewai.experimental.evaluation.metrics.reasoning_metrics import (
-    ReasoningEfficiencyEvaluator
-)
-
-from crewai.experimental.evaluation.metrics.tools_metrics import (
-    ToolSelectionEvaluator,
-    ParameterExtractionEvaluator,
-    ToolInvocationEvaluator
-)
-
-from crewai.experimental.evaluation.metrics.goal_metrics import (
-    GoalAlignmentEvaluator
-)
-
-from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
-    SemanticQualityEvaluator
-)
-
-__all__ = [
-    "ReasoningEfficiencyEvaluator",
-    "ToolSelectionEvaluator",
-    "ParameterExtractionEvaluator",
-    "ToolInvocationEvaluator",
-    "GoalAlignmentEvaluator",
-    "SemanticQualityEvaluator"
-]
--- a/src/crewai/experimental/evaluation/testing.py
+++ b/src/crewai/experimental/evaluation/testing.py
@@ -1,52 +0,0 @@
-import inspect
-
-from typing_extensions import Any
-import warnings
-from crewai.experimental.evaluation.experiment import ExperimentResults, ExperimentRunner
-from crewai import Crew, Agent
-
-def assert_experiment_successfully(experiment_results: ExperimentResults, baseline_filepath: str | None = None) -> None:
-    failed_tests = [result for result in experiment_results.results if not result.passed]
-
-    if failed_tests:
-        detailed_failures: list[str] = []
-
-        for result in failed_tests:
-            expected = result.expected_score
-            actual = result.score
-            detailed_failures.append(f"- {result.identifier}: expected {expected}, got {actual}")
-
-        failure_details = "\n".join(detailed_failures)
-        raise AssertionError(f"The following test cases failed:\n{failure_details}")
-
-    baseline_filepath = baseline_filepath or _get_baseline_filepath_fallback()
-    comparison = experiment_results.compare_with_baseline(baseline_filepath=baseline_filepath)
-    assert_experiment_no_regression(comparison)
-
-def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) -> None:
-    regressed = comparison_result.get("regressed", [])
-    if regressed:
-        raise AssertionError(f"Regression detected! The following tests that previously passed now fail: {regressed}")
-
-    missing_tests = comparison_result.get("missing_tests", [])
-    if missing_tests:
-        warnings.warn(
-            f"Warning: {len(missing_tests)} tests from the baseline are missing in the current run: {missing_tests}",
-            UserWarning
-        )
-
-def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agents: list[Agent] | None = None, verbose: bool = False) -> ExperimentResults:
-    runner = ExperimentRunner(dataset=dataset)
-
-    return runner.run(agents=agents, crew=crew, print_summary=verbose)
-
-def _get_baseline_filepath_fallback() -> str:
-    test_func_name = "experiment_fallback"
-
-    try:
-        current_frame = inspect.currentframe()
-        if current_frame is not None:
-            test_func_name = current_frame.f_back.f_back.f_code.co_name # type: ignore[union-attr]
-    except Exception:
-        ...
-    return f"{test_func_name}_results.json"
--- a/src/crewai/knowledge/storage/knowledge_storage.py
+++ b/src/crewai/knowledge/storage/knowledge_storage.py
@@ -18,7 +18,6 @@ from crewai.utilities.chromadb import sanitize_collection_name
 from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
 from crewai.utilities.logger import Logger
 from crewai.utilities.paths import db_storage_path
-from crewai.utilities.chromadb import create_persistent_client


@contextlib.contextmanager
@@ -85,11 +84,14 @@ class KnowledgeStorage(BaseKnowledgeStorage):
                raise Exception("Collection not initialized")

    def initialize_knowledge_storage(self):
-        self.app = create_persistent_client(
-            path=os.path.join(db_storage_path(), "knowledge"),
+        base_path = os.path.join(db_storage_path(), "knowledge")
+        chroma_client = chromadb.PersistentClient(
+            path=base_path,
            settings=Settings(allow_reset=True),
        )

+        self.app = chroma_client
+
        try:
            collection_name = (
                f"knowledge_{self.collection_name}"
@@ -109,8 +111,9 @@ class KnowledgeStorage(BaseKnowledgeStorage):
    def reset(self):
        base_path = os.path.join(db_storage_path(), KNOWLEDGE_DIRECTORY)
        if not self.app:
-            self.app = create_persistent_client(
-                path=base_path, settings=Settings(allow_reset=True)
+            self.app = chromadb.PersistentClient(
+                path=base_path,
+                settings=Settings(allow_reset=True),
            )

        self.app.reset()
--- a/src/crewai/lite_agent.py
+++ b/src/crewai/lite_agent.py
@@ -28,7 +28,7 @@ from pydantic import (
    InstanceOf,
    PrivateAttr,
    model_validator,
-    field_validator
+    field_validator,
 )

 from crewai.agents.agent_builder.base_agent import BaseAgent
@@ -40,7 +40,7 @@ from crewai.agents.parser import (
    OutputParserException,
 )
 from crewai.flow.flow_trackable import FlowTrackable
-from crewai.llm import LLM, BaseLLM
+from crewai.llm import LLM
 from crewai.tools.base_tool import BaseTool
 from crewai.tools.structured_tool import CrewStructuredTool
 from crewai.utilities import I18N
@@ -135,7 +135,7 @@ class LiteAgent(FlowTrackable, BaseModel):
    role: str = Field(description="Role of the agent")
    goal: str = Field(description="Goal of the agent")
    backstory: str = Field(description="Backstory of the agent")
-    llm: Optional[Union[str, InstanceOf[BaseLLM], Any]] = Field(
+    llm: Optional[Union[str, InstanceOf[LLM], Any]] = Field(
        default=None, description="Language model that will run the agent"
    )
    tools: List[BaseTool] = Field(
@@ -209,8 +209,8 @@ class LiteAgent(FlowTrackable, BaseModel):
    def setup_llm(self):
        """Set up the LLM and other components after initialization."""
        self.llm = create_llm(self.llm)
-        if not isinstance(self.llm, BaseLLM):
-            raise ValueError(f"Expected LLM instance of type BaseLLM, got {type(self.llm).__name__}")
+        if not isinstance(self.llm, LLM):
+            raise ValueError("Unable to create LLM instance")

        # Initialize callbacks
        token_callback = TokenCalcHandler(token_cost_process=self._token_process)
@@ -232,8 +232,7 @@ class LiteAgent(FlowTrackable, BaseModel):
        elif isinstance(self.guardrail, str):
            from crewai.tasks.llm_guardrail import LLMGuardrail

-            if not isinstance(self.llm, BaseLLM):
-                raise TypeError(f"Guardrail requires LLM instance of type BaseLLM, got {type(self.llm).__name__}")
+            assert isinstance(self.llm, LLM)

            self._guardrail = LLMGuardrail(description=self.guardrail, llm=self.llm)

@@ -305,7 +304,6 @@ class LiteAgent(FlowTrackable, BaseModel):
        """
        # Create agent info for event emission
        agent_info = {
-            "id": self.id,
            "role": self.role,
            "goal": self.goal,
            "backstory": self.backstory,
@@ -622,4 +620,4 @@ class LiteAgent(FlowTrackable, BaseModel):

    def _append_message(self, text: str, role: str = "assistant") -> None:
        """Append a message to the message list with the given role."""
-        self._messages.append(format_message_for_llm(text, role=role))
+        self._messages.append(format_message_for_llm(text, role=role))
--- a/src/crewai/memory/storage/rag_storage.py
+++ b/src/crewai/memory/storage/rag_storage.py
@@ -4,12 +4,12 @@ import logging
 import os
 import shutil
 import uuid
-
 from typing import Any, Dict, List, Optional
+
 from chromadb.api import ClientAPI
+
 from crewai.memory.storage.base_rag_storage import BaseRAGStorage
 from crewai.utilities import EmbeddingConfigurator
-from crewai.utilities.chromadb import create_persistent_client
 from crewai.utilities.constants import MAX_FILE_NAME_LENGTH
 from crewai.utilities.paths import db_storage_path

@@ -60,15 +60,17 @@ class RAGStorage(BaseRAGStorage):
        self.embedder_config = configurator.configure_embedder(self.embedder_config)

    def _initialize_app(self):
+        import chromadb
        from chromadb.config import Settings

        self._set_embedder_config()
-
-        self.app = create_persistent_client(
+        chroma_client = chromadb.PersistentClient(
            path=self.path if self.path else self.storage_file_name,
            settings=Settings(allow_reset=self.allow_reset),
        )

+        self.app = chroma_client
+
        self.collection = self.app.get_or_create_collection(
            name=self.type, embedding_function=self.embedder_config
        )
--- a/src/crewai/project/crew_base.py
+++ b/src/crewai/project/crew_base.py
@@ -1,7 +1,7 @@
 import inspect
 import logging
 from pathlib import Path
-from typing import Any, Callable, Dict, TypeVar, cast, List, Union
+from typing import Any, Callable, Dict, TypeVar, cast, List
 from crewai.tools import BaseTool

 import yaml
@@ -28,8 +28,7 @@ def CrewBase(cls: T) -> T:
        )
        original_tasks_config_path = getattr(cls, "tasks_config", "config/tasks.yaml")

-        mcp_server_params: Union[list[str | dict[str, str]], dict[str, str], None] = getattr(cls, "mcp_server_params", None)
-        _mcp_server_adapter: Union[dict[str, Any], Any, None] = None
+        mcp_server_params: Any = getattr(cls, "mcp_server_params", None)

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
@@ -68,57 +67,36 @@ def CrewBase(cls: T) -> T:
                self._original_functions, "is_kickoff"
            )

-            # Add close mcp servers method to after kickoff
-            bound_method = self._create_close_mcp_servers_method()
-            self._after_kickoff['_close_mcp_servers'] = bound_method
+            # Add close mcp server method to after kickoff
+            bound_method = self._create_close_mcp_server_method()
+            self._after_kickoff['_close_mcp_server'] = bound_method

-        def _create_close_mcp_servers_method(self):
-            def _close_mcp_servers(self, instance, outputs):
-                if self._mcp_server_adapter is None:
-                    return outputs
-                for adapter in self._mcp_server_adapter.values():
+        def _create_close_mcp_server_method(self):
+            def _close_mcp_server(self, instance, outputs):
+                adapter = getattr(self, '_mcp_server_adapter', None)
+                if adapter is not None:
                    try:
                        adapter.stop()
                    except Exception as e:
                        logging.warning(f"Error stopping MCP server: {e}")
                return outputs

-            _close_mcp_servers.is_after_kickoff = True
+            _close_mcp_server.is_after_kickoff = True

            import types
-            return types.MethodType(_close_mcp_servers, self)
+            return types.MethodType(_close_mcp_server, self)

-        def get_mcp_tools(self, *tool_names: list[str], server: str | None = None) -> List[BaseTool]:
+        def get_mcp_tools(self, *tool_names: list[str]) -> List[BaseTool]:
            if not self.mcp_server_params:
                return []

            from crewai_tools import MCPServerAdapter

-            if isinstance(self.mcp_server_params, list):
-                if self._mcp_server_adapter is None:
-                    self._mcp_server_adapter = MCPServerAdapter(self.mcp_server_params)
-                if server is not None and len(self.mcp_server_params) > 1:
-                    logging.warning("Using list of MCP server parameters. To use server parameter, please use a dictionary of MCP server parameters.")
-                # Type assertion: when mcp_server_params is a list, _mcp_server_adapter is a single MCPServerAdapter
-                adapter = cast(Any, self._mcp_server_adapter)
-                return adapter.tools.filter_by_names(tool_names or None)
+            adapter = getattr(self, '_mcp_server_adapter', None)
+            if not adapter:
+                self._mcp_server_adapter = MCPServerAdapter(self.mcp_server_params)

-            # Separated MCP adapters for each server.
-            elif isinstance(self.mcp_server_params, dict):
-                if self._mcp_server_adapter is None:
-                    self._mcp_server_adapter = {}
-                aggregated_tools = []
-                for server_name, params in self.mcp_server_params.items():
-                    if server is not None and server_name != server:
-                        continue
-
-                    adapter = self._mcp_server_adapter.get(server_name, None)
-                    if not adapter:
-                        self._mcp_server_adapter[server_name] = MCPServerAdapter(params)
-                    aggregated_tools.extend(
-                        self._mcp_server_adapter[server_name].tools.filter_by_names(tool_names or None))
-
-            return aggregated_tools
+            return self._mcp_server_adapter.tools.filter_by_names(tool_names or None)


        def load_configurations(self):
--- a/src/crewai/task.py
+++ b/src/crewai/task.py
@@ -67,7 +67,6 @@ class Task(BaseModel):
        description: Descriptive text detailing task's purpose and execution.
        expected_output: Clear definition of expected task outcome.
        output_file: File path for storing task output.
-        create_directory: Whether to create the directory for output_file if it doesn't exist.
        output_json: Pydantic model for structuring JSON output.
        output_pydantic: Pydantic model for task output.
        security_config: Security configuration including fingerprinting.
@@ -116,10 +115,6 @@ class Task(BaseModel):
        description="A file path to be used to create a file output.",
        default=None,
    )
-    create_directory: Optional[bool] = Field(
-        description="Whether to create the directory for output_file if it doesn't exist.",
-        default=True,
-    )
    output: Optional[TaskOutput] = Field(
        description="Task output, it's final result after being executed", default=None
    )
@@ -758,10 +753,8 @@ Follow these guidelines:
            resolved_path = Path(self.output_file).expanduser().resolve()
            directory = resolved_path.parent

-            if self.create_directory and not directory.exists():
+            if not directory.exists():
                directory.mkdir(parents=True, exist_ok=True)
-            elif not self.create_directory and not directory.exists():
-                raise RuntimeError(f"Directory {directory} does not exist and create_directory is False")

            with resolved_path.open("w", encoding="utf-8") as file:
                if isinstance(result, dict):
--- a/src/crewai/tasks/llm_guardrail.py
+++ b/src/crewai/tasks/llm_guardrail.py
@@ -1,9 +1,10 @@
-from typing import Any, Tuple
+from typing import Any, Optional, Tuple

 from pydantic import BaseModel, Field

 from crewai.agent import Agent, LiteAgentOutput
-from crewai.llm import BaseLLM
+from crewai.llm import LLM
+from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput


@@ -31,11 +32,11 @@ class LLMGuardrail:
    def __init__(
        self,
        description: str,
-        llm: BaseLLM,
+        llm: LLM,
    ):
        self.description = description

-        self.llm: BaseLLM = llm
+        self.llm: LLM = llm

    def _validate_output(self, task_output: TaskOutput) -> LiteAgentOutput:
        agent = Agent(
--- a/src/crewai/utilities/chromadb.py
+++ b/src/crewai/utilities/chromadb.py
@@ -1,10 +1,6 @@
 import re
-import portalocker
-from chromadb import PersistentClient
-from hashlib import md5
 from typing import Optional

-
 MIN_COLLECTION_LENGTH = 3
 MAX_COLLECTION_LENGTH = 63
 DEFAULT_COLLECTION = "default_collection"
@@ -64,16 +60,3 @@ def sanitize_collection_name(name: Optional[str], max_collection_length: int = M
            sanitized = sanitized[:-1] + "z"

    return sanitized
-
-
-def create_persistent_client(path: str, **kwargs):
-    """
-    Creates a persistent client for ChromaDB with a lock file to prevent
-    concurrent creations. Works for both multi-threads and multi-processes
-    environments.
-    """
-    lockfile = f"chromadb-{md5(path.encode(), usedforsecurity=False).hexdigest()}.lock"
-    with portalocker.Lock(lockfile):
-        client = PersistentClient(path=path, **kwargs)
-
-    return client
--- a/src/crewai/utilities/events/init.py
+++ b/src/crewai/utilities/events/init.py
@@ -17,9 +17,6 @@ from .agent_events import (
    AgentExecutionStartedEvent,
    AgentExecutionCompletedEvent,
    AgentExecutionErrorEvent,
-    AgentEvaluationStartedEvent,
-    AgentEvaluationCompletedEvent,
-    AgentEvaluationFailedEvent,
 )
 from .task_events import (
    TaskStartedEvent,
@@ -77,9 +74,6 @@ __all__ = [
    "AgentExecutionStartedEvent",
    "AgentExecutionCompletedEvent",
    "AgentExecutionErrorEvent",
-    "AgentEvaluationStartedEvent",
-    "AgentEvaluationCompletedEvent",
-    "AgentEvaluationFailedEvent",
    "TaskStartedEvent",
    "TaskCompletedEvent",
    "TaskFailedEvent",
--- a/src/crewai/utilities/events/agent_events.py
+++ b/src/crewai/utilities/events/agent_events.py
@@ -123,28 +123,3 @@ class AgentLogsExecutionEvent(BaseEvent):
    type: str = "agent_logs_execution"

    model_config = {"arbitrary_types_allowed": True}
-
-# Agent Eval events
-class AgentEvaluationStartedEvent(BaseEvent):
-    agent_id: str
-    agent_role: str
-    task_id: str | None = None
-    iteration: int
-    type: str = "agent_evaluation_started"
-
-class AgentEvaluationCompletedEvent(BaseEvent):
-    agent_id: str
-    agent_role: str
-    task_id: str | None = None
-    iteration: int
-    metric_category: Any
-    score: Any
-    type: str = "agent_evaluation_completed"
-
-class AgentEvaluationFailedEvent(BaseEvent):
-    agent_id: str
-    agent_role: str
-    task_id: str | None = None
-    iteration: int
-    error: str
-    type: str = "agent_evaluation_failed"
--- a/src/crewai/utilities/events/event_types.py
+++ b/src/crewai/utilities/events/event_types.py
@@ -4,7 +4,6 @@ from .agent_events import (
    AgentExecutionCompletedEvent,
    AgentExecutionErrorEvent,
    AgentExecutionStartedEvent,
-    LiteAgentExecutionCompletedEvent,
 )
 from .crew_events import (
    CrewKickoffCompletedEvent,
@@ -81,7 +80,6 @@ EventTypes = Union[
    CrewTrainFailedEvent,
    AgentExecutionStartedEvent,
    AgentExecutionCompletedEvent,
-    LiteAgentExecutionCompletedEvent,
    TaskStartedEvent,
    TaskCompletedEvent,
    TaskFailedEvent,
--- a/tests/agent_test.py
+++ b/tests/agent_test.py
@@ -1896,80 +1896,6 @@ def test_agent_with_knowledge_sources_generate_search_query():
        assert "red" in result.raw.lower()


-@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
-def test_agent_with_knowledge_with_no_crewai_knowledge():
-    mock_knowledge = MagicMock(spec=Knowledge)
-
-    agent = Agent(
-        role="Information Agent",
-        goal="Provide information based on knowledge sources",
-        backstory="You have access to specific knowledge sources.",
-        llm=LLM(model="openrouter/openai/gpt-4o-mini",api_key=os.getenv('OPENROUTER_API_KEY')),
-        knowledge=mock_knowledge
-    )
-
-    # Create a task that requires the agent to use the knowledge
-    task = Task(
-        description="What is Vidit's favorite color?",
-        expected_output="Vidit's favorclearite color.",
-        agent=agent,
-    )
-
-    crew = Crew(agents=[agent], tasks=[task])
-    crew.kickoff()
-    mock_knowledge.query.assert_called_once()
-
-
-@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
-def test_agent_with_only_crewai_knowledge():
-    mock_knowledge = MagicMock(spec=Knowledge)
-
-    agent = Agent(
-        role="Information Agent",
-        goal="Provide information based on knowledge sources",
-        backstory="You have access to specific knowledge sources.",
-        llm=LLM(model="openrouter/openai/gpt-4o-mini",api_key=os.getenv('OPENROUTER_API_KEY'))
-    )
-
-    # Create a task that requires the agent to use the knowledge
-    task = Task(
-        description="What is Vidit's favorite color?",
-        expected_output="Vidit's favorclearite color.",
-        agent=agent
-    )
-
-    crew = Crew(agents=[agent], tasks=[task],knowledge=mock_knowledge)
-    crew.kickoff()
-    mock_knowledge.query.assert_called_once()
-
-
-@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
-def test_agent_knowledege_with_crewai_knowledge():
-    crew_knowledge = MagicMock(spec=Knowledge)
-    agent_knowledge = MagicMock(spec=Knowledge)
-
-
-    agent = Agent(
-        role="Information Agent",
-        goal="Provide information based on knowledge sources",
-        backstory="You have access to specific knowledge sources.",
-        llm=LLM(model="openrouter/openai/gpt-4o-mini",api_key=os.getenv('OPENROUTER_API_KEY')),
-        knowledge=agent_knowledge
-    )
-
-    # Create a task that requires the agent to use the knowledge
-    task = Task(
-        description="What is Vidit's favorite color?",
-        expected_output="Vidit's favorclearite color.",
-        agent=agent,
-    )
-
-    crew = Crew(agents=[agent],tasks=[task],knowledge=crew_knowledge)
-    crew.kickoff()
-    agent_knowledge.query.assert_called_once()
-    crew_knowledge.query.assert_called_once()
-
-
@pytest.mark.vcr(filter_headers=["authorization"])
 def test_litellm_auth_error_handling():
    """Test that LiteLLM authentication errors are handled correctly and not retried."""
--- a/tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
@@ -1,237 +0,0 @@
-interactions:
- request:
-    body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
-      created for testing purposes\nYour personal goal is: Complete test tasks successfully\n\nTo
-      give my best complete final answer to the task respond using the exact following
-      format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
-      answer must be the great and the most complete as possible, it must be outcome
-      described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
-      "content": "Complete this task successfully"}], "model": "gpt-4o-mini", "stop":
-      ["\nObservation:"]}'
-    headers:
-      accept:
-      - application/json
-      accept-encoding:
-      - gzip, deflate, zstd
-      connection:
-      - keep-alive
-      content-length:
-      - '583'
-      content-type:
-      - application/json
-      host:
-      - api.openai.com
-      user-agent:
-      - OpenAI/Python 1.93.0
-      x-stainless-arch:
-      - arm64
-      x-stainless-async:
-      - 'false'
-      x-stainless-lang:
-      - python
-      x-stainless-os:
-      - MacOS
-      x-stainless-package-version:
-      - 1.93.0
-      x-stainless-raw-response:
-      - 'true'
-      x-stainless-read-timeout:
-      - '600.0'
-      x-stainless-retry-count:
-      - '0'
-      x-stainless-runtime:
-      - CPython
-      x-stainless-runtime-version:
-      - 3.11.12
-    method: POST
-    uri: https://api.openai.com/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//jFNNb9swDL3nVxA6J0U+HKTNbd0woMAOw7Bu6LbCUCXa1iqLgkgnzYr8
-        98FKWqdbB+wiQHx81OMj9TgCUM6qNSjTaDFt9JNL+TZ7N/dfrusPN01NyV6vPk3f/mrl5vLrXI17
-        Bt39RCNPrDNDbfQojsIBNgm1YF91tlrOl+fzxXKWgZYs+p5WR5kUNGldcJP5dF5MpqvJ7PzIbsgZ
-        ZLWG7yMAgMd89jqDxQe1hun4KdIis65RrZ+TAFQi30eUZnYsOogaD6ChIBiy9M8NdXUja7iCQFsw
-        OkDtNgga6l4/6MBbTAA/wnsXtIc3+b6Gjx41I8REG2cRWoStkwakQeCIxlXOgEXRzjNQgvzigwBV
-        OUU038OOOgiIFhr0MdPHoIOFK9g67wEDdwlBCI7OIjgB7oxB5qrzfpeznxRokIZS3wwk5EiB8ey0
-        54RVx7r3PXTenwA6BBLdzy27fXtE9s/+eqpjojv+g6oqFxw3ZULNFHovWSiqjO5HALd5jt2L0aiY
-        qI1SCt1jfu7i4lBODdszgEVxBIVE+yE+KxbjV8qVR79PFkEZbRq0A3XYGt1ZRyfA6KTpv9W8VvvQ
-        uAv1/5QfAGMwCtoyJrTOvOx4SEvYf65/pT2bnAUrxrRxBktxmPpBWKx05w8rr3jHgm1ZuVBjiskd
-        9r6K5aLQy0LjxcKo0X70GwAA//8DAMz2wVUFBAAA
-    headers:
-      CF-RAY:
-      - 95f93ea9af627e0b-GRU
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Tue, 15 Jul 2025 12:25:54 GMT
-      Server:
-      - cloudflare
-      Set-Cookie:
-      - __cf_bm=GRZmZLrjW5ZRHNmUJa4ccrMcy20D1rmeqK6Ptlv0mRY-1752582354-1.0.1.1-xKd_yga48Eedech5TRlThlEpDgsB2whxkWHlCyAGOVMqMcvH1Ju9FdXYbuQ9NdUQcVxPLgiGM35lYhqSLVQiXDyK01dnyp2Gvm560FBN9DY;
-        path=/; expires=Tue, 15-Jul-25 12:55:54 GMT; domain=.api.openai.com; HttpOnly;
-        Secure; SameSite=None
-      - _cfuvid=MYqswpSR7sqr4kGp6qZVkaL7HDYwMiww49PeN9QBP.A-1752582354973-0.0.1.1-604800000;
-        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
-      Transfer-Encoding:
-      - chunked
-      X-Content-Type-Options:
-      - nosniff
-      access-control-expose-headers:
-      - X-Request-ID
-      alt-svc:
-      - h3=":443"; ma=86400
-      cf-cache-status:
-      - DYNAMIC
-      openai-organization:
-      - crewai-iuxna1
-      openai-processing-ms:
-      - '4047'
-      openai-version:
-      - '2020-10-01'
-      strict-transport-security:
-      - max-age=31536000; includeSubDomains; preload
-      x-envoy-upstream-service-time:
-      - '4440'
-      x-ratelimit-limit-requests:
-      - '30000'
-      x-ratelimit-limit-tokens:
-      - '150000000'
-      x-ratelimit-remaining-requests:
-      - '29999'
-      x-ratelimit-remaining-tokens:
-      - '149999885'
-      x-ratelimit-reset-requests:
-      - 2ms
-      x-ratelimit-reset-tokens:
-      - 0s
-      x-request-id:
-      - req_5704c0f206a927ddc12aa1a19b612a75
-    status:
-      code: 200
-      message: OK
- request:
-    body: '{"messages": [{"role": "system", "content": "You are an expert evaluator
-      assessing how well an AI agent''s output aligns with its assigned task goal.\n\nScore
-      the agent''s goal alignment on a scale from 0-10 where:\n- 0: Complete misalignment,
-      agent did not understand or attempt the task goal\n- 5: Partial alignment, agent
-      attempted the task but missed key requirements\n- 10: Perfect alignment, agent
-      fully satisfied all task requirements\n\nConsider:\n1. Did the agent correctly
-      interpret the task goal?\n2. Did the final output directly address the requirements?\n3.
-      Did the agent focus on relevant aspects of the task?\n4. Did the agent provide
-      all requested information or deliverables?\n\nReturn your evaluation as JSON
-      with fields ''score'' (number) and ''feedback'' (string).\n"}, {"role": "user",
-      "content": "\nAgent role: Test Agent\nAgent goal: Complete test tasks successfully\n\n\nAgent''s
-      final output:\nPlease provide me with the specific details or context of the
-      task you need help with, and I will ensure to complete it successfully and provide
-      a thorough response.\n\nEvaluate how well the agent''s output aligns with the
-      assigned task goal.\n"}], "model": "gpt-4o-mini", "stop": []}'
-    headers:
-      accept:
-      - application/json
-      accept-encoding:
-      - gzip, deflate, zstd
-      connection:
-      - keep-alive
-      content-length:
-      - '1196'
-      content-type:
-      - application/json
-      cookie:
-      - __cf_bm=GRZmZLrjW5ZRHNmUJa4ccrMcy20D1rmeqK6Ptlv0mRY-1752582354-1.0.1.1-xKd_yga48Eedech5TRlThlEpDgsB2whxkWHlCyAGOVMqMcvH1Ju9FdXYbuQ9NdUQcVxPLgiGM35lYhqSLVQiXDyK01dnyp2Gvm560FBN9DY;
-        _cfuvid=MYqswpSR7sqr4kGp6qZVkaL7HDYwMiww49PeN9QBP.A-1752582354973-0.0.1.1-604800000
-      host:
-      - api.openai.com
-      user-agent:
-      - OpenAI/Python 1.93.0
-      x-stainless-arch:
-      - arm64
-      x-stainless-async:
-      - 'false'
-      x-stainless-lang:
-      - python
-      x-stainless-os:
-      - MacOS
-      x-stainless-package-version:
-      - 1.93.0
-      x-stainless-raw-response:
-      - 'true'
-      x-stainless-read-timeout:
-      - '600.0'
-      x-stainless-retry-count:
-      - '0'
-      x-stainless-runtime:
-      - CPython
-      x-stainless-runtime-version:
-      - 3.11.12
-    method: POST
-    uri: https://api.openai.com/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAA4xUy27bQAy8+yuIPdtGbMdN4FvbSxM0QIsEKNA6MJhdSmK82hWWVFwj8L8XKz/k
-        9AH0ogOHnOFjVq8DAMPOLMDYCtXWjR990O+TT7dfZs/v5OtFy/ef7++mxfu7j83t/cONGeaK+PRM
-        Vo9VYxvrxpNyDHvYJkKlzDq5mk/n19PZfN4BdXTkc1nZ6OgyjmoOPJpeTC9HF1ejyfWhuopsScwC
-        fgwAAF67b+4zOPppFnAxPEZqEsGSzOKUBGBS9DliUIRFMagZ9qCNQSl0rb8uA8DSiI2JlmYB0+E+
-        UBC5J7TrHFuah4oASwoKjh2EqOCojkE0oRIgWE+YoA2OUhZzHEqIBWhFoChrKCP6IWwqthWwgEY4
-        bItASbRLEpDWWhIpWu+3Y7gJooRuCKyAsiYHRUxQx0TgSJG9DIGDY4ua5RA82nVW5cDKqPxCWYhC
-        iSXBhrU69TOGbxV7ysxSxY0Awoa951AGkq69/do67QLZk8vBJsUXdgQYtoBWW/SQSJoYpFPq2Ptp
-        MLjTttC51DFXVIPjRFb9drw0y7A7v0uiohXM3git92cAhhAVs7c6RzwekN3JAz6WTYpP8lupKTiw
-        VKtEKDHke4vGxnTobgDw2HmtfWMf06RYN7rSuKZObjo7eM30Fu/R6yOoUdH38dnkCLzhWx1ud+ZW
-        Y9FW5PrS3trYOo5nwOBs6j+7+Rv3fnIO5f/Q94C11Ci5VZPIsX07cZ+WKP8B/pV22nLXsBFKL2xp
-        pUwpX8JRga3fv0sjW1GqVwWHklKTuHuc+ZKD3eAXAAAA//8DADksFsafBAAA
-    headers:
-      CF-RAY:
-      - 95f93ec73a1c7e0b-GRU
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Tue, 15 Jul 2025 12:25:57 GMT
-      Server:
-      - cloudflare
-      Transfer-Encoding:
-      - chunked
-      X-Content-Type-Options:
-      - nosniff
-      access-control-expose-headers:
-      - X-Request-ID
-      alt-svc:
-      - h3=":443"; ma=86400
-      cf-cache-status:
-      - DYNAMIC
-      openai-organization:
-      - crewai-iuxna1
-      openai-processing-ms:
-      - '1544'
-      openai-version:
-      - '2020-10-01'
-      strict-transport-security:
-      - max-age=31536000; includeSubDomains; preload
-      x-envoy-upstream-service-time:
-      - '1546'
-      x-ratelimit-limit-requests:
-      - '30000'
-      x-ratelimit-limit-tokens:
-      - '150000000'
-      x-ratelimit-remaining-requests:
-      - '29999'
-      x-ratelimit-remaining-tokens:
-      - '149999732'
-      x-ratelimit-reset-requests:
-      - 2ms
-      x-ratelimit-reset-tokens:
-      - 0s
-      x-request-id:
-      - req_44930ba12ad8d1e3f0beed1d5e3d8b0c
-    status:
-      code: 200
-      message: OK
-version: 1
--- a/tests/cassettes/TestAgentEvaluator.test_eval_specific_agents_from_crew.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_eval_specific_agents_from_crew.yaml
--- a/tests/cassettes/TestAgentEvaluator.test_evaluate_current_iteration.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_evaluate_current_iteration.yaml
@@ -427,140 +427,4 @@ interactions:
    status:
      code: 200
      message: OK
- request:
-    body: '{"messages": [{"role": "system", "content": "You are an expert evaluator
-      assessing how well an AI agent''s output aligns with its assigned task goal.\n\nScore
-      the agent''s goal alignment on a scale from 0-10 where:\n- 0: Complete misalignment,
-      agent did not understand or attempt the task goal\n- 5: Partial alignment, agent
-      attempted the task but missed key requirements\n- 10: Perfect alignment, agent
-      fully satisfied all task requirements\n\nConsider:\n1. Did the agent correctly
-      interpret the task goal?\n2. Did the final output directly address the requirements?\n3.
-      Did the agent focus on relevant aspects of the task?\n4. Did the agent provide
-      all requested information or deliverables?\n\nReturn your evaluation as JSON
-      with fields ''score'' (number) and ''feedback'' (string).\n"}, {"role": "user",
-      "content": "\nAgent role: Test Agent\nAgent goal: Complete test tasks successfully\nTask
-      description: Test task description\nExpected output: Expected test output\n\nAgent''s
-      final output:\nThe expected test output is a comprehensive document that outlines
-      the specific parameters and criteria that define success for the task at hand.
-      It should include detailed descriptions of the tasks, the goals that need to
-      be achieved, and any specific formatting or structural requirements necessary
-      for the output. Each component of the task must be analyzed and addressed, providing
-      context as well as examples where applicable. Additionally, any tools or methodologies
-      that are relevant to executing the tasks successfully should be outlined, including
-      any potential risks or challenges that may arise during the process. This document
-      serves as a guiding framework to ensure that all aspects of the task are thoroughly
-      considered and executed to meet the high standards expected.\n\nEvaluate how
-      well the agent''s output aligns with the assigned task goal.\n"}], "model":
-      "gpt-4o-mini", "stop": []}'
-    headers:
-      accept:
-      - application/json
-      accept-encoding:
-      - gzip, deflate, zstd
-      connection:
-      - keep-alive
-      content-length:
-      - '1893'
-      content-type:
-      - application/json
-      cookie:
-      - _cfuvid=XwsgBfgvDGlKFQ4LiGYGIARIoSNTiwidqoo9UZcc.XY-1752087999227-0.0.1.1-604800000
-      host:
-      - api.openai.com
-      user-agent:
-      - OpenAI/Python 1.93.0
-      x-stainless-arch:
-      - arm64
-      x-stainless-async:
-      - 'false'
-      x-stainless-lang:
-      - python
-      x-stainless-os:
-      - MacOS
-      x-stainless-package-version:
-      - 1.93.0
-      x-stainless-raw-response:
-      - 'true'
-      x-stainless-read-timeout:
-      - '600.0'
-      x-stainless-retry-count:
-      - '0'
-      x-stainless-runtime:
-      - CPython
-      x-stainless-runtime-version:
-      - 3.11.12
-    method: POST
-    uri: https://api.openai.com/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//jFRNbxs5DL37VxA6jwPHddrUxxwWi2BRtEAPRevCYCSOh41GUkWOnTTI
-        fy8kf4zT5rCXOfCRT4+P5DxNAAw7swRjO1TbJz+90dvFxy//vX0za7dfr29+3eo/n75++Mh0O/za
-        maZUxLsfZPVYdWFjnzwpx7CHbSZUKqyX767mV/PL2eKqAn105EvZJul0Eac9B57OZ/PFdPZuenl9
-        qO4iWxKzhG8TAICn+i06g6MHs4RZc4z0JIIbMstTEoDJ0ZeIQREWxaCmGUEbg1Ko0p9WAWBlxMZM
-        K7OEq2YfaIncHdr7EluZzx0BbigopBy37MgBgiNF9uTAkdjMqbQOsYVdhwraEdBDIqvkIA6aBgXp
-        4uAdcLB+cNTArmPbAQfHFpUEJPYEQ3CUi2LHYVPoCpOi3EOmnwNn6imoXMC/cUdbyk3FWw7oj8+4
-        SAIhKkgiyy1b9P4RHHneUn4pTEn0WIYC6YDX5866aqDH+yKHFRJm5cqInjeB3AWM7vQsUgzhTFb9
-        48GtUlloSwMkZ4bEDMetOaSg1QH9XldVwSrk2wY4iBLWSs/hmG47zGiVMouylZP7WHkzdRSEtwQu
-        2qH4dhyBjcWKHWsXhzJTEgpVAwagByySirgzRSfLDrtzsTKr8Hy+VJnaQbAsdhi8PwMwhKhYfKzr
-        /P2APJ8W2MdNyvFO/ig1LQeWbp0JJYayrKIxmYo+TwC+10MZXuy+STn2Sdca76k+92ax2POZ8T5H
-        9P31AdSo6Mf4YjFvXuFb71dezk7NWLQdubF0vEscHMczYHLW9d9qXuPed85h83/oR8BaSkpunTI5
-        ti87HtMy/agTfT3t5HIVbITyli2tlSmXSThqcfD7n4qRR1Hq1y2HDeWUuf5ZyiQnz5PfAAAA//8D
-        AEfUP8BcBQAA
-    headers:
-      CF-RAY:
-      - 95f365f1bfc87ded-GRU
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Mon, 14 Jul 2025 19:24:07 GMT
-      Server:
-      - cloudflare
-      Set-Cookie:
-      - __cf_bm=PcC3_3T8.MK_WpZlQLdZfwpNv9Pe45AIYmrXOSgJ65E-1752521047-1.0.1.1-eyqwSWfQC7ZV6.JwTsTihK1ZWCrEmxd52CtNcfe.fw1UjjBN9rdTU4G7hRZiNqHQYo4sVZMmgRgqM9k7HRSzN2zln0bKmMiOuSQTZh6xF_I;
-        path=/; expires=Mon, 14-Jul-25 19:54:07 GMT; domain=.api.openai.com; HttpOnly;
-        Secure; SameSite=None
-      - _cfuvid=JvQ1c4qYZefNwOPoVNgAtX8ET7ObU.JKDvGc43LOR6g-1752521047741-0.0.1.1-604800000;
-        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
-      Transfer-Encoding:
-      - chunked
-      X-Content-Type-Options:
-      - nosniff
-      access-control-expose-headers:
-      - X-Request-ID
-      alt-svc:
-      - h3=":443"; ma=86400
-      cf-cache-status:
-      - DYNAMIC
-      openai-organization:
-      - crewai-iuxna1
-      openai-processing-ms:
-      - '2729'
-      openai-version:
-      - '2020-10-01'
-      strict-transport-security:
-      - max-age=31536000; includeSubDomains; preload
-      x-envoy-upstream-service-time:
-      - '2789'
-      x-ratelimit-limit-requests:
-      - '30000'
-      x-ratelimit-limit-tokens:
-      - '150000000'
-      x-ratelimit-remaining-requests:
-      - '29999'
-      x-ratelimit-remaining-tokens:
-      - '149999559'
-      x-ratelimit-reset-requests:
-      - 2ms
-      x-ratelimit-reset-tokens:
-      - 0s
-      x-request-id:
-      - req_74f6e8ff49db25dbea3d3525cc149e8e
-    status:
-      code: 200
-      message: OK
 version: 1
--- a/tests/cassettes/TestAgentEvaluator.test_failed_evaluation.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_failed_evaluation.yaml
@@ -1,123 +0,0 @@
-interactions:
- request:
-    body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
-      created for testing purposes\nYour personal goal is: Complete test tasks successfully\nTo
-      give my best complete final answer to the task respond using the exact following
-      format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
-      answer must be the great and the most complete as possible, it must be outcome
-      described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
-      "content": "\nCurrent Task: Test task description\n\nThis is the expected criteria
-      for your final answer: Expected test output\nyou MUST return the actual complete
-      content as the final answer, not a summary.\n\nBegin! This is VERY important
-      to you, use the tools available and give your best Final Answer, your job depends
-      on it!\n\nThought:"}], "model": "gpt-4o-mini", "stop": ["\nObservation:"]}'
-    headers:
-      accept:
-      - application/json
-      accept-encoding:
-      - gzip, deflate, zstd
-      connection:
-      - keep-alive
-      content-length:
-      - '879'
-      content-type:
-      - application/json
-      host:
-      - api.openai.com
-      user-agent:
-      - OpenAI/Python 1.93.0
-      x-stainless-arch:
-      - arm64
-      x-stainless-async:
-      - 'false'
-      x-stainless-lang:
-      - python
-      x-stainless-os:
-      - MacOS
-      x-stainless-package-version:
-      - 1.93.0
-      x-stainless-raw-response:
-      - 'true'
-      x-stainless-read-timeout:
-      - '600.0'
-      x-stainless-retry-count:
-      - '0'
-      x-stainless-runtime:
-      - CPython
-      x-stainless-runtime-version:
-      - 3.11.12
-    method: POST
-    uri: https://api.openai.com/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//jFTBbhtHDL3rK4g5rwRbtaNYt9RoEaNoUaBODm0DgZnh7jKe5WyHXDmO
-        4X8vZiRLcupDLwvsPPLxPQ45jzMAx8GtwfkezQ9jnP9oeLv98N5+vfl9+4v89Mf76+XV7XDz8Yc/
-        r39T15SM9PkLeXvOWvg0jJGMk+xgnwmNCuv56nJ5+XZ1tbqswJACxZLWjTa/SPOBhefLs+XF/Gw1
-        P3+7z+4Te1K3hr9mAACP9Vt0SqCvbg1nzfPJQKrYkVsfggBcTrGcOFRlNRRzzRH0SYykSr8BSffg
-        UaDjLQFCV2QDit5TBvhbfmbBCO/q/xpue1ZgBesJ6OtI3iiAkRqkycbJGrjv2ffgk5S6CqkFhECG
-        HClAIPWZx9Kkgtz3aJVq37vChXoH2qcpBogp3UHkO1rAbU/QViW7Os8hLD5OgQBjBCFfOpEfgKVN
-        ecBSpoFAQxK1jMbSgY+Y2R6aWjJTT6K8JSHVBlACYOgpk3gCS4DyADqS55YpQDdxoMhCuoCbgwKf
-        tpSB0PeAJdaKseKpOsn0z8SZBhJrgESnXERY8S0JRsxWulkoilkKkDJ0JJQx8jcKi13DX3pWyuWm
-        FPDQN8jU7mW3KRfdSaj2r5ZLMEmgXOYg7K5OlcQYI1Cs4vSFavSVmLWnsDgdnEztpFiGV6YYTwAU
-        SVYbXkf20x55OgxpTN2Y02f9LtW1LKz9JhNqkjKQaml0FX2aAXyqyzC9mG835jSMtrF0R7Xc+Zvz
-        HZ877uARvXqzBy0ZxuP58nLVvMK32Q2rnqyT8+h7CsfU4+7hFDidALMT1/9V8xr3zjlL93/oj4D3
-        NBqFzZgpsH/p+BiW6Utd0dfDDl2ugl2ZK/a0MaZcbiJQi1PcPRxOH9Ro2LQsHeUxc309yk3Onmb/
-        AgAA//8DAAbYfvVABQAA
-    headers:
-      CF-RAY:
-      - 95f9c7ffa8331b11-GRU
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Tue, 15 Jul 2025 13:59:38 GMT
-      Server:
-      - cloudflare
-      Set-Cookie:
-      - __cf_bm=J_xe1AP.B5P6D2GVMCesyioeS5E9DnYT34rbwQUefFc-1752587978-1.0.1.1-5Dflk5cAj6YCsOSVbCFWWSpXpw_mXsczIdzWzs2h2OwDL01HQbduE5LAToy67sfjFjHeeO4xRrqPLUQpySy2QqyHXbI_fzX4UAt3.UdwHxU;
-        path=/; expires=Tue, 15-Jul-25 14:29:38 GMT; domain=.api.openai.com; HttpOnly;
-        Secure; SameSite=None
-      - _cfuvid=0rTD8RMpxBQQy42jzmum16_eoRtWNfaZMG_TJkhGS7I-1752587978437-0.0.1.1-604800000;
-        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
-      Transfer-Encoding:
-      - chunked
-      X-Content-Type-Options:
-      - nosniff
-      access-control-expose-headers:
-      - X-Request-ID
-      alt-svc:
-      - h3=":443"; ma=86400
-      cf-cache-status:
-      - DYNAMIC
-      openai-organization:
-      - crewai-iuxna1
-      openai-processing-ms:
-      - '2623'
-      openai-version:
-      - '2020-10-01'
-      strict-transport-security:
-      - max-age=31536000; includeSubDomains; preload
-      x-envoy-upstream-service-time:
-      - '2626'
-      x-ratelimit-limit-requests:
-      - '30000'
-      x-ratelimit-limit-tokens:
-      - '150000000'
-      x-ratelimit-remaining-requests:
-      - '29999'
-      x-ratelimit-remaining-tokens:
-      - '149999813'
-      x-ratelimit-reset-requests:
-      - 2ms
-      x-ratelimit-reset-tokens:
-      - 0s
-      x-request-id:
-      - req_ccc347e91010713379c920aa0efd1f4f
-    status:
-      code: 200
-      message: OK
-version: 1
--- a/tests/cassettes/test_agent_knowledege_with_crewai_knowledge.yaml
+++ b/tests/cassettes/test_agent_knowledege_with_crewai_knowledge.yaml
@@ -1,150 +0,0 @@
-interactions:
- request:
-    body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
-      "Your goal is to rewrite the user query so that it is optimized for retrieval
-      from a vector database. Consider how the query will be used to find relevant
-      documents, and aim to make it more specific and context-aware. \n\n Do not include
-      any other text than the rewritten query, especially any preamble or postamble
-      and only add expected output format if its relevant to the rewritten query.
-      \n\n Focus on the key words of the intended task and to retrieve the most relevant
-      information. \n\n There will be some extra context provided that might need
-      to be removed such as expected_output formats structured_outputs and other instructions."},
-      {"role": "user", "content": "The original query is: What is Vidit''s favorite
-      color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
-      color.\nyou MUST return the actual complete content as the final answer, not
-      a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
-    headers:
-      accept:
-      - '*/*'
-      accept-encoding:
-      - gzip, deflate
-      connection:
-      - keep-alive
-      content-length:
-      - '1017'
-      content-type:
-      - application/json
-      host:
-      - openrouter.ai
-      http-referer:
-      - https://litellm.ai
-      user-agent:
-      - litellm/1.68.0
-      x-title:
-      - liteLLM
-    method: POST
-    uri: https://openrouter.ai/api/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1vE0EMhv9K9V64TMrmgyadG8ceECAhhIrQarrj
-        3bidHY/GTgSK9r+jpUpaJLja78djn8ARHgPlxXK72a6X6+12szhq7Id72d2V8b58/nbzQb98gkOp
-        cuRIFR4fC+X3d3AYJVKChxTKgd8OxRYbWYycGQ7y8EidwaPbB7vuZCyJjCXDoasUjCL8S61Dtxfu
-        SOG/n5BkKFUeFD4fUnLoObPu20pBJcNDTQoccjA+UvufLedIP+Ebh5FUw0DwJ1RJBI+gymoh20wj
-        2SjPpF85sr3Rqz4cpbLRVSdJ6jUcKvUHDenM81zFeXgeTNMPB/2lRuMMM1Atlf8k9qVt1rer3WrV
-        3DZwOJw5SpWxWGvyRFnnR7ybQc4/usxvHEwspBfhbun+NreRLHDSObUL3Z7iRdxM/wh9rb/c8coy
-        Tb8BAAD//wMAqVt3JyMCAAA=
-    headers:
-      Access-Control-Allow-Origin:
-      - '*'
-      CF-RAY:
-      - 9402cb503aec46c0-BOM
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Thu, 15 May 2025 12:56:14 GMT
-      Server:
-      - cloudflare
-      Transfer-Encoding:
-      - chunked
-      Vary:
-      - Accept-Encoding
-      x-clerk-auth-message:
-      - Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
-        token-carrier=header)
-      x-clerk-auth-reason:
-      - token-invalid
-      x-clerk-auth-status:
-      - signed-out
-    status:
-      code: 200
-      message: OK
- request:
-    body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
-      "You are Information Agent. You have access to specific knowledge sources.\nYour
-      personal goal is: Provide information based on knowledge sources\nTo give my
-      best complete final answer to the task respond using the exact following format:\n\nThought:
-      I now can give a great answer\nFinal Answer: Your final answer must be the great
-      and the most complete as possible, it must be outcome described.\n\nI MUST use
-      these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
-      Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
-      your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
-      complete content as the final answer, not a summary.\n\nBegin! This is VERY
-      important to you, use the tools available and give your best Final Answer, your
-      job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
-    headers:
-      accept:
-      - '*/*'
-      accept-encoding:
-      - gzip, deflate
-      connection:
-      - keep-alive
-      content-length:
-      - '951'
-      content-type:
-      - application/json
-      host:
-      - openrouter.ai
-      http-referer:
-      - https://litellm.ai
-      user-agent:
-      - litellm/1.68.0
-      x-title:
-      - liteLLM
-    method: POST
-    uri: https://openrouter.ai/api/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kE9rG0EMxb/K8C69jNON7WJ7boFS
-        CD2ENm2g/1jGs/Ja7aw0zIydBuPvXjbBcQrtUU9P0u/pAO7g0JNMLhfzxexytli8mdy8r7c6/3Lb
-        v13eff00088fPj7AImXdc0cZDjeJ5OoaFoN2FOGgicTz6z7VyVwnAwvDQtc/KVQ4hK2vF0GHFKmy
-        CixCJl+pgzuftQhb5UAF7tsBUfuUdV3gZBejxYaFy7bN5IsKHErVBAvxlffU/qfL0tFvuMZioFJ8
-        T3AHZI0EB18Kl+qljjQqlWQkvTai9yZ4MT3vyXjTj6DGS7mnbMx3ecfio7l6rJ25447rq2I2fq+Z
-        K5mgUbPhYtZxRxewyLTZFR9PMZ4IWfon4Xj8YVEeSqVhzNBTTpkfQTapbWar6XI6bVYNLHYn/JR1
-        SLWt+oukjP9rRv7Ta8/6yqJq9fGsLFf27+m2o+o5lnFt8GFL3bO5Of5j60v/c5AXI8fjHwAAAP//
-        AwDEkP8dZgIAAA==
-    headers:
-      Access-Control-Allow-Origin:
-      - '*'
-      CF-RAY:
-      - 9402cb55c9fe46c0-BOM
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Thu, 15 May 2025 12:56:15 GMT
-      Server:
-      - cloudflare
-      Transfer-Encoding:
-      - chunked
-      Vary:
-      - Accept-Encoding
-      x-clerk-auth-message:
-      - Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
-        token-carrier=header)
-      x-clerk-auth-reason:
-      - token-invalid
-      x-clerk-auth-status:
-      - signed-out
-    status:
-      code: 200
-      message: OK
-version: 1
--- a/tests/cassettes/test_agent_with_knowledge_with_no_crewai_knowledge.yaml
+++ b/tests/cassettes/test_agent_with_knowledge_with_no_crewai_knowledge.yaml
@@ -1,151 +0,0 @@
-interactions:
- request:
-    body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
-      "Your goal is to rewrite the user query so that it is optimized for retrieval
-      from a vector database. Consider how the query will be used to find relevant
-      documents, and aim to make it more specific and context-aware. \n\n Do not include
-      any other text than the rewritten query, especially any preamble or postamble
-      and only add expected output format if its relevant to the rewritten query.
-      \n\n Focus on the key words of the intended task and to retrieve the most relevant
-      information. \n\n There will be some extra context provided that might need
-      to be removed such as expected_output formats structured_outputs and other instructions."},
-      {"role": "user", "content": "The original query is: What is Vidit''s favorite
-      color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
-      color.\nyou MUST return the actual complete content as the final answer, not
-      a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
-    headers:
-      accept:
-      - '*/*'
-      accept-encoding:
-      - gzip, deflate
-      connection:
-      - keep-alive
-      content-length:
-      - '1017'
-      content-type:
-      - application/json
-      host:
-      - openrouter.ai
-      http-referer:
-      - https://litellm.ai
-      user-agent:
-      - litellm/1.68.0
-      x-title:
-      - liteLLM
-    method: POST
-    uri: https://openrouter.ai/api/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1vE0EMhv9K9V64TGCbNGQ7N46gIg6IXhBaTWed
-        Xbez49HYiaii/e9oqRKKBFf7/XjsE7iHx0B5db272W2uN++b3ep585k+jcmo/XqnYXvX5m/3cChV
-        jtxThceXQvnDRzhM0lOChxTKgd8NxVY3spo4Mxzk4ZGiwSOOwd5GmUoiY8lwiJWCUQ9/qW0d4igc
-        SeG/n5BkKFUeFD4fUnLYc2Ydu0pBJcNDTQoccjA+UvefLeeefsI3DhOphoHgT6iSCB5BldVCtoVG
-        slFeSO+5Z3ujV/twlMpGV1GSVDhU2h80pDPOSxPn4WUwzz8c9FmNpoVloFoq/w7cl67Z3K7b9bq5
-        beBwOGOUKlOxzuSJsi5/2C4c5xdd5lsHEwvpj7Bt3N/mricLnHRJjSGO1F/EzfyP0Nf6yx2vLPP8
-        CwAA//8DAOHu/cIiAgAA
-    headers:
-      Access-Control-Allow-Origin:
-      - '*'
-      CF-RAY:
-      - 9402c73df9d8859c-BOM
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Thu, 15 May 2025 12:53:27 GMT
-      Server:
-      - cloudflare
-      Transfer-Encoding:
-      - chunked
-      Vary:
-      - Accept-Encoding
-      x-clerk-auth-message:
-      - Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
-        token-carrier=header)
-      x-clerk-auth-reason:
-      - token-invalid
-      x-clerk-auth-status:
-      - signed-out
-    status:
-      code: 200
-      message: OK
- request:
-    body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
-      "You are Information Agent. You have access to specific knowledge sources.\nYour
-      personal goal is: Provide information based on knowledge sources\nTo give my
-      best complete final answer to the task respond using the exact following format:\n\nThought:
-      I now can give a great answer\nFinal Answer: Your final answer must be the great
-      and the most complete as possible, it must be outcome described.\n\nI MUST use
-      these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
-      Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
-      your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
-      complete content as the final answer, not a summary.\n\nBegin! This is VERY
-      important to you, use the tools available and give your best Final Answer, your
-      job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
-    headers:
-      accept:
-      - '*/*'
-      accept-encoding:
-      - gzip, deflate
-      connection:
-      - keep-alive
-      content-length:
-      - '951'
-      content-type:
-      - application/json
-      host:
-      - openrouter.ai
-      http-referer:
-      - https://litellm.ai
-      user-agent:
-      - litellm/1.68.0
-      x-title:
-      - liteLLM
-    method: POST
-    uri: https://openrouter.ai/api/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kUGPEzEMhf+K5QuXdJmlpbvkthIg
-        emFXQoIDoMpNPFNDJo6STLul6n9H09KyIDjmxc9+/rxH8Wix4zi5vpndTK+n8+Z2wo9vXj28fHff
-        vW4+PNT5j1l6/wkNpqwb8ZzR4n3ieLdAg716DmhRE0eS512qk5lOeomCBnX1jV1Fi25N9cppnwJX
-        0YgGXWaq7NH+HmvQrVUcF7Sf9xi0S1lXBW0cQjDYSpSyXmamohEtlqoJDUaqsuHlf34len5E2xjs
-        uRTqGO0eswZGi1SKlEqxjmk0Vo5j0gVE3YKjCJ1sGAi6MShQLFvOAF/iW4kU4O74tvBRvNRnBVra
-        aJbK4DRoBikQtcJWPIcdeHVDz7GyB4mQhlUQF3ZAG5JAq8BQdMiOi4GisBiHj+ZftIHA87hePeY5
-        5cjcUfYSO1hLgZLYSSvurxRXaDBzOxQKZ4gnPhK7k3A4fDVYdqVyPxLsOKcsRwxtWvoVOZo3vm3Q
-        4HCGl7L2qS6rfudYxus1I73zYS/69NZg1UrhorwYD/yHe+m5koQytnXk1uwvxc3hH12f1l8WeWI5
-        HH4CAAD//wMAhZKqO+QCAAA=
-    headers:
-      Access-Control-Allow-Origin:
-      - '*'
-      CF-RAY:
-      - 9402c7459f3f859c-BOM
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Thu, 15 May 2025 12:53:28 GMT
-      Server:
-      - cloudflare
-      Transfer-Encoding:
-      - chunked
-      Vary:
-      - Accept-Encoding
-      x-clerk-auth-message:
-      - Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
-        token-carrier=header)
-      x-clerk-auth-reason:
-      - token-invalid
-      x-clerk-auth-status:
-      - signed-out
-    status:
-      code: 200
-      message: OK
-version: 1
--- a/tests/cassettes/test_agent_with_only_crewai_knowledge.yaml
+++ b/tests/cassettes/test_agent_with_only_crewai_knowledge.yaml
@@ -1,150 +0,0 @@
-interactions:
- request:
-    body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
-      "Your goal is to rewrite the user query so that it is optimized for retrieval
-      from a vector database. Consider how the query will be used to find relevant
-      documents, and aim to make it more specific and context-aware. \n\n Do not include
-      any other text than the rewritten query, especially any preamble or postamble
-      and only add expected output format if its relevant to the rewritten query.
-      \n\n Focus on the key words of the intended task and to retrieve the most relevant
-      information. \n\n There will be some extra context provided that might need
-      to be removed such as expected_output formats structured_outputs and other instructions."},
-      {"role": "user", "content": "The original query is: What is Vidit''s favorite
-      color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
-      color.\nyou MUST return the actual complete content as the final answer, not
-      a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
-    headers:
-      accept:
-      - '*/*'
-      accept-encoding:
-      - gzip, deflate
-      connection:
-      - keep-alive
-      content-length:
-      - '1017'
-      content-type:
-      - application/json
-      host:
-      - openrouter.ai
-      http-referer:
-      - https://litellm.ai
-      user-agent:
-      - litellm/1.68.0
-      x-title:
-      - liteLLM
-    method: POST
-    uri: https://openrouter.ai/api/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1PIzEMhv8Kei97Sdnplwq5gTgAF8ShcFitRmnG
-        nTFk4ihxq11V899Xs6gFJLja78djH8ANLFqKk+lqsZpP56vpYqJhublfP1eP65v1i79Lt9fdMwxS
-        lj03lGHxkChe3cGgl4YCLCRRdPyzTTpZyKTnyDCQzQt5hYXvnJ576VMgZYkw8JmcUgP7XmvgO2FP
-        BfbXAUHalGVTYOMuBIMtRy5dnckVibAoKgkG0Snvqf5my7GhP7CVQU+luJZgD8gSCBauFC7qoo40
-        EpXiSPrEDeuPcrZ1e8msdOYlSIZBpu2uuHDEeWvi2L4NhuG3QflblPqRpaWcMv8P3Ka6ml/OLmaz
-        6rKCwe6IkbL0SWuVV4pl/MNy5Di+6DRfGqioC+/Ci8p8NtcNqeNQxlTvfEfNSVwNX4R+1J/u+GAZ
-        hn8AAAD//wMAIwJ79CICAAA=
-    headers:
-      Access-Control-Allow-Origin:
-      - '*'
-      CF-RAY:
-      - 9402c9db99ec4722-BOM
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Thu, 15 May 2025 12:55:14 GMT
-      Server:
-      - cloudflare
-      Transfer-Encoding:
-      - chunked
-      Vary:
-      - Accept-Encoding
-      x-clerk-auth-message:
-      - Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
-        token-carrier=header)
-      x-clerk-auth-reason:
-      - token-invalid
-      x-clerk-auth-status:
-      - signed-out
-    status:
-      code: 200
-      message: OK
- request:
-    body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
-      "You are Information Agent. You have access to specific knowledge sources.\nYour
-      personal goal is: Provide information based on knowledge sources\nTo give my
-      best complete final answer to the task respond using the exact following format:\n\nThought:
-      I now can give a great answer\nFinal Answer: Your final answer must be the great
-      and the most complete as possible, it must be outcome described.\n\nI MUST use
-      these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
-      Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
-      your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
-      complete content as the final answer, not a summary.\n\nBegin! This is VERY
-      important to you, use the tools available and give your best Final Answer, your
-      job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
-    headers:
-      accept:
-      - '*/*'
-      accept-encoding:
-      - gzip, deflate
-      connection:
-      - keep-alive
-      content-length:
-      - '951'
-      content-type:
-      - application/json
-      host:
-      - openrouter.ai
-      http-referer:
-      - https://litellm.ai
-      user-agent:
-      - litellm/1.68.0
-      x-title:
-      - liteLLM
-    method: POST
-    uri: https://openrouter.ai/api/v1/chat/completions
-  response:
-    body:
-      string: !!binary |
-        H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kN1qGzEQRl9FfNdyul4nday73ARy
-        VUpLE2jLIu+O15NoZ4QkOy1moa/R1+uTlE1wnEB7qU/zc84cwB0cepLZfHm+XMwXy/nF7II/3d7V
-        H+tOPvsS3le3d+keFjHpnjtKcPgQSa5uYDFoRwEOGkk8v+tjmZ3rbGBhWOj6ntoCh3bry1mrQwxU
-        WAUWbSJfqIM7rbVot8otZbivBwTtY9J1hpNdCBYbFs7bJpHPKnDIRSMsxBfeU/OfX5aOfsBVFgPl
-        7HuCOyBpIDj4nDkXL2WiUSkkE+mNEX00rRfT856MN/0EarzkR0rGfJNrFh/M1dPbmS/ccfnz63c2
-        G7/XxIVMq0GT4WzWYUdnsEi02WUfjiLPjCz9czCO3y3yz1xomCx6SjHxE8omNtViVV/WdbWqYLE7
-        CsSkQyxN0QeSPF2wmgyOxz3lK4uixYdTcrmyb7ubjornkKexrW+31L0UV+M/pr6ufxF51TKOfwEA
-        AP//AwBybekMaAIAAA==
-    headers:
-      Access-Control-Allow-Origin:
-      - '*'
-      CF-RAY:
-      - 9402c9e1b94a4722-BOM
-      Connection:
-      - keep-alive
-      Content-Encoding:
-      - gzip
-      Content-Type:
-      - application/json
-      Date:
-      - Thu, 15 May 2025 12:55:15 GMT
-      Server:
-      - cloudflare
-      Transfer-Encoding:
-      - chunked
-      Vary:
-      - Accept-Encoding
-      x-clerk-auth-message:
-      - Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
-        token-carrier=header)
-      x-clerk-auth-reason:
-      - token-invalid
-      x-clerk-auth-status:
-      - signed-out
-    status:
-      code: 200
-      message: OK
-version: 1
--- a/tests/experimental/evaluation/metrics/init.py
+++ b/tests/experimental/evaluation/metrics/init.py
--- a/tests/evaluation/metrics/init.py
+++ b/tests/evaluation/metrics/init.py
--- a/tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py
+++ b/tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py
--- a/tests/experimental/evaluation/metrics/test_goal_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_goal_metrics.py
@@ -1,8 +1,8 @@
 from unittest.mock import patch, MagicMock
-from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

-from crewai.experimental.evaluation.base_evaluator import EvaluationScore
-from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
+from crewai.evaluation.base_evaluator import EvaluationScore
+from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
 from crewai.utilities.llm_utils import LLM


--- a/tests/experimental/evaluation/metrics/test_reasoning_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_reasoning_metrics.py
@@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock
 from typing import List, Dict, Any

 from crewai.tasks.task_output import TaskOutput
-from crewai.experimental.evaluation.metrics.reasoning_metrics import (
+from crewai.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator,
 )
-from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM
-from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.evaluation.base_evaluator import EvaluationScore

 class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
    @pytest.fixture
--- a/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py
@@ -1,8 +1,8 @@
 from unittest.mock import patch, MagicMock

-from crewai.experimental.evaluation.base_evaluator import EvaluationScore
-from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
-from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from crewai.evaluation.base_evaluator import EvaluationScore
+from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
+from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM

 class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest):
--- a/tests/experimental/evaluation/metrics/test_tools_metrics.py
+++ b/tests/experimental/evaluation/metrics/test_tools_metrics.py
@@ -1,12 +1,12 @@
 from unittest.mock import patch, MagicMock

-from crewai.experimental.evaluation.metrics.tools_metrics import (
+from crewai.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
 )
 from crewai.utilities.llm_utils import LLM
-from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
+from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

 class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tools_available(self, mock_task, mock_agent):
--- a/tests/evaluation/test_agent_evaluator.py
+++ b/tests/evaluation/test_agent_evaluator.py
@@ -0,0 +1,95 @@
+import pytest
+
+from crewai.agent import Agent
+from crewai.task import Task
+from crewai.crew import Crew
+from crewai.evaluation.agent_evaluator import AgentEvaluator
+from crewai.evaluation.base_evaluator import AgentEvaluationResult
+from crewai.evaluation import (
+    GoalAlignmentEvaluator,
+    SemanticQualityEvaluator,
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator,
+    ReasoningEfficiencyEvaluator
+)
+
+from crewai.evaluation import create_default_evaluator
+class TestAgentEvaluator:
+    @pytest.fixture
+    def mock_crew(self):
+        agent = Agent(
+            role="Test Agent",
+            goal="Complete test tasks successfully",
+            backstory="An agent created for testing purposes",
+            allow_delegation=False,
+            verbose=False
+        )
+
+        task = Task(
+            description="Test task description",
+            agent=agent,
+            expected_output="Expected test output"
+        )
+
+        crew = Crew(
+            agents=[agent],
+            tasks=[task]
+        )
+        return crew
+
+    def test_set_iteration(self):
+        agent_evaluator = AgentEvaluator()
+
+        agent_evaluator.set_iteration(3)
+        assert agent_evaluator.iteration == 3
+
+    @pytest.mark.vcr(filter_headers=["authorization"])
+    def test_evaluate_current_iteration(self, mock_crew):
+        agent_evaluator = AgentEvaluator(crew=mock_crew, evaluators=[GoalAlignmentEvaluator()])
+
+        mock_crew.kickoff()
+
+        results = agent_evaluator.evaluate_current_iteration()
+
+        assert isinstance(results, dict)
+
+        agent, = mock_crew.agents
+        task, = mock_crew.tasks
+
+        assert len(mock_crew.agents) == 1
+        assert agent.role in results
+        assert len(results[agent.role]) == 1
+
+        result, = results[agent.role]
+        assert isinstance(result, AgentEvaluationResult)
+
+        assert result.agent_id == str(agent.id)
+        assert result.task_id == str(task.id)
+
+        goal_alignment, = result.metrics.values()
+        assert goal_alignment.score == 5.0
+
+        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
+        assert expected_feedback in goal_alignment.feedback
+
+        assert goal_alignment.raw_response is not None
+        assert '"score": 5' in goal_alignment.raw_response
+
+    def test_create_default_evaluator(self, mock_crew):
+        agent_evaluator = create_default_evaluator(crew=mock_crew)
+        assert isinstance(agent_evaluator, AgentEvaluator)
+        assert agent_evaluator.crew == mock_crew
+
+        expected_types = [
+            GoalAlignmentEvaluator,
+            SemanticQualityEvaluator,
+            ToolSelectionEvaluator,
+            ParameterExtractionEvaluator,
+            ToolInvocationEvaluator,
+            ReasoningEfficiencyEvaluator
+        ]
+
+        assert len(agent_evaluator.evaluators) == len(expected_types)
+        for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
+            assert isinstance(evaluator, expected_type)
--- a/tests/experimental/evaluation/test_agent_evaluator.py
+++ b/tests/experimental/evaluation/test_agent_evaluator.py
@@ -1,278 +0,0 @@
-import pytest
-
-from crewai.agent import Agent
-from crewai.task import Task
-from crewai.crew import Crew
-from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
-from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
-from crewai.experimental.evaluation import (
-    GoalAlignmentEvaluator,
-    SemanticQualityEvaluator,
-    ToolSelectionEvaluator,
-    ParameterExtractionEvaluator,
-    ToolInvocationEvaluator,
-    ReasoningEfficiencyEvaluator,
-    MetricCategory,
-    EvaluationScore
-)
-
-from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
-from crewai.utilities.events.crewai_event_bus import crewai_event_bus
-from crewai.experimental.evaluation import create_default_evaluator
-
-class TestAgentEvaluator:
-    @pytest.fixture
-    def mock_crew(self):
-        agent = Agent(
-            role="Test Agent",
-            goal="Complete test tasks successfully",
-            backstory="An agent created for testing purposes",
-            allow_delegation=False,
-            verbose=False
-        )
-
-        task = Task(
-            description="Test task description",
-            agent=agent,
-            expected_output="Expected test output"
-        )
-
-        crew = Crew(
-            agents=[agent],
-            tasks=[task]
-        )
-        return crew
-
-    def test_set_iteration(self):
-        agent_evaluator = AgentEvaluator(agents=[])
-
-        agent_evaluator.set_iteration(3)
-        assert agent_evaluator._execution_state.iteration == 3
-
-    @pytest.mark.vcr(filter_headers=["authorization"])
-    def test_evaluate_current_iteration(self, mock_crew):
-        agent_evaluator = AgentEvaluator(agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()])
-
-        mock_crew.kickoff()
-
-        results = agent_evaluator.get_evaluation_results()
-
-        assert isinstance(results, dict)
-
-        agent, = mock_crew.agents
-        task, = mock_crew.tasks
-
-        assert len(mock_crew.agents) == 1
-        assert agent.role in results
-        assert len(results[agent.role]) == 1
-
-        result, = results[agent.role]
-        assert isinstance(result, AgentEvaluationResult)
-
-        assert result.agent_id == str(agent.id)
-        assert result.task_id == str(task.id)
-
-        goal_alignment, = result.metrics.values()
-        assert goal_alignment.score == 5.0
-
-        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
-        assert expected_feedback in goal_alignment.feedback
-
-        assert goal_alignment.raw_response is not None
-        assert '"score": 5' in goal_alignment.raw_response
-
-    def test_create_default_evaluator(self, mock_crew):
-        agent_evaluator = create_default_evaluator(agents=mock_crew.agents)
-        assert isinstance(agent_evaluator, AgentEvaluator)
-        assert agent_evaluator.agents == mock_crew.agents
-
-        expected_types = [
-            GoalAlignmentEvaluator,
-            SemanticQualityEvaluator,
-            ToolSelectionEvaluator,
-            ParameterExtractionEvaluator,
-            ToolInvocationEvaluator,
-            ReasoningEfficiencyEvaluator
-        ]
-
-        assert len(agent_evaluator.evaluators) == len(expected_types)
-        for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
-            assert isinstance(evaluator, expected_type)
-
-    @pytest.mark.vcr(filter_headers=["authorization"])
-    def test_eval_lite_agent(self):
-        agent = Agent(
-            role="Test Agent",
-            goal="Complete test tasks successfully",
-            backstory="An agent created for testing purposes",
-        )
-
-        with crewai_event_bus.scoped_handlers():
-            events = {}
-            @crewai_event_bus.on(AgentEvaluationStartedEvent)
-            def capture_started(source, event):
-                events["started"] = event
-
-            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
-            def capture_completed(source, event):
-                events["completed"] = event
-
-            @crewai_event_bus.on(AgentEvaluationFailedEvent)
-            def capture_failed(source, event):
-                events["failed"] = event
-
-            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
-
-            agent.kickoff(messages="Complete this task successfully")
-
-            assert events.keys() == {"started", "completed"}
-            assert events["started"].agent_id == str(agent.id)
-            assert events["started"].agent_role == agent.role
-            assert events["started"].task_id is None
-            assert events["started"].iteration == 1
-
-            assert events["completed"].agent_id == str(agent.id)
-            assert events["completed"].agent_role == agent.role
-            assert events["completed"].task_id is None
-            assert events["completed"].iteration == 1
-            assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
-            assert isinstance(events["completed"].score, EvaluationScore)
-            assert events["completed"].score.score == 2.0
-
-            results = agent_evaluator.get_evaluation_results()
-
-            assert isinstance(results, dict)
-
-            result, = results[agent.role]
-            assert isinstance(result, AgentEvaluationResult)
-
-            assert result.agent_id == str(agent.id)
-            assert result.task_id == "lite_task"
-
-            goal_alignment, = result.metrics.values()
-            assert goal_alignment.score == 2.0
-
-            expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
-            assert expected_feedback in goal_alignment.feedback
-
-            assert goal_alignment.raw_response is not None
-            assert '"score": 2' in goal_alignment.raw_response
-
-    @pytest.mark.vcr(filter_headers=["authorization"])
-    def test_eval_specific_agents_from_crew(self, mock_crew):
-        agent = Agent(
-            role="Test Agent Eval",
-            goal="Complete test tasks successfully",
-            backstory="An agent created for testing purposes",
-        )
-        task = Task(
-            description="Test task description",
-            agent=agent,
-            expected_output="Expected test output"
-        )
-        mock_crew.agents.append(agent)
-        mock_crew.tasks.append(task)
-
-        with crewai_event_bus.scoped_handlers():
-            events = {}
-            @crewai_event_bus.on(AgentEvaluationStartedEvent)
-            def capture_started(source, event):
-                events["started"] = event
-
-            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
-            def capture_completed(source, event):
-                events["completed"] = event
-
-            @crewai_event_bus.on(AgentEvaluationFailedEvent)
-            def capture_failed(source, event):
-                events["failed"] = event
-
-            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
-            mock_crew.kickoff()
-
-            assert events.keys() == {"started", "completed"}
-            assert events["started"].agent_id == str(agent.id)
-            assert events["started"].agent_role == agent.role
-            assert events["started"].task_id == str(task.id)
-            assert events["started"].iteration == 1
-
-            assert events["completed"].agent_id == str(agent.id)
-            assert events["completed"].agent_role == agent.role
-            assert events["completed"].task_id == str(task.id)
-            assert events["completed"].iteration == 1
-            assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
-            assert isinstance(events["completed"].score, EvaluationScore)
-            assert events["completed"].score.score == 5.0
-
-            results = agent_evaluator.get_evaluation_results()
-
-            assert isinstance(results, dict)
-            assert len(results.keys()) == 1
-            result, = results[agent.role]
-            assert isinstance(result, AgentEvaluationResult)
-
-            assert result.agent_id == str(agent.id)
-            assert result.task_id == str(task.id)
-
-            goal_alignment, = result.metrics.values()
-            assert goal_alignment.score == 5.0
-
-            expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
-            assert expected_feedback in goal_alignment.feedback
-
-            assert goal_alignment.raw_response is not None
-            assert '"score": 5' in goal_alignment.raw_response
-
-
-    @pytest.mark.vcr(filter_headers=["authorization"])
-    def test_failed_evaluation(self, mock_crew):
-        agent, = mock_crew.agents
-        task, = mock_crew.tasks
-
-        with crewai_event_bus.scoped_handlers():
-            events = {}
-
-            @crewai_event_bus.on(AgentEvaluationStartedEvent)
-            def capture_started(source, event):
-                events["started"] = event
-
-            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
-            def capture_completed(source, event):
-                events["completed"] = event
-
-            @crewai_event_bus.on(AgentEvaluationFailedEvent)
-            def capture_failed(source, event):
-                events["failed"] = event
-
-            # Create a mock evaluator that will raise an exception
-            from crewai.experimental.evaluation.base_evaluator import BaseEvaluator
-            from crewai.experimental.evaluation import MetricCategory
-            class FailingEvaluator(BaseEvaluator):
-                metric_category = MetricCategory.GOAL_ALIGNMENT
-
-                def evaluate(self, agent, task, execution_trace, final_output):
-                    raise ValueError("Forced evaluation failure")
-
-            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[FailingEvaluator()])
-            mock_crew.kickoff()
-
-            assert events.keys() == {"started", "failed"}
-            assert events["started"].agent_id == str(agent.id)
-            assert events["started"].agent_role == agent.role
-            assert events["started"].task_id == str(task.id)
-            assert events["started"].iteration == 1
-
-            assert events["failed"].agent_id == str(agent.id)
-            assert events["failed"].agent_role == agent.role
-            assert events["failed"].task_id == str(task.id)
-            assert events["failed"].iteration == 1
-            assert events["failed"].error == "Forced evaluation failure"
-
-            results = agent_evaluator.get_evaluation_results()
-            result, = results[agent.role]
-            assert isinstance(result, AgentEvaluationResult)
-
-            assert result.agent_id == str(agent.id)
-            assert result.task_id == str(task.id)
-
-            assert result.metrics == {}
--- a/tests/experimental/evaluation/test_experiment_result.py
+++ b/tests/experimental/evaluation/test_experiment_result.py
@@ -1,111 +0,0 @@
-import pytest
-from unittest.mock import MagicMock, patch
-
-from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
-
-
-class TestExperimentResult:
-    @pytest.fixture
-    def mock_results(self):
-        return [
-            ExperimentResult(
-                identifier="test-1",
-                inputs={"query": "What is the capital of France?"},
-                score=10,
-                expected_score=7,
-                passed=True
-            ),
-            ExperimentResult(
-                identifier="test-2",
-                inputs={"query": "Who wrote Hamlet?"},
-                score={"relevance": 9, "factuality": 8},
-                expected_score={"relevance": 7, "factuality": 7},
-                passed=True,
-                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
-            ),
-            ExperimentResult(
-                identifier="test-3",
-                inputs={"query": "Any query"},
-                score={"relevance": 9, "factuality": 8},
-                expected_score={"relevance": 7, "factuality": 7},
-                passed=False,
-                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
-            ),
-            ExperimentResult(
-                identifier="test-4",
-                inputs={"query": "Another query"},
-                score={"relevance": 9, "factuality": 8},
-                expected_score={"relevance": 7, "factuality": 7},
-                passed=True,
-                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
-            ),
-            ExperimentResult(
-                identifier="test-6",
-                inputs={"query": "Yet another query"},
-                score={"relevance": 9, "factuality": 8},
-                expected_score={"relevance": 7, "factuality": 7},
-                passed=True,
-                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
-            )
-        ]
-
-    @patch('os.path.exists', return_value=True)
-    @patch('os.path.getsize', return_value=1)
-    @patch('json.load')
-    @patch('builtins.open', new_callable=MagicMock)
-    def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results):
-        baseline_data = {
-            "timestamp": "2023-01-01T00:00:00+00:00",
-            "results": [
-                {
-                    "identifier": "test-1",
-                    "inputs": {"query": "What is the capital of France?"},
-                    "score": 7,
-                    "expected_score": 7,
-                    "passed": False
-                },
-                {
-                    "identifier": "test-2",
-                    "inputs": {"query": "Who wrote Hamlet?"},
-                    "score": {"relevance": 8, "factuality": 7},
-                    "expected_score": {"relevance": 7, "factuality": 7},
-                    "passed": True
-                },
-                {
-                    "identifier": "test-3",
-                    "inputs": {"query": "Any query"},
-                    "score": {"relevance": 8, "factuality": 7},
-                    "expected_score": {"relevance": 7, "factuality": 7},
-                    "passed": True
-                },
-                {
-                    "identifier": "test-4",
-                    "inputs": {"query": "Another query"},
-                    "score": {"relevance": 8, "factuality": 7},
-                    "expected_score": {"relevance": 7, "factuality": 7},
-                    "passed": True
-                },
-                {
-                    "identifier": "test-5",
-                    "inputs": {"query": "Another query"},
-                    "score": {"relevance": 8, "factuality": 7},
-                    "expected_score": {"relevance": 7, "factuality": 7},
-                    "passed": True
-                }
-            ]
-        }
-
-        mock_json_load.return_value = baseline_data
-
-        results = ExperimentResults(results=mock_results)
-        results.display = MagicMock()
-
-        comparison = results.compare_with_baseline(baseline_filepath="baseline.json")
-
-        assert "baseline_timestamp" in comparison
-        assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00"
-        assert comparison["improved"] == ["test-1"]
-        assert comparison["regressed"] == ["test-3"]
-        assert comparison["unchanged"] == ["test-2", "test-4"]
-        assert comparison["new_tests"] == ["test-6"]
-        assert comparison["missing_tests"] == ["test-5"]
--- a/tests/experimental/evaluation/test_experiment_runner.py
+++ b/tests/experimental/evaluation/test_experiment_runner.py
@@ -1,197 +0,0 @@
-import pytest
-from unittest.mock import MagicMock, patch
-
-from crewai.crew import Crew
-from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
-from crewai.experimental.evaluation.experiment.result import ExperimentResults
-from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
-from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore
-
-
-class TestExperimentRunner:
-    @pytest.fixture
-    def mock_crew(self):
-        return MagicMock(llm=Crew)
-
-    @pytest.fixture
-    def mock_evaluator_results(self):
-        agent_evaluation = AgentAggregatedEvaluationResult(
-            agent_id="Test Agent",
-            agent_role="Test Agent Role",
-            metrics={
-                MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
-                    score=9,
-                    feedback="Test feedback for goal alignment",
-                    raw_response="Test raw response for goal alignment"
-                ),
-                MetricCategory.REASONING_EFFICIENCY: EvaluationScore(
-                    score=None,
-                    feedback="Reasoning efficiency not applicable",
-                    raw_response="Reasoning efficiency not applicable"
-                ),
-                MetricCategory.PARAMETER_EXTRACTION: EvaluationScore(
-                    score=7,
-                    feedback="Test parameter extraction explanation",
-                    raw_response="Test raw output"
-                ),
-                MetricCategory.TOOL_SELECTION: EvaluationScore(
-                    score=8,
-                    feedback="Test tool selection explanation",
-                    raw_response="Test raw output"
-                )
-            }
-        )
-
-        return {"Test Agent": agent_evaluation}
-
-    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
-    def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
-        dataset = [
-            {
-                "identifier": "test-case-1",
-                "inputs": {"query": "Test query 1"},
-                "expected_score": 8
-            },
-            {
-                "identifier": "test-case-2",
-                "inputs": {"query": "Test query 2"},
-                "expected_score": {"goal_alignment": 7}
-            },
-            {
-                "inputs": {"query": "Test query 3"},
-                "expected_score": {"tool_selection": 9}
-            }
-        ]
-
-        mock_evaluator = MagicMock()
-        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
-        mock_evaluator.reset_iterations_results = MagicMock()
-        mock_create_evaluator.return_value = mock_evaluator
-
-        runner = ExperimentRunner(dataset=dataset)
-
-        results = runner.run(crew=mock_crew)
-
-        assert isinstance(results, ExperimentResults)
-        result_1, result_2, result_3 = results.results
-        assert len(results.results) == 3
-
-        assert result_1.identifier == "test-case-1"
-        assert result_1.inputs == {"query": "Test query 1"}
-        assert result_1.expected_score == 8
-        assert result_1.passed is True
-
-        assert result_2.identifier == "test-case-2"
-        assert result_2.inputs == {"query": "Test query 2"}
-        assert isinstance(result_2.expected_score, dict)
-        assert "goal_alignment" in result_2.expected_score
-        assert result_2.passed is True
-
-        assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5"
-        assert result_3.inputs == {"query": "Test query 3"}
-        assert isinstance(result_3.expected_score, dict)
-        assert "tool_selection" in result_3.expected_score
-        assert result_3.passed is False
-
-        assert mock_crew.kickoff.call_count == 3
-        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"})
-        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"})
-        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"})
-
-        assert mock_evaluator.reset_iterations_results.call_count == 3
-        assert mock_evaluator.get_agent_evaluation.call_count == 3
-
-
-    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
-    def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
-        dataset = [
-            {
-                "identifier": "test-case-2",
-                "inputs": {"query": "Test query 2"},
-                "expected_score": {"goal_alignment": 7, "unknown_metric": 8}
-            }
-        ]
-
-        mock_evaluator = MagicMock()
-        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
-        mock_evaluator.reset_iterations_results = MagicMock()
-        mock_create_evaluator.return_value = mock_evaluator
-
-        runner = ExperimentRunner(dataset=dataset)
-
-        results = runner.run(crew=mock_crew)
-
-        result, = results.results
-
-        assert result.identifier == "test-case-2"
-        assert result.inputs == {"query": "Test query 2"}
-        assert isinstance(result.expected_score, dict)
-        assert "goal_alignment" in result.expected_score.keys()
-        assert "unknown_metric" in result.expected_score.keys()
-        assert result.passed is True
-
-    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
-    def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
-        dataset = [
-            {
-                "identifier": "test-case-2",
-                "inputs": {"query": "Test query 2"},
-                "expected_score": {"goal_alignment": 7}
-            }
-        ]
-
-        mock_evaluator = MagicMock()
-        mock_create_evaluator["Test Agent"].metrics = {
-            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
-                    score=9,
-                    feedback="Test feedback for goal alignment",
-                    raw_response="Test raw response for goal alignment"
-                )
-        }
-        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
-        mock_evaluator.reset_iterations_results = MagicMock()
-        mock_create_evaluator.return_value = mock_evaluator
-
-        runner = ExperimentRunner(dataset=dataset)
-
-        results = runner.run(crew=mock_crew)
-        result, = results.results
-
-        assert result.identifier == "test-case-2"
-        assert result.inputs == {"query": "Test query 2"}
-        assert isinstance(result.expected_score, dict)
-        assert "goal_alignment" in result.expected_score.keys()
-        assert result.passed is True
-
-    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
-    def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
-        dataset = [
-            {
-                "identifier": "test-case-2",
-                "inputs": {"query": "Test query 2"},
-                "expected_score": {"unknown_metric": 7}
-            }
-        ]
-
-        mock_evaluator = MagicMock()
-        mock_create_evaluator["Test Agent"].metrics = {
-            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
-                score=5,
-                feedback="Test feedback for goal alignment",
-                raw_response="Test raw response for goal alignment"
-            )
-        }
-        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
-        mock_evaluator.reset_iterations_results = MagicMock()
-        mock_create_evaluator.return_value = mock_evaluator
-
-        runner = ExperimentRunner(dataset=dataset)
-
-        results = runner.run(crew=mock_crew)
-        result, = results.results
-
-        assert result.identifier == "test-case-2"
-        assert result.inputs == {"query": "Test query 2"}
-        assert isinstance(result.expected_score, dict)
-        assert "unknown_metric" in result.expected_score.keys()
-        assert result.passed is False
--- a/tests/project_test.py
+++ b/tests/project_test.py
@@ -87,7 +87,7 @@ class InternalCrew:

@CrewBase
 class InternalCrewWithMCP(InternalCrew):
-    mcp_server_params = [{"url": "localhost", "port": 8000}]
+    mcp_server_params = {"host": "localhost", "port": 8000}

    @agent
    def reporting_analyst(self):
@@ -97,19 +97,6 @@ class InternalCrewWithMCP(InternalCrew):
    def researcher(self):
        return Agent(config=self.agents_config["researcher"], tools=self.get_mcp_tools("simple_tool"))  # type: ignore[index]

-@CrewBase
-class InternalCrewWithMultipleMCP(InternalCrew):
-    mcp_server_params = {"mcp1": {"url": "localhost", "port": 8000}, "mcp2": {"url": "localhost", "port": 8001}}
-
-    @agent
-    def reporting_analyst(self):
-        return Agent(config=self.agents_config["reporting_analyst"], tools=self.get_mcp_tools(server="mcp1"))  # type: ignore[index]
-
-    @agent
-    def researcher(self):
-        return Agent(config=self.agents_config["researcher"], tools=self.get_mcp_tools("simple_tool", server="mcp2"))  # type: ignore[index]
-
-
 def test_agent_memoization():
    crew = SimpleCrew()
    first_call_result = crew.simple_agent()
@@ -283,21 +270,4 @@ def test_internal_crew_with_mcp():
        assert crew.reporting_analyst().tools == [simple_tool, another_simple_tool]
        assert crew.researcher().tools == [simple_tool]

-    adapter_mock.assert_called_once_with([{"url": "localhost", "port": 8000}])
-
-
-def test_internal_crew_with_multiple_mcp():
-    from crewai_tools import MCPServerAdapter
-    from crewai_tools.adapters.mcp_adapter import ToolCollection
-    from unittest.mock import call
-
-    mock = Mock(spec=MCPServerAdapter)
-    mock.tools = ToolCollection([simple_tool, another_simple_tool])
-    with patch("crewai_tools.MCPServerAdapter", return_value=mock) as adapter_mock:
-        crew = InternalCrewWithMultipleMCP()
-        assert crew.reporting_analyst().tools == [simple_tool, another_simple_tool]
-        assert crew.researcher().tools == [simple_tool]
-        adapter_mock.assert_has_calls([
-            call({"url": "localhost", "port": 8000}),
-            call({"url": "localhost", "port": 8001})
-        ], any_order=True)
+    adapter_mock.assert_called_once_with({"host": "localhost", "port": 8000})
--- a/tests/task_test.py
+++ b/tests/task_test.py
@@ -1133,119 +1133,6 @@ def test_output_file_validation():
        )


-def test_create_directory_true():
-    """Test that directories are created when create_directory=True."""
-    from pathlib import Path
-    
-    output_path = "test_create_dir/output.txt"
-    
-    task = Task(
-        description="Test task",
-        expected_output="Test output",
-        output_file=output_path,
-        create_directory=True,
-    )
-    
-    resolved_path = Path(output_path).expanduser().resolve()
-    resolved_dir = resolved_path.parent
-    
-    if resolved_path.exists():
-        resolved_path.unlink()
-    if resolved_dir.exists():
-        import shutil
-        shutil.rmtree(resolved_dir)
-    
-    assert not resolved_dir.exists()
-    
-    task._save_file("test content")
-    
-    assert resolved_dir.exists()
-    assert resolved_path.exists()
-    
-    if resolved_path.exists():
-        resolved_path.unlink()
-    if resolved_dir.exists():
-        import shutil
-        shutil.rmtree(resolved_dir)
-
-
-def test_create_directory_false():
-    """Test that directories are not created when create_directory=False."""
-    from pathlib import Path
-    
-    output_path = "nonexistent_test_dir/output.txt"
-    
-    task = Task(
-        description="Test task",
-        expected_output="Test output",
-        output_file=output_path,
-        create_directory=False,
-    )
-    
-    resolved_path = Path(output_path).expanduser().resolve()
-    resolved_dir = resolved_path.parent
-    
-    if resolved_dir.exists():
-        import shutil
-        shutil.rmtree(resolved_dir)
-    
-    assert not resolved_dir.exists()
-    
-    with pytest.raises(RuntimeError, match="Directory .* does not exist and create_directory is False"):
-        task._save_file("test content")
-
-
-def test_create_directory_default():
-    """Test that create_directory defaults to True for backward compatibility."""
-    task = Task(
-        description="Test task",
-        expected_output="Test output",
-        output_file="output.txt",
-    )
-    
-    assert task.create_directory is True
-
-
-def test_create_directory_with_existing_directory():
-    """Test that create_directory=False works when directory already exists."""
-    from pathlib import Path
-    
-    output_path = "existing_test_dir/output.txt"
-    
-    resolved_path = Path(output_path).expanduser().resolve()
-    resolved_dir = resolved_path.parent
-    resolved_dir.mkdir(parents=True, exist_ok=True)
-    
-    task = Task(
-        description="Test task",
-        expected_output="Test output",
-        output_file=output_path,
-        create_directory=False,
-    )
-    
-    task._save_file("test content")
-    assert resolved_path.exists()
-    
-    if resolved_path.exists():
-        resolved_path.unlink()
-    if resolved_dir.exists():
-        import shutil
-        shutil.rmtree(resolved_dir)
-
-
-def test_github_issue_3149_reproduction():
-    """Test that reproduces the exact issue from GitHub issue #3149."""
-    task = Task(
-        description="Test task for issue reproduction",
-        expected_output="Test output",
-        output_file="test_output.txt",
-        create_directory=True,
-    )
-    
-    assert task.create_directory is True
-    assert task.output_file == "test_output.txt"
-
-
@pytest.mark.vcr(filter_headers=["authorization"])
 def test_task_execution_times():
    researcher = Agent(
--- a/tests/test_lite_agent.py
+++ b/tests/test_lite_agent.py
@@ -12,8 +12,6 @@ from crewai.tools import BaseTool
 from crewai.utilities.events import crewai_event_bus
 from crewai.utilities.events.agent_events import LiteAgentExecutionStartedEvent
 from crewai.utilities.events.tool_usage_events import ToolUsageStartedEvent
-from crewai.llms.base_llm import BaseLLM
-from unittest.mock import patch


 # A simple test tool
@@ -420,76 +418,3 @@ def test_agent_output_when_guardrail_returns_base_model():
    result = agent.kickoff(messages="Top 10 best players in the world?")

    assert result.pydantic == Player(name="Lionel Messi", country="Argentina")
-
-def test_lite_agent_with_custom_llm_and_guardrails():
-    """Test that CustomLLM (inheriting from BaseLLM) works with guardrails."""
-    class CustomLLM(BaseLLM):
-        def __init__(self, response: str = "Custom response"):
-            super().__init__(model="custom-model")
-            self.response = response
-            self.call_count = 0
-
-        def call(self, messages, tools=None, callbacks=None, available_functions=None, from_task=None, from_agent=None) -> str:
-            self.call_count += 1
-
-            if "valid" in str(messages) and "feedback" in str(messages):
-                return '{"valid": true, "feedback": null}'
-
-            if "Thought:" in str(messages):
-                return f"Thought: I will analyze soccer players\nFinal Answer: {self.response}"
-
-            return self.response
-
-        def supports_function_calling(self) -> bool:
-            return False
-
-        def supports_stop_words(self) -> bool:
-            return False
-
-        def get_context_window_size(self) -> int:
-            return 4096
-
-    custom_llm = CustomLLM(response="Brazilian soccer players are the best!")
-
-    agent = LiteAgent(
-        role="Sports Analyst",
-        goal="Analyze soccer players",
-        backstory="You analyze soccer players and their performance.",
-        llm=custom_llm,
-        guardrail="Only include Brazilian players"
-    )
-
-    result = agent.kickoff("Tell me about the best soccer players")
-
-    assert custom_llm.call_count > 0
-    assert "Brazilian" in result.raw
-
-    custom_llm2 = CustomLLM(response="Original response")
-
-    def test_guardrail(output):
-        return (True, "Modified by guardrail")
-
-    agent2 = LiteAgent(
-        role="Test Agent",
-        goal="Test goal",
-        backstory="Test backstory",
-        llm=custom_llm2,
-        guardrail=test_guardrail
-    )
-
-    result2 = agent2.kickoff("Test message")
-    assert result2.raw == "Modified by guardrail"
-
-
-@pytest.mark.vcr(filter_headers=["authorization"])
-def test_lite_agent_with_invalid_llm():
-    """Test that LiteAgent raises proper error when create_llm returns None."""
-    with patch('crewai.lite_agent.create_llm', return_value=None):
-        with pytest.raises(ValueError) as exc_info:
-            LiteAgent(
-                role="Test Agent",
-                goal="Test goal", 
-                backstory="Test backstory",
-                llm="invalid-model"
-            )
-        assert "Expected LLM instance of type BaseLLM" in str(exc_info.value)
--- a/tests/utilities/test_chromadb_utils.py
+++ b/tests/utilities/test_chromadb_utils.py
@@ -1,27 +1,16 @@
-import multiprocessing
-import tempfile
 import unittest
+from typing import Any, Dict, List, Union

-from chromadb.config import Settings
-from unittest.mock import patch, MagicMock
+import pytest

 from crewai.utilities.chromadb import (
    MAX_COLLECTION_LENGTH,
    MIN_COLLECTION_LENGTH,
    is_ipv4_pattern,
    sanitize_collection_name,
-    create_persistent_client,
 )


-def persistent_client_worker(path, queue):
-    try:
-        create_persistent_client(path=path)
-        queue.put(None)
-    except Exception as e:
-        queue.put(e)
-
-
 class TestChromadbUtils(unittest.TestCase):
    def test_sanitize_collection_name_long_name(self):
        """Test sanitizing a very long collection name."""
@@ -90,34 +79,3 @@ class TestChromadbUtils(unittest.TestCase):
            self.assertLessEqual(len(sanitized), MAX_COLLECTION_LENGTH)
            self.assertTrue(sanitized[0].isalnum())
            self.assertTrue(sanitized[-1].isalnum())
-
-    def test_create_persistent_client_passes_args(self):
-        with patch(
-            "crewai.utilities.chromadb.PersistentClient"
-        ) as mock_persistent_client, tempfile.TemporaryDirectory() as tmpdir:
-            mock_instance = MagicMock()
-            mock_persistent_client.return_value = mock_instance
-
-            settings = Settings(allow_reset=True)
-            client = create_persistent_client(path=tmpdir, settings=settings)
-
-            mock_persistent_client.assert_called_once_with(
-                path=tmpdir, settings=settings
-            )
-            self.assertIs(client, mock_instance)
-
-    def test_create_persistent_client_process_safe(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            queue = multiprocessing.Queue()
-            processes = [
-                multiprocessing.Process(
-                    target=persistent_client_worker, args=(tmpdir, queue)
-                )
-                for _ in range(5)
-            ]
-
-            [p.start() for p in processes]
-            [p.join() for p in processes]
-
-            errors = [queue.get(timeout=5) for _ in processes]
-            self.assertTrue(all(err is None for err in errors))
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
Lucas Gomide	064997464e	fix: allow messages be empty on LLMCallCompletedEvent	2025-07-11 14:05:25 -03:00
Lucas Gomide	6f0ed6642b	style: fix mypy issues	2025-07-11 13:02:34 -03:00
Lucas Gomide	43f339fa84	style: resolve linter issues	2025-07-11 13:02:34 -03:00
Lucas Gomide	5ea221e54e	fix: render all feedback per iteration	2025-07-11 13:02:34 -03:00
Lucas Gomide	d4c15ec25f	test: add Agent eval tests	2025-07-11 13:02:34 -03:00
Lucas Gomide	37cfbe7389	fix: do not evaluate Agent by default This is a experimental feature we still need refine it further	2025-07-11 13:02:34 -03:00
Lucas Gomide	6d7c7d940e	feat: add AgentEvaluator class This class will evaluate Agent' results and report to user	2025-07-11 13:02:34 -03:00
Lucas Gomide	80bd23a8a9	feat: add Reasoning Metrics for Agent evaluation, still in progress	2025-07-11 13:02:34 -03:00
Lucas Gomide	50593d1485	feat: add Tool Metrics for Agent evaluation	2025-07-11 13:02:34 -03:00
Lucas Gomide	60084af745	feat: add SemanticQuality metric for Agent evaluation	2025-07-11 13:02:34 -03:00
Lucas Gomide	be4ade8c45	feat: add GoalAlignment metric for Agent evaluation	2025-07-11 13:02:34 -03:00
Lucas Gomide	6a49a24810	feat: add exchanged messages in LLMCallCompletedEvent	2025-07-11 13:02:34 -03:00