Reset finalize guard on each executor invocation

refactor: enhance planning and execution flow in agents
- Updated the PlannerObserver to accept a kickoff input for standalone task execution, improving flexibility in task handling.
- Refined the step execution process in StepExecutor to support multi-turn action loops, allowing for iterative tool execution and observation.
- Introduced a method to extract relevant task sections from descriptions, ensuring clarity in task requirements.
- Enhanced the AgentExecutor to manage step failures more effectively, triggering replans only when necessary and preserving completed task history.
- Updated translations to reflect changes in planning principles and execution prompts, emphasizing concrete and executable steps.
2026-03-05 11:28:14 +00:00 · 2026-03-03 18:21:27 +00:00 · 2026-03-03 10:17:35 -08:00 · 2026-02-25 13:44:50 -08:00 · 2026-02-24 15:04:02 -08:00 · 2026-02-24 14:19:27 -08:00
184 changed files with 41782 additions and 5459 deletions
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -79,101 +79,6 @@
                  "en/quickstart"
                ]
              },
-              {
-                "group": "AI Docs",
-                "pages": [
-                  "en/ai/overview",
-                  {
-                    "group": "Flows",
-                    "icon": "arrow-progress",
-                    "pages": [
-                      "en/ai/flows/index",
-                      "en/ai/flows/reference",
-                      "en/ai/flows/patterns",
-                      "en/ai/flows/troubleshooting",
-                      "en/ai/flows/examples"
-                    ]
-                  },
-                  {
-                    "group": "Agents",
-                    "icon": "user",
-                    "pages": [
-                      "en/ai/agents/index",
-                      "en/ai/agents/reference",
-                      "en/ai/agents/patterns",
-                      "en/ai/agents/troubleshooting",
-                      "en/ai/agents/examples"
-                    ]
-                  },
-                  {
-                    "group": "Crews",
-                    "icon": "users",
-                    "pages": [
-                      "en/ai/crews/index",
-                      "en/ai/crews/reference",
-                      "en/ai/crews/patterns",
-                      "en/ai/crews/troubleshooting",
-                      "en/ai/crews/examples"
-                    ]
-                  },
-                  {
-                    "group": "LLMs",
-                    "icon": "microchip-ai",
-                    "pages": [
-                      "en/ai/llms/index",
-                      "en/ai/llms/reference",
-                      "en/ai/llms/patterns",
-                      "en/ai/llms/troubleshooting",
-                      "en/ai/llms/examples"
-                    ]
-                  },
-                  {
-                    "group": "Memory",
-                    "icon": "database",
-                    "pages": [
-                      "en/ai/memory/index",
-                      "en/ai/memory/reference",
-                      "en/ai/memory/patterns",
-                      "en/ai/memory/troubleshooting",
-                      "en/ai/memory/examples"
-                    ]
-                  },
-                  {
-                    "group": "Tools",
-                    "icon": "wrench",
-                    "pages": [
-                      "en/ai/tools/index",
-                      "en/ai/tools/reference",
-                      "en/ai/tools/patterns",
-                      "en/ai/tools/troubleshooting",
-                      "en/ai/tools/examples"
-                    ]
-                  }
-                ]
-              },
-              {
-                "group": "Core Concepts",
-                "pages": [
-                  "en/concepts/agents",
-                  "en/concepts/tasks",
-                  "en/concepts/crews",
-                  "en/concepts/flows",
-                  "en/concepts/production-architecture",
-                  "en/concepts/knowledge",
-                  "en/concepts/llms",
-                  "en/concepts/files",
-                  "en/concepts/processes",
-                  "en/concepts/collaboration",
-                  "en/concepts/training",
-                  "en/concepts/memory",
-                  "en/concepts/reasoning",
-                  "en/concepts/planning",
-                  "en/concepts/testing",
-                  "en/concepts/cli",
-                  "en/concepts/tools",
-                  "en/concepts/event-listener"
-                ]
-              },
              {
                "group": "Guides",
                "pages": [
@@ -223,6 +128,29 @@
                  }
                ]
              },
+              {
+                "group": "Core Concepts",
+                "pages": [
+                  "en/concepts/agents",
+                  "en/concepts/tasks",
+                  "en/concepts/crews",
+                  "en/concepts/flows",
+                  "en/concepts/production-architecture",
+                  "en/concepts/knowledge",
+                  "en/concepts/llms",
+                  "en/concepts/files",
+                  "en/concepts/processes",
+                  "en/concepts/collaboration",
+                  "en/concepts/training",
+                  "en/concepts/memory",
+                  "en/concepts/reasoning",
+                  "en/concepts/planning",
+                  "en/concepts/testing",
+                  "en/concepts/cli",
+                  "en/concepts/tools",
+                  "en/concepts/event-listener"
+                ]
+              },
              {
                "group": "MCP Integration",
                "pages": [
@@ -403,7 +331,6 @@
                  "en/learn/human-input-on-execution",
                  "en/learn/human-in-the-loop",
                  "en/learn/human-feedback-in-flows",
-                  "en/learn/flowstate-chat-history",
                  "en/learn/kickoff-async",
                  "en/learn/kickoff-for-each",
                  "en/learn/llm-connections",
@@ -556,6 +483,7 @@
              {
                "group": "Examples",
                "pages": [
+                  "en/examples/example",
                  "en/examples/cookbooks"
                ]
              }
@@ -1554,18 +1482,6 @@
      "source": "/api-reference",
      "destination": "/en/api-reference/introduction"
    },
-    {
-      "source": "/",
-      "destination": "/en/introduction"
-    },
-    {
-      "source": "/en",
-      "destination": "/en/introduction"
-    },
-    {
-      "source": "/en/examples/example",
-      "destination": "/en/examples/cookbooks"
-    },
    {
      "source": "/introduction",
      "destination": "/en/introduction"
@@ -1662,4 +1578,4 @@
      "reddit": "https://www.reddit.com/r/crewAIInc/"
    }
  }
-}
+}
--- a/docs/en/ai/agents/examples.mdx
+++ b/docs/en/ai/agents/examples.mdx
@@ -1,12 +0,0 @@
---
-title: "Agents: Examples"
-description: "Runnable examples for robust agent configuration and execution."
-icon: "rocket-launch"
-mode: "wide"
---
-
-## Example links
-
- [/en/guides/agents/crafting-effective-agents](/en/guides/agents/crafting-effective-agents)
- [/en/learn/customizing-agents](/en/learn/customizing-agents)
- [/en/learn/coding-agents](/en/learn/coding-agents)
--- a/docs/en/ai/agents/index.mdx
+++ b/docs/en/ai/agents/index.mdx
@@ -1,32 +0,0 @@
---
-title: "Agents: Concepts"
-description: "Agent role contracts, task boundaries, and decision criteria for robust agent behavior."
-icon: "user"
-mode: "wide"
---
-
-## When to use
-
- You need specialized behavior with explicit role and goal.
- You need tool-enabled execution under constraints.
-
-## When not to use
-
- Static transformations are enough without model reasoning.
- Task can be solved by deterministic code only.
-
-## Core decisions
-
-| Decision | Choose this when |
-|---|---|
-| Single agent | Narrow scope, low coordination needs |
-| Multi-agent crew | Distinct expertise and review loops needed |
-| Tool-enabled agent | Model needs external actions or data |
-
-## Canonical links
-
- Reference: [/en/ai/agents/reference](/en/ai/agents/reference)
- Patterns: [/en/ai/agents/patterns](/en/ai/agents/patterns)
- Troubleshooting: [/en/ai/agents/troubleshooting](/en/ai/agents/troubleshooting)
- Examples: [/en/ai/agents/examples](/en/ai/agents/examples)
- Existing docs: [/en/concepts/agents](/en/concepts/agents)
--- a/docs/en/ai/agents/patterns.mdx
+++ b/docs/en/ai/agents/patterns.mdx
@@ -1,17 +0,0 @@
---
-title: "Agents: Patterns"
-description: "Practical agent patterns for role design, tool boundaries, and reliable outputs."
-icon: "diagram-project"
-mode: "wide"
---
-
-## Patterns
-
-1. Role + reviewer pair
- One agent drafts, one agent validates.
-
-2. Tool-bounded agent
- Restrict tool list to minimal action set.
-
-3. Structured output agent
- Force JSON or schema output for automation pipelines.
--- a/docs/en/ai/agents/reference.mdx
+++ b/docs/en/ai/agents/reference.mdx
@@ -1,22 +0,0 @@
---
-title: "Agents: Reference"
-description: "Reference for agent fields, prompt contracts, tool usage, and output constraints."
-icon: "book"
-mode: "wide"
---
-
-## Agent contract
-
- `role`: stable operating identity
- `goal`: measurable completion objective
- `backstory`: bounded style and context
- `tools`: allowed action surface
-
-## Output contract
-
- Prefer structured outputs for machine workflows.
- Define failure behavior for missing tool data.
-
-## Canonical source
-
-Primary API details live in [/en/concepts/agents](/en/concepts/agents).
--- a/docs/en/ai/agents/troubleshooting.mdx
+++ b/docs/en/ai/agents/troubleshooting.mdx
@@ -1,12 +0,0 @@
---
-title: "Agents: Troubleshooting"
-description: "Diagnose and fix common agent reliability and instruction-following failures."
-icon: "circle-exclamation"
-mode: "wide"
---
-
-## Common issues
-
- Hallucinated tool results: require tool-call evidence in output.
- Prompt drift: tighten role and success criteria.
- Verbose but low-signal output: enforce concise schema output.
--- a/docs/en/ai/crews/examples.mdx
+++ b/docs/en/ai/crews/examples.mdx
@@ -1,12 +0,0 @@
---
-title: "Crews: Examples"
-description: "Runnable crew examples for sequential and hierarchical execution."
-icon: "rocket-launch"
-mode: "wide"
---
-
-## Example links
-
- [/en/guides/crews/first-crew](/en/guides/crews/first-crew)
- [/en/learn/sequential-process](/en/learn/sequential-process)
- [/en/learn/hierarchical-process](/en/learn/hierarchical-process)
--- a/docs/en/ai/crews/index.mdx
+++ b/docs/en/ai/crews/index.mdx
@@ -1,26 +0,0 @@
---
-title: "Crews: Concepts"
-description: "When to use crews, process selection, delegation boundaries, and collaboration strategy."
-icon: "users"
-mode: "wide"
---
-
-## When to use
-
- You need multiple agents with specialized roles.
- You need staged execution and reviewer loops.
-
-## Process decision table
-
-| Process | Best for |
-|---|---|
-| Sequential | Linear pipelines and deterministic ordering |
-| Hierarchical | Manager-controlled planning and delegation |
-
-## Canonical links
-
- Reference: [/en/ai/crews/reference](/en/ai/crews/reference)
- Patterns: [/en/ai/crews/patterns](/en/ai/crews/patterns)
- Troubleshooting: [/en/ai/crews/troubleshooting](/en/ai/crews/troubleshooting)
- Examples: [/en/ai/crews/examples](/en/ai/crews/examples)
- Existing docs: [/en/concepts/crews](/en/concepts/crews)
--- a/docs/en/ai/crews/patterns.mdx
+++ b/docs/en/ai/crews/patterns.mdx
@@ -1,12 +0,0 @@
---
-title: "Crews: Patterns"
-description: "Production crew patterns for decomposition, review loops, and hybrid orchestration with Flows."
-icon: "diagram-project"
-mode: "wide"
---
-
-## Patterns
-
-1. Researcher + writer + reviewer
-2. Manager-directed hierarchical crew
-3. Flow-orchestrated multi-crew pipeline
--- a/docs/en/ai/crews/reference.mdx
+++ b/docs/en/ai/crews/reference.mdx
@@ -1,21 +0,0 @@
---
-title: "Crews: Reference"
-description: "Reference for crew composition, process semantics, task context passing, and execution modes."
-icon: "book"
-mode: "wide"
---
-
-## Crew contract
-
- `agents`: available executors
- `tasks`: work units with expected output
- `process`: ordering and delegation semantics
-
-## Runtime
-
- `kickoff()` for synchronous runs
- `kickoff_async()` for async execution
-
-## Canonical source
-
-Primary API details live in [/en/concepts/crews](/en/concepts/crews).
--- a/docs/en/ai/crews/troubleshooting.mdx
+++ b/docs/en/ai/crews/troubleshooting.mdx
@@ -1,12 +0,0 @@
---
-title: "Crews: Troubleshooting"
-description: "Common multi-agent coordination failures and practical fixes."
-icon: "circle-exclamation"
-mode: "wide"
---
-
-## Common issues
-
- Agents overlap on responsibilities: tighten role boundaries.
- Output inconsistency: standardize expected outputs per task.
- Slow runs: reduce unnecessary handoffs and model size.
--- a/docs/en/ai/flows/examples.mdx
+++ b/docs/en/ai/flows/examples.mdx
@@ -1,17 +0,0 @@
---
-title: "Flows: Examples"
-description: "Runnable end-to-end examples for production flow orchestration."
-icon: "rocket-launch"
-mode: "wide"
---
-
-## Canonical examples
-
-<CardGroup cols={2}>
-  <Card title="Flowstate Chat History" icon="comments" href="/en/learn/flowstate-chat-history">
-    Persistent chat history with summary compaction and memory scope.
-  </Card>
-  <Card title="Flows Concepts Example" icon="arrow-progress" href="/en/concepts/flows">
-    Full API and feature-oriented flow examples, including routers and persistence.
-  </Card>
-</CardGroup>
--- a/docs/en/ai/flows/index.mdx
+++ b/docs/en/ai/flows/index.mdx
@@ -1,39 +0,0 @@
---
-title: "Flows: Concepts"
-description: "When to use Flows, when not to use them, and key design constraints for production orchestration."
-icon: "arrow-progress"
-mode: "wide"
---
-
-## When to use
-
- You need deterministic orchestration, branching, and resumable execution.
- You need explicit state transitions across steps.
- You need persistence, routing, and event-driven control.
-
-## When not to use
-
- A single prompt/response interaction is enough.
- You only need one agent call without orchestration logic.
-
-## Core decisions
-
-| Decision | Choose this when |
-|---|---|
-| Unstructured state | Fast prototyping, highly dynamic fields |
-| Structured state | Stable contracts, team development, type safety |
-| `@persist()` | Long-running workflows and recovery requirements |
-| Router labels | Deterministic branch handling |
-
-## Canonical links
-
- Reference: [/en/ai/flows/reference](/en/ai/flows/reference)
- Patterns: [/en/ai/flows/patterns](/en/ai/flows/patterns)
- Troubleshooting: [/en/ai/flows/troubleshooting](/en/ai/flows/troubleshooting)
- Examples: [/en/ai/flows/examples](/en/ai/flows/examples)
-
-## Existing docs
-
- [/en/concepts/flows](/en/concepts/flows)
- [/en/guides/flows/mastering-flow-state](/en/guides/flows/mastering-flow-state)
- [/en/learn/flowstate-chat-history](/en/learn/flowstate-chat-history)
--- a/docs/en/ai/flows/patterns.mdx
+++ b/docs/en/ai/flows/patterns.mdx
@@ -1,29 +0,0 @@
---
-title: "Flows: Patterns"
-description: "Production flow patterns: triage routing, flowstate chat history, and human-in-the-loop checkpoints."
-icon: "diagram-project"
-mode: "wide"
---
-
-## Recommended patterns
-
-1. Triage router flow
- Inputs: normalized request payload
- Output: deterministic route label + action
- Reference: [/en/concepts/flows](/en/concepts/flows)
-
-2. Flowstate chat history
- Inputs: `session_id`, `last_user_message`
- Output: assistant reply + compact context state
- Reference: [/en/learn/flowstate-chat-history](/en/learn/flowstate-chat-history)
-
-3. Human feedback gates
- Inputs: generated artifact + reviewer feedback
- Output: approved/rejected/revision path
- Reference: [/en/learn/human-feedback-in-flows](/en/learn/human-feedback-in-flows)
-
-## Pattern requirements
-
- declare explicit input schema
- define expected output shape
- list failure modes and retries
--- a/docs/en/ai/flows/reference.mdx
+++ b/docs/en/ai/flows/reference.mdx
@@ -1,34 +0,0 @@
---
-title: "Flows: Reference"
-description: "API-oriented reference for Flow decorators, lifecycle semantics, state, routing, and persistence."
-icon: "book"
-mode: "wide"
---
-
-## Decorators
-
- `@start()` entrypoint, optional conditional trigger
- `@listen(...)` downstream method subscription
- `@router(...)` label-based deterministic routing
- `@persist()` automatic state persistence checkpoints
-
-## Runtime contracts
-
- `kickoff(inputs=...)` initializes or updates run inputs.
- final output is the value from the last completed method.
- `self.state` always has an auto-generated `id`.
-
-## State contracts
-
- Use typed state for durable workflows.
- Keep control fields explicit (`route`, `status`, `retry_count`).
- Avoid storing unbounded raw transcripts in state.
-
-## Resume and recovery
-
- Use persistence for recoverable runs.
- Keep idempotent step logic for safe retries.
-
-## Canonical source
-
-Primary API details live in [/en/concepts/flows](/en/concepts/flows).
--- a/docs/en/ai/flows/troubleshooting.mdx
+++ b/docs/en/ai/flows/troubleshooting.mdx
@@ -1,28 +0,0 @@
---
-title: "Flows: Troubleshooting"
-description: "Common flow failures, causes, and fixes for state, routing, persistence, and resumption."
-icon: "circle-exclamation"
-mode: "wide"
---
-
-## Common issues
-
-### Branch did not trigger
-
- Cause: router label mismatch.
- Fix: align returned label with `@listen("label")` exactly.
-
-### State fields missing
-
- Cause: untyped dynamic writes or missing inputs.
- Fix: switch to typed state and validate required fields at `@start()`.
-
-### Context window blow-up
-
- Cause: raw message accumulation.
- Fix: use sliding window + summary compaction pattern.
-
-### Resume behavior inconsistent
-
- Cause: non-idempotent side effects in retried steps.
- Fix: make side-effecting calls idempotent and record execution markers in state.
--- a/docs/en/ai/llms/examples.mdx
+++ b/docs/en/ai/llms/examples.mdx
@@ -1,12 +0,0 @@
---
-title: "LLMs: Examples"
-description: "Concrete examples for model setup, routing, and output-control patterns."
-icon: "rocket-launch"
-mode: "wide"
---
-
-## Example links
-
- [/en/concepts/llms](/en/concepts/llms)
- [/en/learn/llm-connections](/en/learn/llm-connections)
- [/en/learn/custom-llm](/en/learn/custom-llm)
--- a/docs/en/ai/llms/index.mdx
+++ b/docs/en/ai/llms/index.mdx
@@ -1,27 +0,0 @@
---
-title: "LLMs: Concepts"
-description: "Model selection strategy, cost-quality tradeoffs, and reliability posture for CrewAI systems."
-icon: "microchip-ai"
-mode: "wide"
---
-
-## When to use advanced LLM configuration
-
- You need predictable quality, latency, and cost control.
- You need model routing by task type.
-
-## Core decisions
-
-| Decision | Choose this when |
-|---|---|
-| Single model | Small systems with uniform task profile |
-| Routed models | Mixed workloads with different quality/cost needs |
-| Structured output | Automation pipelines and strict parsing needs |
-
-## Canonical links
-
- Reference: [/en/ai/llms/reference](/en/ai/llms/reference)
- Patterns: [/en/ai/llms/patterns](/en/ai/llms/patterns)
- Troubleshooting: [/en/ai/llms/troubleshooting](/en/ai/llms/troubleshooting)
- Examples: [/en/ai/llms/examples](/en/ai/llms/examples)
- Existing docs: [/en/concepts/llms](/en/concepts/llms)
--- a/docs/en/ai/llms/patterns.mdx
+++ b/docs/en/ai/llms/patterns.mdx
@@ -1,17 +0,0 @@
---
-title: "LLMs: Patterns"
-description: "Model routing, reliability defaults, and structured outputs for production AI workflows."
-icon: "diagram-project"
-mode: "wide"
---
-
-## Patterns
-
-1. Role-based model routing
-2. Reliability defaults (`timeout`, `max_retries`, low temperature)
-3. JSON-first outputs for machine consumption
-4. Responses API for multi-turn reasoning flows
-
-## Reference
-
- [/en/concepts/llms#production-llm-patterns](/en/concepts/llms#production-llm-patterns)
--- a/docs/en/ai/llms/reference.mdx
+++ b/docs/en/ai/llms/reference.mdx
@@ -1,25 +0,0 @@
---
-title: "LLMs: Reference"
-description: "Provider-agnostic LLM configuration reference for CrewAI projects."
-icon: "book"
-mode: "wide"
---
-
-## Common parameters
-
- `model`
- `temperature`
- `max_tokens`
- `timeout`
- `max_retries`
- `response_format`
-
-## Contract guidance
-
- Set low temperature for extraction/classification.
- Use structured outputs for downstream automation.
- Set explicit timeout and retry policy for production.
-
-## Canonical source
-
-Primary API details live in [/en/concepts/llms](/en/concepts/llms).
--- a/docs/en/ai/llms/troubleshooting.mdx
+++ b/docs/en/ai/llms/troubleshooting.mdx
@@ -1,12 +0,0 @@
---
-title: "LLMs: Troubleshooting"
-description: "Fix common model behavior failures: drift, latency spikes, malformed output, and cost overruns."
-icon: "circle-exclamation"
-mode: "wide"
---
-
-## Common issues
-
- Malformed JSON: enforce `response_format` and validate at boundary.
- Latency spikes: route heavy tasks to smaller models when acceptable.
- Cost growth: add budget-aware model routing and truncation rules.
--- a/docs/en/ai/memory/examples.mdx
+++ b/docs/en/ai/memory/examples.mdx
@@ -1,11 +0,0 @@
---
-title: "Memory: Examples"
-description: "Runnable examples for scoped storage and semantic retrieval in CrewAI."
-icon: "rocket-launch"
-mode: "wide"
---
-
-## Example links
-
- [/en/concepts/memory](/en/concepts/memory)
- [/en/learn/flowstate-chat-history](/en/learn/flowstate-chat-history)
--- a/docs/en/ai/memory/index.mdx
+++ b/docs/en/ai/memory/index.mdx
@@ -1,24 +0,0 @@
---
-title: "Memory: Concepts"
-description: "Designing recall systems with scope boundaries and state-vs-memory separation."
-icon: "database"
-mode: "wide"
---
-
-## When to use memory
-
- You need semantic recall across runs.
- You need long-term context outside immediate flow state.
-
-## When to use state instead
-
- Data is only needed for current control flow.
- Data must remain deterministic and explicit per step.
-
-## Canonical links
-
- Reference: [/en/ai/memory/reference](/en/ai/memory/reference)
- Patterns: [/en/ai/memory/patterns](/en/ai/memory/patterns)
- Troubleshooting: [/en/ai/memory/troubleshooting](/en/ai/memory/troubleshooting)
- Examples: [/en/ai/memory/examples](/en/ai/memory/examples)
- Existing docs: [/en/concepts/memory](/en/concepts/memory)
--- a/docs/en/ai/memory/patterns.mdx
+++ b/docs/en/ai/memory/patterns.mdx
@@ -1,17 +0,0 @@
---
-title: "Memory: Patterns"
-description: "Practical memory patterns for session recall, scoped retrieval, and hybrid flow-state designs."
-icon: "diagram-project"
-mode: "wide"
---
-
-## Patterns
-
-1. Session-scoped recall (`/chat/{session_id}`)
-2. Project-scoped knowledge (`/project/{project_id}`)
-3. Hybrid pattern: flow state for control, memory for long-tail context
-
-## Reference
-
- [/en/learn/flowstate-chat-history](/en/learn/flowstate-chat-history)
- [/en/guides/flows/mastering-flow-state](/en/guides/flows/mastering-flow-state)
--- a/docs/en/ai/memory/reference.mdx
+++ b/docs/en/ai/memory/reference.mdx
@@ -1,23 +0,0 @@
---
-title: "Memory: Reference"
-description: "Reference for remember/recall contracts, scopes, and retrieval tuning."
-icon: "book"
-mode: "wide"
---
-
-## API surface
-
- `remember(content, scope=...)`
- `recall(query, limit=...)`
- `extract_memories(text)`
- `scope(path)` and `subscope(name)`
-
-## Scope rules
-
- use `/{entity_type}/{identifier}` paths
- keep hierarchy shallow
- isolate sessions by stable identifiers
-
-## Canonical source
-
-Primary API details live in [/en/concepts/memory](/en/concepts/memory).
--- a/docs/en/ai/memory/troubleshooting.mdx
+++ b/docs/en/ai/memory/troubleshooting.mdx
@@ -1,12 +0,0 @@
---
-title: "Memory: Troubleshooting"
-description: "Diagnose poor recall quality, scope leakage, and stale memory retrieval."
-icon: "circle-exclamation"
-mode: "wide"
---
-
-## Common issues
-
- Irrelevant recall: tighten scopes and query wording.
- Missing recall: check scope path and recency weighting.
- Scope leakage: avoid shared broad scopes for unrelated workflows.
--- a/docs/en/ai/overview.mdx
+++ b/docs/en/ai/overview.mdx
@@ -1,54 +0,0 @@
---
-title: "AI-First Documentation"
-description: "Canonical, agent-optimized documentation map for Flows, Agents, Crews, LLMs, Memory, and Tools."
-icon: "sitemap"
-mode: "wide"
---
-
-## Purpose
-
-This section is the canonical map for AI agents and developers.
-
-Use it when you need:
- one source of truth per domain
- predictable page structure
- runnable patterns with explicit inputs and outputs
-
-## Domain Packs
-
-<CardGroup cols={3}>
-  <Card title="Flows" icon="arrow-progress" href="/en/ai/flows/index">
-    State, routing, persistence, resume, and orchestration lifecycle.
-  </Card>
-  <Card title="Agents" icon="user" href="/en/ai/agents/index">
-    Agent contracts, tool boundaries, prompt roles, and output discipline.
-  </Card>
-  <Card title="Crews" icon="users" href="/en/ai/crews/index">
-    Multi-agent execution, process choice, delegation, and coordination.
-  </Card>
-  <Card title="LLMs" icon="microchip-ai" href="/en/ai/llms/index">
-    Model configuration contracts, routing, reliability defaults, and providers.
-  </Card>
-  <Card title="Memory" icon="database" href="/en/ai/memory/index">
-    Retrieval semantics, scope design, and state-vs-memory architecture.
-  </Card>
-  <Card title="Tools" icon="wrench" href="/en/ai/tools/index">
-    Tool safety, schema contracts, retries, and integration patterns.
-  </Card>
-</CardGroup>
-
-## Writing Contract
-
-Every domain follows the same structure:
-1. Concepts (`index`)
-2. Reference (`reference`)
-3. Patterns (`patterns`)
-4. Troubleshooting (`troubleshooting`)
-5. Examples (`examples`)
-
-## Deprecation Policy
-
-When a page is replaced:
- keep a redirect for the old URL
- keep one canonical destination
- avoid duplicated conceptual prose
--- a/docs/en/ai/tools/examples.mdx
+++ b/docs/en/ai/tools/examples.mdx
@@ -1,12 +0,0 @@
---
-title: "Tools: Examples"
-description: "Practical examples for tool-driven agents and crews."
-icon: "rocket-launch"
-mode: "wide"
---
-
-## Example links
-
- [/en/tools/overview](/en/tools/overview)
- [/en/learn/create-custom-tools](/en/learn/create-custom-tools)
- [/en/learn/tool-hooks](/en/learn/tool-hooks)
--- a/docs/en/ai/tools/index.mdx
+++ b/docs/en/ai/tools/index.mdx
@@ -1,25 +0,0 @@
---
-title: "Tools: Concepts"
-description: "Tool selection strategy, safety boundaries, and reliability rules for agentic execution."
-icon: "wrench"
-mode: "wide"
---
-
-## When to use tools
-
- Agents need external data or side effects.
- Deterministic systems must be integrated into agent workflows.
-
-## Tool safety rules
-
- define clear input schemas
- validate outputs before downstream use
- isolate privileged tools behind policy checks
-
-## Canonical links
-
- Reference: [/en/ai/tools/reference](/en/ai/tools/reference)
- Patterns: [/en/ai/tools/patterns](/en/ai/tools/patterns)
- Troubleshooting: [/en/ai/tools/troubleshooting](/en/ai/tools/troubleshooting)
- Examples: [/en/ai/tools/examples](/en/ai/tools/examples)
- Existing docs: [/en/concepts/tools](/en/concepts/tools)
--- a/docs/en/ai/tools/patterns.mdx
+++ b/docs/en/ai/tools/patterns.mdx
@@ -1,12 +0,0 @@
---
-title: "Tools: Patterns"
-description: "Tool execution patterns for retrieval, action safety, and response grounding."
-icon: "diagram-project"
-mode: "wide"
---
-
-## Patterns
-
-1. Read-first then write pattern
-2. Validation gate before side effects
-3. Fallback tool chains for degraded mode
--- a/docs/en/ai/tools/reference.mdx
+++ b/docs/en/ai/tools/reference.mdx
@@ -1,22 +0,0 @@
---
-title: "Tools: Reference"
-description: "Reference for tool invocation contracts, argument schemas, and runtime safeguards."
-icon: "book"
-mode: "wide"
---
-
-## Tool contract
-
- deterministic input schema
- stable output schema
- explicit error behavior
-
-## Runtime safeguards
-
- timeout and retry policy
- idempotency for side effects
- validation before commit
-
-## Canonical source
-
-Primary API details live in [/en/concepts/tools](/en/concepts/tools).
--- a/docs/en/ai/tools/troubleshooting.mdx
+++ b/docs/en/ai/tools/troubleshooting.mdx
@@ -1,12 +0,0 @@
---
-title: "Tools: Troubleshooting"
-description: "Common tool-call failures and fixes for schema mismatch, retries, and side effects."
-icon: "circle-exclamation"
-mode: "wide"
---
-
-## Common issues
-
- Schema mismatch: align tool args with declared model output schema.
- Repeated side effects: add idempotency keys.
- Tool timeouts: define retries with bounded backoff.
--- a/docs/en/concepts/agents.mdx
+++ b/docs/en/concepts/agents.mdx
@@ -23,17 +23,6 @@ In the CrewAI framework, an `Agent` is an autonomous unit that can:
  at creating content.
 </Tip>

-## When to Use Agents
-
- You need role-specific reasoning and decision-making.
- You need tool-enabled execution with delegated responsibilities.
- You need reusable behavioral units across tasks and crews.
-
-## When Not to Use Agents
-
- Deterministic business logic in plain code is sufficient.
- A static transformation without reasoning is sufficient.
-
 <Note type="info" title="Enterprise Enhancement: Visual Agent Builder">
 CrewAI AMP includes a Visual Agent Builder that simplifies agent creation and configuration without writing code. Design your agents visually and test them in real-time.

--- a/docs/en/concepts/crews.mdx
+++ b/docs/en/concepts/crews.mdx
@@ -9,17 +9,6 @@ mode: "wide"

 A crew in crewAI represents a collaborative group of agents working together to achieve a set of tasks. Each crew defines the strategy for task execution, agent collaboration, and the overall workflow.

-## When to Use Crews
-
- You need multiple specialized agents collaborating on a shared outcome.
- You need process-level orchestration (`sequential` or `hierarchical`).
- You need task-level handoffs and context propagation.
-
-## When Not to Use Crews
-
- A single agent can complete the work end-to-end.
- You do not need multi-step task decomposition.
-
 ## Crew Attributes

 | Attribute                             | Parameters             | Description                                                                                                                                                                                                                                               |
@@ -428,17 +417,3 @@ crewai replay -t <task_id>
 ```

 These commands let you replay from your latest kickoff tasks, still retaining context from previously executed tasks.
-
-## Common Failure Modes
-
-### Agents overlap responsibilities
- Cause: role/goal definitions are too broad.
- Fix: tighten role boundaries and task ownership.
-
-### Hierarchical runs stall or degrade
- Cause: weak manager configuration or unclear delegation criteria.
- Fix: define a stronger manager objective and explicit completion criteria.
-
-### Crew outputs are inconsistent
- Cause: expected outputs are underspecified across tasks.
- Fix: enforce structured outputs and stronger task contracts.
--- a/docs/en/concepts/flows.mdx
+++ b/docs/en/concepts/flows.mdx
@@ -19,121 +19,82 @@ Flows allow you to create structured, event-driven workflows. They provide a sea

 4. **Flexible Control Flow**: Implement conditional logic, loops, and branching within your workflows.

-## When to Use Flows
-
- You need deterministic orchestration and branching logic.
- You need explicit state transitions across multiple steps.
- You need resumable workflows with persistence.
- You need to combine crews, direct model calls, and Python logic in one runtime.
-
-## When Not to Use Flows
-
- A single prompt/response call is sufficient.
- A single crew kickoff with no orchestration logic is sufficient.
- You do not need stateful multi-step execution.
-
 ## Getting Started

-The example below shows a realistic Flow for support-ticket triage. It demonstrates features teams use in production: typed state, routing, memory access, and persistence.
+Let's create a simple Flow where you will use OpenAI to generate a random city in one task and then use that city to generate a fun fact in another task.

 ```python Code
-from crewai.flow.flow import Flow, listen, router, start
-from crewai.flow.persistence import persist
-from pydantic import BaseModel, Field
+
+from crewai.flow.flow import Flow, listen, start
+from dotenv import load_dotenv
+from litellm import completion


-class SupportTriageState(BaseModel):
-    ticket_id: str = ""
-    customer_tier: str = "standard"  # standard | enterprise
-    issue: str = ""
-    urgency: str = "normal"
-    route: str = ""
-    draft_reply: str = ""
-    internal_notes: list[str] = Field(default_factory=list)
+class ExampleFlow(Flow):
+    model = "gpt-4o-mini"

-
-@persist()
-class SupportTriageFlow(Flow[SupportTriageState]):
    @start()
-    def ingest_ticket(self):
-        # kickoff(inputs={...}) is merged into typed state fields
-        print(f"Flow State ID: {self.state.id}")
+    def generate_city(self):
+        print("Starting flow")
+        # Each flow state automatically gets a unique ID
+        print(f"Flow State ID: {self.state['id']}")

-        self.remember(
-            f"Ticket {self.state.ticket_id}: {self.state.issue}",
-            scope=f"/support/{self.state.ticket_id}",
+        response = completion(
+            model=self.model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": "Return the name of a random city in the world.",
+                },
+            ],
        )

-        issue = self.state.issue.lower()
-        if "security" in issue or "breach" in issue:
-            self.state.urgency = "critical"
-        elif self.state.customer_tier == "enterprise":
-            self.state.urgency = "high"
-        else:
-            self.state.urgency = "normal"
+        random_city = response["choices"][0]["message"]["content"]
+        # Store the city in our state
+        self.state["city"] = random_city
+        print(f"Random City: {random_city}")

-        return self.state.issue
+        return random_city

-    @router(ingest_ticket)
-    def route_ticket(self):
-        issue = self.state.issue.lower()
-        if "security" in issue or "breach" in issue:
-            self.state.route = "security"
-            return "security_review"
-        if self.state.customer_tier == "enterprise" or self.state.urgency == "high":
-            self.state.route = "priority"
-            return "priority_queue"
-        self.state.route = "standard"
-        return "standard_queue"
-
-    @listen("security_review")
-    def handle_security(self):
-        self.state.internal_notes.append("Escalated to Security Incident Response")
-        self.state.draft_reply = (
-            "We have escalated your case to our security team and will update you shortly."
+    @listen(generate_city)
+    def generate_fun_fact(self, random_city):
+        response = completion(
+            model=self.model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": f"Tell me a fun fact about {random_city}",
+                },
+            ],
        )
-        return self.state.draft_reply

-    @listen("priority_queue")
-    def handle_priority(self):
-        history = self.recall("SLA commitments for enterprise support", limit=2)
-        self.state.internal_notes.append(
-            f"Loaded {len(history)} memory hits for priority handling"
-        )
-        self.state.draft_reply = (
-            "Your ticket has been prioritized and assigned to a senior support engineer."
-        )
-        return self.state.draft_reply
-
-    @listen("standard_queue")
-    def handle_standard(self):
-        self.state.internal_notes.append("Routed to standard support queue")
-        self.state.draft_reply = "Thanks for reporting this. Our team will follow up soon."
-        return self.state.draft_reply
+        fun_fact = response["choices"][0]["message"]["content"]
+        # Store the fun fact in our state
+        self.state["fun_fact"] = fun_fact
+        return fun_fact


-flow = SupportTriageFlow()
-flow.plot("support_triage_flow")
-result = flow.kickoff(
-    inputs={
-        "ticket_id": "TCK-1024",
-        "customer_tier": "enterprise",
-        "issue": "Cannot access SSO after enabling new policy",
-    }
-)
-print("Final reply:", result)
-print("Route:", flow.state.route)
-print("Notes:", flow.state.internal_notes)
+
+flow = ExampleFlow()
+flow.plot()
+result = flow.kickoff()
+
+print(f"Generated fun fact: {result}")
 ```
 ![Flow Visual image](/images/crewai-flow-1.png)
-In this example, one flow demonstrates several core features together:
-1. `@start()` initializes and normalizes state for downstream steps.
-2. `@router()` performs deterministic branching into labeled routes.
-3. Route listeners implement lane-specific behavior (`security`, `priority`, `standard`).
-4. `@persist()` keeps the flow state recoverable between runs.
-5. Built-in memory methods (`remember`, `recall`) add durable context beyond a single method call.
+In the above example, we have created a simple Flow that generates a random city using OpenAI and then generates a fun fact about that city. The Flow consists of two tasks: `generate_city` and `generate_fun_fact`. The `generate_city` task is the starting point of the Flow, and the `generate_fun_fact` task listens for the output of the `generate_city` task.

-This pattern mirrors typical production workflows where request classification, policy-aware routing, and auditable state all happen in one orchestrated flow.
+Each Flow instance automatically receives a unique identifier (UUID) in its state, which helps track and manage flow executions. The state can also store additional data (like the generated city and fun fact) that persists throughout the flow's execution.
+
+When you run the Flow, it will:
+1. Generate a unique ID for the flow state
+2. Generate a random city and store it in the state
+3. Generate a fun fact about that city and store it in the state
+4. Print the results to the console
+
+The state's unique ID and stored data can be useful for tracking flow executions and maintaining context between tasks.
+
+**Note:** Ensure you have set up your `.env` file to store your `OPENAI_API_KEY`. This key is necessary for authenticating requests to the OpenAI API.

 ### @start()

@@ -156,15 +117,15 @@ The `@listen()` decorator can be used in several ways:
 1. **Listening to a Method by Name**: You can pass the name of the method you want to listen to as a string. When that method completes, the listener method will be triggered.

   ```python Code
-   @listen("upstream_method")
-   def downstream_method(self, upstream_result):
+   @listen("generate_city")
+   def generate_fun_fact(self, random_city):
       # Implementation
   ```

 2. **Listening to a Method Directly**: You can pass the method itself. When that method completes, the listener method will be triggered.
   ```python Code
-   @listen(upstream_method)
-   def downstream_method(self, upstream_result):
+   @listen(generate_city)
+   def generate_fun_fact(self, random_city):
       # Implementation
   ```

@@ -780,17 +741,201 @@ This example demonstrates several key features of using Agents in flows:

 3. **Tool Integration**: Agents can use tools (like `WebsiteSearchTool`) to enhance their capabilities.

-## Multi-Crew Flows and Plotting
+## Adding Crews to Flows

-Detailed build walkthroughs and project scaffolding are documented in guide pages to keep this concepts page focused.
+Creating a flow with multiple crews in CrewAI is straightforward.

- Build your first flow: [/en/guides/flows/first-flow](/en/guides/flows/first-flow)
- Master state and persistence: [/en/guides/flows/mastering-flow-state](/en/guides/flows/mastering-flow-state)
- Real-world chat-state pattern: [/en/learn/flowstate-chat-history](/en/learn/flowstate-chat-history)
+You can generate a new CrewAI project that includes all the scaffolding needed to create a flow with multiple crews by running the following command:

-For visualization:
- Use `flow.plot("my_flow_plot")` in code, or
- Use `crewai flow plot` in CLI projects.
+```bash
+crewai create flow name_of_flow
+```
+
+This command will generate a new CrewAI project with the necessary folder structure. The generated project includes a prebuilt crew called `poem_crew` that is already working. You can use this crew as a template by copying, pasting, and editing it to create other crews.
+
+### Folder Structure
+
+After running the `crewai create flow name_of_flow` command, you will see a folder structure similar to the following:
+
+| Directory/File         | Description                                                        |
+| :--------------------- | :----------------------------------------------------------------- |
+| `name_of_flow/`        | Root directory for the flow.                                       |
+| ├── `crews/`           | Contains directories for specific crews.                           |
+| │ └── `poem_crew/`     | Directory for the "poem_crew" with its configurations and scripts. |
+| │ ├── `config/`        | Configuration files directory for the "poem_crew".                 |
+| │ │ ├── `agents.yaml`  | YAML file defining the agents for "poem_crew".                     |
+| │ │ └── `tasks.yaml`   | YAML file defining the tasks for "poem_crew".                      |
+| │ └── `poem_crew.py`   | Script for "poem_crew" functionality.                              |
+| ├── `tools/`           | Directory for additional tools used in the flow.                   |
+| │ └── `custom_tool.py` | Custom tool implementation.                                        |
+| ├── `main.py`          | Main script for running the flow.                                  |
+| ├── `README.md`        | Project description and instructions.                              |
+| ├── `pyproject.toml`   | Configuration file for project dependencies and settings.          |
+| └── `.gitignore`       | Specifies files and directories to ignore in version control.      |
+
+### Building Your Crews
+
+In the `crews` folder, you can define multiple crews. Each crew will have its own folder containing configuration files and the crew definition file. For example, the `poem_crew` folder contains:
+
+- `config/agents.yaml`: Defines the agents for the crew.
+- `config/tasks.yaml`: Defines the tasks for the crew.
+- `poem_crew.py`: Contains the crew definition, including agents, tasks, and the crew itself.
+
+You can copy, paste, and edit the `poem_crew` to create other crews.
+
+### Connecting Crews in `main.py`
+
+The `main.py` file is where you create your flow and connect the crews together. You can define your flow by using the `Flow` class and the decorators `@start` and `@listen` to specify the flow of execution.
+
+Here's an example of how you can connect the `poem_crew` in the `main.py` file:
+
+```python Code
+#!/usr/bin/env python
+from random import randint
+
+from pydantic import BaseModel
+from crewai.flow.flow import Flow, listen, start
+from .crews.poem_crew.poem_crew import PoemCrew
+
+class PoemState(BaseModel):
+    sentence_count: int = 1
+    poem: str = ""
+
+class PoemFlow(Flow[PoemState]):
+
+    @start()
+    def generate_sentence_count(self):
+        print("Generating sentence count")
+        self.state.sentence_count = randint(1, 5)
+
+    @listen(generate_sentence_count)
+    def generate_poem(self):
+        print("Generating poem")
+        result = PoemCrew().crew().kickoff(inputs={"sentence_count": self.state.sentence_count})
+
+        print("Poem generated", result.raw)
+        self.state.poem = result.raw
+
+    @listen(generate_poem)
+    def save_poem(self):
+        print("Saving poem")
+        with open("poem.txt", "w") as f:
+            f.write(self.state.poem)
+
+def kickoff():
+    poem_flow = PoemFlow()
+    poem_flow.kickoff()
+
+
+def plot():
+    poem_flow = PoemFlow()
+    poem_flow.plot("PoemFlowPlot")
+
+if __name__ == "__main__":
+    kickoff()
+    plot()
+```
+
+In this example, the `PoemFlow` class defines a flow that generates a sentence count, uses the `PoemCrew` to generate a poem, and then saves the poem to a file. The flow is kicked off by calling the `kickoff()` method, and the `plot()` method generates a visualization of the flow named `PoemFlowPlot`.
+
+![Flow Visual image](/images/crewai-flow-8.png)
+
+### Running the Flow
+
+(Optional) Before running the flow, you can install the dependencies by running:
+
+```bash
+crewai install
+```
+
+Once all of the dependencies are installed, you need to activate the virtual environment by running:
+
+```bash
+source .venv/bin/activate
+```
+
+After activating the virtual environment, you can run the flow by executing one of the following commands:
+
+```bash
+crewai flow kickoff
+```
+
+or
+
+```bash
+uv run kickoff
+```
+
+The flow will execute, and you should see the output in the console.
+
+## Plot Flows
+
+Visualizing your AI workflows can provide valuable insights into the structure and execution paths of your flows. CrewAI offers a powerful visualization tool that allows you to generate interactive plots of your flows, making it easier to understand and optimize your AI workflows.
+
+### What are Plots?
+
+Plots in CrewAI are graphical representations of your AI workflows. They display the various tasks, their connections, and the flow of data between them. This visualization helps in understanding the sequence of operations, identifying bottlenecks, and ensuring that the workflow logic aligns with your expectations.
+
+### How to Generate a Plot
+
+CrewAI provides two convenient methods to generate plots of your flows:
+
+#### Option 1: Using the `plot()` Method
+
+If you are working directly with a flow instance, you can generate a plot by calling the `plot()` method on your flow object. This method will create an HTML file containing the interactive plot of your flow.
+
+```python Code
+# Assuming you have a flow instance
+flow.plot("my_flow_plot")
+```
+
+This will generate a file named `my_flow_plot.html` in your current directory. You can open this file in a web browser to view the interactive plot.
+
+#### Option 2: Using the Command Line
+
+If you are working within a structured CrewAI project, you can generate a plot using the command line. This is particularly useful for larger projects where you want to visualize the entire flow setup.
+
+```bash
+crewai flow plot
+```
+
+This command will generate an HTML file with the plot of your flow, similar to the `plot()` method. The file will be saved in your project directory, and you can open it in a web browser to explore the flow.
+
+### Understanding the Plot
+
+The generated plot will display nodes representing the tasks in your flow, with directed edges indicating the flow of execution. The plot is interactive, allowing you to zoom in and out, and hover over nodes to see additional details.
+
+By visualizing your flows, you can gain a clearer understanding of the workflow's structure, making it easier to debug, optimize, and communicate your AI processes to others.
+
+### Conclusion
+
+Plotting your flows is a powerful feature of CrewAI that enhances your ability to design and manage complex AI workflows. Whether you choose to use the `plot()` method or the command line, generating plots will provide you with a visual representation of your workflows, aiding in both development and presentation.
+
+## Next Steps
+
+If you're interested in exploring additional examples of flows, we have a variety of recommendations in our examples repository. Here are four specific flow examples, each showcasing unique use cases to help you match your current problem type to a specific example:
+
+1. **Email Auto Responder Flow**: This example demonstrates an infinite loop where a background job continually runs to automate email responses. It's a great use case for tasks that need to be performed repeatedly without manual intervention. [View Example](https://github.com/crewAIInc/crewAI-examples/tree/main/email_auto_responder_flow)
+
+2. **Lead Score Flow**: This flow showcases adding human-in-the-loop feedback and handling different conditional branches using the router. It's an excellent example of how to incorporate dynamic decision-making and human oversight into your workflows. [View Example](https://github.com/crewAIInc/crewAI-examples/tree/main/lead-score-flow)
+
+3. **Write a Book Flow**: This example excels at chaining multiple crews together, where the output of one crew is used by another. Specifically, one crew outlines an entire book, and another crew generates chapters based on the outline. Eventually, everything is connected to produce a complete book. This flow is perfect for complex, multi-step processes that require coordination between different tasks. [View Example](https://github.com/crewAIInc/crewAI-examples/tree/main/write_a_book_with_flows)
+
+4. **Meeting Assistant Flow**: This flow demonstrates how to broadcast one event to trigger multiple follow-up actions. For instance, after a meeting is completed, the flow can update a Trello board, send a Slack message, and save the results. It's a great example of handling multiple outcomes from a single event, making it ideal for comprehensive task management and notification systems. [View Example](https://github.com/crewAIInc/crewAI-examples/tree/main/meeting_assistant_flow)
+
+By exploring these examples, you can gain insights into how to leverage CrewAI Flows for various use cases, from automating repetitive tasks to managing complex, multi-step processes with dynamic decision-making and human feedback.
+
+Also, check out our YouTube video on how to use flows in CrewAI below!
+
+<iframe
+  className="w-full aspect-video rounded-xl"
+  src="https://www.youtube.com/embed/MTb5my6VOT8"
+  title="CrewAI Flows overview"
+  frameBorder="0"
+  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
+  referrerPolicy="strict-origin-when-cross-origin"
+  allowFullScreen
+></iframe>

 ## Running Flows

@@ -801,7 +946,7 @@ There are two ways to run a flow:
 You can run a flow programmatically by creating an instance of your flow class and calling the `kickoff()` method:

 ```python
-flow = SupportTriageFlow()
+flow = ExampleFlow()
 result = flow.kickoff()
 ```

@@ -920,21 +1065,3 @@ crewai flow kickoff
 ```

 However, the `crewai run` command is now the preferred method as it works for both crews and flows.
-
-## Common Failure Modes
-
-### Router branch not firing
- Cause: returned label does not match a `@listen("label")` value.
- Fix: align router return strings with listener labels exactly.
-
-### State fields missing at runtime
- Cause: untyped dynamic fields or missing kickoff inputs.
- Fix: use typed state and validate required fields in `@start()`.
-
-### Prompt/token growth over time
- Cause: appending unbounded message history in state.
- Fix: apply sliding-window state and summary compaction patterns.
-
-### Non-idempotent retries
- Cause: side effects executed on retried steps.
- Fix: add idempotency keys/markers to state and guard external writes.
--- a/docs/en/concepts/llms.mdx
+++ b/docs/en/concepts/llms.mdx
--- a/docs/en/concepts/memory.mdx
+++ b/docs/en/concepts/memory.mdx
@@ -156,7 +156,6 @@ class ResearchFlow(Flow):
 ```

 See the [Flows documentation](/concepts/flows) for more on memory in Flows.
-For a production-style conversational pattern that combines Flow state and memory, see [Flowstate Chat History](/en/learn/flowstate-chat-history).


 ## Hierarchical Scopes
--- a/docs/en/concepts/planning.mdx
+++ b/docs/en/concepts/planning.mdx
@@ -10,17 +10,6 @@ mode: "wide"
 The planning feature in CrewAI allows you to add planning capability to your crew. When enabled, before each Crew iteration, 
 all Crew information is sent to an AgentPlanner that will plan the tasks step by step, and this plan will be added to each task description.

-## When to Use Planning
-
- Tasks require multi-step decomposition before execution.
- You need more consistent execution quality on complex tasks.
- You want transparent planning traces in crew runs.
-
-## When Not to Use Planning
-
- Tasks are simple and deterministic.
- Latency and token budget are strict and planning overhead is not justified.
-
 ### Using the Planning Feature

 Getting started with the planning feature is very easy, the only step required is to add `planning=True` to your Crew:
@@ -42,7 +31,7 @@ my_crew = Crew(
 From this point on, your crew will have planning enabled, and the tasks will be planned before each iteration.

 <Warning>
-Planning model defaults can vary by version and environment. To avoid implicit provider dependencies, set `planning_llm` explicitly in your crew configuration.
+When planning is enabled, crewAI will use `gpt-4o-mini` as the default LLM for planning, which requires a valid OpenAI API key. Since your agents might be using different LLMs, this could cause confusion if you don't have an OpenAI API key configured or if you're experiencing unexpected behavior related to LLM API calls.
 </Warning>

 #### Planning LLM
@@ -163,14 +152,4 @@ A list with 10 bullet points of the most relevant information about AI LLMs.
 **Expected Output:**
 A fully fledged report with the main topics, each with a full section of information. Formatted as markdown without '```'.
 ```
-</CodeGroup>
-
-## Common Failure Modes
-
-### Planning adds cost/latency without quality gains
- Cause: planning enabled for simple tasks.
- Fix: disable `planning` for straightforward pipelines.
-
-### Unexpected provider authentication errors
- Cause: implicit planner model/provider assumptions.
- Fix: set `planning_llm` explicitly and ensure matching credentials are configured.
+</CodeGroup>
--- a/docs/en/concepts/processes.mdx
+++ b/docs/en/concepts/processes.mdx
@@ -12,20 +12,11 @@ mode: "wide"
  These processes ensure tasks are distributed and executed efficiently, in alignment with a predefined strategy.
 </Tip>

-## When to Use Each Process
-
- Use `sequential` when task order is fixed and outputs feed directly into the next task.
- Use `hierarchical` when you need a manager to delegate and validate work dynamically.
-
-## When Not to Use Hierarchical
-
- You do not need dynamic delegation.
- You cannot provide a reliable `manager_llm` or `manager_agent`.
-
 ## Process Implementations

 - **Sequential**: Executes tasks sequentially, ensuring tasks are completed in an orderly progression.
 - **Hierarchical**: Organizes tasks in a managerial hierarchy, where tasks are delegated and executed based on a structured chain of command. A manager language model (`manager_llm`) or a custom manager agent (`manager_agent`) must be specified in the crew to enable the hierarchical process, facilitating the creation and management of tasks by the manager.
+- **Consensual Process (Planned)**: Aiming for collaborative decision-making among agents on task execution, this process type introduces a democratic approach to task management within CrewAI. It is planned for future development and is not currently implemented in the codebase.

 ## The Role of Processes in Teamwork
 Processes enable individual agents to operate as a cohesive unit, streamlining their efforts to achieve common objectives with efficiency and coherence.
@@ -68,17 +59,9 @@ Emulates a corporate hierarchy, CrewAI allows specifying a custom manager agent

 ## Process Class: Detailed Overview

-The `Process` class is implemented as an enumeration (`Enum`), ensuring type safety and restricting process values to the defined types (`sequential`, `hierarchical`).
+The `Process` class is implemented as an enumeration (`Enum`), ensuring type safety and restricting process values to the defined types (`sequential`, `hierarchical`). The consensual process is planned for future inclusion, emphasizing our commitment to continuous development and innovation.

 ## Conclusion

 The structured collaboration facilitated by processes within CrewAI is crucial for enabling systematic teamwork among agents. 
-## Common Failure Modes
-
-### Hierarchical process fails at startup
- Cause: missing `manager_llm` or `manager_agent`.
- Fix: provide one of them explicitly in crew configuration.
-
-### Sequential process produces weak outputs
- Cause: task boundaries/context are underspecified.
- Fix: improve task descriptions, expected outputs, and task context chaining.
+This documentation has been updated to reflect the latest features, enhancements, and the planned integration of the Consensual Process, ensuring users have access to the most current and comprehensive information.
--- a/docs/en/concepts/testing.mdx
+++ b/docs/en/concepts/testing.mdx
@@ -9,20 +9,9 @@ mode: "wide"

 Testing is a crucial part of the development process, and it is essential to ensure that your crew is performing as expected. With crewAI, you can easily test your crew and evaluate its performance using the built-in testing capabilities.

-## When to Use Testing
-
- Before promoting a crew to production.
- After changing prompts, tools, or model configurations.
- When benchmarking quality/cost/latency tradeoffs.
-
-## When Not to Rely on Testing Alone
-
- For safety-critical deployments without human review gates.
- When test datasets are too small or unrepresentative.
-
 ### Using the Testing Feature

-Use the CLI command `crewai test` to run repeated crew executions and compare outputs across iterations. The parameters are `n_iterations` and `model`, which are optional and default to `2` and `gpt-4o-mini`.
+We added the CLI command `crewai test` to make it easy to test your crew. This command will run your crew for a specified number of iterations and provide detailed performance metrics. The parameters are `n_iterations` and `model`, which are optional and default to 2 and `gpt-4o-mini` respectively. For now, the only provider available is OpenAI.

 ```bash
 crewai test
@@ -58,13 +47,3 @@ A table of scores at the end will show the performance of the crew in terms of t
 | Execution Time (s) |  126  |  145  |    **135**     |                                |                                  |

 The example above shows the test results for two runs of the crew with two tasks, with the average total score for each task and the crew as a whole.
-
-## Common Failure Modes
-
-### Scores fluctuate too much between runs
- Cause: high sampling randomness or unstable prompts.
- Fix: lower temperature and tighten output constraints.
-
-### Good test scores but poor production quality
- Cause: test prompts do not match real workload.
- Fix: build a representative test set from real production inputs.
--- a/docs/en/concepts/tools.mdx
+++ b/docs/en/concepts/tools.mdx
@@ -10,17 +10,6 @@ mode: "wide"
 CrewAI tools empower agents with capabilities ranging from web searching and data analysis to collaboration and delegating tasks among coworkers.
 This documentation outlines how to create, integrate, and leverage these tools within the CrewAI framework, including a new focus on collaboration tools.

-## When to Use Tools
-
- Agents need external data or side effects.
- You need deterministic actions wrapped in reusable interfaces.
- You need to connect APIs, files, databases, or browser actions into agent workflows.
-
-## When Not to Use Tools
-
- The task can be solved entirely from prompt context.
- The external side effect cannot be made safe or idempotent.
-
 ## What is a Tool?

 A tool in CrewAI is a skill or function that agents can utilize to perform various actions.
@@ -296,17 +285,3 @@ writer1 = Agent(
 Tools are pivotal in extending the capabilities of CrewAI agents, enabling them to undertake a broad spectrum of tasks and collaborate effectively.
 When building solutions with CrewAI, leverage both custom and existing tools to empower your agents and enhance the AI ecosystem. Consider utilizing error handling,
 caching mechanisms, and the flexibility of tool arguments to optimize your agents' performance and capabilities.
-
-## Common Failure Modes
-
-### Tool schema mismatch
- Cause: model-generated arguments do not match tool signature.
- Fix: tighten tool descriptions and validate input schemas.
-
-### Repeated side effects
- Cause: retries trigger duplicate writes/actions.
- Fix: add idempotency keys and deduplication checks in tool logic.
-
-### Tool timeouts under load
- Cause: unbounded retries or slow external services.
- Fix: set explicit timeout/retry policy and graceful fallbacks.
--- a/docs/en/examples/cookbooks.mdx
+++ b/docs/en/examples/cookbooks.mdx
@@ -8,10 +8,6 @@ mode: "wide"
 ## Quickstarts & Demos

 <CardGroup cols={3}>
-  <Card title="Flowstate Chat History" icon="comments" href="/en/learn/flowstate-chat-history">
-    Manage chat sessions with sliding-window history, summary compaction, and persisted Flow state.
-  </Card>
-
  <Card title="Collaboration" icon="people-arrows" href="https://github.com/crewAIInc/crewAI-quickstarts/blob/main/Collaboration/crewai_collaboration.ipynb">
    Coordinate multiple agents on shared tasks. Includes notebook with end-to-end collaboration pattern.
  </Card>
--- a/docs/en/examples/example.mdx
+++ b/docs/en/examples/example.mdx
@@ -34,10 +34,6 @@ mode: "wide"
 ## Flows

 <CardGroup cols={3}>
-  <Card title="Flowstate Chat History" icon="comments" href="/en/learn/flowstate-chat-history">
-    Stateful chat pattern with compacted context and persisted session state.
-  </Card>
-
  <Card title="Content Creator Flow" icon="pen" href="https://github.com/crewAIInc/crewAI-examples/tree/main/flows/content_creator_flow">
    Multi‑crew content generation with routing.
  </Card>
--- a/docs/en/guides/flows/mastering-flow-state.mdx
+++ b/docs/en/guides/flows/mastering-flow-state.mdx
@@ -47,23 +47,6 @@ CrewAI offers two ways to manage state in your flows:

 Let's examine each approach in detail.

-### Flow State vs Memory: When to use each
-
-Both features keep context, but they solve different problems.
-
-| Dimension | Flow State (`self.state`) | Memory (`self.remember` / `self.recall`) |
-|---|---|---|
-| Primary purpose | Track execution and deterministic workflow data | Store and retrieve semantic knowledge across interactions |
-| Data shape | Explicit fields (dict/Pydantic model) | Text records with inferred scopes and ranked recall |
-| Typical lifetime | Current flow run (or persisted checkpoints) | Long-term knowledge over many runs |
-| Access pattern | Direct reads/writes (`self.state.field`) | Query-based retrieval (`self.recall("...")`) |
-| Best for | Routing flags, counters, intermediate outputs, chat window | Durable facts, prior outcomes, reusable context |
-| Chat use | Recent turns + running summary + control flags | Long-tail memory outside context window |
-
-Practical rule:
- Use **state** for what your control flow depends on right now.
- Use **memory** for what you may want to retrieve later by meaning.
-
 ## Unstructured State Management

 Unstructured state uses a dictionary-like approach, offering flexibility and simplicity for straightforward applications.
--- a/docs/en/index.mdx
+++ b/docs/en/index.mdx
@@ -27,11 +27,8 @@ mode: "wide"
  </div>

  <div style={{ display: 'flex', flexWrap: 'wrap', gap: 12, justifyContent: 'center' }}>
-    <a className="button button-primary" href="/en/installation">Install</a>
-    <a className="button" href="/en/quickstart">Quickstart</a>
-    <a className="button" href="/en/guides/crews/first-crew">First Crew</a>
-    <a className="button" href="/en/guides/flows/first-flow">First Flow</a>
-    <a className="button" href="/en/concepts/llms">LLM Setup</a>
+    <a className="button button-primary" href="/en/quickstart">Get started</a>
+    <a className="button" href="/en/changelog">View changelog</a>
    <a className="button" href="/en/api-reference/introduction">API Reference</a>
  </div>

@@ -39,49 +36,17 @@ mode: "wide"

 <div style={{ marginTop: 32 }} />

-## Start in 3 steps
+## Get started

 <CardGroup cols={3}>
-  <Card title="1) Install" href="/en/installation" icon="wrench">
+  <Card title="Introduction" href="/en/introduction" icon="sparkles">
+    Overview of CrewAI concepts, architecture, and what you can build with agents, crews, and flows.
+  </Card>
+  <Card title="Installation" href="/en/installation" icon="wrench">
    Install via `uv`, configure API keys, and set up the CLI for local development.
  </Card>
-  <Card title="2) Run Quickstart" href="/en/quickstart" icon="rocket">
-    Launch your first working crew with a minimal project and iterate from there.
-  </Card>
-  <Card title="3) Pick a path" href="/en/ai/overview" icon="sitemap">
-    Continue with canonical domain packs for Flows, Agents, Crews, LLMs, Memory, and Tools.
-  </Card>
-</CardGroup>
-
-## Most-used pages
-
-<CardGroup cols={3}>
-  <Card title="First Crew" href="/en/guides/crews/first-crew" icon="users">
-    Build a production-style crew with role/task configuration and execution flow.
-  </Card>
-  <Card title="First Flow" href="/en/guides/flows/first-flow" icon="arrow-progress">
-    Build event-driven orchestration with state, listeners, and routing.
-  </Card>
-  <Card title="Flowstate Chat History" href="/en/learn/flowstate-chat-history" icon="comments">
-    Stateful chat history pattern with persistence and summary compaction.
-  </Card>
-  <Card title="Agents" href="/en/concepts/agents" icon="user">
-    Agent role design, tool boundaries, and output contracts.
-  </Card>
-  <Card title="Crews" href="/en/concepts/crews" icon="users-gear">
-    Multi-agent collaboration patterns and process semantics.
-  </Card>
-  <Card title="Flows" href="/en/concepts/flows" icon="code-branch">
-    Deterministic orchestration, state lifecycle, persistence, and resume.
-  </Card>
-  <Card title="LLMs" href="/en/concepts/llms" icon="microchip-ai">
-    Model setup, provider config, routing patterns, and reliability defaults.
-  </Card>
-  <Card title="Memory" href="/en/concepts/memory" icon="database">
-    Semantic recall, scope strategy, and state-vs-memory architecture.
-  </Card>
-  <Card title="Tools" href="/en/tools/overview" icon="wrench">
-    Tool categories, integration surfaces, and practical usage patterns.
+  <Card title="Quickstart" href="/en/quickstart" icon="rocket">
+    Spin up your first crew in minutes. Learn the core runtime, project layout, and dev loop.
  </Card>
 </CardGroup>

@@ -125,11 +90,7 @@ mode: "wide"
 </CardGroup>

 <Callout title="Explore real-world patterns" icon="github">
-  Browse the <a href="/en/examples/cookbooks">examples and cookbooks</a> for end-to-end reference implementations across agents, flows, and enterprise automations. For a practical conversational pattern, start with <a href="/en/learn/flowstate-chat-history">Flowstate Chat History</a>.
-</Callout>
-
-<Callout title="AI-First Docs" icon="sitemap">
-  Use the <a href="/en/ai/overview">AI-First Documentation map</a> for canonical domain packs across Flows, Agents, Crews, LLMs, Memory, and Tools.
+  Browse the <a href="/en/examples/cookbooks">examples and cookbooks</a> for end-to-end reference implementations across agents, flows, and enterprise automations.
 </Callout>

 ## Stay connected
--- a/docs/en/introduction.mdx
+++ b/docs/en/introduction.mdx
@@ -16,52 +16,6 @@ It empowers developers to build production-ready multi-agent systems by combinin

 With over 100,000 developers certified through our community courses, CrewAI is the standard for enterprise-ready AI automation.

-## Start Here
-
-<CardGroup cols={3}>
-  <Card title="Install" href="/en/installation" icon="wrench">
-    Set up CrewAI, configure API keys, and prepare your local environment.
-  </Card>
-  <Card title="Quickstart" href="/en/quickstart" icon="rocket">
-    Run your first working crew with a minimal setup.
-  </Card>
-  <Card title="First Crew" href="/en/guides/crews/first-crew" icon="users-gear">
-    Build a production-style crew with roles, tasks, and execution flow.
-  </Card>
-  <Card title="First Flow" href="/en/guides/flows/first-flow" icon="arrow-progress">
-    Build event-driven orchestration with state, listeners, and routers.
-  </Card>
-  <Card title="LLM Setup" href="/en/concepts/llms" icon="microchip-ai">
-    Configure providers, models, and reliability defaults.
-  </Card>
-  <Card title="API Reference" href="/en/api-reference/introduction" icon="book">
-    Use kickoff, resume, and status endpoints for production integrations.
-  </Card>
-</CardGroup>
-
-## Most-used Docs
-
-<CardGroup cols={3}>
-  <Card title="Agents" href="/en/concepts/agents" icon="user">
-    Role design, tool boundaries, and output contracts.
-  </Card>
-  <Card title="Crews" href="/en/concepts/crews" icon="users">
-    Multi-agent coordination and process choices.
-  </Card>
-  <Card title="Flows" href="/en/concepts/flows" icon="code-branch">
-    Deterministic orchestration, state, persistence, and resume.
-  </Card>
-  <Card title="Memory" href="/en/concepts/memory" icon="database">
-    Scope strategy and semantic recall across runs.
-  </Card>
-  <Card title="Flowstate Chat History" href="/en/learn/flowstate-chat-history" icon="comments">
-    Stateful chat context with summary compaction and persistence.
-  </Card>
-  <Card title="AI-First Docs Map" href="/en/ai/overview" icon="sitemap">
-    Canonical domain packs for Flows, Agents, Crews, LLMs, Memory, and Tools.
-  </Card>
-</CardGroup>
-
 ## The CrewAI Architecture

 CrewAI's architecture is designed to balance autonomy with control.
@@ -176,7 +130,7 @@ For any production-ready application, **start with a Flow**.
  <Card
    title="Quick Start"
    icon="bolt"
-    href="/en/quickstart"
+    href="/en/quickstart"
  >
    Follow our quickstart guide to create your first CrewAI agent and get hands-on experience.
  </Card>
--- a/docs/en/learn/flowstate-chat-history.mdx
+++ b/docs/en/learn/flowstate-chat-history.mdx
@@ -1,167 +0,0 @@
---
-title: "Flowstate Chat History"
-description: "Build a stateful chat workflow that keeps context compact, persistent, and production-friendly."
-icon: "comments"
-mode: "wide"
---
-
-## Overview
-
-This guide shows a practical pattern for managing LLM chat history with Flow state:
-
- Keep recent turns in a sliding window
- Summarize older turns into a compact running summary
- Persist state automatically with `@persist()`
- Keep optional long-term recall using Flow memory
-
-## Why this pattern works
-
-Naively appending every message to prompts causes token bloat and unstable behavior over long sessions. A better approach is:
-
-1. Keep only the most recent turns in `state.messages`
-2. Move older turns into `state.running_summary`
-3. Build prompts from `running_summary + recent messages`
-
-## Prerequisites
-
-1. CrewAI installed and configured
-2. API key configured for your model provider
-3. Basic familiarity with Flow decorators (`@start`, `@listen`)
-
-## Step 1: Define typed chat state
-
-```python Code
-from typing import Dict, List
-from pydantic import BaseModel, Field
-
-
-class ChatSessionState(BaseModel):
-    session_id: str = "demo-session"
-    running_summary: str = ""
-    messages: List[Dict[str, str]] = Field(default_factory=list)
-    max_recent_messages: int = 8
-    last_user_message: str = ""
-    assistant_reply: str = ""
-    turn_count: int = 0
-```
-
-## Step 2: Build the Flow
-
-```python Code
-from crewai.flow.flow import Flow, start, listen
-from crewai.flow.persistence import persist
-from litellm import completion
-
-
-@persist()
-class ChatHistoryFlow(Flow[ChatSessionState]):
-    model = "gpt-4o-mini"
-
-    @start()
-    def capture_user_message(self):
-        self.state.last_user_message = self.state.last_user_message.strip()
-        self.state.messages.append(
-            {"role": "user", "content": self.state.last_user_message}
-        )
-        self.state.turn_count += 1
-        return self.state.last_user_message
-
-    @listen(capture_user_message)
-    def compact_old_history(self, _):
-        if len(self.state.messages) <= self.state.max_recent_messages:
-            return "no_compaction"
-
-        overflow = self.state.messages[:-self.state.max_recent_messages]
-        self.state.messages = self.state.messages[-self.state.max_recent_messages :]
-        overflow_text = "\n".join(
-            f"{m['role']}: {m['content']}" for m in overflow
-        )
-
-        summary_prompt = [
-            {
-                "role": "system",
-                "content": "Summarize old chat turns into short bullet points. Preserve facts, constraints, and decisions.",
-            },
-            {
-                "role": "user",
-                "content": (
-                    f"Existing summary:\n{self.state.running_summary or '(empty)'}\n\n"
-                    f"New old turns:\n{overflow_text}"
-                ),
-            },
-        ]
-        summary_response = completion(model=self.model, messages=summary_prompt)
-        self.state.running_summary = summary_response["choices"][0]["message"]["content"]
-        return "compacted"
-
-    @listen(compact_old_history)
-    def generate_reply(self, _):
-        system_context = (
-            "You are a helpful assistant.\n"
-            f"Conversation summary so far:\n{self.state.running_summary or '(none)'}"
-        )
-
-        response = completion(
-            model=self.model,
-            messages=[{"role": "system", "content": system_context}, *self.state.messages],
-        )
-        answer = response["choices"][0]["message"]["content"]
-
-        self.state.assistant_reply = answer
-        self.state.messages.append({"role": "assistant", "content": answer})
-
-        # Optional: store key turns in long-term memory for later recall
-        self.remember(
-            f"Session {self.state.session_id} turn {self.state.turn_count}: "
-            f"user={self.state.last_user_message} assistant={answer}",
-            scope=f"/chat/{self.state.session_id}",
-        )
-        return answer
-```
-
-## Step 3: Run it
-
-```python Code
-flow = ChatHistoryFlow()
-
-first = flow.kickoff(
-    inputs={
-        "session_id": "customer-42",
-        "last_user_message": "I need help choosing a pricing plan for a 10-person team.",
-    }
-)
-print("Assistant:", first)
-
-second = flow.kickoff(
-    inputs={
-        "last_user_message": "We also need SSO and audit logs. What do you recommend now?",
-    }
-)
-print("Assistant:", second)
-print("Turns:", flow.state.turn_count)
-print("Recent messages:", len(flow.state.messages))
-```
-
-## Expected output (shape)
-
-```text Output
-Assistant: ...initial recommendation...
-Assistant: ...updated recommendation with SSO and audit-log requirements...
-Turns: 2
-Recent messages: 4
-```
-
-## Troubleshooting
-
- If replies ignore earlier context:
-  increase `max_recent_messages` and ensure `running_summary` is included in the system context.
- If prompts become too large:
-  lower `max_recent_messages` and summarize more aggressively.
- If sessions collide:
-  provide a stable `session_id` and isolate memory scope with `/chat/{session_id}`.
-
-## Next steps
-
- Add tool calls for account lookup or product catalog retrieval
- Route to human review for high-risk decisions
- Add structured output to capture recommendations in machine-readable JSON
--- a/lib/crewai/pyproject.toml
+++ b/lib/crewai/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
    "json5~=0.10.0",
    "portalocker~=2.7.0",
    "pydantic-settings~=2.10.1",
+    "httpx~=0.28.1",
    "mcp~=1.26.0",
    "uv~=0.9.13",
    "aiosqlite~=0.21.0",
--- a/lib/crewai/src/crewai/__init__.py
+++ b/lib/crewai/src/crewai/__init__.py
@@ -4,6 +4,7 @@ import urllib.request
 import warnings

 from crewai.agent.core import Agent
+from crewai.agent.planning_config import PlanningConfig
 from crewai.crew import Crew
 from crewai.crews.crew_output import CrewOutput
 from crewai.flow.flow import Flow
@@ -82,6 +83,7 @@ __all__ = [
    "Knowledge",
    "LLMGuardrail",
    "Memory",
+    "PlanningConfig",
    "Process",
    "Task",
    "TaskOutput",
--- a/lib/crewai/src/crewai/agent/core.py
+++ b/lib/crewai/src/crewai/agent/core.py
@@ -24,6 +24,7 @@ from pydantic import (
 )
 from typing_extensions import Self

+from crewai.agent.planning_config import PlanningConfig
 from crewai.agent.utils import (
    ahandle_knowledge_retrieval,
    apply_training_data,
@@ -211,13 +212,23 @@ class Agent(BaseAgent):
        default="safe",
        description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).",
    )
-    reasoning: bool = Field(
+    planning_config: PlanningConfig | None = Field(
+        default=None,
+        description="Configuration for agent planning before task execution.",
+    )
+    planning: bool = Field(
        default=False,
        description="Whether the agent should reflect and create a plan before executing a task.",
    )
+    reasoning: bool = Field(
+        default=False,
+        description="[DEPRECATED: Use planning_config instead] Whether the agent should reflect and create a plan before executing a task.",
+        deprecated=True,
+    )
    max_reasoning_attempts: int | None = Field(
        default=None,
-        description="Maximum number of reasoning attempts before executing the task. If None, will try until ready.",
+        description="[DEPRECATED: Use planning_config.max_attempts instead] Maximum number of reasoning attempts before executing the task. If None, will try until ready.",
+        deprecated=True,
    )
    embedder: EmbedderConfig | None = Field(
        default=None,
@@ -284,8 +295,26 @@ class Agent(BaseAgent):
        if self.allow_code_execution:
            self._validate_docker_installation()

+        # Handle backward compatibility: convert reasoning=True to planning_config
+        if self.reasoning and self.planning_config is None:
+            import warnings
+
+            warnings.warn(
+                "The 'reasoning' parameter is deprecated. Use 'planning_config=PlanningConfig()' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            self.planning_config = PlanningConfig(
+                max_attempts=self.max_reasoning_attempts,
+            )
+
        return self

+    @property
+    def planning_enabled(self) -> bool:
+        """Check if planning is enabled for this agent."""
+        return self.planning_config is not None or self.planning
+
    def _setup_agent_executor(self) -> None:
        if not self.cache_handler:
            self.cache_handler = CacheHandler()
@@ -354,7 +383,11 @@ class Agent(BaseAgent):
            ValueError: If the max execution time is not a positive integer.
            RuntimeError: If the agent execution fails for other reasons.
        """
-        handle_reasoning(self, task)
+        # Only call handle_reasoning for legacy CrewAgentExecutor
+        # For AgentExecutor, planning is handled in AgentExecutor.generate_plan()
+        if self.executor_class is not AgentExecutor:
+            handle_reasoning(self, task)
+
        self._inject_date_to_task(task)

        if self.tools_handler:
@@ -592,7 +625,10 @@ class Agent(BaseAgent):
            ValueError: If the max execution time is not a positive integer.
            RuntimeError: If the agent execution fails for other reasons.
        """
-        handle_reasoning(self, task)
+        if self.executor_class is not AgentExecutor:
+            handle_reasoning(
+                self, task
+            )  # we need this till CrewAgentExecutor migrates to AgentExecutor
        self._inject_date_to_task(task)

        if self.tools_handler:
@@ -1712,7 +1748,8 @@ class Agent(BaseAgent):

            existing_names = {sanitize_tool_name(t.name) for t in raw_tools}
            raw_tools.extend(
-                mt for mt in create_memory_tools(agent_memory)
+                mt
+                for mt in create_memory_tools(agent_memory)
                if sanitize_tool_name(mt.name) not in existing_names
            )

@@ -1937,94 +1974,111 @@ class Agent(BaseAgent):
            if isinstance(messages, str):
                input_str = messages
            else:
-                input_str = "\n".join(
-                    str(msg.get("content", "")) for msg in messages if msg.get("content")
-                ) or "User request"
-            raw = (
-                f"Input: {input_str}\n"
-                f"Agent: {self.role}\n"
-                f"Result: {output_text}"
-            )
+                input_str = (
+                    "\n".join(
+                        str(msg.get("content", ""))
+                        for msg in messages
+                        if msg.get("content")
+                    )
+                    or "User request"
+                )
+            raw = f"Input: {input_str}\nAgent: {self.role}\nResult: {output_text}"
            extracted = agent_memory.extract_memories(raw)
            if extracted:
                agent_memory.remember_many(extracted)
        except Exception as e:
            self._logger.log("error", f"Failed to save kickoff result to memory: {e}")

+    def _build_output_from_result(
+        self,
+        result: dict[str, Any],
+        executor: AgentExecutor,
+        response_format: type[Any] | None = None,
+    ) -> LiteAgentOutput:
+        """Build a LiteAgentOutput from an executor result dict.
+
+        Shared logic used by both sync and async execution paths.
+
+        Args:
+            result: The result dictionary from executor.invoke / invoke_async.
+            executor: The executor instance.
+            response_format: Optional response format.
+
+        Returns:
+            LiteAgentOutput with raw output, formatted result, and metrics.
+        """
+        import json
+
+        output = result.get("output", "")
+
+        # Handle response format conversion
+        formatted_result: BaseModel | None = None
+        raw_output: str
+
+        if isinstance(output, BaseModel):
+            formatted_result = output
+            raw_output = output.model_dump_json()
+        elif response_format:
+            raw_output = str(output) if not isinstance(output, str) else output
+            try:
+                model_schema = generate_model_description(response_format)
+                schema = json.dumps(model_schema, indent=2)
+                instructions = self.i18n.slice("formatted_task_instructions").format(
+                    output_format=schema
+                )
+
+                converter = Converter(
+                    llm=self.llm,
+                    text=raw_output,
+                    model=response_format,
+                    instructions=instructions,
+                )
+
+                conversion_result = converter.to_pydantic()
+                if isinstance(conversion_result, BaseModel):
+                    formatted_result = conversion_result
+            except ConverterError:
+                pass  # Keep raw output if conversion fails
+        else:
+            raw_output = str(output) if not isinstance(output, str) else output
+
+        # Get token usage metrics
+        if isinstance(self.llm, BaseLLM):
+            usage_metrics = self.llm.get_token_usage_summary()
+        else:
+            usage_metrics = self._token_process.get_summary()
+
+        raw_str = (
+            raw_output
+            if isinstance(raw_output, str)
+            else raw_output.model_dump_json()
+            if isinstance(raw_output, BaseModel)
+            else str(raw_output)
+        )
+
+        todo_results = LiteAgentOutput.from_todo_items(executor.state.todos.items)
+
+        return LiteAgentOutput(
+            raw=raw_str,
+            pydantic=formatted_result,
+            agent_role=self.role,
+            usage_metrics=usage_metrics.model_dump() if usage_metrics else None,
+            messages=list(executor.state.messages),
+            plan=executor.state.plan,
+            todos=todo_results,
+            replan_count=executor.state.replan_count,
+            last_replan_reason=executor.state.last_replan_reason,
+        )
+
    def _execute_and_build_output(
        self,
        executor: AgentExecutor,
        inputs: dict[str, str],
        response_format: type[Any] | None = None,
    ) -> LiteAgentOutput:
-        """Execute the agent and build the output object.
-
-        Args:
-            executor: The executor instance.
-            inputs: Input dictionary for execution.
-            response_format: Optional response format.
-
-        Returns:
-            LiteAgentOutput with raw output, formatted result, and metrics.
-        """
-        import json
-
-        # Execute the agent (this is called from sync path, so invoke returns dict)
+        """Execute the agent synchronously and build the output object."""
        result = cast(dict[str, Any], executor.invoke(inputs))
-        output = result.get("output", "")
-
-        # Handle response format conversion
-        formatted_result: BaseModel | None = None
-        raw_output: str
-
-        if isinstance(output, BaseModel):
-            formatted_result = output
-            raw_output = output.model_dump_json()
-        elif response_format:
-            raw_output = str(output) if not isinstance(output, str) else output
-            try:
-                model_schema = generate_model_description(response_format)
-                schema = json.dumps(model_schema, indent=2)
-                instructions = self.i18n.slice("formatted_task_instructions").format(
-                    output_format=schema
-                )
-
-                converter = Converter(
-                    llm=self.llm,
-                    text=raw_output,
-                    model=response_format,
-                    instructions=instructions,
-                )
-
-                conversion_result = converter.to_pydantic()
-                if isinstance(conversion_result, BaseModel):
-                    formatted_result = conversion_result
-            except ConverterError:
-                pass  # Keep raw output if conversion fails
-        else:
-            raw_output = str(output) if not isinstance(output, str) else output
-
-        # Get token usage metrics
-        if isinstance(self.llm, BaseLLM):
-            usage_metrics = self.llm.get_token_usage_summary()
-        else:
-            usage_metrics = self._token_process.get_summary()
-
-        raw_str = (
-            raw_output
-            if isinstance(raw_output, str)
-            else raw_output.model_dump_json()
-            if isinstance(raw_output, BaseModel)
-            else str(raw_output)
-        )
-
-        return LiteAgentOutput(
-            raw=raw_str,
-            pydantic=formatted_result,
-            agent_role=self.role,
-            usage_metrics=usage_metrics.model_dump() if usage_metrics else None,
-            messages=executor.messages,
-        )
+        return self._build_output_from_result(result, executor, response_format)

    async def _execute_and_build_output_async(
        self,
@@ -2032,77 +2086,9 @@ class Agent(BaseAgent):
        inputs: dict[str, str],
        response_format: type[Any] | None = None,
    ) -> LiteAgentOutput:
-        """Execute the agent asynchronously and build the output object.
-
-        This is the async version of _execute_and_build_output that uses
-        invoke_async() for native async execution within event loops.
-
-        Args:
-            executor: The executor instance.
-            inputs: Input dictionary for execution.
-            response_format: Optional response format.
-
-        Returns:
-            LiteAgentOutput with raw output, formatted result, and metrics.
-        """
-        import json
-
-        # Execute the agent asynchronously
+        """Execute the agent asynchronously and build the output object."""
        result = await executor.invoke_async(inputs)
-        output = result.get("output", "")
-
-        # Handle response format conversion
-        formatted_result: BaseModel | None = None
-        raw_output: str
-
-        if isinstance(output, BaseModel):
-            formatted_result = output
-            raw_output = output.model_dump_json()
-        elif response_format:
-            raw_output = str(output) if not isinstance(output, str) else output
-            try:
-                model_schema = generate_model_description(response_format)
-                schema = json.dumps(model_schema, indent=2)
-                instructions = self.i18n.slice("formatted_task_instructions").format(
-                    output_format=schema
-                )
-
-                converter = Converter(
-                    llm=self.llm,
-                    text=raw_output,
-                    model=response_format,
-                    instructions=instructions,
-                )
-
-                conversion_result = converter.to_pydantic()
-                if isinstance(conversion_result, BaseModel):
-                    formatted_result = conversion_result
-            except ConverterError:
-                pass  # Keep raw output if conversion fails
-        else:
-            raw_output = str(output) if not isinstance(output, str) else output
-
-        # Get token usage metrics
-        if isinstance(self.llm, BaseLLM):
-            usage_metrics = self.llm.get_token_usage_summary()
-        else:
-            usage_metrics = self._token_process.get_summary()
-
-        raw_str = (
-            raw_output
-            if isinstance(raw_output, str)
-            else raw_output.model_dump_json()
-            if isinstance(raw_output, BaseModel)
-            else str(raw_output)
-        )
-
-        return LiteAgentOutput(
-            raw=raw_str,
-            pydantic=formatted_result,
-            agent_role=self.role,
-            usage_metrics=usage_metrics.model_dump() if usage_metrics else None,
-            messages=executor.messages,
-        )
+        return self._build_output_from_result(result, executor, response_format)

    def _process_kickoff_guardrail(
        self,
--- a/lib/crewai/src/crewai/agent/planning_config.py
+++ b/lib/crewai/src/crewai/agent/planning_config.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, Field
+
+
+class PlanningConfig(BaseModel):
+    """Configuration for agent planning/reasoning before task execution.
+
+    This allows users to customize the planning behavior including prompts,
+    iteration limits, the LLM used for planning, and the reasoning effort
+    level that controls post-step observation and replanning behavior.
+
+    Note: To disable planning, omit planning_config and leave planning=False
+    on the Agent. The presence of a PlanningConfig enables planning.
+
+    Attributes:
+        reasoning_effort: Controls observation and replanning after each step.
+            - "low": Observe each step (validates success), but skip the
+              decide/replan/refine pipeline. Steps are marked complete and
+              execution continues linearly. Fastest option.
+            - "medium": Observe each step. On failure, trigger replanning.
+              On success, skip refinement and continue. Balanced option.
+            - "high": Full observation pipeline — observe every step, then
+              route through decide_next_action which can trigger early goal
+              achievement, full replanning, or lightweight refinement.
+              Most adaptive but adds latency per step.
+        max_attempts: Maximum number of planning refinement attempts.
+            If None, will continue until the agent indicates readiness.
+        max_steps: Maximum number of steps in the generated plan.
+        system_prompt: Custom system prompt for planning. Uses default if None.
+        plan_prompt: Custom prompt for creating the initial plan.
+        refine_prompt: Custom prompt for refining the plan.
+        llm: LLM to use for planning. Uses agent's LLM if None.
+
+    Example:
+        ```python
+        from crewai import Agent
+        from crewai.agent.planning_config import PlanningConfig
+
+        # Simple usage — fast, linear execution (default)
+        agent = Agent(
+            role="Researcher",
+            goal="Research topics",
+            backstory="Expert researcher",
+            planning_config=PlanningConfig(),
+        )
+
+        # Balanced — replan only when steps fail
+        agent = Agent(
+            role="Researcher",
+            goal="Research topics",
+            backstory="Expert researcher",
+            planning_config=PlanningConfig(
+                reasoning_effort="medium",
+            ),
+        )
+
+        # Full adaptive planning with refinement and replanning
+        agent = Agent(
+            role="Researcher",
+            goal="Research topics",
+            backstory="Expert researcher",
+            planning_config=PlanningConfig(
+                reasoning_effort="high",
+                max_attempts=3,
+                max_steps=10,
+                plan_prompt="Create a focused plan for: {description}",
+                llm="gpt-4o-mini",  # Use cheaper model for planning
+            ),
+        )
+        ```
+    """
+
+    reasoning_effort: Literal["low", "medium", "high"] = Field(
+        default="low",
+        description=(
+            "Controls post-step observation and replanning behavior. "
+            "'low' observes steps but skips replanning/refinement (fastest). "
+            "'medium' observes and replans only on step failure (balanced). "
+            "'high' runs full observation pipeline with replanning, refinement, "
+            "and early goal detection (most adaptive, highest latency)."
+        ),
+    )
+    max_attempts: int | None = Field(
+        default=None,
+        description=(
+            "Maximum number of planning refinement attempts. "
+            "If None, will continue until the agent indicates readiness."
+        ),
+    )
+    max_steps: int = Field(
+        default=20,
+        description="Maximum number of steps in the generated plan.",
+        ge=1,
+    )
+    system_prompt: str | None = Field(
+        default=None,
+        description="Custom system prompt for planning. Uses default if None.",
+    )
+    plan_prompt: str | None = Field(
+        default=None,
+        description="Custom prompt for creating the initial plan.",
+    )
+    refine_prompt: str | None = Field(
+        default=None,
+        description="Custom prompt for refining the plan.",
+    )
+    llm: str | Any | None = Field(
+        default=None,
+        description="LLM to use for planning. Uses agent's LLM if None.",
+    )
+
+    model_config = {"arbitrary_types_allowed": True}
--- a/lib/crewai/src/crewai/agent/utils.py
+++ b/lib/crewai/src/crewai/agent/utils.py
@@ -28,13 +28,20 @@ if TYPE_CHECKING:


 def handle_reasoning(agent: Agent, task: Task) -> None:
-    """Handle the reasoning process for an agent before task execution.
+    """Handle the reasoning/planning process for an agent before task execution.
+
+    This function checks if planning is enabled for the agent and, if so,
+    creates a plan that gets appended to the task description.
+
+    Note: This function is used by CrewAgentExecutor (legacy path).
+    For AgentExecutor, planning is handled in AgentExecutor.generate_plan().

    Args:
        agent: The agent performing the task.
        task: The task to execute.
    """
-    if not agent.reasoning:
+    # Check if planning is enabled using the planning_enabled property
+    if not getattr(agent, "planning_enabled", False):
        return

    try:
@@ -43,13 +50,13 @@ def handle_reasoning(agent: Agent, task: Task) -> None:
            AgentReasoningOutput,
        )

-        reasoning_handler = AgentReasoning(task=task, agent=agent)
-        reasoning_output: AgentReasoningOutput = (
-            reasoning_handler.handle_agent_reasoning()
+        planning_handler = AgentReasoning(agent=agent, task=task)
+        planning_output: AgentReasoningOutput = (
+            planning_handler.handle_agent_reasoning()
        )
-        task.description += f"\n\nReasoning Plan:\n{reasoning_output.plan.plan}"
+        task.description += f"\n\nPlanning:\n{planning_output.plan.plan}"
    except Exception as e:
-        agent._logger.log("error", f"Error during reasoning process: {e!s}")
+        agent._logger.log("error", f"Error during planning: {e!s}")


 def build_task_prompt_with_schema(task: Task, task_prompt: str, i18n: I18N) -> str:
--- a/lib/crewai/src/crewai/agents/crew_agent_executor.py
+++ b/lib/crewai/src/crewai/agents/crew_agent_executor.py
@@ -6,8 +6,10 @@ and memory management.

 from __future__ import annotations

+import asyncio
 from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor, as_completed
+import inspect
 import logging
 from typing import TYPE_CHECKING, Any, Literal, cast

@@ -736,7 +738,9 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                ] = []
                for call_id, func_name, func_args in parsed_calls:
                    original_tool = original_tools_by_name.get(func_name)
-                    execution_plan.append((call_id, func_name, func_args, original_tool))
+                    execution_plan.append(
+                        (call_id, func_name, func_args, original_tool)
+                    )

                self._append_assistant_tool_calls_message(
                    [
@@ -746,7 +750,9 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                )

                max_workers = min(8, len(execution_plan))
-                ordered_results: list[dict[str, Any] | None] = [None] * len(execution_plan)
+                ordered_results: list[dict[str, Any] | None] = [None] * len(
+                    execution_plan
+                )
                with ThreadPoolExecutor(max_workers=max_workers) as pool:
                    futures = {
                        pool.submit(
@@ -803,7 +809,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
            return tool_finish

        reasoning_prompt = self._i18n.slice("post_tool_reasoning")
-        reasoning_message: LLMMessage = {
+        reasoning_message = {
            "role": "user",
            "content": reasoning_prompt,
        }
@@ -908,9 +914,9 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
        elif (
            should_execute
            and original_tool
-            and getattr(original_tool, "max_usage_count", None) is not None
-            and getattr(original_tool, "current_usage_count", 0)
-            >= original_tool.max_usage_count
+            and (max_count := getattr(original_tool, "max_usage_count", None))
+            is not None
+            and getattr(original_tool, "current_usage_count", 0) >= max_count
        ):
            max_usage_reached = True

@@ -989,13 +995,17 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                        and hasattr(original_tool, "cache_function")
                        and callable(original_tool.cache_function)
                    ):
-                        should_cache = original_tool.cache_function(args_dict, raw_result)
+                        should_cache = original_tool.cache_function(
+                            args_dict, raw_result
+                        )
                    if should_cache:
                        self.tools_handler.cache.add(
                            tool=func_name, input=input_str, output=raw_result
                        )

-                result = str(raw_result) if not isinstance(raw_result, str) else raw_result
+                result = (
+                    str(raw_result) if not isinstance(raw_result, str) else raw_result
+                )
            except Exception as e:
                result = f"Error executing tool: {e}"
                if self.task:
@@ -1490,7 +1500,9 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
            formatted_answer: Current agent response.
        """
        if self.step_callback:
-            self.step_callback(formatted_answer)
+            cb_result = self.step_callback(formatted_answer)
+            if inspect.iscoroutine(cb_result):
+                asyncio.run(cb_result)

    def _append_message(
        self, text: str, role: Literal["user", "assistant", "system"] = "assistant"
--- a/lib/crewai/src/crewai/agents/planner_observer.py
+++ b/lib/crewai/src/crewai/agents/planner_observer.py
@@ -0,0 +1,309 @@
+"""PlannerObserver: Observation phase after each step execution.
+
+Implements the "Observe" phase. After every step execution, the Planner
+analyzes what happened, what new information was learned, and whether the
+remaining plan is still valid.
+
+This is NOT an error detector — it runs on every step, including successes,
+to incorporate runtime observations into the remaining plan.
+
+Refinements are structured (StepRefinement objects) and applied directly
+from the observation result — no second LLM call required.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+from crewai.events.event_bus import crewai_event_bus
+from crewai.events.types.observation_events import (
+    StepObservationCompletedEvent,
+    StepObservationFailedEvent,
+    StepObservationStartedEvent,
+)
+from crewai.utilities.i18n import I18N, get_i18n
+from crewai.utilities.llm_utils import create_llm
+from crewai.utilities.planning_types import StepObservation, TodoItem
+from crewai.utilities.types import LLMMessage
+
+
+if TYPE_CHECKING:
+    from crewai.agent import Agent
+    from crewai.task import Task
+
+logger = logging.getLogger(__name__)
+
+
+class PlannerObserver:
+    """Observes step execution results and decides on plan continuation.
+
+    After EVERY step execution, this class:
+    1. Analyzes what the step accomplished
+    2. Identifies new information learned
+    3. Decides if the remaining plan is still valid
+    4. Suggests lightweight refinements or triggers full replanning
+
+    LLM resolution (magical fallback):
+    - If ``agent.planning_config.llm`` is explicitly set → use that
+    - Otherwise → fall back to ``agent.llm`` (same LLM for everything)
+
+    Args:
+        agent: The agent instance (for LLM resolution and config).
+        task: Optional task context (for description and expected output).
+        kickoff_input: Raw input text for standalone kickoff runs; used as the
+            task description source when no ``task`` is provided.
+    """
+
+    def __init__(
+        self,
+        agent: Agent,
+        task: Task | None = None,
+        kickoff_input: str = "",
+    ) -> None:
+        self.agent = agent
+        self.task = task
+        self.kickoff_input = kickoff_input
+        self.llm = self._resolve_llm()
+        self._i18n: I18N = get_i18n()
+
+    def _resolve_llm(self) -> Any:
+        """Resolve which LLM to use for observation/planning.
+
+        Mirrors AgentReasoning._resolve_llm(): uses planning_config.llm
+        if explicitly set, otherwise falls back to agent.llm.
+
+        Returns:
+            The resolved LLM instance.
+        """
+        from crewai.llm import LLM
+
+        config = getattr(self.agent, "planning_config", None)
+        if config is not None and config.llm is not None:
+            if isinstance(config.llm, LLM):
+                return config.llm
+            return create_llm(config.llm)
+        return self.agent.llm
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def observe(
+        self,
+        completed_step: TodoItem,
+        result: str,
+        all_completed: list[TodoItem],
+        remaining_todos: list[TodoItem],
+    ) -> StepObservation:
+        """Observe a step's result and decide on plan continuation.
+
+        This runs after EVERY step execution — not just failures.
+
+        Args:
+            completed_step: The todo item that was just executed.
+            result: The final result string from the step.
+            all_completed: All previously completed todos (for context).
+            remaining_todos: The pending todos still in the plan.
+
+        Returns:
+            StepObservation with the Planner's analysis. Any suggested
+            refinements are structured StepRefinement objects ready for
+            direct application — no second LLM call needed.
+        """
+        agent_role = self.agent.role
+
+        crewai_event_bus.emit(
+            self.agent,
+            event=StepObservationStartedEvent(
+                agent_role=agent_role,
+                step_number=completed_step.step_number,
+                step_description=completed_step.description,
+                from_task=self.task,
+                from_agent=self.agent,
+            ),
+        )
+
+        messages = self._build_observation_messages(
+            completed_step, result, all_completed, remaining_todos
+        )
+
+        try:
+            response = self.llm.call(
+                messages,
+                response_model=StepObservation,
+                from_task=self.task,
+                from_agent=self.agent,
+            )
+
+            if isinstance(response, StepObservation):
+                observation = response
+            else:
+                observation = StepObservation(
+                    step_completed_successfully=True,
+                    key_information_learned=str(response) if response else "",
+                    remaining_plan_still_valid=True,
+                )
+
+            refinement_summaries = [
+                f"Step {r.step_number}: {r.new_description}"
+                for r in observation.suggested_refinements or []
+            ] or None
+
+            crewai_event_bus.emit(
+                self.agent,
+                event=StepObservationCompletedEvent(
+                    agent_role=agent_role,
+                    step_number=completed_step.step_number,
+                    step_description=completed_step.description,
+                    step_completed_successfully=observation.step_completed_successfully,
+                    key_information_learned=observation.key_information_learned,
+                    remaining_plan_still_valid=observation.remaining_plan_still_valid,
+                    needs_full_replan=observation.needs_full_replan,
+                    replan_reason=observation.replan_reason,
+                    goal_already_achieved=observation.goal_already_achieved,
+                    suggested_refinements=refinement_summaries,
+                    from_task=self.task,
+                    from_agent=self.agent,
+                ),
+            )
+
+            return observation
+
+        except Exception as e:
+            logger.warning(
+                f"Observation LLM call failed: {e}. Continuing with the current plan."
+            )
+
+            crewai_event_bus.emit(
+                self.agent,
+                event=StepObservationFailedEvent(
+                    agent_role=agent_role,
+                    step_number=completed_step.step_number,
+                    step_description=completed_step.description,
+                    error=str(e),
+                    from_task=self.task,
+                    from_agent=self.agent,
+                ),
+            )
+
+            # Don't force a full replan — the step may have succeeded even if the
+            # observer LLM failed to parse the result. Defaulting to "continue" is
+            # far less disruptive than wiping the entire plan on every observer error.
+            return StepObservation(
+                step_completed_successfully=True,
+                key_information_learned="",
+                remaining_plan_still_valid=True,
+                needs_full_replan=False,
+            )
+
+    def _extract_task_section(self, text: str) -> str:
+        """Extract the ## Task body from a structured enriched instruction.
+
+        Falls back to the full text (capped at 2000 chars) for plain inputs.
+        """
+        for marker in ("\n## Task\n", "\n## Task:", "## Task\n"):
+            idx = text.find(marker)
+            if idx >= 0:
+                start = idx + len(marker)
+                # Stop at the EARLIEST terminator so a later "---" rule cannot
+                # pull intermediate "## ..." sections into the task body.
+                hits = [text.find(m, start) for m in ("\n---\n", "\n## ")]
+                end = min((pos for pos in hits if pos >= 0), default=-1)
+                if end >= 0:
+                    return text[start:end].strip()
+                return text[start : start + 2000].strip()
+        return text[:2000] if len(text) > 2000 else text
+
+    def apply_refinements(
+        self,
+        observation: StepObservation,
+        remaining_todos: list[TodoItem],
+    ) -> list[TodoItem]:
+        """Apply structured refinements from the observation directly to todo descriptions.
+
+        No LLM call needed — refinements are already structured StepRefinement
+        objects produced by the observation call. This is a pure in-memory update.
+
+        Args:
+            observation: The observation containing structured refinements.
+            remaining_todos: The pending todos to update in-place.
+
+        Returns:
+            The same todo list with updated descriptions where refinements applied.
+        """
+        if not observation.suggested_refinements:
+            return remaining_todos
+
+        todo_by_step: dict[int, TodoItem] = {t.step_number: t for t in remaining_todos}
+        for refinement in observation.suggested_refinements:
+            if refinement.step_number in todo_by_step and refinement.new_description:
+                todo_by_step[refinement.step_number].description = refinement.new_description
+
+        return remaining_todos
+
+    # ------------------------------------------------------------------
+    # Internal: Message building
+    # ------------------------------------------------------------------
+
+    def _build_observation_messages(
+        self,
+        completed_step: TodoItem,
+        result: str,
+        all_completed: list[TodoItem],
+        remaining_todos: list[TodoItem],
+    ) -> list[LLMMessage]:
+        """Build messages for the observation LLM call."""
+        task_desc = ""
+        task_goal = ""
+        if self.task:
+            task_desc = self.task.description or ""
+            task_goal = self.task.expected_output or ""
+        elif self.kickoff_input:
+            # Standalone kickoff path — no Task object, but we have the raw input.
+            # Extract just the ## Task section so the observer sees the actual goal,
+            # not the full enriched instruction with env/tools/verification noise.
+            task_desc = self._extract_task_section(self.kickoff_input)
+            task_goal = "Complete the task successfully"
+
+        system_prompt = self._i18n.retrieve("planning", "observation_system_prompt")
+
+        # Build context of what's been done
+        completed_summary = ""
+        if all_completed:
+            completed_lines = []
+            for todo in all_completed:
+                result_preview = (todo.result or "")[:200]
+                completed_lines.append(
+                    f"  Step {todo.step_number}: {todo.description}\n"
+                    f"    Result: {result_preview}"
+                )
+            completed_summary = "\n## Previously completed steps:\n" + "\n".join(
+                completed_lines
+            )
+
+        # Build remaining plan
+        remaining_summary = ""
+        if remaining_todos:
+            remaining_lines = [
+                f"  Step {todo.step_number}: {todo.description}"
+                for todo in remaining_todos
+            ]
+            remaining_summary = "\n## Remaining plan steps:\n" + "\n".join(
+                remaining_lines
+            )
+
+        user_prompt = self._i18n.retrieve("planning", "observation_user_prompt").format(
+            task_description=task_desc,
+            task_goal=task_goal,
+            completed_summary=completed_summary,
+            step_number=completed_step.step_number,
+            step_description=completed_step.description,
+            step_result=result,
+            remaining_summary=remaining_summary,
+        )
+
+        return [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
--- a/lib/crewai/src/crewai/agents/step_executor.py
+++ b/lib/crewai/src/crewai/agents/step_executor.py
@@ -0,0 +1,608 @@
+"""StepExecutor: Isolated executor for a single plan step.
+
+Implements the action-execution side of Plan-and-Act
+(arxiv 2503.09572): the Executor receives one step description and
+runs a bounded multi-turn loop (LLM call → tool execution →
+observation) until the LLM produces a final answer for that step.
+
+Recovery from step failure (retry, replan) is the responsibility
+of PlannerObserver and AgentExecutor — keeping this class focused
+on executing one step in isolation.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from datetime import datetime
+import json
+import time
+from typing import TYPE_CHECKING, Any
+
+from pydantic import BaseModel
+
+from crewai.agents.parser import AgentAction, AgentFinish
+from crewai.events.event_bus import crewai_event_bus
+from crewai.events.types.tool_usage_events import (
+    ToolUsageErrorEvent,
+    ToolUsageFinishedEvent,
+    ToolUsageStartedEvent,
+)
+from crewai.utilities.agent_utils import (
+    build_tool_calls_assistant_message,
+    check_native_tool_support,
+    enforce_rpm_limit,
+    execute_single_native_tool_call,
+    format_message_for_llm,
+    is_tool_call_list,
+    process_llm_response,
+    setup_native_tools,
+)
+from crewai.utilities.i18n import I18N, get_i18n
+from crewai.utilities.planning_types import TodoItem
+from crewai.utilities.printer import Printer
+from crewai.utilities.step_execution_context import StepExecutionContext, StepResult
+from crewai.utilities.string_utils import sanitize_tool_name
+from crewai.utilities.tool_utils import execute_tool_and_check_finality
+from crewai.utilities.types import LLMMessage
+
+
+if TYPE_CHECKING:
+    from crewai.agent import Agent
+    from crewai.agents.tools_handler import ToolsHandler
+    from crewai.crew import Crew
+    from crewai.llms.base_llm import BaseLLM
+    from crewai.task import Task
+    from crewai.tools.base_tool import BaseTool
+    from crewai.tools.structured_tool import CrewStructuredTool
+
+
+class StepExecutor:
+    """Executes a SINGLE todo item using direct-action execution.
+
+    The StepExecutor owns its own message list per invocation. It never reads
+    or writes the AgentExecutor's state. Results flow back via StepResult.
+
+    Execution pattern (per Plan-and-Act, arxiv 2503.09572):
+        1. Build messages from todo + context
+        2. Call the LLM (with or without native tools)
+        3. If tool call → execute it → feed the observation back → repeat
+        4. If text answer → return it directly
+        Bounded inner loop — replanning is PlannerObserver's responsibility.
+
+    Args:
+        llm: The language model to use for execution.
+        tools: Structured tools available to the executor.
+        agent: The agent instance (for role/goal/verbose/config).
+        original_tools: Original BaseTool instances (needed for native tool schema).
+        tools_handler: Optional tools handler for caching and delegation tracking.
+        task: Optional task context.
+        crew: Optional crew context.
+        function_calling_llm: Optional separate LLM for function calling.
+        request_within_rpm_limit: Optional RPM limit function.
+        callbacks: Optional list of callbacks.
+        i18n: Optional i18n instance.
+    """
+
+    def __init__(
+        self,
+        llm: BaseLLM,
+        tools: list[CrewStructuredTool],
+        agent: Agent,
+        original_tools: list[BaseTool] | None = None,
+        tools_handler: ToolsHandler | None = None,
+        task: Task | None = None,
+        crew: Crew | None = None,
+        function_calling_llm: BaseLLM | Any | None = None,
+        request_within_rpm_limit: Callable[[], bool] | None = None,
+        callbacks: list[Any] | None = None,
+        i18n: I18N | None = None,
+    ) -> None:
+        """Store the collaborators and prepare native tool schemas once."""
+        self.llm = llm
+        self.tools = tools
+        self.agent = agent
+        self.original_tools = original_tools or []
+        self.tools_handler = tools_handler
+        self.task = task
+        self.crew = crew
+        self.function_calling_llm = function_calling_llm
+        self.request_within_rpm_limit = request_within_rpm_limit
+        self.callbacks = callbacks or []
+        self._i18n: I18N = i18n or get_i18n()
+        self._printer: Printer = Printer()
+
+        # Native tool calling: decide support and build schemas up front.
+        self._use_native_tools = check_native_tool_support(self.llm, self.original_tools)
+        self._openai_tools: list[dict[str, Any]] = []
+        self._available_functions: dict[str, Callable[..., Any]] = {}
+        if self._use_native_tools and self.original_tools:
+            schemas, functions = setup_native_tools(self.original_tools)
+            self._openai_tools, self._available_functions = schemas, functions
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def execute(self, todo: TodoItem, context: StepExecutionContext) -> StepResult:
+        """Run one todo item to completion and report the outcome.
+
+        Enforces the RPM limit, builds a fresh message list, and dispatches to
+        the native or text-parsed execution path depending on native tool
+        support. Any exception raised along the way is captured and reported
+        as a failed StepResult instead of propagating.
+
+        Args:
+            todo: The todo item to execute.
+            context: Context with task info and dependency results.
+
+        Returns:
+            StepResult with the outcome.
+        """
+        began = time.monotonic()
+        invoked_tools: list[str] = []
+
+        try:
+            enforce_rpm_limit(self.request_within_rpm_limit)
+            messages = self._build_isolated_messages(todo, context)
+
+            run_step = (
+                self._execute_native
+                if self._use_native_tools
+                else self._execute_text_parsed
+            )
+            result_text = run_step(messages, invoked_tools)
+            self._validate_expected_tool_usage(todo, invoked_tools)
+
+            return StepResult(
+                success=True,
+                result=result_text,
+                tool_calls_made=invoked_tools,
+                execution_time=time.monotonic() - began,
+            )
+        except Exception as e:
+            return StepResult(
+                success=False,
+                result="",
+                error=str(e),
+                tool_calls_made=invoked_tools,
+                execution_time=time.monotonic() - began,
+            )
+
+    # ------------------------------------------------------------------
+    # Internal: Message building
+    # ------------------------------------------------------------------
+
+    def _build_isolated_messages(
+        self, todo: TodoItem, context: StepExecutionContext
+    ) -> list[LLMMessage]:
+        """Assemble the two-message conversation that starts a step.
+
+        The system message carries the Executor prompt; the user message
+        carries the step-specific prompt built from the todo and its context.
+        """
+        system_text = self._build_system_prompt()
+        user_text = self._build_user_prompt(todo, context)
+
+        conversation: list[LLMMessage] = []
+        conversation.append(format_message_for_llm(system_text, role="system"))
+        conversation.append(format_message_for_llm(user_text, role="user"))
+        return conversation
+
+    def _build_system_prompt(self) -> str:
+        """Render the Executor system prompt from the agent's identity and tools."""
+        role = self.agent.role if self.agent else "Assistant"
+        goal = self.agent.goal if self.agent else "Complete tasks efficiently"
+        backstory = getattr(self.agent, "backstory", "") or ""
+
+        tools_section = ""
+        if self.tools and not self._use_native_tools:
+            joined = ", ".join(sanitize_tool_name(tool.name) for tool in self.tools)
+            template = self._i18n.retrieve("planning", "step_executor_tools_section")
+            tools_section = template.format(tool_names=joined)
+
+        prompt = self._i18n.retrieve("planning", "step_executor_system_prompt")
+        return prompt.format(
+            role=role,
+            backstory=backstory,
+            goal=goal,
+            tools_section=tools_section,
+        )
+
+    def _extract_task_section(self, task_description: str) -> str:
+        """Extract the most relevant portion of the task description.
+
+        For structured descriptions (e.g. harbor_agent-style with ## Task
+        and ## Instructions sections), extracts just the task body so the
+        executor sees the requirements without duplicating tool/verification
+        instructions that are already in the system prompt.
+
+        For plain descriptions, returns the text, cut at 2000 chars with a marker.
+        """
+        # Isolate the task spec: text between "## Task" and its first terminator.
+        for marker in ("\n## Task\n", "\n## Task:", "## Task\n"):
+            idx = task_description.find(marker)
+            if idx >= 0:
+                start = idx + len(marker)
+                # End at whichever terminator comes FIRST in the text, so a
+                # later "---" cannot pull intermediate "## ..." sections in.
+                hits = [task_description.find(m, start) for m in ("\n---\n", "\n## ")]
+                end = min((pos for pos in hits if pos >= 0), default=-1)
+                if end >= 0:
+                    return task_description[start:end].strip()
+                # No end marker — take up to 2000 chars
+                return task_description[start : start + 2000].strip()
+
+        # No structured format — use the full description, reasonably truncated
+        if len(task_description) > 2000:
+            return task_description[:2000] + "\n... [truncated]"
+        return task_description
+
+    def _build_user_prompt(self, todo: TodoItem, context: StepExecutionContext) -> str:
+        """Compose the step-specific user prompt, one section per line."""
+
+        def template(key: str) -> str:
+            return self._i18n.retrieve("planning", key)
+
+        sections: list[str] = []
+
+        # Overall task context comes first so the executor knows the full goal
+        # and the required output format/location. Only the task body is used,
+        # which avoids repeating directives already in the system prompt.
+        if context.task_description:
+            task_body = self._extract_task_section(context.task_description)
+            if task_body:
+                sections.append(
+                    template("step_executor_task_context").format(
+                        task_context=task_body,
+                    )
+                )
+
+        sections.append(
+            template("step_executor_user_prompt").format(
+                step_description=todo.description,
+            )
+        )
+
+        if todo.tool_to_use:
+            sections.append(
+                template("step_executor_suggested_tool").format(
+                    tool_to_use=todo.tool_to_use,
+                )
+            )
+
+        # Dependency results: final results only, no traces.
+        if context.dependency_results:
+            sections.append(template("step_executor_context_header"))
+            for step_num, result in sorted(context.dependency_results.items()):
+                sections.append(
+                    template("step_executor_context_entry").format(
+                        step_number=step_num, result=result
+                    )
+                )
+
+        sections.append(template("step_executor_complete_step"))
+        return "\n".join(sections)
+
+    # ------------------------------------------------------------------
+    # Internal: Multi-turn execution loop
+    # ------------------------------------------------------------------
+
+    def _execute_text_parsed(
+        self,
+        messages: list[LLMMessage],
+        tool_calls_made: list[str],
+        max_step_iterations: int = 15,
+    ) -> str:
+        """Drive one step through the text-parsed tool-calling loop.
+
+        Each turn calls the LLM and parses the reply. A Final Answer (or a raw
+        reply with no recognised action) ends the step; a tool action is run
+        and its observation appended to ``messages`` for the next turn. When
+        max_step_iterations turns pass without an answer, the most recent
+        tool result is returned ("" if no tool ran).
+
+        Raises:
+            ValueError: If the LLM returns an empty response.
+        """
+        use_stop_words = self.llm.supports_stop_words() if self.llm else False
+        latest_observation = ""
+
+        for _turn in range(max_step_iterations):
+            reply = self.llm.call(
+                messages,
+                callbacks=self.callbacks,
+                from_task=self.task,
+                from_agent=self.agent,
+            )
+            if not reply:
+                raise ValueError("Empty response from LLM")
+
+            reply_text = str(reply)
+            parsed = process_llm_response(reply_text, use_stop_words)
+
+            if isinstance(parsed, AgentFinish):
+                return str(parsed.output)
+            if not isinstance(parsed, AgentAction):
+                # Raw text with no Final Answer marker — treat as done.
+                return reply_text
+
+            tool_calls_made.append(parsed.tool)
+            latest_observation = self._execute_text_tool_with_events(parsed)
+            # Record the assistant turn, then the observation. The observation
+            # builder handles vision sentinels so the LLM gets an image block
+            # instead of raw base64 text.
+            messages.append({"role": "assistant", "content": reply_text})
+            messages.append(self._build_observation_message(latest_observation))
+
+        # Iteration budget exhausted — fall back to the last tool result.
+        return latest_observation
+
+    def _execute_text_tool_with_events(self, formatted: AgentAction) -> str:
+        """Run one text-parsed tool action and emit tool usage events around it.
+
+        A ``ToolUsageStartedEvent`` is emitted before the tool runs. If the
+        execution raises, a ``ToolUsageErrorEvent`` is emitted and the original
+        exception propagates; otherwise a ``ToolUsageFinishedEvent`` carrying
+        the stringified result and the start/finish timestamps is emitted.
+
+        Args:
+            formatted: Parsed action holding the tool name and its raw input.
+
+        Returns:
+            The tool result converted to ``str``.
+        """
+        payload = self._parse_tool_args(formatted.tool_input)
+        key = "unknown"
+        if self.agent:
+            key = getattr(self.agent, "key", "unknown")
+        began = datetime.now()
+
+        # Fields common to the started / error / finished events.
+        shared_fields: dict[str, Any] = {
+            "tool_name": formatted.tool,
+            "tool_args": payload,
+            "from_agent": self.agent,
+            "from_task": self.task,
+            "agent_key": key,
+        }
+        crewai_event_bus.emit(self, event=ToolUsageStartedEvent(**shared_fields))
+
+        try:
+            # The fingerprint is optional: it is only forwarded when the agent
+            # exposes ``security_config.fingerprint``.
+            fingerprint_context = {}
+            if self.agent and hasattr(self.agent, "security_config"):
+                if hasattr(self.agent.security_config, "fingerprint"):
+                    fingerprint_context = {
+                        "agent_fingerprint": str(
+                            self.agent.security_config.fingerprint
+                        )
+                    }
+
+            outcome = execute_tool_and_check_finality(
+                agent_action=formatted,
+                fingerprint_context=fingerprint_context,
+                tools=self.tools,
+                i18n=self._i18n,
+                agent_key=self.agent.key if self.agent else None,
+                agent_role=self.agent.role if self.agent else None,
+                tools_handler=self.tools_handler,
+                task=self.task,
+                agent=self.agent,
+                function_calling_llm=self.function_calling_llm,
+                crew=self.crew,
+            )
+        except Exception as exc:
+            # Report the failure, then let the caller see the original error.
+            crewai_event_bus.emit(
+                self,
+                event=ToolUsageErrorEvent(**shared_fields, error=exc),
+            )
+            raise
+
+        output_text = str(outcome.result)
+        crewai_event_bus.emit(
+            self,
+            event=ToolUsageFinishedEvent(
+                output=output_text,
+                started_at=began,
+                finished_at=datetime.now(),
+                **shared_fields,
+            ),
+        )
+        return output_text
+
+    def _parse_tool_args(self, tool_input: Any) -> dict[str, Any]:
+        """Normalise parser-produced tool input into a dict payload for events.
+
+        Args:
+            tool_input: Whatever the parser extracted as the tool input.
+
+        Returns:
+            * the input itself when it is already a dict;
+            * ``{}`` for a blank string;
+            * the decoded object when a string decodes to a JSON object;
+            * ``{"input": value}`` for any other JSON value, for a string
+              that is not valid JSON (stripped), or for a non-string input
+              (stringified).
+        """
+        if isinstance(tool_input, dict):
+            return tool_input
+        if not isinstance(tool_input, str):
+            return {"input": str(tool_input)}
+
+        text = tool_input.strip()
+        if not text:
+            return {}
+        try:
+            decoded = json.loads(text)
+        except json.JSONDecodeError:
+            # Not JSON at all: keep the raw (stripped) text.
+            return {"input": text}
+        return decoded if isinstance(decoded, dict) else {"input": decoded}
+
+    # ------------------------------------------------------------------
+    # Internal: Vision support
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _parse_vision_sentinel(raw: str) -> tuple[str, str] | None:
+        """Split a ``VISION_IMAGE:<media_type>:<base64_data>`` sentinel.
+
+        Args:
+            raw: Tool output that may or may not be a vision sentinel.
+
+        Returns:
+            ``(media_type, base64_data)`` when ``raw`` starts with the
+            ``VISION_IMAGE:`` prefix followed by a non-empty media type and a
+            colon; ``None`` otherwise. The data part may be empty and may
+            itself contain colons (only the first colon separates).
+        """
+        marker = "VISION_IMAGE:"
+        if not raw.startswith(marker):
+            return None
+        media_type, colon, payload = raw[len(marker):].partition(":")
+        # A missing separator or an empty media type means "not a sentinel".
+        if not colon or not media_type:
+            return None
+        return media_type, payload
+
+    @staticmethod
+    def _build_observation_message(tool_result: str) -> LLMMessage:
+        """Wrap a tool result as the user-role observation message.
+
+        A plain result becomes ``"Observation: <result>"``. A result in the
+        ``VISION_IMAGE:<media_type>:<base64_data>`` sentinel form is instead
+        turned into a two-part content list: a short text block followed by an
+        ``image_url`` block whose URL is a ``data:`` URI, so the image is sent
+        as an image rather than as raw base64 text.
+
+        Args:
+            tool_result: The stringified tool output.
+
+        Returns:
+            A message dict with ``role`` set to ``"user"``.
+        """
+        sentinel = StepExecutor._parse_vision_sentinel(tool_result)
+        if not sentinel:
+            return {"role": "user", "content": f"Observation: {tool_result}"}
+
+        media_type, b64_data = sentinel
+        data_uri = f"data:{media_type};base64,{b64_data}"
+        return {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Observation: Here is the image:"},
+                {"type": "image_url", "image_url": {"url": data_uri}},
+            ],
+        }
+
+    def _validate_expected_tool_usage(
+        self,
+        todo: TodoItem,
+        tool_calls_made: list[str],
+    ) -> None:
+        """Fail step execution when a required tool is configured but not called.
+
+        The check is skipped when the todo names no tool, or when the named
+        tool is not among the executor's tools / available functions (there
+        was nothing the step could have called).
+
+        Args:
+            todo: The plan step; its optional ``tool_to_use`` is the
+                requirement being enforced.
+            tool_calls_made: Names of the tools invoked during the step.
+
+        Raises:
+            ValueError: If the required tool is available but was not called.
+        """
+        required = getattr(todo, "tool_to_use", None)
+        if not required:
+            return
+        required_name = sanitize_tool_name(required)
+
+        # Compare on sanitized names throughout so spelling variants match.
+        known_names: set[str] = set()
+        for tool in self.tools:
+            if getattr(tool, "name", ""):
+                known_names.add(sanitize_tool_name(tool.name))
+        known_names.update(self._available_functions.keys())
+        if required_name not in known_names:
+            return
+
+        invoked_names = set(map(sanitize_tool_name, tool_calls_made))
+        if required_name in invoked_names:
+            return
+        raise ValueError(
+            f"Expected tool '{required_name}' was not called "
+            f"for step {todo.step_number}."
+        )
+
+    def _execute_native(
+        self,
+        messages: list[LLMMessage],
+        tool_calls_made: list[str],
+        max_step_iterations: int = 15,
+    ) -> str:
+        """Execute step using native function calling with a multi-turn loop.
+
+        Iterates LLM call → tool execution → appended results until one of:
+
+        * the LLM returns a text answer (no more tool calls);
+        * the LLM returns a ``BaseModel`` (returned as its JSON dump);
+        * a tool flagged ``result_as_answer`` runs — its result ends the step;
+        * ``max_step_iterations`` is reached.
+
+        Args:
+            messages: Conversation so far; extended in place with assistant
+                tool-call messages and tool outputs.
+            tool_calls_made: Extended in place with the name of every tool
+                that was executed.
+            max_step_iterations: Upper bound on LLM calls for this step.
+
+        Returns:
+            The final text for the step. When the iteration cap is hit, the
+            non-empty tool results gathered so far, joined by newlines.
+
+        Raises:
+            ValueError: If the LLM returns an empty response.
+        """
+        accumulated_results: list[str] = []
+
+        for _ in range(max_step_iterations):
+            answer = self.llm.call(
+                messages,
+                tools=self._openai_tools,
+                callbacks=self.callbacks,
+                from_task=self.task,
+                from_agent=self.agent,
+            )
+
+            if not answer:
+                raise ValueError("Empty response from LLM")
+
+            if isinstance(answer, BaseModel):
+                return answer.model_dump_json()
+
+            if isinstance(answer, list) and answer and is_tool_call_list(answer):
+                # The batch runner appends assistant + tool messages to
+                # `messages` as a side-effect, so the next LLM call sees the
+                # full conversation history including tool outputs.
+                result, is_final_answer = self._run_native_tool_batch(
+                    answer, messages, tool_calls_made
+                )
+                if is_final_answer:
+                    # A result_as_answer tool ends the step. The batch was cut
+                    # short, so `messages` holds an assistant tool-call entry
+                    # without its tool replies and must not be sent back to
+                    # the LLM for another turn.
+                    return result
+                accumulated_results.append(result)
+                continue
+
+            # Text answer → LLM decided the step is done
+            return str(answer)
+
+        # Max iterations reached — return everything we accumulated
+        return "\n".join(filter(None, accumulated_results))
+
+    def _execute_native_tool_calls(
+        self,
+        tool_calls: list[Any],
+        messages: list[LLMMessage],
+        tool_calls_made: list[str],
+    ) -> str:
+        """Execute a batch of native tool calls and return their results.
+
+        Returns the result of the first tool marked result_as_answer if any,
+        otherwise returns all tool results concatenated with newlines.
+
+        This wrapper keeps the original string-only return; callers that
+        need to know whether the string is a final answer should use
+        ``_run_native_tool_batch``.
+        """
+        result, _is_final_answer = self._run_native_tool_batch(
+            tool_calls, messages, tool_calls_made
+        )
+        return result
+
+    def _run_native_tool_batch(
+        self,
+        tool_calls: list[Any],
+        messages: list[LLMMessage],
+        tool_calls_made: list[str],
+    ) -> tuple[str, bool]:
+        """Execute a batch of native tool calls, reporting finality.
+
+        Args:
+            tool_calls: The tool calls returned by the LLM for this turn.
+            messages: Extended in place with the assistant tool-call message
+                and one message per executed tool that produced one.
+            tool_calls_made: Extended in place with executed tool names.
+
+        Returns:
+            ``(result, is_final_answer)``. ``is_final_answer`` is ``True``
+            when a tool marked ``result_as_answer`` ran; ``result`` is then
+            that tool's output and the remaining calls in the batch are not
+            executed. Otherwise ``result`` is the newline-joined tool outputs
+            (``"[image]"`` stands in for vision results) and the flag is
+            ``False``.
+        """
+        assistant_message, _reports = build_tool_calls_assistant_message(tool_calls)
+        if assistant_message:
+            messages.append(assistant_message)
+
+        tool_results: list[str] = []
+        for tool_call in tool_calls:
+            call_result = execute_single_native_tool_call(
+                tool_call,
+                available_functions=self._available_functions,
+                original_tools=self.original_tools,
+                structured_tools=self.tools,
+                tools_handler=self.tools_handler,
+                agent=self.agent,
+                task=self.task,
+                crew=self.crew,
+                event_source=self,
+                printer=self._printer,
+                verbose=bool(self.agent and self.agent.verbose),
+            )
+
+            if call_result.func_name:
+                tool_calls_made.append(call_result.func_name)
+
+            if call_result.result_as_answer:
+                return str(call_result.result), True
+
+            tool_message = call_result.tool_message
+            if not tool_message:
+                continue
+
+            raw_content = tool_message.get("content", "")
+            parsed = (
+                self._parse_vision_sentinel(raw_content)
+                if isinstance(raw_content, str)
+                else None
+            )
+            if parsed:
+                media_type, b64_data = parsed
+                # Replace the sentinel with a standard image_url content block
+                # (data URI) instead of passing raw base64 text to the LLM.
+                modified = dict(tool_message)
+                modified["content"] = [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:{media_type};base64,{b64_data}",
+                        },
+                    }
+                ]
+                messages.append(modified)
+                tool_results.append("[image]")
+                continue
+
+            messages.append(tool_message)
+            if raw_content:
+                tool_results.append(str(raw_content))
+
+        return "\n".join(tool_results), False
--- a/lib/crewai/src/crewai/cli/authentication/main.py
+++ b/lib/crewai/src/crewai/cli/authentication/main.py
@@ -2,8 +2,8 @@ import time
 from typing import TYPE_CHECKING, Any, TypeVar, cast
 import webbrowser

+import httpx
 from pydantic import BaseModel, Field
-import requests
 from rich.console import Console

 from crewai.cli.authentication.utils import validate_jwt_token
@@ -98,7 +98,7 @@ class AuthenticationCommand:
            "scope": " ".join(self.oauth2_provider.get_oauth_scopes()),
            "audience": self.oauth2_provider.get_audience(),
        }
-        response = requests.post(
+        response = httpx.post(
            url=self.oauth2_provider.get_authorize_url(),
            data=device_code_payload,
            timeout=20,
@@ -130,7 +130,7 @@ class AuthenticationCommand:

        attempts = 0
        while True and attempts < 10:
-            response = requests.post(
+            response = httpx.post(
                self.oauth2_provider.get_token_url(), data=token_payload, timeout=30
            )
            token_data = response.json()
@@ -149,7 +149,7 @@ class AuthenticationCommand:
                return

            if token_data["error"] not in ("authorization_pending", "slow_down"):
-                raise requests.HTTPError(
+                raise httpx.HTTPError(
                    token_data.get("error_description") or token_data.get("error")
                )

--- a/lib/crewai/src/crewai/cli/command.py
+++ b/lib/crewai/src/crewai/cli/command.py
@@ -1,5 +1,6 @@
-import requests
-from requests.exceptions import JSONDecodeError
+import json
+
+import httpx
 from rich.console import Console

 from crewai.cli.authentication.token import get_auth_token
@@ -30,16 +31,16 @@ class PlusAPIMixin:
            console.print("Run 'crewai login' to sign up/login.", style="bold green")
            raise SystemExit from None

-    def _validate_response(self, response: requests.Response) -> None:
+    def _validate_response(self, response: httpx.Response) -> None:
        """
        Handle and display error messages from API responses.

        Args:
-            response (requests.Response): The response from the Plus API
+            response (httpx.Response): The response from the Plus API
        """
        try:
            json_response = response.json()
-        except (JSONDecodeError, ValueError):
+        except (json.JSONDecodeError, ValueError):
            console.print(
                "Failed to parse response from Enterprise API failed. Details:",
                style="bold red",
@@ -62,7 +63,7 @@ class PlusAPIMixin:
                    )
            raise SystemExit

-        if not response.ok:
+        if not response.is_success:
            console.print(
                "Request to Enterprise API failed. Details:", style="bold red"
            )
--- a/lib/crewai/src/crewai/cli/enterprise/main.py
+++ b/lib/crewai/src/crewai/cli/enterprise/main.py
@@ -1,7 +1,7 @@
+import json
 from typing import Any, cast

-import requests
-from requests.exceptions import JSONDecodeError, RequestException
+import httpx
 from rich.console import Console

 from crewai.cli.authentication.main import Oauth2Settings, ProviderFactory
@@ -47,12 +47,12 @@ class EnterpriseConfigureCommand(BaseCommand):
                "User-Agent": f"CrewAI-CLI/{get_crewai_version()}",
                "X-Crewai-Version": get_crewai_version(),
            }
-            response = requests.get(oauth_endpoint, timeout=30, headers=headers)
+            response = httpx.get(oauth_endpoint, timeout=30, headers=headers)
            response.raise_for_status()

            try:
                oauth_config = response.json()
-            except JSONDecodeError as e:
+            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON response from {oauth_endpoint}") from e

            self._validate_oauth_config(oauth_config)
@@ -62,7 +62,7 @@ class EnterpriseConfigureCommand(BaseCommand):
            )
            return cast(dict[str, Any], oauth_config)

-        except RequestException as e:
+        except httpx.HTTPError as e:
            raise ValueError(f"Failed to connect to enterprise URL: {e!s}") from e
        except Exception as e:
            raise ValueError(f"Error fetching OAuth2 configuration: {e!s}") from e
--- a/lib/crewai/src/crewai/cli/organization/main.py
+++ b/lib/crewai/src/crewai/cli/organization/main.py
@@ -1,4 +1,4 @@
-from requests import HTTPError
+from httpx import HTTPStatusError
 from rich.console import Console
 from rich.table import Table

@@ -10,11 +10,11 @@ console = Console()


 class OrganizationCommand(BaseCommand, PlusAPIMixin):
-    def __init__(self):
+    def __init__(self) -> None:
        BaseCommand.__init__(self)
        PlusAPIMixin.__init__(self, telemetry=self._telemetry)

-    def list(self):
+    def list(self) -> None:
        try:
            response = self.plus_api_client.get_organizations()
            response.raise_for_status()
@@ -33,7 +33,7 @@ class OrganizationCommand(BaseCommand, PlusAPIMixin):
                table.add_row(org["name"], org["uuid"])

            console.print(table)
-        except HTTPError as e:
+        except HTTPStatusError as e:
            if e.response.status_code == 401:
                console.print(
                    "You are not logged in to any organization. Use 'crewai login' to login.",
@@ -50,7 +50,7 @@ class OrganizationCommand(BaseCommand, PlusAPIMixin):
            )
            raise SystemExit(1) from e

-    def switch(self, org_id):
+    def switch(self, org_id: str) -> None:
        try:
            response = self.plus_api_client.get_organizations()
            response.raise_for_status()
@@ -72,7 +72,7 @@ class OrganizationCommand(BaseCommand, PlusAPIMixin):
                f"Successfully switched to {org['name']} ({org['uuid']})",
                style="bold green",
            )
-        except HTTPError as e:
+        except HTTPStatusError as e:
            if e.response.status_code == 401:
                console.print(
                    "You are not logged in to any organization. Use 'crewai login' to login.",
@@ -87,7 +87,7 @@ class OrganizationCommand(BaseCommand, PlusAPIMixin):
            console.print(f"Failed to switch organization: {e!s}", style="bold red")
            raise SystemExit(1) from e

-    def current(self):
+    def current(self) -> None:
        settings = Settings()
        if settings.org_uuid:
            console.print(
--- a/lib/crewai/src/crewai/cli/plus_api.py
+++ b/lib/crewai/src/crewai/cli/plus_api.py
@@ -3,7 +3,6 @@ from typing import Any
 from urllib.parse import urljoin

 import httpx
-import requests

 from crewai.cli.config import Settings
 from crewai.cli.constants import DEFAULT_CREWAI_ENTERPRISE_URL
@@ -43,16 +42,16 @@ class PlusAPI:

    def _make_request(
        self, method: str, endpoint: str, **kwargs: Any
-    ) -> requests.Response:
+    ) -> httpx.Response:
        url = urljoin(self.base_url, endpoint)
-        session = requests.Session()
-        session.trust_env = False
-        return session.request(method, url, headers=self.headers, **kwargs)
+        # requests followed redirects by default while httpx does not; opt in
+        # so 3xx responses keep being resolved as before the migration. A
+        # caller can still pass follow_redirects=False explicitly.
+        kwargs.setdefault("follow_redirects", True)
+        # `verify` is a client-level option in httpx, so lift it out of the
+        # per-request kwargs before forwarding them.
+        verify = kwargs.pop("verify", True)
+        with httpx.Client(trust_env=False, verify=verify) as client:
+            return client.request(method, url, headers=self.headers, **kwargs)

-    def login_to_tool_repository(self) -> requests.Response:
+    def login_to_tool_repository(self) -> httpx.Response:
        return self._make_request("POST", f"{self.TOOLS_RESOURCE}/login")

-    def get_tool(self, handle: str) -> requests.Response:
+    def get_tool(self, handle: str) -> httpx.Response:
        return self._make_request("GET", f"{self.TOOLS_RESOURCE}/{handle}")

    async def get_agent(self, handle: str) -> httpx.Response:
@@ -68,7 +67,7 @@ class PlusAPI:
        description: str | None,
        encoded_file: str,
        available_exports: list[dict[str, Any]] | None = None,
-    ) -> requests.Response:
+    ) -> httpx.Response:
        params = {
            "handle": handle,
            "public": is_public,
@@ -79,54 +78,52 @@ class PlusAPI:
        }
        return self._make_request("POST", f"{self.TOOLS_RESOURCE}", json=params)

-    def deploy_by_name(self, project_name: str) -> requests.Response:
+    def deploy_by_name(self, project_name: str) -> httpx.Response:
        return self._make_request(
            "POST", f"{self.CREWS_RESOURCE}/by-name/{project_name}/deploy"
        )

-    def deploy_by_uuid(self, uuid: str) -> requests.Response:
+    def deploy_by_uuid(self, uuid: str) -> httpx.Response:
        return self._make_request("POST", f"{self.CREWS_RESOURCE}/{uuid}/deploy")

-    def crew_status_by_name(self, project_name: str) -> requests.Response:
+    def crew_status_by_name(self, project_name: str) -> httpx.Response:
        return self._make_request(
            "GET", f"{self.CREWS_RESOURCE}/by-name/{project_name}/status"
        )

-    def crew_status_by_uuid(self, uuid: str) -> requests.Response:
+    def crew_status_by_uuid(self, uuid: str) -> httpx.Response:
        return self._make_request("GET", f"{self.CREWS_RESOURCE}/{uuid}/status")

    def crew_by_name(
        self, project_name: str, log_type: str = "deployment"
-    ) -> requests.Response:
+    ) -> httpx.Response:
        return self._make_request(
            "GET", f"{self.CREWS_RESOURCE}/by-name/{project_name}/logs/{log_type}"
        )

-    def crew_by_uuid(
-        self, uuid: str, log_type: str = "deployment"
-    ) -> requests.Response:
+    def crew_by_uuid(self, uuid: str, log_type: str = "deployment") -> httpx.Response:
        return self._make_request(
            "GET", f"{self.CREWS_RESOURCE}/{uuid}/logs/{log_type}"
        )

-    def delete_crew_by_name(self, project_name: str) -> requests.Response:
+    def delete_crew_by_name(self, project_name: str) -> httpx.Response:
        return self._make_request(
            "DELETE", f"{self.CREWS_RESOURCE}/by-name/{project_name}"
        )

-    def delete_crew_by_uuid(self, uuid: str) -> requests.Response:
+    def delete_crew_by_uuid(self, uuid: str) -> httpx.Response:
        return self._make_request("DELETE", f"{self.CREWS_RESOURCE}/{uuid}")

-    def list_crews(self) -> requests.Response:
+    def list_crews(self) -> httpx.Response:
        return self._make_request("GET", self.CREWS_RESOURCE)

-    def create_crew(self, payload: dict[str, Any]) -> requests.Response:
+    def create_crew(self, payload: dict[str, Any]) -> httpx.Response:
        return self._make_request("POST", self.CREWS_RESOURCE, json=payload)

-    def get_organizations(self) -> requests.Response:
+    def get_organizations(self) -> httpx.Response:
        return self._make_request("GET", self.ORGANIZATIONS_RESOURCE)

-    def initialize_trace_batch(self, payload: dict[str, Any]) -> requests.Response:
+    def initialize_trace_batch(self, payload: dict[str, Any]) -> httpx.Response:
        return self._make_request(
            "POST",
            f"{self.TRACING_RESOURCE}/batches",
@@ -136,7 +133,7 @@ class PlusAPI:

    def initialize_ephemeral_trace_batch(
        self, payload: dict[str, Any]
-    ) -> requests.Response:
+    ) -> httpx.Response:
        return self._make_request(
            "POST",
            f"{self.EPHEMERAL_TRACING_RESOURCE}/batches",
@@ -145,7 +142,7 @@ class PlusAPI:

    def send_trace_events(
        self, trace_batch_id: str, payload: dict[str, Any]
-    ) -> requests.Response:
+    ) -> httpx.Response:
        return self._make_request(
            "POST",
            f"{self.TRACING_RESOURCE}/batches/{trace_batch_id}/events",
@@ -155,7 +152,7 @@ class PlusAPI:

    def send_ephemeral_trace_events(
        self, trace_batch_id: str, payload: dict[str, Any]
-    ) -> requests.Response:
+    ) -> httpx.Response:
        return self._make_request(
            "POST",
            f"{self.EPHEMERAL_TRACING_RESOURCE}/batches/{trace_batch_id}/events",
@@ -165,7 +162,7 @@ class PlusAPI:

    def finalize_trace_batch(
        self, trace_batch_id: str, payload: dict[str, Any]
-    ) -> requests.Response:
+    ) -> httpx.Response:
        return self._make_request(
            "PATCH",
            f"{self.TRACING_RESOURCE}/batches/{trace_batch_id}/finalize",
@@ -175,7 +172,7 @@ class PlusAPI:

    def finalize_ephemeral_trace_batch(
        self, trace_batch_id: str, payload: dict[str, Any]
-    ) -> requests.Response:
+    ) -> httpx.Response:
        return self._make_request(
            "PATCH",
            f"{self.EPHEMERAL_TRACING_RESOURCE}/batches/{trace_batch_id}/finalize",
@@ -185,7 +182,7 @@ class PlusAPI:

    def mark_trace_batch_as_failed(
        self, trace_batch_id: str, error_message: str
-    ) -> requests.Response:
+    ) -> httpx.Response:
        return self._make_request(
            "PATCH",
            f"{self.TRACING_RESOURCE}/batches/{trace_batch_id}",
@@ -193,13 +190,11 @@ class PlusAPI:
            timeout=30,
        )

-    def get_triggers(self) -> requests.Response:
+    def get_triggers(self) -> httpx.Response:
        """Get all available triggers from integrations."""
        return self._make_request("GET", f"{self.INTEGRATIONS_RESOURCE}/apps")

-    def get_trigger_payload(
-        self, app_slug: str, trigger_slug: str
-    ) -> requests.Response:
+    def get_trigger_payload(self, app_slug: str, trigger_slug: str) -> httpx.Response:
        """Get sample payload for a specific trigger."""
        return self._make_request(
            "GET", f"{self.INTEGRATIONS_RESOURCE}/{app_slug}/{trigger_slug}/payload"
--- a/lib/crewai/src/crewai/cli/provider.py
+++ b/lib/crewai/src/crewai/cli/provider.py
@@ -8,7 +8,7 @@ from typing import Any

 import certifi
 import click
-import requests
+import httpx

 from crewai.cli.constants import JSON_URL, MODELS, PROVIDERS

@@ -165,20 +165,20 @@ def fetch_provider_data(cache_file: Path) -> dict[str, Any] | None:
    ssl_config = os.environ["SSL_CERT_FILE"] = certifi.where()

    try:
-        response = requests.get(JSON_URL, stream=True, timeout=60, verify=ssl_config)
-        response.raise_for_status()
-        data = download_data(response)
-        with open(cache_file, "w") as f:
-            json.dump(data, f)
-        return data
-    except requests.RequestException as e:
+        with httpx.stream("GET", JSON_URL, timeout=60, verify=ssl_config) as response:
+            response.raise_for_status()
+            data = download_data(response)
+            with open(cache_file, "w") as f:
+                json.dump(data, f)
+            return data
+    except httpx.HTTPError as e:
        click.secho(f"Error fetching provider data: {e}", fg="red")
    except json.JSONDecodeError:
        click.secho("Error parsing provider data. Invalid JSON format.", fg="red")
    return None


-def download_data(response: requests.Response) -> dict[str, Any]:
+def download_data(response: httpx.Response) -> dict[str, Any]:
    """Downloads data from a given HTTP response and returns the JSON content.

    Args:
@@ -194,7 +194,7 @@ def download_data(response: requests.Response) -> dict[str, Any]:
    with click.progressbar(
        length=total_size, label="Downloading", show_pos=True
    ) as bar:
-        for chunk in response.iter_content(block_size):
+        for chunk in response.iter_bytes(block_size):
            if chunk:
                data_chunks.append(chunk)
                bar.update(len(chunk))
--- a/lib/crewai/src/crewai/events/event_listener.py
+++ b/lib/crewai/src/crewai/events/event_listener.py
@@ -74,6 +74,14 @@ from crewai.events.types.mcp_events import (
    MCPToolExecutionFailedEvent,
    MCPToolExecutionStartedEvent,
 )
+from crewai.events.types.observation_events import (
+    GoalAchievedEarlyEvent,
+    PlanRefinementEvent,
+    PlanReplanTriggeredEvent,
+    StepObservationCompletedEvent,
+    StepObservationFailedEvent,
+    StepObservationStartedEvent,
+)
 from crewai.events.types.reasoning_events import (
    AgentReasoningCompletedEvent,
    AgentReasoningFailedEvent,
@@ -534,6 +542,64 @@ class EventListener(BaseEventListener):
                event.error,
            )

+        # ----------- OBSERVATION EVENTS (Plan-and-Execute) -----------
+        # Each handler below unpacks the fields of one observation event and
+        # passes them positionally to the matching formatter method.
+
+        @crewai_event_bus.on(StepObservationStartedEvent)
+        def on_step_observation_started(
+            _: Any, event: StepObservationStartedEvent
+        ) -> None:
+            """Forward agent role, step number and step description to the formatter."""
+            self.formatter.handle_observation_started(
+                event.agent_role,
+                event.step_number,
+                event.step_description,
+            )
+
+        @crewai_event_bus.on(StepObservationCompletedEvent)
+        def on_step_observation_completed(
+            _: Any, event: StepObservationCompletedEvent
+        ) -> None:
+            """Forward the observation verdict fields to the formatter."""
+            self.formatter.handle_observation_completed(
+                event.agent_role,
+                event.step_number,
+                event.step_completed_successfully,
+                event.remaining_plan_still_valid,
+                event.key_information_learned,
+                event.needs_full_replan,
+                event.goal_already_achieved,
+            )
+
+        @crewai_event_bus.on(StepObservationFailedEvent)
+        def on_step_observation_failed(
+            _: Any, event: StepObservationFailedEvent
+        ) -> None:
+            """Forward the step number and error of a failed observation."""
+            self.formatter.handle_observation_failed(
+                event.step_number,
+                event.error,
+            )
+
+        @crewai_event_bus.on(PlanRefinementEvent)
+        def on_plan_refinement(_: Any, event: PlanRefinementEvent) -> None:
+            """Forward the step number, refined step count and refinements."""
+            self.formatter.handle_plan_refinement(
+                event.step_number,
+                event.refined_step_count,
+                event.refinements,
+            )
+
+        @crewai_event_bus.on(PlanReplanTriggeredEvent)
+        def on_plan_replan_triggered(_: Any, event: PlanReplanTriggeredEvent) -> None:
+            """Forward the replan reason, replan count and preserved-step count."""
+            self.formatter.handle_plan_replan(
+                event.replan_reason,
+                event.replan_count,
+                event.completed_steps_preserved,
+            )
+
+        @crewai_event_bus.on(GoalAchievedEarlyEvent)
+        def on_goal_achieved_early(_: Any, event: GoalAchievedEarlyEvent) -> None:
+            """Forward the completed and remaining step counts to the formatter."""
+            self.formatter.handle_goal_achieved_early(
+                event.steps_completed,
+                event.steps_remaining,
+            )
+
        # ----------- AGENT LOGGING EVENTS -----------

        @crewai_event_bus.on(AgentLogsStartedEvent)
--- a/lib/crewai/src/crewai/events/listeners/tracing/trace_listener.py
+++ b/lib/crewai/src/crewai/events/listeners/tracing/trace_listener.py
@@ -93,6 +93,14 @@ from crewai.events.types.memory_events import (
    MemorySaveFailedEvent,
    MemorySaveStartedEvent,
 )
+from crewai.events.types.observation_events import (
+    GoalAchievedEarlyEvent,
+    PlanRefinementEvent,
+    PlanReplanTriggeredEvent,
+    StepObservationCompletedEvent,
+    StepObservationFailedEvent,
+    StepObservationStartedEvent,
+)
 from crewai.events.types.reasoning_events import (
    AgentReasoningCompletedEvent,
    AgentReasoningFailedEvent,
@@ -437,6 +445,39 @@ class TraceCollectionListener(BaseEventListener):
        ) -> None:
            self._handle_action_event("agent_reasoning_failed", source, event)

+        # Observation events (Plan-and-Execute)
+        # Every handler hands the source and event to _handle_action_event
+        # together with a fixed string naming the event.
+        @event_bus.on(StepObservationStartedEvent)
+        def on_step_observation_started(
+            source: Any, event: StepObservationStartedEvent
+        ) -> None:
+            """Pass the event on as "step_observation_started"."""
+            self._handle_action_event("step_observation_started", source, event)
+
+        @event_bus.on(StepObservationCompletedEvent)
+        def on_step_observation_completed(
+            source: Any, event: StepObservationCompletedEvent
+        ) -> None:
+            """Pass the event on as "step_observation_completed"."""
+            self._handle_action_event("step_observation_completed", source, event)
+
+        @event_bus.on(StepObservationFailedEvent)
+        def on_step_observation_failed(
+            source: Any, event: StepObservationFailedEvent
+        ) -> None:
+            """Pass the event on as "step_observation_failed"."""
+            self._handle_action_event("step_observation_failed", source, event)
+
+        @event_bus.on(PlanRefinementEvent)
+        def on_plan_refinement(source: Any, event: PlanRefinementEvent) -> None:
+            """Pass the event on as "plan_refinement"."""
+            self._handle_action_event("plan_refinement", source, event)
+
+        @event_bus.on(PlanReplanTriggeredEvent)
+        def on_plan_replan_triggered(
+            source: Any, event: PlanReplanTriggeredEvent
+        ) -> None:
+            """Pass the event on as "plan_replan_triggered"."""
+            self._handle_action_event("plan_replan_triggered", source, event)
+
+        @event_bus.on(GoalAchievedEarlyEvent)
+        def on_goal_achieved_early(source: Any, event: GoalAchievedEarlyEvent) -> None:
+            """Pass the event on as "goal_achieved_early"."""
+            self._handle_action_event("goal_achieved_early", source, event)
+
        @event_bus.on(KnowledgeRetrievalStartedEvent)
        def on_knowledge_retrieval_started(
            source: Any, event: KnowledgeRetrievalStartedEvent
--- a/lib/crewai/src/crewai/events/types/observation_events.py
+++ b/lib/crewai/src/crewai/events/types/observation_events.py
@@ -0,0 +1,99 @@
+"""Observation events for the Plan-and-Execute architecture.
+
+Emitted during the Observation phase (PLAN-AND-ACT Section 3.3) when the
+PlannerObserver analyzes step execution results and decides on plan
+continuation, refinement, or replanning.
+"""
+
+from typing import Any
+
+from crewai.events.base_events import BaseEvent
+
+
+class ObservationEvent(BaseEvent):
+    """Base event for observation phase events.
+
+    Declares the fields shared by every observation event. ``type`` has no
+    default at this level; each subclass in this module supplies its own
+    event-type string.
+    """
+
+    # Event-type discriminator; required here, defaulted by each subclass.
+    type: str
+    # Role of the agent the event relates to (required).
+    agent_role: str
+    # Number of the plan step the event relates to (required).
+    step_number: int
+    # Description of that step; empty string when not supplied.
+    step_description: str = ""
+    # Optional originating task / agent objects.
+    from_task: Any | None = None
+    from_agent: Any | None = None
+
+    def __init__(self, **data: Any) -> None:
+        """Build the event from keyword data.
+
+        After the base initializer runs, the same ``data`` dict is passed to
+        ``_set_task_params`` and ``_set_agent_params``, which are not defined
+        in this class.
+        NOTE(review): presumably these copy task/agent identifiers from
+        ``from_task``/``from_agent`` onto the event; confirm in BaseEvent.
+        """
+        super().__init__(**data)
+        self._set_task_params(data)
+        self._set_agent_params(data)
+
+
+class StepObservationStartedEvent(ObservationEvent):
+    """Emitted when the Planner begins observing a step's result.
+
+    Fires after every step execution, before the observation LLM call.
+    Adds no fields beyond those of ``ObservationEvent``.
+    """
+
+    # Same string the trace listener records as the action name for this event.
+    type: str = "step_observation_started"
+
+
+class StepObservationCompletedEvent(ObservationEvent):
+    """Emitted when the Planner finishes observing a step's result.
+
+    Contains the full observation analysis: what was learned, whether
+    the plan is still valid, and what action to take next.
+
+    The field defaults describe the "nothing to change" outcome: step
+    succeeded, remaining plan valid, no replan, goal not yet achieved.
+    """
+
+    # Same string the trace listener records as the action name for this event.
+    type: str = "step_observation_completed"
+    # Whether the observed step achieved what it was meant to.
+    step_completed_successfully: bool = True
+    # Free-text summary of what the step's result revealed.
+    key_information_learned: str = ""
+    # Whether the not-yet-executed steps still make sense.
+    remaining_plan_still_valid: bool = True
+    # True when the remaining plan should be regenerated rather than refined.
+    needs_full_replan: bool = False
+    # Explanation accompanying a replan request; None when not supplied.
+    replan_reason: str | None = None
+    # True when the overall goal is already met.
+    goal_already_achieved: bool = False
+    # Suggested changes to upcoming steps; None when not supplied.
+    suggested_refinements: list[str] | None = None
+
+
+class StepObservationFailedEvent(ObservationEvent):
+    """Emitted when the observation LLM call itself fails.
+
+    The system defaults to continuing the plan when this happens,
+    but the event allows monitoring/alerting on observation failures.
+    """
+
+    # Same string the trace listener records as the action name for this event.
+    type: str = "step_observation_failed"
+    # Text describing the failure; empty string when not supplied.
+    error: str = ""
+
+
+class PlanRefinementEvent(ObservationEvent):
+    """Emitted when the Planner refines upcoming step descriptions.
+
+    This is the lightweight refinement path — no full replan, just
+    sharpening pending todo descriptions based on new information.
+    """
+
+    # Same string the trace listener records as the action name for this event.
+    type: str = "plan_refinement"
+    # How many pending steps were updated; 0 when not supplied.
+    refined_step_count: int = 0
+    # Text of the individual refinements; None when not supplied.
+    refinements: list[str] | None = None
+
+
+class PlanReplanTriggeredEvent(ObservationEvent):
+    """Emitted when the Planner triggers a full replan.
+
+    The remaining plan was deemed fundamentally wrong and will be
+    regenerated from scratch, preserving completed step results.
+    """
+
+    # Same string the trace listener records as the action name for this event.
+    type: str = "plan_replan_triggered"
+    # Why the replan was triggered; empty string when not supplied.
+    replan_reason: str = ""
+    # Replan counter value reported with this event.
+    # NOTE(review): whether it counts this replan or only earlier ones is set
+    # by the emitter, which is not visible here.
+    replan_count: int = 0
+    # Number of already-completed steps whose results are kept.
+    completed_steps_preserved: int = 0
+
+
+class GoalAchievedEarlyEvent(ObservationEvent):
+    """Emitted when the Planner detects the goal was achieved early.
+
+    Remaining steps will be skipped and execution will finalize.
+    """
+
+    # Same string the trace listener records as the action name for this event.
+    type: str = "goal_achieved_early"
+    # Number of planned steps that will be skipped.
+    steps_remaining: int = 0
+    # Number of steps finished before the goal was detected as achieved.
+    steps_completed: int = 0
--- a/lib/crewai/src/crewai/events/types/reasoning_events.py
+++ b/lib/crewai/src/crewai/events/types/reasoning_events.py
@@ -9,7 +9,7 @@ class ReasoningEvent(BaseEvent):
    type: str
    attempt: int = 1
    agent_role: str
-    task_id: str
+    task_id: str | None = None
    task_name: str | None = None
    from_task: Any | None = None
    agent_id: str | None = None
--- a/lib/crewai/src/crewai/events/utils/console_formatter.py
+++ b/lib/crewai/src/crewai/events/utils/console_formatter.py
@@ -936,6 +936,152 @@ To enable tracing, do any one of these:
        )
        self.print_panel(error_content, "❌ Reasoning Error", "red")

+    # ----------- OBSERVATION EVENTS (Plan-and-Execute) -----------
+
+    def handle_observation_started(
+        self,
+        agent_role: str,
+        step_number: int,
+        step_description: str,
+    ) -> None:
+        """Print a cyan panel announcing that a step's result is being observed.
+
+        Does nothing unless ``self.verbose`` is truthy.
+
+        Args:
+            agent_role: Shown on the "Agent:" line.
+            step_number: Shown on the "Step:" line.
+            step_description: Shown on a "Description:" line, cut to 80
+                characters plus "..." when longer. The line is omitted when
+                the description is empty.
+        """
+        if not self.verbose:
+            return
+
+        # (text, style) pairs in display order.
+        segments: list[tuple[str, str]] = [
+            ("Observation Started\n", "cyan bold"),
+            ("Agent: ", "white"),
+            (f"{agent_role}\n", "cyan"),
+            ("Step: ", "white"),
+            (f"{step_number}\n", "cyan"),
+        ]
+        if step_description:
+            ellipsis = "..." if len(step_description) > 80 else ""
+            shortened = step_description[:80] + ellipsis
+            segments.append(("Description: ", "white"))
+            segments.append((f"{shortened}\n", "cyan"))
+
+        panel_text = Text()
+        for fragment, fragment_style in segments:
+            panel_text.append(fragment, style=fragment_style)
+
+        self.print_panel(panel_text, "🔍 Observing Step Result", "cyan")
+
+    def handle_observation_completed(
+        self,
+        agent_role: str,
+        step_number: int,
+        step_completed: bool,
+        plan_valid: bool,
+        key_info: str,
+        needs_replan: bool,
+        goal_achieved: bool,
+    ) -> None:
+        """Print a panel summarising the outcome of a step observation.
+
+        Does nothing unless ``self.verbose`` is truthy. The status line is
+        chosen by the first matching condition:
+
+        1. ``goal_achieved``      -> "Goal Achieved Early" (green)
+        2. ``needs_replan``       -> "Replan Needed" (yellow)
+        3. not ``step_completed`` -> "Step Failed" (red)
+        4. ``plan_valid``         -> "Plan Valid — Continue" (green)
+        5. otherwise              -> "Plan No Longer Valid" (red)
+
+        Args:
+            agent_role: Accepted for parity with the other observation
+                handlers; not displayed in this panel.
+            step_number: Shown on the "Step:" line.
+            step_completed: Whether the observed step succeeded.
+            plan_valid: Whether the remaining plan is still valid.
+            key_info: Shown on a "Learned:" line, cut to 120 characters plus
+                "..." when longer. The line is omitted when empty.
+            needs_replan: Whether a full replan was requested.
+            goal_achieved: Whether the goal was detected as already achieved.
+        """
+        if not self.verbose:
+            return
+
+        if goal_achieved:
+            style = "green"
+            status = "Goal Achieved Early"
+        elif needs_replan:
+            style = "yellow"
+            status = "Replan Needed"
+        elif not step_completed:
+            # ``step_completed`` used to be ignored, so a failed step with a
+            # still-valid plan was reported as "Plan Valid — Continue".
+            style = "red"
+            status = "Step Failed"
+        elif plan_valid:
+            style = "green"
+            status = "Plan Valid — Continue"
+        else:
+            # The step succeeded but the remaining plan did not survive it;
+            # labelling this "Step Failed" would be misleading.
+            style = "red"
+            status = "Plan No Longer Valid"
+
+        content = Text()
+        content.append("Observation Complete\n", style=f"{style} bold")
+        content.append("Step: ", style="white")
+        content.append(f"{step_number}\n", style=style)
+        content.append("Status: ", style="white")
+        content.append(f"{status}\n", style=style)
+        if key_info:
+            info_preview = key_info[:120] + ("..." if len(key_info) > 120 else "")
+            content.append("Learned: ", style="white")
+            content.append(f"{info_preview}\n", style=style)
+
+        self.print_panel(content, "🔍 Observation Result", style)
+
+    def handle_observation_failed(
+        self,
+        step_number: int,
+        error: str,
+    ) -> None:
+        """Print a red panel reporting that observing a step failed.
+
+        Does nothing unless ``self.verbose`` is truthy. The panel body is
+        built by ``self.create_status_content``.
+
+        Args:
+            step_number: Passed, as a string, under the ``Step`` keyword.
+            error: Passed under the ``Error`` keyword.
+        """
+        if self.verbose:
+            # Dict insertion order keeps Step before Error, as in a direct call.
+            details = {"Step": str(step_number), "Error": error}
+            panel_body = self.create_status_content(
+                "Observation Failed", "Error", "red", **details
+            )
+            self.print_panel(panel_body, "❌ Observation Error", "red")
+
+    def handle_plan_refinement(
+        self,
+        step_number: int,
+        refined_count: int,
+        refinements: list[str] | None,
+    ) -> None:
+        """Print a cyan panel describing a lightweight plan refinement.
+
+        Does nothing unless ``self.verbose`` is truthy.
+
+        Args:
+            step_number: Shown on the "After Step:" line.
+            refined_count: Shown on the "Steps Updated:" line.
+            refinements: Optional refinement texts. At most the first three
+                are listed, each cut to 80 characters plus "..." when longer.
+        """
+        if not self.verbose:
+            return
+
+        content = Text()
+        content.append("Plan Refined\n", style="cyan bold")
+        content.append("After Step: ", style="white")
+        content.append(f"{step_number}\n", style="cyan")
+        content.append("Steps Updated: ", style="white")
+        content.append(f"{refined_count}\n", style="cyan")
+        if refinements:
+            for refinement in refinements[:3]:
+                # Mark truncation the same way the other observation panels
+                # do, so a cut-off bullet is not mistaken for the full text.
+                preview = refinement[:80] + ("..." if len(refinement) > 80 else "")
+                content.append(f"  • {preview}\n", style="white")
+
+        self.print_panel(content, "✏️ Plan Refinement", "cyan")
+
+    def handle_plan_replan(
+        self,
+        reason: str,
+        replan_count: int,
+        preserved_count: int,
+    ) -> None:
+        """Print a yellow panel reporting that a full replan was triggered.
+
+        Does nothing unless ``self.verbose`` is truthy.
+
+        Args:
+            reason: Shown on the "Reason:" line.
+            replan_count: Shown on the "Replan #:" line.
+            preserved_count: Shown on the "Preserved Steps:" line.
+        """
+        if not self.verbose:
+            return
+
+        report = Text()
+        report.append("Full Replan Triggered\n", style="yellow bold")
+        # Each row is a white label followed by its value in yellow.
+        for label, value in (
+            ("Reason: ", reason),
+            ("Replan #: ", replan_count),
+            ("Preserved Steps: ", preserved_count),
+        ):
+            report.append(label, style="white")
+            report.append(f"{value}\n", style="yellow")
+
+        self.print_panel(report, "🔄 Dynamic Replan", "yellow")
+
+    def handle_goal_achieved_early(
+        self,
+        steps_completed: int,
+        steps_remaining: int,
+    ) -> None:
+        """Print a green panel reporting that the goal was reached early.
+
+        Does nothing unless ``self.verbose`` is truthy.
+
+        Args:
+            steps_completed: Shown on the "Completed:" line.
+            steps_remaining: Shown on the "Skipped:" line.
+        """
+        if not self.verbose:
+            return
+
+        pieces = (
+            ("Goal Achieved Early!\n", "green bold"),
+            ("Completed: ", "white"),
+            (f"{steps_completed} steps\n", "green"),
+            ("Skipped: ", "white"),
+            (f"{steps_remaining} remaining steps\n", "green"),
+        )
+        summary = Text()
+        for piece, piece_style in pieces:
+            summary.append(piece, style=piece_style)
+
+        self.print_panel(summary, "🎯 Early Goal Achievement", "green")
+
    # ----------- AGENT LOGGING EVENTS -----------

    def handle_agent_logs_started(
--- a/lib/crewai/src/crewai/experimental/agent_executor.py
+++ b/lib/crewai/src/crewai/experimental/agent_executor.py
--- a/lib/crewai/src/crewai/lite_agent_output.py
+++ b/lib/crewai/src/crewai/lite_agent_output.py
@@ -6,9 +6,27 @@ from typing import Any

 from pydantic import BaseModel, Field

+from crewai.utilities.planning_types import TodoItem
 from crewai.utilities.types import LLMMessage


+class TodoExecutionResult(BaseModel):
+    """Summary of a single todo execution.
+
+    ``step_number``, ``description`` and ``status`` are required; the other
+    fields have defaults. ``status`` is a plain string: the values
+    "completed" and "failed" are the ones ``LiteAgentOutput`` filters on.
+    """
+
+    step_number: int = Field(description="Step number in the plan")
+    description: str = Field(description="What the todo was supposed to do")
+    tool_used: str | None = Field(
+        default=None, description="Tool that was used for this step"
+    )
+    status: str = Field(description="Final status: completed, failed, pending")
+    result: str | None = Field(
+        default=None, description="Result or error message from execution"
+    )
+    # default_factory gives every instance its own empty list.
+    depends_on: list[int] = Field(
+        default_factory=list, description="Step numbers this depended on"
+    )
+
 class LiteAgentOutput(BaseModel):
    """Class that represents the result of a LiteAgent execution."""

@@ -24,12 +42,75 @@ class LiteAgentOutput(BaseModel):
    )
    messages: list[LLMMessage] = Field(description="Messages of the agent", default=[])

+    plan: str | None = Field(
+        default=None, description="The execution plan that was generated, if any"
+    )
+    todos: list[TodoExecutionResult] = Field(
+        default_factory=list,
+        description="List of todos that were executed with their results",
+    )
+    replan_count: int = Field(
+        default=0, description="Number of times the plan was regenerated"
+    )
+    last_replan_reason: str | None = Field(
+        default=None, description="Reason for the last replan, if any"
+    )
+
+    @classmethod
+    def from_todo_items(cls, todo_items: list[TodoItem]) -> list[TodoExecutionResult]:
+        """Build one TodoExecutionResult summary per TodoItem, in input order.
+
+        The item's ``tool_to_use`` is reported as ``tool_used``; every other
+        field is copied under its own name.
+
+        Args:
+            todo_items: TodoItem objects produced by an execution.
+
+        Returns:
+            A new list of TodoExecutionResult summaries (empty for empty input).
+        """
+        summaries: list[TodoExecutionResult] = []
+        for todo in todo_items:
+            summaries.append(
+                TodoExecutionResult(
+                    step_number=todo.step_number,
+                    description=todo.description,
+                    tool_used=todo.tool_to_use,
+                    status=todo.status,
+                    result=todo.result,
+                    depends_on=todo.depends_on,
+                )
+            )
+        return summaries
+
    def to_dict(self) -> dict[str, Any]:
        """Convert pydantic_output to a dictionary."""
        if self.pydantic:
            return self.pydantic.model_dump()
        return {}

+    @property
+    def completed_todos(self) -> list[TodoExecutionResult]:
+        """Todos whose status is exactly "completed", in their original order."""
+        return list(filter(lambda todo: todo.status == "completed", self.todos))
+
+    @property
+    def failed_todos(self) -> list[TodoExecutionResult]:
+        """Todos whose status is exactly "failed", in their original order."""
+        return list(filter(lambda todo: todo.status == "failed", self.todos))
+
+    @property
+    def had_plan(self) -> bool:
+        """True when a plan text is set or at least one todo was recorded."""
+        if self.plan is not None:
+            return True
+        return len(self.todos) > 0
+
    def __str__(self) -> str:
        """Return the raw output as a string."""
        return self.raw
+
+    def __repr__(self) -> str:
+        """Return a compact representation with optional todo/replan summary.
+
+        Always includes the agent role. The completed/total todo count is
+        added only when todos exist, and the replan count only when it is
+        above zero.
+        """
+        summary = f"LiteAgentOutput(role={self.agent_role!r}"
+        if self.todos:
+            done_count = len(self.completed_todos)
+            summary += f", todos={done_count}/{len(self.todos)} completed"
+        if self.replan_count > 0:
+            summary += f", replans={self.replan_count}"
+        return summary + ")"
--- a/lib/crewai/src/crewai/llms/providers/bedrock/completion.py
+++ b/lib/crewai/src/crewai/llms/providers/bedrock/completion.py
@@ -1838,7 +1838,10 @@ class BedrockCompletion(BaseLLM):
                    )

        # CRITICAL: Handle model-specific conversation requirements
-        # Cohere and some other models require conversation to end with user message
+        # Cohere and some other models require conversation to end with user message.
+        # Anthropic models on Bedrock also reject assistant messages in the final
+        # position when tools are present ("pre-filling the assistant response is
+        # not supported").
        if converse_messages:
            last_message = converse_messages[-1]
            if last_message["role"] == "assistant":
@@ -1865,6 +1868,20 @@ class BedrockCompletion(BaseLLM):
                            "content": [{"text": "Continue your response."}],
                        }
                    )
+                # Anthropic (Claude) models reject assistant-last messages when
+                # tools are in the request. Append a user message so the
+                # Converse API accepts the payload.
+                elif "anthropic" in self.model.lower() or "claude" in self.model.lower():
+                    converse_messages.append(
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "text": "Please continue and provide your final answer."
+                                }
+                            ],
+                        }
+                    )

        # Ensure first message is from user (required by Converse API)
        if not converse_messages:
--- a/lib/crewai/src/crewai/task.py
+++ b/lib/crewai/src/crewai/task.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from concurrent.futures import Future
 from copy import copy as shallow_copy
 import datetime
@@ -624,11 +625,15 @@ class Task(BaseModel):
            self.end_time = datetime.datetime.now()

            if self.callback:
-                self.callback(self.output)
+                cb_result = self.callback(self.output)
+                if inspect.isawaitable(cb_result):
+                    await cb_result

            crew = self.agent.crew  # type: ignore[union-attr]
            if crew and crew.task_callback and crew.task_callback != self.callback:
-                crew.task_callback(self.output)
+                cb_result = crew.task_callback(self.output)
+                if inspect.isawaitable(cb_result):
+                    await cb_result

            if self.output_file:
                content = (
@@ -722,11 +727,15 @@ class Task(BaseModel):
            self.end_time = datetime.datetime.now()

            if self.callback:
-                self.callback(self.output)
+                cb_result = self.callback(self.output)
+                if inspect.iscoroutine(cb_result):
+                    asyncio.run(cb_result)

            crew = self.agent.crew  # type: ignore[union-attr]
            if crew and crew.task_callback and crew.task_callback != self.callback:
-                crew.task_callback(self.output)
+                cb_result = crew.task_callback(self.output)
+                if inspect.iscoroutine(cb_result):
+                    asyncio.run(cb_result)

            if self.output_file:
                content = (
--- a/lib/crewai/src/crewai/translations/en.json
+++ b/lib/crewai/src/crewai/translations/en.json
@@ -74,9 +74,28 @@
    "consolidation_user": "New content to consider storing:\n{new_content}\n\nExisting similar memories:\n{records_summary}\n\nReturn the consolidation plan as structured output."
  },
  "reasoning": {
-    "initial_plan": "You are {role}, a professional with the following background: {backstory}\n\nYour primary goal is: {goal}\n\nAs {role}, you are creating a strategic plan for a task that requires your expertise and unique perspective.",
-    "refine_plan": "You are {role}, a professional with the following background: {backstory}\n\nYour primary goal is: {goal}\n\nAs {role}, you are refining a strategic plan for a task that requires your expertise and unique perspective.",
-    "create_plan_prompt": "You are {role} with this background: {backstory}\n\nYour primary goal is: {goal}\n\nYou have been assigned the following task:\n{description}\n\nExpected output:\n{expected_output}\n\nAvailable tools: {tools}\n\nBefore executing this task, create a detailed plan that leverages your expertise as {role} and outlines:\n1. Your understanding of the task from your professional perspective\n2. The key steps you'll take to complete it, drawing on your background and skills\n3. How you'll approach any challenges that might arise, considering your expertise\n4. How you'll strategically use the available tools based on your experience, exactly what tools to use and how to use them\n5. The expected outcome and how it aligns with your goal\n\nAfter creating your plan, assess whether you feel ready to execute the task or if you could do better.\nConclude with one of these statements:\n- \"READY: I am ready to execute the task.\"\n- \"NOT READY: I need to refine my plan because [specific reason].\"",
-    "refine_plan_prompt": "You are {role} with this background: {backstory}\n\nYour primary goal is: {goal}\n\nYou created the following plan for this task:\n{current_plan}\n\nHowever, you indicated that you're not ready to execute the task yet.\n\nPlease refine your plan further, drawing on your expertise as {role} to address any gaps or uncertainties. As you refine your plan, be specific about which available tools you will use, how you will use them, and why they are the best choices for each step. Clearly outline your tool usage strategy as part of your improved plan.\n\nAfter refining your plan, assess whether you feel ready to execute the task.\nConclude with one of these statements:\n- \"READY: I am ready to execute the task.\"\n- \"NOT READY: I need to refine my plan further because [specific reason].\""
+    "initial_plan": "You are {role}. Create a focused execution plan using only the essential steps needed.",
+    "refine_plan": "You are {role}. Refine your plan to address the specific gap while keeping it minimal.",
+    "create_plan_prompt": "You are {role}.\n\nTask: {description}\n\nExpected output: {expected_output}\n\nAvailable tools: {tools}\n\nCreate a focused plan with ONLY the essential steps needed. Most tasks require just 2-5 steps. Do NOT pad with unnecessary steps like \"review\", \"verify\", \"document\", or \"finalize\" unless explicitly required.\n\nFor each step, specify the action and which tool to use (if any).\n\nConclude with:\n- \"READY: I am ready to execute the task.\"\n- \"NOT READY: I need to refine my plan because [specific reason].\"",
+    "refine_plan_prompt": "Your plan:\n{current_plan}\n\nYou indicated you're not ready. Address the specific gap while keeping the plan minimal.\n\nConclude with READY or NOT READY."
+  },
+  "planning": {
+    "system_prompt": "You are a strategic planning assistant. Create concrete, executable plans where every step produces a verifiable result.",
+    "create_plan_prompt": "Create an execution plan for the following task:\n\n## Task\n{description}\n\n## Expected Output\n{expected_output}\n\n## Available Tools\n{tools}\n\n## Planning Principles\nFocus on CONCRETE, EXECUTABLE steps. Each step must clearly state WHAT ACTION to take and HOW to verify it succeeded. The number of steps should match the task complexity. Hard limit: {max_steps} steps.\n\n## Rules:\n- Each step must have a clear DONE criterion\n- Do NOT group unrelated actions: if steps can fail independently, keep them separate\n- NO standalone \"thinking\" or \"planning\" steps — act, don't just observe\n- The last step must produce the required output\n\nAfter your plan, state READY or NOT READY.",
+    "refine_plan_prompt": "Your previous plan:\n{current_plan}\n\nYou indicated you weren't ready. Refine your plan to address the specific gap.\n\nKeep the plan minimal - only add steps that directly address the issue.\n\nConclude with READY or NOT READY as before.",
+    "observation_system_prompt": "You are a Planning Agent observing execution progress. After each step completes, you analyze what happened and decide whether the remaining plan is still valid.\n\nReason step-by-step about:\n1. Did this step produce a concrete, verifiable result? (file created, command succeeded, service running, etc.) — or did it only explore without acting?\n2. What new information was learned from this step's result?\n3. Whether the remaining steps still make sense given this new information\n4. What refinements, if any, are needed for upcoming steps\n5. Whether the overall goal has already been achieved\n\nCritical: mark `step_completed_successfully=false` if:\n- The step result is only exploratory (ls, pwd, cat) without producing the required artifact or action\n- A command returned a non-zero exit code and the error was not recovered\n- The step description required creating/building/starting something and the result shows it was not done\n\nBe conservative about triggering full replans — only do so when the remaining plan is fundamentally wrong, not just suboptimal.\n\nIMPORTANT: Set step_completed_successfully=false if:\n- The step's stated goal was NOT achieved (even if other things were done)\n- The first meaningful action returned an error (file not found, command not found, etc.)\n- The result is exploration/discovery output rather than the concrete action the step required\n- The step ran out of attempts without producing the required output\nSet needs_full_replan=true if the current plan's remaining steps reference paths or state that don't exist yet and need to be created first.",
+    "observation_user_prompt": "## Original task\n{task_description}\n\n## Expected output\n{task_goal}\n{completed_summary}\n\n## Just completed step {step_number}\nDescription: {step_description}\nResult: {step_result}\n{remaining_summary}\n\nAnalyze this step's result and provide your observation.",
+    "step_executor_system_prompt": "You are {role}. {backstory}\n\nYour goal: {goal}\n\nYou are executing ONE specific step in a larger plan. Your ONLY job is to fully complete this step — not to plan ahead.\n\nKey rules:\n- **ACT FIRST.** Execute the primary action of this step immediately. Do NOT read or explore files before attempting the main action unless exploration IS the step's goal.\n- If the step says 'run X', run X NOW. If it says 'write file Y', write Y NOW.\n- If the step requires producing an output file (e.g. /app/move.txt, report.jsonl, summary.csv), you MUST write that file using a tool call — do NOT just state the answer in text.\n- You may use tools MULTIPLE TIMES. After each tool use, check the result. If it failed, try a different approach.\n- Only output your Final Answer AFTER the concrete outcome is verified (file written, build succeeded, command exited 0).\n- If a command is not found or a path does not exist, fix it (different PATH, install missing deps, use absolute paths).\n- Do NOT spend more than 3 tool calls on exploration/analysis before attempting the primary action.{tools_section}",
+    "step_executor_tools_section": "\n\nAvailable tools: {tool_names}\n\nYou may call tools multiple times in sequence. Use this format for EACH tool call:\nThought: <what you observed and what you will try next>\nAction: <tool_name>\nAction Input: <input>\n\nAfter observing each result, decide: is the step complete? If yes:\nThought: The step is done because <evidence>\nFinal Answer: <concise summary of what was accomplished and the key result>",
+    "step_executor_user_prompt": "## Current Step\n{step_description}",
+    "step_executor_suggested_tool": "\nSuggested tool: {tool_to_use}",
+    "step_executor_context_header": "\n## Context from previous steps:",
+    "step_executor_context_entry": "Step {step_number} result: {result}",
+    "step_executor_complete_step": "\n**Execute the primary action of this step NOW.** If the step requires writing a file, write it. If it requires running a command, run it. Verify the outcome with a follow-up tool call, then give your Final Answer. Your Final Answer must confirm what was DONE (file created at path X, command succeeded), not just what should be done.",
+    "todo_system_prompt": "You are {role}. Your goal: {goal}\n\nYou are executing a specific step in a multi-step plan. Focus only on completing the current step. Use the suggested tool if one is provided. Be concise and provide clear results that can be used by subsequent steps.",
+    "synthesis_system_prompt": "You are {role}. You have completed a multi-step task. Synthesize the results from all steps into a single, coherent final response that directly addresses the original task. Do NOT list step numbers or say 'Step 1 result'. Produce a clean, polished answer as if you did it all at once.",
+    "synthesis_user_prompt": "## Original Task\n{task_description}\n\n## Results from each step\n{combined_steps}\n\nSynthesize these results into a single, coherent final answer.",
+    "replan_enhancement_prompt": "\n\nIMPORTANT: Previous execution attempt did not fully succeed. Please create a revised plan that accounts for the following context from the previous attempt:\n\n{previous_context}\n\nConsider:\n1. What steps succeeded and can be built upon\n2. What steps failed and why they might have failed\n3. Alternative approaches that might work better\n4. Whether dependencies need to be restructured",
+    "step_executor_task_context": "## Task Context\nThe following is the full task you are helping complete. Keep this in mind — especially any required output files, exact filenames, and expected formats.\n\n{task_context}\n\n---\n"
  }
-}
+}
--- a/lib/crewai/src/crewai/utilities/agent_utils.py
+++ b/lib/crewai/src/crewai/utilities/agent_utils.py
@@ -3,6 +3,9 @@ from __future__ import annotations
 import asyncio
 from collections.abc import Callable, Sequence
 import concurrent.futures
+from dataclasses import dataclass, field
+from datetime import datetime
+import inspect
 import json
 import re
 from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict
@@ -38,6 +41,7 @@ from crewai.utilities.types import LLMMessage
 if TYPE_CHECKING:
    from crewai.agent import Agent
    from crewai.agents.crew_agent_executor import CrewAgentExecutor
+    from crewai.agents.tools_handler import ToolsHandler
    from crewai.experimental.agent_executor import AgentExecutor
    from crewai.lite_agent import LiteAgent
    from crewai.llm import LLM
@@ -323,6 +327,66 @@ def enforce_rpm_limit(
        request_within_rpm_limit()


+def _prepare_llm_call(
+    executor_context: CrewAgentExecutor | AgentExecutor | LiteAgent | None,
+    messages: list[LLMMessage],
+    printer: Printer,
+    verbose: bool = True,
+) -> list[LLMMessage]:
+    """Run the before-call hooks and pick the messages to send to the LLM.
+
+    Without an executor context no hooks run and ``messages`` is returned
+    untouched. With one, the hooks run first and the context's own
+    ``messages`` attribute replaces the ``messages`` argument.
+
+    Args:
+        executor_context: Optional executor context for hook invocation.
+        messages: The messages to send to the LLM.
+        printer: Printer instance handed to the hooks.
+        verbose: Whether to print output.
+
+    Returns:
+        ``messages`` itself, or ``executor_context.messages`` when a context
+        is given.
+
+    Raises:
+        ValueError: If a before hook blocks the call.
+    """
+    if executor_context is None:
+        return messages
+
+    call_allowed = _setup_before_llm_call_hooks(
+        executor_context, printer, verbose=verbose
+    )
+    if not call_allowed:
+        raise ValueError("LLM call blocked by before_llm_call hook")
+    return executor_context.messages
+
+
+def _validate_and_finalize_llm_response(
+    answer: Any,
+    executor_context: CrewAgentExecutor | AgentExecutor | LiteAgent | None,
+    printer: Printer,
+    verbose: bool = True,
+) -> str | BaseModel | Any:
+    """Reject a falsy LLM response, otherwise pass it through the after hooks.
+
+    Args:
+        answer: The raw LLM response.
+        executor_context: Optional executor context for hook invocation.
+        printer: Printer instance for output.
+        verbose: Whether to print the red notice before raising.
+
+    Returns:
+        Whatever ``_setup_after_llm_call_hooks`` returns for the response.
+
+    Raises:
+        ValueError: If the response is None or otherwise falsy.
+    """
+    if answer:
+        return _setup_after_llm_call_hooks(
+            executor_context, answer, printer, verbose=verbose
+        )
+
+    if verbose:
+        printer.print(
+            content="Received None or empty response from LLM call.",
+            color="red",
+        )
+    raise ValueError("Invalid response from LLM call - None or empty.")
+
+
 def get_llm_response(
    llm: LLM | BaseLLM,
    messages: list[LLMMessage],
@@ -359,11 +423,7 @@ def get_llm_response(
        Exception: If an error occurs.
        ValueError: If the response is None or empty.
    """
-
-    if executor_context is not None:
-        if not _setup_before_llm_call_hooks(executor_context, printer, verbose=verbose):
-            raise ValueError("LLM call blocked by before_llm_call hook")
-        messages = executor_context.messages
+    messages = _prepare_llm_call(executor_context, messages, printer, verbose=verbose)

    try:
        answer = llm.call(
@@ -377,16 +437,9 @@ def get_llm_response(
        )
    except Exception as e:
        raise e
-    if not answer:
-        if verbose:
-            printer.print(
-                content="Received None or empty response from LLM call.",
-                color="red",
-            )
-        raise ValueError("Invalid response from LLM call - None or empty.")

-    return _setup_after_llm_call_hooks(
-        executor_context, answer, printer, verbose=verbose
+    return _validate_and_finalize_llm_response(
+        answer, executor_context, printer, verbose=verbose
    )


@@ -416,6 +469,7 @@ async def aget_llm_response(
        from_agent: Optional agent context for the LLM call.
        response_model: Optional Pydantic model for structured outputs.
        executor_context: Optional executor context for hook invocation.
+        verbose: Whether to print output.

    Returns:
        The response from the LLM as a string, Pydantic model (when response_model is provided),
@@ -425,10 +479,7 @@ async def aget_llm_response(
        Exception: If an error occurs.
        ValueError: If the response is None or empty.
    """
-    if executor_context is not None:
-        if not _setup_before_llm_call_hooks(executor_context, printer, verbose=verbose):
-            raise ValueError("LLM call blocked by before_llm_call hook")
-        messages = executor_context.messages
+    messages = _prepare_llm_call(executor_context, messages, printer, verbose=verbose)

    try:
        answer = await llm.acall(
@@ -442,16 +493,9 @@ async def aget_llm_response(
        )
    except Exception as e:
        raise e
-    if not answer:
-        if verbose:
-            printer.print(
-                content="Received None or empty response from LLM call.",
-                color="red",
-            )
-        raise ValueError("Invalid response from LLM call - None or empty.")

-    return _setup_after_llm_call_hooks(
-        executor_context, answer, printer, verbose=verbose
+    return _validate_and_finalize_llm_response(
+        answer, executor_context, printer, verbose=verbose
    )


@@ -501,7 +545,9 @@ def handle_agent_action_core(
        - TODO: Remove messages parameter and its usage.
    """
    if step_callback:
-        step_callback(tool_result)
+        cb_result = step_callback(tool_result)
+        if inspect.iscoroutine(cb_result):
+            asyncio.run(cb_result)

    formatted_answer.text += f"\nObservation: {tool_result.result}"
    formatted_answer.result = tool_result.result
@@ -1143,6 +1189,382 @@ def extract_tool_call_info(
    return None


+def is_tool_call_list(response: list[Any]) -> bool:
+    """Report whether an LLM response looks like a list of tool calls.
+
+    Only the first element is inspected. The OpenAI, Anthropic, Bedrock and
+    Gemini tool-call shapes are recognized.
+
+    Args:
+        response: The LLM response to inspect.
+
+    Returns:
+        True when the first element matches a known tool-call shape.
+    """
+    if not response:
+        return False
+    head = response[0]
+    head_is_dict = isinstance(head, dict)
+    # OpenAI-style: a `function` attribute or a "function" key.
+    if hasattr(head, "function") or (head_is_dict and "function" in head):
+        return True
+    # Anthropic-style (ToolUseBlock): tagged "tool_use", or has name + input.
+    if getattr(head, "type", None) == "tool_use":
+        return True
+    if hasattr(head, "name") and hasattr(head, "input"):
+        return True
+    # Bedrock-style: plain dict carrying "name" and "input" keys.
+    if head_is_dict and "name" in head and "input" in head:
+        return True
+    # Gemini-style: a truthy `function_call` attribute.
+    if getattr(head, "function_call", None):
+        return True
+    return False
+
+
+def check_native_tool_support(llm: Any, original_tools: list[BaseTool] | None) -> bool:
+    """Decide whether native function calling can be used for this run.
+
+    Args:
+        llm: The LLM instance, probed for `supports_function_calling`.
+        original_tools: Original BaseTool instances, if any.
+
+    Returns:
+        Truthy only when the LLM reports function-calling support and at
+        least one tool is available.
+    """
+    # Fetch the capability probe once; a missing attribute yields None.
+    supports = getattr(llm, "supports_function_calling", None)
+    if not callable(supports):
+        return False
+    return supports() and bool(original_tools)
+
+
+def setup_native_tools(
+    original_tools: list[BaseTool],
+) -> tuple[list[dict[str, Any]], dict[str, Callable[..., Any]]]:
+    """Build the native function-calling payload by delegating to the schema converter.
+
+    Args:
+        original_tools: Original BaseTool instances to expose to the LLM.
+
+    Returns:
+        Tuple of (openai_tools_schema, available_functions_dict), returned unchanged.
+    """
+    return convert_tools_to_openai_schema(original_tools)
+
+
+def build_tool_calls_assistant_message(
+    tool_calls: list[Any],
+) -> tuple[LLMMessage | None, list[dict[str, Any]]]:
+    """Assemble the assistant message that records the LLM's tool calls.
+
+    Each raw tool call is normalized into the standard function-call entry;
+    calls that cannot be parsed are skipped. The raw objects are kept on the
+    message when every one of them is a `Part` (Gemini).
+
+    Args:
+        tool_calls: Raw tool call objects from the LLM response.
+
+    Returns:
+        Tuple of (assistant_message, tool_calls_to_report).
+        assistant_message is None if no valid tool calls found.
+    """
+    reported: list[dict[str, Any]] = []
+    for raw_call in tool_calls:
+        parsed = extract_tool_call_info(raw_call)
+        if parsed:
+            call_id, func_name, func_args = parsed
+            # Arguments are always reported as a JSON string.
+            serialized_args = (
+                func_args if isinstance(func_args, str) else json.dumps(func_args)
+            )
+            reported.append(
+                {
+                    "id": call_id,
+                    "type": "function",
+                    "function": {"name": func_name, "arguments": serialized_args},
+                }
+            )
+
+    if not reported:
+        return None, []
+
+    message: LLMMessage = {
+        "role": "assistant",
+        "content": None,
+        "tool_calls": reported,
+    }
+    # Preserve raw parts for Gemini compatibility
+    raw_type_names = {type(raw_call).__qualname__ for raw_call in tool_calls}
+    if raw_type_names == {"Part"}:
+        message["raw_tool_call_parts"] = list(tool_calls)
+
+    return message, reported
+
+
+@dataclass
+class NativeToolCallResult:
+    """Outcome of one native tool call, as returned by execute_single_native_tool_call."""
+
+    call_id: str  # id extracted from the raw tool call ("" if unrecognized)
+    func_name: str  # function name extracted from the raw tool call
+    result: str  # tool output, or the error / blocked / usage-limit message
+    from_cache: bool = False  # True when the result was read from the tool cache
+    result_as_answer: bool = False  # True when the matched tool sets result_as_answer
+    tool_message: LLMMessage = field(default_factory=dict)  # type: ignore[assignment]
+
+
+def execute_single_native_tool_call(
+    tool_call: Any,
+    *,
+    available_functions: dict[str, Callable[..., Any]],
+    original_tools: list[BaseTool],
+    structured_tools: list[CrewStructuredTool] | None,
+    tools_handler: ToolsHandler | None,
+    agent: Agent | None,
+    task: Task | None,
+    crew: Any | None,
+    event_source: Any,
+    printer: Printer | None = None,
+    verbose: bool = False,
+) -> NativeToolCallResult:
+    """Execute a single native tool call with full lifecycle management.
+
+    Handles: arg parsing, tool lookup, max-usage check, cache read/write,
+    before/after hooks, event emission, and result_as_answer detection.
+    String arguments that are not a JSON object fall back to empty kwargs.
+
+    Args:
+        tool_call: Raw tool call object from the LLM.
+        available_functions: Map of sanitized tool name -> callable.
+        original_tools: Original BaseTool list (for cache_function, result_as_answer).
+        structured_tools: Structured tools list (for hook context).
+        tools_handler: Optional handler with cache.
+        agent: The agent instance.
+        task: The current task.
+        crew: The crew instance.
+        event_source: The object to use as event emitter source.
+        printer: Optional printer for verbose logging.
+        verbose: Whether to print verbose output.
+
+    Returns:
+        NativeToolCallResult with all execution details.
+    """
+    from crewai.events.event_bus import crewai_event_bus
+    from crewai.events.types.tool_usage_events import (
+        ToolUsageErrorEvent,
+        ToolUsageFinishedEvent,
+        ToolUsageStartedEvent,
+    )
+    from crewai.hooks.tool_hooks import (
+        ToolCallHookContext,
+        get_after_tool_call_hooks,
+        get_before_tool_call_hooks,
+    )
+
+    info = extract_tool_call_info(tool_call)
+    if not info:
+        return NativeToolCallResult(
+            call_id="", func_name="", result="Unrecognized tool call format"
+        )
+
+    call_id, func_name, func_args = info
+
+    # Parse arguments; only a JSON object can be expanded into keyword arguments,
+    # so invalid JSON and valid non-object JSON (null, list, scalar) become {}.
+    if isinstance(func_args, str):
+        try:
+            args_dict = json.loads(func_args)
+        except json.JSONDecodeError:
+            args_dict = {}
+        if not isinstance(args_dict, dict):
+            args_dict = {}
+    else:
+        args_dict = func_args
+
+    agent_key = getattr(agent, "key", "unknown") if agent else "unknown"
+
+    # Find original tool for cache_function and result_as_answer
+    original_tool: BaseTool | None = None
+    for tool in original_tools:
+        if sanitize_tool_name(tool.name) == func_name:
+            original_tool = tool
+            break
+
+    # Check max usage count
+    max_usage_reached = False
+    if (
+        original_tool
+        and original_tool.max_usage_count is not None
+        and original_tool.current_usage_count >= original_tool.max_usage_count
+    ):
+        max_usage_reached = True
+
+    # Check cache
+    from_cache = False
+    input_str = json.dumps(args_dict) if args_dict else ""
+    result = "Tool not found"
+
+    if tools_handler and tools_handler.cache:
+        cached_result = tools_handler.cache.read(tool=func_name, input=input_str)
+        if cached_result is not None:
+            result = str(cached_result)
+            from_cache = True
+
+    # Emit tool started event
+    started_at = datetime.now()
+    crewai_event_bus.emit(
+        event_source,
+        event=ToolUsageStartedEvent(
+            tool_name=func_name,
+            tool_args=args_dict,
+            from_agent=agent,
+            from_task=task,
+            agent_key=agent_key,
+        ),
+    )
+
+    track_delegation_if_needed(func_name, args_dict, task)
+
+    # Find structured tool for hooks
+    structured_tool: CrewStructuredTool | None = None
+    for structured in structured_tools or []:
+        if sanitize_tool_name(structured.name) == func_name:
+            structured_tool = structured
+            break
+
+    # Before hooks. Hooks are best-effort: an exception raised by a hook is
+    # swallowed so it cannot abort the tool call.
+    hook_blocked = False
+    before_hook_context = ToolCallHookContext(
+        tool_name=func_name,
+        tool_input=args_dict,
+        tool=structured_tool,  # type: ignore[arg-type]
+        agent=agent,
+        task=task,
+        crew=crew,
+    )
+    try:
+        for hook in get_before_tool_call_hooks():
+            if hook(before_hook_context) is False:
+                hook_blocked = True
+                break
+    except Exception:  # noqa: S110
+        pass
+
+    error_event_emitted = False
+    if hook_blocked:
+        result = f"Tool execution blocked by hook. Tool: {func_name}"
+    elif not from_cache and not max_usage_reached:
+        if func_name in available_functions:
+            try:
+                tool_func = available_functions[func_name]
+                raw_result = tool_func(**args_dict)
+
+                # Cache result
+                if tools_handler and tools_handler.cache:
+                    should_cache = True
+                    if original_tool:
+                        should_cache = original_tool.cache_function(
+                            args_dict, raw_result
+                        )
+                    if should_cache:
+                        tools_handler.cache.add(
+                            tool=func_name, input=input_str, output=raw_result
+                        )
+
+                result = str(raw_result)
+            except Exception as e:
+                result = f"Error executing tool: {e}"
+                if task:
+                    task.increment_tools_errors()
+                crewai_event_bus.emit(
+                    event_source,
+                    event=ToolUsageErrorEvent(
+                        tool_name=func_name,
+                        tool_args=args_dict,
+                        from_agent=agent,
+                        from_task=task,
+                        agent_key=agent_key,
+                        error=e,
+                    ),
+                )
+                error_event_emitted = True
+    elif max_usage_reached and original_tool:
+        result = (
+            f"Tool '{func_name}' has reached its usage limit of "
+            f"{original_tool.max_usage_count} times and cannot be used anymore."
+        )
+
+    # After hooks
+    after_hook_context = ToolCallHookContext(
+        tool_name=func_name,
+        tool_input=args_dict,
+        tool=structured_tool,  # type: ignore[arg-type]
+        agent=agent,
+        task=task,
+        crew=crew,
+        tool_result=result,
+    )
+    try:
+        for after_hook in get_after_tool_call_hooks():
+            hook_result = after_hook(after_hook_context)
+            if hook_result is not None:
+                result = hook_result
+                after_hook_context.tool_result = result
+    except Exception:  # noqa: S110
+        pass
+
+    # Emit tool finished event (only if error event wasn't already emitted)
+    if not error_event_emitted:
+        crewai_event_bus.emit(
+            event_source,
+            event=ToolUsageFinishedEvent(
+                output=result,
+                tool_name=func_name,
+                tool_args=args_dict,
+                from_agent=agent,
+                from_task=task,
+                agent_key=agent_key,
+                started_at=started_at,
+                finished_at=datetime.now(),
+            ),
+        )
+
+    # Build tool result message
+    tool_message: LLMMessage = {
+        "role": "tool",
+        "tool_call_id": call_id,
+        "name": func_name,
+        "content": result,
+    }
+
+    if verbose and printer:
+        # An after hook may have replaced the result with a non-string value.
+        cache_info = " (from cache)" if from_cache else ""
+        printer.print(
+            content=f"Tool {func_name} executed with result{cache_info}: {str(result)[:200]}...",
+            color="green",
+        )
+
+    # Check result_as_answer
+    is_result_as_answer = bool(
+        original_tool
+        and hasattr(original_tool, "result_as_answer")
+        and original_tool.result_as_answer
+    )
+
+    return NativeToolCallResult(
+        call_id=call_id,
+        func_name=func_name,
+        result=result,
+        from_cache=from_cache,
+        result_as_answer=is_result_as_answer,
+        tool_message=tool_message,
+    )
+
+
 def _setup_before_llm_call_hooks(
    executor_context: CrewAgentExecutor | AgentExecutor | LiteAgent | None,
    printer: Printer,
--- a/lib/crewai/src/crewai/utilities/i18n.py
+++ b/lib/crewai/src/crewai/utilities/i18n.py
@@ -100,7 +100,13 @@ class I18N(BaseModel):
    def retrieve(
        self,
        kind: Literal[
-            "slices", "errors", "tools", "reasoning", "hierarchical_manager_agent", "memory"
+            "slices",
+            "errors",
+            "tools",
+            "reasoning",
+            "planning",
+            "hierarchical_manager_agent",
+            "memory",
        ],
        key: str,
    ) -> str:
--- a/lib/crewai/src/crewai/utilities/planning_types.py
+++ b/lib/crewai/src/crewai/utilities/planning_types.py
@@ -0,0 +1,256 @@
+"""Types for agent planning and todo tracking."""
+
+from __future__ import annotations
+
+from typing import Literal
+from uuid import uuid4
+
+from pydantic import BaseModel, Field, field_validator
+
+
+# Todo status type
+TodoStatus = Literal["pending", "running", "completed"]
+
+
+class PlanStep(BaseModel):
+    """One reasoning-plan step: what to do, an optional tool, and prerequisite steps."""
+
+    step_number: int = Field(description="Step number (1-based)")
+    description: str = Field(description="What to do in this step")
+    tool_to_use: str | None = Field(
+        default=None, description="Tool to use for this step, if any"
+    )
+    depends_on: list[int] = Field(  # holds step numbers; defaults to an empty list
+        default_factory=list, description="Step numbers this step depends on"
+    )
+
+
+class TodoItem(BaseModel):
+    """A trackable plan step: the step's fields plus an id, a status and a result."""
+
+    id: str = Field(default_factory=lambda: str(uuid4()))  # random UUID4 string
+    step_number: int = Field(description="Order of this step in the plan (1-based)")
+    description: str = Field(description="What needs to be done")
+    tool_to_use: str | None = Field(
+        default=None, description="Tool to use for this step, if any"
+    )
+    status: TodoStatus = Field(default="pending", description="Current status")
+    depends_on: list[int] = Field(  # step_number values, not ids
+        default_factory=list, description="Step numbers this depends on"
+    )
+    result: str | None = Field(  # filled in by TodoList.mark_completed
+        default=None, description="Result after completion, if any"
+    )
+
+
+class TodoList(BaseModel):
+    """Ordered collection of todo items that tracks plan execution."""
+
+    items: list[TodoItem] = Field(default_factory=list)
+
+    def _with_status(self, status: TodoStatus) -> list[TodoItem]:
+        """Return the todos currently in ``status``, in list order."""
+        return [todo for todo in self.items if todo.status == status]
+
+    @property
+    def current_todo(self) -> TodoItem | None:
+        """First todo whose status is "running", or None."""
+        running = self._with_status("running")
+        return running[0] if running else None
+
+    @property
+    def next_pending(self) -> TodoItem | None:
+        """First todo whose status is "pending", or None (dependencies ignored)."""
+        pending = self._with_status("pending")
+        return pending[0] if pending else None
+
+    @property
+    def is_complete(self) -> bool:
+        """True when the list is non-empty and every todo is completed."""
+        if not self.items:
+            return False
+        return all(todo.status == "completed" for todo in self.items)
+
+    @property
+    def pending_count(self) -> int:
+        """Number of todos whose status is "pending"."""
+        return len(self._with_status("pending"))
+
+    @property
+    def completed_count(self) -> int:
+        """Number of todos whose status is "completed"."""
+        return len(self._with_status("completed"))
+
+    def get_by_step_number(self, step_number: int) -> TodoItem | None:
+        """Return the first todo with the given step number, or None."""
+        matches = (todo for todo in self.items if todo.step_number == step_number)
+        return next(matches, None)
+
+    def mark_running(self, step_number: int) -> None:
+        """Set the matching todo's status to "running"; no-op if absent."""
+        todo = self.get_by_step_number(step_number)
+        if todo:
+            todo.status = "running"
+
+    def mark_completed(self, step_number: int, result: str | None = None) -> None:
+        """Set the matching todo's status to "completed"; no-op if absent.
+
+        A truthy ``result`` is stored; otherwise the existing one is kept.
+        """
+        todo = self.get_by_step_number(step_number)
+        if not todo:
+            return
+        todo.status = "completed"
+        if result:
+            todo.result = result
+
+    def _dependencies_satisfied(self, item: TodoItem) -> bool:
+        """Tell whether every dependency of ``item`` has completed.
+
+        Args:
+            item: The todo item whose ``depends_on`` entries are checked.
+
+        Returns:
+            True if each referenced step exists and is completed (also when
+            there are no dependencies), False otherwise.
+        """
+        dependencies = (self.get_by_step_number(num) for num in item.depends_on)
+        return all(
+            dep is not None and dep.status == "completed" for dep in dependencies
+        )
+
+    def get_ready_todos(self) -> list[TodoItem]:
+        """Collect the pending todos whose dependencies are all completed.
+
+        Returns:
+            List of TodoItem objects that can be executed now.
+        """
+        return [
+            todo
+            for todo in self._with_status("pending")
+            if self._dependencies_satisfied(todo)
+        ]
+
+    @property
+    def can_parallelize(self) -> bool:
+        """Tell whether several todos could be started at once.
+
+        Returns:
+            True if more than one todo is ready to execute.
+        """
+        return len(self.get_ready_todos()) > 1
+
+    @property
+    def running_count(self) -> int:
+        """Number of todos whose status is "running"."""
+        return len(self._with_status("running"))
+
+    def get_completed_todos(self) -> list[TodoItem]:
+        """Return the todos whose status is "completed".
+
+        Returns:
+            List of completed TodoItem objects.
+        """
+        return self._with_status("completed")
+
+    def get_pending_todos(self) -> list[TodoItem]:
+        """Return the todos whose status is "pending".
+
+        Returns:
+            List of pending TodoItem objects.
+        """
+        return self._with_status("pending")
+
+    def replace_pending_todos(self, new_items: list[TodoItem]) -> None:
+        """Swap every pending todo for ``new_items``.
+
+        Completed and running todos are kept, in order, ahead of the new
+        items. Used during replanning to swap in a new plan for remaining work.
+
+        Args:
+            new_items: The new todo items to replace pending ones.
+        """
+        kept = [todo for todo in self.items if todo.status != "pending"]
+        self.items = kept + new_items
+
+
+class StepRefinement(BaseModel):
+    """A structured in-place update for a single pending step.
+
+    Carried in StepObservation.suggested_refinements when the Planner learns
+    new information that makes a pending step description more specific.
+    Applied directly — no second LLM call required.
+    """
+
+    step_number: int = Field(description="The step number to update (1-based)")
+    new_description: str = Field(
+        description="The updated, more specific description for this step"
+    )
+
+
+class StepObservation(BaseModel):
+    """What the Planner concluded after one step finished executing.
+
+    Returned by the PlannerObserver after EVERY step — not just failures.
+    The Planner uses this to decide whether to continue, refine, or replan.
+
+    Based on PLAN-AND-ACT (Section 3.3): the Planner observes what the Executor
+    did and incorporates new information into the remaining plan.
+
+    Attributes:
+        step_completed_successfully: Whether the step achieved its objective.
+        key_information_learned: New information revealed by this step
+            (e.g., "Found 3 products: A, B, C"). Used to refine upcoming steps.
+        remaining_plan_still_valid: Whether pending todos still make sense
+            given the new information. True does NOT mean no refinement needed.
+        suggested_refinements: Structured in-place updates to pending step
+            descriptions. Each entry targets a specific step by number. These
+            are applied directly without a second LLM call.
+            Example: [{"step_number": 3, "new_description": "Select product B (highest rated)"}]
+        needs_full_replan: The remaining plan is fundamentally wrong and must
+            be regenerated from scratch. Mutually exclusive with
+            remaining_plan_still_valid (if this is True, that should be False).
+        replan_reason: Explanation of why a full replan is needed (None if not).
+        goal_already_achieved: The overall task goal has been satisfied early.
+            No more steps needed — skip remaining todos and finalize.
+    """
+
+    step_completed_successfully: bool = Field(
+        description="Whether the step achieved what it was asked to do"
+    )
+    key_information_learned: str = Field(
+        default="",
+        description="What new information this step revealed",
+    )
+    remaining_plan_still_valid: bool = Field(
+        default=True,
+        description="Whether the remaining pending todos still make sense given new information",
+    )
+    suggested_refinements: list[StepRefinement] | None = Field(
+        default=None,
+        description=(
+            "Structured updates to pending step descriptions based on new information. "
+            "Each entry specifies a step_number and new_description. "
+            "Applied directly — no separate replan needed."
+        ),
+    )
+    needs_full_replan: bool = Field(
+        default=False,
+        description="The remaining plan is fundamentally wrong and must be regenerated",
+    )
+    replan_reason: str | None = Field(
+        default=None,
+        description="Explanation of why a full replan is needed",
+    )
+    goal_already_achieved: bool = Field(
+        default=False,
+        description="The overall task goal has been satisfied early; no more steps needed",
+    )
+
+    @field_validator("suggested_refinements", mode="before")
+    @classmethod
+    def coerce_single_refinement_to_list(cls, v: object) -> object:
+        """Wrap a lone dict in a list; the LLM may return a single object."""
+        if isinstance(v, dict):
+            return [v]
+        return v
--- a/lib/crewai/src/crewai/utilities/reasoning_handler.py
+++ b/lib/crewai/src/crewai/utilities/reasoning_handler.py
@@ -1,10 +1,13 @@
+"""Handles planning/reasoning for agents before task execution."""
+
+from __future__ import annotations
+
 import json
 import logging
-from typing import Any, Final, Literal, cast
+from typing import TYPE_CHECKING, Any, Final, Literal, cast

 from pydantic import BaseModel, Field

-from crewai.agent import Agent
 from crewai.events.event_bus import crewai_event_bus
 from crewai.events.types.reasoning_events import (
    AgentReasoningCompletedEvent,
@@ -12,14 +15,24 @@ from crewai.events.types.reasoning_events import (
    AgentReasoningStartedEvent,
 )
 from crewai.llm import LLM
-from crewai.task import Task
+from crewai.utilities.llm_utils import create_llm
+from crewai.utilities.planning_types import PlanStep
 from crewai.utilities.string_utils import sanitize_tool_name


+if TYPE_CHECKING:
+    from crewai.agent import Agent
+    from crewai.agent.planning_config import PlanningConfig
+    from crewai.task import Task
+
+
 class ReasoningPlan(BaseModel):
    """Model representing a reasoning plan for a task."""

    plan: str = Field(description="The detailed reasoning plan for the task.")
+    steps: list[PlanStep] = Field(
+        default_factory=list, description="Structured steps to execute"
+    )
    ready: bool = Field(description="Whether the agent is ready to execute the task.")


@@ -29,24 +42,63 @@ class AgentReasoningOutput(BaseModel):
    plan: ReasoningPlan = Field(description="The reasoning plan for the task.")


+# Aliases for backward compatibility
+PlanningPlan = ReasoningPlan
+AgentPlanningOutput = AgentReasoningOutput
+
+
 FUNCTION_SCHEMA: Final[dict[str, Any]] = {
    "type": "function",
    "function": {
        "name": "create_reasoning_plan",
-        "description": "Create or refine a reasoning plan for a task",
+        "description": "Create or refine a reasoning plan for a task with structured steps",
        "parameters": {
            "type": "object",
            "properties": {
                "plan": {
                    "type": "string",
-                    "description": "The detailed reasoning plan for the task.",
+                    "description": "A brief summary of the overall plan.",
+                },
+                "steps": {
+                    "type": "array",
+                    "description": "List of discrete steps to execute the plan",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "step_number": {
+                                "type": "integer",
+                                "description": "Step number (1-based)",
+                            },
+                            "description": {
+                                "type": "string",
+                                "description": "What to do in this step",
+                            },
+                            "tool_to_use": {
+                                "type": ["string", "null"],
+                                "description": "Tool to use for this step, or null if no tool needed",
+                            },
+                            "depends_on": {
+                                "type": "array",
+                                "items": {"type": "integer"},
+                                "description": "Step numbers this step depends on (empty array if none)",
+                            },
+                        },
+                        "required": [
+                            "step_number",
+                            "description",
+                            "tool_to_use",
+                            "depends_on",
+                        ],
+                        "additionalProperties": False,
+                    },
                },
                "ready": {
                    "type": "boolean",
                    "description": "Whether the agent is ready to execute the task.",
                },
            },
-            "required": ["plan", "ready"],
+            "required": ["plan", "steps", "ready"],
+            "additionalProperties": False,
        },
    },
 }
@@ -54,41 +106,101 @@ FUNCTION_SCHEMA: Final[dict[str, Any]] = {

 class AgentReasoning:
    """
-    Handles the agent reasoning process, enabling an agent to reflect and create a plan
-    before executing a task.
+    Handles the agent planning/reasoning process, enabling an agent to reflect
+    and create a plan before executing a task.

    Attributes:
-        task: The task for which the agent is reasoning.
-        agent: The agent performing the reasoning.
-        llm: The language model used for reasoning.
+        task: The task for which the agent is planning (optional).
+        agent: The agent performing the planning.
+        config: The planning configuration.
+        llm: The language model used for planning.
        logger: Logger for logging events and errors.
+        description: Task description or input text for planning.
+        expected_output: Expected output description.
    """

-    def __init__(self, task: Task, agent: Agent) -> None:
-        """Initialize the AgentReasoning with a task and an agent.
+    def __init__(
+        self,
+        agent: Agent,
+        task: Task | None = None,
+        *,
+        description: str | None = None,
+        expected_output: str | None = None,
+    ) -> None:
+        """Initialize the AgentReasoning with an agent and optional task.

        Args:
-            task: The task for which the agent is reasoning.
-            agent: The agent performing the reasoning.
+            agent: The agent performing the planning.
+            task: The task for which the agent is planning (optional).
+            description: Task description or input text (used if task is None).
+            expected_output: Expected output (used if task is None).
        """
-        self.task = task
        self.agent = agent
-        self.llm = cast(LLM, agent.llm)
+        self.task = task
+        # Use task attributes if available, otherwise use provided values
+        self._description = description or (
+            task.description if task else "Complete the requested task"
+        )
+        self._expected_output = expected_output or (
+            task.expected_output if task else "Complete the task successfully"
+        )
+        self.config = self._get_planning_config()
+        self.llm = self._resolve_llm()
        self.logger = logging.getLogger(__name__)

-    def handle_agent_reasoning(self) -> AgentReasoningOutput:
-        """Public method for the reasoning process that creates and refines a plan for the task until the agent is ready to execute it.
+    @property
+    def description(self) -> str:
+        """Get the task/input description."""
+        return self._description
+
+    @property
+    def expected_output(self) -> str:
+        """Get the expected output."""
+        return self._expected_output
+
+    def _get_planning_config(self) -> PlanningConfig:
+        """Get the planning configuration from the agent.

        Returns:
-            AgentReasoningOutput: The output of the agent reasoning process.
+            The planning configuration, using defaults if not set.
        """
-        # Emit a reasoning started event (attempt 1)
+        from crewai.agent.planning_config import PlanningConfig
+
+        if self.agent.planning_config is not None:
+            return self.agent.planning_config
+        # Fallback for backward compatibility
+        return PlanningConfig(
+            max_attempts=getattr(self.agent, "max_reasoning_attempts", None),
+        )
+
+    def _resolve_llm(self) -> LLM:
+        """Resolve which LLM to use for planning.
+
+        Returns:
+            The LLM to use - either from config or the agent's LLM.
+        """
+        if self.config.llm is not None:
+            if isinstance(self.config.llm, LLM):
+                return self.config.llm
+            return create_llm(self.config.llm)
+        return cast(LLM, self.agent.llm)
+
+    def handle_agent_reasoning(self) -> AgentReasoningOutput:
+        """Public method for the planning process that creates and refines a plan
+        for the task until the agent is ready to execute it.
+
+        Returns:
+            AgentReasoningOutput: The output of the agent planning process.
+        """
+        task_id = str(self.task.id) if self.task else "kickoff"
+
+        # Emit a planning started event (attempt 1)
        try:
            crewai_event_bus.emit(
                self.agent,
                AgentReasoningStartedEvent(
                    agent_role=self.agent.role,
-                    task_id=str(self.task.id),
+                    task_id=task_id,
                    attempt=1,
                    from_task=self.task,
                ),
@@ -98,13 +210,13 @@ class AgentReasoning:
            pass

        try:
-            output = self.__handle_agent_reasoning()
+            output = self._execute_planning()

            crewai_event_bus.emit(
                self.agent,
                AgentReasoningCompletedEvent(
                    agent_role=self.agent.role,
-                    task_id=str(self.task.id),
+                    task_id=task_id,
                    plan=output.plan.plan,
                    ready=output.plan.ready,
                    attempt=1,
@@ -115,71 +227,76 @@ class AgentReasoning:

            return output
        except Exception as e:
-            # Emit reasoning failed event
+            # Emit planning failed event
            try:
                crewai_event_bus.emit(
                    self.agent,
                    AgentReasoningFailedEvent(
                        agent_role=self.agent.role,
-                        task_id=str(self.task.id),
+                        task_id=task_id,
                        error=str(e),
                        attempt=1,
                        from_task=self.task,
                        from_agent=self.agent,
                    ),
                )
-            except Exception as e:
-                logging.error(f"Error emitting reasoning failed event: {e}")
+            except Exception as event_error:
+                logging.error(f"Error emitting planning failed event: {event_error}")

            raise

-    def __handle_agent_reasoning(self) -> AgentReasoningOutput:
-        """Private method that handles the agent reasoning process.
+    def _execute_planning(self) -> AgentReasoningOutput:
+        """Execute the planning process.

        Returns:
-            The output of the agent reasoning process.
+            The output of the agent planning process.
        """
-        plan, ready = self.__create_initial_plan()
+        plan, steps, ready = self._create_initial_plan()
+        plan, steps, ready = self._refine_plan_if_needed(plan, steps, ready)

-        plan, ready = self.__refine_plan_if_needed(plan, ready)
-
-        reasoning_plan = ReasoningPlan(plan=plan, ready=ready)
+        reasoning_plan = ReasoningPlan(plan=plan, steps=steps, ready=ready)
        return AgentReasoningOutput(plan=reasoning_plan)

-    def __create_initial_plan(self) -> tuple[str, bool]:
-        """Creates the initial reasoning plan for the task.
+    def _create_initial_plan(self) -> tuple[str, list[PlanStep], bool]:
+        """Creates the initial plan for the task.

        Returns:
-            The initial plan and whether the agent is ready to execute the task.
+            A tuple of the plan summary, list of steps, and whether the agent is ready.
        """
-        reasoning_prompt = self.__create_reasoning_prompt()
+        planning_prompt = self._create_planning_prompt()

        if self.llm.supports_function_calling():
-            plan, ready = self.__call_with_function(reasoning_prompt, "initial_plan")
-            return plan, ready
-        response = _call_llm_with_reasoning_prompt(
-            llm=self.llm,
-            prompt=reasoning_prompt,
-            task=self.task,
-            reasoning_agent=self.agent,
-            backstory=self.__get_agent_backstory(),
-            plan_type="initial_plan",
+            plan, steps, ready = self._call_with_function(
+                planning_prompt, "create_plan"
+            )
+            return plan, steps, ready
+
+        response = self._call_llm_with_prompt(
+            prompt=planning_prompt,
+            plan_type="create_plan",
        )

-        return self.__parse_reasoning_response(str(response))
+        plan, ready = self._parse_planning_response(str(response))
+        return plan, [], ready  # No structured steps from text parsing

-    def __refine_plan_if_needed(self, plan: str, ready: bool) -> tuple[str, bool]:
-        """Refines the reasoning plan if the agent is not ready to execute the task.
+    def _refine_plan_if_needed(
+        self, plan: str, steps: list[PlanStep], ready: bool
+    ) -> tuple[str, list[PlanStep], bool]:
+        """Refines the plan if the agent is not ready to execute the task.

        Args:
-            plan: The current reasoning plan.
+            plan: The current plan.
+            steps: The current list of steps.
            ready: Whether the agent is ready to execute the task.

        Returns:
-            The refined plan and whether the agent is ready to execute the task.
+            The refined plan, steps, and whether the agent is ready to execute.
        """
+
        attempt = 1
-        max_attempts = self.agent.max_reasoning_attempts
+        max_attempts = self.config.max_attempts
+        task_id = str(self.task.id) if self.task else "kickoff"

        while not ready and (max_attempts is None or attempt < max_attempts):
+            current_attempt = attempt + 1  # recomputed per refinement attempt
            # Emit event for each refinement attempt
@@ -188,62 +305,81 @@ class AgentReasoning:
                    self.agent,
                    AgentReasoningStartedEvent(
                        agent_role=self.agent.role,
-                        task_id=str(self.task.id),
-                        attempt=attempt + 1,
+                        task_id=task_id,
+                        attempt=current_attempt,
                        from_task=self.task,
                    ),
                )
            except Exception:  # noqa: S110
                pass

-            refine_prompt = self.__create_refine_prompt(plan)
+            refine_prompt = self._create_refine_prompt(plan)

            if self.llm.supports_function_calling():
-                plan, ready = self.__call_with_function(refine_prompt, "refine_plan")
+                plan, steps, ready = self._call_with_function(
+                    refine_prompt, "refine_plan"
+                )
            else:
-                response = _call_llm_with_reasoning_prompt(
-                    llm=self.llm,
+                response = self._call_llm_with_prompt(
                    prompt=refine_prompt,
-                    task=self.task,
-                    reasoning_agent=self.agent,
-                    backstory=self.__get_agent_backstory(),
                    plan_type="refine_plan",
                )
-                plan, ready = self.__parse_reasoning_response(str(response))
+                plan, ready = self._parse_planning_response(str(response))
+                steps = []  # No structured steps from text parsing
+
+            # Emit completed event for this refinement attempt
+            try:
+                crewai_event_bus.emit(
+                    self.agent,
+                    AgentReasoningCompletedEvent(
+                        agent_role=self.agent.role,
+                        task_id=task_id,
+                        plan=plan,
+                        ready=ready,
+                        attempt=current_attempt,
+                        from_task=self.task,
+                        from_agent=self.agent,
+                    ),
+                )
+            except Exception:  # noqa: S110
+                pass

            attempt += 1

            if max_attempts is not None and attempt >= max_attempts:
                self.logger.warning(
-                    f"Agent reasoning reached maximum attempts ({max_attempts}) without being ready. Proceeding with current plan."
+                    f"Agent planning reached maximum attempts ({max_attempts}) "
+                    "without being ready. Proceeding with current plan."
                )
                break

-        return plan, ready
+        return plan, steps, ready

-    def __call_with_function(self, prompt: str, prompt_type: str) -> tuple[str, bool]:
-        """Calls the LLM with function calling to get a reasoning plan.
+    def _call_with_function(
+        self, prompt: str, plan_type: Literal["create_plan", "refine_plan"]
+    ) -> tuple[str, list[PlanStep], bool]:
+        """Calls the LLM with function calling to get a plan.

        Args:
            prompt: The prompt to send to the LLM.
-            prompt_type: The type of prompt (initial_plan or refine_plan).
+            plan_type: The type of plan being created.

        Returns:
-            A tuple containing the plan and whether the agent is ready.
+            A tuple containing the plan summary, list of steps, and whether the agent is ready.
        """
-        self.logger.debug(f"Using function calling for {prompt_type} reasoning")
+        self.logger.debug(f"Using function calling for {plan_type} planning")

        try:
-            system_prompt = self.agent.i18n.retrieve("reasoning", prompt_type).format(
-                role=self.agent.role,
-                goal=self.agent.goal,
-                backstory=self.__get_agent_backstory(),
-            )
+            system_prompt = self._get_system_prompt()

            # Prepare a simple callable that just returns the tool arguments as JSON
-            def _create_reasoning_plan(plan: str, ready: bool = True) -> str:
-                """Return the reasoning plan result in JSON string form."""
-                return json.dumps({"plan": plan, "ready": ready})
+            def _create_reasoning_plan(
+                plan: str,
+                steps: list[dict[str, Any]] | None = None,
+                ready: bool = True,
+            ) -> str:
+                """Return the planning result in JSON string form."""
+                return json.dumps({"plan": plan, "steps": steps or [], "ready": ready})

            response = self.llm.call(
                [
@@ -255,19 +391,33 @@ class AgentReasoning:
                from_task=self.task,
                from_agent=self.agent,
            )
-
-            self.logger.debug(f"Function calling response: {response[:100]}...")
-
            try:
                result = json.loads(response)
                if "plan" in result and "ready" in result:
-                    return result["plan"], result["ready"]
+                    # Parse steps one by one; a malformed entry is skipped, not fatal
+                    steps: list[PlanStep] = []
+                    raw_steps = result.get("steps") or []
+                    for step_data in raw_steps:
+                        try:
+                            step = PlanStep(
+                                step_number=step_data.get("step_number", 0),
+                                description=step_data.get("description", ""),
+                                tool_to_use=step_data.get("tool_to_use"),
+                                depends_on=step_data.get("depends_on", []),
+                            )
+                            steps.append(step)
+                        except Exception as step_error:
+                            self.logger.warning(
+                                f"Failed to parse step: {step_data}, error: {step_error}"
+                            )
+                    return result["plan"], steps, result["ready"]
            except (json.JSONDecodeError, KeyError):
                pass

            response_str = str(response)
            return (
                response_str,
+                [],
                "READY: I am ready to execute the task." in response_str,
            )

@@ -277,13 +427,7 @@ class AgentReasoning:
            )

            try:
-                system_prompt = self.agent.i18n.retrieve(
-                    "reasoning", prompt_type
-                ).format(
-                    role=self.agent.role,
-                    goal=self.agent.goal,
-                    backstory=self.__get_agent_backstory(),
-                )
+                system_prompt = self._get_system_prompt()

                fallback_response = self.llm.call(
                    [
@@ -297,78 +441,165 @@ class AgentReasoning:
                fallback_str = str(fallback_response)
                return (
                    fallback_str,
+                    [],
                    "READY: I am ready to execute the task." in fallback_str,
                )
            except Exception as inner_e:
                self.logger.error(f"Error during fallback text parsing: {inner_e!s}")
                return (
                    "Failed to generate a plan due to an error.",
+                    [],
                    True,
                )  # Default to ready to avoid getting stuck

-    def __get_agent_backstory(self) -> str:
-        """
-        Safely gets the agent's backstory, providing a default if not available.
+    def _call_llm_with_prompt(
+        self,
+        prompt: str,
+        plan_type: Literal["create_plan", "refine_plan"],
+    ) -> str:
+        """Calls the LLM with the planning prompt.
+
+        Args:
+            prompt: The prompt to send to the LLM.
+            plan_type: The type of plan being created.

        Returns:
-            str: The agent's backstory or a default value.
+            The LLM response.
+        """
+        system_prompt = self._get_system_prompt()
+
+        response = self.llm.call(
+            [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": prompt},
+            ],
+            from_task=self.task,
+            from_agent=self.agent,
+        )
+        return str(response)
+
+    def _get_system_prompt(self) -> str:
+        """Get the system prompt for planning.
+
+        Returns:
+            The system prompt, either custom or from i18n.
+        """
+        if self.config.system_prompt is not None:
+            return self.config.system_prompt
+
+        # Try new "planning" section first, fall back to "reasoning" for compatibility
+        try:
+            return self.agent.i18n.retrieve("planning", "system_prompt")
+        except (KeyError, AttributeError):
+            # Fallback to reasoning section for backward compatibility
+            return self.agent.i18n.retrieve("reasoning", "initial_plan").format(
+                role=self.agent.role,
+                goal=self.agent.goal,
+                backstory=self._get_agent_backstory(),
+            )
+
+    def _get_agent_backstory(self) -> str:
+        """Safely gets the agent's backstory, providing a default if not available.
+
+        Returns:
+            The agent's backstory or a default value.
        """
        return getattr(self.agent, "backstory", "No backstory provided")

-    def __create_reasoning_prompt(self) -> str:
-        """
-        Creates a prompt for the agent to reason about the task.
+    def _create_planning_prompt(self) -> str:
+        """Creates a prompt for the agent to plan the task.

        Returns:
-            str: The reasoning prompt.
+            The planning prompt.
        """
-        available_tools = self.__format_available_tools()
+        available_tools = self._format_available_tools()

-        return self.agent.i18n.retrieve("reasoning", "create_plan_prompt").format(
-            role=self.agent.role,
-            goal=self.agent.goal,
-            backstory=self.__get_agent_backstory(),
-            description=self.task.description,
-            expected_output=self.task.expected_output,
-            tools=available_tools,
-        )
+        # Use custom prompt if provided
+        if self.config.plan_prompt is not None:
+            return self.config.plan_prompt.format(
+                role=self.agent.role,
+                goal=self.agent.goal,
+                backstory=self._get_agent_backstory(),
+                description=self.description,
+                expected_output=self.expected_output,
+                tools=available_tools,
+                max_steps=self.config.max_steps,
+            )

-    def __format_available_tools(self) -> str:
-        """
-        Formats the available tools for inclusion in the prompt.
+        # Try new "planning" section first
+        try:
+            return self.agent.i18n.retrieve("planning", "create_plan_prompt").format(
+                description=self.description,
+                expected_output=self.expected_output,
+                tools=available_tools,
+                max_steps=self.config.max_steps,
+            )
+        except (KeyError, AttributeError):
+            # Fallback to reasoning section for backward compatibility
+            return self.agent.i18n.retrieve("reasoning", "create_plan_prompt").format(
+                role=self.agent.role,
+                goal=self.agent.goal,
+                backstory=self._get_agent_backstory(),
+                description=self.description,
+                expected_output=self.expected_output,
+                tools=available_tools,
+            )
+
+    def _format_available_tools(self) -> str:
+        """Formats the available tools for inclusion in the prompt.

        Returns:
-            str: Comma-separated list of tool names.
+            Comma-separated list of tool names.
        """
        try:
-            return ", ".join(
-                [sanitize_tool_name(tool.name) for tool in (self.task.tools or [])]
-            )
+            # Try task tools first, then agent tools
+            tools = []
+            if self.task:
+                tools = self.task.tools or []
+            if not tools:
+                tools = getattr(self.agent, "tools", []) or []
+            if not tools:
+                return "No tools available"
+            return ", ".join([sanitize_tool_name(tool.name) for tool in tools])
        except (AttributeError, TypeError):
            return "No tools available"

-    def __create_refine_prompt(self, current_plan: str) -> str:
-        """
-        Creates a prompt for the agent to refine its reasoning plan.
+    def _create_refine_prompt(self, current_plan: str) -> str:
+        """Creates a prompt for the agent to refine its plan.

        Args:
-            current_plan: The current reasoning plan.
+            current_plan: The current plan.

        Returns:
-            str: The refine prompt.
+            The refine prompt.
        """
-        return self.agent.i18n.retrieve("reasoning", "refine_plan_prompt").format(
-            role=self.agent.role,
-            goal=self.agent.goal,
-            backstory=self.__get_agent_backstory(),
-            current_plan=current_plan,
-        )
+        # Use custom prompt if provided
+        if self.config.refine_prompt is not None:
+            return self.config.refine_prompt.format(
+                role=self.agent.role,
+                goal=self.agent.goal,
+                backstory=self._get_agent_backstory(),
+                current_plan=current_plan,
+                max_steps=self.config.max_steps,
+            )
+
+        # Try new "planning" section first
+        try:
+            return self.agent.i18n.retrieve("planning", "refine_plan_prompt").format(
+                current_plan=current_plan,
+            )
+        except (KeyError, AttributeError):
+            # Fallback to reasoning section for backward compatibility
+            return self.agent.i18n.retrieve("reasoning", "refine_plan_prompt").format(
+                role=self.agent.role,
+                goal=self.agent.goal,
+                backstory=self._get_agent_backstory(),
+                current_plan=current_plan,
+            )

    @staticmethod
-    def __parse_reasoning_response(response: str) -> tuple[str, bool]:
-        """
-        Parses the reasoning response to extract the plan and whether
-        the agent is ready to execute the task.
+    def _parse_planning_response(response: str) -> tuple[str, bool]:
+        """Parses the planning response to extract the plan and readiness.

        Args:
            response: The LLM response.
@@ -380,25 +611,13 @@ class AgentReasoning:
            return "No plan was generated.", False

        plan = response
-        ready = False
-
-        if "READY: I am ready to execute the task." in response:
-            ready = True
+        ready = "READY: I am ready to execute the task." in response

        return plan, ready

-    def _handle_agent_reasoning(self) -> AgentReasoningOutput:
-        """
-        Deprecated method for backward compatibility.
-        Use handle_agent_reasoning() instead.

-        Returns:
-            AgentReasoningOutput: The output of the agent reasoning process.
-        """
-        self.logger.warning(
-            "The _handle_agent_reasoning method is deprecated. Use handle_agent_reasoning instead."
-        )
-        return self.handle_agent_reasoning()
+# Alias for backward compatibility
+AgentPlanning = AgentReasoning


 def _call_llm_with_reasoning_prompt(
@@ -409,7 +628,9 @@ def _call_llm_with_reasoning_prompt(
    backstory: str,
    plan_type: Literal["initial_plan", "refine_plan"],
 ) -> str:
-    """Calls the LLM with the reasoning prompt.
+    """Deprecated: Calls the LLM with the reasoning prompt.
+
+    This function is kept for backward compatibility.

    Args:
        llm: The language model to use.
@@ -417,7 +638,7 @@ def _call_llm_with_reasoning_prompt(
        task: The task for which the agent is reasoning.
        reasoning_agent: The agent performing the reasoning.
        backstory: The agent's backstory.
-        plan_type: The type of plan being created ("initial_plan" or "refine_plan").
+        plan_type: The type of plan being created.

    Returns:
        The LLM response.
--- a/lib/crewai/src/crewai/utilities/step_execution_context.py
+++ b/lib/crewai/src/crewai/utilities/step_execution_context.py
@@ -0,0 +1,64 @@
+"""Context and result types for isolated step execution in Plan-and-Execute architecture.
+
+These types mediate between the AgentExecutor (orchestrator) and StepExecutor (per-step worker).
+StepExecutionContext carries only final results from dependencies — never LLM message histories.
+StepResult carries only the outcome of a step — never internal execution traces.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass(frozen=True)
+class StepExecutionContext:
+    """Immutable context passed to a StepExecutor for a single todo.
+
+    Contains only the information the Executor needs to complete one step:
+    the task description, goal, and final results from dependency steps.
+    No LLM message history, no execution traces, no shared mutable state.
+
+    Attributes:
+        task_description: The original task description (from Task or kickoff input).
+        task_goal: The expected output / goal of the overall task.
+        dependency_results: Mapping of step_number → final result string
+            for all completed dependencies of the current step.
+    """
+
+    task_description: str
+    task_goal: str
+    dependency_results: dict[int, str] = field(default_factory=dict)
+
+    def get_dependency_result(self, step_number: int) -> str | None:
+        """Get the final result of a dependency step.
+
+        Args:
+            step_number: The step number to look up.
+
+        Returns:
+            The result string if available, None otherwise.
+        """
+        return self.dependency_results.get(step_number)
+
+
+@dataclass
+class StepResult:
+    """Result returned by a StepExecutor after executing a single todo.
+
+    Contains the final outcome and metadata for debugging/metrics.
+    Tool call details are for audit logging only — they are NOT passed
+    to subsequent steps or the Planner.
+
+    Attributes:
+        success: Whether the step completed successfully.
+        result: The final output string from the step.
+        error: Error message if the step failed (None on success).
+        tool_calls_made: List of tool names invoked (for debugging/logging only).
+        execution_time: Wall-clock time in seconds for the step execution.
+    """
+
+    success: bool
+    result: str
+    error: str | None = None
+    tool_calls_made: list[str] = field(default_factory=list)
+    execution_time: float = 0.0
--- a/lib/crewai/tests/agents/test_agent.py
+++ b/lib/crewai/tests/agents/test_agent.py
@@ -1456,7 +1456,7 @@ def test_agent_execute_task_with_tool():
    )

    result = agent.execute_task(task)
-    assert "you should always think about what to do" in result
+    assert "test query" in result


@pytest.mark.vcr()
@@ -1475,9 +1475,9 @@ def test_agent_execute_task_with_custom_llm():
    )

    result = agent.execute_task(task)
-    assert "In circuits they thrive" in result
-    assert "Artificial minds awake" in result
-    assert "Future's coded drive" in result
+    assert "Artificial minds" in result
+    assert "Code and circuits" in result
+    assert "Future undefined" in result


@pytest.mark.vcr()
--- a/lib/crewai/tests/agents/test_agent_executor.py
+++ b/lib/crewai/tests/agents/test_agent_executor.py
@@ -4,16 +4,27 @@ Tests the Flow-based agent executor implementation including state management,
 flow methods, routing logic, and error handling.
 """

+import asyncio
 import time
-from unittest.mock import Mock, patch
+from unittest.mock import AsyncMock, Mock, patch

 import pytest

+from crewai.agents.step_executor import StepExecutor
+from crewai.agents.planner_observer import PlannerObserver
 from crewai.experimental.agent_executor import (
    AgentReActState,
    AgentExecutor,
 )
 from crewai.agents.parser import AgentAction, AgentFinish
+from crewai.events.event_bus import crewai_event_bus
+from crewai.events.types.tool_usage_events import (
+    ToolUsageFinishedEvent,
+    ToolUsageStartedEvent,
+)
+from crewai.tools.tool_types import ToolResult
+from crewai.utilities.step_execution_context import StepExecutionContext
+from crewai.utilities.planning_types import TodoItem

 class TestAgentReActState:
    """Test AgentReActState Pydantic model."""
@@ -26,6 +37,18 @@ class TestAgentReActState:
        assert state.current_answer is None
        assert state.is_finished is False
        assert state.ask_for_human_input is False
+        # Planning state fields
+        assert state.plan is None
+        assert state.plan_ready is False
+
+    def test_state_with_plan(self):
+        """Test AgentReActState initialization with planning fields."""
+        state = AgentReActState(
+            plan="Step 1: Do X\nStep 2: Do Y",
+            plan_ready=True,
+        )
+        assert state.plan == "Step 1: Do X\nStep 2: Do Y"
+        assert state.plan_ready is True

    def test_state_with_values(self):
        """Test AgentReActState initialization with values."""
@@ -180,6 +203,88 @@ class TestAgentExecutor:
        assert result == "skipped"
        assert executor.state.is_finished is False

+    def test_finalize_skips_synthesis_for_strong_last_todo_result(
+        self, mock_dependencies
+    ):
+        """Finalize should skip synthesis when last todo is already a complete answer."""
+        with patch.object(AgentExecutor, "_show_logs") as mock_show_logs:
+            executor = AgentExecutor(**mock_dependencies)
+            executor.state.todos.items = [
+                TodoItem(
+                    step_number=1,
+                    description="Gather source details",
+                    tool_to_use="search_tool",
+                    status="completed",
+                    result="Source A and Source B identified.",
+                ),
+                TodoItem(
+                    step_number=2,
+                    description="Write final response",
+                    tool_to_use=None,
+                    status="completed",
+                    result=(
+                        "The final recommendation is to adopt a phased rollout plan with "
+                        "weekly checkpoints, explicit ownership, and a rollback path for "
+                        "each milestone. This approach keeps risk controlled while still "
+                        "moving quickly, and it aligns delivery metrics with stakeholder "
+                        "communication and operational readiness."
+                    ),
+                ),
+            ]
+
+            with patch.object(
+                executor, "_synthesize_final_answer_from_todos"
+            ) as mock_synthesize:
+                result = executor.finalize()
+
+            assert result == "completed"
+            assert isinstance(executor.state.current_answer, AgentFinish)
+            assert (
+                executor.state.current_answer.output
+                == executor.state.todos.items[1].result
+            )
+            assert executor.state.is_finished is True
+            mock_synthesize.assert_not_called()
+            mock_show_logs.assert_called_once()
+
+    def test_finalize_keeps_synthesis_when_response_model_is_set(
+        self, mock_dependencies
+    ):
+        """Finalize should still synthesize when response_model is configured."""
+        with patch.object(AgentExecutor, "_show_logs"):
+            executor = AgentExecutor(**mock_dependencies)
+            executor.response_model = Mock()
+            executor.state.todos.items = [
+                TodoItem(
+                    step_number=1,
+                    description="Write final response",
+                    tool_to_use=None,
+                    status="completed",
+                    result=(
+                        "This is already detailed prose with multiple sentences. "
+                        "It should still run synthesis because structured output "
+                        "was requested via response_model."
+                    ),
+                )
+            ]
+
+            def _set_current_answer() -> None:
+                executor.state.current_answer = AgentFinish(
+                    thought="Synthesized",
+                    output="structured-like-answer",
+                    text="structured-like-answer",
+                )
+
+            with patch.object(
+                executor,
+                "_synthesize_final_answer_from_todos",
+                side_effect=_set_current_answer,
+            ) as mock_synthesize:
+                result = executor.finalize()
+
+            assert result == "completed"
+            mock_synthesize.assert_called_once()
+
    def test_format_prompt(self, mock_dependencies):
        """Test prompt formatting."""
        executor = AgentExecutor(**mock_dependencies)
@@ -234,6 +339,113 @@ class TestAgentExecutor:
            AgentFinish(thought="thinking", output="test", text="final")
        )

+    @pytest.mark.asyncio
+    async def test_invoke_step_callback_async_inside_running_loop(
+        self, mock_dependencies
+    ):
+        """An async step callback is awaited without asyncio.run() inside a loop."""
+        step_cb = AsyncMock()
+        mock_dependencies["step_callback"] = step_cb
+        executor = AgentExecutor(**mock_dependencies)
+        finish = AgentFinish(thought="thinking", output="test", text="final")
+
+        with patch("crewai.experimental.agent_executor.asyncio.run") as run_spy:
+            executor._invoke_step_callback(finish)
+            await asyncio.sleep(0)  # yield control to the event loop once
+
+        run_spy.assert_not_called()
+        step_cb.assert_awaited_once_with(finish)
+
+
+    # Regression tests for critical plan-and-execute issues in StepExecutor.
+    # Not a separate class: that would re-parent every test method after it.
+
+    @pytest.fixture
+    def step_executor(self):
+        llm = Mock()
+        llm.supports_stop_words.return_value = True
+
+        agent = Mock()
+        agent.role = "Test Agent"
+        agent.goal = "Execute tasks"
+        agent.verbose = False
+        agent.key = "test-agent-key"
+
+        tool = Mock()
+        tool.name = "count_words"
+        task = Mock()
+        task.name = "test-task"
+        task.description = "test task description"
+
+        return StepExecutor(
+            llm=llm,
+            tools=[tool],
+            agent=agent,
+            original_tools=[],
+            tools_handler=Mock(),
+            task=task,
+            crew=Mock(),
+            function_calling_llm=None,
+            request_within_rpm_limit=None,
+            callbacks=[],
+        )
+
+    def test_step_executor_fails_when_expected_tool_is_not_called(self, step_executor):
+        """Step should fail if a configured expected tool is not actually invoked."""
+        todo = TodoItem(
+            step_number=1,
+            description="Count words in input text.",
+            tool_to_use="count_words",
+            depends_on=[],
+            status="pending",
+        )
+        context = StepExecutionContext(task_description="task", task_goal="goal")
+
+        with patch.object(step_executor, "_build_isolated_messages", return_value=[]):
+            with patch.object(
+                step_executor, "_execute_text_parsed", return_value="No tool used."
+            ):
+                result = step_executor.execute(todo, context)
+
+        assert result.success is False
+        assert result.error is not None
+        assert "Expected tool 'count_words' was not called" in result.error
+
+    def test_step_executor_text_tool_emits_usage_events(self, step_executor):
+        """Text-parsed tool execution should emit started and finished events."""
+        started_events: list[ToolUsageStartedEvent] = []
+        finished_events: list[ToolUsageFinishedEvent] = []
+
+        tool_name = "count_words"
+        action = AgentAction(
+            thought="Need a tool",
+            tool=tool_name,
+            tool_input='{"text":"hello world"}',
+            text="Action: count_words",
+        )
+
+        @crewai_event_bus.on(ToolUsageStartedEvent)
+        def _on_started(_source, event):
+            if event.tool_name == tool_name:
+                started_events.append(event)
+
+        @crewai_event_bus.on(ToolUsageFinishedEvent)
+        def _on_finished(_source, event):
+            if event.tool_name == tool_name:
+                finished_events.append(event)
+
+        with patch(
+            "crewai.agents.step_executor.execute_tool_and_check_finality",
+            return_value=ToolResult(result="2", result_as_answer=False),
+        ):
+            output = step_executor._execute_text_tool_with_events(action)
+
+        crewai_event_bus.flush()
+
+        assert output == "2"
+        assert len(started_events) >= 1
+        assert len(finished_events) >= 1
+
    @patch("crewai.experimental.agent_executor.handle_output_parser_exception")
    def test_recover_from_parser_error(
        self, mock_handle_exception, mock_dependencies
@@ -636,3 +848,696 @@ class TestNativeToolExecution:
        tool_messages = [m for m in executor.state.messages if m.get("role") == "tool"]
        assert len(tool_messages) == 1
        assert tool_messages[0]["tool_call_id"] == "call_1"
+
+    def test_check_native_todo_completion_requires_expected_tool_match(
+        self, mock_dependencies
+    ):
+        """A running todo is satisfied only when its expected tool was executed."""
+        from crewai.utilities.planning_types import TodoList
+
+        active_todo = TodoItem(
+            step_number=1,
+            description="Use the expected tool",
+            tool_to_use="expected_tool",
+            status="running",
+        )
+        executor = AgentExecutor(**mock_dependencies)
+        executor.state.todos = TodoList(items=[active_todo])
+
+        executor.state.last_native_tools_executed = ["other_tool"]
+        assert executor.check_native_todo_completion() == "todo_not_satisfied"
+        executor.state.last_native_tools_executed = ["expected_tool"]
+        assert executor.check_native_todo_completion() == "todo_satisfied"
+
+        active_todo.tool_to_use = None  # todo without an expected tool
+        executor.state.last_native_tools_executed = ["any_tool"]
+        assert executor.check_native_todo_completion() == "todo_not_satisfied"
+
+
+class TestPlannerObserver:
+    """PlannerObserver behaviour when the LLM call raises."""
+
+    def test_observe_fallback_is_conservative_on_llm_error(self):
+        """An LLM error during observe() yields a failed-step, full-replan verdict."""
+        failing_llm = Mock()
+        failing_llm.call.side_effect = RuntimeError("llm unavailable")
+
+        agent = Mock(
+            role="Observer Test Agent",
+            llm=failing_llm,
+            planning_config=None,
+        )
+        task = Mock(description="Test task", expected_output="Expected result")
+
+        step = TodoItem(
+            step_number=1,
+            description="Do something",
+            status="running",
+        )
+        verdict = PlannerObserver(agent=agent, task=task).observe(
+            completed_step=step,
+            result="Error: tool timeout",
+            all_completed=[],
+            remaining_todos=[],
+        )
+
+        expected_reason = "Observer failed to evaluate step result safely"
+        assert verdict.step_completed_successfully is False
+        assert verdict.remaining_plan_still_valid is False
+        assert verdict.needs_full_replan is True
+        assert verdict.replan_reason == expected_reason
+
+
+class TestAgentExecutorPlanning:
+    """Test planning functionality in AgentExecutor with real agent kickoff."""
+
+    @pytest.mark.vcr()
+    def test_agent_kickoff_with_planning_stores_plan_in_state(self):
+        """Test that Agent.kickoff() with planning enabled returns the correct answer."""
+        from crewai import Agent, PlanningConfig
+        from crewai.llm import LLM
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Assistant",
+            goal="Help solve simple math problems",
+            backstory="A helpful assistant that solves math problems step by step",
+            llm=llm,
+            planning_config=PlanningConfig(max_attempts=1),
+            verbose=False,
+        )
+
+        # Execute kickoff with a simple task
+        result = agent.kickoff("What is 2 + 2?")
+
+        # Verify result
+        assert result is not None
+        assert "4" in str(result)
+
+    @pytest.mark.vcr()
+    def test_agent_kickoff_without_planning_skips_plan_generation(self):
+        """Test that Agent.kickoff() without planning skips planning phase."""
+        from crewai import Agent
+        from crewai.llm import LLM
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Assistant",
+            goal="Help solve simple math problems",
+            backstory="A helpful assistant",
+            llm=llm,
+            # No planning_config = no planning
+            verbose=False,
+        )
+
+        # Execute kickoff
+        result = agent.kickoff("What is 3 + 3?")
+
+        # Verify we get a result
+        assert result is not None
+        assert "6" in str(result)
+
+    @pytest.mark.vcr()
+    def test_planning_disabled_skips_planning(self):
+        """Test that planning=False skips planning."""
+        from crewai import Agent
+        from crewai.llm import LLM
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Assistant",
+            goal="Help solve simple math problems",
+            backstory="A helpful assistant",
+            llm=llm,
+            planning=False,  # Explicitly disable planning
+            verbose=False,
+        )
+
+        result = agent.kickoff("What is 5 + 5?")
+
+        # Should still complete successfully
+        assert result is not None
+        assert "10" in str(result)
+
+    def test_backward_compat_reasoning_true_enables_planning(self):
+        """Test that reasoning=True (deprecated) still enables planning."""
+        import warnings
+        from crewai import Agent
+        from crewai.llm import LLM
+
+        llm = LLM("gpt-4o-mini")
+
+        with warnings.catch_warnings(record=True):
+            warnings.simplefilter("always")
+            agent = Agent(
+                role="Test Agent",
+                goal="Complete tasks",
+                backstory="A helpful agent",
+                llm=llm,
+                reasoning=True,  # Deprecated but should still work
+                verbose=False,
+            )
+
+        # Should have planning_config created from reasoning=True
+        assert agent.planning_config is not None
+        assert agent.planning_enabled is True
+
+    @pytest.mark.vcr()
+    def test_executor_state_contains_plan_after_planning(self):
+        """Test that executor state contains plan after planning phase."""
+        from crewai import Agent, PlanningConfig
+        from crewai.llm import LLM
+        from crewai.experimental.agent_executor import AgentExecutor
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Assistant",
+            goal="Help solve simple math problems",
+            backstory="A helpful assistant that solves math problems step by step",
+            llm=llm,
+            planning_config=PlanningConfig(max_attempts=1),
+            verbose=False,
+        )
+
+        # Track executor for inspection
+        executor_ref = [None]
+        original_invoke = AgentExecutor.invoke
+
+        def capture_executor(self, inputs):
+            executor_ref[0] = self
+            return original_invoke(self, inputs)
+
+        with patch.object(AgentExecutor, "invoke", capture_executor):
+            result = agent.kickoff("What is 7 + 7?")
+
+        # Verify result
+        assert result is not None
+
+        # The patched invoke must have run, otherwise nothing below is checked
+        assert executor_ref[0] is not None
+        # After planning, state should have plan info
+        assert hasattr(executor_ref[0].state, "plan")
+        assert hasattr(executor_ref[0].state, "plan_ready")
+
+    @pytest.mark.vcr()
+    def test_planning_creates_minimal_steps_for_multi_step_task(self):
+        """Test that planning creates steps and executes them for a multi-step task.
+
+        This task requires multiple dependent steps:
+        1. Identify the first 3 prime numbers (2, 3, 5)
+        2. Sum them (2 + 3 + 5 = 10)
+        3. Multiply by 2 (10 * 2 = 20)
+
+        The plan-and-execute architecture should produce step results.
+        """
+        from crewai import Agent, PlanningConfig
+        from crewai.llm import LLM
+        from crewai.experimental.agent_executor import AgentExecutor
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Tutor",
+            goal="Solve multi-step math problems accurately",
+            backstory="An expert math tutor who breaks down problems step by step",
+            llm=llm,
+            planning_config=PlanningConfig(max_attempts=1, max_steps=10),
+            verbose=False,
+        )
+
+        # Track the plan that gets generated
+        captured_plan = [None]
+        original_invoke = AgentExecutor.invoke
+
+        def capture_plan(self, inputs):
+            result = original_invoke(self, inputs)
+            captured_plan[0] = self.state.plan
+            return result
+
+        with patch.object(AgentExecutor, "invoke", capture_plan):
+            result = agent.kickoff(
+                "Calculate the sum of the first 3 prime numbers, then multiply that result by 2. "
+                "Show your work for each step."
+            )
+
+        # Verify we got a result with step outputs
+        assert result is not None
+        result_str = str(result)
+        # Should contain at least some mathematical content from the steps
+        assert "prime" in result_str.lower() or "2" in result_str or "10" in result_str
+
+        # Verify a plan was generated
+        assert captured_plan[0] is not None
+
+    @pytest.mark.vcr()
+    def test_planning_handles_sequential_dependency_task(self):
+        """Test planning for a task where step N depends on step N-1.
+
+        Task: Convert 100 Celsius to Fahrenheit, then round to nearest 10.
+        Step 1: Apply formula (C * 9/5 + 32) = 212
+        Step 2: Round 212 to nearest 10 = 210
+
+        This tests that the planner creates a plan and executes steps.
+        """
+        from crewai import Agent, PlanningConfig
+        from crewai.llm import LLM
+        from crewai.experimental.agent_executor import AgentExecutor
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Unit Converter",
+            goal="Accurately convert between units and apply transformations",
+            backstory="A precise unit conversion specialist",
+            llm=llm,
+            planning_config=PlanningConfig(max_attempts=1, max_steps=10),
+            verbose=False,
+        )
+
+        captured_plan = [None]
+        original_invoke = AgentExecutor.invoke
+
+        def capture_plan(self, inputs):
+            result = original_invoke(self, inputs)
+            captured_plan[0] = self.state.plan
+            return result
+
+        with patch.object(AgentExecutor, "invoke", capture_plan):
+            result = agent.kickoff(
+                "Convert 100 degrees Celsius to Fahrenheit, then round the result to the nearest 10."
+            )
+
+        assert result is not None
+        result_str = str(result)
+        # Should contain conversion-related content
+        assert "212" in result_str or "210" in result_str or "Fahrenheit" in result_str or "celsius" in result_str.lower()
+
+        # Plan should exist
+        assert captured_plan[0] is not None
+
+
+class TestResponseFormatWithKickoff:
+    """Test that Agent.kickoff(response_format=MyModel) returns structured output.
+
+    Real LLM calls via VCR cassettes. Tests both with and without planning,
+    using real tools for the planning case to exercise the full Plan-and-Execute
+    path including synthesis with response_model.
+    """
+
+    @pytest.mark.vcr()
+    def test_kickoff_response_format_without_planning(self):
+        """Test that kickoff(response_format) fills result.pydantic without a planning_config."""
+        from pydantic import BaseModel, Field
+        from crewai import Agent
+        from crewai.llm import LLM
+
+        class MathResult(BaseModel):
+            answer: int = Field(description="The numeric answer")
+            explanation: str = Field(description="Brief explanation of the solution")
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Assistant",
+            goal="Solve math problems and return structured results",
+            backstory="A precise math assistant that always returns structured data",
+            llm=llm,
+            verbose=False,
+        )
+
+        result = agent.kickoff("What is 15 + 27?", response_format=MathResult)
+
+        assert result is not None
+        assert result.pydantic is not None
+        assert isinstance(result.pydantic, MathResult)
+        assert result.pydantic.answer == 42
+        assert len(result.pydantic.explanation) > 0
+
+    @pytest.mark.vcr()
+    def test_kickoff_response_format_with_planning_and_tools(self):
+        """Test response_format with planning + tools (multi-step research).
+
+        This is the key test for _synthesize_final_answer_from_todos:
+        1. Planning generates steps that use the EXA search tool
+        2. StepExecutor runs each step in isolation with tool calls
+        3. The synthesis step produces a structured BaseModel output
+
+        The response_format should be respected by the synthesis LLM call,
+        NOT by intermediate step executions.
+        """
+        from pydantic import BaseModel, Field
+        from crewai import Agent, PlanningConfig
+        from crewai.llm import LLM
+        from crewai_tools import EXASearchTool
+
+        class ResearchSummary(BaseModel):
+            topic: str = Field(description="The research topic")
+            key_findings: list[str] = Field(description="List of 3-5 key findings")
+            conclusion: str = Field(description="A brief conclusion paragraph")
+
+        llm = LLM("gpt-4o-mini")
+        exa = EXASearchTool()
+
+        agent = Agent(
+            role="Research Analyst",
+            goal="Research topics using search tools and produce structured summaries",
+            backstory=(
+                "You are a research analyst who searches the web for information, "
+                "identifies key findings, and produces structured research summaries."
+            ),
+            llm=llm,
+            planning_config=PlanningConfig(max_attempts=1, max_steps=5),
+            tools=[exa],
+            verbose=False,
+        )
+
+        result = agent.kickoff(
+            "Research the current state of autonomous AI agents in 2025. "
+            "Search for recent developments, then summarize the key findings.",
+            response_format=ResearchSummary,
+        )
+
+        assert result is not None
+        # The synthesis step should have produced structured output
+        assert result.pydantic is not None
+        assert isinstance(result.pydantic, ResearchSummary)
+        # Only require non-empty fields; the exact text is not asserted
+        assert len(result.pydantic.topic) > 0
+        assert len(result.pydantic.key_findings) >= 1
+        assert len(result.pydantic.conclusion) > 0
+
+    @pytest.mark.vcr()
+    def test_kickoff_no_response_format_returns_raw_text(self):
+        """Test that kickoff without response_format leaves result.pydantic as None."""
+        from crewai import Agent
+        from crewai.llm import LLM
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Assistant",
+            goal="Solve math problems",
+            backstory="A helpful math assistant",
+            llm=llm,
+            verbose=False,
+        )
+
+        result = agent.kickoff("What is 10 + 10?")
+
+        assert result is not None
+        assert result.pydantic is None
+        assert "20" in str(result)
+
+
+class TestReasoningEffort:
+    """Test reasoning_effort levels in PlanningConfig.
+
+    - low:  observe() runs (validates step success), but skip decide/replan/refine
+    - medium: observe() runs, replan on failure only (mocked)
+    - high: full observation pipeline with decide/replan/refine/goal-achieved
+    """
+
+    @pytest.mark.vcr()
+    def test_reasoning_effort_low_skips_decide_and_replan(self):
+        """Low effort: observe runs but decide/replan/refine are never called.
+
+        Verifies that with reasoning_effort='low':
+        1. The agent produces a correct result
+        2. The observation phase still runs (observations are stored)
+        3. The decide_next_action/refine/replan pipeline is bypassed
+        """
+        from crewai import Agent, PlanningConfig
+        from crewai.llm import LLM
+        from crewai.experimental.agent_executor import AgentExecutor
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Tutor",
+            goal="Solve multi-step math problems accurately",
+            backstory="An expert math tutor who breaks down problems step by step",
+            llm=llm,
+            planning_config=PlanningConfig(
+                reasoning_effort="low",
+                max_attempts=1,
+                max_steps=10,
+            ),
+            verbose=False,
+        )
+
+        # Capture the executor to inspect state after execution
+        executor_ref = [None]
+        original_invoke = AgentExecutor.invoke
+
+        def capture_executor(self, inputs):
+            result = original_invoke(self, inputs)
+            executor_ref[0] = self
+            return result
+
+        with patch.object(AgentExecutor, "invoke", capture_executor):
+            result = agent.kickoff(
+                "What is the sum of the first 3 prime numbers (2, 3, 5)?"
+            )
+
+        assert result is not None
+        assert "10" in str(result)
+
+        executor = executor_ref[0]
+        assert executor is not None, "AgentExecutor.invoke was never called"
+        if executor.state.todos.items:  # observe() checks need at least one step
+            assert len(executor.state.observations) > 0, (
+                "Low effort should still run observe() to validate steps"
+            )
+
+            # Verify no replan was triggered
+            assert executor.state.replan_count == 0, (
+                "Low effort should never trigger replanning"
+            )
+
+            # Check execution log for reasoning_effort annotation
+            observation_logs = [
+                log for log in executor.state.execution_log
+                if log.get("type") == "observation"
+            ]
+            for log in observation_logs:
+                assert log.get("reasoning_effort") == "low"
+
+    @pytest.mark.vcr()
+    def test_reasoning_effort_high_runs_full_observation_pipeline(self):
+        """High effort: full observation pipeline with decide/replan/refine.
+
+        Verifies that with reasoning_effort='high':
+        1. The agent produces a correct result
+        2. Observations are stored
+        3. The full decide_next_action pipeline runs (the observation-driven
+           routing is exercised, even if it just routes to continue_plan)
+        """
+        from crewai import Agent, PlanningConfig
+        from crewai.llm import LLM
+        from crewai.experimental.agent_executor import AgentExecutor
+
+        llm = LLM("gpt-4o-mini")
+
+        agent = Agent(
+            role="Math Tutor",
+            goal="Solve multi-step math problems accurately",
+            backstory="An expert math tutor who breaks down problems step by step",
+            llm=llm,
+            planning_config=PlanningConfig(
+                reasoning_effort="high",
+                max_attempts=1,
+                max_steps=10,
+            ),
+            verbose=False,
+        )
+
+        executor_ref = [None]
+        original_invoke = AgentExecutor.invoke
+
+        def capture_executor(self, inputs):
+            result = original_invoke(self, inputs)
+            executor_ref[0] = self
+            return result
+
+        with patch.object(AgentExecutor, "invoke", capture_executor):
+            result = agent.kickoff(
+                "What is the sum of the first 3 prime numbers (2, 3, 5)?"
+            )
+
+        assert result is not None
+        assert "10" in str(result)
+
+        executor = executor_ref[0]
+        assert executor is not None, "AgentExecutor.invoke was never called"
+        if executor.state.todos.items:  # observe() checks need at least one step
+            assert len(executor.state.observations) > 0, (
+                "High effort should run observe() on every step"
+            )
+
+            # Check execution log shows high reasoning_effort
+            observation_logs = [
+                log for log in executor.state.execution_log
+                if log.get("type") == "observation"
+            ]
+            for log in observation_logs:
+                assert log.get("reasoning_effort") == "high"
+
+    def test_reasoning_effort_medium_replans_on_failure(self):
+        """Medium effort: replan triggered when observation reports failure.
+
+        This test mocks the PlannerObserver to simulate a failed step,
+        verifying that medium effort routes to replan_now on failure
+        but continues on success.
+        """
+        from crewai.experimental.agent_executor import AgentExecutor
+        from crewai.utilities.planning_types import (
+            StepObservation,
+            TodoItem,
+            TodoList,
+        )
+
+        # --- Build a minimal mock executor with medium effort ---
+        executor = Mock(spec=AgentExecutor)
+        executor.agent = Mock()
+        executor.agent.verbose = False
+        executor.agent.planning_config = Mock()
+        executor.agent.planning_config.reasoning_effort = "medium"
+
+        # Provide the real method under test (bound to our mock)
+        executor.handle_step_observed_medium = (
+            AgentExecutor.handle_step_observed_medium.__get__(executor)
+        )
+        executor._printer = Mock()
+
+        # --- Case 1: step succeeded → should return "continue_plan" ---
+        success_todo = TodoItem(
+            step_number=1,
+            description="Calculate something",
+            status="running",
+            result="42",
+        )
+        success_observation = StepObservation(
+            step_completed_successfully=True,
+            key_information_learned="Got the answer",
+            remaining_plan_still_valid=True,
+        )
+
+        # Set up state
+        todo_list = TodoList(items=[success_todo])
+        executor.state = Mock()
+        executor.state.todos = todo_list
+        executor.state.observations = {1: success_observation}
+
+        route = executor.handle_step_observed_medium()
+        assert route == "continue_plan", (
+            "Medium effort should continue on successful step"
+        )
+        assert success_todo.status == "completed"
+
+        # --- Case 2: step failed → should return "replan_now" ---
+        failed_todo = TodoItem(
+            step_number=2,
+            description="Divide by zero",
+            status="running",
+            result="Error: division by zero",
+        )
+        failed_observation = StepObservation(
+            step_completed_successfully=False,
+            key_information_learned="Division failed",
+            remaining_plan_still_valid=False,
+            needs_full_replan=True,
+            replan_reason="Step failed with error",
+        )
+
+        todo_list_2 = TodoList(items=[failed_todo])
+        executor.state.todos = todo_list_2
+        executor.state.observations = {2: failed_observation}
+        executor.state.last_replan_reason = None
+
+        route = executor.handle_step_observed_medium()
+        assert route == "replan_now", (
+            "Medium effort should trigger replan on failed step"
+        )
+        assert executor.state.last_replan_reason == "Step did not complete successfully"
+
+    def test_reasoning_effort_low_marks_complete_without_deciding(self):
+        """Low effort: mark_completed is called, decide_next_action is not.
+
+        Unit test verifying the low handler's behavior directly.
+        """
+        from crewai.experimental.agent_executor import AgentExecutor
+        from crewai.utilities.planning_types import TodoItem, TodoList
+
+        executor = Mock(spec=AgentExecutor)
+        executor.agent = Mock()
+        executor.agent.verbose = False
+        executor.agent.planning_config = Mock()
+        executor.agent.planning_config.reasoning_effort = "low"
+
+        # Bind the real method
+        executor.handle_step_observed_low = (
+            AgentExecutor.handle_step_observed_low.__get__(executor)
+        )
+        executor._printer = Mock()
+
+        todo = TodoItem(
+            step_number=1,
+            description="Do something",
+            status="running",
+            result="Done successfully",
+        )
+        todo_list = TodoList(items=[todo])
+        executor.state = Mock()
+        executor.state.todos = todo_list
+
+        route = executor.handle_step_observed_low()
+        assert route == "continue_plan"
+        assert todo.status == "completed"
+        assert todo.result == "Done successfully"
+
+    def test_planning_config_reasoning_effort_default_is_low(self):
+        """Verify PlanningConfig defaults reasoning_effort to 'low'."""
+        from crewai.agent.planning_config import PlanningConfig
+
+        config = PlanningConfig()
+        assert config.reasoning_effort == "low"
+
+    def test_planning_config_reasoning_effort_validation(self):
+        """Verify PlanningConfig rejects invalid reasoning_effort values."""
+        from pydantic import ValidationError
+        from crewai.agent.planning_config import PlanningConfig
+
+        with pytest.raises(ValidationError):
+            PlanningConfig(reasoning_effort="ultra")
+
+        # Valid values should work
+        for level in ("low", "medium", "high"):
+            config = PlanningConfig(reasoning_effort=level)
+            assert config.reasoning_effort == level
+
+    def test_get_reasoning_effort_reads_from_config(self):
+        """Verify _get_reasoning_effort reads from agent.planning_config."""
+        from crewai.experimental.agent_executor import AgentExecutor
+
+        executor = Mock(spec=AgentExecutor)
+        executor._get_reasoning_effort = (
+            AgentExecutor._get_reasoning_effort.__get__(executor)
+        )
+
+        # Case 1: planning_config with reasoning_effort set
+        executor.agent = Mock()
+        executor.agent.planning_config = Mock()
+        executor.agent.planning_config.reasoning_effort = "high"
+        assert executor._get_reasoning_effort() == "high"
+
+        # Case 2: no planning_config → defaults to "low"
+        executor.agent.planning_config = None
+        assert executor._get_reasoning_effort() == "low"
+
+        # Case 3: planning_config without reasoning_effort attr → defaults to "low"
+        executor.agent.planning_config = Mock(spec=[])
+        assert executor._get_reasoning_effort() == "low"
--- a/lib/crewai/tests/agents/test_agent_reasoning.py
+++ b/lib/crewai/tests/agents/test_agent_reasoning.py
@@ -1,240 +1,345 @@
-"""Tests for reasoning in agents."""
+"""Tests for planning/reasoning in agents."""

-import json
+import warnings

 import pytest

-from crewai import Agent, Task
+from crewai import Agent, PlanningConfig, Task
 from crewai.llm import LLM


-@pytest.fixture
-def mock_llm_responses():
-    """Fixture for mock LLM responses."""
-    return {
-        "ready": "I'll solve this simple math problem.\n\nREADY: I am ready to execute the task.\n\n",
-        "not_ready": "I need to think about derivatives.\n\nNOT READY: I need to refine my plan because I'm not sure about the derivative rules.",
-        "ready_after_refine": "I'll use the power rule for derivatives where d/dx(x^n) = n*x^(n-1).\n\nREADY: I am ready to execute the task.",
-        "execution": "4",
-    }
+# =============================================================================
+# Tests for PlanningConfig configuration (no LLM calls needed)
+# =============================================================================


-def test_agent_with_reasoning(mock_llm_responses):
-    """Test agent with reasoning."""
-    llm = LLM("gpt-3.5-turbo")
+def test_planning_config_default_values():
+    """Test PlanningConfig default values."""
+    config = PlanningConfig()
+
+    assert config.max_attempts is None
+    assert config.max_steps == 20
+    assert config.system_prompt is None
+    assert config.plan_prompt is None
+    assert config.refine_prompt is None
+    assert config.llm is None
+
+
+def test_planning_config_custom_values():
+    """Test PlanningConfig with custom values."""
+    config = PlanningConfig(
+        max_attempts=5,
+        max_steps=15,
+        system_prompt="Custom system",
+        plan_prompt="Custom plan: {description}",
+        refine_prompt="Custom refine: {current_plan}",
+        llm="gpt-4",
+    )
+
+    assert config.max_attempts == 5
+    assert config.max_steps == 15
+    assert config.system_prompt == "Custom system"
+    assert config.plan_prompt == "Custom plan: {description}"
+    assert config.refine_prompt == "Custom refine: {current_plan}"
+    assert config.llm == "gpt-4"
+
+
+def test_agent_with_planning_config_custom_prompts():
+    """Test agent with PlanningConfig using custom prompts."""
+    llm = LLM("gpt-4o-mini")
+
+    custom_system_prompt = "You are a specialized planner."
+    custom_plan_prompt = "Plan this task: {description}"
+
+    agent = Agent(
+        role="Test Agent",
+        goal="To test custom prompts",
+        backstory="I am a test agent.",
+        llm=llm,
+        planning_config=PlanningConfig(
+            system_prompt=custom_system_prompt,
+            plan_prompt=custom_plan_prompt,
+            max_steps=10,
+        ),
+        verbose=False,
+    )
+
+    # Just test that the agent is created properly
+    assert agent.planning_config is not None
+    assert agent.planning_config.system_prompt == custom_system_prompt
+    assert agent.planning_config.plan_prompt == custom_plan_prompt
+    assert agent.planning_config.max_steps == 10
+
+
+def test_agent_with_planning_config_disabled():
+    """Test agent with planning explicitly disabled via planning=False."""
+    llm = LLM("gpt-4o-mini")
+
+    agent = Agent(
+        role="Test Agent",
+        goal="To test disabled planning",
+        backstory="I am a test agent.",
+        llm=llm,
+        planning=False,
+        verbose=False,
+    )
+
+    # Planning should be disabled
+    assert agent.planning_enabled is False
+
+
+def test_planning_enabled_property():
+    """Test the planning_enabled property on Agent."""
+    llm = LLM("gpt-4o-mini")
+
+    # With planning=True
+    agent_with_planning = Agent(
+        role="Test Agent",
+        goal="Test",
+        backstory="Test",
+        llm=llm,
+        planning=True,
+    )
+    assert agent_with_planning.planning_enabled is True
+
+    # With planning=False
+    agent_disabled = Agent(
+        role="Test Agent",
+        goal="Test",
+        backstory="Test",
+        llm=llm,
+        planning=False,
+    )
+    assert agent_disabled.planning_enabled is False
+
+    # With neither planning nor planning_config passed
+    agent_no_planning = Agent(
+        role="Test Agent",
+        goal="Test",
+        backstory="Test",
+        llm=llm,
+    )
+    assert agent_no_planning.planning_enabled is False
+
+
+# =============================================================================
+# Tests for backward compatibility with reasoning=True (no LLM calls)
+# =============================================================================
+
+
+def test_agent_with_reasoning_backward_compat():
+    """Test agent with reasoning=True (backward compatibility)."""
+    llm = LLM("gpt-4o-mini")
+
+    # Expected to emit a deprecation warning; it is recorded here, not asserted
+    with warnings.catch_warnings(record=True):
+        warnings.simplefilter("always")
+        agent = Agent(
+            role="Test Agent",
+            goal="To test the reasoning feature",
+            backstory="I am a test agent created to verify the reasoning feature works correctly.",
+            llm=llm,
+            reasoning=True,
+            verbose=False,
+        )
+
+    # Should have created a PlanningConfig internally
+    assert agent.planning_config is not None
+    assert agent.planning_enabled is True
+
+
+def test_agent_with_reasoning_and_max_attempts_backward_compat():
+    """Test agent with reasoning=True and max_reasoning_attempts (backward compatibility)."""
+    llm = LLM("gpt-4o-mini")

    agent = Agent(
        role="Test Agent",
        goal="To test the reasoning feature",
-        backstory="I am a test agent created to verify the reasoning feature works correctly.",
+        backstory="I am a test agent.",
        llm=llm,
        reasoning=True,
-        verbose=True,
+        max_reasoning_attempts=5,
+        verbose=False,
    )

-    task = Task(
-        description="Simple math task: What's 2+2?",
-        expected_output="The answer should be a number.",
-        agent=agent,
-    )
-
-    agent.llm.call = lambda messages, *args, **kwargs: (
-        mock_llm_responses["ready"]
-        if any("create a detailed plan" in msg.get("content", "") for msg in messages)
-        else mock_llm_responses["execution"]
-    )
-
-    result = agent.execute_task(task)
-
-    assert result == mock_llm_responses["execution"]
-    assert "Reasoning Plan:" in task.description
+    # Should have created a PlanningConfig with max_attempts
+    assert agent.planning_config is not None
+    assert agent.planning_config.max_attempts == 5


-def test_agent_with_reasoning_not_ready_initially(mock_llm_responses):
-    """Test agent with reasoning that requires refinement."""
-    llm = LLM("gpt-3.5-turbo")
+# =============================================================================
+# Tests for Agent.kickoff() with planning (uses AgentExecutor)
+# =============================================================================
+
+
+@pytest.mark.vcr()
+def test_agent_kickoff_with_planning():
+    """Test Agent.kickoff() with planning enabled generates a plan."""
+    llm = LLM("gpt-4o-mini")

    agent = Agent(
-        role="Test Agent",
-        goal="To test the reasoning feature",
-        backstory="I am a test agent created to verify the reasoning feature works correctly.",
+        role="Math Assistant",
+        goal="Help solve math problems step by step",
+        backstory="A helpful math tutor",
        llm=llm,
-        reasoning=True,
-        max_reasoning_attempts=2,
-        verbose=True,
+        planning_config=PlanningConfig(max_attempts=1),
+        verbose=False,
    )

-    task = Task(
-        description="Complex math task: What's the derivative of x²?",
-        expected_output="The answer should be a mathematical expression.",
-        agent=agent,
-    )
+    result = agent.kickoff("What is 15 + 27?")

-    call_count = [0]
-
-    def mock_llm_call(messages, *args, **kwargs):
-        if any(
-            "create a detailed plan" in msg.get("content", "") for msg in messages
-        ) or any("refine your plan" in msg.get("content", "") for msg in messages):
-            call_count[0] += 1
-            if call_count[0] == 1:
-                return mock_llm_responses["not_ready"]
-            return mock_llm_responses["ready_after_refine"]
-        return "2x"
-
-    agent.llm.call = mock_llm_call
-
-    result = agent.execute_task(task)
-
-    assert result == "2x"
-    assert call_count[0] == 2  # Should have made 2 reasoning calls
-    assert "Reasoning Plan:" in task.description
+    assert result is not None
+    assert "42" in str(result)


-def test_agent_with_reasoning_max_attempts_reached():
-    """Test agent with reasoning that reaches max attempts without being ready."""
-    llm = LLM("gpt-3.5-turbo")
+@pytest.mark.vcr()
+def test_agent_kickoff_without_planning():
+    """Test Agent.kickoff() without planning skips plan generation."""
+    llm = LLM("gpt-4o-mini")

    agent = Agent(
-        role="Test Agent",
-        goal="To test the reasoning feature",
-        backstory="I am a test agent created to verify the reasoning feature works correctly.",
+        role="Math Assistant",
+        goal="Help solve math problems",
+        backstory="A helpful assistant",
        llm=llm,
-        reasoning=True,
-        max_reasoning_attempts=2,
-        verbose=True,
+        # No planning_config = no planning
+        verbose=False,
    )

-    task = Task(
-        description="Complex math task: Solve the Riemann hypothesis.",
-        expected_output="A proof or disproof of the hypothesis.",
-        agent=agent,
-    )
+    result = agent.kickoff("What is 8 * 7?")

-    call_count = [0]
-
-    def mock_llm_call(messages, *args, **kwargs):
-        if any(
-            "create a detailed plan" in msg.get("content", "") for msg in messages
-        ) or any("refine your plan" in msg.get("content", "") for msg in messages):
-            call_count[0] += 1
-            return f"Attempt {call_count[0]}: I need more time to think.\n\nNOT READY: I need to refine my plan further."
-        return "This is an unsolved problem in mathematics."
-
-    agent.llm.call = mock_llm_call
-
-    result = agent.execute_task(task)
-
-    assert result == "This is an unsolved problem in mathematics."
-    assert (
-        call_count[0] == 2
-    )  # Should have made exactly 2 reasoning calls (max_attempts)
-    assert "Reasoning Plan:" in task.description
+    assert result is not None
+    assert "56" in str(result)


-def test_agent_reasoning_error_handling():
-    """Test error handling during the reasoning process."""
-    llm = LLM("gpt-3.5-turbo")
+@pytest.mark.vcr()
+def test_agent_kickoff_with_planning_disabled():
+    """Test Agent.kickoff() with planning explicitly disabled via planning=False."""
+    llm = LLM("gpt-4o-mini")

    agent = Agent(
-        role="Test Agent",
-        goal="To test the reasoning feature",
-        backstory="I am a test agent created to verify the reasoning feature works correctly.",
+        role="Math Assistant",
+        goal="Help solve math problems",
+        backstory="A helpful assistant",
        llm=llm,
-        reasoning=True,
+        planning=False,  # Explicitly disable planning
+        verbose=False,
    )

-    task = Task(
-        description="Task that will cause an error",
-        expected_output="Output that will never be generated",
-        agent=agent,
-    )
+    result = agent.kickoff("What is 100 / 4?")

-    call_count = [0]
-
-    def mock_llm_call_error(*args, **kwargs):
-        call_count[0] += 1
-        if call_count[0] <= 2:  # First calls are for reasoning
-            raise Exception("LLM error during reasoning")
-        return "Fallback execution result"  # Return a value for task execution
-
-    agent.llm.call = mock_llm_call_error
-
-    result = agent.execute_task(task)
-
-    assert result == "Fallback execution result"
-    assert call_count[0] > 2  # Ensure we called the mock multiple times
+    assert result is not None
+    assert "25" in str(result)


-@pytest.mark.skip(reason="Test requires updates for native tool calling changes")
-def test_agent_with_function_calling():
-    """Test agent with reasoning using function calling."""
-    llm = LLM("gpt-3.5-turbo")
+@pytest.mark.vcr()
+def test_agent_kickoff_multi_step_task_with_planning():
+    """Test Agent.kickoff() with a multi-step task that benefits from planning."""
+    llm = LLM("gpt-4o-mini")

    agent = Agent(
-        role="Test Agent",
-        goal="To test the reasoning feature",
-        backstory="I am a test agent created to verify the reasoning feature works correctly.",
+        role="Math Tutor",
+        goal="Solve multi-step math problems",
+        backstory="An expert tutor who explains step by step",
        llm=llm,
-        reasoning=True,
-        verbose=True,
+        planning_config=PlanningConfig(max_attempts=1, max_steps=5),
+        verbose=False,
    )

-    task = Task(
-        description="Simple math task: What's 2+2?",
-        expected_output="The answer should be a number.",
-        agent=agent,
+    # Task requires: find primes, sum them, then double
+    result = agent.kickoff(
+        "Find the first 3 prime numbers, add them together, then multiply by 2."
    )

-    agent.llm.supports_function_calling = lambda: True
-
-    def mock_function_call(messages, *args, **kwargs):
-        if "tools" in kwargs:
-            return json.dumps(
-                {"plan": "I'll solve this simple math problem: 2+2=4.", "ready": True}
-            )
-        return "4"
-
-    agent.llm.call = mock_function_call
-
-    result = agent.execute_task(task)
-
-    assert result == "4"
-    assert "Reasoning Plan:" in task.description
-    assert "I'll solve this simple math problem: 2+2=4." in task.description
+    assert result is not None
+    # First 3 primes: 2, 3, 5 -> sum = 10 -> doubled = 20
+    assert "20" in str(result)


-@pytest.mark.skip(reason="Test requires updates for native tool calling changes")
-def test_agent_with_function_calling_fallback():
-    """Test agent with reasoning using function calling that falls back to text parsing."""
-    llm = LLM("gpt-3.5-turbo")
+# =============================================================================
+# Tests for Agent.execute_task() with planning (uses CrewAgentExecutor)
+# These test the legacy path via handle_reasoning()
+# =============================================================================
+
+
+@pytest.mark.vcr()
+def test_agent_execute_task_with_planning():
+    """Test Agent.execute_task() with planning via CrewAgentExecutor."""
+    llm = LLM("gpt-4o-mini")

    agent = Agent(
-        role="Test Agent",
-        goal="To test the reasoning feature",
-        backstory="I am a test agent created to verify the reasoning feature works correctly.",
+        role="Math Assistant",
+        goal="Help solve math problems",
+        backstory="A helpful math tutor",
        llm=llm,
-        reasoning=True,
-        verbose=True,
+        planning_config=PlanningConfig(max_attempts=1),
+        verbose=False,
    )

    task = Task(
-        description="Simple math task: What's 2+2?",
-        expected_output="The answer should be a number.",
+        description="What is 9 + 11?",
+        expected_output="A number",
        agent=agent,
    )

-    agent.llm.supports_function_calling = lambda: True
+    result = agent.execute_task(task)

-    def mock_function_call(messages, *args, **kwargs):
-        if "tools" in kwargs:
-            return "Invalid JSON that will trigger fallback. READY: I am ready to execute the task."
-        return "4"
+    assert result is not None
+    assert "20" in str(result)
+    # Planning should be appended to task description
+    assert "Planning:" in task.description

-    agent.llm.call = mock_function_call
+
+@pytest.mark.vcr()
+def test_agent_execute_task_without_planning():
+    """Test Agent.execute_task() without planning."""
+    llm = LLM("gpt-4o-mini")
+
+    agent = Agent(
+        role="Math Assistant",
+        goal="Help solve math problems",
+        backstory="A helpful assistant",
+        llm=llm,
+        verbose=False,
+    )
+
+    task = Task(
+        description="What is 12 * 3?",
+        expected_output="A number",
+        agent=agent,
+    )

    result = agent.execute_task(task)

-    assert result == "4"
-    assert "Reasoning Plan:" in task.description
-    assert "Invalid JSON that will trigger fallback" in task.description
+    assert result is not None
+    assert "36" in str(result)
+    # No planning should be added
+    assert "Planning:" not in task.description
+
+
+@pytest.mark.vcr()
+def test_agent_execute_task_with_planning_refine():
+    """Test Agent.execute_task() with planning that requires refinement."""
+    llm = LLM("gpt-4o-mini")
+
+    agent = Agent(
+        role="Math Tutor",
+        goal="Solve complex math problems step by step",
+        backstory="An expert tutor",
+        llm=llm,
+        planning_config=PlanningConfig(max_attempts=2),
+        verbose=False,
+    )
+
+    task = Task(
+        description="Calculate the area of a circle with radius 5 (use pi = 3.14)",
+        expected_output="The area as a number",
+        agent=agent,
+    )
+
+    result = agent.execute_task(task)
+
+    assert result is not None
+    # Area = pi * r^2 = 3.14 * 25 = 78.5
+    assert "78" in str(result) or "79" in str(result)
+    assert "Planning:" in task.description
--- a/lib/crewai/tests/agents/test_async_agent_executor.py
+++ b/lib/crewai/tests/agents/test_async_agent_executor.py
@@ -2,7 +2,7 @@

 import asyncio
 from typing import Any
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, Mock, patch

 import pytest

@@ -291,6 +291,46 @@ class TestAsyncAgentExecutor:
        assert max_concurrent > 1, f"Expected concurrent execution, max concurrent was {max_concurrent}"


+class TestInvokeStepCallback:
+    """Tests for _invoke_step_callback with sync and async callbacks."""
+
+    def test_invoke_step_callback_with_sync_callback(
+        self, executor: CrewAgentExecutor
+    ) -> None:
+        """Test that a sync step callback is called normally."""
+        callback = Mock()
+        executor.step_callback = callback
+        answer = AgentFinish(thought="thinking", output="test", text="final")
+
+        executor._invoke_step_callback(answer)
+
+        callback.assert_called_once_with(answer)
+
+    def test_invoke_step_callback_with_async_callback(
+        self, executor: CrewAgentExecutor
+    ) -> None:
+        """Test that an async step callback is invoked and asyncio.run is called once."""
+        async_callback = AsyncMock()
+        executor.step_callback = async_callback
+        answer = AgentFinish(thought="thinking", output="test", text="final")
+
+        with patch("crewai.agents.crew_agent_executor.asyncio.run") as mock_run:
+            executor._invoke_step_callback(answer)
+
+            async_callback.assert_called_once_with(answer)
+            mock_run.assert_called_once()
+
+    def test_invoke_step_callback_with_none(
+        self, executor: CrewAgentExecutor
+    ) -> None:
+        """Test that no error is raised when step_callback is None."""
+        executor.step_callback = None
+        answer = AgentFinish(thought="thinking", output="test", text="final")
+
+        # Should not raise
+        executor._invoke_step_callback(answer)
+
+
 class TestAsyncLLMResponseHelper:
    """Tests for aget_llm_response helper function."""

--- a/lib/crewai/tests/agents/test_lite_agent.py
+++ b/lib/crewai/tests/agents/test_lite_agent.py
@@ -359,17 +359,34 @@ def test_sets_flow_context_when_inside_flow():

@pytest.mark.vcr()
 def test_guardrail_is_called_using_string():
+    """Test that a guardrail triggers events and retries correctly.
+
+    Uses a callable guardrail that deterministically fails on the first
+    attempt and passes on the second. This tests the guardrail event
+    machinery (started/completed events, retry loop) without depending
+    on the LLM to comply with contradictory constraints.
+    """
    guardrail_events: dict[str, list] = defaultdict(list)
    from crewai.events.event_types import (
        LLMGuardrailCompletedEvent,
        LLMGuardrailStartedEvent,
    )

+    # Deterministic guardrail: fail first call, pass second
+    call_count = {"n": 0}
+
+    def fail_then_pass_guardrail(output):
+        call_count["n"] += 1
+        if call_count["n"] == 1:
+            return (False, "Missing required format — please use a numbered list")
+        return (True, output)
+
    agent = Agent(
        role="Sports Analyst",
-        goal="Gather information about the best soccer players",
-        backstory="""You are an expert at gathering and organizing information. You carefully collect details and present them in a structured way.""",
-        guardrail="""Only include Brazilian players, both women and men""",
+        goal="List the best soccer players",
+        backstory="You are an expert at gathering and organizing information.",
+        guardrail=fail_then_pass_guardrail,
+        guardrail_max_retries=3,
    )

    condition = threading.Condition()
@@ -388,7 +405,7 @@ def test_guardrail_is_called_using_string():
            guardrail_events["completed"].append(event)
            condition.notify()

-    result = agent.kickoff(messages="Top 10 best players in the world?")
+    result = agent.kickoff(messages="Top 5 best soccer players in the world?")

    with condition:
        success = condition.wait_for(
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalAnthropic.test_image_file[anthropic-claude-3-5-haiku-20241022].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalAnthropic.test_image_file[anthropic-claude-3-5-haiku-20241022].yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalAnthropic.test_mixed_files[anthropic-claude-3-5-haiku-20241022].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalAnthropic.test_mixed_files[anthropic-claude-3-5-haiku-20241022].yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalAnthropic.test_pdf_file[anthropic-claude-3-5-haiku-20241022].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalAnthropic.test_pdf_file[anthropic-claude-3-5-haiku-20241022].yaml
@@ -1,15 +1,9 @@
 interactions:
 - request:
    body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"\nCurrent
-      Task: What type of document is this?\n\nBegin! This is VERY important to you,
-      use the tools available and give your best Final Answer, your job depends on
-      it!\n\nThought:"},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stop_sequences":["\nObservation:"],"stream":false,"system":"You
+      Task: What type of document is this?\n\nProvide your complete response:"},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stop_sequences":["\nObservation:"],"stream":false,"system":"You
      are File Analyst. Expert at analyzing various file types.\nYour personal goal
-      is: Analyze and describe files accurately\nTo give my best complete final answer
-      to the task respond using the exact following format:\n\nThought: I now can
-      give a great answer\nFinal Answer: Your final answer must be the great and the
-      most complete as possible, it must be outcome described.\n\nI MUST use these
-      formats, my job depends on it!"}'
+      is: Analyze and describe files accurately"}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
@@ -22,7 +16,7 @@ interactions:
      connection:
      - keep-alive
      content-length:
-      - '1351'
+      - '950'
      content-type:
      - application/json
      host:
@@ -38,35 +32,35 @@ interactions:
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
-      - 0.71.1
+      - 0.73.0
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
-      - 3.12.10
+      - 3.13.3
      x-stainless-timeout:
      - NOT_GIVEN
    method: POST
    uri: https://api.anthropic.com/v1/messages
  response:
    body:
-      string: '{"model":"claude-3-5-haiku-20241022","id":"msg_01AcygCF93tRhc7A3bfXMqe7","type":"message","role":"assistant","content":[{"type":"text","text":"Thought:
-        I can see this is a PDF document, but the image appears to be completely white
-        or blank. Without any visible content, I cannot definitively determine the
-        specific type of document.\n\nFinal Answer: The document is a PDF file, but
-        the provided image shows a blank white page with no discernible content or
-        text. More information or a clearer image would be needed to identify the
-        precise type of document."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1750,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":89,"service_tier":"standard"}}'
+      string: '{"model":"claude-3-5-haiku-20241022","id":"msg_01C8ZkZMunUVDUDd8mh1r1We","type":"message","role":"assistant","content":[{"type":"text","text":"I
+        apologize, but the image appears to be completely blank or white. Without
+        any visible text, graphics, or distinguishing features, I cannot determine
+        the type of document. The file is a PDF, but the content page seems to be
+        empty or failed to render properly."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1658,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":58,"service_tier":"standard","inference_geo":"not_available"}}'
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
+      Content-Security-Policy:
+      - CSP-FILTERED
      Content-Type:
      - application/json
      Date:
-      - Fri, 23 Jan 2026 19:08:04 GMT
+      - Thu, 12 Feb 2026 19:30:55 GMT
      Server:
      - cloudflare
      Transfer-Encoding:
@@ -92,7 +86,7 @@ interactions:
      anthropic-ratelimit-requests-remaining:
      - '3999'
      anthropic-ratelimit-requests-reset:
-      - '2026-01-23T19:08:01Z'
+      - '2026-02-12T19:30:53Z'
      anthropic-ratelimit-tokens-limit:
      - ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
      anthropic-ratelimit-tokens-remaining:
@@ -106,7 +100,112 @@ interactions:
      strict-transport-security:
      - STS-XXX
      x-envoy-upstream-service-time:
-      - '2837'
+      - '2129'
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"\nCurrent
+      Task: What type of document is this?\n\nProvide your complete response:"},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stop_sequences":["\nObservation:"],"stream":false,"system":"You
+      are File Analyst. Expert at analyzing various file types.\nYour personal goal
+      is: Analyze and describe files accurately"}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      anthropic-version:
+      - '2023-06-01'
+      connection:
+      - keep-alive
+      content-length:
+      - '950'
+      content-type:
+      - application/json
+      host:
+      - api.anthropic.com
+      x-api-key:
+      - X-API-KEY-XXX
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 0.73.0
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+      x-stainless-timeout:
+      - NOT_GIVEN
+    method: POST
+    uri: https://api.anthropic.com/v1/messages
+  response:
+    body:
+      string: '{"model":"claude-3-5-haiku-20241022","id":"msg_013jb7edagayZxqGs6ioACyU","type":"message","role":"assistant","content":[{"type":"text","text":"I
+        apologize, but the image appears to be completely blank or white. There are
+        no visible contents or text that I can analyze to determine the type of document.
+        Without any discernible information, I cannot definitively state what type
+        of document this is."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1658,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":55,"service_tier":"standard","inference_geo":"not_available"}}'
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Security-Policy:
+      - CSP-FILTERED
+      Content-Type:
+      - application/json
+      Date:
+      - Thu, 12 Feb 2026 19:30:58 GMT
+      Server:
+      - cloudflare
+      Transfer-Encoding:
+      - chunked
+      X-Robots-Tag:
+      - none
+      anthropic-organization-id:
+      - ANTHROPIC-ORGANIZATION-ID-XXX
+      anthropic-ratelimit-input-tokens-limit:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-input-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-input-tokens-reset:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
+      anthropic-ratelimit-output-tokens-limit:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-output-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-output-tokens-reset:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
+      anthropic-ratelimit-requests-limit:
+      - '4000'
+      anthropic-ratelimit-requests-remaining:
+      - '3999'
+      anthropic-ratelimit-requests-reset:
+      - '2026-02-12T19:30:56Z'
+      anthropic-ratelimit-tokens-limit:
+      - ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-tokens-reset:
+      - ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
+      cf-cache-status:
+      - DYNAMIC
+      request-id:
+      - REQUEST-ID-XXX
+      strict-transport-security:
+      - STS-XXX
+      x-envoy-upstream-service-time:
+      - '2005'
    status:
      code: 200
      message: OK
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalAsync.test_async_agent_with_image.yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalAsync.test_async_agent_with_image.yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_audio_gemini.yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_audio_gemini.yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_image_openai.yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_image_openai.yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_pdf_anthropic.yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_pdf_anthropic.yaml
@@ -1,14 +1,9 @@
 interactions:
 - request:
    body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"\nCurrent
-      Task: What is this document?\n\nBegin! This is VERY important to you, use the
-      tools available and give your best Final Answer, your job depends on it!\n\nThought:"},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stop_sequences":["\nObservation:"],"stream":false,"system":"You
+      Task: What is this document?\n\nProvide your complete response:"},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stop_sequences":["\nObservation:"],"stream":false,"system":"You
      are File Analyst. Expert at analyzing various file types.\nYour personal goal
-      is: Analyze and describe files accurately\nTo give my best complete final answer
-      to the task respond using the exact following format:\n\nThought: I now can
-      give a great answer\nFinal Answer: Your final answer must be the great and the
-      most complete as possible, it must be outcome described.\n\nI MUST use these
-      formats, my job depends on it!"}'
+      is: Analyze and describe files accurately"}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
@@ -21,7 +16,7 @@ interactions:
      connection:
      - keep-alive
      content-length:
-      - '1343'
+      - '942'
      content-type:
      - application/json
      host:
@@ -37,34 +32,35 @@ interactions:
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
-      - 0.71.1
+      - 0.73.0
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
-      - 3.12.10
+      - 3.13.3
      x-stainless-timeout:
      - NOT_GIVEN
    method: POST
    uri: https://api.anthropic.com/v1/messages
  response:
    body:
-      string: '{"model":"claude-3-5-haiku-20241022","id":"msg_01XwAhfdaMxwTNzTy7YhmA5e","type":"message","role":"assistant","content":[{"type":"text","text":"Thought:
-        I can see this is a PDF document, but the image appears to be blank or completely
-        white. Without any visible text or content, I cannot determine the specific
-        type or purpose of this document.\n\nFinal Answer: The document appears to
-        be a blank white PDF page with no discernible text, images, or content visible.
-        It could be an empty document, a scanning error, or a placeholder file."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1748,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":88,"service_tier":"standard"}}'
+      string: '{"model":"claude-3-5-haiku-20241022","id":"msg_01RnyTYpTE9Dd8BfwyMfuwum","type":"message","role":"assistant","content":[{"type":"text","text":"I
+        apologize, but the image appears to be blank or completely white. Without
+        any visible text or content, I cannot determine the type or nature of the
+        document. If you intended to share a specific document, you may want to check
+        the file and try uploading it again."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1656,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":59,"service_tier":"standard","inference_geo":"not_available"}}'
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
+      Content-Security-Policy:
+      - CSP-FILTERED
      Content-Type:
      - application/json
      Date:
-      - Fri, 23 Jan 2026 19:08:19 GMT
+      - Thu, 12 Feb 2026 19:29:25 GMT
      Server:
      - cloudflare
      Transfer-Encoding:
@@ -90,7 +86,7 @@ interactions:
      anthropic-ratelimit-requests-remaining:
      - '3999'
      anthropic-ratelimit-requests-reset:
-      - '2026-01-23T19:08:16Z'
+      - '2026-02-12T19:29:23Z'
      anthropic-ratelimit-tokens-limit:
      - ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
      anthropic-ratelimit-tokens-remaining:
@@ -104,7 +100,111 @@ interactions:
      strict-transport-security:
      - STS-XXX
      x-envoy-upstream-service-time:
-      - '3114'
+      - '2072'
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"\nCurrent
+      Task: What is this document?\n\nProvide your complete response:"},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stop_sequences":["\nObservation:"],"stream":false,"system":"You
+      are File Analyst. Expert at analyzing various file types.\nYour personal goal
+      is: Analyze and describe files accurately"}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      anthropic-version:
+      - '2023-06-01'
+      connection:
+      - keep-alive
+      content-length:
+      - '942'
+      content-type:
+      - application/json
+      host:
+      - api.anthropic.com
+      x-api-key:
+      - X-API-KEY-XXX
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 0.73.0
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+      x-stainless-timeout:
+      - NOT_GIVEN
+    method: POST
+    uri: https://api.anthropic.com/v1/messages
+  response:
+    body:
+      string: '{"model":"claude-3-5-haiku-20241022","id":"msg_011J2La8KpjxAK255NsSpePY","type":"message","role":"assistant","content":[{"type":"text","text":"I
+        apologize, but the document appears to be a blank white page. No text, images,
+        or discernible content is visible in this PDF file. Without any readable information,
+        I cannot determine the type or purpose of this document."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1656,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":51,"service_tier":"standard","inference_geo":"not_available"}}'
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Security-Policy:
+      - CSP-FILTERED
+      Content-Type:
+      - application/json
+      Date:
+      - Thu, 12 Feb 2026 19:29:27 GMT
+      Server:
+      - cloudflare
+      Transfer-Encoding:
+      - chunked
+      X-Robots-Tag:
+      - none
+      anthropic-organization-id:
+      - ANTHROPIC-ORGANIZATION-ID-XXX
+      anthropic-ratelimit-input-tokens-limit:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-input-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-input-tokens-reset:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
+      anthropic-ratelimit-output-tokens-limit:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-output-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-output-tokens-reset:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
+      anthropic-ratelimit-requests-limit:
+      - '4000'
+      anthropic-ratelimit-requests-remaining:
+      - '3999'
+      anthropic-ratelimit-requests-reset:
+      - '2026-02-12T19:29:26Z'
+      anthropic-ratelimit-tokens-limit:
+      - ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-tokens-reset:
+      - ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
+      cf-cache-status:
+      - DYNAMIC
+      request-id:
+      - REQUEST-ID-XXX
+      strict-transport-security:
+      - STS-XXX
+      x-envoy-upstream-service-time:
+      - '1802'
    status:
      code: 200
      message: OK
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_pdf_openai_responses.yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_pdf_openai_responses.yaml
@@ -1,14 +1,9 @@
 interactions:
 - request:
    body: '{"input":[{"role":"user","content":[{"type":"input_text","text":"\nCurrent
-      Task: What is this document?\n\nBegin! This is VERY important to you, use the
-      tools available and give your best Final Answer, your job depends on it!\n\nThought:"},{"type":"input_file","filename":"document.pdf","file_data":"data:application/pdf;base64,JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="}]}],"model":"gpt-4o-mini","instructions":"You
+      Task: What is this document?\n\nProvide your complete response:"},{"type":"input_file","filename":"document.pdf","file_data":"data:application/pdf;base64,JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="}]}],"model":"gpt-4o-mini","instructions":"You
      are File Analyst. Expert at analyzing various file types.\nYour personal goal
-      is: Analyze and describe files accurately\nTo give my best complete final answer
-      to the task respond using the exact following format:\n\nThought: I now can
-      give a great answer\nFinal Answer: Your final answer must be the great and the
-      most complete as possible, it must be outcome described.\n\nI MUST use these
-      formats, my job depends on it!"}'
+      is: Analyze and describe files accurately"}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
@@ -21,7 +16,7 @@ interactions:
      connection:
      - keep-alive
      content-length:
-      - '1235'
+      - '834'
      content-type:
      - application/json
      host:
@@ -43,47 +38,37 @@ interactions:
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
-      - 3.12.10
+      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/responses
  response:
    body:
-      string: "{\n  \"id\": \"resp_059d23bc71d450aa006973c72416788197bddcc99157e3a313\",\n
-        \ \"object\": \"response\",\n  \"created_at\": 1769195300,\n  \"status\":
+      string: "{\n  \"id\": \"resp_0751868929a7aa7500698e2a23d5508194b8e4092ff79a8f41\",\n
+        \ \"object\": \"response\",\n  \"created_at\": 1770924579,\n  \"status\":
        \"completed\",\n  \"background\": false,\n  \"billing\": {\n    \"payer\":
-        \"developer\"\n  },\n  \"completed_at\": 1769195307,\n  \"error\": null,\n
+        \"developer\"\n  },\n  \"completed_at\": 1770924581,\n  \"error\": null,\n
        \ \"frequency_penalty\": 0.0,\n  \"incomplete_details\": null,\n  \"instructions\":
        \"You are File Analyst. Expert at analyzing various file types.\\nYour personal
-        goal is: Analyze and describe files accurately\\nTo give my best complete
-        final answer to the task respond using the exact following format:\\n\\nThought:
-        I now can give a great answer\\nFinal Answer: Your final answer must be the
-        great and the most complete as possible, it must be outcome described.\\n\\nI
-        MUST use these formats, my job depends on it!\",\n  \"max_output_tokens\":
+        goal is: Analyze and describe files accurately\",\n  \"max_output_tokens\":
        null,\n  \"max_tool_calls\": null,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
-        \ \"output\": [\n    {\n      \"id\": \"msg_059d23bc71d450aa006973c724b1d881979787b0eeb53bdbd2\",\n
+        \ \"output\": [\n    {\n      \"id\": \"msg_0751868929a7aa7500698e2a2474208194a7ea7e8d1179c3fa\",\n
        \     \"type\": \"message\",\n      \"status\": \"completed\",\n      \"content\":
        [\n        {\n          \"type\": \"output_text\",\n          \"annotations\":
-        [],\n          \"logprobs\": [],\n          \"text\": \"Thought: I now can
-        give a great answer.  \\nFinal Answer: Without access to a specific document
-        or its contents, I cannot provide a detailed analysis. However, in general,
-        important aspects of a document can include its format (such as PDF, DOCX,
-        or TXT), purpose (such as legal, informative, or persuasive), and key elements
-        like headings, text structure, and any embedded media (such as images or charts).
-        For a thorough analysis, it's essential to understand the context, audience,
-        and intended use of the document. If you can provide the document itself or
-        more context about it, I would be able to give a complete assessment.\"\n
-        \       }\n      ],\n      \"role\": \"assistant\"\n    }\n  ],\n  \"parallel_tool_calls\":
-        true,\n  \"presence_penalty\": 0.0,\n  \"previous_response_id\": null,\n  \"prompt_cache_key\":
-        null,\n  \"prompt_cache_retention\": null,\n  \"reasoning\": {\n    \"effort\":
-        null,\n    \"summary\": null\n  },\n  \"safety_identifier\": null,\n  \"service_tier\":
-        \"default\",\n  \"store\": true,\n  \"temperature\": 1.0,\n  \"text\": {\n
-        \   \"format\": {\n      \"type\": \"text\"\n    },\n    \"verbosity\": \"medium\"\n
-        \ },\n  \"tool_choice\": \"auto\",\n  \"tools\": [],\n  \"top_logprobs\":
-        0,\n  \"top_p\": 1.0,\n  \"truncation\": \"disabled\",\n  \"usage\": {\n    \"input_tokens\":
-        137,\n    \"input_tokens_details\": {\n      \"cached_tokens\": 0\n    },\n
-        \   \"output_tokens\": 132,\n    \"output_tokens_details\": {\n      \"reasoning_tokens\":
-        0\n    },\n    \"total_tokens\": 269\n  },\n  \"user\": null,\n  \"metadata\":
-        {}\n}"
+        [],\n          \"logprobs\": [],\n          \"text\": \"It seems that you
+        have not uploaded any document or file for analysis. Please provide the file
+        you'd like me to review, and I'll be happy to help you with the analysis and
+        description.\"\n        }\n      ],\n      \"role\": \"assistant\"\n    }\n
+        \ ],\n  \"parallel_tool_calls\": true,\n  \"presence_penalty\": 0.0,\n  \"previous_response_id\":
+        null,\n  \"prompt_cache_key\": null,\n  \"prompt_cache_retention\": null,\n
+        \ \"reasoning\": {\n    \"effort\": null,\n    \"summary\": null\n  },\n  \"safety_identifier\":
+        null,\n  \"service_tier\": \"default\",\n  \"store\": true,\n  \"temperature\":
+        1.0,\n  \"text\": {\n    \"format\": {\n      \"type\": \"text\"\n    },\n
+        \   \"verbosity\": \"medium\"\n  },\n  \"tool_choice\": \"auto\",\n  \"tools\":
+        [],\n  \"top_logprobs\": 0,\n  \"top_p\": 1.0,\n  \"truncation\": \"disabled\",\n
+        \ \"usage\": {\n    \"input_tokens\": 51,\n    \"input_tokens_details\": {\n
+        \     \"cached_tokens\": 0\n    },\n    \"output_tokens\": 38,\n    \"output_tokens_details\":
+        {\n      \"reasoning_tokens\": 0\n    },\n    \"total_tokens\": 89\n  },\n
+        \ \"user\": null,\n  \"metadata\": {}\n}"
    headers:
      CF-RAY:
      - CF-RAY-XXX
@@ -92,11 +77,9 @@ interactions:
      Content-Type:
      - application/json
      Date:
-      - Fri, 23 Jan 2026 19:08:27 GMT
+      - Thu, 12 Feb 2026 19:29:41 GMT
      Server:
      - cloudflare
-      Set-Cookie:
-      - SET-COOKIE-XXX
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
@@ -110,13 +93,132 @@ interactions:
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
-      - '7347'
+      - '1581'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
-      x-envoy-upstream-service-time:
-      - '7350'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"input":[{"role":"user","content":[{"type":"input_text","text":"\nCurrent
+      Task: What is this document?\n\nProvide your complete response:"},{"type":"input_file","filename":"document.pdf","file_data":"data:application/pdf;base64,JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="}]}],"model":"gpt-4o-mini","instructions":"You
+      are File Analyst. Expert at analyzing various file types.\nYour personal goal
+      is: Analyze and describe files accurately"}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '834'
+      content-type:
+      - application/json
+      cookie:
+      - COOKIE-XXX
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/responses
+  response:
+    body:
+      string: "{\n  \"id\": \"resp_0c3ca22d310deec300698e2a25842881929a9aad25ea18eb77\",\n
+        \ \"object\": \"response\",\n  \"created_at\": 1770924581,\n  \"status\":
+        \"completed\",\n  \"background\": false,\n  \"billing\": {\n    \"payer\":
+        \"developer\"\n  },\n  \"completed_at\": 1770924582,\n  \"error\": null,\n
+        \ \"frequency_penalty\": 0.0,\n  \"incomplete_details\": null,\n  \"instructions\":
+        \"You are File Analyst. Expert at analyzing various file types.\\nYour personal
+        goal is: Analyze and describe files accurately\",\n  \"max_output_tokens\":
+        null,\n  \"max_tool_calls\": null,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"output\": [\n    {\n      \"id\": \"msg_0c3ca22d310deec300698e2a26058081929351f3632bd1aa8e\",\n
+        \     \"type\": \"message\",\n      \"status\": \"completed\",\n      \"content\":
+        [\n        {\n          \"type\": \"output_text\",\n          \"annotations\":
+        [],\n          \"logprobs\": [],\n          \"text\": \"Please upload the
+        document you would like me to analyze, and I'll provide you with a detailed
+        description and analysis of its contents.\"\n        }\n      ],\n      \"role\":
+        \"assistant\"\n    }\n  ],\n  \"parallel_tool_calls\": true,\n  \"presence_penalty\":
+        0.0,\n  \"previous_response_id\": null,\n  \"prompt_cache_key\": null,\n  \"prompt_cache_retention\":
+        null,\n  \"reasoning\": {\n    \"effort\": null,\n    \"summary\": null\n
+        \ },\n  \"safety_identifier\": null,\n  \"service_tier\": \"default\",\n  \"store\":
+        true,\n  \"temperature\": 1.0,\n  \"text\": {\n    \"format\": {\n      \"type\":
+        \"text\"\n    },\n    \"verbosity\": \"medium\"\n  },\n  \"tool_choice\":
+        \"auto\",\n  \"tools\": [],\n  \"top_logprobs\": 0,\n  \"top_p\": 1.0,\n  \"truncation\":
+        \"disabled\",\n  \"usage\": {\n    \"input_tokens\": 51,\n    \"input_tokens_details\":
+        {\n      \"cached_tokens\": 0\n    },\n    \"output_tokens\": 26,\n    \"output_tokens_details\":
+        {\n      \"reasoning_tokens\": 0\n    },\n    \"total_tokens\": 77\n  },\n
+        \ \"user\": null,\n  \"metadata\": {}\n}"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Thu, 12 Feb 2026 19:29:42 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '870'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_text_gemini.yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_text_gemini.yaml
@@ -1,16 +1,11 @@
 interactions:
 - request:
-    body: '{"contents": [{"parts": [{"text": "\nCurrent Task: Summarize this text.\n\nBegin!
-      This is VERY important to you, use the tools available and give your best Final
-      Answer, your job depends on it!\n\nThought:"}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
+    body: '{"contents": [{"parts": [{"text": "\nCurrent Task: Summarize this text.\n\nProvide
+      your complete response:"}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
      "mimeType": "text/plain"}}], "role": "user"}], "systemInstruction": {"parts":
      [{"text": "You are File Analyst. Expert at analyzing various file types.\nYour
-      personal goal is: Analyze and describe files accurately\nTo give my best complete
-      final answer to the task respond using the exact following format:\n\nThought:
-      I now can give a great answer\nFinal Answer: Your final answer must be the great
-      and the most complete as possible, it must be outcome described.\n\nI MUST use
-      these formats, my job depends on it!"}], "role": "user"}, "generationConfig":
-      {"stopSequences": ["\nObservation:"]}}'
+      personal goal is: Analyze and describe files accurately"}], "role": "user"},
+      "generationConfig": {"stopSequences": ["\nObservation:"]}}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
@@ -21,13 +16,13 @@ interactions:
      connection:
      - keep-alive
      content-length:
-      - '1619'
+      - '1218'
      content-type:
      - application/json
      host:
      - generativelanguage.googleapis.com
      x-goog-api-client:
-      - google-genai-sdk/1.49.0 gl-python/3.12.10
+      - google-genai-sdk/1.49.0 gl-python/3.13.3
      x-goog-api-key:
      - X-GOOG-API-KEY-XXX
    method: POST
@@ -35,34 +30,101 @@ interactions:
  response:
    body:
      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
-        [\n          {\n            \"text\": \"Thought: This text provides guidelines
-        for giving effective feedback. I need to summarize these guidelines in a clear
-        and concise manner.\\n\\nFinal Answer: The text outlines eight guidelines
-        for providing effective feedback: be clear and concise, focus on behavior
-        and outcomes, be specific with examples, balance positive aspects with areas
-        for improvement, be respectful and constructive by offering solutions, use
-        objective criteria, suggest actionable next steps, and proofread for tone,
-        grammar, and clarity before submission. These guidelines aim to ensure feedback
-        is easily understood, impactful, and geared towards positive growth.\\n\"\n
-        \         }\n        ],\n        \"role\": \"model\"\n      },\n      \"finishReason\":
-        \"STOP\",\n      \"avgLogprobs\": -0.24753604923282657\n    }\n  ],\n  \"usageMetadata\":
-        {\n    \"promptTokenCount\": 252,\n    \"candidatesTokenCount\": 111,\n    \"totalTokenCount\":
-        363,\n    \"promptTokensDetails\": [\n      {\n        \"modality\": \"TEXT\",\n
-        \       \"tokenCount\": 252\n      }\n    ],\n    \"candidatesTokensDetails\":
-        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 111\n
+        [\n          {\n            \"text\": \"The text provides guidelines for giving
+        effective feedback. Key principles include being clear, focusing on behavior
+        and outcomes with specific examples, balancing positive and constructive criticism,
+        remaining respectful, using objective criteria, suggesting actionable next
+        steps, and proofreading for clarity and tone. In essence, feedback should
+        be easily understood, objective, and geared towards improvement.\\n\"\n          }\n
+        \       ],\n        \"role\": \"model\"\n      },\n      \"finishReason\":
+        \"STOP\",\n      \"avgLogprobs\": -0.24900928895864913\n    }\n  ],\n  \"usageMetadata\":
+        {\n    \"promptTokenCount\": 163,\n    \"candidatesTokenCount\": 67,\n    \"totalTokenCount\":
+        230,\n    \"promptTokensDetails\": [\n      {\n        \"modality\": \"TEXT\",\n
+        \       \"tokenCount\": 163\n      }\n    ],\n    \"candidatesTokensDetails\":
+        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 67\n
        \     }\n    ]\n  },\n  \"modelVersion\": \"gemini-2.0-flash\",\n  \"responseId\":
-        \"88lzae_VGaGOjMcPxNCokQI\"\n}\n"
+        \"SDSOaae8LLzRjMcPptjXkQ4\"\n}\n"
    headers:
      Alt-Svc:
      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
      Content-Type:
      - application/json; charset=UTF-8
      Date:
-      - Fri, 23 Jan 2026 19:20:20 GMT
+      - Thu, 12 Feb 2026 20:12:58 GMT
      Server:
      - scaffolding on HTTPServer2
      Server-Timing:
-      - gfet4t7; dur=1200
+      - gfet4t7; dur=1742
+      Transfer-Encoding:
+      - chunked
+      Vary:
+      - Origin
+      - X-Origin
+      - Referer
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      X-Frame-Options:
+      - X-FRAME-OPTIONS-XXX
+      X-XSS-Protection:
+      - '0'
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"contents": [{"parts": [{"text": "\nCurrent Task: Summarize this text.\n\nProvide
+      your complete response:"}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
+      "mimeType": "text/plain"}}], "role": "user"}], "systemInstruction": {"parts":
+      [{"text": "You are File Analyst. Expert at analyzing various file types.\nYour
+      personal goal is: Analyze and describe files accurately"}], "role": "user"},
+      "generationConfig": {"stopSequences": ["\nObservation:"]}}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - '*/*'
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1218'
+      content-type:
+      - application/json
+      host:
+      - generativelanguage.googleapis.com
+      x-goog-api-client:
+      - google-genai-sdk/1.49.0 gl-python/3.13.3
+      x-goog-api-key:
+      - X-GOOG-API-KEY-XXX
+    method: POST
+    uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
+  response:
+    body:
+      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
+        [\n          {\n            \"text\": \"The text provides guidelines for writing
+        effective feedback. Key recommendations include being clear, concise, specific,
+        and respectful. Feedback should focus on behavior and outcomes, balance positive
+        and negative aspects, use objective criteria, and suggest actionable next
+        steps. Proofreading is essential before submitting feedback.\\n\"\n          }\n
+        \       ],\n        \"role\": \"model\"\n      },\n      \"finishReason\":
+        \"STOP\",\n      \"avgLogprobs\": -0.29874773892489348\n    }\n  ],\n  \"usageMetadata\":
+        {\n    \"promptTokenCount\": 163,\n    \"candidatesTokenCount\": 55,\n    \"totalTokenCount\":
+        218,\n    \"promptTokensDetails\": [\n      {\n        \"modality\": \"TEXT\",\n
+        \       \"tokenCount\": 163\n      }\n    ],\n    \"candidatesTokensDetails\":
+        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 55\n
+        \     }\n    ]\n  },\n  \"modelVersion\": \"gemini-2.0-flash\",\n  \"responseId\":
+        \"SjSOab3-HaajjMcP38-yyQw\"\n}\n"
+    headers:
+      Alt-Svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      Content-Type:
+      - application/json; charset=UTF-8
+      Date:
+      - Thu, 12 Feb 2026 20:12:59 GMT
+      Server:
+      - scaffolding on HTTPServer2
+      Server-Timing:
+      - gfet4t7; dur=1198
      Transfer-Encoding:
      - chunked
      Vary:
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_video_gemini.yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalFileTypes.test_video_gemini.yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_audio_file[gemini-gemini-2.0-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_audio_file[gemini-gemini-2.0-flash].yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_audio_file[gemini-gemini-2.5-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_audio_file[gemini-gemini-2.5-flash].yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_image_file[gemini-gemini-2.0-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_image_file[gemini-gemini-2.0-flash].yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_image_file[gemini-gemini-2.5-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_image_file[gemini-gemini-2.5-flash].yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_mixed_files[gemini-gemini-2.0-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_mixed_files[gemini-gemini-2.0-flash].yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_mixed_files[gemini-gemini-2.5-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_mixed_files[gemini-gemini-2.5-flash].yaml
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_text_file[gemini-gemini-2.0-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_text_file[gemini-gemini-2.0-flash].yaml
@@ -1,17 +1,11 @@
 interactions:
 - request:
    body: '{"contents": [{"parts": [{"text": "\nCurrent Task: Summarize this text
-      briefly.\n\nBegin! This is VERY important to you, use the tools available and
-      give your best Final Answer, your job depends on it!\n\nThought:"}, {"inlineData":
-      {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
+      briefly.\n\nProvide your complete response:"}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
      "mimeType": "text/plain"}}], "role": "user"}], "systemInstruction": {"parts":
      [{"text": "You are File Analyst. Expert at analyzing various file types.\nYour
-      personal goal is: Analyze and describe files accurately\nTo give my best complete
-      final answer to the task respond using the exact following format:\n\nThought:
-      I now can give a great answer\nFinal Answer: Your final answer must be the great
-      and the most complete as possible, it must be outcome described.\n\nI MUST use
-      these formats, my job depends on it!"}], "role": "user"}, "generationConfig":
-      {"stopSequences": ["\nObservation:"]}}'
+      personal goal is: Analyze and describe files accurately"}], "role": "user"},
+      "generationConfig": {"stopSequences": ["\nObservation:"]}}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
@@ -22,13 +16,13 @@ interactions:
      connection:
      - keep-alive
      content-length:
-      - '1627'
+      - '1226'
      content-type:
      - application/json
      host:
      - generativelanguage.googleapis.com
      x-goog-api-client:
-      - google-genai-sdk/1.49.0 gl-python/3.12.10
+      - google-genai-sdk/1.49.0 gl-python/3.13.3
      x-goog-api-key:
      - X-GOOG-API-KEY-XXX
    method: POST
@@ -36,30 +30,100 @@ interactions:
  response:
    body:
      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
-        [\n          {\n            \"text\": \"Thought: The text provides guidelines
-        for giving effective feedback. I need to summarize these guidelines concisely.\\n\\nFinal
-        Answer: The provided text outlines eight guidelines for delivering effective
-        feedback, emphasizing clarity, focus on behavior and outcomes, specificity,
-        balanced perspective, respect, objectivity, actionable suggestions, and proofreading.\\n\"\n
-        \         }\n        ],\n        \"role\": \"model\"\n      },\n      \"finishReason\":
-        \"STOP\",\n      \"avgLogprobs\": -0.18550947507222493\n    }\n  ],\n  \"usageMetadata\":
-        {\n    \"promptTokenCount\": 253,\n    \"candidatesTokenCount\": 60,\n    \"totalTokenCount\":
-        313,\n    \"promptTokensDetails\": [\n      {\n        \"modality\": \"TEXT\",\n
-        \       \"tokenCount\": 253\n      }\n    ],\n    \"candidatesTokensDetails\":
-        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 60\n
+        [\n          {\n            \"text\": \"These guidelines provide instructions
+        for writing effective feedback. Feedback should be clear, concise, specific,
+        and balanced, focusing on behaviors and outcomes with examples. It should
+        also be respectful, constructive, and objective, suggesting actionable next
+        steps for improvement and be proofread before submission.\\n\"\n          }\n
+        \       ],\n        \"role\": \"model\"\n      },\n      \"finishReason\":
+        \"STOP\",\n      \"avgLogprobs\": -0.27340631131772641\n    }\n  ],\n  \"usageMetadata\":
+        {\n    \"promptTokenCount\": 164,\n    \"candidatesTokenCount\": 54,\n    \"totalTokenCount\":
+        218,\n    \"promptTokensDetails\": [\n      {\n        \"modality\": \"TEXT\",\n
+        \       \"tokenCount\": 164\n      }\n    ],\n    \"candidatesTokensDetails\":
+        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 54\n
        \     }\n    ]\n  },\n  \"modelVersion\": \"gemini-2.0-flash\",\n  \"responseId\":
-        \"9MlzacewKpKMjMcPtu7joQI\"\n}\n"
+        \"kSqOadGYAsXQjMcP9YfmuAQ\"\n}\n"
    headers:
      Alt-Svc:
      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
      Content-Type:
      - application/json; charset=UTF-8
      Date:
-      - Fri, 23 Jan 2026 19:20:21 GMT
+      - Thu, 12 Feb 2026 19:31:29 GMT
      Server:
      - scaffolding on HTTPServer2
      Server-Timing:
-      - gfet4t7; dur=890
+      - gfet4t7; dur=1041
+      Transfer-Encoding:
+      - chunked
+      Vary:
+      - Origin
+      - X-Origin
+      - Referer
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      X-Frame-Options:
+      - X-FRAME-OPTIONS-XXX
+      X-XSS-Protection:
+      - '0'
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"contents": [{"parts": [{"text": "\nCurrent Task: Summarize this text
+      briefly.\n\nProvide your complete response:"}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
+      "mimeType": "text/plain"}}], "role": "user"}], "systemInstruction": {"parts":
+      [{"text": "You are File Analyst. Expert at analyzing various file types.\nYour
+      personal goal is: Analyze and describe files accurately"}], "role": "user"},
+      "generationConfig": {"stopSequences": ["\nObservation:"]}}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - '*/*'
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1226'
+      content-type:
+      - application/json
+      host:
+      - generativelanguage.googleapis.com
+      x-goog-api-client:
+      - google-genai-sdk/1.49.0 gl-python/3.13.3
+      x-goog-api-key:
+      - X-GOOG-API-KEY-XXX
+    method: POST
+    uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
+  response:
+    body:
+      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
+        [\n          {\n            \"text\": \"These guidelines outline how to provide
+        effective feedback: be clear, concise, and specific, focusing on behavior
+        and outcomes with examples. Balance positive aspects with areas for improvement,
+        offering constructive, respectful suggestions and actionable next steps, all
+        while referencing objective criteria and ensuring the feedback is well-written
+        and proofread.\\n\"\n          }\n        ],\n        \"role\": \"model\"\n
+        \     },\n      \"finishReason\": \"STOP\",\n      \"avgLogprobs\": -0.25106738043613119\n
+        \   }\n  ],\n  \"usageMetadata\": {\n    \"promptTokenCount\": 164,\n    \"candidatesTokenCount\":
+        61,\n    \"totalTokenCount\": 225,\n    \"promptTokensDetails\": [\n      {\n
+        \       \"modality\": \"TEXT\",\n        \"tokenCount\": 164\n      }\n    ],\n
+        \   \"candidatesTokensDetails\": [\n      {\n        \"modality\": \"TEXT\",\n
+        \       \"tokenCount\": 61\n      }\n    ]\n  },\n  \"modelVersion\": \"gemini-2.0-flash\",\n
+        \ \"responseId\": \"kiqOaePiC96RjMcP3auj8Q4\"\n}\n"
+    headers:
+      Alt-Svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      Content-Type:
+      - application/json; charset=UTF-8
+      Date:
+      - Thu, 12 Feb 2026 19:31:31 GMT
+      Server:
+      - scaffolding on HTTPServer2
+      Server-Timing:
+      - gfet4t7; dur=1024
      Transfer-Encoding:
      - chunked
      Vary:
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_text_file[gemini-gemini-2.5-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_text_file[gemini-gemini-2.5-flash].yaml
@@ -0,0 +1,134 @@
+interactions:
+- request:
+    body: '{"contents": [{"parts": [{"text": "\nCurrent Task: Summarize this text
+      briefly.\n\nProvide your complete response:"}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
+      "mimeType": "text/plain"}}], "role": "user"}], "systemInstruction": {"parts":
+      [{"text": "You are File Analyst. Expert at analyzing various file types.\nYour
+      personal goal is: Analyze and describe files accurately"}], "role": "user"},
+      "generationConfig": {"stopSequences": ["\nObservation:"]}}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - '*/*'
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1226'
+      content-type:
+      - application/json
+      host:
+      - generativelanguage.googleapis.com
+      x-goog-api-client:
+      - google-genai-sdk/1.49.0 gl-python/3.13.3
+      x-goog-api-key:
+      - X-GOOG-API-KEY-XXX
+    method: POST
+    uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent
+  response:
+    body:
+      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
+        [\n          {\n            \"text\": \"These guidelines provide a framework
+        for giving effective feedback, emphasizing clarity, specificity, balance,
+        respect, objectivity, actionable next steps, and proofreading.\"\n          }\n
+        \       ],\n        \"role\": \"model\"\n      },\n      \"finishReason\":
+        \"STOP\",\n      \"index\": 0\n    }\n  ],\n  \"usageMetadata\": {\n    \"promptTokenCount\":
+        166,\n    \"candidatesTokenCount\": 29,\n    \"totalTokenCount\": 223,\n    \"promptTokensDetails\":
+        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 166\n
+        \     }\n    ],\n    \"thoughtsTokenCount\": 28\n  },\n  \"modelVersion\":
+        \"gemini-2.5-flash\",\n  \"responseId\": \"PUqOaZ3pMYi8_uMP25m7gAQ\"\n}\n"
+    headers:
+      Alt-Svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      Content-Type:
+      - application/json; charset=UTF-8
+      Date:
+      - Thu, 12 Feb 2026 21:46:37 GMT
+      Server:
+      - scaffolding on HTTPServer2
+      Server-Timing:
+      - gfet4t7; dur=671
+      Transfer-Encoding:
+      - chunked
+      Vary:
+      - Origin
+      - X-Origin
+      - Referer
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      X-Frame-Options:
+      - X-FRAME-OPTIONS-XXX
+      X-XSS-Protection:
+      - '0'
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"contents": [{"parts": [{"text": "\nCurrent Task: Summarize this text
+      briefly.\n\nProvide your complete response:"}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
+      "mimeType": "text/plain"}}], "role": "user"}], "systemInstruction": {"parts":
+      [{"text": "You are File Analyst. Expert at analyzing various file types.\nYour
+      personal goal is: Analyze and describe files accurately"}], "role": "user"},
+      "generationConfig": {"stopSequences": ["\nObservation:"]}}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - '*/*'
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1226'
+      content-type:
+      - application/json
+      host:
+      - generativelanguage.googleapis.com
+      x-goog-api-client:
+      - google-genai-sdk/1.49.0 gl-python/3.13.3
+      x-goog-api-key:
+      - X-GOOG-API-KEY-XXX
+    method: POST
+    uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent
+  response:
+    body:
+      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
+        [\n          {\n            \"text\": \"These guidelines provide instructions
+        on how to deliver effective, constructive, and respectful feedback, emphasizing
+        clarity, specificity, balance, and actionable suggestions for improvement.\"\n
+        \         }\n        ],\n        \"role\": \"model\"\n      },\n      \"finishReason\":
+        \"STOP\",\n      \"index\": 0\n    }\n  ],\n  \"usageMetadata\": {\n    \"promptTokenCount\":
+        166,\n    \"candidatesTokenCount\": 29,\n    \"totalTokenCount\": 269,\n    \"promptTokensDetails\":
+        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 166\n
+        \     }\n    ],\n    \"thoughtsTokenCount\": 74\n  },\n  \"modelVersion\":
+        \"gemini-2.5-flash\",\n  \"responseId\": \"PkqOaf-bLu-v_uMPnorr8Qs\"\n}\n"
+    headers:
+      Alt-Svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      Content-Type:
+      - application/json; charset=UTF-8
+      Date:
+      - Thu, 12 Feb 2026 21:46:38 GMT
+      Server:
+      - scaffolding on HTTPServer2
+      Server-Timing:
+      - gfet4t7; dur=898
+      Transfer-Encoding:
+      - chunked
+      Vary:
+      - Origin
+      - X-Origin
+      - Referer
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      X-Frame-Options:
+      - X-FRAME-OPTIONS-XXX
+      X-XSS-Protection:
+      - '0'
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_video_file[gemini-gemini-2.0-flash].yaml
+++ b/lib/crewai/tests/cassettes/TestAgentMultimodalGemini.test_video_file[gemini-gemini-2.0-flash].yaml
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Cursor Agent	62a262d554	Reset finalize guard on each executor invocation	2026-03-03 18:21:27 +00:00
lorenzejay	87e1852746	refactor: enhance planning and execution flow in agents - Updated the PlannerObserver to accept a kickoff input for standalone task execution, improving flexibility in task handling. - Refined the step execution process in StepExecutor to support multi-turn action loops, allowing for iterative tool execution and observation. - Introduced a method to extract relevant task sections from descriptions, ensuring clarity in task requirements. - Enhanced the AgentExecutor to manage step failures more effectively, triggering replans only when necessary and preserving completed task history. - Updated translations to reflect changes in planning principles and execution prompts, emphasizing concrete and executable steps.	2026-03-03 10:17:35 -08:00
lorenzejay	76f329a025	fix: update observation handling in PlannerObserver for LLM errors - Modified the error handling in the PlannerObserver to default to a conservative replan when an LLM call fails. - Updated the return values to indicate that the step was not completed successfully and that a full replan is needed. - Added a new test to verify the behavior of the observer when an LLM error occurs, ensuring the correct replan logic is triggered.	2026-02-25 13:44:50 -08:00
lorenzejay	687d6abdaa	refactor: enhance final answer synthesis logic in AgentExecutor - Updated the finalization process to conditionally skip synthesis when the last todo result is sufficient as a complete answer. - Introduced a new method to determine if the last todo result can be used directly, improving efficiency. - Added tests to verify the new behavior, ensuring synthesis is skipped when appropriate and maintained when a response model is set.	2026-02-24 15:04:02 -08:00
lorenzejay	3302c5ab77	enhance step executor with tool usage events and validation - Added event emissions for tool usage, including started and finished events, to track tool execution. - Implemented validation to ensure expected tools are called during step execution, raising errors when not. - Refactored the method to handle tool execution with event logging. - Introduced a new method for parsing tool input into a structured format. - Updated tests to cover new functionality and ensure correct behavior of tool usage events.	2026-02-24 14:19:27 -08:00
lorenzejay	32059c7d79	refactor: streamline observation and refinement process in PlannerObserver - Updated the PlannerObserver to apply structured refinements directly from observations without requiring a second LLM call. - Renamed method to for clarity. - Enhanced documentation to reflect changes in how refinements are handled. - Removed unnecessary LLM message building and parsing logic, simplifying the refinement process. - Updated event emissions to include summaries of refinements instead of raw data.	2026-02-24 09:03:04 -08:00
lorenzejay	8194bb42f1	improving step executor	2026-02-23 14:53:25 -08:00
lorenzejay	388de2252e	fix datetime	2026-02-23 14:01:28 -08:00
lorenzejay	8f104e6eca	Merge branch 'lorenze/feat/plan-execute-pattern' of github.com:crewAIInc/crewAI into lorenze/feat/planning-pt-3-todo-list-execution	2026-02-23 13:39:05 -08:00
lorenzejay	5317947b4f	Merge branch 'main' of github.com:crewAIInc/crewAI into lorenze/feat/plan-execute-pattern	2026-02-23 13:07:09 -08:00
Greyson LaLonde	51754899a2	feat: migrate CLI http client from requests to httpx Some checks failed Build uv cache / build-cache (3.10) (push) Has been cancelled Details Build uv cache / build-cache (3.11) (push) Has been cancelled Details Build uv cache / build-cache (3.12) (push) Has been cancelled Details Build uv cache / build-cache (3.13) (push) Has been cancelled Details CodeQL Advanced / Analyze (actions) (push) Has been cancelled Details CodeQL Advanced / Analyze (python) (push) Has been cancelled Details Mark stale issues and pull requests / stale (push) Has been cancelled Details	2026-02-20 18:21:05 -05:00
Greyson LaLonde	71b4f8402a	fix: ensure callbacks are ran/awaited if promise Some checks failed CodeQL Advanced / Analyze (actions) (push) Has been cancelled Details CodeQL Advanced / Analyze (python) (push) Has been cancelled Details Build uv cache / build-cache (3.12) (push) Has been cancelled Details Build uv cache / build-cache (3.13) (push) Has been cancelled Details Build uv cache / build-cache (3.10) (push) Has been cancelled Details Build uv cache / build-cache (3.11) (push) Has been cancelled Details	2026-02-20 13:15:50 -05:00
lorenzejay	9fea9fe757	Merge branch 'main' of github.com:crewAIInc/crewAI into lorenze/feat/plan-execute-pattern	2026-02-20 09:54:39 -08:00
lorenzejay	fd6558e0f2	consolidate agent logic	2026-02-13 13:55:28 -08:00
lorenzejay	e26d3e471d	Refactor PlannerObserver and StepExecutor to Utilize I18N for Prompts This update enhances the PlannerObserver and StepExecutor classes by integrating the I18N utility for managing prompts and messages. The system and user prompts are now retrieved from the I18N module, allowing for better localization and maintainability. Additionally, the code has been cleaned up to remove hardcoded strings, improving readability and consistency across the planning and execution processes.	2026-02-13 13:40:39 -08:00
lorenzejay	fad23d804a	Refactor PlannerObserver and StepExecutor to Utilize I18N for Prompts This update enhances the PlannerObserver and StepExecutor classes by integrating the I18N utility for managing prompts and messages. The system and user prompts are now retrieved from the I18N module, allowing for better localization and maintainability. Additionally, the code has been cleaned up to remove hardcoded strings, improving readability and consistency across the planning and execution processes.	2026-02-13 13:40:04 -08:00
lorenzejay	ca89b729f8	dry	2026-02-13 13:33:55 -08:00
lorenzejay	7e09e01215	fixing tests	2026-02-12 14:26:29 -08:00
lorenzejay	eec88ad2bb	cassette regen	2026-02-12 10:53:46 -08:00
lorenzejay	5d4ed12072	regen cassettes for test and fix test	2026-02-11 16:04:24 -08:00
lorenzejay	a164e94f49	Enhance PlanningConfig and AgentExecutor with Reasoning Effort Levels This update introduces a new attribute in the class, allowing users to customize the observation and replanning behavior during task execution. The class has been modified to utilize this new attribute, routing step observations based on the specified reasoning effort level: low, medium, or high. Additionally, tests have been added to validate the functionality of the reasoning effort levels, ensuring that the agent behaves as expected under different configurations. This enhancement improves the adaptability and efficiency of the planning process in agent execution.	2026-02-11 13:55:02 -08:00
lorenzejay	576345140f	fix	2026-02-10 17:38:49 -08:00
lorenzejay	8fd7ef7f43	linted	2026-02-10 17:14:38 -08:00
lorenzejay	9cac1792bd	regen tests	2026-02-10 17:05:47 -08:00
lorenzejay	b2de783559	Merge branch 'lorenze/feat/plan-execute-pattern' of github.com:crewAIInc/crewAI into lorenze/feat/planning-pt-3-todo-list-execution	2026-02-10 16:58:14 -08:00
lorenzejay	d77e2cb1f8	Merge branch 'lorenze/feat/plan-execute-pattern' of github.com:crewAIInc/crewAI into lorenze/feat/plan-execute-pattern	2026-02-10 16:10:20 -08:00
Lorenze Jay	a6dcb275e1	Lorenze/feat planning pt 2 todo list gen (#4449 ) * feat: introduce PlanningConfig for enhanced agent planning capabilities This update adds a new PlanningConfig class to manage agent planning configurations, allowing for customizable planning behavior before task execution. The existing reasoning parameter is deprecated in favor of this new configuration, ensuring backward compatibility while enhancing the planning process. Additionally, the Agent class has been updated to utilize this new configuration, and relevant utility functions have been adjusted accordingly. Tests have been added to validate the new planning functionality and ensure proper integration with existing agent workflows. * dropping redundancy * fix test * revert handle_reasoning here * refactor: update reasoning handling in Agent class This commit modifies the Agent class to conditionally call the handle_reasoning function based on the executor class being used. The legacy CrewAgentExecutor will continue to utilize handle_reasoning, while the new AgentExecutor will manage planning internally. Additionally, the PlanningConfig class has been referenced in the documentation to clarify its role in enabling or disabling planning. Tests have been updated to reflect these changes and ensure proper functionality. * improve planning prompts * matching * refactor: remove default enabled flag from PlanningConfig in Agent class * more cassettes * fix test * feat: enhance agent planning with structured todo management This commit introduces a new planning system within the AgentExecutor class, allowing for the creation of structured todo items from planning steps. The TodoList and TodoItem models have been added to facilitate tracking of plan execution. The reasoning plan now includes a list of steps, improving the clarity and organization of agent tasks. 
Additionally, tests have been added to validate the new planning functionality and ensure proper integration with existing workflows. * refactor: update planning prompt and remove deprecated methods in reasoning handler * improve planning prompt * improve handler * linted * linted	2026-02-10 16:08:26 -08:00
Lorenze Jay	79a01fca31	feat: introduce PlanningConfig for enhanced agent planning capabilities (#4344 ) * feat: introduce PlanningConfig for enhanced agent planning capabilities This update adds a new PlanningConfig class to manage agent planning configurations, allowing for customizable planning behavior before task execution. The existing reasoning parameter is deprecated in favor of this new configuration, ensuring backward compatibility while enhancing the planning process. Additionally, the Agent class has been updated to utilize this new configuration, and relevant utility functions have been adjusted accordingly. Tests have been added to validate the new planning functionality and ensure proper integration with existing agent workflows. * dropping redundancy * fix test * revert handle_reasoning here * refactor: update reasoning handling in Agent class This commit modifies the Agent class to conditionally call the handle_reasoning function based on the executor class being used. The legacy CrewAgentExecutor will continue to utilize handle_reasoning, while the new AgentExecutor will manage planning internally. Additionally, the PlanningConfig class has been referenced in the documentation to clarify its role in enabling or disabling planning. Tests have been updated to reflect these changes and ensure proper functionality. * improve planning prompts * matching * refactor: remove default enabled flag from PlanningConfig in Agent class * more cassettes * fix test * refactor: update planning prompt and remove deprecated methods in reasoning handler * improve planning prompt	2026-02-10 13:26:49 -08:00
lorenzejay	735a2204fd	refactor: implement structured output handling in final answer synthesis This commit enhances the final answer synthesis process in the AgentExecutor class by introducing support for structured outputs when a response model is specified. The synthesis method now utilizes the response model to produce outputs that conform to the expected schema, while still falling back to concatenation in case of synthesis failures. This change ensures that intermediate steps yield free-text results, but the final output can be structured, improving the overall coherence and usability of the synthesized answers.	2026-02-08 16:19:16 -08:00
lorenzejay	ff57956d05	refactor: enhance final answer synthesis in AgentExecutor This commit improves the synthesis of final answers in the AgentExecutor class by implementing a more coherent approach to combining results from multiple todo items. The method now utilizes a single LLM call to generate a polished response, falling back to concatenation if the synthesis fails. Additionally, the test cases have been updated to reflect the changes in planning and execution, ensuring that the results are properly validated and that the plan-and-execute architecture is functioning as intended.	2026-02-06 15:39:04 -08:00
lorenzejay	9f3c53ca97	refactor: enhance final answer synthesis in AgentExecutor This commit improves the synthesis of final answers in the AgentExecutor class by implementing a more coherent approach to combining results from multiple todo items. The method now utilizes a single LLM call to generate a polished response, falling back to concatenation if the synthesis fails. Additionally, the test cases have been updated to reflect the changes in planning and execution, ensuring that the results are properly validated and that the plan-and-execute architecture is functioning as intended.	2026-02-06 10:38:55 -08:00
lorenzejay	8e1474d371	feat: introduce PlannerObserver and StepExecutor for enhanced plan execution This commit adds the PlannerObserver and StepExecutor classes to the CrewAI framework, implementing the observation phase of the Plan-and-Execute architecture. The PlannerObserver analyzes step execution results, determines plan validity, and suggests refinements, while the StepExecutor executes individual todo items in isolation. These additions improve the overall planning and execution process, allowing for more dynamic and responsive agent behavior. Additionally, new observation events have been defined to facilitate monitoring and logging of the planning process.	2026-02-05 15:46:21 -08:00
lorenzejay	81d9fd4ab3	execute todos and be able to track them	2026-02-05 10:51:54 -08:00
lorenzejay	7e1ae7226b	improve handler	2026-02-03 15:15:22 -08:00
lorenzejay	adee852a2a	Merge branch 'lorenze/feat/planning-pt-1' of github.com:crewAIInc/crewAI into lorenze/feat-planning-pt-2-todo-list-gen	2026-02-03 13:54:47 -08:00
lorenzejay	b7d5a4afef	Merge branch 'main' of github.com:crewAIInc/crewAI into lorenze/feat/planning-pt-1	2026-02-03 13:54:27 -08:00
lorenzejay	abf86d5572	Merge branch 'lorenze/feat/planning-pt-1' of github.com:crewAIInc/crewAI into lorenze/feat-planning-pt-2-todo-list-gen	2026-02-03 13:48:36 -08:00
lorenzejay	02dc39faa2	improve planning prompt	2026-02-03 13:47:38 -08:00
lorenzejay	dd8230f051	Merge branch 'lorenze/feat/planning-pt-1' of github.com:crewAIInc/crewAI into lorenze/feat-planning-pt-2-todo-list-gen	2026-02-03 13:36:48 -08:00
lorenzejay	a3c2c946d3	refactor: update planning prompt and remove deprecated methods in reasoning handler	2026-02-03 13:35:45 -08:00
lorenzejay	bd95cffd41	feat: enhance agent planning with structured todo management This commit introduces a new planning system within the AgentExecutor class, allowing for the creation of structured todo items from planning steps. The TodoList and TodoItem models have been added to facilitate tracking of plan execution. The reasoning plan now includes a list of steps, improving the clarity and organization of agent tasks. Additionally, tests have been added to validate the new planning functionality and ensure proper integration with existing workflows.	2026-02-03 13:24:55 -08:00
lorenzejay	ab6ce4b7aa	fix test	2026-02-03 08:57:47 -08:00
lorenzejay	ac1d1fcfa3	more cassettes	2026-02-03 08:27:10 -08:00
lorenzejay	83f38184ff	refactor: remove default enabled flag from PlanningConfig in Agent class	2026-02-03 08:01:27 -08:00
lorenzejay	f2016f8979	matching	2026-02-03 07:59:27 -08:00
lorenzejay	fe1e29d2f9	improve planning prompts	2026-02-02 16:36:18 -08:00
lorenzejay	861da95aad	refactor: update reasoning handling in Agent class This commit modifies the Agent class to conditionally call the handle_reasoning function based on the executor class being used. The legacy CrewAgentExecutor will continue to utilize handle_reasoning, while the new AgentExecutor will manage planning internally. Additionally, the PlanningConfig class has been referenced in the documentation to clarify its role in enabling or disabling planning. Tests have been updated to reflect these changes and ensure proper functionality.	2026-02-02 16:27:39 -08:00
lorenzejay	50b9b42de9	revert handle_reasoning here	2026-02-02 16:21:36 -08:00
lorenzejay	85d22ba902	fix test	2026-02-02 16:08:12 -08:00
lorenzejay	9277d219e3	dropping redundancy	2026-02-02 16:01:37 -08:00
lorenzejay	710b0ce2ae	feat: introduce PlanningConfig for enhanced agent planning capabilities This update adds a new PlanningConfig class to manage agent planning configurations, allowing for customizable planning behavior before task execution. The existing reasoning parameter is deprecated in favor of this new configuration, ensuring backward compatibility while enhancing the planning process. Additionally, the Agent class has been updated to utilize this new configuration, and relevant utility functions have been adjusted accordingly. Tests have been added to validate the new planning functionality and ensure proper integration with existing agent workflows.	2026-02-02 15:55:28 -08:00