imp compaction (#4399)

* imp compaction * fix lint * cassette gen * cassette gen * improve assert * adding azure * fix global docstring
2026-07-02 05:38:12 +00:00 · 2026-02-11 15:52:03 -08:00
parent 0341e5aee7
commit 2ed0c2c043
13 changed files with 2881 additions and 71 deletions
--- a/lib/crewai/src/crewai/translations/en.json
+++ b/lib/crewai/src/crewai/translations/en.json
@@ -22,9 +22,9 @@
    "expected_output": "\nThis is the expected criteria for your final answer: {expected_output}\nyou MUST return the actual complete content as the final answer, not a summary.",
    "human_feedback": "You got human feedback on your work, re-evaluate it and give a new Final Answer when ready.\n {human_feedback}",
    "getting_input": "This is the agent's final answer: {final_answer}\n\n",
-    "summarizer_system_message": "You are a helpful assistant that summarizes text.",
-    "summarize_instruction": "Summarize the following text, make sure to include all the important information: {group}",
-    "summary": "This is a summary of our conversation so far:\n{merged_summary}",
+    "summarizer_system_message": "You are a precise assistant that creates structured summaries of agent conversations. You preserve critical context needed for seamless task continuation.",
+    "summarize_instruction": "Analyze the following conversation and create a structured summary that preserves all information needed to continue the task seamlessly.\n\n<conversation>\n{conversation}\n</conversation>\n\nCreate a summary with these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2. **Current State**: What has been completed so far? What step is the agent on?\n3. **Important Discoveries**: Key facts, data, tool results, or findings that must not be lost.\n4. **Next Steps**: What should the agent do next based on the conversation?\n5. **Context to Preserve**: Any specific values, names, URLs, code snippets, or details referenced in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>",
+    "summary": "<summary>\n{merged_summary}\n</summary>\n\nContinue the task from where the conversation left off. The above is a structured summary of prior context.",
    "manager_request": "Your best answer to your coworker asking you this, accounting for the context shared.",
    "formatted_task_instructions": "Format your final answer according to the following OpenAPI schema: {output_format}\n\nIMPORTANT: Preserve the original content exactly as-is. Do NOT rewrite, paraphrase, or modify the meaning of the content. Only structure it to match the schema format.\n\nDo not include the OpenAPI schema in the final output. Ensure the final output does not include any code block markers like ```json or ```python.",
    "conversation_history_instruction": "You are a member of a crew collaborating to achieve a common goal. Your task is a specific action that contributes to this larger objective. For additional context, please review the conversation history between you and the user that led to the initiation of this crew. Use any relevant information or feedback from the conversation to inform your task execution and ensure your response aligns with both the immediate task and the crew's overall goals.",
--- a/lib/crewai/src/crewai/utilities/agent_utils.py
+++ b/lib/crewai/src/crewai/utilities/agent_utils.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 import asyncio
 from collections.abc import Callable, Sequence
+import concurrent.futures
 import json
 import re
 from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict
@@ -640,6 +641,180 @@ def handle_context_length(
        )


+def _estimate_token_count(text: str) -> int:
+    """Estimate a token count as ``len(text) // 4``; no tokenizer is invoked.
+
+    Args:
+        text: The text to estimate tokens for.
+
+    Returns:
+        Integer estimate (floor division, so under 4 characters yields 0).
+    """
+    return len(text) // 4
+
+
+def _format_messages_for_summary(messages: list[LLMMessage]) -> str:
+    """Render messages as role-labeled text for the summarization prompt.
+
+    System messages are skipped. None content becomes a "[Called tools: ...]"
+    note when tool_calls is present; list content keeps only its text blocks.
+
+    Args:
+        messages: Messages to format; roles other than assistant/tool print as [USER].
+
+    Returns:
+        "[LABEL]: content" entries joined by blank lines.
+    """
+    lines: list[str] = []
+    for msg in messages:
+        role = msg.get("role", "user")
+        if role == "system":
+            continue
+
+        content = msg.get("content")
+        if content is None:
+            # No text content: list the called tools by function name instead
+            tool_calls = msg.get("tool_calls")
+            if tool_calls:
+                tool_names = []
+                for tc in tool_calls:
+                    func = tc.get("function", {})
+                    name = (
+                        func.get("name", "unknown")
+                        if isinstance(func, dict)
+                        else "unknown"
+                    )
+                    tool_names.append(name)
+                content = f"[Called tools: {', '.join(tool_names)}]"
+            else:
+                content = ""
+        elif isinstance(content, list):
+            # Multimodal blocks: keep only "text" blocks, joined by spaces
+            text_parts = [
+                block.get("text", "")
+                for block in content
+                if isinstance(block, dict) and block.get("type") == "text"
+            ]
+            content = " ".join(text_parts) if text_parts else "[multimodal content]"
+
+        if role == "assistant":
+            label = "[ASSISTANT]:"
+        elif role == "tool":
+            tool_name = msg.get("name", "unknown")
+            label = f"[TOOL_RESULT ({tool_name})]:"
+        else:
+            label = "[USER]:"
+
+        lines.append(f"{label} {content}")
+
+    return "\n\n".join(lines)
+
+
+def _split_messages_into_chunks(
+    messages: list[LLMMessage], max_tokens: int
+) -> list[list[LLMMessage]]:
+    """Group non-system messages into consecutive chunks without splitting any.
+
+    Size is estimated from str(content) only. A new chunk starts when a message
+    would exceed max_tokens; one oversized message still forms its own chunk.
+
+    Args:
+        messages: List of messages to split (system messages are dropped).
+        max_tokens: Maximum estimated tokens per chunk.
+
+    Returns:
+        List of message chunks in original order; empty if nothing remains.
+    """
+    non_system = [m for m in messages if m.get("role") != "system"]
+    if not non_system:
+        return []
+
+    chunks: list[list[LLMMessage]] = []
+    current_chunk: list[LLMMessage] = []
+    current_tokens = 0
+
+    for msg in non_system:
+        content = msg.get("content")
+        if content is None:
+            msg_text = ""
+        elif isinstance(content, list):
+            msg_text = str(content)
+        else:
+            msg_text = str(content)
+
+        msg_tokens = _estimate_token_count(msg_text)
+
+        # Close the current chunk before it would overflow. The `current_chunk`
+        # guard keeps an oversized first message from producing an empty chunk.
+        if current_chunk and (current_tokens + msg_tokens) > max_tokens:
+            chunks.append(current_chunk)
+            current_chunk = []
+            current_tokens = 0
+
+        current_chunk.append(msg)
+        current_tokens += msg_tokens
+
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
+
+
+def _extract_summary_tags(text: str) -> str:
+    """Return the text inside the first <summary>...</summary> pair.
+
+    The match is non-greedy and spans newlines; with no pair, the input is used.
+
+    Args:
+        text: Text potentially containing summary tags.
+
+    Returns:
+        Stripped tag content, or the stripped full text if there is no match.
+    """
+    match = re.search(r"<summary>(.*?)</summary>", text, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    return text.strip()
+
+
+async def _asummarize_chunks(
+    chunks: list[list[LLMMessage]],
+    llm: LLM | BaseLLM,
+    callbacks: list[TokenCalcHandler],
+    i18n: I18N,
+) -> list[SummaryContent]:
+    """Summarize each chunk with its own ``llm.acall``, awaited via asyncio.gather.
+
+    Args:
+        chunks: Message chunks; each is role-label formatted before sending.
+        llm: LLM instance (must support ``acall``).
+        callbacks: Callbacks passed to every ``acall``.
+        i18n: Source of the summarizer system and instruction prompt slices.
+
+    Returns:
+        One {"content": ...} entry per chunk, in the same order as ``chunks``.
+    """
+
+    async def _summarize_one(chunk: list[LLMMessage]) -> SummaryContent:
+        conversation_text = _format_messages_for_summary(chunk)
+        summarization_messages = [
+            format_message_for_llm(
+                i18n.slice("summarizer_system_message"), role="system"
+            ),
+            format_message_for_llm(
+                i18n.slice("summarize_instruction").format(
+                    conversation=conversation_text
+                ),
+            ),
+        ]
+        summary = await llm.acall(summarization_messages, callbacks=callbacks)
+        extracted = _extract_summary_tags(str(summary))
+        return {"content": extracted}
+
+    results = await asyncio.gather(*[_summarize_one(chunk) for chunk in chunks])
+    return list(results)
+
+
 def summarize_messages(
    messages: list[LLMMessage],
    llm: LLM | BaseLLM,
@@ -649,6 +824,10 @@ def summarize_messages(
 ) -> None:
    """Summarize messages to fit within context window.

+    Uses structured context compaction: preserves system messages,
+    splits at message boundaries, formats with role labels, and
+    produces structured summaries for seamless task continuation.
+
    Preserves any files attached to user messages and re-attaches them to
    the summarized message. Files from all user messages are merged.

@@ -657,49 +836,74 @@ def summarize_messages(
        llm: LLM instance for summarization
        callbacks: List of callbacks for LLM
        i18n: I18N instance for messages
+        verbose: Whether to print progress.
    """
+    # 1. Extract & preserve file attachments from user messages
    preserved_files: dict[str, Any] = {}
    for msg in messages:
        if msg.get("role") == "user" and msg.get("files"):
            preserved_files.update(msg["files"])

-    messages_string = " ".join(
-        [str(message.get("content", "")) for message in messages]
-    )
-    cut_size = llm.get_context_window_size()
+    # 2. Extract system messages — never summarize them
+    system_messages = [m for m in messages if m.get("role") == "system"]
+    non_system_messages = [m for m in messages if m.get("role") != "system"]

-    messages_groups = [
-        {"content": messages_string[i : i + cut_size]}
-        for i in range(0, len(messages_string), cut_size)
-    ]
+    # If there are only system messages (or no non-system messages), nothing to summarize
+    if not non_system_messages:
+        return

-    summarized_contents: list[SummaryContent] = []
+    # 3. Split non-system messages into chunks at message boundaries
+    max_tokens = llm.get_context_window_size()
+    chunks = _split_messages_into_chunks(non_system_messages, max_tokens)

-    total_groups = len(messages_groups)
-    for idx, group in enumerate(messages_groups, 1):
+    # 4. Summarize each chunk with role-labeled formatting
+    total_chunks = len(chunks)
+
+    if total_chunks <= 1:
+        # Single chunk — no benefit from async overhead
+        summarized_contents: list[SummaryContent] = []
+        for idx, chunk in enumerate(chunks, 1):
+            if verbose:
+                Printer().print(
+                    content=f"Summarizing {idx}/{total_chunks}...",
+                    color="yellow",
+                )
+            conversation_text = _format_messages_for_summary(chunk)
+            summarization_messages = [
+                format_message_for_llm(
+                    i18n.slice("summarizer_system_message"), role="system"
+                ),
+                format_message_for_llm(
+                    i18n.slice("summarize_instruction").format(
+                        conversation=conversation_text
+                    ),
+                ),
+            ]
+            summary = llm.call(summarization_messages, callbacks=callbacks)
+            extracted = _extract_summary_tags(str(summary))
+            summarized_contents.append({"content": extracted})
+    else:
+        # Multiple chunks — summarize in parallel via asyncio
        if verbose:
            Printer().print(
-                content=f"Summarizing {idx}/{total_groups}...",
+                content=f"Summarizing {total_chunks} chunks in parallel...",
                color="yellow",
            )
-
-        summarization_messages = [
-            format_message_for_llm(
-                i18n.slice("summarizer_system_message"), role="system"
-            ),
-            format_message_for_llm(
-                i18n.slice("summarize_instruction").format(group=group["content"]),
-            ),
-        ]
-        summary = llm.call(
-            summarization_messages,
-            callbacks=callbacks,
+        coro = _asummarize_chunks(
+            chunks=chunks, llm=llm, callbacks=callbacks, i18n=i18n
        )
-        summarized_contents.append({"content": str(summary)})
+        if is_inside_event_loop():
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+                summarized_contents = pool.submit(asyncio.run, coro).result()
+        else:
+            summarized_contents = asyncio.run(coro)

-    merged_summary = " ".join(content["content"] for content in summarized_contents)
+    merged_summary = "\n\n".join(content["content"] for content in summarized_contents)

+    # 5. Reconstruct messages: [system messages...] + [summary user message]
    messages.clear()
+    messages.extend(system_messages)
+
    summary_message = format_message_for_llm(
        i18n.slice("summary").format(merged_summary=merged_summary)
    )
--- a/lib/crewai/tests/cassettes/utilities/TestAgentExecuteTaskCompaction.test_agent_execute_task_compaction.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestAgentExecuteTaskCompaction.test_agent_execute_task_compaction.yaml
@@ -0,0 +1,113 @@
+interactions:
+- request:
+    body: '{"messages":[{"role":"system","content":"You are Writer. You are a skilled
+      writer.\nYour personal goal is: Write concise content"},{"role":"user","content":"\nCurrent
+      Task: Write one sentence about the sun.\n\nThis is the expected criteria for
+      your final answer: A single sentence about the sun.\nyou MUST return the actual
+      complete content as the final answer, not a summary.\n\nProvide your complete
+      response:"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '453'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7RxEngFVCbqdc7tNjV3VjeteqcwT\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668124,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"The sun is a massive ball of glowing
+        gas at the center of our solar system, providing light and warmth essential
+        for life on Earth.\",\n        \"refusal\": null,\n        \"annotations\":
+        []\n      },\n      \"logprobs\": null,\n      \"finish_reason\": \"stop\"\n
+        \   }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 78,\n    \"completion_tokens\":
+        27,\n    \"total_tokens\": 105,\n    \"prompt_tokens_details\": {\n      \"cached_tokens\":
+        0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:15:25 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '664'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/utilities/TestCrewKickoffCompaction.test_crew_kickoff_compaction_openai.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestCrewKickoffCompaction.test_crew_kickoff_compaction_openai.yaml
@@ -0,0 +1,120 @@
+interactions:
+- request:
+    body: '{"messages":[{"role":"system","content":"You are Researcher. You are an
+      expert researcher.\nYour personal goal is: Find information about Python programming"},{"role":"user","content":"\nCurrent
+      Task: What is Python? Give a brief answer.\n\nThis is the expected criteria
+      for your final answer: A short description of Python.\nyou MUST return the actual
+      complete content as the final answer, not a summary.\n\nProvide your complete
+      response:"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '482'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7RxRv3U0LCLf2iqf40wxOQsuiYFR\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668137,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"Python is a high-level, interpreted
+        programming language known for its readability and simplicity. It was created
+        by Guido van Rossum and first released in 1991. Python supports multiple programming
+        paradigms, including procedural, object-oriented, and functional programming.
+        It has a large standard library and is widely used for web development, data
+        analysis, artificial intelligence, scientific computing, and automation, among
+        other applications. Python's syntax emphasizes code readability, allowing
+        developers to express concepts in fewer lines of code compared to other languages.
+        Its active community and extensive ecosystem of libraries and frameworks make
+        it a popular choice for both beginners and experienced programmers.\",\n        \"refusal\":
+        null,\n        \"annotations\": []\n      },\n      \"logprobs\": null,\n
+        \     \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\":
+        82,\n    \"completion_tokens\": 123,\n    \"total_tokens\": 205,\n    \"prompt_tokens_details\":
+        {\n      \"cached_tokens\": 0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:15:39 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '2467'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/utilities/TestParallelSummarizationVCR.test_parallel_summarize_openai.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestParallelSummarizationVCR.test_parallel_summarize_openai.yaml
@@ -0,0 +1,435 @@
+interactions:
+- request:
+    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
+      creates structured summaries of agent conversations. You preserve critical context
+      needed for seamless task continuation."},{"role":"user","content":"Analyze the
+      following conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Explain the
+      Python package ecosystem. How does pip work, what is PyPI, and what are virtual
+      environments? Compare pip with conda and uv.\n\n[ASSISTANT]: PyPI (Python Package
+      Index) is the official repository hosting 400k+ packages. pip is the standard
+      package installer that downloads from PyPI. Virtual environments (venv) create
+      isolated Python installations to avoid dependency conflicts between projects.
+      conda is a cross-language package manager popular in data science that can manage
+      non-Python dependencies. uv is a new Rust-based tool that is 10-100x faster
+      than pip and aims to replace pip, pip-tools, and virtualenv with a single unified
+      tool.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task Overview**:
+      What is the agent trying to accomplish?\n2. **Current State**: What has been
+      completed so far? What step is the agent on?\n3. **Important Discoveries**:
+      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
+      Steps**: What should the agent do next based on the conversation?\n5. **Context
+      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
+      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
+      structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1687'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - async:asyncio
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7S93xpUu9d5twM82uJOZpurQTD5u\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668857,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"<summary>\\n1. **Task Overview**: The
+        user is seeking an explanation of the Python package ecosystem, specifically
+        focusing on how pip works, the role of PyPI, the concept of virtual environments,
+        and a comparison between pip, conda, and uv.\\n\\n2. **Current State**: The
+        assistant has provided a comprehensive overview of the Python package ecosystem,
+        including definitions and comparisons of pip, PyPI, virtual environments,
+        conda, and uv.\\n\\n3. **Important Discoveries**:\\n   - PyPI (Python Package
+        Index) is the official repository with over 400,000 packages.\\n   - pip is
+        the standard package installer that downloads packages from PyPI.\\n   - Virtual
+        environments (venv) allow for isolated Python installations to prevent dependency
+        conflicts.\\n   - conda is a cross-language package manager, particularly
+        popular in data science, that can manage non-Python dependencies.\\n   - uv
+        is a new Rust-based tool that is significantly faster than pip (10-100x) and
+        aims to unify the functionalities of pip, pip-tools, and virtualenv.\\n\\n4.
+        **Next Steps**: The agent should consider providing further details on how
+        to use pip, conda, and uv, including installation commands, examples of creating
+        virtual environments, and any specific use cases for each tool.\\n\\n5. **Context
+        to Preserve**: \\n   - PyPI: Python Package Index, hosting 400k+ packages.\\n
+        \  - pip: Standard package installer for Python.\\n   - Virtual environments
+        (venv): Isolated Python installations.\\n   - conda: Cross-language package
+        manager for data science.\\n   - uv: Rust-based tool, 10-100x faster than
+        pip, aims to replace pip, pip-tools, and virtualenv.\\n</summary>\",\n        \"refusal\":
+        null,\n        \"annotations\": []\n      },\n      \"logprobs\": null,\n
+        \     \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\":
+        333,\n    \"completion_tokens\": 354,\n    \"total_tokens\": 687,\n    \"prompt_tokens_details\":
+        {\n      \"cached_tokens\": 0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:27:42 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '4879'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
+      creates structured summaries of agent conversations. You preserve critical context
+      needed for seamless task continuation."},{"role":"user","content":"Analyze the
+      following conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Tell me about
+      the history of the Python programming language. Who created it, when was it
+      first released, and what were the main design goals? Please provide a detailed
+      overview covering the major milestones from its inception through Python 3.\n\n[ASSISTANT]:
+      Python was created by Guido van Rossum and first released in 1991. The main
+      design goals were code readability and simplicity. Key milestones: Python 1.0
+      (1994) introduced functional programming tools like lambda and map. Python 2.0
+      (2000) added list comprehensions and garbage collection. Python 3.0 (2008) was
+      a major backward-incompatible release that fixed fundamental design flaws. Python
+      2 reached end-of-life in January 2020.\n</conversation>\n\nCreate a summary
+      with these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
+      **Current State**: What has been completed so far? What step is the agent on?\n3.
+      **Important Discoveries**: Key facts, data, tool results, or findings that must
+      not be lost.\n4. **Next Steps**: What should the agent do next based on the
+      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
+      code snippets, or details referenced in the conversation.\n\nWrap your entire
+      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1726'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - async:asyncio
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7S93rBUMAtEdwdI6Y2ga0s50IFtv\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668857,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"<summary>\\n1. **Task Overview**: The
+        user is seeking a detailed overview of the history of the Python programming
+        language, including its creator, initial release date, main design goals,
+        and major milestones up to Python 3.\\n\\n2. **Current State**: The assistant
+        has provided a comprehensive response detailing the history of Python, including
+        its creator (Guido van Rossum), first release (1991), main design goals (code
+        readability and simplicity), and key milestones (Python 1.0 in 1994, Python
+        2.0 in 2000, and Python 3.0 in 2008).\\n\\n3. **Important Discoveries**: \\n
+        \  - Python was created by Guido van Rossum.\\n   - First released in 1991.\\n
+        \  - Main design goals: code readability and simplicity.\\n   - Key milestones:\\n
+        \    - Python 1.0 (1994): Introduced functional programming tools like lambda
+        and map.\\n     - Python 2.0 (2000): Added list comprehensions and garbage
+        collection.\\n     - Python 3.0 (2008): Major backward-incompatible release
+        that fixed fundamental design flaws.\\n   - Python 2 reached end-of-life in
+        January 2020.\\n\\n4. **Next Steps**: The agent should be prepared to provide
+        additional details or answer follow-up questions regarding Python's features,
+        community, or specific use cases if the user requests more information.\\n\\n5.
+        **Context to Preserve**: \\n   - Creator: Guido van Rossum\\n   - Initial
+        release: 1991\\n   - Milestones: \\n     - Python 1.0 (1994)\\n     - Python
+        2.0 (2000)\\n     - Python 3.0 (2008)\\n   - End-of-life for Python 2: January
+        2020\\n</summary>\",\n        \"refusal\": null,\n        \"annotations\":
+        []\n      },\n      \"logprobs\": null,\n      \"finish_reason\": \"stop\"\n
+        \   }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 346,\n    \"completion_tokens\":
+        372,\n    \"total_tokens\": 718,\n    \"prompt_tokens_details\": {\n      \"cached_tokens\":
+        0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_7e4bf6ad56\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:27:42 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '5097'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
+      creates structured summaries of agent conversations. You preserve critical context
+      needed for seamless task continuation."},{"role":"user","content":"Analyze the
+      following conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: What about
+      the async/await features? When were they introduced and how do they compare
+      to similar features in JavaScript and C#? Also explain the Global Interpreter
+      Lock and its implications.\n\n[ASSISTANT]: Async/await was introduced in Python
+      3.5 (PEP 492, 2015). Unlike JavaScript which is single-threaded by design, Python''s
+      asyncio is an opt-in framework. C# introduced async/await in 2012 (C# 5.0) and
+      was a major inspiration for Python''s implementation. The GIL (Global Interpreter
+      Lock) is a mutex that protects access to Python objects, preventing multiple
+      threads from executing Python bytecodes simultaneously. This means CPU-bound
+      multithreaded programs don''t benefit from multiple cores. PEP 703 proposes
+      making the GIL optional in CPython.\n</conversation>\n\nCreate a summary with
+      these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
+      **Current State**: What has been completed so far? What step is the agent on?\n3.
+      **Important Discoveries**: Key facts, data, tool results, or findings that must
+      not be lost.\n4. **Next Steps**: What should the agent do next based on the
+      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
+      code snippets, or details referenced in the conversation.\n\nWrap your entire
+      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1786'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - async:asyncio
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7S94auQYOLDTKfRzdluGiWAomSqd\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668858,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"<summary>\\n1. **Task Overview**: The
+        user is seeking information about the async/await features in Python, their
+        introduction timeline, comparisons with similar features in JavaScript and
+        C#, and an explanation of the Global Interpreter Lock (GIL) and its implications.\\n\\n2.
+        **Current State**: The assistant has provided information regarding the introduction
+        of async/await in Python (version 3.5, PEP 492 in 2015), comparisons with
+        JavaScript and C# (C# introduced async/await in 2012), and an explanation
+        of the GIL.\\n\\n3. **Important Discoveries**: \\n   - Async/await was introduced
+        in Python 3.5 (PEP 492, 2015).\\n   - JavaScript is single-threaded, while
+        Python's asyncio is an opt-in framework.\\n   - C# introduced async/await
+        in 2012 (C# 5.0) and influenced Python's implementation.\\n   - The GIL (Global
+        Interpreter Lock) is a mutex that prevents multiple threads from executing
+        Python bytecodes simultaneously, affecting CPU-bound multithreaded programs.\\n
+        \  - PEP 703 proposes making the GIL optional in CPython.\\n\\n4. **Next Steps**:
+        The agent should consider providing more detailed comparisons of async/await
+        features between Python, JavaScript, and C#, as well as further implications
+        of the GIL and PEP 703.\\n\\n5. **Context to Preserve**: \\n   - Python async/await
+        introduction: 3.5 (PEP 492, 2015)\\n   - C# async/await introduction: 2012
+        (C# 5.0)\\n   - GIL (Global Interpreter Lock) explanation and implications.\\n
+        \  - Reference to PEP 703 regarding the GIL.\\n</summary>\",\n        \"refusal\":
+        null,\n        \"annotations\": []\n      },\n      \"logprobs\": null,\n
+        \     \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\":
+        364,\n    \"completion_tokens\": 368,\n    \"total_tokens\": 732,\n    \"prompt_tokens_details\":
+        {\n      \"cached_tokens\": 0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:27:44 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '6339'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/utilities/TestParallelSummarizationVCR.test_parallel_summarize_preserves_files.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestParallelSummarizationVCR.test_parallel_summarize_preserves_files.yaml
@@ -0,0 +1,435 @@
+interactions:
+- request:
+    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
+      creates structured summaries of agent conversations. You preserve critical context
+      needed for seamless task continuation."},{"role":"user","content":"Analyze the
+      following conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Explain the
+      Python package ecosystem. How does pip work, what is PyPI, and what are virtual
+      environments? Compare pip with conda and uv.\n\n[ASSISTANT]: PyPI (Python Package
+      Index) is the official repository hosting 400k+ packages. pip is the standard
+      package installer that downloads from PyPI. Virtual environments (venv) create
+      isolated Python installations to avoid dependency conflicts between projects.
+      conda is a cross-language package manager popular in data science that can manage
+      non-Python dependencies. uv is a new Rust-based tool that is 10-100x faster
+      than pip and aims to replace pip, pip-tools, and virtualenv with a single unified
+      tool.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task Overview**:
+      What is the agent trying to accomplish?\n2. **Current State**: What has been
+      completed so far? What step is the agent on?\n3. **Important Discoveries**:
+      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
+      Steps**: What should the agent do next based on the conversation?\n5. **Context
+      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
+      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
+      structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1687'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - async:asyncio
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7S9PnjkuCMHqU912kcH8G5zIIxQU\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668879,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"<summary>\\n1. **Task Overview**: The
+        user is seeking an explanation of the Python package ecosystem, specifically
+        focusing on how pip works, the role of PyPI, the concept of virtual environments,
+        and a comparison between pip, conda, and uv.\\n\\n2. **Current State**: The
+        assistant has provided a comprehensive overview of the Python package ecosystem,
+        including definitions and comparisons of pip, PyPI, virtual environments,
+        conda, and uv.\\n\\n3. **Important Discoveries**:\\n   - PyPI (Python Package
+        Index) is the official repository with over 400,000 packages.\\n   - pip is
+        the standard package installer that downloads packages from PyPI.\\n   - Virtual
+        environments (venv) allow for isolated Python installations to prevent dependency
+        conflicts.\\n   - conda is a cross-language package manager, particularly
+        popular in data science, that can manage non-Python dependencies.\\n   - uv
+        is a new Rust-based tool that is significantly faster than pip (10-100x) and
+        aims to unify the functionalities of pip, pip-tools, and virtualenv.\\n\\n4.
+        **Next Steps**: The agent should consider providing further details or examples
+        on how to use pip, conda, and uv, as well as practical applications of virtual
+        environments in Python projects.\\n\\n5. **Context to Preserve**: \\n   -
+        PyPI: Python Package Index, hosting 400k+ packages.\\n   - pip: Standard package
+        installer for Python.\\n   - Virtual environments (venv): Isolated Python
+        installations.\\n   - conda: Cross-language package manager for data science.\\n
+        \  - uv: Rust-based tool, 10-100x faster than pip, aims to replace pip, pip-tools,
+        and virtualenv.\\n</summary>\",\n        \"refusal\": null,\n        \"annotations\":
+        []\n      },\n      \"logprobs\": null,\n      \"finish_reason\": \"stop\"\n
+        \   }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 333,\n    \"completion_tokens\":
+        349,\n    \"total_tokens\": 682,\n    \"prompt_tokens_details\": {\n      \"cached_tokens\":
+        0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:28:04 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '4979'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
+      creates structured summaries of agent conversations. You preserve critical context
+      needed for seamless task continuation."},{"role":"user","content":"Analyze the
+      following conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Tell me about
+      the history of the Python programming language. Who created it, when was it
+      first released, and what were the main design goals? Please provide a detailed
+      overview covering the major milestones from its inception through Python 3.\n\n[ASSISTANT]:
+      Python was created by Guido van Rossum and first released in 1991. The main
+      design goals were code readability and simplicity. Key milestones: Python 1.0
+      (1994) introduced functional programming tools like lambda and map. Python 2.0
+      (2000) added list comprehensions and garbage collection. Python 3.0 (2008) was
+      a major backward-incompatible release that fixed fundamental design flaws. Python
+      2 reached end-of-life in January 2020.\n</conversation>\n\nCreate a summary
+      with these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
+      **Current State**: What has been completed so far? What step is the agent on?\n3.
+      **Important Discoveries**: Key facts, data, tool results, or findings that must
+      not be lost.\n4. **Next Steps**: What should the agent do next based on the
+      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
+      code snippets, or details referenced in the conversation.\n\nWrap your entire
+      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1726'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - async:asyncio
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7S9PqglWRu0PEoMRHyOiRnpn3yqU\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668879,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"<summary>\\n1. **Task Overview**: The
+        user is seeking a detailed overview of the history of the Python programming
+        language, including its creator, initial release date, main design goals,
+        and major milestones up to Python 3.\\n\\n2. **Current State**: The assistant
+        has provided a comprehensive response detailing the history of Python, including
+        its creator (Guido van Rossum), first release (1991), main design goals (code
+        readability and simplicity), and key milestones (Python 1.0 in 1994, Python
+        2.0 in 2000, and Python 3.0 in 2008).\\n\\n3. **Important Discoveries**: \\n
+        \  - Python was created by Guido van Rossum.\\n   - First released in 1991.\\n
+        \  - Main design goals: code readability and simplicity.\\n   - Key milestones:\\n
+        \    - Python 1.0 (1994): Introduced functional programming tools like lambda
+        and map.\\n     - Python 2.0 (2000): Added list comprehensions and garbage
+        collection.\\n     - Python 3.0 (2008): Major backward-incompatible release
+        that fixed fundamental design flaws.\\n   - Python 2 reached end-of-life in
+        January 2020.\\n\\n4. **Next Steps**: The agent should be prepared to provide
+        further details or answer any follow-up questions the user may have regarding
+        Python's history or its features.\\n\\n5. **Context to Preserve**: \\n   -
+        Creator: Guido van Rossum\\n   - First release: 1991\\n   - Milestones: \\n
+        \    - Python 1.0 (1994)\\n     - Python 2.0 (2000)\\n     - Python 3.0 (2008)\\n
+        \  - End-of-life for Python 2: January 2020\\n</summary>\",\n        \"refusal\":
+        null,\n        \"annotations\": []\n      },\n      \"logprobs\": null,\n
+        \     \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\":
+        346,\n    \"completion_tokens\": 367,\n    \"total_tokens\": 713,\n    \"prompt_tokens_details\":
+        {\n      \"cached_tokens\": 0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_7e4bf6ad56\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:28:04 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '5368'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
+      creates structured summaries of agent conversations. You preserve critical context
+      needed for seamless task continuation."},{"role":"user","content":"Analyze the
+      following conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: What about
+      the async/await features? When were they introduced and how do they compare
+      to similar features in JavaScript and C#? Also explain the Global Interpreter
+      Lock and its implications.\n\n[ASSISTANT]: Async/await was introduced in Python
+      3.5 (PEP 492, 2015). Unlike JavaScript which is single-threaded by design, Python''s
+      asyncio is an opt-in framework. C# introduced async/await in 2012 (C# 5.0) and
+      was a major inspiration for Python''s implementation. The GIL (Global Interpreter
+      Lock) is a mutex that protects access to Python objects, preventing multiple
+      threads from executing Python bytecodes simultaneously. This means CPU-bound
+      multithreaded programs don''t benefit from multiple cores. PEP 703 proposes
+      making the GIL optional in CPython.\n</conversation>\n\nCreate a summary with
+      these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
+      **Current State**: What has been completed so far? What step is the agent on?\n3.
+      **Important Discoveries**: Key facts, data, tool results, or findings that must
+      not be lost.\n4. **Next Steps**: What should the agent do next based on the
+      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
+      code snippets, or details referenced in the conversation.\n\nWrap your entire
+      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1786'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - async:asyncio
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7S9Pcl5ybKLH8cSEZ6hgPuvj5iCv\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668879,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"<summary>\\n1. **Task Overview**: The
+        user is seeking information about the async/await features in Python, their
+        introduction timeline, comparisons with similar features in JavaScript and
+        C#, and an explanation of the Global Interpreter Lock (GIL) and its implications.\\n\\n2.
+        **Current State**: The assistant has provided information regarding the introduction
+        of async/await in Python (version 3.5, PEP 492 in 2015), comparisons with
+        JavaScript and C# (C# introduced async/await in 2012), and an explanation
+        of the GIL.\\n\\n3. **Important Discoveries**: \\n   - Async/await was introduced
+        in Python 3.5 (PEP 492, 2015).\\n   - JavaScript is single-threaded, while
+        Python's asyncio is an opt-in framework.\\n   - C# introduced async/await
+        in 2012 (C# 5.0) and influenced Python's implementation.\\n   - The GIL (Global
+        Interpreter Lock) is a mutex that prevents multiple threads from executing
+        Python bytecodes simultaneously, affecting CPU-bound multithreaded programs.\\n
+        \  - PEP 703 proposes making the GIL optional in CPython.\\n\\n4. **Next Steps**:
+        The agent should consider providing further details on how async/await is
+        implemented in Python, JavaScript, and C#, and explore the implications of
+        the GIL in more depth, including potential alternatives or workarounds.\\n\\n5.
+        **Context to Preserve**: \\n   - Python async/await introduction: version
+        3.5, PEP 492, 2015.\\n   - C# async/await introduction: 2012, C# 5.0.\\n   -
+        GIL (Global Interpreter Lock) and its implications on multithreading in Python.\\n
+        \  - Reference to PEP 703 regarding the GIL.\\n</summary>\",\n        \"refusal\":
+        null,\n        \"annotations\": []\n      },\n      \"logprobs\": null,\n
+        \     \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\":
+        364,\n    \"completion_tokens\": 381,\n    \"total_tokens\": 745,\n    \"prompt_tokens_details\":
+        {\n      \"cached_tokens\": 0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:28:04 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '5489'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/utilities/TestSummarizeDirectAnthropic.test_summarize_direct_anthropic.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestSummarizeDirectAnthropic.test_summarize_direct_anthropic.yaml
@@ -0,0 +1,136 @@
+interactions:
+- request:
+    body: '{"max_tokens":4096,"messages":[{"role":"user","content":"Analyze the following
+      conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Research
+      the latest developments in large language models. Focus on architecture improvements
+      and training techniques.\n\n[ASSISTANT]: I''ll research the latest developments
+      in large language models. Based on my knowledge, recent advances include:\n1.
+      Mixture of Experts (MoE) architectures\n2. Improved attention mechanisms like
+      Flash Attention\n3. Better training data curation techniques\n4. Constitutional
+      AI and RLHF improvements\n\n[USER]: Can you go deeper on the MoE architectures?
+      What are the key papers?\n\n[ASSISTANT]: Key papers on Mixture of Experts:\n-
+      Switch Transformers (Google, 2021) - simplified MoE routing\n- GShard - scaling
+      to 600B parameters\n- Mixtral (Mistral AI) - open-source MoE model\nThe main
+      advantage is computational efficiency: only a subset of experts is activated
+      per token.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task
+      Overview**: What is the agent trying to accomplish?\n2. **Current State**: What
+      has been completed so far? What step is the agent on?\n3. **Important Discoveries**:
+      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
+      Steps**: What should the agent do next based on the conversation?\n5. **Context
+      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
+      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
+      structured summary here]\n</summary>"}],"model":"claude-3-5-haiku-latest","stream":false,"system":"You
+      are a precise assistant that creates structured summaries of agent conversations.
+      You preserve critical context needed for seamless task continuation.","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      anthropic-version:
+      - '2023-06-01'
+      connection:
+      - keep-alive
+      content-length:
+      - '1870'
+      content-type:
+      - application/json
+      host:
+      - api.anthropic.com
+      x-api-key:
+      - X-API-KEY-XXX
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 0.73.0
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+      x-stainless-timeout:
+      - NOT_GIVEN
+    method: POST
+    uri: https://api.anthropic.com/v1/messages
+  response:
+    body:
+      string: '{"model":"claude-3-5-haiku-20241022","id":"msg_01SK3LP6RedPBmpvD1HfKD23","type":"message","role":"assistant","content":[{"type":"text","text":"<summary>\n1.
+        **Task Overview**:\n- Research latest developments in large language models\n-
+        Focus on architecture improvements and training techniques\n\n2. **Current
+        State**:\n- Initial research completed on broad developments\n- Currently
+        exploring Mixture of Experts (MoE) architectures in depth\n- Detailed discussion
+        of key MoE research papers initiated\n\n3. **Important Discoveries**:\nMoE
+        Architecture Insights:\n- Computational efficiency through selective expert
+        activation\n- Key research papers:\n  * Switch Transformers (Google, 2021)\n  *
+        GShard\n  * Mixtral (Mistral AI)\n- Main benefit: Only subset of experts activated
+        per token\n\n4. **Next Steps**:\n- Conduct deeper analysis of MoE architecture
+        mechanisms\n- Compare routing strategies across different MoE implementations\n-
+        Investigate performance metrics and scalability of MoE models\n\n5. **Context
+        to Preserve**:\n- Research Focus: Large Language Model Architectures\n- Specific
+        Interest: Mixture of Experts (MoE) Architectures\n- Key Researchers/Organizations:
+        Google, Mistral AI\n- Years of Significant Papers: 2021 onwards\n</summary>"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":400,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":270,"service_tier":"standard","inference_geo":"not_available"}}'
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Security-Policy:
+      - CSP-FILTERED
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:18:41 GMT
+      Server:
+      - cloudflare
+      Transfer-Encoding:
+      - chunked
+      X-Robots-Tag:
+      - none
+      anthropic-organization-id:
+      - ANTHROPIC-ORGANIZATION-ID-XXX
+      anthropic-ratelimit-input-tokens-limit:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-input-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-input-tokens-reset:
+      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
+      anthropic-ratelimit-output-tokens-limit:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-output-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-output-tokens-reset:
+      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
+      anthropic-ratelimit-requests-limit:
+      - '4000'
+      anthropic-ratelimit-requests-remaining:
+      - '3999'
+      anthropic-ratelimit-requests-reset:
+      - '2026-02-09T20:18:35Z'
+      anthropic-ratelimit-tokens-limit:
+      - ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
+      anthropic-ratelimit-tokens-remaining:
+      - ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
+      anthropic-ratelimit-tokens-reset:
+      - ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
+      cf-cache-status:
+      - DYNAMIC
+      request-id:
+      - REQUEST-ID-XXX
+      strict-transport-security:
+      - STS-XXX
+      x-envoy-upstream-service-time:
+      - '5639'
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/utilities/TestSummarizeDirectAzure.test_summarize_direct_azure.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestSummarizeDirectAzure.test_summarize_direct_azure.yaml
@@ -0,0 +1,110 @@
+interactions:
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are a precise assistant
+      that creates structured summaries of agent conversations. You preserve critical
+      context needed for seamless task continuation."}, {"role": "user", "content":
+      "Analyze the following conversation and create a structured summary that preserves
+      all information needed to continue the task seamlessly.\n\n<conversation>\n[USER]:
+      Research the latest developments in large language models. Focus on architecture
+      improvements and training techniques.\n\n[ASSISTANT]: I''ll research the latest
+      developments in large language models. Based on my knowledge, recent advances
+      include:\n1. Mixture of Experts (MoE) architectures\n2. Improved attention mechanisms
+      like Flash Attention\n3. Better training data curation techniques\n4. Constitutional
+      AI and RLHF improvements\n\n[USER]: Can you go deeper on the MoE architectures?
+      What are the key papers?\n\n[ASSISTANT]: Key papers on Mixture of Experts:\n-
+      Switch Transformers (Google, 2021) - simplified MoE routing\n- GShard - scaling
+      to 600B parameters\n- Mixtral (Mistral AI) - open-source MoE model\nThe main
+      advantage is computational efficiency: only a subset of experts is activated
+      per token.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task
+      Overview**: What is the agent trying to accomplish?\n2. **Current State**: What
+      has been completed so far? What step is the agent on?\n3. **Important Discoveries**:
+      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
+      Steps**: What should the agent do next based on the conversation?\n5. **Context
+      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
+      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
+      structured summary here]\n</summary>"}], "stream": false, "temperature": 0}'
+    headers:
+      Accept:
+      - application/json
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '1849'
+      Content-Type:
+      - application/json
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      api-key:
+      - X-API-KEY-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      x-ms-client-request-id:
+      - X-MS-CLIENT-REQUEST-ID-XXX
+    method: POST
+    uri: https://fake-azure-endpoint.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-12-01-preview
+  response:
+    body:
+      string: '{"choices":[{"content_filter_results":{"hate":{"filtered":false,"severity":"safe"},"protected_material_code":{"filtered":false,"detected":false},"protected_material_text":{"filtered":false,"detected":false},"self_harm":{"filtered":false,"severity":"safe"},"sexual":{"filtered":false,"severity":"safe"},"violence":{"filtered":false,"severity":"safe"}},"finish_reason":"stop","index":0,"logprobs":null,"message":{"annotations":[],"content":"\u003csummary\u003e\n1.
+        **Task Overview**: The user has requested research on the latest developments
+        in large language models, specifically focusing on architecture improvements
+        and training techniques.\n\n2. **Current State**: The assistant has provided
+        an initial overview of recent advances in large language models, including
+        Mixture of Experts (MoE) architectures, improved attention mechanisms, better
+        training data curation techniques, and advancements in Constitutional AI and
+        Reinforcement Learning from Human Feedback (RLHF).\n\n3. **Important Discoveries**:
+        \n   - Recent advances in large language models include:\n     1. Mixture
+        of Experts (MoE) architectures\n     2. Improved attention mechanisms like
+        Flash Attention\n     3. Better training data curation techniques\n     4.
+        Constitutional AI and RLHF improvements\n   - Key papers on Mixture of Experts:\n     -
+        Switch Transformers (Google, 2021) - simplified MoE routing\n     - GShard
+        - scaling to 600B parameters\n     - Mixtral (Mistral AI) - open-source MoE
+        model\n   - The main advantage of MoE architectures is computational efficiency,
+        as only a subset of experts is activated per token.\n\n4. **Next Steps**:
+        The assistant should delve deeper into the Mixture of Experts architectures,
+        potentially summarizing the key findings and implications from the identified
+        papers.\n\n5. **Context to Preserve**: \n   - Key papers: \n     - Switch
+        Transformers (Google, 2021)\n     - GShard\n     - Mixtral (Mistral AI)\n   -
+        Focus on computational efficiency of MoE architectures.\n\u003c/summary\u003e","refusal":null,"role":"assistant"}}],"created":1770849953,"id":"chatcmpl-D8DFx1H1zzEerW5H0BWfuwmio2sz1","model":"gpt-4o-mini-2024-07-18","object":"chat.completion","prompt_filter_results":[{"prompt_index":0,"content_filter_results":{"hate":{"filtered":false,"severity":"safe"},"jailbreak":{"filtered":false,"detected":false},"self_harm":{"filtered":false,"severity":"safe"},"sexual":{"filtered":false,"severity":"safe"},"violence":{"filtered":false,"severity":"safe"}}}],"system_fingerprint":"fp_f97eff32c5","usage":{"completion_tokens":328,"completion_tokens_details":{"accepted_prediction_tokens":0,"audio_tokens":0,"reasoning_tokens":0,"rejected_prediction_tokens":0},"prompt_tokens":368,"prompt_tokens_details":{"audio_tokens":0,"cached_tokens":0},"total_tokens":696}}
+
+        '
+    headers:
+      Content-Length:
+      - '2786'
+      Content-Type:
+      - application/json
+      Date:
+      - Wed, 11 Feb 2026 22:45:56 GMT
+      Strict-Transport-Security:
+      - STS-XXX
+      apim-request-id:
+      - APIM-REQUEST-ID-XXX
+      azureml-model-session:
+      - AZUREML-MODEL-SESSION-XXX
+      x-accel-buffering:
+      - 'no'
+      x-content-type-options:
+      - X-CONTENT-TYPE-XXX
+      x-ms-client-request-id:
+      - X-MS-CLIENT-REQUEST-ID-XXX
+      x-ms-deployment-name:
+      - gpt-4o-mini
+      x-ms-rai-invoked:
+      - 'true'
+      x-ms-region:
+      - X-MS-REGION-XXX
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/utilities/TestSummarizeDirectGemini.test_summarize_direct_gemini.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestSummarizeDirectGemini.test_summarize_direct_gemini.yaml
@@ -0,0 +1,103 @@
+interactions:
+- request:
+    body: '{"contents": [{"parts": [{"text": "Analyze the following conversation and
+      create a structured summary that preserves all information needed to continue
+      the task seamlessly.\n\n<conversation>\n[USER]: Research the latest developments
+      in large language models. Focus on architecture improvements and training techniques.\n\n[ASSISTANT]:
+      I''ll research the latest developments in large language models. Based on my
+      knowledge, recent advances include:\n1. Mixture of Experts (MoE) architectures\n2.
+      Improved attention mechanisms like Flash Attention\n3. Better training data
+      curation techniques\n4. Constitutional AI and RLHF improvements\n\n[USER]: Can
+      you go deeper on the MoE architectures? What are the key papers?\n\n[ASSISTANT]:
+      Key papers on Mixture of Experts:\n- Switch Transformers (Google, 2021) - simplified
+      MoE routing\n- GShard - scaling to 600B parameters\n- Mixtral (Mistral AI) -
+      open-source MoE model\nThe main advantage is computational efficiency: only
+      a subset of experts is activated per token.\n</conversation>\n\nCreate a summary
+      with these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
+      **Current State**: What has been completed so far? What step is the agent on?\n3.
+      **Important Discoveries**: Key facts, data, tool results, or findings that must
+      not be lost.\n4. **Next Steps**: What should the agent do next based on the
+      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
+      code snippets, or details referenced in the conversation.\n\nWrap your entire
+      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],
+      "role": "user"}], "systemInstruction": {"parts": [{"text": "You are a precise
+      assistant that creates structured summaries of agent conversations. You preserve
+      critical context needed for seamless task continuation."}], "role": "user"},
+      "generationConfig": {"temperature": 0.0}}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - '*/*'
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1895'
+      content-type:
+      - application/json
+      host:
+      - generativelanguage.googleapis.com
+      x-goog-api-client:
+      - google-genai-sdk/1.49.0 gl-python/3.13.3
+      x-goog-api-key:
+      - X-GOOG-API-KEY-XXX
+    method: POST
+    uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
+  response:
+    body:
+      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
+        [\n          {\n            \"text\": \"```xml\\n\\u003csummary\\u003e\\n**Task
+        Overview**: Research the latest developments in large language models, focusing
+        on architecture improvements and training techniques.\\n\\n**Current State**:
+        The agent has identified several key areas of advancement in LLMs: Mixture
+        of Experts (MoE) architectures, improved attention mechanisms (Flash Attention),
+        better training data curation, and Constitutional AI/RLHF improvements. The
+        user has requested a deeper dive into MoE architectures. The agent has provided
+        an initial overview of MoE architectures and listed some key papers.\\n\\n**Important
+        Discoveries**:\\n*   Key MoE papers: Switch Transformers (Google, 2021), GShard,
+        Mixtral (Mistral AI).\\n*   MoE advantage: Computational efficiency through
+        selective activation of experts.\\n\\n**Next Steps**: Continue researching
+        MoE architectures based on the user's request for more detail. The agent should
+        elaborate further on the listed papers and potentially find more recent or
+        relevant publications.\\n\\n**Context to Preserve**:\\n*   Focus areas: Architecture
+        improvements and training techniques for LLMs.\\n*   Specific architectures:
+        Mixture of Experts (MoE), Flash Attention.\\n*   Training techniques: Data
+        curation, Constitutional AI, RLHF.\\n*   Specific papers: Switch Transformers
+        (Google, 2021), GShard, Mixtral (Mistral AI).\\n\\u003c/summary\\u003e\\n```\"\n
+        \         }\n        ],\n        \"role\": \"model\"\n      },\n      \"finishReason\":
+        \"STOP\",\n      \"avgLogprobs\": -0.14186729703630721\n    }\n  ],\n  \"usageMetadata\":
+        {\n    \"promptTokenCount\": 373,\n    \"candidatesTokenCount\": 280,\n    \"totalTokenCount\":
+        653,\n    \"promptTokensDetails\": [\n      {\n        \"modality\": \"TEXT\",\n
+        \       \"tokenCount\": 373\n      }\n    ],\n    \"candidatesTokensDetails\":
+        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 280\n
+        \     }\n    ]\n  },\n  \"modelVersion\": \"gemini-2.0-flash\",\n  \"responseId\":
+        \"GEGKabP3OcGH-8YPzZCj2Ao\"\n}\n"
+    headers:
+      Alt-Svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      Content-Type:
+      - application/json; charset=UTF-8
+      Date:
+      - Mon, 09 Feb 2026 20:18:35 GMT
+      Server:
+      - scaffolding on HTTPServer2
+      Server-Timing:
+      - gfet4t7; dur=2310
+      Transfer-Encoding:
+      - chunked
+      Vary:
+      - Origin
+      - X-Origin
+      - Referer
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      X-Frame-Options:
+      - X-FRAME-OPTIONS-XXX
+      X-XSS-Protection:
+      - '0'
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/utilities/TestSummarizeDirectOpenAI.test_summarize_direct_openai.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestSummarizeDirectOpenAI.test_summarize_direct_openai.yaml
@@ -0,0 +1,148 @@
+interactions:
+- request:
+    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
+      creates structured summaries of agent conversations. You preserve critical context
+      needed for seamless task continuation."},{"role":"user","content":"Analyze the
+      following conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Research
+      the latest developments in large language models. Focus on architecture improvements
+      and training techniques.\n\n[ASSISTANT]: I''ll research the latest developments
+      in large language models. Based on my knowledge, recent advances include:\n1.
+      Mixture of Experts (MoE) architectures\n2. Improved attention mechanisms like
+      Flash Attention\n3. Better training data curation techniques\n4. Constitutional
+      AI and RLHF improvements\n\n[USER]: Can you go deeper on the MoE architectures?
+      What are the key papers?\n\n[ASSISTANT]: Key papers on Mixture of Experts:\n-
+      Switch Transformers (Google, 2021) - simplified MoE routing\n- GShard - scaling
+      to 600B parameters\n- Mixtral (Mistral AI) - open-source MoE model\nThe main
+      advantage is computational efficiency: only a subset of experts is activated
+      per token.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task
+      Overview**: What is the agent trying to accomplish?\n2. **Current State**: What
+      has been completed so far? What step is the agent on?\n3. **Important Discoveries**:
+      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
+      Steps**: What should the agent do next based on the conversation?\n5. **Context
+      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
+      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
+      structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1844'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7RxGISdQet8JsWImiwzHQ2S9gSD4\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668126,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"<summary>\\n1. **Task Overview**: The
+        agent is tasked with researching the latest developments in large language
+        models, specifically focusing on architecture improvements and training techniques.\\n\\n2.
+        **Current State**: The agent has identified several recent advances in large
+        language models, including Mixture of Experts (MoE) architectures, improved
+        attention mechanisms, better training data curation techniques, and advancements
+        in Constitutional AI and Reinforcement Learning from Human Feedback (RLHF).\\n\\n3.
+        **Important Discoveries**: \\n   - Recent advances in large language models
+        include:\\n     1. Mixture of Experts (MoE) architectures\\n     2. Improved
+        attention mechanisms like Flash Attention\\n     3. Better training data curation
+        techniques\\n     4. Constitutional AI and RLHF improvements\\n   - Key papers
+        on Mixture of Experts:\\n     - Switch Transformers (Google, 2021) - simplified
+        MoE routing\\n     - GShard - scaling to 600B parameters\\n     - Mixtral
+        (Mistral AI) - open-source MoE model\\n   - The main advantage of MoE architectures
+        is computational efficiency, as only a subset of experts is activated per
+        token.\\n\\n4. **Next Steps**: The agent should delve deeper into the Mixture
+        of Experts architectures, reviewing the key papers mentioned and summarizing
+        their contributions and implications for large language models.\\n\\n5. **Context
+        to Preserve**: \\n   - Key papers: \\n     - Switch Transformers (Google,
+        2021)\\n     - GShard\\n     - Mixtral (Mistral AI)\\n   - Focus on computational
+        efficiency of MoE architectures.\\n</summary>\",\n        \"refusal\": null,\n
+        \       \"annotations\": []\n      },\n      \"logprobs\": null,\n      \"finish_reason\":
+        \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 368,\n    \"completion_tokens\":
+        328,\n    \"total_tokens\": 696,\n    \"prompt_tokens_details\": {\n      \"cached_tokens\":
+        0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:15:32 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '5395'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/utilities/TestSummarizePreservesFiles.test_summarize_preserves_files_integration.yaml
+++ b/lib/crewai/tests/cassettes/utilities/TestSummarizePreservesFiles.test_summarize_preserves_files_integration.yaml
@@ -0,0 +1,145 @@
+interactions:
+- request:
+    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
+      creates structured summaries of agent conversations. You preserve critical context
+      needed for seamless task continuation."},{"role":"user","content":"Analyze the
+      following conversation and create a structured summary that preserves all information
+      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Research
+      the latest developments in large language models. Focus on architecture improvements
+      and training techniques.\n\n[ASSISTANT]: I''ll research the latest developments
+      in large language models. Based on my knowledge, recent advances include:\n1.
+      Mixture of Experts (MoE) architectures\n2. Improved attention mechanisms like
+      Flash Attention\n3. Better training data curation techniques\n4. Constitutional
+      AI and RLHF improvements\n\n[USER]: Can you go deeper on the MoE architectures?
+      What are the key papers?\n\n[ASSISTANT]: Key papers on Mixture of Experts:\n-
+      Switch Transformers (Google, 2021) - simplified MoE routing\n- GShard - scaling
+      to 600B parameters\n- Mixtral (Mistral AI) - open-source MoE model\nThe main
+      advantage is computational efficiency: only a subset of experts is activated
+      per token.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task
+      Overview**: What is the agent trying to accomplish?\n2. **Current State**: What
+      has been completed so far? What step is the agent on?\n3. **Important Discoveries**:
+      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
+      Steps**: What should the agent do next based on the conversation?\n5. **Context
+      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
+      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
+      structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - application/json
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      authorization:
+      - AUTHORIZATION-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '1844'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      x-stainless-arch:
+      - X-STAINLESS-ARCH-XXX
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - X-STAINLESS-OS-XXX
+      x-stainless-package-version:
+      - 1.83.0
+      x-stainless-read-timeout:
+      - X-STAINLESS-READ-TIMEOUT-XXX
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.13.3
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: "{\n  \"id\": \"chatcmpl-D7RxM4n36QoACHrC0QocV1pXIwvtD\",\n  \"object\":
+        \"chat.completion\",\n  \"created\": 1770668132,\n  \"model\": \"gpt-4o-mini-2024-07-18\",\n
+        \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+        \"assistant\",\n        \"content\": \"<summary>\\n1. **Task Overview**: The
+        user has requested research on the latest developments in large language models,
+        specifically focusing on architecture improvements and training techniques.\\n\\n2.
+        **Current State**: The assistant has identified several recent advances in
+        large language models, including Mixture of Experts (MoE) architectures, improved
+        attention mechanisms, better training data curation techniques, and advancements
+        in Constitutional AI and Reinforcement Learning from Human Feedback (RLHF).\\n\\n3.
+        **Important Discoveries**: \\n   - Key papers on Mixture of Experts (MoE)
+        architectures:\\n     - \\\"Switch Transformers\\\" (Google, 2021) - simplified
+        MoE routing.\\n     - \\\"GShard\\\" - scaling to 600B parameters.\\n     -
+        \\\"Mixtral\\\" (Mistral AI) - open-source MoE model.\\n   - The main advantage
+        of MoE architectures is computational efficiency, as only a subset of experts
+        is activated per token.\\n\\n4. **Next Steps**: The assistant should delve
+        deeper into the Mixture of Experts architectures, potentially summarizing
+        the findings from the key papers mentioned.\\n\\n5. **Context to Preserve**:
+        \\n   - Key papers: \\\"Switch Transformers,\\\" \\\"GShard,\\\" \\\"Mixtral.\\\"\\n
+        \  - Notable organizations: Google, Mistral AI.\\n   - Focus areas: MoE architectures,
+        computational efficiency.\\n</summary>\",\n        \"refusal\": null,\n        \"annotations\":
+        []\n      },\n      \"logprobs\": null,\n      \"finish_reason\": \"stop\"\n
+        \   }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 368,\n    \"completion_tokens\":
+        275,\n    \"total_tokens\": 643,\n    \"prompt_tokens_details\": {\n      \"cached_tokens\":
+        0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+        {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+        0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\":
+        \"default\",\n  \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
+    headers:
+      CF-RAY:
+      - CF-RAY-XXX
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 09 Feb 2026 20:15:36 GMT
+      Server:
+      - cloudflare
+      Strict-Transport-Security:
+      - STS-XXX
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      access-control-expose-headers:
+      - ACCESS-CONTROL-XXX
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - OPENAI-ORG-XXX
+      openai-processing-ms:
+      - '4188'
+      openai-project:
+      - OPENAI-PROJECT-XXX
+      openai-version:
+      - '2020-10-01'
+      set-cookie:
+      - SET-COOKIE-XXX
+      x-openai-proxy-wasm:
+      - v0.1
+      x-ratelimit-limit-requests:
+      - X-RATELIMIT-LIMIT-REQUESTS-XXX
+      x-ratelimit-limit-tokens:
+      - X-RATELIMIT-LIMIT-TOKENS-XXX
+      x-ratelimit-remaining-requests:
+      - X-RATELIMIT-REMAINING-REQUESTS-XXX
+      x-ratelimit-remaining-tokens:
+      - X-RATELIMIT-REMAINING-TOKENS-XXX
+      x-ratelimit-reset-requests:
+      - X-RATELIMIT-RESET-REQUESTS-XXX
+      x-ratelimit-reset-tokens:
+      - X-RATELIMIT-RESET-TOKENS-XXX
+      x-request-id:
+      - X-REQUEST-ID-XXX
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/utilities/test_agent_utils.py
+++ b/lib/crewai/tests/utilities/test_agent_utils.py
@@ -2,13 +2,23 @@

 from __future__ import annotations

+import asyncio
 from typing import Any
-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch

+import pytest
 from pydantic import BaseModel, Field

 from crewai.tools.base_tool import BaseTool
-from crewai.utilities.agent_utils import convert_tools_to_openai_schema, summarize_messages
+from crewai.utilities.agent_utils import (
+    _asummarize_chunks,
+    _estimate_token_count,
+    _extract_summary_tags,
+    _format_messages_for_summary,
+    _split_messages_into_chunks,
+    convert_tools_to_openai_schema,
+    summarize_messages,
+)


 class CalculatorInput(BaseModel):
@@ -214,6 +224,17 @@ class TestConvertToolsToOpenaiSchema:
        assert max_results_prop["default"] == 10


+def _make_mock_i18n() -> MagicMock:
+    """Create a mock i18n with the new structured prompt keys."""
+    mock_i18n = MagicMock()
+    mock_i18n.slice.side_effect = lambda key: {
+        "summarizer_system_message": "You are a precise assistant that creates structured summaries.",
+        "summarize_instruction": "Summarize the conversation:\n{conversation}",
+        "summary": "<summary>\n{merged_summary}\n</summary>\nContinue the task.",
+    }.get(key, "")
+    return mock_i18n
+
+
 class TestSummarizeMessages:
    """Tests for summarize_messages function."""

@@ -229,26 +250,22 @@ class TestSummarizeMessages:

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summarized conversation about image analysis."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summarized conversation about image analysis.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
        )

-        assert len(messages) == 1
-        assert messages[0]["role"] == "user"
-        assert "files" in messages[0]
-        assert messages[0]["files"] == mock_files
+        # System message preserved + summary message = 2
+        assert len(messages) == 2
+        assert messages[0]["role"] == "system"
+        summary_msg = messages[1]
+        assert summary_msg["role"] == "user"
+        assert "files" in summary_msg
+        assert summary_msg["files"] == mock_files

    def test_merges_files_from_multiple_user_messages(self) -> None:
        """Test that files from multiple user messages are merged."""
@@ -264,20 +281,13 @@ class TestSummarizeMessages:

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summarized conversation."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summarized conversation.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
        )

        assert len(messages) == 1
@@ -297,20 +307,13 @@ class TestSummarizeMessages:

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "A greeting exchange."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>A greeting exchange.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
        )

        assert len(messages) == 1
@@ -327,21 +330,595 @@ class TestSummarizeMessages:

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summary"
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summary</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
        )

        assert id(messages) == original_list_id
        assert len(messages) == 1
+
+    def test_preserves_system_messages(self) -> None:
+        """Test that system messages are preserved and not summarized."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "You are a research assistant."},
+            {"role": "user", "content": "Find information about AI."},
+            {"role": "assistant", "content": "I found several resources on AI."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>User asked about AI, assistant found resources.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        assert len(messages) == 2
+        assert messages[0]["role"] == "system"
+        assert messages[0]["content"] == "You are a research assistant."
+        assert messages[1]["role"] == "user"
+
+    def test_formats_conversation_with_role_labels(self) -> None:
+        """Test that the LLM receives role-labeled conversation text."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt."},
+            {"role": "user", "content": "Hello there"},
+            {"role": "assistant", "content": "Hi! How can I help?"},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>Greeting exchange.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # Check what was passed to llm.call
+        call_args = mock_llm.call.call_args[0][0]
+        user_msg_content = call_args[1]["content"]
+        assert "[USER]:" in user_msg_content
+        assert "[ASSISTANT]:" in user_msg_content
+        # System content should NOT appear in summarization input
+        assert "System prompt." not in user_msg_content
+
+    def test_extracts_summary_from_tags(self) -> None:
+        """Test that <summary> tags are extracted from LLM response."""
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Do something."},
+            {"role": "assistant", "content": "Done."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "Here is the summary:\n<summary>The extracted summary content.</summary>\nExtra text."
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        assert "The extracted summary content." in messages[0]["content"]
+
+    def test_handles_tool_messages(self) -> None:
+        """Test that tool messages are properly formatted in summarization."""
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Search for Python."},
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"function": {"name": "web_search", "arguments": '{"query": "Python"}'}}
+            ]},
+            {"role": "tool", "content": "Python is a programming language.", "name": "web_search"},
+            {"role": "assistant", "content": "Python is a programming language."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>User searched for Python info.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # Verify the conversation text sent to LLM contains tool labels
+        call_args = mock_llm.call.call_args[0][0]
+        user_msg_content = call_args[1]["content"]
+        assert "[TOOL_RESULT (web_search)]:" in user_msg_content
+
+    def test_only_system_messages_no_op(self) -> None:
+        """Test that only system messages results in no-op (no summarization)."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "system", "content": "Additional system instructions."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # No LLM call should have been made
+        mock_llm.call.assert_not_called()
+        # System messages should remain untouched
+        assert len(messages) == 2
+        assert messages[0]["content"] == "You are a helpful assistant."
+        assert messages[1]["content"] == "Additional system instructions."
+
+
+class TestFormatMessagesForSummary:
+    """Tests for _format_messages_for_summary helper."""
+
+    def test_skips_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+            {"role": "user", "content": "Hello"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "System prompt" not in result
+        assert "[USER]: Hello" in result
+
+    def test_formats_user_and_assistant(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Question"},
+            {"role": "assistant", "content": "Answer"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[USER]: Question" in result
+        assert "[ASSISTANT]: Answer" in result
+
+    def test_formats_tool_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "tool", "content": "Result data", "name": "search_tool"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[TOOL_RESULT (search_tool)]:" in result
+        assert "Result data" in result
+
+    def test_handles_none_content_with_tool_calls(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"function": {"name": "calculator", "arguments": "{}"}}
+            ]},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[Called tools: calculator]" in result
+
+    def test_handles_none_content_without_tool_calls(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[ASSISTANT]:" in result
+
+    def test_handles_multimodal_content(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": [
+                {"type": "text", "text": "Describe this image"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
+            ]},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[USER]: Describe this image" in result
+
+    def test_empty_messages(self) -> None:
+        result = _format_messages_for_summary([])
+        assert result == ""
+
+
+class TestExtractSummaryTags:
+    """Tests for _extract_summary_tags helper."""
+
+    def test_extracts_content_from_tags(self) -> None:
+        text = "Preamble\n<summary>The actual summary.</summary>\nPostamble"
+        assert _extract_summary_tags(text) == "The actual summary."
+
+    def test_handles_multiline_content(self) -> None:
+        text = "<summary>\nLine 1\nLine 2\nLine 3\n</summary>"
+        result = _extract_summary_tags(text)
+        assert "Line 1" in result
+        assert "Line 2" in result
+        assert "Line 3" in result
+
+    def test_falls_back_when_no_tags(self) -> None:
+        text = "Just a plain summary without tags."
+        assert _extract_summary_tags(text) == text
+
+    def test_handles_empty_string(self) -> None:
+        assert _extract_summary_tags("") == ""
+
+    def test_extracts_first_match(self) -> None:
+        text = "<summary>First</summary> text <summary>Second</summary>"
+        assert _extract_summary_tags(text) == "First"
+
+
+class TestSplitMessagesIntoChunks:
+    """Tests for _split_messages_into_chunks helper."""
+
+    def test_single_chunk_when_under_limit(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        assert len(chunks[0]) == 2
+
+    def test_splits_at_message_boundaries(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "A" * 100},  # ~25 tokens
+            {"role": "assistant", "content": "B" * 100},  # ~25 tokens
+            {"role": "user", "content": "C" * 100},  # ~25 tokens
+        ]
+        # max_tokens=30 should cause splits
+        chunks = _split_messages_into_chunks(messages, max_tokens=30)
+        assert len(chunks) == 3
+
+    def test_excludes_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+            {"role": "user", "content": "Hello"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        # The system message should not be in any chunk
+        for chunk in chunks:
+            for msg in chunk:
+                assert msg.get("role") != "system"
+
+    def test_empty_messages(self) -> None:
+        chunks = _split_messages_into_chunks([], max_tokens=1000)
+        assert chunks == []
+
+    def test_only_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert chunks == []
+
+    def test_handles_none_content(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None},
+            {"role": "user", "content": "Follow up"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        assert len(chunks[0]) == 2
+
+
+class TestEstimateTokenCount:
+    """Tests for _estimate_token_count helper."""
+
+    def test_empty_string(self) -> None:
+        assert _estimate_token_count("") == 0
+
+    def test_short_string(self) -> None:
+        assert _estimate_token_count("hello") == 1  # 5 // 4 = 1
+
+    def test_longer_string(self) -> None:
+        assert _estimate_token_count("a" * 100) == 25  # 100 // 4 = 25
+
+    def test_approximation_is_conservative(self) -> None:
+        # For English text, actual token count is typically lower than char/4
+        text = "The quick brown fox jumps over the lazy dog."
+        estimated = _estimate_token_count(text)
+        assert estimated > 0
+        assert estimated == len(text) // 4
+
+
+class TestParallelSummarization:
+    """Tests for parallel chunk summarization via asyncio."""
+
+    def _make_messages_for_n_chunks(self, n: int) -> list[dict[str, Any]]:
+        """Build a message list that will produce exactly *n* chunks.
+
+        Each message is just over 400 chars (~100 estimated tokens). With the mock
+        LLM reporting a context window of 100, each message lands in its own chunk.
+        """
+        msgs: list[dict[str, Any]] = []
+        for i in range(n):
+            msgs.append({"role": "user", "content": f"msg-{i} " + "x" * 400})
+        return msgs
+
+    def test_multiple_chunks_use_acall(self) -> None:
+        """When there are multiple chunks, summarize_messages should use
+        llm.acall (parallel) instead of llm.call (sequential)."""
+        messages = self._make_messages_for_n_chunks(3)
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 100  # force multiple chunks
+        mock_llm.acall = AsyncMock(
+            side_effect=[
+                "<summary>Summary chunk 1</summary>",
+                "<summary>Summary chunk 2</summary>",
+                "<summary>Summary chunk 3</summary>",
+            ]
+        )
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # acall should have been awaited once per chunk
+        assert mock_llm.acall.await_count == 3
+        # sync call should NOT have been used for chunk summarization
+        mock_llm.call.assert_not_called()
+
+    def test_single_chunk_uses_sync_call(self) -> None:
+        """When there is only one chunk, summarize_messages should use
+        the sync llm.call path (no async overhead)."""
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Short message"},
+            {"role": "assistant", "content": "Short reply"},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 100_000
+        mock_llm.call.return_value = "<summary>Short summary</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        mock_llm.call.assert_called_once()
+
+    def test_parallel_results_preserve_order(self) -> None:
+        """Summaries must appear in the same order as the original chunks,
+        regardless of which async call finishes first."""
+        messages = self._make_messages_for_n_chunks(3)
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 100
+
+        # Simulate varying latencies — chunk 1 returns first, then chunk 2, then chunk 0
+        async def _delayed_acall(msgs: Any, **kwargs: Any) -> str:
+            user_content = msgs[1]["content"]
+            if "msg-0" in user_content:
+                await asyncio.sleep(0.05)
+                return "<summary>Summary-A</summary>"
+            elif "msg-1" in user_content:
+                return "<summary>Summary-B</summary>"  # fastest
+            else:
+                await asyncio.sleep(0.02)
+                return "<summary>Summary-C</summary>"
+
+        mock_llm.acall = _delayed_acall
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # The final summary message should have A, B, C in order
+        summary_content = messages[-1]["content"]
+        pos_a = summary_content.index("Summary-A")
+        pos_b = summary_content.index("Summary-B")
+        pos_c = summary_content.index("Summary-C")
+        assert pos_a < pos_b < pos_c
+
+    def test_asummarize_chunks_returns_ordered_results(self) -> None:
+        """Direct test of the async helper _asummarize_chunks."""
+        chunk_a: list[dict[str, Any]] = [{"role": "user", "content": "Chunk A"}]
+        chunk_b: list[dict[str, Any]] = [{"role": "user", "content": "Chunk B"}]
+
+        mock_llm = MagicMock()
+        mock_llm.acall = AsyncMock(
+            side_effect=[
+                "<summary>Result A</summary>",
+                "<summary>Result B</summary>",
+            ]
+        )
+
+        results = asyncio.run(
+            _asummarize_chunks(
+                chunks=[chunk_a, chunk_b],
+                llm=mock_llm,
+                callbacks=[],
+                i18n=_make_mock_i18n(),
+            )
+        )
+
+        assert len(results) == 2
+        assert results[0]["content"] == "Result A"
+        assert results[1]["content"] == "Result B"
+
+    @patch("crewai.utilities.agent_utils.is_inside_event_loop", return_value=True)
+    def test_works_inside_existing_event_loop(self, _mock_loop: Any) -> None:
+        """When called from inside a running event loop (e.g. a Flow),
+        the ThreadPoolExecutor fallback should still work."""
+        messages = self._make_messages_for_n_chunks(2)
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 100
+        mock_llm.acall = AsyncMock(
+            side_effect=[
+                "<summary>Flow summary 1</summary>",
+                "<summary>Flow summary 2</summary>",
+            ]
+        )
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        assert mock_llm.acall.await_count == 2
+        # Verify the merged summary made it into messages
+        assert "Flow summary 1" in messages[-1]["content"]
+        assert "Flow summary 2" in messages[-1]["content"]
+
+
+def _build_long_conversation() -> list[dict[str, Any]]:
+    """Build a multi-turn conversation that produces multiple chunks at max_tokens=200.
+
+    Assistant replies are roughly 110-140 estimated tokens (~430-560 chars) and user
+    turns roughly 35-60 (~135-245 chars), so a max_tokens of 200 yields roughly 3 chunks from 6 messages.
+    """
+    return [
+        {
+            "role": "system",
+            "content": "You are a helpful research assistant.",
+        },
+        {
+            "role": "user",
+            "content": (
+                "Tell me about the history of the Python programming language. "
+                "Who created it, when was it first released, and what were the "
+                "main design goals? Please provide a detailed overview covering "
+                "the major milestones from its inception through Python 3."
+            ),
+        },
+        {
+            "role": "assistant",
+            "content": (
+                "Python was created by Guido van Rossum and first released in 1991. "
+                "The main design goals were code readability and simplicity. Key milestones: "
+                "Python 1.0 (1994) introduced functional programming tools like lambda and map. "
+                "Python 2.0 (2000) added list comprehensions and garbage collection. "
+                "Python 3.0 (2008) was a major backward-incompatible release that fixed "
+                "fundamental design flaws. Python 2 reached end-of-life in January 2020."
+            ),
+        },
+        {
+            "role": "user",
+            "content": (
+                "What about the async/await features? When were they introduced "
+                "and how do they compare to similar features in JavaScript and C#? "
+                "Also explain the Global Interpreter Lock and its implications."
+            ),
+        },
+        {
+            "role": "assistant",
+            "content": (
+                "Async/await was introduced in Python 3.5 (PEP 492, 2015). "
+                "Unlike JavaScript which is single-threaded by design, Python's asyncio "
+                "is an opt-in framework. C# introduced async/await in 2012 (C# 5.0) and "
+                "was a major inspiration for Python's implementation. "
+                "The GIL (Global Interpreter Lock) is a mutex that protects access to "
+                "Python objects, preventing multiple threads from executing Python bytecodes "
+                "simultaneously. This means CPU-bound multithreaded programs don't benefit "
+                "from multiple cores. PEP 703 proposes making the GIL optional in CPython."
+            ),
+        },
+        {
+            "role": "user",
+            "content": (
+                "Explain the Python package ecosystem. How does pip work, what is PyPI, "
+                "and what are virtual environments? Compare pip with conda and uv."
+            ),
+        },
+        {
+            "role": "assistant",
+            "content": (
+                "PyPI (Python Package Index) is the official repository hosting 400k+ packages. "
+                "pip is the standard package installer that downloads from PyPI. "
+                "Virtual environments (venv) create isolated Python installations to avoid "
+                "dependency conflicts between projects. conda is a cross-language package manager "
+                "popular in data science that can manage non-Python dependencies. "
+                "uv is a new Rust-based tool that is 10-100x faster than pip and aims to replace "
+                "pip, pip-tools, and virtualenv with a single unified tool."
+            ),
+        },
+    ]
+
+
+class TestParallelSummarizationVCR:
+    """VCR-backed integration tests for parallel summarization.
+
+    These tests use a real LLM but patch get_context_window_size to force
+    multiple chunks, exercising the asyncio.gather + acall parallel path.
+
+    To record cassettes:
+        PYTEST_VCR_RECORD_MODE=all uv run pytest lib/crewai/tests/utilities/test_agent_utils.py::TestParallelSummarizationVCR -v
+    """
+
+    @pytest.mark.vcr()
+    def test_parallel_summarize_openai(self) -> None:
+        """Test that parallel summarization with gpt-4o-mini produces a valid summary."""
+        from crewai.llm import LLM
+        from crewai.utilities.i18n import I18N
+
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_long_conversation()
+
+        original_system = messages[0]["content"]
+
+        # Patch get_context_window_size to return 200 — forces multiple chunks
+        with patch.object(type(llm), "get_context_window_size", return_value=200):
+            # Verify we actually get multiple chunks with this window size
+            non_system = [m for m in messages if m.get("role") != "system"]
+            chunks = _split_messages_into_chunks(non_system, max_tokens=200)
+            assert len(chunks) > 1, f"Expected multiple chunks, got {len(chunks)}"
+
+            summarize_messages(
+                messages=messages,
+                llm=llm,
+                callbacks=[],
+                i18n=i18n,
+            )
+
+        # System message preserved
+        assert messages[0]["role"] == "system"
+        assert messages[0]["content"] == original_system
+
+        # Summary produced as a user message
+        summary_msg = messages[-1]
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+
+    @pytest.mark.vcr()
+    def test_parallel_summarize_preserves_files(self) -> None:
+        """Test that file references survive parallel summarization."""
+        from crewai.llm import LLM
+        from crewai.utilities.i18n import I18N
+
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_long_conversation()
+
+        mock_file = MagicMock()
+        messages[1]["files"] = {"report.pdf": mock_file}
+
+        with patch.object(type(llm), "get_context_window_size", return_value=200):
+            summarize_messages(
+                messages=messages,
+                llm=llm,
+                callbacks=[],
+                i18n=i18n,
+            )
+
+        summary_msg = messages[-1]
+        assert summary_msg["role"] == "user"
+        assert "files" in summary_msg
+        assert "report.pdf" in summary_msg["files"]
--- a/lib/crewai/tests/utilities/test_summarize_integration.py
+++ b/lib/crewai/tests/utilities/test_summarize_integration.py
@@ -0,0 +1,284 @@
+"""
+Integration tests for structured context compaction (summarize_messages).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+
+from crewai.agent import Agent
+from crewai.crew import Crew
+from crewai.llm import LLM
+from crewai.task import Task
+from crewai.utilities.agent_utils import summarize_messages
+from crewai.utilities.i18n import I18N
+
+
+def _build_conversation_messages(
+    *, include_system: bool = True, include_files: bool = False
+) -> list[dict[str, Any]]:
+    """Build a 4-turn user/assistant chat, optionally with system prompt and files."""
+    messages: list[dict[str, Any]] = []
+
+    if include_system:  # optional leading system prompt
+        messages.append(
+            {
+                "role": "system",
+                "content": (
+                    "You are a research assistant specializing in AI topics. "
+                    "Your goal is to find accurate, up-to-date information."
+                ),
+            }
+        )
+
+    user_msg: dict[str, Any] = {
+        "role": "user",
+        "content": (
+            "Research the latest developments in large language models. "
+            "Focus on architecture improvements and training techniques."
+        ),
+    }
+    if include_files:  # attach a mock file to the first user turn
+        user_msg["files"] = {"reference.pdf": MagicMock()}
+    messages.append(user_msg)
+
+    messages.append(
+        {
+            "role": "assistant",
+            "content": (
+                "I'll research the latest developments in large language models. "
+                "Based on my knowledge, recent advances include:\n"
+                "1. Mixture of Experts (MoE) architectures\n"
+                "2. Improved attention mechanisms like Flash Attention\n"
+                "3. Better training data curation techniques\n"
+                "4. Constitutional AI and RLHF improvements"
+            ),
+        }
+    )
+
+    messages.append(
+        {
+            "role": "user",
+            "content": "Can you go deeper on the MoE architectures? What are the key papers?",
+        }
+    )
+
+    messages.append(
+        {
+            "role": "assistant",
+            "content": (
+                "Key papers on Mixture of Experts:\n"
+                "- Switch Transformers (Google, 2021) - simplified MoE routing\n"
+                "- GShard - scaling to 600B parameters\n"
+                "- Mixtral (Mistral AI) - open-source MoE model\n"
+                "The main advantage is computational efficiency: "
+                "only a subset of experts is activated per token."
+            ),
+        }
+    )
+
+    return messages
+
+
+class TestSummarizeDirectOpenAI:
+    """Test direct summarize_messages calls with OpenAI."""
+
+    @pytest.mark.vcr()
+    def test_summarize_direct_openai(self) -> None:
+        """Test summarize_messages with gpt-4o-mini preserves system messages."""
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(include_system=True)
+
+        original_system_content = messages[0]["content"]  # snapshot before the call
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        # System message should still be first, with unchanged content
+        assert len(messages) >= 2
+        assert messages[0]["role"] == "system"
+        assert messages[0]["content"] == original_system_content
+
+        # Last message should be a non-empty user message with a <summary> block
+        summary_msg = messages[-1]
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+        assert "<summary>" in summary_msg["content"]
+        assert "</summary>" in summary_msg["content"]
+
+
+class TestSummarizeDirectAnthropic:
+    """Test direct summarize_messages calls with Anthropic."""
+
+    @pytest.mark.vcr()
+    def test_summarize_direct_anthropic(self) -> None:
+        """Test summarize_messages with claude-3-5-haiku yields a <summary> user message."""
+        llm = LLM(model="anthropic/claude-3-5-haiku-latest", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(include_system=True)
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        assert len(messages) >= 2
+        assert messages[0]["role"] == "system"  # role only; content is not compared
+        summary_msg = messages[-1]  # summary is expected to be the last message
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+        assert "<summary>" in summary_msg["content"]
+        assert "</summary>" in summary_msg["content"]
+
+
+class TestSummarizeDirectGemini:
+    """Test direct summarize_messages calls with Gemini."""
+
+    @pytest.mark.vcr()
+    def test_summarize_direct_gemini(self) -> None:
+        """Test summarize_messages with gemini-2.0-flash yields a <summary> user message."""
+        llm = LLM(model="gemini/gemini-2.0-flash", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(include_system=True)
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        assert len(messages) >= 2
+        assert messages[0]["role"] == "system"  # role only; content is not compared
+        summary_msg = messages[-1]  # summary is expected to be the last message
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+        assert "<summary>" in summary_msg["content"]
+        assert "</summary>" in summary_msg["content"]
+
+
+class TestSummarizeDirectAzure:
+    """Test direct summarize_messages calls with Azure."""
+
+    @pytest.mark.vcr()
+    def test_summarize_direct_azure(self) -> None:
+        """Test summarize_messages with azure/gpt-4o-mini yields a <summary> user message."""
+        llm = LLM(model="azure/gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(include_system=True)
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        assert len(messages) >= 2
+        assert messages[0]["role"] == "system"  # role only; content is not compared
+        summary_msg = messages[-1]  # summary is expected to be the last message
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+        assert "<summary>" in summary_msg["content"]
+        assert "</summary>" in summary_msg["content"]
+
+
+class TestCrewKickoffCompaction:
+    """Smoke-test Crew.kickoff() with a small context window that may trigger compaction."""
+
+    @pytest.mark.vcr()
+    def test_crew_kickoff_compaction_openai(self) -> None:
+        """Test that kickoff returns a result when context_window_size is very small."""
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        # Use a very small context window so compaction has a chance to run
+        llm.context_window_size = 500
+
+        agent = Agent(
+            role="Researcher",
+            goal="Find information about Python programming",
+            backstory="You are an expert researcher.",
+            llm=llm,
+            verbose=False,
+            max_iter=2,
+        )
+
+        task = Task(
+            description="What is Python? Give a brief answer.",
+            expected_output="A short description of Python.",
+            agent=agent,
+        )
+
+        crew = Crew(agents=[agent], tasks=[task], verbose=False)
+
+        # This may or may not trigger compaction depending on actual response sizes.
+        # The test verifies the code path doesn't crash.
+        result = crew.kickoff()
+        assert result is not None
+
+
+class TestAgentExecuteTaskCompaction:
+    """Smoke-test Agent.execute_task() with a small context window."""
+
+    @pytest.mark.vcr()
+    def test_agent_execute_task_compaction(self) -> None:
+        """Test that Agent.execute_task() works with small context_window_size."""
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        llm.context_window_size = 500  # small window; compaction itself is not asserted
+
+        agent = Agent(
+            role="Writer",
+            goal="Write concise content",
+            backstory="You are a skilled writer.",
+            llm=llm,
+            verbose=False,
+            max_iter=2,
+        )
+
+        task = Task(
+            description="Write one sentence about the sun.",
+            expected_output="A single sentence about the sun.",
+            agent=agent,
+        )
+
+        result = agent.execute_task(task=task)
+        assert result is not None  # only checks that a result came back
+
+
+class TestSummarizePreservesFiles:
+    """Test that files are preserved through a VCR-marked summarization call."""
+
+    @pytest.mark.vcr()
+    def test_summarize_preserves_files_integration(self) -> None:
+        """Test that a file attached to the first user turn ends up on the summary."""
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(
+            include_system=True, include_files=True
+        )
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        # System message still first (role only; content is not compared)
+        assert messages[0]["role"] == "system"
+
+        # Last message should hold the <summary> block and the original file key
+        summary_msg = messages[-1]
+        assert "<summary>" in summary_msg["content"]
+        assert "</summary>" in summary_msg["content"]
+        assert "files" in summary_msg
+        assert "reference.pdf" in summary_msg["files"]