mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-05-02 07:42:40 +00:00
imp compaction (#4399)
Some checks failed
* imp compaction
* fix lint
* cassette gen
* cassette gen
* improve assert
* adding azure
* fix global docstring
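The files added by this commit are vcrpy cassettes: YAML recordings of the HTTP exchanges the compaction tests make against LLM providers, replayed in CI so the suite runs deterministically and offline. As a minimal sketch of how such a cassette is consumed (the cassette filename, directory, and assertion here are illustrative, not taken from this commit):

import vcr
from openai import OpenAI

# Replay-only recorder: never hits the network, serves responses from YAML.
replay = vcr.VCR(
    cassette_library_dir="cassettes",
    record_mode="none",
    match_on=["method", "uri", "body"],
)

@replay.use_cassette("compaction_writer.yaml")  # hypothetical filename
def test_writer_completion():
    client = OpenAI(api_key="test-key")  # key is never validated during replay
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[{"role": "user", "content": "Write one sentence about the sun."}],
    )
    assert "sun" in response.choices[0].message.content.lower()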
@@ -0,0 +1,113 @@
interactions:
- request:
    body: '{"messages":[{"role":"system","content":"You are Writer. You are a skilled
      writer.\nYour personal goal is: Write concise content"},{"role":"user","content":"\nCurrent
      Task: Write one sentence about the sun.\n\nThis is the expected criteria for
      your final answer: A single sentence about the sun.\nyou MUST return the actual
      complete content as the final answer, not a summary.\n\nProvide your complete
      response:"}],"model":"gpt-4o-mini","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      authorization:
      - AUTHORIZATION-XXX
      connection:
      - keep-alive
      content-length:
      - '453'
      content-type:
      - application/json
      host:
      - api.openai.com
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - 'false'
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 1.83.0
      x-stainless-read-timeout:
      - X-STAINLESS-READ-TIMEOUT-XXX
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: "{\n \"id\": \"chatcmpl-D7RxEngFVCbqdc7tNjV3VjeteqcwT\",\n \"object\":
        \"chat.completion\",\n \"created\": 1770668124,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
        \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
        \"assistant\",\n \"content\": \"The sun is a massive ball of glowing
        gas at the center of our solar system, providing light and warmth essential
        for life on Earth.\",\n \"refusal\": null,\n \"annotations\":
        []\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n
        \ }\n ],\n \"usage\": {\n \"prompt_tokens\": 78,\n \"completion_tokens\":
        27,\n \"total_tokens\": 105,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
        0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
        {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
        0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
        \"default\",\n \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:15:25 GMT
      Server:
      - cloudflare
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - X-CONTENT-TYPE-XXX
      access-control-expose-headers:
      - ACCESS-CONTROL-XXX
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
      - '664'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
      set-cookie:
      - SET-COOKIE-XXX
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
      - X-RATELIMIT-LIMIT-TOKENS-XXX
      x-ratelimit-remaining-requests:
      - X-RATELIMIT-REMAINING-REQUESTS-XXX
      x-ratelimit-remaining-tokens:
      - X-RATELIMIT-REMAINING-TOKENS-XXX
      x-ratelimit-reset-requests:
      - X-RATELIMIT-RESET-REQUESTS-XXX
      x-ratelimit-reset-tokens:
      - X-RATELIMIT-RESET-TOKENS-XXX
      x-request-id:
      - X-REQUEST-ID-XXX
    status:
      code: 200
      message: OK
version: 1
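Note the placeholder values throughout these cassettes (X-USER-AGENT-XXX, AUTHORIZATION-XXX, the rate-limit headers, and so on): secrets and environment-specific headers are scrubbed with fixed placeholders before the cassette is written. A minimal sketch of how such scrubbing is commonly configured in vcrpy; the exact hooks used by this test suite are an assumption:

import vcr

def scrub_response(response):
    # Replace volatile or sensitive response headers with fixed placeholders
    # so cassettes stay stable across re-recordings.
    for name, placeholder in [("set-cookie", "SET-COOKIE-XXX"), ("CF-RAY", "CF-RAY-XXX")]:
        if name in response["headers"]:
            response["headers"][name] = [placeholder]
    return response

recorder = vcr.VCR(
    # Request headers are filtered at record time with (name, replacement) pairs.
    filter_headers=[
        ("authorization", "AUTHORIZATION-XXX"),
        ("user-agent", "X-USER-AGENT-XXX"),
    ],
    before_record_response=scrub_response,
)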
@@ -0,0 +1,120 @@
interactions:
- request:
    body: '{"messages":[{"role":"system","content":"You are Researcher. You are an
      expert researcher.\nYour personal goal is: Find information about Python programming"},{"role":"user","content":"\nCurrent
      Task: What is Python? Give a brief answer.\n\nThis is the expected criteria
      for your final answer: A short description of Python.\nyou MUST return the actual
      complete content as the final answer, not a summary.\n\nProvide your complete
      response:"}],"model":"gpt-4o-mini","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      authorization:
      - AUTHORIZATION-XXX
      connection:
      - keep-alive
      content-length:
      - '482'
      content-type:
      - application/json
      host:
      - api.openai.com
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - 'false'
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 1.83.0
      x-stainless-read-timeout:
      - X-STAINLESS-READ-TIMEOUT-XXX
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: "{\n \"id\": \"chatcmpl-D7RxRv3U0LCLf2iqf40wxOQsuiYFR\",\n \"object\":
        \"chat.completion\",\n \"created\": 1770668137,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
        \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
        \"assistant\",\n \"content\": \"Python is a high-level, interpreted
        programming language known for its readability and simplicity. It was created
        by Guido van Rossum and first released in 1991. Python supports multiple programming
        paradigms, including procedural, object-oriented, and functional programming.
        It has a large standard library and is widely used for web development, data
        analysis, artificial intelligence, scientific computing, and automation, among
        other applications. Python's syntax emphasizes code readability, allowing
        developers to express concepts in fewer lines of code compared to other languages.
        Its active community and extensive ecosystem of libraries and frameworks make
        it a popular choice for both beginners and experienced programmers.\",\n \"refusal\":
        null,\n \"annotations\": []\n },\n \"logprobs\": null,\n
        \ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
        82,\n \"completion_tokens\": 123,\n \"total_tokens\": 205,\n \"prompt_tokens_details\":
        {\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
        {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
        0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
        \"default\",\n \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:15:39 GMT
      Server:
      - cloudflare
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - X-CONTENT-TYPE-XXX
      access-control-expose-headers:
      - ACCESS-CONTROL-XXX
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
      - '2467'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
      set-cookie:
      - SET-COOKIE-XXX
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
      - X-RATELIMIT-LIMIT-TOKENS-XXX
      x-ratelimit-remaining-requests:
      - X-RATELIMIT-REMAINING-REQUESTS-XXX
      x-ratelimit-remaining-tokens:
      - X-RATELIMIT-REMAINING-TOKENS-XXX
      x-ratelimit-reset-requests:
      - X-RATELIMIT-RESET-REQUESTS-XXX
      x-ratelimit-reset-tokens:
      - X-RATELIMIT-RESET-TOKENS-XXX
      x-request-id:
      - X-REQUEST-ID-XXX
    status:
      code: 200
      message: OK
version: 1
@@ -0,0 +1,435 @@
interactions:
- request:
    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
      creates structured summaries of agent conversations. You preserve critical context
      needed for seamless task continuation."},{"role":"user","content":"Analyze the
      following conversation and create a structured summary that preserves all information
      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Explain the
      Python package ecosystem. How does pip work, what is PyPI, and what are virtual
      environments? Compare pip with conda and uv.\n\n[ASSISTANT]: PyPI (Python Package
      Index) is the official repository hosting 400k+ packages. pip is the standard
      package installer that downloads from PyPI. Virtual environments (venv) create
      isolated Python installations to avoid dependency conflicts between projects.
      conda is a cross-language package manager popular in data science that can manage
      non-Python dependencies. uv is a new Rust-based tool that is 10-100x faster
      than pip and aims to replace pip, pip-tools, and virtualenv with a single unified
      tool.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task Overview**:
      What is the agent trying to accomplish?\n2. **Current State**: What has been
      completed so far? What step is the agent on?\n3. **Important Discoveries**:
      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
      Steps**: What should the agent do next based on the conversation?\n5. **Context
      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
      structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      authorization:
      - AUTHORIZATION-XXX
      connection:
      - keep-alive
      content-length:
      - '1687'
      content-type:
      - application/json
      host:
      - api.openai.com
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - async:asyncio
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 1.83.0
      x-stainless-read-timeout:
      - X-STAINLESS-READ-TIMEOUT-XXX
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: "{\n \"id\": \"chatcmpl-D7S93xpUu9d5twM82uJOZpurQTD5u\",\n \"object\":
        \"chat.completion\",\n \"created\": 1770668857,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
        \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
        \"assistant\",\n \"content\": \"<summary>\\n1. **Task Overview**: The
        user is seeking an explanation of the Python package ecosystem, specifically
        focusing on how pip works, the role of PyPI, the concept of virtual environments,
        and a comparison between pip, conda, and uv.\\n\\n2. **Current State**: The
        assistant has provided a comprehensive overview of the Python package ecosystem,
        including definitions and comparisons of pip, PyPI, virtual environments,
        conda, and uv.\\n\\n3. **Important Discoveries**:\\n - PyPI (Python Package
        Index) is the official repository with over 400,000 packages.\\n - pip is
        the standard package installer that downloads packages from PyPI.\\n - Virtual
        environments (venv) allow for isolated Python installations to prevent dependency
        conflicts.\\n - conda is a cross-language package manager, particularly
        popular in data science, that can manage non-Python dependencies.\\n - uv
        is a new Rust-based tool that is significantly faster than pip (10-100x) and
        aims to unify the functionalities of pip, pip-tools, and virtualenv.\\n\\n4.
        **Next Steps**: The agent should consider providing further details on how
        to use pip, conda, and uv, including installation commands, examples of creating
        virtual environments, and any specific use cases for each tool.\\n\\n5. **Context
        to Preserve**: \\n - PyPI: Python Package Index, hosting 400k+ packages.\\n
        \ - pip: Standard package installer for Python.\\n - Virtual environments
        (venv): Isolated Python installations.\\n - conda: Cross-language package
        manager for data science.\\n - uv: Rust-based tool, 10-100x faster than
        pip, aims to replace pip, pip-tools, and virtualenv.\\n</summary>\",\n \"refusal\":
        null,\n \"annotations\": []\n },\n \"logprobs\": null,\n
        \ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
        333,\n \"completion_tokens\": 354,\n \"total_tokens\": 687,\n \"prompt_tokens_details\":
        {\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
        {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
        0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
        \"default\",\n \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:27:42 GMT
      Server:
      - cloudflare
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - X-CONTENT-TYPE-XXX
      access-control-expose-headers:
      - ACCESS-CONTROL-XXX
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
      - '4879'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
      set-cookie:
      - SET-COOKIE-XXX
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
      - X-RATELIMIT-LIMIT-TOKENS-XXX
      x-ratelimit-remaining-requests:
      - X-RATELIMIT-REMAINING-REQUESTS-XXX
      x-ratelimit-remaining-tokens:
      - X-RATELIMIT-REMAINING-TOKENS-XXX
      x-ratelimit-reset-requests:
      - X-RATELIMIT-RESET-REQUESTS-XXX
      x-ratelimit-reset-tokens:
      - X-RATELIMIT-RESET-TOKENS-XXX
      x-request-id:
      - X-REQUEST-ID-XXX
    status:
      code: 200
      message: OK
- request:
    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
      creates structured summaries of agent conversations. You preserve critical context
      needed for seamless task continuation."},{"role":"user","content":"Analyze the
      following conversation and create a structured summary that preserves all information
      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Tell me about
      the history of the Python programming language. Who created it, when was it
      first released, and what were the main design goals? Please provide a detailed
      overview covering the major milestones from its inception through Python 3.\n\n[ASSISTANT]:
      Python was created by Guido van Rossum and first released in 1991. The main
      design goals were code readability and simplicity. Key milestones: Python 1.0
      (1994) introduced functional programming tools like lambda and map. Python 2.0
      (2000) added list comprehensions and garbage collection. Python 3.0 (2008) was
      a major backward-incompatible release that fixed fundamental design flaws. Python
      2 reached end-of-life in January 2020.\n</conversation>\n\nCreate a summary
      with these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
      **Current State**: What has been completed so far? What step is the agent on?\n3.
      **Important Discoveries**: Key facts, data, tool results, or findings that must
      not be lost.\n4. **Next Steps**: What should the agent do next based on the
      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
      code snippets, or details referenced in the conversation.\n\nWrap your entire
      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      authorization:
      - AUTHORIZATION-XXX
      connection:
      - keep-alive
      content-length:
      - '1726'
      content-type:
      - application/json
      host:
      - api.openai.com
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - async:asyncio
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 1.83.0
      x-stainless-read-timeout:
      - X-STAINLESS-READ-TIMEOUT-XXX
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: "{\n \"id\": \"chatcmpl-D7S93rBUMAtEdwdI6Y2ga0s50IFtv\",\n \"object\":
        \"chat.completion\",\n \"created\": 1770668857,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
        \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
        \"assistant\",\n \"content\": \"<summary>\\n1. **Task Overview**: The
        user is seeking a detailed overview of the history of the Python programming
        language, including its creator, initial release date, main design goals,
        and major milestones up to Python 3.\\n\\n2. **Current State**: The assistant
        has provided a comprehensive response detailing the history of Python, including
        its creator (Guido van Rossum), first release (1991), main design goals (code
        readability and simplicity), and key milestones (Python 1.0 in 1994, Python
        2.0 in 2000, and Python 3.0 in 2008).\\n\\n3. **Important Discoveries**: \\n
        \ - Python was created by Guido van Rossum.\\n - First released in 1991.\\n
        \ - Main design goals: code readability and simplicity.\\n - Key milestones:\\n
        \ - Python 1.0 (1994): Introduced functional programming tools like lambda
        and map.\\n - Python 2.0 (2000): Added list comprehensions and garbage
        collection.\\n - Python 3.0 (2008): Major backward-incompatible release
        that fixed fundamental design flaws.\\n - Python 2 reached end-of-life in
        January 2020.\\n\\n4. **Next Steps**: The agent should be prepared to provide
        additional details or answer follow-up questions regarding Python's features,
        community, or specific use cases if the user requests more information.\\n\\n5.
        **Context to Preserve**: \\n - Creator: Guido van Rossum\\n - Initial
        release: 1991\\n - Milestones: \\n - Python 1.0 (1994)\\n - Python
        2.0 (2000)\\n - Python 3.0 (2008)\\n - End-of-life for Python 2: January
        2020\\n</summary>\",\n \"refusal\": null,\n \"annotations\":
        []\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n
        \ }\n ],\n \"usage\": {\n \"prompt_tokens\": 346,\n \"completion_tokens\":
        372,\n \"total_tokens\": 718,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
        0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
        {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
        0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
        \"default\",\n \"system_fingerprint\": \"fp_7e4bf6ad56\"\n}\n"
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:27:42 GMT
      Server:
      - cloudflare
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - X-CONTENT-TYPE-XXX
      access-control-expose-headers:
      - ACCESS-CONTROL-XXX
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
      - '5097'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
      set-cookie:
      - SET-COOKIE-XXX
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
      - X-RATELIMIT-LIMIT-TOKENS-XXX
      x-ratelimit-remaining-requests:
      - X-RATELIMIT-REMAINING-REQUESTS-XXX
      x-ratelimit-remaining-tokens:
      - X-RATELIMIT-REMAINING-TOKENS-XXX
      x-ratelimit-reset-requests:
      - X-RATELIMIT-RESET-REQUESTS-XXX
      x-ratelimit-reset-tokens:
      - X-RATELIMIT-RESET-TOKENS-XXX
      x-request-id:
      - X-REQUEST-ID-XXX
    status:
      code: 200
      message: OK
- request:
    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
      creates structured summaries of agent conversations. You preserve critical context
      needed for seamless task continuation."},{"role":"user","content":"Analyze the
      following conversation and create a structured summary that preserves all information
      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: What about
      the async/await features? When were they introduced and how do they compare
      to similar features in JavaScript and C#? Also explain the Global Interpreter
      Lock and its implications.\n\n[ASSISTANT]: Async/await was introduced in Python
      3.5 (PEP 492, 2015). Unlike JavaScript which is single-threaded by design, Python''s
      asyncio is an opt-in framework. C# introduced async/await in 2012 (C# 5.0) and
      was a major inspiration for Python''s implementation. The GIL (Global Interpreter
      Lock) is a mutex that protects access to Python objects, preventing multiple
      threads from executing Python bytecodes simultaneously. This means CPU-bound
      multithreaded programs don''t benefit from multiple cores. PEP 703 proposes
      making the GIL optional in CPython.\n</conversation>\n\nCreate a summary with
      these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
      **Current State**: What has been completed so far? What step is the agent on?\n3.
      **Important Discoveries**: Key facts, data, tool results, or findings that must
      not be lost.\n4. **Next Steps**: What should the agent do next based on the
      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
      code snippets, or details referenced in the conversation.\n\nWrap your entire
      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      authorization:
      - AUTHORIZATION-XXX
      connection:
      - keep-alive
      content-length:
      - '1786'
      content-type:
      - application/json
      host:
      - api.openai.com
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - async:asyncio
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 1.83.0
      x-stainless-read-timeout:
      - X-STAINLESS-READ-TIMEOUT-XXX
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: "{\n \"id\": \"chatcmpl-D7S94auQYOLDTKfRzdluGiWAomSqd\",\n \"object\":
        \"chat.completion\",\n \"created\": 1770668858,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
        \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
        \"assistant\",\n \"content\": \"<summary>\\n1. **Task Overview**: The
        user is seeking information about the async/await features in Python, their
        introduction timeline, comparisons with similar features in JavaScript and
        C#, and an explanation of the Global Interpreter Lock (GIL) and its implications.\\n\\n2.
        **Current State**: The assistant has provided information regarding the introduction
        of async/await in Python (version 3.5, PEP 492 in 2015), comparisons with
        JavaScript and C# (C# introduced async/await in 2012), and an explanation
        of the GIL.\\n\\n3. **Important Discoveries**: \\n - Async/await was introduced
        in Python 3.5 (PEP 492, 2015).\\n - JavaScript is single-threaded, while
        Python's asyncio is an opt-in framework.\\n - C# introduced async/await
        in 2012 (C# 5.0) and influenced Python's implementation.\\n - The GIL (Global
        Interpreter Lock) is a mutex that prevents multiple threads from executing
        Python bytecodes simultaneously, affecting CPU-bound multithreaded programs.\\n
        \ - PEP 703 proposes making the GIL optional in CPython.\\n\\n4. **Next Steps**:
        The agent should consider providing more detailed comparisons of async/await
        features between Python, JavaScript, and C#, as well as further implications
        of the GIL and PEP 703.\\n\\n5. **Context to Preserve**: \\n - Python async/await
        introduction: 3.5 (PEP 492, 2015)\\n - C# async/await introduction: 2012
        (C# 5.0)\\n - GIL (Global Interpreter Lock) explanation and implications.\\n
        \ - Reference to PEP 703 regarding the GIL.\\n</summary>\",\n \"refusal\":
        null,\n \"annotations\": []\n },\n \"logprobs\": null,\n
        \ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
        364,\n \"completion_tokens\": 368,\n \"total_tokens\": 732,\n \"prompt_tokens_details\":
        {\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
        {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
        0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
        \"default\",\n \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:27:44 GMT
      Server:
      - cloudflare
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - X-CONTENT-TYPE-XXX
      access-control-expose-headers:
      - ACCESS-CONTROL-XXX
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
      - '6339'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
      set-cookie:
      - SET-COOKIE-XXX
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
      - X-RATELIMIT-LIMIT-TOKENS-XXX
      x-ratelimit-remaining-requests:
      - X-RATELIMIT-REMAINING-REQUESTS-XXX
      x-ratelimit-remaining-tokens:
      - X-RATELIMIT-REMAINING-TOKENS-XXX
      x-ratelimit-reset-requests:
      - X-RATELIMIT-RESET-REQUESTS-XXX
      x-ratelimit-reset-tokens:
      - X-RATELIMIT-RESET-TOKENS-XXX
      x-request-id:
      - X-REQUEST-ID-XXX
    status:
      code: 200
      message: OK
version: 1
@@ -0,0 +1,435 @@
interactions:
- request:
    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
      creates structured summaries of agent conversations. You preserve critical context
      needed for seamless task continuation."},{"role":"user","content":"Analyze the
      following conversation and create a structured summary that preserves all information
      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Explain the
      Python package ecosystem. How does pip work, what is PyPI, and what are virtual
      environments? Compare pip with conda and uv.\n\n[ASSISTANT]: PyPI (Python Package
      Index) is the official repository hosting 400k+ packages. pip is the standard
      package installer that downloads from PyPI. Virtual environments (venv) create
      isolated Python installations to avoid dependency conflicts between projects.
      conda is a cross-language package manager popular in data science that can manage
      non-Python dependencies. uv is a new Rust-based tool that is 10-100x faster
      than pip and aims to replace pip, pip-tools, and virtualenv with a single unified
      tool.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task Overview**:
      What is the agent trying to accomplish?\n2. **Current State**: What has been
      completed so far? What step is the agent on?\n3. **Important Discoveries**:
      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
      Steps**: What should the agent do next based on the conversation?\n5. **Context
      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
      structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      authorization:
      - AUTHORIZATION-XXX
      connection:
      - keep-alive
      content-length:
      - '1687'
      content-type:
      - application/json
      host:
      - api.openai.com
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - async:asyncio
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 1.83.0
      x-stainless-read-timeout:
      - X-STAINLESS-READ-TIMEOUT-XXX
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: "{\n \"id\": \"chatcmpl-D7S9PnjkuCMHqU912kcH8G5zIIxQU\",\n \"object\":
        \"chat.completion\",\n \"created\": 1770668879,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
        \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
        \"assistant\",\n \"content\": \"<summary>\\n1. **Task Overview**: The
        user is seeking an explanation of the Python package ecosystem, specifically
        focusing on how pip works, the role of PyPI, the concept of virtual environments,
        and a comparison between pip, conda, and uv.\\n\\n2. **Current State**: The
        assistant has provided a comprehensive overview of the Python package ecosystem,
        including definitions and comparisons of pip, PyPI, virtual environments,
        conda, and uv.\\n\\n3. **Important Discoveries**:\\n - PyPI (Python Package
        Index) is the official repository with over 400,000 packages.\\n - pip is
        the standard package installer that downloads packages from PyPI.\\n - Virtual
        environments (venv) allow for isolated Python installations to prevent dependency
        conflicts.\\n - conda is a cross-language package manager, particularly
        popular in data science, that can manage non-Python dependencies.\\n - uv
        is a new Rust-based tool that is significantly faster than pip (10-100x) and
        aims to unify the functionalities of pip, pip-tools, and virtualenv.\\n\\n4.
        **Next Steps**: The agent should consider providing further details or examples
        on how to use pip, conda, and uv, as well as practical applications of virtual
        environments in Python projects.\\n\\n5. **Context to Preserve**: \\n -
        PyPI: Python Package Index, hosting 400k+ packages.\\n - pip: Standard package
        installer for Python.\\n - Virtual environments (venv): Isolated Python
        installations.\\n - conda: Cross-language package manager for data science.\\n
        \ - uv: Rust-based tool, 10-100x faster than pip, aims to replace pip, pip-tools,
        and virtualenv.\\n</summary>\",\n \"refusal\": null,\n \"annotations\":
        []\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n
        \ }\n ],\n \"usage\": {\n \"prompt_tokens\": 333,\n \"completion_tokens\":
        349,\n \"total_tokens\": 682,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
        0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
        {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
        0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
        \"default\",\n \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:28:04 GMT
      Server:
      - cloudflare
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - X-CONTENT-TYPE-XXX
      access-control-expose-headers:
      - ACCESS-CONTROL-XXX
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
      - '4979'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
      set-cookie:
      - SET-COOKIE-XXX
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
      - X-RATELIMIT-LIMIT-TOKENS-XXX
      x-ratelimit-remaining-requests:
      - X-RATELIMIT-REMAINING-REQUESTS-XXX
      x-ratelimit-remaining-tokens:
      - X-RATELIMIT-REMAINING-TOKENS-XXX
      x-ratelimit-reset-requests:
      - X-RATELIMIT-RESET-REQUESTS-XXX
      x-ratelimit-reset-tokens:
      - X-RATELIMIT-RESET-TOKENS-XXX
      x-request-id:
      - X-REQUEST-ID-XXX
    status:
      code: 200
      message: OK
- request:
    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
      creates structured summaries of agent conversations. You preserve critical context
      needed for seamless task continuation."},{"role":"user","content":"Analyze the
      following conversation and create a structured summary that preserves all information
      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Tell me about
      the history of the Python programming language. Who created it, when was it
      first released, and what were the main design goals? Please provide a detailed
      overview covering the major milestones from its inception through Python 3.\n\n[ASSISTANT]:
      Python was created by Guido van Rossum and first released in 1991. The main
      design goals were code readability and simplicity. Key milestones: Python 1.0
      (1994) introduced functional programming tools like lambda and map. Python 2.0
      (2000) added list comprehensions and garbage collection. Python 3.0 (2008) was
      a major backward-incompatible release that fixed fundamental design flaws. Python
      2 reached end-of-life in January 2020.\n</conversation>\n\nCreate a summary
      with these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
      **Current State**: What has been completed so far? What step is the agent on?\n3.
      **Important Discoveries**: Key facts, data, tool results, or findings that must
      not be lost.\n4. **Next Steps**: What should the agent do next based on the
      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
      code snippets, or details referenced in the conversation.\n\nWrap your entire
      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      authorization:
      - AUTHORIZATION-XXX
      connection:
      - keep-alive
      content-length:
      - '1726'
      content-type:
      - application/json
      host:
      - api.openai.com
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - async:asyncio
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 1.83.0
      x-stainless-read-timeout:
      - X-STAINLESS-READ-TIMEOUT-XXX
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: "{\n \"id\": \"chatcmpl-D7S9PqglWRu0PEoMRHyOiRnpn3yqU\",\n \"object\":
        \"chat.completion\",\n \"created\": 1770668879,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
        \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
        \"assistant\",\n \"content\": \"<summary>\\n1. **Task Overview**: The
        user is seeking a detailed overview of the history of the Python programming
        language, including its creator, initial release date, main design goals,
        and major milestones up to Python 3.\\n\\n2. **Current State**: The assistant
        has provided a comprehensive response detailing the history of Python, including
        its creator (Guido van Rossum), first release (1991), main design goals (code
        readability and simplicity), and key milestones (Python 1.0 in 1994, Python
        2.0 in 2000, and Python 3.0 in 2008).\\n\\n3. **Important Discoveries**: \\n
        \ - Python was created by Guido van Rossum.\\n - First released in 1991.\\n
        \ - Main design goals: code readability and simplicity.\\n - Key milestones:\\n
        \ - Python 1.0 (1994): Introduced functional programming tools like lambda
        and map.\\n - Python 2.0 (2000): Added list comprehensions and garbage
        collection.\\n - Python 3.0 (2008): Major backward-incompatible release
        that fixed fundamental design flaws.\\n - Python 2 reached end-of-life in
        January 2020.\\n\\n4. **Next Steps**: The agent should be prepared to provide
        further details or answer any follow-up questions the user may have regarding
        Python's history or its features.\\n\\n5. **Context to Preserve**: \\n -
        Creator: Guido van Rossum\\n - First release: 1991\\n - Milestones: \\n
        \ - Python 1.0 (1994)\\n - Python 2.0 (2000)\\n - Python 3.0 (2008)\\n
        \ - End-of-life for Python 2: January 2020\\n</summary>\",\n \"refusal\":
        null,\n \"annotations\": []\n },\n \"logprobs\": null,\n
        \ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
        346,\n \"completion_tokens\": 367,\n \"total_tokens\": 713,\n \"prompt_tokens_details\":
        {\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
        {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
        0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
        \"default\",\n \"system_fingerprint\": \"fp_7e4bf6ad56\"\n}\n"
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:28:04 GMT
      Server:
      - cloudflare
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - X-CONTENT-TYPE-XXX
      access-control-expose-headers:
      - ACCESS-CONTROL-XXX
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
      - '5368'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
      set-cookie:
      - SET-COOKIE-XXX
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
      - X-RATELIMIT-LIMIT-TOKENS-XXX
      x-ratelimit-remaining-requests:
      - X-RATELIMIT-REMAINING-REQUESTS-XXX
      x-ratelimit-remaining-tokens:
      - X-RATELIMIT-REMAINING-TOKENS-XXX
      x-ratelimit-reset-requests:
      - X-RATELIMIT-RESET-REQUESTS-XXX
      x-ratelimit-reset-tokens:
      - X-RATELIMIT-RESET-TOKENS-XXX
      x-request-id:
      - X-REQUEST-ID-XXX
    status:
      code: 200
      message: OK
- request:
    body: '{"messages":[{"role":"system","content":"You are a precise assistant that
      creates structured summaries of agent conversations. You preserve critical context
      needed for seamless task continuation."},{"role":"user","content":"Analyze the
      following conversation and create a structured summary that preserves all information
      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: What about
      the async/await features? When were they introduced and how do they compare
      to similar features in JavaScript and C#? Also explain the Global Interpreter
      Lock and its implications.\n\n[ASSISTANT]: Async/await was introduced in Python
      3.5 (PEP 492, 2015). Unlike JavaScript which is single-threaded by design, Python''s
      asyncio is an opt-in framework. C# introduced async/await in 2012 (C# 5.0) and
      was a major inspiration for Python''s implementation. The GIL (Global Interpreter
      Lock) is a mutex that protects access to Python objects, preventing multiple
      threads from executing Python bytecodes simultaneously. This means CPU-bound
      multithreaded programs don''t benefit from multiple cores. PEP 703 proposes
      making the GIL optional in CPython.\n</conversation>\n\nCreate a summary with
      these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
      **Current State**: What has been completed so far? What step is the agent on?\n3.
      **Important Discoveries**: Key facts, data, tool results, or findings that must
      not be lost.\n4. **Next Steps**: What should the agent do next based on the
      conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
      code snippets, or details referenced in the conversation.\n\nWrap your entire
      summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      authorization:
      - AUTHORIZATION-XXX
      connection:
      - keep-alive
      content-length:
      - '1786'
      content-type:
      - application/json
      host:
      - api.openai.com
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - async:asyncio
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 1.83.0
      x-stainless-read-timeout:
      - X-STAINLESS-READ-TIMEOUT-XXX
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: "{\n \"id\": \"chatcmpl-D7S9Pcl5ybKLH8cSEZ6hgPuvj5iCv\",\n \"object\":
        \"chat.completion\",\n \"created\": 1770668879,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
        \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
        \"assistant\",\n \"content\": \"<summary>\\n1. **Task Overview**: The
        user is seeking information about the async/await features in Python, their
        introduction timeline, comparisons with similar features in JavaScript and
        C#, and an explanation of the Global Interpreter Lock (GIL) and its implications.\\n\\n2.
        **Current State**: The assistant has provided information regarding the introduction
        of async/await in Python (version 3.5, PEP 492 in 2015), comparisons with
        JavaScript and C# (C# introduced async/await in 2012), and an explanation
        of the GIL.\\n\\n3. **Important Discoveries**: \\n - Async/await was introduced
        in Python 3.5 (PEP 492, 2015).\\n - JavaScript is single-threaded, while
        Python's asyncio is an opt-in framework.\\n - C# introduced async/await
        in 2012 (C# 5.0) and influenced Python's implementation.\\n - The GIL (Global
        Interpreter Lock) is a mutex that prevents multiple threads from executing
        Python bytecodes simultaneously, affecting CPU-bound multithreaded programs.\\n
        \ - PEP 703 proposes making the GIL optional in CPython.\\n\\n4. **Next Steps**:
        The agent should consider providing further details on how async/await is
        implemented in Python, JavaScript, and C#, and explore the implications of
        the GIL in more depth, including potential alternatives or workarounds.\\n\\n5.
        **Context to Preserve**: \\n - Python async/await introduction: version
        3.5, PEP 492, 2015.\\n - C# async/await introduction: 2012, C# 5.0.\\n -
        GIL (Global Interpreter Lock) and its implications on multithreading in Python.\\n
        \ - Reference to PEP 703 regarding the GIL.\\n</summary>\",\n \"refusal\":
        null,\n \"annotations\": []\n },\n \"logprobs\": null,\n
        \ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
        364,\n \"completion_tokens\": 381,\n \"total_tokens\": 745,\n \"prompt_tokens_details\":
        {\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
        {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
        0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
        \"default\",\n \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:28:04 GMT
      Server:
      - cloudflare
      Strict-Transport-Security:
      - STS-XXX
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - X-CONTENT-TYPE-XXX
      access-control-expose-headers:
      - ACCESS-CONTROL-XXX
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - OPENAI-ORG-XXX
      openai-processing-ms:
      - '5489'
      openai-project:
      - OPENAI-PROJECT-XXX
      openai-version:
      - '2020-10-01'
      set-cookie:
      - SET-COOKIE-XXX
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-requests:
      - X-RATELIMIT-LIMIT-REQUESTS-XXX
      x-ratelimit-limit-tokens:
      - X-RATELIMIT-LIMIT-TOKENS-XXX
      x-ratelimit-remaining-requests:
      - X-RATELIMIT-REMAINING-REQUESTS-XXX
      x-ratelimit-remaining-tokens:
      - X-RATELIMIT-REMAINING-TOKENS-XXX
      x-ratelimit-reset-requests:
      - X-RATELIMIT-RESET-REQUESTS-XXX
      x-ratelimit-reset-tokens:
      - X-RATELIMIT-RESET-TOKENS-XXX
      x-request-id:
      - X-REQUEST-ID-XXX
    status:
      code: 200
      message: OK
version: 1
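The next cassette records the same summarization prompt against Anthropic rather than OpenAI; the request carries max_tokens, a top-level system string, and the anthropic-version header. A hedged sketch of a client call that produces a request of this shape (the API key and the abbreviated prompt are illustrative):

import anthropic

client = anthropic.Anthropic(api_key="test-key")  # scrubbed to X-API-KEY-XXX on record
message = client.messages.create(
    model="claude-3-5-haiku-latest",
    max_tokens=4096,
    temperature=0,
    system="You are a precise assistant that creates structured summaries of agent conversations.",
    messages=[{"role": "user", "content": "Analyze the following conversation..."}],
)
print(message.content[0].text)  # the <summary>...</summary> block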
@@ -0,0 +1,136 @@
interactions:
- request:
    body: '{"max_tokens":4096,"messages":[{"role":"user","content":"Analyze the following
      conversation and create a structured summary that preserves all information
      needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Research
      the latest developments in large language models. Focus on architecture improvements
      and training techniques.\n\n[ASSISTANT]: I''ll research the latest developments
      in large language models. Based on my knowledge, recent advances include:\n1.
      Mixture of Experts (MoE) architectures\n2. Improved attention mechanisms like
      Flash Attention\n3. Better training data curation techniques\n4. Constitutional
      AI and RLHF improvements\n\n[USER]: Can you go deeper on the MoE architectures?
      What are the key papers?\n\n[ASSISTANT]: Key papers on Mixture of Experts:\n-
      Switch Transformers (Google, 2021) - simplified MoE routing\n- GShard - scaling
      to 600B parameters\n- Mixtral (Mistral AI) - open-source MoE model\nThe main
      advantage is computational efficiency: only a subset of experts is activated
      per token.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task
      Overview**: What is the agent trying to accomplish?\n2. **Current State**: What
      has been completed so far? What step is the agent on?\n3. **Important Discoveries**:
      Key facts, data, tool results, or findings that must not be lost.\n4. **Next
      Steps**: What should the agent do next based on the conversation?\n5. **Context
      to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
      in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
      structured summary here]\n</summary>"}],"model":"claude-3-5-haiku-latest","stream":false,"system":"You
      are a precise assistant that creates structured summaries of agent conversations.
      You preserve critical context needed for seamless task continuation.","temperature":0}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
      accept:
      - application/json
      accept-encoding:
      - ACCEPT-ENCODING-XXX
      anthropic-version:
      - '2023-06-01'
      connection:
      - keep-alive
      content-length:
      - '1870'
      content-type:
      - application/json
      host:
      - api.anthropic.com
      x-api-key:
      - X-API-KEY-XXX
      x-stainless-arch:
      - X-STAINLESS-ARCH-XXX
      x-stainless-async:
      - 'false'
      x-stainless-lang:
      - python
      x-stainless-os:
      - X-STAINLESS-OS-XXX
      x-stainless-package-version:
      - 0.73.0
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.13.3
      x-stainless-timeout:
      - NOT_GIVEN
    method: POST
    uri: https://api.anthropic.com/v1/messages
  response:
    body:
      string: '{"model":"claude-3-5-haiku-20241022","id":"msg_01SK3LP6RedPBmpvD1HfKD23","type":"message","role":"assistant","content":[{"type":"text","text":"<summary>\n1.
        **Task Overview**:\n- Research latest developments in large language models\n-
        Focus on architecture improvements and training techniques\n\n2. **Current
        State**:\n- Initial research completed on broad developments\n- Currently
        exploring Mixture of Experts (MoE) architectures in depth\n- Detailed discussion
        of key MoE research papers initiated\n\n3. **Important Discoveries**:\nMoE
        Architecture Insights:\n- Computational efficiency through selective expert
        activation\n- Key research papers:\n * Switch Transformers (Google, 2021)\n *
        GShard\n * Mixtral (Mistral AI)\n- Main benefit: Only subset of experts activated
        per token\n\n4. **Next Steps**:\n- Conduct deeper analysis of MoE architecture
        mechanisms\n- Compare routing strategies across different MoE implementations\n-
        Investigate performance metrics and scalability of MoE models\n\n5. **Context
        to Preserve**:\n- Research Focus: Large Language Model Architectures\n- Specific
        Interest: Mixture of Experts (MoE) Architectures\n- Key Researchers/Organizations:
        Google, Mistral AI\n- Years of Significant Papers: 2021 onwards\n</summary>"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":400,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":270,"service_tier":"standard","inference_geo":"not_available"}}'
    headers:
      CF-RAY:
      - CF-RAY-XXX
      Connection:
      - keep-alive
      Content-Security-Policy:
      - CSP-FILTERED
      Content-Type:
      - application/json
      Date:
      - Mon, 09 Feb 2026 20:18:41 GMT
      Server:
      - cloudflare
      Transfer-Encoding:
      - chunked
      X-Robots-Tag:
      - none
      anthropic-organization-id:
      - ANTHROPIC-ORGANIZATION-ID-XXX
      anthropic-ratelimit-input-tokens-limit:
      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
      anthropic-ratelimit-input-tokens-remaining:
      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
      anthropic-ratelimit-input-tokens-reset:
      - ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
      anthropic-ratelimit-output-tokens-limit:
      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
      anthropic-ratelimit-output-tokens-remaining:
      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
      anthropic-ratelimit-output-tokens-reset:
      - ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
      anthropic-ratelimit-requests-limit:
      - '4000'
      anthropic-ratelimit-requests-remaining:
      - '3999'
      anthropic-ratelimit-requests-reset:
      - '2026-02-09T20:18:35Z'
      anthropic-ratelimit-tokens-limit:
      - ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
      anthropic-ratelimit-tokens-remaining:
      - ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
      anthropic-ratelimit-tokens-reset:
      - ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
      cf-cache-status:
      - DYNAMIC
      request-id:
      - REQUEST-ID-XXX
      strict-transport-security:
      - STS-XXX
      x-envoy-upstream-service-time:
      - '5639'
    status:
      code: 200
      message: OK
version: 1
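The final cassette covers the "adding azure" part of this commit: the same summarization request recorded against an Azure OpenAI deployment, with a fake endpoint and the deployment name and api-version in the URI. One plausible way to issue such a request is the openai SDK's Azure client, as sketched below; the test suite's actual client is not shown in this excerpt, and the recorded headers suggest it may use a different HTTP stack:

from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint="https://fake-azure-endpoint.openai.azure.com",
    api_key="test-key",  # recorded as X-API-KEY-XXX
    api_version="2024-12-01-preview",
)
response = client.chat.completions.create(
    model="gpt-4o-mini",  # maps to the deployment name in the URI
    temperature=0,
    messages=[{"role": "user", "content": "Research the latest developments in large language models."}],
)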
@@ -0,0 +1,110 @@
interactions:
- request:
body: '{"messages": [{"role": "system", "content": "You are a precise assistant
that creates structured summaries of agent conversations. You preserve critical
context needed for seamless task continuation."}, {"role": "user", "content":
"Analyze the following conversation and create a structured summary that preserves
all information needed to continue the task seamlessly.\n\n<conversation>\n[USER]:
Research the latest developments in large language models. Focus on architecture
improvements and training techniques.\n\n[ASSISTANT]: I''ll research the latest
developments in large language models. Based on my knowledge, recent advances
include:\n1. Mixture of Experts (MoE) architectures\n2. Improved attention mechanisms
like Flash Attention\n3. Better training data curation techniques\n4. Constitutional
AI and RLHF improvements\n\n[USER]: Can you go deeper on the MoE architectures?
What are the key papers?\n\n[ASSISTANT]: Key papers on Mixture of Experts:\n-
Switch Transformers (Google, 2021) - simplified MoE routing\n- GShard - scaling
to 600B parameters\n- Mixtral (Mistral AI) - open-source MoE model\nThe main
advantage is computational efficiency: only a subset of experts is activated
per token.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task
Overview**: What is the agent trying to accomplish?\n2. **Current State**: What
has been completed so far? What step is the agent on?\n3. **Important Discoveries**:
Key facts, data, tool results, or findings that must not be lost.\n4. **Next
Steps**: What should the agent do next based on the conversation?\n5. **Context
to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
structured summary here]\n</summary>"}], "stream": false, "temperature": 0}'
headers:
Accept:
- application/json
Connection:
- keep-alive
Content-Length:
- '1849'
Content-Type:
- application/json
User-Agent:
- X-USER-AGENT-XXX
accept-encoding:
- ACCEPT-ENCODING-XXX
api-key:
- X-API-KEY-XXX
authorization:
- AUTHORIZATION-XXX
x-ms-client-request-id:
- X-MS-CLIENT-REQUEST-ID-XXX
method: POST
uri: https://fake-azure-endpoint.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-12-01-preview
response:
body:
string: '{"choices":[{"content_filter_results":{"hate":{"filtered":false,"severity":"safe"},"protected_material_code":{"filtered":false,"detected":false},"protected_material_text":{"filtered":false,"detected":false},"self_harm":{"filtered":false,"severity":"safe"},"sexual":{"filtered":false,"severity":"safe"},"violence":{"filtered":false,"severity":"safe"}},"finish_reason":"stop","index":0,"logprobs":null,"message":{"annotations":[],"content":"\u003csummary\u003e\n1.
**Task Overview**: The user has requested research on the latest developments
in large language models, specifically focusing on architecture improvements
and training techniques.\n\n2. **Current State**: The assistant has provided
an initial overview of recent advances in large language models, including
Mixture of Experts (MoE) architectures, improved attention mechanisms, better
training data curation techniques, and advancements in Constitutional AI and
Reinforcement Learning from Human Feedback (RLHF).\n\n3. **Important Discoveries**:
\n - Recent advances in large language models include:\n 1. Mixture
of Experts (MoE) architectures\n 2. Improved attention mechanisms like
Flash Attention\n 3. Better training data curation techniques\n 4.
Constitutional AI and RLHF improvements\n - Key papers on Mixture of Experts:\n -
Switch Transformers (Google, 2021) - simplified MoE routing\n - GShard
- scaling to 600B parameters\n - Mixtral (Mistral AI) - open-source MoE
model\n - The main advantage of MoE architectures is computational efficiency,
as only a subset of experts is activated per token.\n\n4. **Next Steps**:
The assistant should delve deeper into the Mixture of Experts architectures,
potentially summarizing the key findings and implications from the identified
papers.\n\n5. **Context to Preserve**: \n - Key papers: \n - Switch
Transformers (Google, 2021)\n - GShard\n - Mixtral (Mistral AI)\n -
Focus on computational efficiency of MoE architectures.\n\u003c/summary\u003e","refusal":null,"role":"assistant"}}],"created":1770849953,"id":"chatcmpl-D8DFx1H1zzEerW5H0BWfuwmio2sz1","model":"gpt-4o-mini-2024-07-18","object":"chat.completion","prompt_filter_results":[{"prompt_index":0,"content_filter_results":{"hate":{"filtered":false,"severity":"safe"},"jailbreak":{"filtered":false,"detected":false},"self_harm":{"filtered":false,"severity":"safe"},"sexual":{"filtered":false,"severity":"safe"},"violence":{"filtered":false,"severity":"safe"}}}],"system_fingerprint":"fp_f97eff32c5","usage":{"completion_tokens":328,"completion_tokens_details":{"accepted_prediction_tokens":0,"audio_tokens":0,"reasoning_tokens":0,"rejected_prediction_tokens":0},"prompt_tokens":368,"prompt_tokens_details":{"audio_tokens":0,"cached_tokens":0},"total_tokens":696}}

'
headers:
Content-Length:
- '2786'
Content-Type:
- application/json
Date:
- Wed, 11 Feb 2026 22:45:56 GMT
Strict-Transport-Security:
- STS-XXX
apim-request-id:
- APIM-REQUEST-ID-XXX
azureml-model-session:
- AZUREML-MODEL-SESSION-XXX
x-accel-buffering:
- 'no'
x-content-type-options:
- X-CONTENT-TYPE-XXX
x-ms-client-request-id:
- X-MS-CLIENT-REQUEST-ID-XXX
x-ms-deployment-name:
- gpt-4o-mini
x-ms-rai-invoked:
- 'true'
x-ms-region:
- X-MS-REGION-XXX
x-ratelimit-limit-requests:
- X-RATELIMIT-LIMIT-REQUESTS-XXX
x-ratelimit-limit-tokens:
- X-RATELIMIT-LIMIT-TOKENS-XXX
x-ratelimit-remaining-requests:
- X-RATELIMIT-REMAINING-REQUESTS-XXX
x-ratelimit-remaining-tokens:
- X-RATELIMIT-REMAINING-TOKENS-XXX
x-request-id:
- X-REQUEST-ID-XXX
status:
code: 200
message: OK
version: 1
@@ -0,0 +1,103 @@
interactions:
- request:
body: '{"contents": [{"parts": [{"text": "Analyze the following conversation and
create a structured summary that preserves all information needed to continue
the task seamlessly.\n\n<conversation>\n[USER]: Research the latest developments
in large language models. Focus on architecture improvements and training techniques.\n\n[ASSISTANT]:
I''ll research the latest developments in large language models. Based on my
knowledge, recent advances include:\n1. Mixture of Experts (MoE) architectures\n2.
Improved attention mechanisms like Flash Attention\n3. Better training data
curation techniques\n4. Constitutional AI and RLHF improvements\n\n[USER]: Can
you go deeper on the MoE architectures? What are the key papers?\n\n[ASSISTANT]:
Key papers on Mixture of Experts:\n- Switch Transformers (Google, 2021) - simplified
MoE routing\n- GShard - scaling to 600B parameters\n- Mixtral (Mistral AI) -
open-source MoE model\nThe main advantage is computational efficiency: only
a subset of experts is activated per token.\n</conversation>\n\nCreate a summary
with these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2.
**Current State**: What has been completed so far? What step is the agent on?\n3.
**Important Discoveries**: Key facts, data, tool results, or findings that must
not be lost.\n4. **Next Steps**: What should the agent do next based on the
conversation?\n5. **Context to Preserve**: Any specific values, names, URLs,
code snippets, or details referenced in the conversation.\n\nWrap your entire
summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>"}],
"role": "user"}], "systemInstruction": {"parts": [{"text": "You are a precise
assistant that creates structured summaries of agent conversations. You preserve
critical context needed for seamless task continuation."}], "role": "user"},
"generationConfig": {"temperature": 0.0}}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- '*/*'
accept-encoding:
- ACCEPT-ENCODING-XXX
connection:
- keep-alive
content-length:
- '1895'
content-type:
- application/json
host:
- generativelanguage.googleapis.com
x-goog-api-client:
- google-genai-sdk/1.49.0 gl-python/3.13.3
x-goog-api-key:
- X-GOOG-API-KEY-XXX
method: POST
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
response:
body:
string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
[\n          {\n            \"text\": \"```xml\\n\\u003csummary\\u003e\\n**Task
Overview**: Research the latest developments in large language models, focusing
on architecture improvements and training techniques.\\n\\n**Current State**:
The agent has identified several key areas of advancement in LLMs: Mixture
of Experts (MoE) architectures, improved attention mechanisms (Flash Attention),
better training data curation, and Constitutional AI/RLHF improvements. The
user has requested a deeper dive into MoE architectures. The agent has provided
an initial overview of MoE architectures and listed some key papers.\\n\\n**Important
Discoveries**:\\n* Key MoE papers: Switch Transformers (Google, 2021), GShard,
Mixtral (Mistral AI).\\n* MoE advantage: Computational efficiency through
selective activation of experts.\\n\\n**Next Steps**: Continue researching
MoE architectures based on the user's request for more detail. The agent should
elaborate further on the listed papers and potentially find more recent or
relevant publications.\\n\\n**Context to Preserve**:\\n* Focus areas: Architecture
improvements and training techniques for LLMs.\\n* Specific architectures:
Mixture of Experts (MoE), Flash Attention.\\n* Training techniques: Data
curation, Constitutional AI, RLHF.\\n* Specific papers: Switch Transformers
(Google, 2021), GShard, Mixtral (Mistral AI).\\n\\u003c/summary\\u003e\\n```\"\n
\ }\n ],\n \"role\": \"model\"\n },\n \"finishReason\":
\"STOP\",\n \"avgLogprobs\": -0.14186729703630721\n }\n ],\n \"usageMetadata\":
{\n \"promptTokenCount\": 373,\n \"candidatesTokenCount\": 280,\n \"totalTokenCount\":
653,\n \"promptTokensDetails\": [\n {\n \"modality\": \"TEXT\",\n
\ \"tokenCount\": 373\n }\n ],\n \"candidatesTokensDetails\":
[\n {\n \"modality\": \"TEXT\",\n \"tokenCount\": 280\n
\ }\n ]\n },\n \"modelVersion\": \"gemini-2.0-flash\",\n \"responseId\":
\"GEGKabP3OcGH-8YPzZCj2Ao\"\n}\n"
headers:
Alt-Svc:
- h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
Content-Type:
- application/json; charset=UTF-8
Date:
- Mon, 09 Feb 2026 20:18:35 GMT
Server:
- scaffolding on HTTPServer2
Server-Timing:
- gfet4t7; dur=2310
Transfer-Encoding:
- chunked
Vary:
- Origin
- X-Origin
- Referer
X-Content-Type-Options:
- X-CONTENT-TYPE-XXX
X-Frame-Options:
- X-FRAME-OPTIONS-XXX
X-XSS-Protection:
- '0'
status:
code: 200
message: OK
version: 1
@@ -0,0 +1,148 @@
interactions:
- request:
body: '{"messages":[{"role":"system","content":"You are a precise assistant that
creates structured summaries of agent conversations. You preserve critical context
needed for seamless task continuation."},{"role":"user","content":"Analyze the
following conversation and create a structured summary that preserves all information
needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Research
the latest developments in large language models. Focus on architecture improvements
and training techniques.\n\n[ASSISTANT]: I''ll research the latest developments
in large language models. Based on my knowledge, recent advances include:\n1.
Mixture of Experts (MoE) architectures\n2. Improved attention mechanisms like
Flash Attention\n3. Better training data curation techniques\n4. Constitutional
AI and RLHF improvements\n\n[USER]: Can you go deeper on the MoE architectures?
What are the key papers?\n\n[ASSISTANT]: Key papers on Mixture of Experts:\n-
Switch Transformers (Google, 2021) - simplified MoE routing\n- GShard - scaling
to 600B parameters\n- Mixtral (Mistral AI) - open-source MoE model\nThe main
advantage is computational efficiency: only a subset of experts is activated
per token.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task
Overview**: What is the agent trying to accomplish?\n2. **Current State**: What
has been completed so far? What step is the agent on?\n3. **Important Discoveries**:
Key facts, data, tool results, or findings that must not be lost.\n4. **Next
Steps**: What should the agent do next based on the conversation?\n5. **Context
to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- application/json
accept-encoding:
- ACCEPT-ENCODING-XXX
authorization:
- AUTHORIZATION-XXX
connection:
- keep-alive
content-length:
- '1844'
content-type:
- application/json
host:
- api.openai.com
x-stainless-arch:
- X-STAINLESS-ARCH-XXX
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- X-STAINLESS-OS-XXX
x-stainless-package-version:
- 1.83.0
x-stainless-read-timeout:
- X-STAINLESS-READ-TIMEOUT-XXX
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.13.3
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: "{\n \"id\": \"chatcmpl-D7RxGISdQet8JsWImiwzHQ2S9gSD4\",\n \"object\":
\"chat.completion\",\n \"created\": 1770668126,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"<summary>\\n1. **Task Overview**: The
agent is tasked with researching the latest developments in large language
models, specifically focusing on architecture improvements and training techniques.\\n\\n2.
**Current State**: The agent has identified several recent advances in large
language models, including Mixture of Experts (MoE) architectures, improved
attention mechanisms, better training data curation techniques, and advancements
in Constitutional AI and Reinforcement Learning from Human Feedback (RLHF).\\n\\n3.
**Important Discoveries**: \\n - Recent advances in large language models
include:\\n 1. Mixture of Experts (MoE) architectures\\n 2. Improved
attention mechanisms like Flash Attention\\n 3. Better training data curation
techniques\\n 4. Constitutional AI and RLHF improvements\\n - Key papers
on Mixture of Experts:\\n - Switch Transformers (Google, 2021) - simplified
MoE routing\\n - GShard - scaling to 600B parameters\\n - Mixtral
(Mistral AI) - open-source MoE model\\n - The main advantage of MoE architectures
is computational efficiency, as only a subset of experts is activated per
token.\\n\\n4. **Next Steps**: The agent should delve deeper into the Mixture
of Experts architectures, reviewing the key papers mentioned and summarizing
their contributions and implications for large language models.\\n\\n5. **Context
to Preserve**: \\n - Key papers: \\n - Switch Transformers (Google,
2021)\\n - GShard\\n - Mixtral (Mistral AI)\\n - Focus on computational
efficiency of MoE architectures.\\n</summary>\",\n \"refusal\": null,\n
\ \"annotations\": []\n },\n \"logprobs\": null,\n \"finish_reason\":
\"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 368,\n \"completion_tokens\":
328,\n \"total_tokens\": 696,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
headers:
CF-RAY:
- CF-RAY-XXX
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Mon, 09 Feb 2026 20:15:32 GMT
Server:
- cloudflare
Strict-Transport-Security:
- STS-XXX
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- X-CONTENT-TYPE-XXX
access-control-expose-headers:
- ACCESS-CONTROL-XXX
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- OPENAI-ORG-XXX
openai-processing-ms:
- '5395'
openai-project:
- OPENAI-PROJECT-XXX
openai-version:
- '2020-10-01'
set-cookie:
- SET-COOKIE-XXX
x-openai-proxy-wasm:
- v0.1
x-ratelimit-limit-requests:
- X-RATELIMIT-LIMIT-REQUESTS-XXX
x-ratelimit-limit-tokens:
- X-RATELIMIT-LIMIT-TOKENS-XXX
x-ratelimit-remaining-requests:
- X-RATELIMIT-REMAINING-REQUESTS-XXX
x-ratelimit-remaining-tokens:
- X-RATELIMIT-REMAINING-TOKENS-XXX
x-ratelimit-reset-requests:
- X-RATELIMIT-RESET-REQUESTS-XXX
x-ratelimit-reset-tokens:
- X-RATELIMIT-RESET-TOKENS-XXX
x-request-id:
- X-REQUEST-ID-XXX
status:
code: 200
message: OK
version: 1
@@ -0,0 +1,145 @@
interactions:
- request:
body: '{"messages":[{"role":"system","content":"You are a precise assistant that
creates structured summaries of agent conversations. You preserve critical context
needed for seamless task continuation."},{"role":"user","content":"Analyze the
following conversation and create a structured summary that preserves all information
needed to continue the task seamlessly.\n\n<conversation>\n[USER]: Research
the latest developments in large language models. Focus on architecture improvements
and training techniques.\n\n[ASSISTANT]: I''ll research the latest developments
in large language models. Based on my knowledge, recent advances include:\n1.
Mixture of Experts (MoE) architectures\n2. Improved attention mechanisms like
Flash Attention\n3. Better training data curation techniques\n4. Constitutional
AI and RLHF improvements\n\n[USER]: Can you go deeper on the MoE architectures?
What are the key papers?\n\n[ASSISTANT]: Key papers on Mixture of Experts:\n-
Switch Transformers (Google, 2021) - simplified MoE routing\n- GShard - scaling
to 600B parameters\n- Mixtral (Mistral AI) - open-source MoE model\nThe main
advantage is computational efficiency: only a subset of experts is activated
per token.\n</conversation>\n\nCreate a summary with these sections:\n1. **Task
Overview**: What is the agent trying to accomplish?\n2. **Current State**: What
has been completed so far? What step is the agent on?\n3. **Important Discoveries**:
Key facts, data, tool results, or findings that must not be lost.\n4. **Next
Steps**: What should the agent do next based on the conversation?\n5. **Context
to Preserve**: Any specific values, names, URLs, code snippets, or details referenced
in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your
structured summary here]\n</summary>"}],"model":"gpt-4o-mini","temperature":0}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- application/json
accept-encoding:
- ACCEPT-ENCODING-XXX
authorization:
- AUTHORIZATION-XXX
connection:
- keep-alive
content-length:
- '1844'
content-type:
- application/json
host:
- api.openai.com
x-stainless-arch:
- X-STAINLESS-ARCH-XXX
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- X-STAINLESS-OS-XXX
x-stainless-package-version:
- 1.83.0
x-stainless-read-timeout:
- X-STAINLESS-READ-TIMEOUT-XXX
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.13.3
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: "{\n \"id\": \"chatcmpl-D7RxM4n36QoACHrC0QocV1pXIwvtD\",\n \"object\":
\"chat.completion\",\n \"created\": 1770668132,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"<summary>\\n1. **Task Overview**: The
user has requested research on the latest developments in large language models,
specifically focusing on architecture improvements and training techniques.\\n\\n2.
**Current State**: The assistant has identified several recent advances in
large language models, including Mixture of Experts (MoE) architectures, improved
attention mechanisms, better training data curation techniques, and advancements
in Constitutional AI and Reinforcement Learning from Human Feedback (RLHF).\\n\\n3.
**Important Discoveries**: \\n - Key papers on Mixture of Experts (MoE)
architectures:\\n - \\\"Switch Transformers\\\" (Google, 2021) - simplified
MoE routing.\\n - \\\"GShard\\\" - scaling to 600B parameters.\\n -
\\\"Mixtral\\\" (Mistral AI) - open-source MoE model.\\n - The main advantage
of MoE architectures is computational efficiency, as only a subset of experts
is activated per token.\\n\\n4. **Next Steps**: The assistant should delve
deeper into the Mixture of Experts architectures, potentially summarizing
the findings from the key papers mentioned.\\n\\n5. **Context to Preserve**:
\\n - Key papers: \\\"Switch Transformers,\\\" \\\"GShard,\\\" \\\"Mixtral.\\\"\\n
\ - Notable organizations: Google, Mistral AI.\\n - Focus areas: MoE architectures,
computational efficiency.\\n</summary>\",\n \"refusal\": null,\n \"annotations\":
[]\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n
\ }\n ],\n \"usage\": {\n \"prompt_tokens\": 368,\n \"completion_tokens\":
275,\n \"total_tokens\": 643,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": \"fp_f4ae844694\"\n}\n"
headers:
CF-RAY:
- CF-RAY-XXX
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Mon, 09 Feb 2026 20:15:36 GMT
Server:
- cloudflare
Strict-Transport-Security:
- STS-XXX
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- X-CONTENT-TYPE-XXX
access-control-expose-headers:
- ACCESS-CONTROL-XXX
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- OPENAI-ORG-XXX
openai-processing-ms:
- '4188'
openai-project:
- OPENAI-PROJECT-XXX
openai-version:
- '2020-10-01'
set-cookie:
- SET-COOKIE-XXX
x-openai-proxy-wasm:
- v0.1
x-ratelimit-limit-requests:
- X-RATELIMIT-LIMIT-REQUESTS-XXX
x-ratelimit-limit-tokens:
- X-RATELIMIT-LIMIT-TOKENS-XXX
x-ratelimit-remaining-requests:
- X-RATELIMIT-REMAINING-REQUESTS-XXX
x-ratelimit-remaining-tokens:
- X-RATELIMIT-REMAINING-TOKENS-XXX
x-ratelimit-reset-requests:
- X-RATELIMIT-RESET-REQUESTS-XXX
x-ratelimit-reset-tokens:
- X-RATELIMIT-RESET-TOKENS-XXX
x-request-id:
- X-REQUEST-ID-XXX
status:
code: 200
message: OK
version: 1
@@ -2,13 +2,23 @@

from __future__ import annotations

import asyncio
from typing import Any
from unittest.mock import MagicMock, patch
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from pydantic import BaseModel, Field

from crewai.tools.base_tool import BaseTool
from crewai.utilities.agent_utils import convert_tools_to_openai_schema, summarize_messages
from crewai.utilities.agent_utils import (
    _asummarize_chunks,
    _estimate_token_count,
    _extract_summary_tags,
    _format_messages_for_summary,
    _split_messages_into_chunks,
    convert_tools_to_openai_schema,
    summarize_messages,
)


class CalculatorInput(BaseModel):
@@ -214,6 +224,17 @@ class TestConvertToolsToOpenaiSchema:
        assert max_results_prop["default"] == 10


def _make_mock_i18n() -> MagicMock:
    """Create a mock i18n with the new structured prompt keys."""
    mock_i18n = MagicMock()
    mock_i18n.slice.side_effect = lambda key: {
        "summarizer_system_message": "You are a precise assistant that creates structured summaries.",
        "summarize_instruction": "Summarize the conversation:\n{conversation}",
        "summary": "<summary>\n{merged_summary}\n</summary>\nContinue the task.",
    }.get(key, "")
    return mock_i18n


class TestSummarizeMessages:
    """Tests for summarize_messages function."""

@@ -229,26 +250,22 @@ class TestSummarizeMessages:

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
        mock_llm.call.return_value = "Summarized conversation about image analysis."

        mock_i18n = MagicMock()
        mock_i18n.slice.side_effect = lambda key: {
            "summarizer_system_message": "Summarize the following.",
            "summarize_instruction": "Summarize: {group}",
            "summary": "Summary: {merged_summary}",
        }.get(key, "")
        mock_llm.call.return_value = "<summary>Summarized conversation about image analysis.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=mock_i18n,
            i18n=_make_mock_i18n(),
        )

        assert len(messages) == 1
        assert messages[0]["role"] == "user"
        assert "files" in messages[0]
        assert messages[0]["files"] == mock_files
        # System message preserved + summary message = 2
        assert len(messages) == 2
        assert messages[0]["role"] == "system"
        summary_msg = messages[1]
        assert summary_msg["role"] == "user"
        assert "files" in summary_msg
        assert summary_msg["files"] == mock_files

    def test_merges_files_from_multiple_user_messages(self) -> None:
        """Test that files from multiple user messages are merged."""
@@ -264,20 +281,13 @@ class TestSummarizeMessages:

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
        mock_llm.call.return_value = "Summarized conversation."

        mock_i18n = MagicMock()
        mock_i18n.slice.side_effect = lambda key: {
            "summarizer_system_message": "Summarize the following.",
            "summarize_instruction": "Summarize: {group}",
            "summary": "Summary: {merged_summary}",
        }.get(key, "")
        mock_llm.call.return_value = "<summary>Summarized conversation.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=mock_i18n,
            i18n=_make_mock_i18n(),
        )

        assert len(messages) == 1
@@ -297,20 +307,13 @@ class TestSummarizeMessages:

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
        mock_llm.call.return_value = "A greeting exchange."

        mock_i18n = MagicMock()
        mock_i18n.slice.side_effect = lambda key: {
            "summarizer_system_message": "Summarize the following.",
            "summarize_instruction": "Summarize: {group}",
            "summary": "Summary: {merged_summary}",
        }.get(key, "")
        mock_llm.call.return_value = "<summary>A greeting exchange.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=mock_i18n,
            i18n=_make_mock_i18n(),
        )

        assert len(messages) == 1
@@ -327,21 +330,595 @@ class TestSummarizeMessages:

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
        mock_llm.call.return_value = "Summary"

        mock_i18n = MagicMock()
        mock_i18n.slice.side_effect = lambda key: {
            "summarizer_system_message": "Summarize.",
            "summarize_instruction": "Summarize: {group}",
            "summary": "Summary: {merged_summary}",
        }.get(key, "")
        mock_llm.call.return_value = "<summary>Summary</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=mock_i18n,
            i18n=_make_mock_i18n(),
        )

        assert id(messages) == original_list_id
        assert len(messages) == 1

    def test_preserves_system_messages(self) -> None:
        """Test that system messages are preserved and not summarized."""
        messages: list[dict[str, Any]] = [
            {"role": "system", "content": "You are a research assistant."},
            {"role": "user", "content": "Find information about AI."},
            {"role": "assistant", "content": "I found several resources on AI."},
        ]

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
        mock_llm.call.return_value = "<summary>User asked about AI, assistant found resources.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        assert len(messages) == 2
        assert messages[0]["role"] == "system"
        assert messages[0]["content"] == "You are a research assistant."
        assert messages[1]["role"] == "user"

    def test_formats_conversation_with_role_labels(self) -> None:
        """Test that the LLM receives role-labeled conversation text."""
        messages: list[dict[str, Any]] = [
            {"role": "system", "content": "System prompt."},
            {"role": "user", "content": "Hello there"},
            {"role": "assistant", "content": "Hi! How can I help?"},
        ]

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
        mock_llm.call.return_value = "<summary>Greeting exchange.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        # Check what was passed to llm.call
        call_args = mock_llm.call.call_args[0][0]
        user_msg_content = call_args[1]["content"]
        assert "[USER]:" in user_msg_content
        assert "[ASSISTANT]:" in user_msg_content
        # System content should NOT appear in summarization input
        assert "System prompt." not in user_msg_content

    def test_extracts_summary_from_tags(self) -> None:
        """Test that <summary> tags are extracted from LLM response."""
        messages: list[dict[str, Any]] = [
            {"role": "user", "content": "Do something."},
            {"role": "assistant", "content": "Done."},
        ]

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
        mock_llm.call.return_value = "Here is the summary:\n<summary>The extracted summary content.</summary>\nExtra text."

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        assert "The extracted summary content." in messages[0]["content"]

    def test_handles_tool_messages(self) -> None:
        """Test that tool messages are properly formatted in summarization."""
        messages: list[dict[str, Any]] = [
            {"role": "user", "content": "Search for Python."},
            {"role": "assistant", "content": None, "tool_calls": [
                {"function": {"name": "web_search", "arguments": '{"query": "Python"}'}}
            ]},
            {"role": "tool", "content": "Python is a programming language.", "name": "web_search"},
            {"role": "assistant", "content": "Python is a programming language."},
        ]

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000
        mock_llm.call.return_value = "<summary>User searched for Python info.</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        # Verify the conversation text sent to LLM contains tool labels
        call_args = mock_llm.call.call_args[0][0]
        user_msg_content = call_args[1]["content"]
        assert "[TOOL_RESULT (web_search)]:" in user_msg_content

    def test_only_system_messages_no_op(self) -> None:
        """Test that only system messages results in no-op (no summarization)."""
        messages: list[dict[str, Any]] = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "system", "content": "Additional system instructions."},
        ]

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 1000

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        # No LLM call should have been made
        mock_llm.call.assert_not_called()
        # System messages should remain untouched
        assert len(messages) == 2
        assert messages[0]["content"] == "You are a helpful assistant."
        assert messages[1]["content"] == "Additional system instructions."


class TestFormatMessagesForSummary:
    """Tests for _format_messages_for_summary helper."""

    def test_skips_system_messages(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "system", "content": "System prompt"},
            {"role": "user", "content": "Hello"},
        ]
        result = _format_messages_for_summary(messages)
        assert "System prompt" not in result
        assert "[USER]: Hello" in result

    def test_formats_user_and_assistant(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "user", "content": "Question"},
            {"role": "assistant", "content": "Answer"},
        ]
        result = _format_messages_for_summary(messages)
        assert "[USER]: Question" in result
        assert "[ASSISTANT]: Answer" in result

    def test_formats_tool_messages(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "tool", "content": "Result data", "name": "search_tool"},
        ]
        result = _format_messages_for_summary(messages)
        assert "[TOOL_RESULT (search_tool)]:" in result
        assert "Result data" in result

    def test_handles_none_content_with_tool_calls(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "assistant", "content": None, "tool_calls": [
                {"function": {"name": "calculator", "arguments": "{}"}}
            ]},
        ]
        result = _format_messages_for_summary(messages)
        assert "[Called tools: calculator]" in result

    def test_handles_none_content_without_tool_calls(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "assistant", "content": None},
        ]
        result = _format_messages_for_summary(messages)
        assert "[ASSISTANT]:" in result

    def test_handles_multimodal_content(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "user", "content": [
                {"type": "text", "text": "Describe this image"},
                {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
            ]},
        ]
        result = _format_messages_for_summary(messages)
        assert "[USER]: Describe this image" in result

    def test_empty_messages(self) -> None:
        result = _format_messages_for_summary([])
        assert result == ""
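

# Illustrative sketch only: the tests above pin down the labeling behavior of
# _format_messages_for_summary, but the real implementation lives in
# crewai.utilities.agent_utils and may differ. The `_sketch` name below is
# hypothetical and exists purely to document the assumed shape.
def _format_messages_for_summary_sketch(messages: list[dict[str, Any]]) -> str:
    lines: list[str] = []
    for msg in messages:
        role = msg.get("role")
        if role == "system":
            continue  # system prompts are preserved elsewhere, never summarized
        content = msg.get("content")
        if isinstance(content, list):
            # Multimodal content: keep only the text parts
            content = " ".join(
                part.get("text", "") for part in content if part.get("type") == "text"
            )
        if role == "tool":
            lines.append(f"[TOOL_RESULT ({msg.get('name')})]: {content}")
        elif content is None and msg.get("tool_calls"):
            names = ", ".join(tc["function"]["name"] for tc in msg["tool_calls"])
            lines.append(f"[ASSISTANT]: [Called tools: {names}]")
        else:
            lines.append(f"[{str(role).upper()}]: {content or ''}")
    return "\n".join(lines)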


class TestExtractSummaryTags:
    """Tests for _extract_summary_tags helper."""

    def test_extracts_content_from_tags(self) -> None:
        text = "Preamble\n<summary>The actual summary.</summary>\nPostamble"
        assert _extract_summary_tags(text) == "The actual summary."

    def test_handles_multiline_content(self) -> None:
        text = "<summary>\nLine 1\nLine 2\nLine 3\n</summary>"
        result = _extract_summary_tags(text)
        assert "Line 1" in result
        assert "Line 2" in result
        assert "Line 3" in result

    def test_falls_back_when_no_tags(self) -> None:
        text = "Just a plain summary without tags."
        assert _extract_summary_tags(text) == text

    def test_handles_empty_string(self) -> None:
        assert _extract_summary_tags("") == ""

    def test_extracts_first_match(self) -> None:
        text = "<summary>First</summary> text <summary>Second</summary>"
        assert _extract_summary_tags(text) == "First"
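

# Illustrative sketch of the extraction behavior these tests describe: the
# first <summary>...</summary> block wins, and untagged text falls back to
# itself. Hypothetical name; the real helper may be implemented differently.
import re

def _extract_summary_tags_sketch(text: str) -> str:
    match = re.search(r"<summary>(.*?)</summary>", text, re.DOTALL)
    return match.group(1).strip() if match else text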


class TestSplitMessagesIntoChunks:
    """Tests for _split_messages_into_chunks helper."""

    def test_single_chunk_when_under_limit(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi"},
        ]
        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
        assert len(chunks) == 1
        assert len(chunks[0]) == 2

    def test_splits_at_message_boundaries(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "user", "content": "A" * 100},  # ~25 tokens
            {"role": "assistant", "content": "B" * 100},  # ~25 tokens
            {"role": "user", "content": "C" * 100},  # ~25 tokens
        ]
        # max_tokens=30 should cause splits
        chunks = _split_messages_into_chunks(messages, max_tokens=30)
        assert len(chunks) == 3

    def test_excludes_system_messages(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "system", "content": "System prompt"},
            {"role": "user", "content": "Hello"},
        ]
        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
        assert len(chunks) == 1
        # The system message should not be in any chunk
        for chunk in chunks:
            for msg in chunk:
                assert msg.get("role") != "system"

    def test_empty_messages(self) -> None:
        chunks = _split_messages_into_chunks([], max_tokens=1000)
        assert chunks == []

    def test_only_system_messages(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "system", "content": "System prompt"},
        ]
        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
        assert chunks == []

    def test_handles_none_content(self) -> None:
        messages: list[dict[str, Any]] = [
            {"role": "assistant", "content": None},
            {"role": "user", "content": "Follow up"},
        ]
        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
        assert len(chunks) == 1
        assert len(chunks[0]) == 2
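

# Illustrative sketch of a boundary-respecting splitter consistent with the
# tests above: accumulate non-system messages until the estimated token budget
# is exceeded, then start a new chunk. Hypothetical `_sketch` name; the real
# _split_messages_into_chunks may differ in detail.
def _split_messages_into_chunks_sketch(
    messages: list[dict[str, Any]], max_tokens: int
) -> list[list[dict[str, Any]]]:
    chunks: list[list[dict[str, Any]]] = []
    current: list[dict[str, Any]] = []
    current_tokens = 0
    for msg in messages:
        if msg.get("role") == "system":
            continue  # system messages are never chunked
        tokens = _estimate_token_count(str(msg.get("content") or ""))
        if current and current_tokens + tokens > max_tokens:
            # Budget exceeded: close the current chunk at a message boundary
            chunks.append(current)
            current, current_tokens = [], 0
        current.append(msg)
        current_tokens += tokens
    if current:
        chunks.append(current)
    return chunks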


class TestEstimateTokenCount:
    """Tests for _estimate_token_count helper."""

    def test_empty_string(self) -> None:
        assert _estimate_token_count("") == 0

    def test_short_string(self) -> None:
        assert _estimate_token_count("hello") == 1  # 5 // 4 = 1

    def test_longer_string(self) -> None:
        assert _estimate_token_count("a" * 100) == 25  # 100 // 4 = 25

    def test_approximation_is_conservative(self) -> None:
        # For English text, actual token count is typically lower than char/4
        text = "The quick brown fox jumps over the lazy dog."
        estimated = _estimate_token_count(text)
        assert estimated > 0
        assert estimated == len(text) // 4
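

# The char//4 heuristic asserted above implies an implementation along these
# lines (a minimal sketch under that assumption; the real _estimate_token_count
# may differ):
def _estimate_token_count_sketch(text: str) -> int:
    """Rough token estimate: roughly four characters per token for English."""
    return len(text) // 4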


class TestParallelSummarization:
    """Tests for parallel chunk summarization via asyncio."""

    def _make_messages_for_n_chunks(self, n: int) -> list[dict[str, Any]]:
        """Build a message list that will produce exactly *n* chunks.

        Each message has 400 chars (~100 tokens). With max_tokens=100 returned
        by the mock LLM, each message lands in its own chunk.
        """
        msgs: list[dict[str, Any]] = []
        for i in range(n):
            msgs.append({"role": "user", "content": f"msg-{i} " + "x" * 400})
        return msgs

    def test_multiple_chunks_use_acall(self) -> None:
        """When there are multiple chunks, summarize_messages should use
        llm.acall (parallel) instead of llm.call (sequential)."""
        messages = self._make_messages_for_n_chunks(3)

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 100  # force multiple chunks
        mock_llm.acall = AsyncMock(
            side_effect=[
                "<summary>Summary chunk 1</summary>",
                "<summary>Summary chunk 2</summary>",
                "<summary>Summary chunk 3</summary>",
            ]
        )

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        # acall should have been awaited once per chunk
        assert mock_llm.acall.await_count == 3
        # sync call should NOT have been used for chunk summarization
        mock_llm.call.assert_not_called()

    def test_single_chunk_uses_sync_call(self) -> None:
        """When there is only one chunk, summarize_messages should use
        the sync llm.call path (no async overhead)."""
        messages: list[dict[str, Any]] = [
            {"role": "user", "content": "Short message"},
            {"role": "assistant", "content": "Short reply"},
        ]

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 100_000
        mock_llm.call.return_value = "<summary>Short summary</summary>"

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        mock_llm.call.assert_called_once()

    def test_parallel_results_preserve_order(self) -> None:
        """Summaries must appear in the same order as the original chunks,
        regardless of which async call finishes first."""
        messages = self._make_messages_for_n_chunks(3)

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 100

        # Simulate varying latencies — chunk 2 finishes before chunk 0
        async def _delayed_acall(msgs: Any, **kwargs: Any) -> str:
            user_content = msgs[1]["content"]
            if "msg-0" in user_content:
                await asyncio.sleep(0.05)
                return "<summary>Summary-A</summary>"
            elif "msg-1" in user_content:
                return "<summary>Summary-B</summary>"  # fastest
            else:
                await asyncio.sleep(0.02)
                return "<summary>Summary-C</summary>"

        mock_llm.acall = _delayed_acall

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        # The final summary message should have A, B, C in order
        summary_content = messages[-1]["content"]
        pos_a = summary_content.index("Summary-A")
        pos_b = summary_content.index("Summary-B")
        pos_c = summary_content.index("Summary-C")
        assert pos_a < pos_b < pos_c

    def test_asummarize_chunks_returns_ordered_results(self) -> None:
        """Direct test of the async helper _asummarize_chunks."""
        chunk_a: list[dict[str, Any]] = [{"role": "user", "content": "Chunk A"}]
        chunk_b: list[dict[str, Any]] = [{"role": "user", "content": "Chunk B"}]

        mock_llm = MagicMock()
        mock_llm.acall = AsyncMock(
            side_effect=[
                "<summary>Result A</summary>",
                "<summary>Result B</summary>",
            ]
        )

        results = asyncio.run(
            _asummarize_chunks(
                chunks=[chunk_a, chunk_b],
                llm=mock_llm,
                callbacks=[],
                i18n=_make_mock_i18n(),
            )
        )

        assert len(results) == 2
        assert results[0]["content"] == "Result A"
        assert results[1]["content"] == "Result B"

    @patch("crewai.utilities.agent_utils.is_inside_event_loop", return_value=True)
    def test_works_inside_existing_event_loop(self, _mock_loop: Any) -> None:
        """When called from inside a running event loop (e.g. a Flow),
        the ThreadPoolExecutor fallback should still work."""
        messages = self._make_messages_for_n_chunks(2)

        mock_llm = MagicMock()
        mock_llm.get_context_window_size.return_value = 100
        mock_llm.acall = AsyncMock(
            side_effect=[
                "<summary>Flow summary 1</summary>",
                "<summary>Flow summary 2</summary>",
            ]
        )

        summarize_messages(
            messages=messages,
            llm=mock_llm,
            callbacks=[],
            i18n=_make_mock_i18n(),
        )

        assert mock_llm.acall.await_count == 2
        # Verify the merged summary made it into messages
        assert "Flow summary 1" in messages[-1]["content"]
        assert "Flow summary 2" in messages[-1]["content"]
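

# Why chunk order survives parallel execution: asyncio.gather returns results
# in the order its awaitables were passed in, not in completion order. A
# minimal sketch of that property, assuming an acall-style coroutine per chunk
# (hypothetical helper name, not the actual implementation):
async def _gather_in_order_sketch(llm: Any, chunk_prompts: list[Any]) -> list[str]:
    # One coroutine per chunk; results come back aligned with chunk_prompts.
    return list(await asyncio.gather(*(llm.acall(p) for p in chunk_prompts)))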


def _build_long_conversation() -> list[dict[str, Any]]:
    """Build a multi-turn conversation that produces multiple chunks at max_tokens=200.

    Each non-system message is ~100-140 estimated tokens (400-560 chars),
    so a max_tokens of 200 yields roughly 3 chunks from 6 messages.
    """
    return [
        {
            "role": "system",
            "content": "You are a helpful research assistant.",
        },
        {
            "role": "user",
            "content": (
                "Tell me about the history of the Python programming language. "
                "Who created it, when was it first released, and what were the "
                "main design goals? Please provide a detailed overview covering "
                "the major milestones from its inception through Python 3."
            ),
        },
        {
            "role": "assistant",
            "content": (
                "Python was created by Guido van Rossum and first released in 1991. "
                "The main design goals were code readability and simplicity. Key milestones: "
                "Python 1.0 (1994) introduced functional programming tools like lambda and map. "
                "Python 2.0 (2000) added list comprehensions and garbage collection. "
                "Python 3.0 (2008) was a major backward-incompatible release that fixed "
                "fundamental design flaws. Python 2 reached end-of-life in January 2020."
            ),
        },
        {
            "role": "user",
            "content": (
                "What about the async/await features? When were they introduced "
                "and how do they compare to similar features in JavaScript and C#? "
                "Also explain the Global Interpreter Lock and its implications."
            ),
        },
        {
            "role": "assistant",
            "content": (
                "Async/await was introduced in Python 3.5 (PEP 492, 2015). "
                "Unlike JavaScript which is single-threaded by design, Python's asyncio "
                "is an opt-in framework. C# introduced async/await in 2012 (C# 5.0) and "
                "was a major inspiration for Python's implementation. "
                "The GIL (Global Interpreter Lock) is a mutex that protects access to "
                "Python objects, preventing multiple threads from executing Python bytecodes "
                "simultaneously. This means CPU-bound multithreaded programs don't benefit "
                "from multiple cores. PEP 703 proposes making the GIL optional in CPython."
            ),
        },
        {
            "role": "user",
            "content": (
                "Explain the Python package ecosystem. How does pip work, what is PyPI, "
                "and what are virtual environments? Compare pip with conda and uv."
            ),
        },
        {
            "role": "assistant",
            "content": (
                "PyPI (Python Package Index) is the official repository hosting 400k+ packages. "
                "pip is the standard package installer that downloads from PyPI. "
                "Virtual environments (venv) create isolated Python installations to avoid "
                "dependency conflicts between projects. conda is a cross-language package manager "
                "popular in data science that can manage non-Python dependencies. "
                "uv is a new Rust-based tool that is 10-100x faster than pip and aims to replace "
                "pip, pip-tools, and virtualenv with a single unified tool."
            ),
        },
    ]


class TestParallelSummarizationVCR:
    """VCR-backed integration tests for parallel summarization.

    These tests use a real LLM but patch get_context_window_size to force
    multiple chunks, exercising the asyncio.gather + acall parallel path.

    To record cassettes:
        PYTEST_VCR_RECORD_MODE=all uv run pytest lib/crewai/tests/utilities/test_agent_utils.py::TestParallelSummarizationVCR -v
    """

    @pytest.mark.vcr()
    def test_parallel_summarize_openai(self) -> None:
        """Test that parallel summarization with gpt-4o-mini produces a valid summary."""
        from crewai.llm import LLM
        from crewai.utilities.i18n import I18N

        llm = LLM(model="gpt-4o-mini", temperature=0)
        i18n = I18N()
        messages = _build_long_conversation()

        original_system = messages[0]["content"]

        # Patch get_context_window_size to return 200 — forces multiple chunks
        with patch.object(type(llm), "get_context_window_size", return_value=200):
            # Verify we actually get multiple chunks with this window size
            non_system = [m for m in messages if m.get("role") != "system"]
            chunks = _split_messages_into_chunks(non_system, max_tokens=200)
            assert len(chunks) > 1, f"Expected multiple chunks, got {len(chunks)}"

            summarize_messages(
                messages=messages,
                llm=llm,
                callbacks=[],
                i18n=i18n,
            )

        # System message preserved
        assert messages[0]["role"] == "system"
        assert messages[0]["content"] == original_system

        # Summary produced as a user message
        summary_msg = messages[-1]
        assert summary_msg["role"] == "user"
        assert len(summary_msg["content"]) > 0

    @pytest.mark.vcr()
    def test_parallel_summarize_preserves_files(self) -> None:
        """Test that file references survive parallel summarization."""
        from crewai.llm import LLM
        from crewai.utilities.i18n import I18N

        llm = LLM(model="gpt-4o-mini", temperature=0)
        i18n = I18N()
        messages = _build_long_conversation()

        mock_file = MagicMock()
        messages[1]["files"] = {"report.pdf": mock_file}

        with patch.object(type(llm), "get_context_window_size", return_value=200):
            summarize_messages(
                messages=messages,
                llm=llm,
                callbacks=[],
                i18n=i18n,
            )

        summary_msg = messages[-1]
        assert summary_msg["role"] == "user"
        assert "files" in summary_msg
        assert "report.pdf" in summary_msg["files"]

284 lib/crewai/tests/utilities/test_summarize_integration.py Normal file
@@ -0,0 +1,284 @@
"""
|
||||
Integration tests for structured context compaction (summarize_messages).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.crew import Crew
|
||||
from crewai.llm import LLM
|
||||
from crewai.task import Task
|
||||
from crewai.utilities.agent_utils import summarize_messages
|
||||
from crewai.utilities.i18n import I18N
|
||||
|
||||
|
||||
def _build_conversation_messages(
|
||||
*, include_system: bool = True, include_files: bool = False
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Build a realistic multi-turn conversation for summarization tests."""
|
||||
messages: list[dict[str, Any]] = []
|
||||
|
||||
if include_system:
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a research assistant specializing in AI topics. "
|
||||
"Your goal is to find accurate, up-to-date information."
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
user_msg: dict[str, Any] = {
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Research the latest developments in large language models. "
|
||||
"Focus on architecture improvements and training techniques."
|
||||
),
|
||||
}
|
||||
if include_files:
|
||||
user_msg["files"] = {"reference.pdf": MagicMock()}
|
||||
messages.append(user_msg)
|
||||
|
||||
messages.append(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": (
|
||||
"I'll research the latest developments in large language models. "
|
||||
"Based on my knowledge, recent advances include:\n"
|
||||
"1. Mixture of Experts (MoE) architectures\n"
|
||||
"2. Improved attention mechanisms like Flash Attention\n"
|
||||
"3. Better training data curation techniques\n"
|
||||
"4. Constitutional AI and RLHF improvements"
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Can you go deeper on the MoE architectures? What are the key papers?",
|
||||
}
|
||||
)
|
||||
|
||||
messages.append(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": (
|
||||
"Key papers on Mixture of Experts:\n"
|
||||
"- Switch Transformers (Google, 2021) - simplified MoE routing\n"
|
||||
"- GShard - scaling to 600B parameters\n"
|
||||
"- Mixtral (Mistral AI) - open-source MoE model\n"
|
||||
"The main advantage is computational efficiency: "
|
||||
"only a subset of experts is activated per token."
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
return messages
|
||||
|
||||
|
||||
class TestSummarizeDirectOpenAI:
|
||||
"""Test direct summarize_messages calls with OpenAI."""
|
||||
|
||||
@pytest.mark.vcr()
|
||||
def test_summarize_direct_openai(self) -> None:
|
||||
"""Test summarize_messages with gpt-4o-mini preserves system messages."""
|
||||
llm = LLM(model="gpt-4o-mini", temperature=0)
|
||||
i18n = I18N()
|
||||
messages = _build_conversation_messages(include_system=True)
|
||||
|
||||
original_system_content = messages[0]["content"]
|
||||
|
||||
summarize_messages(
|
||||
messages=messages,
|
||||
llm=llm,
|
||||
callbacks=[],
|
||||
i18n=i18n,
|
||||
)
|
||||
|
||||
# System message should be preserved
|
||||
assert len(messages) >= 2
|
||||
assert messages[0]["role"] == "system"
|
||||
assert messages[0]["content"] == original_system_content
|
||||
|
||||
# Summary should be a user message with <summary> block
|
||||
summary_msg = messages[-1]
|
||||
assert summary_msg["role"] == "user"
|
||||
assert len(summary_msg["content"]) > 0
|
||||
assert "<summary>" in summary_msg["content"]
|
||||
assert "</summary>" in summary_msg["content"]
|
||||
|
||||
|
||||
class TestSummarizeDirectAnthropic:
    """Test direct summarize_messages calls with Anthropic."""

    @pytest.mark.vcr()
    def test_summarize_direct_anthropic(self) -> None:
        """Test summarize_messages with claude-3-5-haiku."""
        llm = LLM(model="anthropic/claude-3-5-haiku-latest", temperature=0)
        i18n = I18N()
        messages = _build_conversation_messages(include_system=True)

        summarize_messages(
            messages=messages,
            llm=llm,
            callbacks=[],
            i18n=i18n,
        )

        assert len(messages) >= 2
        assert messages[0]["role"] == "system"
        summary_msg = messages[-1]
        assert summary_msg["role"] == "user"
        assert len(summary_msg["content"]) > 0
        assert "<summary>" in summary_msg["content"]
        assert "</summary>" in summary_msg["content"]


class TestSummarizeDirectGemini:
    """Test direct summarize_messages calls with Gemini."""

    @pytest.mark.vcr()
    def test_summarize_direct_gemini(self) -> None:
        """Test summarize_messages with gemini-2.0-flash."""
        llm = LLM(model="gemini/gemini-2.0-flash", temperature=0)
        i18n = I18N()
        messages = _build_conversation_messages(include_system=True)

        summarize_messages(
            messages=messages,
            llm=llm,
            callbacks=[],
            i18n=i18n,
        )

        assert len(messages) >= 2
        assert messages[0]["role"] == "system"
        summary_msg = messages[-1]
        assert summary_msg["role"] == "user"
        assert len(summary_msg["content"]) > 0
        assert "<summary>" in summary_msg["content"]
        assert "</summary>" in summary_msg["content"]


class TestSummarizeDirectAzure:
    """Test direct summarize_messages calls with Azure."""

    @pytest.mark.vcr()
    def test_summarize_direct_azure(self) -> None:
        """Test summarize_messages with azure/gpt-4o-mini."""
        llm = LLM(model="azure/gpt-4o-mini", temperature=0)
        i18n = I18N()
        messages = _build_conversation_messages(include_system=True)

        summarize_messages(
            messages=messages,
            llm=llm,
            callbacks=[],
            i18n=i18n,
        )

        assert len(messages) >= 2
        assert messages[0]["role"] == "system"
        summary_msg = messages[-1]
        assert summary_msg["role"] == "user"
        assert len(summary_msg["content"]) > 0
        assert "<summary>" in summary_msg["content"]
        assert "</summary>" in summary_msg["content"]


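# The end-to-end tests below shrink llm.context_window_size so that even a
# short exchange can approach the window limit. The working assumption (not
# guaranteed by the recorded cassettes) is that the executor then compacts
# history via summarize_messages; since actual response sizes vary, these
# tests only assert that the run completes without crashing.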
class TestCrewKickoffCompaction:
    """Test compaction triggered via Crew.kickoff() with small context window."""

    @pytest.mark.vcr()
    def test_crew_kickoff_compaction_openai(self) -> None:
        """Test that compaction is triggered during kickoff with small context_window_size."""
        llm = LLM(model="gpt-4o-mini", temperature=0)
        # Force a very small context window to trigger compaction
        llm.context_window_size = 500

        agent = Agent(
            role="Researcher",
            goal="Find information about Python programming",
            backstory="You are an expert researcher.",
            llm=llm,
            verbose=False,
            max_iter=2,
        )

        task = Task(
            description="What is Python? Give a brief answer.",
            expected_output="A short description of Python.",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task], verbose=False)

        # This may or may not trigger compaction depending on actual response sizes.
        # The test verifies the code path doesn't crash.
        result = crew.kickoff()
        assert result is not None


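# Same idea as above, but driving the compaction path through
# Agent.execute_task() directly, without the Crew orchestration layer.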
class TestAgentExecuteTaskCompaction:
    """Test compaction triggered via Agent.execute_task()."""

    @pytest.mark.vcr()
    def test_agent_execute_task_compaction(self) -> None:
        """Test that Agent.execute_task() works with small context_window_size."""
        llm = LLM(model="gpt-4o-mini", temperature=0)
        llm.context_window_size = 500

        agent = Agent(
            role="Writer",
            goal="Write concise content",
            backstory="You are a skilled writer.",
            llm=llm,
            verbose=False,
            max_iter=2,
        )

        task = Task(
            description="Write one sentence about the sun.",
            expected_output="A single sentence about the sun.",
            agent=agent,
        )

        result = agent.execute_task(task=task)
        assert result is not None


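# MagicMock() stands in for a real attachment handle in the helper above; the
# test below only checks that the "files" mapping is carried over onto the
# appended summary message, not that the file content itself survives.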
class TestSummarizePreservesFiles:
    """Test that files are preserved through real summarization."""

    @pytest.mark.vcr()
    def test_summarize_preserves_files_integration(self) -> None:
        """Test that file references survive a real summarization call."""
        llm = LLM(model="gpt-4o-mini", temperature=0)
        i18n = I18N()
        messages = _build_conversation_messages(
            include_system=True, include_files=True
        )

        summarize_messages(
            messages=messages,
            llm=llm,
            callbacks=[],
            i18n=i18n,
        )

        # System message preserved
        assert messages[0]["role"] == "system"

        # Files should be on the summary message with <summary> block
        summary_msg = messages[-1]
        assert "<summary>" in summary_msg["content"]
        assert "</summary>" in summary_msg["content"]
        assert "files" in summary_msg
        assert "reference.pdf" in summary_msg["files"]