Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-05-01 23:32:39 +00:00
imp compaction (#4399)
Some checks failed
* imp compaction
* fix lint
* cassette gen
* cassette gen
* improve assert
* adding azure
* fix global docstring
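
For context on what this change tests: `summarize_messages` now splits the non-system conversation into token-bounded chunks, summarizes the chunks in parallel via `llm.acall`, and extracts the merged result from `<summary>` tags. A minimal sketch of two helper behaviors the new tests pin down, assuming a regex-based implementation (the actual code in `crewai.utilities.agent_utils` may differ):

    import re

    def _extract_summary_tags(text: str) -> str:
        # The first <summary>...</summary> block wins; if the model returned
        # no tags, fall back to the raw response text.
        match = re.search(r"<summary>(.*?)</summary>", text, re.DOTALL)
        return match.group(1).strip() if match else text

    def _estimate_token_count(text: str) -> int:
        # Rough chunking heuristic asserted by the tests: ~4 chars per token.
        return len(text) // 4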
@@ -2,13 +2,23 @@
 from __future__ import annotations
 
+import asyncio
 from typing import Any
-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 from pydantic import BaseModel, Field
 
 from crewai.tools.base_tool import BaseTool
-from crewai.utilities.agent_utils import convert_tools_to_openai_schema, summarize_messages
+from crewai.utilities.agent_utils import (
+    _asummarize_chunks,
+    _estimate_token_count,
+    _extract_summary_tags,
+    _format_messages_for_summary,
+    _split_messages_into_chunks,
+    convert_tools_to_openai_schema,
+    summarize_messages,
+)
 
 
 class CalculatorInput(BaseModel):
@@ -214,6 +224,17 @@ class TestConvertToolsToOpenaiSchema:
         assert max_results_prop["default"] == 10
 
 
+def _make_mock_i18n() -> MagicMock:
+    """Create a mock i18n with the new structured prompt keys."""
+    mock_i18n = MagicMock()
+    mock_i18n.slice.side_effect = lambda key: {
+        "summarizer_system_message": "You are a precise assistant that creates structured summaries.",
+        "summarize_instruction": "Summarize the conversation:\n{conversation}",
+        "summary": "<summary>\n{merged_summary}\n</summary>\nContinue the task.",
+    }.get(key, "")
+    return mock_i18n
+
+
 class TestSummarizeMessages:
     """Tests for summarize_messages function."""
 
@@ -229,26 +250,22 @@ class TestSummarizeMessages:
 
         mock_llm = MagicMock()
         mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summarized conversation about image analysis."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summarized conversation about image analysis.</summary>"
 
         summarize_messages(
             messages=messages,
             llm=mock_llm,
             callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
         )
 
-        assert len(messages) == 1
-        assert messages[0]["role"] == "user"
-        assert "files" in messages[0]
-        assert messages[0]["files"] == mock_files
+        # System message preserved + summary message = 2
+        assert len(messages) == 2
+        assert messages[0]["role"] == "system"
+        summary_msg = messages[1]
+        assert summary_msg["role"] == "user"
+        assert "files" in summary_msg
+        assert summary_msg["files"] == mock_files
 
     def test_merges_files_from_multiple_user_messages(self) -> None:
         """Test that files from multiple user messages are merged."""
@@ -264,20 +281,13 @@ class TestSummarizeMessages:
 
         mock_llm = MagicMock()
         mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summarized conversation."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summarized conversation.</summary>"
 
         summarize_messages(
             messages=messages,
             llm=mock_llm,
             callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
         )
 
         assert len(messages) == 1
@@ -297,20 +307,13 @@ class TestSummarizeMessages:
 
         mock_llm = MagicMock()
         mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "A greeting exchange."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>A greeting exchange.</summary>"
 
         summarize_messages(
             messages=messages,
             llm=mock_llm,
             callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
        )
 
         assert len(messages) == 1
@@ -327,21 +330,595 @@ class TestSummarizeMessages:
 
         mock_llm = MagicMock()
         mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summary"
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summary</summary>"
 
         summarize_messages(
             messages=messages,
             llm=mock_llm,
             callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
         )
 
         assert id(messages) == original_list_id
         assert len(messages) == 1
+
+    def test_preserves_system_messages(self) -> None:
+        """Test that system messages are preserved and not summarized."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "You are a research assistant."},
+            {"role": "user", "content": "Find information about AI."},
+            {"role": "assistant", "content": "I found several resources on AI."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>User asked about AI, assistant found resources.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        assert len(messages) == 2
+        assert messages[0]["role"] == "system"
+        assert messages[0]["content"] == "You are a research assistant."
+        assert messages[1]["role"] == "user"
+
+    def test_formats_conversation_with_role_labels(self) -> None:
+        """Test that the LLM receives role-labeled conversation text."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt."},
+            {"role": "user", "content": "Hello there"},
+            {"role": "assistant", "content": "Hi! How can I help?"},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>Greeting exchange.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # Check what was passed to llm.call
+        call_args = mock_llm.call.call_args[0][0]
+        user_msg_content = call_args[1]["content"]
+        assert "[USER]:" in user_msg_content
+        assert "[ASSISTANT]:" in user_msg_content
+        # System content should NOT appear in summarization input
+        assert "System prompt." not in user_msg_content
+
+    def test_extracts_summary_from_tags(self) -> None:
+        """Test that <summary> tags are extracted from LLM response."""
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Do something."},
+            {"role": "assistant", "content": "Done."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "Here is the summary:\n<summary>The extracted summary content.</summary>\nExtra text."
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        assert "The extracted summary content." in messages[0]["content"]
+
+    def test_handles_tool_messages(self) -> None:
+        """Test that tool messages are properly formatted in summarization."""
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Search for Python."},
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"function": {"name": "web_search", "arguments": '{"query": "Python"}'}}
+            ]},
+            {"role": "tool", "content": "Python is a programming language.", "name": "web_search"},
+            {"role": "assistant", "content": "Python is a programming language."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>User searched for Python info.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # Verify the conversation text sent to LLM contains tool labels
+        call_args = mock_llm.call.call_args[0][0]
+        user_msg_content = call_args[1]["content"]
+        assert "[TOOL_RESULT (web_search)]:" in user_msg_content
+
+    def test_only_system_messages_no_op(self) -> None:
+        """Test that only system messages results in no-op (no summarization)."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "system", "content": "Additional system instructions."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # No LLM call should have been made
+        mock_llm.call.assert_not_called()
+        # System messages should remain untouched
+        assert len(messages) == 2
+        assert messages[0]["content"] == "You are a helpful assistant."
+        assert messages[1]["content"] == "Additional system instructions."
+
+
+class TestFormatMessagesForSummary:
+    """Tests for _format_messages_for_summary helper."""
+
+    def test_skips_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+            {"role": "user", "content": "Hello"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "System prompt" not in result
+        assert "[USER]: Hello" in result
+
+    def test_formats_user_and_assistant(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Question"},
+            {"role": "assistant", "content": "Answer"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[USER]: Question" in result
+        assert "[ASSISTANT]: Answer" in result
+
+    def test_formats_tool_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "tool", "content": "Result data", "name": "search_tool"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[TOOL_RESULT (search_tool)]:" in result
+        assert "Result data" in result
+
+    def test_handles_none_content_with_tool_calls(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"function": {"name": "calculator", "arguments": "{}"}}
+            ]},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[Called tools: calculator]" in result
+
+    def test_handles_none_content_without_tool_calls(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[ASSISTANT]:" in result
+
+    def test_handles_multimodal_content(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": [
+                {"type": "text", "text": "Describe this image"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
+            ]},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[USER]: Describe this image" in result
+
+    def test_empty_messages(self) -> None:
+        result = _format_messages_for_summary([])
+        assert result == ""
+
+
+class TestExtractSummaryTags:
+    """Tests for _extract_summary_tags helper."""
+
+    def test_extracts_content_from_tags(self) -> None:
+        text = "Preamble\n<summary>The actual summary.</summary>\nPostamble"
+        assert _extract_summary_tags(text) == "The actual summary."
+
+    def test_handles_multiline_content(self) -> None:
+        text = "<summary>\nLine 1\nLine 2\nLine 3\n</summary>"
+        result = _extract_summary_tags(text)
+        assert "Line 1" in result
+        assert "Line 2" in result
+        assert "Line 3" in result
+
+    def test_falls_back_when_no_tags(self) -> None:
+        text = "Just a plain summary without tags."
+        assert _extract_summary_tags(text) == text
+
+    def test_handles_empty_string(self) -> None:
+        assert _extract_summary_tags("") == ""
+
+    def test_extracts_first_match(self) -> None:
+        text = "<summary>First</summary> text <summary>Second</summary>"
+        assert _extract_summary_tags(text) == "First"
+
+
+class TestSplitMessagesIntoChunks:
+    """Tests for _split_messages_into_chunks helper."""
+
+    def test_single_chunk_when_under_limit(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        assert len(chunks[0]) == 2
+
+    def test_splits_at_message_boundaries(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "A" * 100},  # ~25 tokens
+            {"role": "assistant", "content": "B" * 100},  # ~25 tokens
+            {"role": "user", "content": "C" * 100},  # ~25 tokens
+        ]
+        # max_tokens=30 should cause splits
+        chunks = _split_messages_into_chunks(messages, max_tokens=30)
+        assert len(chunks) == 3
+
+    def test_excludes_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+            {"role": "user", "content": "Hello"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        # The system message should not be in any chunk
+        for chunk in chunks:
+            for msg in chunk:
+                assert msg.get("role") != "system"
+
+    def test_empty_messages(self) -> None:
+        chunks = _split_messages_into_chunks([], max_tokens=1000)
+        assert chunks == []
+
+    def test_only_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert chunks == []
+
+    def test_handles_none_content(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None},
+            {"role": "user", "content": "Follow up"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        assert len(chunks[0]) == 2
+
+
+class TestEstimateTokenCount:
+    """Tests for _estimate_token_count helper."""
+
+    def test_empty_string(self) -> None:
+        assert _estimate_token_count("") == 0
+
+    def test_short_string(self) -> None:
+        assert _estimate_token_count("hello") == 1  # 5 // 4 = 1
+
+    def test_longer_string(self) -> None:
+        assert _estimate_token_count("a" * 100) == 25  # 100 // 4 = 25
+
+    def test_approximation_is_conservative(self) -> None:
+        # For English text, actual token count is typically lower than char/4
+        text = "The quick brown fox jumps over the lazy dog."
+        estimated = _estimate_token_count(text)
+        assert estimated > 0
+        assert estimated == len(text) // 4
+
+
+class TestParallelSummarization:
+    """Tests for parallel chunk summarization via asyncio."""
+
+    def _make_messages_for_n_chunks(self, n: int) -> list[dict[str, Any]]:
+        """Build a message list that will produce exactly *n* chunks.
+
+        Each message has 400 chars (~100 tokens). With max_tokens=100 returned
+        by the mock LLM, each message lands in its own chunk.
+        """
+        msgs: list[dict[str, Any]] = []
+        for i in range(n):
+            msgs.append({"role": "user", "content": f"msg-{i} " + "x" * 400})
+        return msgs
+
+    def test_multiple_chunks_use_acall(self) -> None:
+        """When there are multiple chunks, summarize_messages should use
+        llm.acall (parallel) instead of llm.call (sequential)."""
+        messages = self._make_messages_for_n_chunks(3)
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 100  # force multiple chunks
+        mock_llm.acall = AsyncMock(
+            side_effect=[
+                "<summary>Summary chunk 1</summary>",
+                "<summary>Summary chunk 2</summary>",
+                "<summary>Summary chunk 3</summary>",
+            ]
+        )
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # acall should have been awaited once per chunk
+        assert mock_llm.acall.await_count == 3
+        # sync call should NOT have been used for chunk summarization
+        mock_llm.call.assert_not_called()
+
+    def test_single_chunk_uses_sync_call(self) -> None:
+        """When there is only one chunk, summarize_messages should use
+        the sync llm.call path (no async overhead)."""
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Short message"},
+            {"role": "assistant", "content": "Short reply"},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 100_000
+        mock_llm.call.return_value = "<summary>Short summary</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        mock_llm.call.assert_called_once()
+
+    def test_parallel_results_preserve_order(self) -> None:
+        """Summaries must appear in the same order as the original chunks,
+        regardless of which async call finishes first."""
+        messages = self._make_messages_for_n_chunks(3)
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 100
+
+        # Simulate varying latencies — chunk 2 finishes before chunk 0
+        async def _delayed_acall(msgs: Any, **kwargs: Any) -> str:
+            user_content = msgs[1]["content"]
+            if "msg-0" in user_content:
+                await asyncio.sleep(0.05)
+                return "<summary>Summary-A</summary>"
+            elif "msg-1" in user_content:
+                return "<summary>Summary-B</summary>"  # fastest
+            else:
+                await asyncio.sleep(0.02)
+                return "<summary>Summary-C</summary>"
+
+        mock_llm.acall = _delayed_acall
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # The final summary message should have A, B, C in order
+        summary_content = messages[-1]["content"]
+        pos_a = summary_content.index("Summary-A")
+        pos_b = summary_content.index("Summary-B")
+        pos_c = summary_content.index("Summary-C")
+        assert pos_a < pos_b < pos_c
+
+    def test_asummarize_chunks_returns_ordered_results(self) -> None:
+        """Direct test of the async helper _asummarize_chunks."""
+        chunk_a: list[dict[str, Any]] = [{"role": "user", "content": "Chunk A"}]
+        chunk_b: list[dict[str, Any]] = [{"role": "user", "content": "Chunk B"}]
+
+        mock_llm = MagicMock()
+        mock_llm.acall = AsyncMock(
+            side_effect=[
+                "<summary>Result A</summary>",
+                "<summary>Result B</summary>",
+            ]
+        )
+
+        results = asyncio.run(
+            _asummarize_chunks(
+                chunks=[chunk_a, chunk_b],
+                llm=mock_llm,
+                callbacks=[],
+                i18n=_make_mock_i18n(),
+            )
+        )
+
+        assert len(results) == 2
+        assert results[0]["content"] == "Result A"
+        assert results[1]["content"] == "Result B"
+
+    @patch("crewai.utilities.agent_utils.is_inside_event_loop", return_value=True)
+    def test_works_inside_existing_event_loop(self, _mock_loop: Any) -> None:
+        """When called from inside a running event loop (e.g. a Flow),
+        the ThreadPoolExecutor fallback should still work."""
+        messages = self._make_messages_for_n_chunks(2)
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 100
+        mock_llm.acall = AsyncMock(
+            side_effect=[
+                "<summary>Flow summary 1</summary>",
+                "<summary>Flow summary 2</summary>",
+            ]
+        )
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        assert mock_llm.acall.await_count == 2
+        # Verify the merged summary made it into messages
+        assert "Flow summary 1" in messages[-1]["content"]
+        assert "Flow summary 2" in messages[-1]["content"]
+
+
+def _build_long_conversation() -> list[dict[str, Any]]:
+    """Build a multi-turn conversation that produces multiple chunks at max_tokens=200.
+
+    Each non-system message is ~100-140 estimated tokens (400-560 chars),
+    so a max_tokens of 200 yields roughly 3 chunks from 6 messages.
+    """
+    return [
+        {
+            "role": "system",
+            "content": "You are a helpful research assistant.",
+        },
+        {
+            "role": "user",
+            "content": (
+                "Tell me about the history of the Python programming language. "
+                "Who created it, when was it first released, and what were the "
+                "main design goals? Please provide a detailed overview covering "
+                "the major milestones from its inception through Python 3."
+            ),
+        },
+        {
+            "role": "assistant",
+            "content": (
+                "Python was created by Guido van Rossum and first released in 1991. "
+                "The main design goals were code readability and simplicity. Key milestones: "
+                "Python 1.0 (1994) introduced functional programming tools like lambda and map. "
+                "Python 2.0 (2000) added list comprehensions and garbage collection. "
+                "Python 3.0 (2008) was a major backward-incompatible release that fixed "
+                "fundamental design flaws. Python 2 reached end-of-life in January 2020."
+            ),
+        },
+        {
+            "role": "user",
+            "content": (
+                "What about the async/await features? When were they introduced "
+                "and how do they compare to similar features in JavaScript and C#? "
+                "Also explain the Global Interpreter Lock and its implications."
+            ),
+        },
+        {
+            "role": "assistant",
+            "content": (
+                "Async/await was introduced in Python 3.5 (PEP 492, 2015). "
+                "Unlike JavaScript which is single-threaded by design, Python's asyncio "
+                "is an opt-in framework. C# introduced async/await in 2012 (C# 5.0) and "
+                "was a major inspiration for Python's implementation. "
+                "The GIL (Global Interpreter Lock) is a mutex that protects access to "
+                "Python objects, preventing multiple threads from executing Python bytecodes "
+                "simultaneously. This means CPU-bound multithreaded programs don't benefit "
+                "from multiple cores. PEP 703 proposes making the GIL optional in CPython."
+            ),
+        },
+        {
+            "role": "user",
+            "content": (
+                "Explain the Python package ecosystem. How does pip work, what is PyPI, "
+                "and what are virtual environments? Compare pip with conda and uv."
+            ),
+        },
+        {
+            "role": "assistant",
+            "content": (
+                "PyPI (Python Package Index) is the official repository hosting 400k+ packages. "
+                "pip is the standard package installer that downloads from PyPI. "
+                "Virtual environments (venv) create isolated Python installations to avoid "
+                "dependency conflicts between projects. conda is a cross-language package manager "
+                "popular in data science that can manage non-Python dependencies. "
+                "uv is a new Rust-based tool that is 10-100x faster than pip and aims to replace "
+                "pip, pip-tools, and virtualenv with a single unified tool."
+            ),
+        },
+    ]
+
+
+class TestParallelSummarizationVCR:
+    """VCR-backed integration tests for parallel summarization.
+
+    These tests use a real LLM but patch get_context_window_size to force
+    multiple chunks, exercising the asyncio.gather + acall parallel path.
+
+    To record cassettes:
+        PYTEST_VCR_RECORD_MODE=all uv run pytest lib/crewai/tests/utilities/test_agent_utils.py::TestParallelSummarizationVCR -v
+    """
+
+    @pytest.mark.vcr()
+    def test_parallel_summarize_openai(self) -> None:
+        """Test that parallel summarization with gpt-4o-mini produces a valid summary."""
+        from crewai.llm import LLM
+        from crewai.utilities.i18n import I18N
+
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_long_conversation()
+
+        original_system = messages[0]["content"]
+
+        # Patch get_context_window_size to return 200 — forces multiple chunks
+        with patch.object(type(llm), "get_context_window_size", return_value=200):
+            # Verify we actually get multiple chunks with this window size
+            non_system = [m for m in messages if m.get("role") != "system"]
+            chunks = _split_messages_into_chunks(non_system, max_tokens=200)
+            assert len(chunks) > 1, f"Expected multiple chunks, got {len(chunks)}"
+
+            summarize_messages(
+                messages=messages,
+                llm=llm,
+                callbacks=[],
+                i18n=i18n,
+            )
+
+        # System message preserved
+        assert messages[0]["role"] == "system"
+        assert messages[0]["content"] == original_system
+
+        # Summary produced as a user message
+        summary_msg = messages[-1]
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+
+    @pytest.mark.vcr()
+    def test_parallel_summarize_preserves_files(self) -> None:
+        """Test that file references survive parallel summarization."""
+        from crewai.llm import LLM
+        from crewai.utilities.i18n import I18N
+
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_long_conversation()
+
+        mock_file = MagicMock()
+        messages[1]["files"] = {"report.pdf": mock_file}
+
+        with patch.object(type(llm), "get_context_window_size", return_value=200):
+            summarize_messages(
+                messages=messages,
+                llm=llm,
+                callbacks=[],
+                i18n=i18n,
+            )
+
+        summary_msg = messages[-1]
+        assert summary_msg["role"] == "user"
+        assert "files" in summary_msg
+        assert "report.pdf" in summary_msg["files"]
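
A note on the ordering guarantee exercised by `test_parallel_results_preserve_order`: it is the standard `asyncio.gather` property that results come back in the order the awaitables were passed, regardless of completion order. A minimal sketch of that pattern, with a hypothetical helper name and a simplified `acall` signature (not the library's actual code):

    import asyncio
    from typing import Any

    async def _gather_chunk_summaries(
        chunks: list[list[dict[str, Any]]], llm: Any
    ) -> list[str]:
        # gather() preserves argument order even when later tasks finish
        # first, so each summary lines up with its source chunk.
        tasks = [llm.acall(chunk) for chunk in chunks]
        return list(await asyncio.gather(*tasks))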