mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-02-07 14:38:13 +00:00)

Compare commits: main...lorenze/im (1 commit)

Commit 2e9da6ca95
@@ -22,9 +22,9 @@
   "expected_output": "\nThis is the expected criteria for your final answer: {expected_output}\nyou MUST return the actual complete content as the final answer, not a summary.",
   "human_feedback": "You got human feedback on your work, re-evaluate it and give a new Final Answer when ready.\n {human_feedback}",
   "getting_input": "This is the agent's final answer: {final_answer}\n\n",
-  "summarizer_system_message": "You are a helpful assistant that summarizes text.",
-  "summarize_instruction": "Summarize the following text, make sure to include all the important information: {group}",
-  "summary": "This is a summary of our conversation so far:\n{merged_summary}",
+  "summarizer_system_message": "You are a precise assistant that creates structured summaries of agent conversations. You preserve critical context needed for seamless task continuation.",
+  "summarize_instruction": "Analyze the following conversation and create a structured summary that preserves all information needed to continue the task seamlessly.\n\n<conversation>\n{conversation}\n</conversation>\n\nCreate a summary with these sections:\n1. **Task Overview**: What is the agent trying to accomplish?\n2. **Current State**: What has been completed so far? What step is the agent on?\n3. **Important Discoveries**: Key facts, data, tool results, or findings that must not be lost.\n4. **Next Steps**: What should the agent do next based on the conversation?\n5. **Context to Preserve**: Any specific values, names, URLs, code snippets, or details referenced in the conversation.\n\nWrap your entire summary in <summary> tags.\n\n<summary>\n[Your structured summary here]\n</summary>",
+  "summary": "<summary>\n{merged_summary}\n</summary>\n\nContinue the task from where the conversation left off. The above is a structured summary of prior context.",
   "manager_request": "Your best answer to your coworker asking you this, accounting for the context shared.",
   "formatted_task_instructions": "Format your final answer according to the following OpenAPI schema: {output_format}\n\nIMPORTANT: Preserve the original content exactly as-is. Do NOT rewrite, paraphrase, or modify the meaning of the content. Only structure it to match the schema format.\n\nDo not include the OpenAPI schema in the final output. Ensure the final output does not include any code block markers like ```json or ```python.",
  "conversation_history_instruction": "You are a member of a crew collaborating to achieve a common goal. Your task is a specific action that contributes to this larger objective. For additional context, please review the conversation history between you and the user that led to the initiation of this crew. Use any relevant information or feedback from the conversation to inform your task execution and ensure your response aligns with both the immediate task and the crew's overall goals.",
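How these slices are consumed: the compaction code later in this diff renders them with I18N.slice plus str.format. A minimal sketch of that rendering (the conversation text and summary body here are invented placeholders):

from crewai.utilities.i18n import I18N

i18n = I18N()

# Prompt sent to the summarizer LLM for one chunk of conversation
instruction = i18n.slice("summarize_instruction").format(
    conversation="[USER]: Find MoE papers\n\n[ASSISTANT]: Switch Transformers, GShard, Mixtral"
)

# Message that replaces the compacted history after summarization
summary_message = i18n.slice("summary").format(
    merged_summary="1. **Task Overview**: survey MoE papers ..."
)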
@@ -640,6 +640,137 @@ def handle_context_length(
         )
 
 
+def _estimate_token_count(text: str) -> int:
+    """Estimate token count using a conservative cross-provider heuristic.
+
+    Args:
+        text: The text to estimate tokens for.
+
+    Returns:
+        Estimated token count (roughly 1 token per 4 characters).
+    """
+    return len(text) // 4
+
+
+def _format_messages_for_summary(messages: list[LLMMessage]) -> str:
+    """Format messages with role labels for summarization.
+
+    Skips system messages. Handles None content, tool_calls, and
+    multimodal content blocks.
+
+    Args:
+        messages: List of messages to format.
+
+    Returns:
+        Role-labeled conversation text.
+    """
+    lines: list[str] = []
+    for msg in messages:
+        role = msg.get("role", "user")
+        if role == "system":
+            continue
+
+        content = msg.get("content")
+        if content is None:
+            # Check for tool_calls on assistant messages with no content
+            tool_calls = msg.get("tool_calls")
+            if tool_calls:
+                tool_names = []
+                for tc in tool_calls:
+                    func = tc.get("function", {})
+                    name = func.get("name", "unknown") if isinstance(func, dict) else "unknown"
+                    tool_names.append(name)
+                content = f"[Called tools: {', '.join(tool_names)}]"
+            else:
+                content = ""
+        elif isinstance(content, list):
+            # Multimodal content blocks — extract text parts
+            text_parts = []
+            for block in content:
+                if isinstance(block, dict) and block.get("type") == "text":
+                    text_parts.append(block.get("text", ""))
+            content = " ".join(text_parts) if text_parts else "[multimodal content]"
+
+        if role == "assistant":
+            label = "[ASSISTANT]:"
+        elif role == "tool":
+            tool_name = msg.get("name", "unknown")
+            label = f"[TOOL_RESULT ({tool_name})]:"
+        else:
+            label = "[USER]:"
+
+        lines.append(f"{label} {content}")
+
+    return "\n\n".join(lines)
+
+
+def _split_messages_into_chunks(
+    messages: list[LLMMessage], max_tokens: int
+) -> list[list[LLMMessage]]:
+    """Split messages into chunks at message boundaries.
+
+    Excludes system messages from chunks. Each chunk stays under
+    max_tokens based on estimated token count.
+
+    Args:
+        messages: List of messages to split.
+        max_tokens: Maximum estimated tokens per chunk.
+
+    Returns:
+        List of message chunks.
+    """
+    non_system = [m for m in messages if m.get("role") != "system"]
+    if not non_system:
+        return []
+
+    chunks: list[list[LLMMessage]] = []
+    current_chunk: list[LLMMessage] = []
+    current_tokens = 0
+
+    for msg in non_system:
+        content = msg.get("content")
+        if content is None:
+            msg_text = ""
+        elif isinstance(content, list):
+            msg_text = str(content)
+        else:
+            msg_text = str(content)
+
+        msg_tokens = _estimate_token_count(msg_text)
+
+        # If adding this message would exceed the limit and we already have
+        # messages in the current chunk, start a new chunk
+        if current_chunk and (current_tokens + msg_tokens) > max_tokens:
+            chunks.append(current_chunk)
+            current_chunk = []
+            current_tokens = 0
+
+        current_chunk.append(msg)
+        current_tokens += msg_tokens
+
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
+
+
+def _extract_summary_tags(text: str) -> str:
+    """Extract content between <summary></summary> tags.
+
+    Falls back to the full text if no tags are found.
+
+    Args:
+        text: Text potentially containing summary tags.
+
+    Returns:
+        Extracted summary content, or full text if no tags found.
+    """
+    match = re.search(r"<summary>(.*?)</summary>", text, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    return text.strip()
+
+
 def summarize_messages(
     messages: list[LLMMessage],
     llm: LLM | BaseLLM,
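Taken together, the four new helpers form the compaction pipeline: estimate tokens, chunk at message boundaries, label roles, extract the tagged summary. A minimal standalone sketch of how they compose, importing the private helpers exactly as the unit tests further down do (the sample messages are invented):

from crewai.utilities.agent_utils import (
    _estimate_token_count,
    _extract_summary_tags,
    _format_messages_for_summary,
    _split_messages_into_chunks,
)

messages = [
    {"role": "system", "content": "You are a researcher."},  # excluded from chunks
    {"role": "user", "content": "Find MoE papers."},
    {"role": "assistant", "content": "Switch Transformers, GShard, Mixtral."},
]

# Roughly 4 characters per estimated token
assert _estimate_token_count("a" * 100) == 25

# System messages never enter a chunk
chunks = _split_messages_into_chunks(messages, max_tokens=1000)
assert len(chunks) == 1 and len(chunks[0]) == 2

# Role-labeled text is what the summarizer LLM actually sees
print(_format_messages_for_summary(chunks[0]))
# [USER]: Find MoE papers.
#
# [ASSISTANT]: Switch Transformers, GShard, Mixtral.

# Model output is unwrapped, with a graceful fallback when tags are missing
assert _extract_summary_tags("<summary>MoE recap</summary>") == "MoE recap"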
@@ -649,6 +780,10 @@ def summarize_messages(
 ) -> None:
     """Summarize messages to fit within context window.
 
+    Uses structured context compaction: preserves system messages,
+    splits at message boundaries, formats with role labels, and
+    produces structured summaries for seamless task continuation.
+
     Preserves any files attached to user messages and re-attaches them to
     the summarized message. Files from all user messages are merged.
@@ -657,49 +792,64 @@
         llm: LLM instance for summarization
         callbacks: List of callbacks for LLM
         i18n: I18N instance for messages
         verbose: Whether to print progress.
     """
+    # 1. Extract & preserve file attachments from user messages
     preserved_files: dict[str, Any] = {}
     for msg in messages:
         if msg.get("role") == "user" and msg.get("files"):
             preserved_files.update(msg["files"])
 
-    messages_string = " ".join(
-        [str(message.get("content", "")) for message in messages]
-    )
-    cut_size = llm.get_context_window_size()
-
-    messages_groups = [
-        {"content": messages_string[i : i + cut_size]}
-        for i in range(0, len(messages_string), cut_size)
-    ]
+    # 2. Extract system messages — never summarize them
+    system_messages = [m for m in messages if m.get("role") == "system"]
+    non_system_messages = [m for m in messages if m.get("role") != "system"]
+
+    # If there are only system messages (or no non-system messages), nothing to summarize
+    if not non_system_messages:
+        return
+
+    # 3. Split non-system messages into chunks at message boundaries
+    max_tokens = llm.get_context_window_size()
+    chunks = _split_messages_into_chunks(non_system_messages, max_tokens)
 
+    # 4. Summarize each chunk with role-labeled formatting
     summarized_contents: list[SummaryContent] = []
+    total_chunks = len(chunks)
 
-    total_groups = len(messages_groups)
-    for idx, group in enumerate(messages_groups, 1):
+    for idx, chunk in enumerate(chunks, 1):
         if verbose:
             Printer().print(
-                content=f"Summarizing {idx}/{total_groups}...",
+                content=f"Summarizing {idx}/{total_chunks}...",
                 color="yellow",
             )
 
+        conversation_text = _format_messages_for_summary(chunk)
+
         summarization_messages = [
             format_message_for_llm(
                 i18n.slice("summarizer_system_message"), role="system"
             ),
             format_message_for_llm(
-                i18n.slice("summarize_instruction").format(group=group["content"]),
+                i18n.slice("summarize_instruction").format(
+                    conversation=conversation_text
+                ),
             ),
         ]
         summary = llm.call(
             summarization_messages,
             callbacks=callbacks,
         )
-        summarized_contents.append({"content": str(summary)})
+        # Extract content from <summary> tags with graceful fallback
+        extracted = _extract_summary_tags(str(summary))
+        summarized_contents.append({"content": extracted})
 
-    merged_summary = " ".join(content["content"] for content in summarized_contents)
+    merged_summary = "\n\n".join(content["content"] for content in summarized_contents)
 
+    # 6. Reconstruct messages: [system messages...] + [summary user message]
     messages.clear()
+    for sys_msg in system_messages:
+        messages.append(sys_msg)
 
     summary_message = format_message_for_llm(
         i18n.slice("summary").format(merged_summary=merged_summary)
     )
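Net effect of the rewritten body: the list is mutated in place into [system messages...] followed by one summary user message, with preserved file attachments re-attached to it. A minimal sketch with a stubbed LLM, mirroring the mock pattern the tests below use:

from unittest.mock import MagicMock

from crewai.utilities.agent_utils import summarize_messages
from crewai.utilities.i18n import I18N

messages = [
    {"role": "system", "content": "You are a researcher."},
    {"role": "user", "content": "Find MoE papers.", "files": {"notes.pdf": object()}},
    {"role": "assistant", "content": "Switch Transformers, GShard, Mixtral."},
]

llm = MagicMock()
llm.get_context_window_size.return_value = 1000
llm.call.return_value = "<summary>User asked for MoE papers; three were named.</summary>"

summarize_messages(messages=messages, llm=llm, callbacks=[], i18n=I18N())

assert [m["role"] for m in messages] == ["system", "user"]
assert "files" in messages[1]  # attachments survive compaction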
@@ -8,7 +8,14 @@ from unittest.mock import MagicMock, patch
 from pydantic import BaseModel, Field
 
 from crewai.tools.base_tool import BaseTool
-from crewai.utilities.agent_utils import convert_tools_to_openai_schema, summarize_messages
+from crewai.utilities.agent_utils import (
+    _estimate_token_count,
+    _extract_summary_tags,
+    _format_messages_for_summary,
+    _split_messages_into_chunks,
+    convert_tools_to_openai_schema,
+    summarize_messages,
+)
 
 
 class CalculatorInput(BaseModel):
@@ -214,6 +221,17 @@ class TestConvertToolsToOpenaiSchema:
         assert max_results_prop["default"] == 10
 
 
+def _make_mock_i18n() -> MagicMock:
+    """Create a mock i18n with the new structured prompt keys."""
+    mock_i18n = MagicMock()
+    mock_i18n.slice.side_effect = lambda key: {
+        "summarizer_system_message": "You are a precise assistant that creates structured summaries.",
+        "summarize_instruction": "Summarize the conversation:\n{conversation}",
+        "summary": "<summary>\n{merged_summary}\n</summary>\nContinue the task.",
+    }.get(key, "")
+    return mock_i18n
+
+
 class TestSummarizeMessages:
     """Tests for summarize_messages function."""
 
@@ -229,26 +247,22 @@
 
         mock_llm = MagicMock()
         mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summarized conversation about image analysis."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summarized conversation about image analysis.</summary>"
 
         summarize_messages(
             messages=messages,
             llm=mock_llm,
             callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
         )
 
-        assert len(messages) == 1
-        assert messages[0]["role"] == "user"
-        assert "files" in messages[0]
-        assert messages[0]["files"] == mock_files
+        # System message preserved + summary message = 2
+        assert len(messages) == 2
+        assert messages[0]["role"] == "system"
+        summary_msg = messages[1]
+        assert summary_msg["role"] == "user"
+        assert "files" in summary_msg
+        assert summary_msg["files"] == mock_files
 
     def test_merges_files_from_multiple_user_messages(self) -> None:
         """Test that files from multiple user messages are merged."""
@@ -264,20 +278,13 @@
 
         mock_llm = MagicMock()
         mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summarized conversation."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summarized conversation.</summary>"
 
         summarize_messages(
             messages=messages,
             llm=mock_llm,
             callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
         )
 
         assert len(messages) == 1
@@ -297,20 +304,13 @@
 
         mock_llm = MagicMock()
         mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "A greeting exchange."
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize the following.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>A greeting exchange.</summary>"
 
         summarize_messages(
             messages=messages,
             llm=mock_llm,
             callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
         )
 
         assert len(messages) == 1
@@ -327,21 +327,297 @@
 
         mock_llm = MagicMock()
         mock_llm.get_context_window_size.return_value = 1000
-        mock_llm.call.return_value = "Summary"
-
-        mock_i18n = MagicMock()
-        mock_i18n.slice.side_effect = lambda key: {
-            "summarizer_system_message": "Summarize.",
-            "summarize_instruction": "Summarize: {group}",
-            "summary": "Summary: {merged_summary}",
-        }.get(key, "")
+        mock_llm.call.return_value = "<summary>Summary</summary>"
 
         summarize_messages(
             messages=messages,
             llm=mock_llm,
             callbacks=[],
-            i18n=mock_i18n,
+            i18n=_make_mock_i18n(),
         )
 
         assert id(messages) == original_list_id
         assert len(messages) == 1
 
+    def test_preserves_system_messages(self) -> None:
+        """Test that system messages are preserved and not summarized."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "You are a research assistant."},
+            {"role": "user", "content": "Find information about AI."},
+            {"role": "assistant", "content": "I found several resources on AI."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>User asked about AI, assistant found resources.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        assert len(messages) == 2
+        assert messages[0]["role"] == "system"
+        assert messages[0]["content"] == "You are a research assistant."
+        assert messages[1]["role"] == "user"
+
+    def test_formats_conversation_with_role_labels(self) -> None:
+        """Test that the LLM receives role-labeled conversation text."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt."},
+            {"role": "user", "content": "Hello there"},
+            {"role": "assistant", "content": "Hi! How can I help?"},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>Greeting exchange.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # Check what was passed to llm.call
+        call_args = mock_llm.call.call_args[0][0]
+        user_msg_content = call_args[1]["content"]
+        assert "[USER]:" in user_msg_content
+        assert "[ASSISTANT]:" in user_msg_content
+        # System content should NOT appear in summarization input
+        assert "System prompt." not in user_msg_content
+
+    def test_extracts_summary_from_tags(self) -> None:
+        """Test that <summary> tags are extracted from LLM response."""
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Do something."},
+            {"role": "assistant", "content": "Done."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "Here is the summary:\n<summary>The extracted summary content.</summary>\nExtra text."
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        assert "The extracted summary content." in messages[0]["content"]
+
+    def test_handles_tool_messages(self) -> None:
+        """Test that tool messages are properly formatted in summarization."""
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Search for Python."},
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"function": {"name": "web_search", "arguments": '{"query": "Python"}'}}
+            ]},
+            {"role": "tool", "content": "Python is a programming language.", "name": "web_search"},
+            {"role": "assistant", "content": "Python is a programming language."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+        mock_llm.call.return_value = "<summary>User searched for Python info.</summary>"
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # Verify the conversation text sent to LLM contains tool labels
+        call_args = mock_llm.call.call_args[0][0]
+        user_msg_content = call_args[1]["content"]
+        assert "[TOOL_RESULT (web_search)]:" in user_msg_content
+
+    def test_only_system_messages_no_op(self) -> None:
+        """Test that only system messages results in no-op (no summarization)."""
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "system", "content": "Additional system instructions."},
+        ]
+
+        mock_llm = MagicMock()
+        mock_llm.get_context_window_size.return_value = 1000
+
+        summarize_messages(
+            messages=messages,
+            llm=mock_llm,
+            callbacks=[],
+            i18n=_make_mock_i18n(),
+        )
+
+        # No LLM call should have been made
+        mock_llm.call.assert_not_called()
+        # System messages should remain untouched
+        assert len(messages) == 2
+        assert messages[0]["content"] == "You are a helpful assistant."
+        assert messages[1]["content"] == "Additional system instructions."
+
+
+class TestFormatMessagesForSummary:
+    """Tests for _format_messages_for_summary helper."""
+
+    def test_skips_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+            {"role": "user", "content": "Hello"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "System prompt" not in result
+        assert "[USER]: Hello" in result
+
+    def test_formats_user_and_assistant(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Question"},
+            {"role": "assistant", "content": "Answer"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[USER]: Question" in result
+        assert "[ASSISTANT]: Answer" in result
+
+    def test_formats_tool_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "tool", "content": "Result data", "name": "search_tool"},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[TOOL_RESULT (search_tool)]:" in result
+        assert "Result data" in result
+
+    def test_handles_none_content_with_tool_calls(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"function": {"name": "calculator", "arguments": "{}"}}
+            ]},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[Called tools: calculator]" in result
+
+    def test_handles_none_content_without_tool_calls(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[ASSISTANT]:" in result
+
+    def test_handles_multimodal_content(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": [
+                {"type": "text", "text": "Describe this image"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
+            ]},
+        ]
+        result = _format_messages_for_summary(messages)
+        assert "[USER]: Describe this image" in result
+
+    def test_empty_messages(self) -> None:
+        result = _format_messages_for_summary([])
+        assert result == ""
+
+
+class TestExtractSummaryTags:
+    """Tests for _extract_summary_tags helper."""
+
+    def test_extracts_content_from_tags(self) -> None:
+        text = "Preamble\n<summary>The actual summary.</summary>\nPostamble"
+        assert _extract_summary_tags(text) == "The actual summary."
+
+    def test_handles_multiline_content(self) -> None:
+        text = "<summary>\nLine 1\nLine 2\nLine 3\n</summary>"
+        result = _extract_summary_tags(text)
+        assert "Line 1" in result
+        assert "Line 2" in result
+        assert "Line 3" in result
+
+    def test_falls_back_when_no_tags(self) -> None:
+        text = "Just a plain summary without tags."
+        assert _extract_summary_tags(text) == text
+
+    def test_handles_empty_string(self) -> None:
+        assert _extract_summary_tags("") == ""
+
+    def test_extracts_first_match(self) -> None:
+        text = "<summary>First</summary> text <summary>Second</summary>"
+        assert _extract_summary_tags(text) == "First"
+
+
+class TestSplitMessagesIntoChunks:
+    """Tests for _split_messages_into_chunks helper."""
+
+    def test_single_chunk_when_under_limit(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        assert len(chunks[0]) == 2
+
+    def test_splits_at_message_boundaries(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": "A" * 100},  # ~25 tokens
+            {"role": "assistant", "content": "B" * 100},  # ~25 tokens
+            {"role": "user", "content": "C" * 100},  # ~25 tokens
+        ]
+        # max_tokens=30 should cause splits
+        chunks = _split_messages_into_chunks(messages, max_tokens=30)
+        assert len(chunks) == 3
+
+    def test_excludes_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+            {"role": "user", "content": "Hello"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        # The system message should not be in any chunk
+        for chunk in chunks:
+            for msg in chunk:
+                assert msg.get("role") != "system"
+
+    def test_empty_messages(self) -> None:
+        chunks = _split_messages_into_chunks([], max_tokens=1000)
+        assert chunks == []
+
+    def test_only_system_messages(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": "System prompt"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert chunks == []
+
+    def test_handles_none_content(self) -> None:
+        messages: list[dict[str, Any]] = [
+            {"role": "assistant", "content": None},
+            {"role": "user", "content": "Follow up"},
+        ]
+        chunks = _split_messages_into_chunks(messages, max_tokens=1000)
+        assert len(chunks) == 1
+        assert len(chunks[0]) == 2
+
+
+class TestEstimateTokenCount:
+    """Tests for _estimate_token_count helper."""
+
+    def test_empty_string(self) -> None:
+        assert _estimate_token_count("") == 0
+
+    def test_short_string(self) -> None:
+        assert _estimate_token_count("hello") == 1  # 5 // 4 = 1
+
+    def test_longer_string(self) -> None:
+        assert _estimate_token_count("a" * 100) == 25  # 100 // 4 = 25
+
+    def test_approximation_is_conservative(self) -> None:
+        # For English text, actual token count is typically lower than char/4
+        text = "The quick brown fox jumps over the lazy dog."
+        estimated = _estimate_token_count(text)
+        assert estimated > 0
+        assert estimated == len(text) // 4
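Worked numbers behind test_splits_at_message_boundaries above, as a quick sanity check (illustrative):

# Each 100-char message estimates to 100 // 4 = 25 tokens. With max_tokens=30,
# a chunk holding one message (25 tokens) cannot accept a second (50 > 30),
# so every message starts a new chunk and three chunks result.
msgs = [{"role": "user", "content": "A" * 100} for _ in range(3)]
assert len(_split_messages_into_chunks(msgs, max_tokens=30)) == 3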
lib/crewai/tests/utilities/test_summarize_integration.py (new file, 257 lines)

@@ -0,0 +1,257 @@
+"""Integration tests for structured context compaction (summarize_messages).
+
+These tests use VCR cassettes to record and replay real API calls.
+
+To record cassettes for the first time:
+    PYTEST_VCR_RECORD_MODE=all pytest lib/crewai/tests/utilities/test_summarize_integration.py -v
+
+To replay from cassettes:
+    pytest lib/crewai/tests/utilities/test_summarize_integration.py -v
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+
+from crewai.agent import Agent
+from crewai.crew import Crew
+from crewai.llm import LLM
+from crewai.task import Task
+from crewai.utilities.agent_utils import summarize_messages
+from crewai.utilities.i18n import I18N
+
+
+def _build_conversation_messages(
+    *, include_system: bool = True, include_files: bool = False
+) -> list[dict[str, Any]]:
+    """Build a realistic multi-turn conversation for summarization tests."""
+    messages: list[dict[str, Any]] = []
+
+    if include_system:
+        messages.append(
+            {
+                "role": "system",
+                "content": (
+                    "You are a research assistant specializing in AI topics. "
+                    "Your goal is to find accurate, up-to-date information."
+                ),
+            }
+        )
+
+    user_msg: dict[str, Any] = {
+        "role": "user",
+        "content": (
+            "Research the latest developments in large language models. "
+            "Focus on architecture improvements and training techniques."
+        ),
+    }
+    if include_files:
+        user_msg["files"] = {"reference.pdf": MagicMock()}
+    messages.append(user_msg)
+
+    messages.append(
+        {
+            "role": "assistant",
+            "content": (
+                "I'll research the latest developments in large language models. "
+                "Based on my knowledge, recent advances include:\n"
+                "1. Mixture of Experts (MoE) architectures\n"
+                "2. Improved attention mechanisms like Flash Attention\n"
+                "3. Better training data curation techniques\n"
+                "4. Constitutional AI and RLHF improvements"
+            ),
+        }
+    )
+
+    messages.append(
+        {
+            "role": "user",
+            "content": "Can you go deeper on the MoE architectures? What are the key papers?",
+        }
+    )
+
+    messages.append(
+        {
+            "role": "assistant",
+            "content": (
+                "Key papers on Mixture of Experts:\n"
+                "- Switch Transformers (Google, 2021) - simplified MoE routing\n"
+                "- GShard - scaling to 600B parameters\n"
+                "- Mixtral (Mistral AI) - open-source MoE model\n"
+                "The main advantage is computational efficiency: "
+                "only a subset of experts is activated per token."
+            ),
+        }
+    )
+
+    return messages
+
+
+class TestSummarizeDirectOpenAI:
+    """Test direct summarize_messages calls with OpenAI."""
+
+    @pytest.mark.vcr()
+    def test_summarize_direct_openai(self) -> None:
+        """Test summarize_messages with gpt-4o-mini preserves system messages."""
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(include_system=True)
+
+        original_system_content = messages[0]["content"]
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        # System message should be preserved
+        assert len(messages) >= 2
+        assert messages[0]["role"] == "system"
+        assert messages[0]["content"] == original_system_content
+
+        # Summary should be a user message
+        summary_msg = messages[-1]
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+
+
+class TestSummarizeDirectAnthropic:
+    """Test direct summarize_messages calls with Anthropic."""
+
+    @pytest.mark.vcr()
+    def test_summarize_direct_anthropic(self) -> None:
+        """Test summarize_messages with claude-3-5-haiku."""
+        llm = LLM(model="anthropic/claude-3-5-haiku-latest", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(include_system=True)
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        assert len(messages) >= 2
+        assert messages[0]["role"] == "system"
+        summary_msg = messages[-1]
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+
+
+class TestSummarizeDirectGemini:
+    """Test direct summarize_messages calls with Gemini."""
+
+    @pytest.mark.vcr()
+    def test_summarize_direct_gemini(self) -> None:
+        """Test summarize_messages with gemini-2.0-flash."""
+        llm = LLM(model="gemini/gemini-2.0-flash", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(include_system=True)
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        assert len(messages) >= 2
+        assert messages[0]["role"] == "system"
+        summary_msg = messages[-1]
+        assert summary_msg["role"] == "user"
+        assert len(summary_msg["content"]) > 0
+
+
+class TestCrewKickoffCompaction:
+    """Test compaction triggered via Crew.kickoff() with small context window."""
+
+    @pytest.mark.vcr()
+    def test_crew_kickoff_compaction_openai(self) -> None:
+        """Test that compaction is triggered during kickoff with small context_window_size."""
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        # Force a very small context window to trigger compaction
+        llm.context_window_size = 500
+
+        agent = Agent(
+            role="Researcher",
+            goal="Find information about Python programming",
+            backstory="You are an expert researcher.",
+            llm=llm,
+            verbose=False,
+            max_iter=2,
+        )
+
+        task = Task(
+            description="What is Python? Give a brief answer.",
+            expected_output="A short description of Python.",
+            agent=agent,
+        )
+
+        crew = Crew(agents=[agent], tasks=[task], verbose=False)
+
+        # This may or may not trigger compaction depending on actual response sizes.
+        # The test verifies the code path doesn't crash.
+        result = crew.kickoff()
+        assert result is not None
+
+
+class TestAgentExecuteTaskCompaction:
+    """Test compaction triggered via Agent.execute_task()."""
+
+    @pytest.mark.vcr()
+    def test_agent_execute_task_compaction(self) -> None:
+        """Test that Agent.execute_task() works with small context_window_size."""
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        llm.context_window_size = 500
+
+        agent = Agent(
+            role="Writer",
+            goal="Write concise content",
+            backstory="You are a skilled writer.",
+            llm=llm,
+            verbose=False,
+            max_iter=2,
+        )
+
+        task = Task(
+            description="Write one sentence about the sun.",
+            expected_output="A single sentence about the sun.",
+            agent=agent,
+        )
+
+        result = agent.execute_task(task=task)
+        assert result is not None
+
+
+class TestSummarizePreservesFiles:
+    """Test that files are preserved through real summarization."""
+
+    @pytest.mark.vcr()
+    def test_summarize_preserves_files_integration(self) -> None:
+        """Test that file references survive a real summarization call."""
+        llm = LLM(model="gpt-4o-mini", temperature=0)
+        i18n = I18N()
+        messages = _build_conversation_messages(
+            include_system=True, include_files=True
+        )
+
+        summarize_messages(
+            messages=messages,
+            llm=llm,
+            callbacks=[],
+            i18n=i18n,
+        )
+
+        # System message preserved
+        assert messages[0]["role"] == "system"
+
+        # Files should be on the summary message
+        summary_msg = messages[-1]
+        assert "files" in summary_msg
+        assert "reference.pdf" in summary_msg["files"]