Fix #5930: Extract text from PDFs in ReadFileTool instead of returning base64

When using input_files with PDFFile, the read_file tool was returning the entire PDF as base64-encoded binary data. This caused:

- Massive context bloat for the LLM
- Inconsistent responses and context overflow
- The same file being re-processed on each tool call

Now ReadFileTool detects application/pdf content and extracts text using pypdf (already a dependency via crewai-files) instead of base64-encoding the raw bytes. Each page that yields text is labeled with a page number header for clarity.

Graceful fallbacks are provided when:

- pypdf is not installed (short install message)
- The PDF contains no extractable text (friendly message)
- The PDF is corrupted (error message, never base64)

Co-Authored-By: João <joao@crewai.com>
2026-07-03 06:08:15 +00:00 · 2026-05-26 13:13:56 +00:00
parent bad64b1ee6
commit 53a726ad6d
2 changed files with 164 additions and 8 deletions
--- a/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py
+++ b/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py
@@ -3,6 +3,8 @@
 from __future__ import annotations

 import base64
+import io
+import logging
 from typing import TYPE_CHECKING

 from pydantic import BaseModel, Field, PrivateAttr
@@ -13,6 +15,8 @@ from crewai.tools.base_tool import BaseTool
 if TYPE_CHECKING:
    from crewai_files import FileInput

+logger = logging.getLogger(__name__)
+

 class ReadFileToolSchema(BaseModel):
    """Schema for read file tool arguments."""
@@ -29,7 +33,8 @@ class ReadFileTool(BaseTool):
    name: str = "read_file"
    description: str = (
        "Read content from an input file by name. "
-        "Returns file content as text for text files, or base64 for binary files."
+        "Returns file content as text for text files, "
+        "extracted text for PDFs, or base64 for other binary files."
    )
    args_schema: type[BaseModel] = ReadFileToolSchema

@@ -50,7 +55,8 @@ class ReadFileTool(BaseTool):
            file_name: The name of the file to read.

        Returns:
-            File content as text for text files, or base64 encoded for binary.
+            File content as text for text files, extracted text for PDFs,
+            or base64 encoded for other binary files.
        """
        if not self._files:
            return "No input files available."
@@ -74,5 +80,53 @@ class ReadFileTool(BaseTool):
        if any(content_type.startswith(t) for t in text_types):
            return content.decode("utf-8")

+        if content_type == "application/pdf":
+            return self._extract_pdf_text(content, filename)
+
        encoded = base64.b64encode(content).decode("ascii")
        return f"[Binary file: {filename} ({content_type})]\nBase64: {encoded}"
+
+    @staticmethod
+    def _extract_pdf_text(content: bytes, filename: str) -> str:
+        """Extract text from PDF bytes using pypdf.
+
+        Falls back to a short error message (never base64) when extraction
+        is not possible, so that the LLM context stays small.
+
+        Args:
+            content: Raw PDF bytes.
+            filename: Name of the PDF file (for logging/messages).
+
+        Returns:
+            Extracted text, or a short diagnostic message on failure.
+        """
+        try:
+            from pypdf import PdfReader
+        except ImportError:
+            logger.warning(
+                "pypdf is not installed — cannot extract text from '%s'. "
+                "Install it with: pip install pypdf",
+                filename,
+            )
+            return (
+                f"[PDF file: {filename}] "
+                "Unable to extract text: pypdf is not installed. "
+                "Install it with: pip install pypdf"
+            )
+
+        try:
+            reader = PdfReader(io.BytesIO(content))
+            pages: list[str] = []
+            for page_num, page in enumerate(reader.pages, start=1):
+                page_text = page.extract_text()
+                if page_text:
+                    pages.append(f"--- Page {page_num} ---\n{page_text}")
+            if pages:
+                return "\n\n".join(pages)
+            return (
+                f"[PDF file: {filename}] "
+                "No extractable text found (the PDF may contain only images)."
+            )
+        except Exception as exc:
+            logger.warning("Failed to extract text from PDF '%s': %s", filename, exc)
+            return f"[PDF file: {filename}] Failed to extract text: {exc}"
--- a/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py
+++ b/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py
@@ -1,13 +1,53 @@
 """Unit tests for ReadFileTool."""

+from __future__ import annotations
+
 import base64
+import io
+from unittest.mock import patch

 import pytest
+from pypdf import PdfWriter
+from pypdf.generic import (
+    DecodedStreamObject,
+    DictionaryObject,
+    NameObject,
+)

 from crewai.tools.agent_tools.read_file_tool import ReadFileTool
 from crewai_files import ImageFile, PDFFile, TextFile


+def _make_pdf(*page_texts: str) -> bytes:
+    """Build a minimal valid PDF with extractable text on each page."""
+    writer = PdfWriter()
+    for text in page_texts:
+        writer.add_blank_page(width=200, height=200)
+        page = writer.pages[-1]
+
+        font_dict = DictionaryObject()
+        font_dict[NameObject("/Type")] = NameObject("/Font")
+        font_dict[NameObject("/Subtype")] = NameObject("/Type1")
+        font_dict[NameObject("/BaseFont")] = NameObject("/Helvetica")
+        font_ref = writer._add_object(font_dict)
+
+        resources = DictionaryObject()
+        fonts = DictionaryObject()
+        fonts[NameObject("/F1")] = font_ref
+        resources[NameObject("/Font")] = fonts
+        page[NameObject("/Resources")] = resources
+
+        stream = DecodedStreamObject()
+        escaped = text.replace("(", "\\(").replace(")", "\\)")
+        stream.set_data(f"BT /F1 12 Tf 50 100 Td ({escaped}) Tj ET".encode())
+        stream_ref = writer._add_object(stream)
+        page[NameObject("/Contents")] = stream_ref
+
+    buf = io.BytesIO()
+    writer.write(buf)
+    return buf.getvalue()
+
+
 class TestReadFileTool:
    """Tests for ReadFileTool."""

@@ -75,15 +115,77 @@ class TestReadFileTool:
        decoded = base64.b64decode(b64_part)
        assert decoded == png_bytes

-    def test_run_pdf_file_returns_base64(self) -> None:
-        """Test reading a PDF file returns base64 encoded content."""
-        pdf_bytes = b"%PDF-1.4 some content here"
+    def test_run_pdf_file_extracts_text(self) -> None:
+        """Test reading a PDF extracts text instead of returning base64."""
+        pdf_bytes = _make_pdf("Hello World from PDF")
        self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})

        result = self.tool._run(file_name="doc.pdf")

-        assert "[Binary file:" in result
-        assert "application/pdf" in result
+        assert "Hello World from PDF" in result
+        assert "Base64" not in result
+        assert "--- Page 1 ---" in result
+
+    def test_run_pdf_multipage_extracts_all_pages(self) -> None:
+        """Test reading a multi-page PDF extracts text from every page."""
+        pdf_bytes = _make_pdf("First page content", "Second page content")
+        self.tool.set_files({"report.pdf": PDFFile(source=pdf_bytes)})
+
+        result = self.tool._run(file_name="report.pdf")
+
+        assert "First page content" in result
+        assert "Second page content" in result
+        assert "--- Page 1 ---" in result
+        assert "--- Page 2 ---" in result
+        assert "Base64" not in result
+
+    def test_run_pdf_no_extractable_text(self) -> None:
+        """Test PDF with no extractable text returns a friendly message."""
+        # A blank page with no text content
+        writer = PdfWriter()
+        writer.add_blank_page(width=200, height=200)
+        buf = io.BytesIO()
+        writer.write(buf)
+        blank_pdf = buf.getvalue()
+
+        self.tool.set_files({"blank.pdf": PDFFile(source=blank_pdf)})
+
+        result = self.tool._run(file_name="blank.pdf")
+
+        assert "No extractable text found" in result
+        assert "Base64" not in result
+
+    def test_run_pdf_corrupted_returns_error_message(self) -> None:
+        """Test that a corrupted PDF returns a short error, never base64."""
+        corrupted = b"%PDF-1.4 this is not a valid PDF structure"
+        self.tool.set_files({"bad.pdf": PDFFile(source=corrupted)})
+
+        result = self.tool._run(file_name="bad.pdf")
+
+        assert "[PDF file: bad.pdf]" in result
+        assert "Failed to extract text" in result
+        assert "Base64" not in result
+
+    def test_run_pdf_no_pypdf_returns_install_message(self) -> None:
+        """Test graceful fallback when pypdf is not installed."""
+        pdf_bytes = _make_pdf("Some text")
+        self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})
+
+        with patch.dict("sys.modules", {"pypdf": None}):
+            result = self.tool._run(file_name="doc.pdf")
+
+        assert "pypdf is not installed" in result
+        assert "Base64" not in result
+
+    def test_run_pdf_result_much_smaller_than_base64(self) -> None:
+        """Extracted text should be far smaller than a base64-encoded PDF."""
+        pdf_bytes = _make_pdf("Short text")
+        self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})
+
+        result = self.tool._run(file_name="doc.pdf")
+
+        base64_size = len(base64.b64encode(pdf_bytes))
+        assert len(result) < base64_size

    def test_set_files_none(self) -> None:
        """Test setting files to None."""
@@ -119,4 +221,4 @@ class TestReadFileTool:
        schema = self.tool.args_schema

        assert "file_name" in schema.model_fields
-        assert schema.model_fields["file_name"].is_required()
+        assert schema.model_fields["file_name"].is_required()