From 53a726ad6d74c166c44accf0639c7e2c8e49b808 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 26 May 2026 13:13:56 +0000
Subject: [PATCH] Fix #5930: Extract text from PDFs in ReadFileTool instead of
 returning base64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using input_files with PDFFile, the read_file tool was returning
the entire PDF as base64-encoded binary data. This caused:
- Massive context bloat for the LLM
- Inconsistent responses and context overflow
- The same file being re-processed on each tool call

Now ReadFileTool detects application/pdf content and extracts text
using pypdf (already a dependency via crewai-files) instead of
base64-encoding the raw bytes. Each page is labeled with a page
number header for clarity. Graceful fallbacks are provided when:
- pypdf is not installed (short install message)
- The PDF contains no extractable text (friendly message)
- The PDF is corrupted (error message, never base64)

Co-Authored-By: João <joao@crewai.com>
---
 .../tools/agent_tools/read_file_tool.py       |  58 ++++++++-
 .../tools/agent_tools/test_read_file_tool.py  | 114 +++++++++++++++++-
 2 files changed, 164 insertions(+), 8 deletions(-)

diff --git a/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py b/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py
index e41d5390d..a016857b4 100644
--- a/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py
+++ b/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py
@@ -3,6 +3,8 @@
 from __future__ import annotations
 
 import base64
+import io
+import logging
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel, Field, PrivateAttr
@@ -13,6 +15,8 @@ from crewai.tools.base_tool import BaseTool
 if TYPE_CHECKING:
     from crewai_files import FileInput
 
+logger = logging.getLogger(__name__)
+
 
 class ReadFileToolSchema(BaseModel):
     """Schema for read file tool arguments."""
@@ -29,7 +33,8 @@ class ReadFileTool(BaseTool):
     name: str = "read_file"
     description: str = (
         "Read content from an input file by name. "
-        "Returns file content as text for text files, or base64 for binary files."
+        "Returns file content as text for text files, "
+        "extracted text for PDFs, or base64 for other binary files."
     )
     args_schema: type[BaseModel] = ReadFileToolSchema
 
@@ -50,7 +55,8 @@ class ReadFileTool(BaseTool):
             file_name: The name of the file to read.
 
         Returns:
-            File content as text for text files, or base64 encoded for binary.
+            File content as text for text files, extracted text for PDFs,
+            or base64 encoded for other binary files.
         """
         if not self._files:
             return "No input files available."
@@ -74,5 +80,53 @@ class ReadFileTool(BaseTool):
         if any(content_type.startswith(t) for t in text_types):
             return content.decode("utf-8")
 
+        if content_type == "application/pdf":
+            return self._extract_pdf_text(content, filename)
+
         encoded = base64.b64encode(content).decode("ascii")
         return f"[Binary file: {filename} ({content_type})]\nBase64: {encoded}"
+
+    @staticmethod
+    def _extract_pdf_text(content: bytes, filename: str) -> str:
+        """Extract text from PDF bytes using pypdf.
+
+        Falls back to a short message (never base64) when extraction fails, so
+        the LLM context stays small. Empty or whitespace-only pages are skipped.
+
+        Args:
+            content: Raw PDF bytes.
+            filename: Name of the PDF file (for logging/messages).
+
+        Returns:
+            Page-labelled text, or a short diagnostic message on failure.
+        """
+        try:
+            from pypdf import PdfReader  # imported lazily: absence is handled
+        except ImportError:
+            logger.warning(
+                "pypdf is not installed — cannot extract text from '%s'. "
+                "Install it with: pip install pypdf",
+                filename,
+            )
+            return (
+                f"[PDF file: {filename}] "
+                "Unable to extract text: pypdf is not installed. "
+                "Install it with: pip install pypdf"
+            )
+
+        try:
+            reader = PdfReader(io.BytesIO(content))
+            pages: list[str] = []
+            for page_num, page in enumerate(reader.pages, start=1):
+                page_text = page.extract_text()
+                if page_text and page_text.strip():  # whitespace-only is empty
+                    pages.append(f"--- Page {page_num} ---\n{page_text}")
+            if pages:
+                return "\n\n".join(pages)
+            return (
+                f"[PDF file: {filename}] "
+                "No extractable text found (the PDF may contain only images)."
+            )
+        except Exception as exc:  # any failure becomes a short message
+            logger.warning("Failed to extract text from PDF '%s': %s", filename, exc)
+            return f"[PDF file: {filename}] Failed to extract text: {exc}"
diff --git a/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py b/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py
index 92f83abd4..37994b08d 100644
--- a/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py
+++ b/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py
@@ -1,13 +1,53 @@
 """Unit tests for ReadFileTool."""
 
+from __future__ import annotations
+
 import base64
+import io
+from unittest.mock import patch
 
 import pytest
+from pypdf import PdfWriter
+from pypdf.generic import (
+    DecodedStreamObject,
+    DictionaryObject,
+    NameObject,
+)
 
 from crewai.tools.agent_tools.read_file_tool import ReadFileTool
 from crewai_files import ImageFile, PDFFile, TextFile
 
 
+def _make_pdf(*page_texts: str) -> bytes:
+    """Build a minimal valid PDF with one text-bearing page per ``page_texts`` item."""
+    writer = PdfWriter()
+    for text in page_texts:
+        writer.add_blank_page(width=200, height=200)
+        page = writer.pages[-1]  # the page that was just appended
+
+        font_dict = DictionaryObject()  # font resource referenced as /F1 below
+        font_dict[NameObject("/Type")] = NameObject("/Font")
+        font_dict[NameObject("/Subtype")] = NameObject("/Type1")
+        font_dict[NameObject("/BaseFont")] = NameObject("/Helvetica")
+        font_ref = writer._add_object(font_dict)  # NOTE: private pypdf API
+
+        resources = DictionaryObject()
+        fonts = DictionaryObject()
+        fonts[NameObject("/F1")] = font_ref
+        resources[NameObject("/Font")] = fonts
+        page[NameObject("/Resources")] = resources
+
+        stream = DecodedStreamObject()  # content stream that draws the text
+        escaped = text.replace("(", "\\(").replace(")", "\\)")  # parens delimit it
+        stream.set_data(f"BT /F1 12 Tf 50 100 Td ({escaped}) Tj ET".encode())
+        stream_ref = writer._add_object(stream)  # NOTE: private pypdf API
+        page[NameObject("/Contents")] = stream_ref
+
+    buf = io.BytesIO()  # serialize in memory; callers receive raw bytes
+    writer.write(buf)
+    return buf.getvalue()
+
+
 class TestReadFileTool:
     """Tests for ReadFileTool."""
 
@@ -75,15 +115,77 @@ class TestReadFileTool:
         decoded = base64.b64decode(b64_part)
         assert decoded == png_bytes
 
-    def test_run_pdf_file_returns_base64(self) -> None:
-        """Test reading a PDF file returns base64 encoded content."""
-        pdf_bytes = b"%PDF-1.4 some content here"
+    def test_run_pdf_file_extracts_text(self) -> None:
+        """Test reading a PDF extracts text instead of returning base64."""
+        pdf_bytes = _make_pdf("Hello World from PDF")
         self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})
 
         result = self.tool._run(file_name="doc.pdf")
 
-        assert "[Binary file:" in result
-        assert "application/pdf" in result
+        assert "Hello World from PDF" in result
+        assert "Base64" not in result  # "Base64:" labels the binary fallback
+        assert "--- Page 1 ---" in result  # page headers are numbered from 1
+
+    def test_run_pdf_multipage_extracts_all_pages(self) -> None:
+        """Test reading a multi-page PDF extracts text from every page."""
+        pdf_bytes = _make_pdf("First page content", "Second page content")
+        self.tool.set_files({"report.pdf": PDFFile(source=pdf_bytes)})
+
+        result = self.tool._run(file_name="report.pdf")
+
+        assert "First page content" in result
+        assert "Second page content" in result
+        assert "--- Page 1 ---" in result  # one header per page, numbered from 1
+        assert "--- Page 2 ---" in result
+        assert "Base64" not in result  # "Base64:" labels the binary fallback
+
+    def test_run_pdf_no_extractable_text(self) -> None:
+        """Test PDF with no extractable text returns a friendly message."""
+        # A structurally valid PDF whose single page has no text content
+        writer = PdfWriter()
+        writer.add_blank_page(width=200, height=200)
+        buf = io.BytesIO()
+        writer.write(buf)
+        blank_pdf = buf.getvalue()
+
+        self.tool.set_files({"blank.pdf": PDFFile(source=blank_pdf)})
+
+        result = self.tool._run(file_name="blank.pdf")
+
+        assert "No extractable text found" in result  # the no-text fallback message
+        assert "Base64" not in result  # "Base64:" labels the binary fallback
+
+    def test_run_pdf_corrupted_returns_error_message(self) -> None:
+        """Test that a corrupted PDF returns a short error, never base64."""
+        corrupted = b"%PDF-1.4 this is not a valid PDF structure"
+        self.tool.set_files({"bad.pdf": PDFFile(source=corrupted)})
+
+        result = self.tool._run(file_name="bad.pdf")
+
+        assert "[PDF file: bad.pdf]" in result  # diagnostics name the file
+        assert "Failed to extract text" in result  # from the broad except branch
+        assert "Base64" not in result  # "Base64:" labels the binary fallback
+
+    def test_run_pdf_no_pypdf_returns_install_message(self) -> None:
+        """Test graceful fallback when pypdf is not installed."""
+        pdf_bytes = _make_pdf("Some text")  # built before pypdf is hidden below
+        self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})
+
+        with patch.dict("sys.modules", {"pypdf": None}):  # None => ImportError
+            result = self.tool._run(file_name="doc.pdf")
+
+        assert "pypdf is not installed" in result
+        assert "Base64" not in result  # "Base64:" labels the binary fallback
+
+    def test_run_pdf_result_much_smaller_than_base64(self) -> None:
+        """Extracted text is present and far smaller than a base64-encoded PDF."""
+        pdf_bytes = _make_pdf("Short text")
+        self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})
+
+        result = self.tool._run(file_name="doc.pdf")
+
+        assert "Short text" in result  # a short error message must not pass
+        assert len(result) < len(base64.b64encode(pdf_bytes))
 
     def test_set_files_none(self) -> None:
         """Test setting files to None."""
@@ -119,4 +221,4 @@ class TestReadFileTool:
         schema = self.tool.args_schema
 
         assert "file_name" in schema.model_fields
-        assert schema.model_fields["file_name"].is_required()
\ No newline at end of file
+        assert schema.model_fields["file_name"].is_required()