From 53a726ad6d74c166c44accf0639c7e2c8e49b808 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 26 May 2026 13:13:56 +0000
Subject: [PATCH] Fix #5930: Extract text from PDFs in ReadFileTool instead of returning base64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using input_files with PDFFile, the read_file tool was returning
the entire PDF as base64-encoded binary data. This caused:
- Massive context bloat for the LLM
- Inconsistent responses and context overflow
- The same file being re-processed on each tool call

Now ReadFileTool detects application/pdf content and extracts text using
pypdf (already a dependency via crewai-files) instead of base64-encoding
the raw bytes. Each page is labeled with a page number header for clarity.

Graceful fallbacks are provided when:
- pypdf is not installed (short install message)
- The PDF contains no extractable text (friendly message)
- The PDF is corrupted (error message, never base64)

Co-Authored-By: João
---
 .../tools/agent_tools/read_file_tool.py       |  59 ++++++++-
 .../tools/agent_tools/test_read_file_tool.py  | 114 +++++++++++++++++-
 2 files changed, 165 insertions(+), 8 deletions(-)

diff --git a/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py b/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py
index e41d5390d..a016857b4 100644
--- a/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py
+++ b/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py
@@ -3,6 +3,8 @@ from __future__ import annotations
 
 import base64
+import io
+import logging
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel, Field, PrivateAttr
 
@@ -13,6 +15,8 @@ from crewai.tools.base_tool import BaseTool
 if TYPE_CHECKING:
     from crewai_files import FileInput
 
+logger = logging.getLogger(__name__)
+
 
 class ReadFileToolSchema(BaseModel):
     """Schema for read file tool arguments."""
@@ -29,7 +33,8 @@ class ReadFileTool(BaseTool):
     name: str = "read_file"
     description: str = (
         "Read content from an input file by name. "
-        "Returns file content as text for text files, or base64 for binary files."
+        "Returns file content as text for text files, "
+        "extracted text for PDFs, or base64 for other binary files."
     )
 
     args_schema: type[BaseModel] = ReadFileToolSchema
@@ -50,7 +55,8 @@ class ReadFileTool(BaseTool):
             file_name: The name of the file to read.
 
         Returns:
-            File content as text for text files, or base64 encoded for binary.
+            File content as text for text files, extracted text for PDFs,
+            or base64 encoded for other binary files.
         """
         if not self._files:
             return "No input files available."
@@ -74,5 +80,54 @@ class ReadFileTool(BaseTool):
         if any(content_type.startswith(t) for t in text_types):
             return content.decode("utf-8")
 
+        if content_type == "application/pdf":
+            return self._extract_pdf_text(content, filename)
+
         encoded = base64.b64encode(content).decode("ascii")
         return f"[Binary file: {filename} ({content_type})]\nBase64: {encoded}"
+
+    @staticmethod
+    def _extract_pdf_text(content: bytes, filename: str) -> str:
+        """Extract text from PDF bytes using pypdf.
+
+        Falls back to a short error message (never base64) when extraction
+        is not possible, so that the LLM context stays small.
+
+        Args:
+            content: Raw PDF bytes.
+            filename: Name of the PDF file (for logging/messages).
+
+        Returns:
+            Extracted text, or a short diagnostic message on failure.
+        """
+        try:
+            from pypdf import PdfReader
+        except ImportError:
+            logger.warning(
+                "pypdf is not installed — cannot extract text from '%s'. "
+                "Install it with: pip install pypdf",
+                filename,
+            )
+            return (
+                f"[PDF file: {filename}] "
+                "Unable to extract text: pypdf is not installed. "
+                "Install it with: pip install pypdf"
+            )
+
+        try:
+            reader = PdfReader(io.BytesIO(content))
+            pages: list[str] = []
+            for page_num, page in enumerate(reader.pages, start=1):
+                page_text = page.extract_text()
+                # Skip pages whose extracted text is empty or whitespace-only.
+                if page_text and page_text.strip():
+                    pages.append(f"--- Page {page_num} ---\n{page_text}")
+            if pages:
+                return "\n\n".join(pages)
+            return (
+                f"[PDF file: {filename}] "
+                "No extractable text found (the PDF may contain only images)."
+            )
+        except Exception as exc:
+            logger.warning("Failed to extract text from PDF '%s': %s", filename, exc)
+            return f"[PDF file: {filename}] Failed to extract text: {exc}"
diff --git a/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py b/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py
index 92f83abd4..37994b08d 100644
--- a/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py
+++ b/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py
@@ -1,13 +1,53 @@
 """Unit tests for ReadFileTool."""
 
+from __future__ import annotations
+
 import base64
+import io
+from unittest.mock import patch
 
 import pytest
+from pypdf import PdfWriter
+from pypdf.generic import (
+    DecodedStreamObject,
+    DictionaryObject,
+    NameObject,
+)
 
 from crewai.tools.agent_tools.read_file_tool import ReadFileTool
 from crewai_files import ImageFile, PDFFile, TextFile
 
 
+def _make_pdf(*page_texts: str) -> bytes:
+    """Build a minimal valid PDF with extractable text on each page."""
+    writer = PdfWriter()
+    for text in page_texts:
+        writer.add_blank_page(width=200, height=200)
+        page = writer.pages[-1]
+
+        font_dict = DictionaryObject()
+        font_dict[NameObject("/Type")] = NameObject("/Font")
+        font_dict[NameObject("/Subtype")] = NameObject("/Type1")
+        font_dict[NameObject("/BaseFont")] = NameObject("/Helvetica")
+        font_ref = writer._add_object(font_dict)
+
+        resources = DictionaryObject()
+        fonts = DictionaryObject()
+        fonts[NameObject("/F1")] = font_ref
+        resources[NameObject("/Font")] = fonts
+        page[NameObject("/Resources")] = resources
+
+        stream = DecodedStreamObject()
+        escaped = text.replace("(", "\\(").replace(")", "\\)")
+        stream.set_data(f"BT /F1 12 Tf 50 100 Td ({escaped}) Tj ET".encode())
+        stream_ref = writer._add_object(stream)
+        page[NameObject("/Contents")] = stream_ref
+
+    buf = io.BytesIO()
+    writer.write(buf)
+    return buf.getvalue()
+
+
 class TestReadFileTool:
     """Tests for ReadFileTool."""
 
@@ -75,15 +115,77 @@ class TestReadFileTool:
         decoded = base64.b64decode(b64_part)
         assert decoded == png_bytes
 
-    def test_run_pdf_file_returns_base64(self) -> None:
-        """Test reading a PDF file returns base64 encoded content."""
-        pdf_bytes = b"%PDF-1.4 some content here"
+    def test_run_pdf_file_extracts_text(self) -> None:
+        """Test reading a PDF extracts text instead of returning base64."""
+        pdf_bytes = _make_pdf("Hello World from PDF")
         self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})
 
         result = self.tool._run(file_name="doc.pdf")
 
-        assert "[Binary file:" in result
-        assert "application/pdf" in result
+        assert "Hello World from PDF" in result
+        assert "Base64" not in result
+        assert "--- Page 1 ---" in result
+
+    def test_run_pdf_multipage_extracts_all_pages(self) -> None:
+        """Test reading a multi-page PDF extracts text from every page."""
+        pdf_bytes = _make_pdf("First page content", "Second page content")
+        self.tool.set_files({"report.pdf": PDFFile(source=pdf_bytes)})
+
+        result = self.tool._run(file_name="report.pdf")
+
+        assert "First page content" in result
+        assert "Second page content" in result
+        assert "--- Page 1 ---" in result
+        assert "--- Page 2 ---" in result
+        assert "Base64" not in result
+
+    def test_run_pdf_no_extractable_text(self) -> None:
+        """Test PDF with no extractable text returns a friendly message."""
+        # A blank page with no text content
+        writer = PdfWriter()
+        writer.add_blank_page(width=200, height=200)
+        buf = io.BytesIO()
+        writer.write(buf)
+        blank_pdf = buf.getvalue()
+
+        self.tool.set_files({"blank.pdf": PDFFile(source=blank_pdf)})
+
+        result = self.tool._run(file_name="blank.pdf")
+
+        assert "No extractable text found" in result
+        assert "Base64" not in result
+
+    def test_run_pdf_corrupted_returns_error_message(self) -> None:
+        """Test that a corrupted PDF returns a short error, never base64."""
+        corrupted = b"%PDF-1.4 this is not a valid PDF structure"
+        self.tool.set_files({"bad.pdf": PDFFile(source=corrupted)})
+
+        result = self.tool._run(file_name="bad.pdf")
+
+        assert "[PDF file: bad.pdf]" in result
+        assert "Failed to extract text" in result
+        assert "Base64" not in result
+
+    def test_run_pdf_no_pypdf_returns_install_message(self) -> None:
+        """Test graceful fallback when pypdf is not installed."""
+        pdf_bytes = _make_pdf("Some text")
+        self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})
+
+        with patch.dict("sys.modules", {"pypdf": None}):
+            result = self.tool._run(file_name="doc.pdf")
+
+        assert "pypdf is not installed" in result
+        assert "Base64" not in result
+
+    def test_run_pdf_result_much_smaller_than_base64(self) -> None:
+        """Extracted text should be far smaller than a base64-encoded PDF."""
+        pdf_bytes = _make_pdf("Short text")
+        self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})
+
+        result = self.tool._run(file_name="doc.pdf")
+
+        base64_size = len(base64.b64encode(pdf_bytes))
+        assert len(result) < base64_size
 
     def test_set_files_none(self) -> None:
         """Test setting files to None."""
@@ -119,4 +221,4 @@ class TestReadFileTool:
         schema = self.tool.args_schema
 
         assert "file_name" in schema.model_fields
-        assert schema.model_fields["file_name"].is_required()
\ No newline at end of file
+        assert schema.model_fields["file_name"].is_required()