Fix #5930: Extract text from PDFs in ReadFileTool instead of returning base64

When using input_files with PDFFile, the read_file tool was returning
the entire PDF as base64-encoded binary data. This caused:
- Massive context bloat for the LLM
- Inconsistent responses and context overflow
- The same file being re-processed on each tool call

Now ReadFileTool detects application/pdf content and extracts text
using pypdf (already a dependency via crewai-files) instead of
base64-encoding the raw bytes. Each page is labeled with a page
number header for clarity. Graceful fallbacks are provided when:
- pypdf is not installed (short install message)
- The PDF contains no extractable text (friendly message)
- The PDF is corrupted (error message, never base64)

Co-Authored-By: João <joao@crewai.com>
This commit is contained in:
Devin AI
2026-05-26 13:13:56 +00:00
parent bad64b1ee6
commit 53a726ad6d
2 changed files with 164 additions and 8 deletions

View File

@@ -3,6 +3,8 @@
from __future__ import annotations
import base64
import io
import logging
from typing import TYPE_CHECKING
from pydantic import BaseModel, Field, PrivateAttr
@@ -13,6 +15,8 @@ from crewai.tools.base_tool import BaseTool
if TYPE_CHECKING:
from crewai_files import FileInput
logger = logging.getLogger(__name__)
class ReadFileToolSchema(BaseModel):
"""Schema for read file tool arguments."""
@@ -29,7 +33,8 @@ class ReadFileTool(BaseTool):
name: str = "read_file"
description: str = (
"Read content from an input file by name. "
"Returns file content as text for text files, or base64 for binary files."
"Returns file content as text for text files, "
"extracted text for PDFs, or base64 for other binary files."
)
args_schema: type[BaseModel] = ReadFileToolSchema
@@ -50,7 +55,8 @@ class ReadFileTool(BaseTool):
file_name: The name of the file to read.
Returns:
File content as text for text files, or base64 encoded for binary.
File content as text for text files, extracted text for PDFs,
or base64 encoded for other binary files.
"""
if not self._files:
return "No input files available."
@@ -74,5 +80,53 @@ class ReadFileTool(BaseTool):
if any(content_type.startswith(t) for t in text_types):
return content.decode("utf-8")
if content_type == "application/pdf":
return self._extract_pdf_text(content, filename)
encoded = base64.b64encode(content).decode("ascii")
return f"[Binary file: {filename} ({content_type})]\nBase64: {encoded}"
@staticmethod
def _extract_pdf_text(content: bytes, filename: str) -> str:
    """Extract text from PDF bytes using pypdf.

    Every failure path returns a short diagnostic message (never base64),
    so that the LLM context stays small.

    Args:
        content: Raw PDF bytes.
        filename: Name of the PDF file (for logging/messages).

    Returns:
        The text of every page that yields visible text, each prefixed
        with a ``--- Page N ---`` header and joined by blank lines; or a
        short ``[PDF file: ...]`` message when pypdf is not importable,
        the PDF cannot be read, or no page yields any text.
    """
    # Imported here rather than at module level so that a missing pypdf
    # turns into a message for the caller instead of an import failure.
    try:
        from pypdf import PdfReader
    except ImportError:
        logger.warning(
            "pypdf is not installed — cannot extract text from '%s'. "
            "Install it with: pip install pypdf",
            filename,
        )
        return (
            f"[PDF file: {filename}] "
            "Unable to extract text: pypdf is not installed. "
            "Install it with: pip install pypdf"
        )

    pages: list[str] = []
    try:
        reader = PdfReader(io.BytesIO(content))
        for page_num, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            # Whitespace-only output carries no information; treating it
            # as text would emit a bare page header and hide the
            # "no extractable text" message below.
            if page_text and page_text.strip():
                pages.append(f"--- Page {page_num} ---\n{page_text}")
    except Exception as exc:
        # Deliberately broad: whatever goes wrong while reading the PDF,
        # the tool answers with a short message instead of raising.
        logger.warning("Failed to extract text from PDF '%s': %s", filename, exc)
        return f"[PDF file: {filename}] Failed to extract text: {exc}"

    if pages:
        return "\n\n".join(pages)
    return (
        f"[PDF file: {filename}] "
        "No extractable text found (the PDF may contain only images)."
    )

View File

@@ -1,13 +1,53 @@
"""Unit tests for ReadFileTool."""
from __future__ import annotations
import base64
import io
from unittest.mock import patch
import pytest
from pypdf import PdfWriter
from pypdf.generic import (
DecodedStreamObject,
DictionaryObject,
NameObject,
)
from crewai.tools.agent_tools.read_file_tool import ReadFileTool
from crewai_files import ImageFile, PDFFile, TextFile
def _make_pdf(*page_texts: str) -> bytes:
    """Build a minimal valid PDF with extractable text on each page.

    One 200x200 page is added per entry in ``page_texts``. Each page gets
    its own Type1 Helvetica font resource registered as ``/F1`` and a
    content stream that shows the text with a single ``Tj`` operator.

    Args:
        page_texts: Text to place on each page, one page per argument.
            The text is encoded with the default ``str.encode()`` codec;
            non-ASCII characters are not handled specially.

    Returns:
        The serialized PDF document as bytes.
    """
    writer = PdfWriter()
    for text in page_texts:
        writer.add_blank_page(width=200, height=200)
        page = writer.pages[-1]

        # Font dictionary referenced from the page resources as /F1.
        font_dict = DictionaryObject()
        font_dict[NameObject("/Type")] = NameObject("/Font")
        font_dict[NameObject("/Subtype")] = NameObject("/Type1")
        font_dict[NameObject("/BaseFont")] = NameObject("/Helvetica")
        font_ref = writer._add_object(font_dict)

        resources = DictionaryObject()
        fonts = DictionaryObject()
        fonts[NameObject("/F1")] = font_ref
        resources[NameObject("/Font")] = fonts
        page[NameObject("/Resources")] = resources

        # Escape for a PDF literal string. Backslash must be handled first,
        # otherwise the backslashes added for the parentheses would be
        # doubled, and a backslash in the input would start an escape
        # sequence inside the stream.
        escaped = (
            text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
        )
        stream = DecodedStreamObject()
        stream.set_data(f"BT /F1 12 Tf 50 100 Td ({escaped}) Tj ET".encode())
        stream_ref = writer._add_object(stream)
        page[NameObject("/Contents")] = stream_ref

    buf = io.BytesIO()
    writer.write(buf)
    return buf.getvalue()
class TestReadFileTool:
"""Tests for ReadFileTool."""
@@ -75,15 +115,77 @@ class TestReadFileTool:
decoded = base64.b64decode(b64_part)
assert decoded == png_bytes
def test_run_pdf_file_extracts_text(self) -> None:
    """Test reading a PDF extracts text instead of returning base64."""
    pdf_bytes = _make_pdf("Hello World from PDF")
    self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})

    result = self.tool._run(file_name="doc.pdf")

    # The page text is returned under a page-number header.
    assert "--- Page 1 ---" in result
    assert "Hello World from PDF" in result
    # PDFs must no longer take the generic binary/base64 path.
    assert "Base64" not in result
    assert "[Binary file:" not in result
def test_run_pdf_multipage_extracts_all_pages(self) -> None:
    """Every page of a multi-page PDF contributes its text and a header."""
    page_bodies = ("First page content", "Second page content")
    document = PDFFile(source=_make_pdf(*page_bodies))
    self.tool.set_files({"report.pdf": document})

    output = self.tool._run(file_name="report.pdf")

    # Each page must appear with both its body and its numbered header.
    for index, body in enumerate(page_bodies, start=1):
        assert body in output
        assert f"--- Page {index} ---" in output
    assert "Base64" not in output
def test_run_pdf_no_extractable_text(self) -> None:
    """A PDF whose only page is blank yields the friendly no-text message."""
    # A single empty page with no content stream: nothing to extract.
    blank_writer = PdfWriter()
    blank_writer.add_blank_page(width=200, height=200)
    sink = io.BytesIO()
    blank_writer.write(sink)

    self.tool.set_files({"blank.pdf": PDFFile(source=sink.getvalue())})
    output = self.tool._run(file_name="blank.pdf")

    assert "No extractable text found" in output
    assert "Base64" not in output
def test_run_pdf_corrupted_returns_error_message(self) -> None:
    """Malformed PDF bytes produce a short error message, never base64."""
    broken_pdf = PDFFile(source=b"%PDF-1.4 this is not a valid PDF structure")
    self.tool.set_files({"bad.pdf": broken_pdf})

    output = self.tool._run(file_name="bad.pdf")

    for fragment in ("[PDF file: bad.pdf]", "Failed to extract text"):
        assert fragment in output
    assert "Base64" not in output
def test_run_pdf_no_pypdf_returns_install_message(self) -> None:
    """With pypdf unimportable, the tool answers with the install hint."""
    document = PDFFile(source=_make_pdf("Some text"))
    self.tool.set_files({"doc.pdf": document})

    # A None entry in sys.modules makes `import pypdf` raise ImportError.
    with patch.dict("sys.modules", {"pypdf": None}):
        output = self.tool._run(file_name="doc.pdf")

    assert "pypdf is not installed" in output
    assert "Base64" not in output
def test_run_pdf_result_much_smaller_than_base64(self) -> None:
    """The extracted-text result is shorter than the base64 form of the PDF."""
    raw_pdf = _make_pdf("Short text")
    self.tool.set_files({"doc.pdf": PDFFile(source=raw_pdf)})

    output = self.tool._run(file_name="doc.pdf")

    # Compare against what the old base64 path would have had to embed.
    encoded_length = len(base64.b64encode(raw_pdf))
    assert len(output) < encoded_length
def test_set_files_none(self) -> None:
"""Test setting files to None."""
@@ -119,4 +221,4 @@ class TestReadFileTool:
schema = self.tool.args_schema
assert "file_name" in schema.model_fields
assert schema.model_fields["file_name"].is_required()
assert schema.model_fields["file_name"].is_required()