mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-07-03 06:08:15 +00:00
Fix #5930: Extract text from PDFs in ReadFileTool instead of returning base64
When using input_files with PDFFile, the read_file tool was returning the entire PDF as base64-encoded binary data. This caused:
- Massive context bloat for the LLM
- Inconsistent responses and context overflow
- The same file being re-processed on each tool call

Now ReadFileTool detects application/pdf content and extracts text using pypdf (already a dependency via crewai-files) instead of base64-encoding the raw bytes. Each page is labeled with a page number header for clarity.

Graceful fallbacks are provided when:
- pypdf is not installed (short install message)
- The PDF contains no extractable text (friendly message)
- The PDF is corrupted (error message, never base64)

Co-Authored-By: João <joao@crewai.com>
This commit is contained in:
@@ -3,6 +3,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from pydantic import BaseModel, Field, PrivateAttr
|
||||
@@ -13,6 +15,8 @@ from crewai.tools.base_tool import BaseTool
|
||||
if TYPE_CHECKING:
|
||||
from crewai_files import FileInput
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ReadFileToolSchema(BaseModel):
|
||||
"""Schema for read file tool arguments."""
|
||||
@@ -29,7 +33,8 @@ class ReadFileTool(BaseTool):
|
||||
name: str = "read_file"
|
||||
description: str = (
|
||||
"Read content from an input file by name. "
|
||||
"Returns file content as text for text files, or base64 for binary files."
|
||||
"Returns file content as text for text files, "
|
||||
"extracted text for PDFs, or base64 for other binary files."
|
||||
)
|
||||
args_schema: type[BaseModel] = ReadFileToolSchema
|
||||
|
||||
@@ -50,7 +55,8 @@ class ReadFileTool(BaseTool):
|
||||
file_name: The name of the file to read.
|
||||
|
||||
Returns:
|
||||
File content as text for text files, or base64 encoded for binary.
|
||||
File content as text for text files, extracted text for PDFs,
|
||||
or base64 encoded for other binary files.
|
||||
"""
|
||||
if not self._files:
|
||||
return "No input files available."
|
||||
@@ -74,5 +80,53 @@ class ReadFileTool(BaseTool):
|
||||
if any(content_type.startswith(t) for t in text_types):
|
||||
return content.decode("utf-8")
|
||||
|
||||
if content_type == "application/pdf":
|
||||
return self._extract_pdf_text(content, filename)
|
||||
|
||||
encoded = base64.b64encode(content).decode("ascii")
|
||||
return f"[Binary file: {filename} ({content_type})]\nBase64: {encoded}"
|
||||
|
||||
@staticmethod
def _extract_pdf_text(content: bytes, filename: str) -> str:
    """Extract text from PDF bytes using pypdf.

    Each page that yields text is emitted under a ``--- Page N ---``
    header, and pages are joined with blank lines. Falls back to a short
    diagnostic message (never base64) when extraction is not possible, so
    that the LLM context stays small.

    Args:
        content: Raw PDF bytes.
        filename: Name of the PDF file (for logging/messages).

    Returns:
        Extracted text, or a short diagnostic message when pypdf cannot
        be imported, no page yields text, or parsing raises.
    """
    # Imported lazily so a missing pypdf degrades to a message instead of
    # an import-time failure.
    try:
        from pypdf import PdfReader
    except ImportError:
        logger.warning(
            "pypdf is not installed — cannot extract text from '%s'. "
            "Install it with: pip install pypdf",
            filename,
        )
        return (
            f"[PDF file: {filename}] "
            "Unable to extract text: pypdf is not installed. "
            "Install it with: pip install pypdf"
        )

    try:
        reader = PdfReader(io.BytesIO(content))
        pages: list[str] = []
        for page_num, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            # Whitespace-only output carries no content; skip it so such
            # pages do not defeat the "no extractable text" fallback below.
            if page_text and page_text.strip():
                pages.append(f"--- Page {page_num} ---\n{page_text}")
        if pages:
            return "\n\n".join(pages)
        return (
            f"[PDF file: {filename}] "
            "No extractable text found (the PDF may contain only images)."
        )
    except Exception as exc:
        # Deliberately broad: any parser failure is reported as a short
        # message rather than propagated to the caller.
        logger.warning("Failed to extract text from PDF '%s': %s", filename, exc)
        return f"[PDF file: {filename}] Failed to extract text: {exc}"
|
||||
|
||||
@@ -1,13 +1,53 @@
|
||||
"""Unit tests for ReadFileTool."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfWriter
|
||||
from pypdf.generic import (
|
||||
DecodedStreamObject,
|
||||
DictionaryObject,
|
||||
NameObject,
|
||||
)
|
||||
|
||||
from crewai.tools.agent_tools.read_file_tool import ReadFileTool
|
||||
from crewai_files import ImageFile, PDFFile, TextFile
|
||||
|
||||
|
||||
def _make_pdf(*page_texts: str) -> bytes:
    """Build a minimal valid PDF with extractable text on each page.

    One 200x200 page is created per argument. Each page gets a Helvetica
    Type1 font resource registered as ``/F1`` and a content stream that
    draws the text with a single ``Tj`` operator.

    Args:
        *page_texts: Text to place on each page, in page order.

    Returns:
        The serialized PDF as bytes.
    """
    # Backslash and both parentheses are special inside a PDF literal
    # string; escape all three in one pass so a backslash in the input is
    # not taken as the start of an escape sequence.
    literal_escapes = str.maketrans({"\\": "\\\\", "(": "\\(", ")": "\\)"})

    writer = PdfWriter()
    for text in page_texts:
        writer.add_blank_page(width=200, height=200)
        page = writer.pages[-1]

        font_dict = DictionaryObject()
        font_dict[NameObject("/Type")] = NameObject("/Font")
        font_dict[NameObject("/Subtype")] = NameObject("/Type1")
        font_dict[NameObject("/BaseFont")] = NameObject("/Helvetica")
        font_ref = writer._add_object(font_dict)

        resources = DictionaryObject()
        fonts = DictionaryObject()
        fonts[NameObject("/F1")] = font_ref
        resources[NameObject("/Font")] = fonts
        page[NameObject("/Resources")] = resources

        stream = DecodedStreamObject()
        escaped = text.translate(literal_escapes)
        stream.set_data(f"BT /F1 12 Tf 50 100 Td ({escaped}) Tj ET".encode())
        stream_ref = writer._add_object(stream)
        page[NameObject("/Contents")] = stream_ref

    buf = io.BytesIO()
    writer.write(buf)
    return buf.getvalue()
|
||||
|
||||
|
||||
class TestReadFileTool:
|
||||
"""Tests for ReadFileTool."""
|
||||
|
||||
@@ -75,15 +115,77 @@ class TestReadFileTool:
|
||||
decoded = base64.b64decode(b64_part)
|
||||
assert decoded == png_bytes
|
||||
|
||||
def test_run_pdf_file_returns_base64(self) -> None:
|
||||
"""Test reading a PDF file returns base64 encoded content."""
|
||||
pdf_bytes = b"%PDF-1.4 some content here"
|
||||
def test_run_pdf_file_extracts_text(self) -> None:
    """Test reading a PDF extracts text instead of returning base64."""
    pdf_bytes = _make_pdf("Hello World from PDF")
    self.tool.set_files({"doc.pdf": PDFFile(source=pdf_bytes)})

    result = self.tool._run(file_name="doc.pdf")

    # PDFs must take the text-extraction path, not the generic
    # "[Binary file: ...]" base64 wrapper used for other binary content.
    assert "[Binary file:" not in result
    assert "Hello World from PDF" in result
    assert "Base64" not in result
    assert "--- Page 1 ---" in result
|
||||
|
||||
def test_run_pdf_multipage_extracts_all_pages(self) -> None:
    """Every page of a multi-page PDF appears, labeled, in the output."""
    two_page_pdf = _make_pdf("First page content", "Second page content")
    self.tool.set_files({"report.pdf": PDFFile(source=two_page_pdf)})

    output = self.tool._run(file_name="report.pdf")

    # Page bodies first, then their page-number headers.
    for expected in (
        "First page content",
        "Second page content",
        "--- Page 1 ---",
        "--- Page 2 ---",
    ):
        assert expected in output
    assert "Base64" not in output
|
||||
|
||||
def test_run_pdf_no_extractable_text(self) -> None:
    """A PDF without any text yields the friendly notice, not base64."""
    # One blank page: a PDF with no text content on it.
    blank_writer = PdfWriter()
    blank_writer.add_blank_page(width=200, height=200)
    sink = io.BytesIO()
    blank_writer.write(sink)

    self.tool.set_files({"blank.pdf": PDFFile(source=sink.getvalue())})

    output = self.tool._run(file_name="blank.pdf")

    assert "No extractable text found" in output
    assert "Base64" not in output
|
||||
|
||||
def test_run_pdf_corrupted_returns_error_message(self) -> None:
    """A malformed PDF produces a short error message, never base64."""
    # Has a PDF header but no valid document structure after it.
    broken_payload = b"%PDF-1.4 this is not a valid PDF structure"
    self.tool.set_files({"bad.pdf": PDFFile(source=broken_payload)})

    output = self.tool._run(file_name="bad.pdf")

    for expected in ("[PDF file: bad.pdf]", "Failed to extract text"):
        assert expected in output
    assert "Base64" not in output
|
||||
|
||||
def test_run_pdf_no_pypdf_returns_install_message(self) -> None:
    """Without an importable pypdf the tool returns the install hint."""
    # Build the fixture before hiding pypdf; _make_pdf itself needs it.
    sample_pdf = _make_pdf("Some text")
    self.tool.set_files({"doc.pdf": PDFFile(source=sample_pdf)})

    # A None entry in sys.modules makes `import pypdf` raise ImportError.
    hidden_pypdf = patch.dict("sys.modules", {"pypdf": None})
    with hidden_pypdf:
        output = self.tool._run(file_name="doc.pdf")

    assert "pypdf is not installed" in output
    assert "Base64" not in output
|
||||
|
||||
def test_run_pdf_result_much_smaller_than_base64(self) -> None:
    """The tool output is shorter than the base64 encoding of the PDF."""
    sample_pdf = _make_pdf("Short text")
    self.tool.set_files({"doc.pdf": PDFFile(source=sample_pdf)})

    output = self.tool._run(file_name="doc.pdf")

    # Compare against the length of the base64-encoded raw PDF bytes.
    encoded_length = len(base64.b64encode(sample_pdf))
    assert len(output) < encoded_length
|
||||
|
||||
def test_set_files_none(self) -> None:
|
||||
"""Test setting files to None."""
|
||||
@@ -119,4 +221,4 @@ class TestReadFileTool:
|
||||
schema = self.tool.args_schema
|
||||
|
||||
assert "file_name" in schema.model_fields
|
||||
assert schema.model_fields["file_name"].is_required()
|
||||
assert schema.model_fields["file_name"].is_required()
|
||||
|
||||
Reference in New Issue
Block a user