mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 16:18:30 +00:00
72 lines
2.3 KiB
Python
72 lines
2.3 KiB
Python
"""PDF loader for extracting text from PDF files."""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
|
|
from crewai_tools.rag.source_content import SourceContent
|
|
|
|
|
|
class PDFLoader(BaseLoader):
    """Loader that turns a PDF file on disk into plain text."""

    def load(self, source: SourceContent, **kwargs) -> LoaderResult:
        """Read a PDF and return its text, one labelled section per page.

        Each page that yields non-blank text becomes a ``Page N:`` section;
        sections are separated by a blank line. Pages whose extracted text is
        empty or whitespace-only are left out. When no page yields text, the
        content is a placeholder string naming the file instead.

        Args:
            source: Source content whose ``source`` attribute is the path of
                the PDF file.
            **kwargs: Accepted but not used by this loader.

        Returns:
            LoaderResult carrying the text, the source path, metadata
            (``source``, ``file_name``, ``file_type``, ``num_pages``) and a
            document id obtained from ``self.generate_doc_id``.

        Raises:
            ImportError: If neither pypdf nor PyPDF2 can be imported.
            FileNotFoundError: If the path does not point to an existing file.
            ValueError: If opening or parsing the PDF fails for any reason;
                the underlying exception is chained as the cause.
        """
        # Prefer pypdf; fall back to PyPDF2 under the same local name.
        try:
            import pypdf
        except ImportError:
            try:
                import PyPDF2 as pypdf  # noqa: N813
            except ImportError as import_error:
                raise ImportError(
                    "PDF support requires pypdf or PyPDF2. Install with: uv add pypdf"
                ) from import_error

        pdf_path = source.source
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        source_ref = str(pdf_path)
        pdf_name = Path(pdf_path).name
        metadata: dict[str, Any] = {
            "source": source_ref,
            "file_name": pdf_name,
            "file_type": "pdf",
        }

        # Any failure while opening or parsing is surfaced as a ValueError.
        try:
            with open(pdf_path, "rb") as handle:
                reader = pypdf.PdfReader(handle)
                metadata["num_pages"] = len(reader.pages)

                # Page numbers are 1-based and count skipped (blank) pages too.
                extracted = (page.extract_text() for page in reader.pages)
                page_sections = [
                    f"Page {number}:\n{text}"
                    for number, text in enumerate(extracted, 1)
                    if text.strip()
                ]
        except Exception as read_error:
            raise ValueError(
                f"Error reading PDF file {pdf_path}: {read_error!s}"
            ) from read_error

        content = (
            "\n\n".join(page_sections)
            if page_sections
            else f"[PDF file with no extractable text: {pdf_name}]"
        )

        return LoaderResult(
            content=content,
            source=source_ref,
            metadata=metadata,
            doc_id=self.generate_doc_id(source_ref=source_ref, content=content),
        )
|