Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-09 16:18:30 +00:00)
* fix: ensure parameters in RagTool.add, add typing, tests
* feat: substitute pymupdf for pypdf, better parsing performance

Co-authored-by: Lorenze Jay <63378463+lorenzejay@users.noreply.github.com>
114 lines · 3.5 KiB · Python
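For context on the swap described in the commit message, an illustrative before/after sketch of the extraction call (not taken from this PR's diff; the pypdf snippet is an assumption about the previous implementation, the pymupdf one mirrors the loader below):

# Assumed previous approach (pypdf):
#   from pypdf import PdfReader
#   text = "\n".join(page.extract_text() or "" for page in PdfReader(path).pages)
#
# Approach used by the loader below (pymupdf):
#   import pymupdf
#   with pymupdf.open(path) as doc:
#       text = "\n".join(page.get_text() for page in doc)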
"""PDF loader for extracting text from PDF files."""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, cast
|
|
from urllib.parse import urlparse
|
|
import urllib.request
|
|
|
|
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
|
|
from crewai_tools.rag.source_content import SourceContent
|
|
|
|
|
|
class PDFLoader(BaseLoader):
|
|
"""Loader for PDF files and URLs."""
|
|
|
|
@staticmethod
|
|
def _is_url(path: str) -> bool:
|
|
"""Check if the path is a URL."""
|
|
try:
|
|
parsed = urlparse(path)
|
|
return parsed.scheme in ("http", "https")
|
|
except Exception:
|
|
return False
|
|
|
|
@staticmethod
|
|
def _download_pdf(url: str) -> bytes:
|
|
"""Download PDF content from a URL.
|
|
|
|
Args:
|
|
url: The URL to download from.
|
|
|
|
Returns:
|
|
The PDF content as bytes.
|
|
|
|
Raises:
|
|
ValueError: If the download fails.
|
|
"""
|
|
|
|
try:
|
|
with urllib.request.urlopen(url, timeout=30) as response: # noqa: S310
|
|
return cast(bytes, response.read())
|
|
except Exception as e:
|
|
raise ValueError(f"Failed to download PDF from {url}: {e!s}") from e
|
|
|
|
def load(self, source: SourceContent, **kwargs: Any) -> LoaderResult: # type: ignore[override]
|
|
"""Load and extract text from a PDF file or URL.
|
|
|
|
Args:
|
|
source: The source content containing the PDF file path or URL.
|
|
|
|
Returns:
|
|
LoaderResult with extracted text content.
|
|
|
|
Raises:
|
|
FileNotFoundError: If the PDF file doesn't exist.
|
|
ImportError: If required PDF libraries aren't installed.
|
|
ValueError: If the PDF cannot be read or downloaded.
|
|
"""
|
|
try:
|
|
import pymupdf # type: ignore[import-untyped]
|
|
except ImportError as e:
|
|
raise ImportError(
|
|
"PDF support requires pymupdf. Install with: uv add pymupdf"
|
|
) from e
|
|
|
|
file_path = source.source
|
|
is_url = self._is_url(file_path)
|
|
|
|
if is_url:
|
|
source_name = Path(urlparse(file_path).path).name or "downloaded.pdf"
|
|
else:
|
|
source_name = Path(file_path).name
|
|
|
|
text_content: list[str] = []
|
|
metadata: dict[str, Any] = {
|
|
"source": file_path,
|
|
"file_name": source_name,
|
|
"file_type": "pdf",
|
|
}
|
|
|
|
try:
|
|
if is_url:
|
|
pdf_bytes = self._download_pdf(file_path)
|
|
doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
|
|
else:
|
|
if not os.path.isfile(file_path):
|
|
raise FileNotFoundError(f"PDF file not found: {file_path}")
|
|
doc = pymupdf.open(file_path)
|
|
|
|
metadata["num_pages"] = len(doc)
|
|
|
|
for page_num, page in enumerate(doc, 1):
|
|
page_text = page.get_text()
|
|
if page_text.strip():
|
|
text_content.append(f"Page {page_num}:\n{page_text}")
|
|
|
|
doc.close()
|
|
except FileNotFoundError:
|
|
raise
|
|
except Exception as e:
|
|
raise ValueError(f"Error reading PDF from {file_path}: {e!s}") from e
|
|
|
|
if not text_content:
|
|
content = f"[PDF file with no extractable text: {source_name}]"
|
|
else:
|
|
content = "\n\n".join(text_content)
|
|
|
|
return LoaderResult(
|
|
content=content,
|
|
source=file_path,
|
|
metadata=metadata,
|
|
doc_id=self.generate_doc_id(source_ref=file_path, content=content),
|
|
)
|
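A minimal usage sketch, assuming SourceContent can be constructed directly from a path string and that PDFLoader (as defined above) needs no constructor arguments; the LoaderResult fields referenced are the ones populated by load():

from crewai_tools.rag.source_content import SourceContent

loader = PDFLoader()  # assumption: BaseLoader requires no constructor arguments
result = loader.load(SourceContent("./docs/example.pdf"))  # hypothetical local path or URL
print(result.metadata.get("num_pages"))  # page count recorded by load()
print(result.content[:200])              # extracted text, prefixed per page with "Page N:"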