mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-07-03 14:09:24 +00:00
Some checks failed
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Check Documentation Broken Links / Check broken links (push) Has been cancelled
Vulnerability Scan / pip-audit (push) Has been cancelled
Nightly Canary Release / Check for new commits (push) Has been cancelled
Nightly Canary Release / Build nightly packages (push) Has been cancelled
Nightly Canary Release / Publish nightly to PyPI (push) Has been cancelled
* Validate redirects for scraping URL fetches * Prevent credential forwarding across redirects
127 lines
4.0 KiB
Python
127 lines
4.0 KiB
Python
"""PDF loader for extracting text from PDF files."""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
import tempfile
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
|
|
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
|
|
from crewai_tools.rag.source_content import SourceContent
|
|
from crewai_tools.security.safe_requests import safe_get
|
|
|
|
|
|
class PDFLoader(BaseLoader):
    """Loader that extracts per-page text from local PDF files and http(s) URLs."""

    @staticmethod
    def _is_url(path: str) -> bool:
        """Return True when ``path`` parses as a URL with an http or https scheme.

        Args:
            path: A file-system path or URL string.

        Returns:
            True for ``http``/``https`` URLs, False for anything else,
            including values that ``urlparse`` cannot parse.
        """
        try:
            parsed = urlparse(path)
            return parsed.scheme in ("http", "https")
        except Exception:
            # Anything urlparse rejects is treated as "not a URL" so the
            # caller falls back to the local-file branch.
            return False

    @staticmethod
    def _remove_temp_file(path: str) -> None:
        """Delete a temporary file, ignoring errors if it is already gone.

        Args:
            path: Path of the temporary file to remove.
        """
        try:
            os.unlink(path)
        except OSError:
            # Best-effort cleanup: a failure to delete must never mask the
            # result (or the original error) of the surrounding operation.
            pass

    @staticmethod
    def _download_from_url(url: str, kwargs: dict[str, Any]) -> str:
        """Download PDF from a URL to a temporary file and return its path.

        The temporary file is created with ``delete=False`` so that it
        survives this call; the caller owns it and is responsible for
        removing it once it is no longer needed.

        Args:
            url: The URL to download from.
            kwargs: Optional dict that may contain custom headers under the
                ``"headers"`` key. When absent, default PDF-oriented headers
                are sent.

        Returns:
            Path to the temporary file containing the PDF.

        Raises:
            ValueError: If the download fails.
        """
        headers = kwargs.get(
            "headers",
            {
                "Accept": "application/pdf",
                "User-Agent": "Mozilla/5.0 (compatible; crewai-tools PDFLoader)",
            },
        )

        temp_path = None
        try:
            response = safe_get(url, headers=headers, timeout=30)
            response.raise_for_status()

            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                # Record the name before writing so a failed write can still
                # be cleaned up below.
                temp_path = temp_file.name
                temp_file.write(response.content)
            return temp_path
        except Exception as e:
            # Do not leave a partially written file behind on failure.
            if temp_path is not None:
                PDFLoader._remove_temp_file(temp_path)
            raise ValueError(f"Failed to download PDF from {url}: {e!s}") from e

    def load(self, source: SourceContent, **kwargs: Any) -> LoaderResult:  # type: ignore[override]
        """Load and extract text from a PDF file or URL.

        Pages are numbered from 1 and pages whose text is empty or
        whitespace-only are skipped. If no page yields text, the content is
        a placeholder string naming the source file. When the source is a
        URL, the PDF is downloaded to a temporary file that is removed
        before this method returns or raises.

        Args:
            source: The source content containing the PDF file path or URL.
            **kwargs: Extra options; for URL sources these are forwarded to
                the download step, which reads an optional ``"headers"`` entry.

        Returns:
            LoaderResult with extracted text content. Its metadata contains
            ``source``, ``file_name``, ``file_type`` and ``num_pages``.

        Raises:
            FileNotFoundError: If the PDF file doesn't exist.
            ImportError: If required PDF libraries aren't installed.
            ValueError: If the PDF cannot be read or downloaded.
        """
        try:
            import pymupdf  # type: ignore[import-untyped]
        except ImportError as e:
            raise ImportError(
                "PDF support requires pymupdf. Install with: uv add pymupdf"
            ) from e

        file_path = source.source
        is_url = self._is_url(file_path)

        if is_url:
            # A URL path ending in "/" has no file name component.
            source_name = Path(urlparse(file_path).path).name or "downloaded.pdf"
        else:
            source_name = Path(file_path).name

        text_content: list[str] = []
        metadata: dict[str, Any] = {
            "source": file_path,
            "file_name": source_name,
            "file_type": "pdf",
        }

        # Set only for URL sources; marks a downloaded file we must delete.
        temp_path = None
        try:
            if is_url:
                temp_path = self._download_from_url(file_path, kwargs)
                pdf_path = temp_path
            else:
                if not os.path.isfile(file_path):
                    raise FileNotFoundError(f"PDF file not found: {file_path}")
                pdf_path = file_path

            doc = pymupdf.open(pdf_path)
            try:
                metadata["num_pages"] = len(doc)

                for page_num, page in enumerate(doc, 1):
                    page_text = page.get_text()
                    if page_text.strip():
                        text_content.append(f"Page {page_num}:\n{page_text}")
            finally:
                # Close even when extraction fails so the handle is released
                # before the temporary file is deleted.
                doc.close()
        except FileNotFoundError:
            raise
        except Exception as e:
            raise ValueError(f"Error reading PDF from {file_path}: {e!s}") from e
        finally:
            if temp_path is not None:
                self._remove_temp_file(temp_path)

        if not text_content:
            content = f"[PDF file with no extractable text: {source_name}]"
        else:
            content = "\n\n".join(text_content)

        return LoaderResult(
            content=content,
            source=file_path,
            metadata=metadata,
            doc_id=self.generate_doc_id(source_ref=file_path, content=content),
        )
|