Files
crewAI/lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py
Rip&Tear 5d4851eac7
Some checks failed
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Check Documentation Broken Links / Check broken links (push) Has been cancelled
Vulnerability Scan / pip-audit (push) Has been cancelled
Nightly Canary Release / Check for new commits (push) Has been cancelled
Nightly Canary Release / Build nightly packages (push) Has been cancelled
Nightly Canary Release / Publish nightly to PyPI (push) Has been cancelled
Fix SSRF redirect bypass in scraping fetches (#6331)
* Validate redirects for scraping URL fetches

* Prevent credential forwarding across redirects
2026-06-25 17:42:49 -07:00

127 lines
4.0 KiB
Python

"""PDF loader for extracting text from PDF files."""
import os
from pathlib import Path
import tempfile
from typing import Any
from urllib.parse import urlparse
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
from crewai_tools.security.safe_requests import safe_get
class PDFLoader(BaseLoader):
    """Loader for PDF files and URLs.

    Local paths are opened directly. ``http``/``https`` sources are first
    downloaded to a temporary file, which is removed again once the text has
    been extracted (or extraction has failed).
    """

    @staticmethod
    def _is_url(path: str) -> bool:
        """Check if the path is an ``http`` or ``https`` URL.

        Args:
            path: A file system path or a URL.

        Returns:
            True when ``path`` parses with an ``http`` or ``https`` scheme,
            False otherwise, including when it cannot be parsed at all.
        """
        try:
            return urlparse(path).scheme in ("http", "https")
        except Exception:
            # Unparseable input (e.g. a malformed IPv6 host or a non-string
            # value) is deliberately treated as "not a URL" rather than an
            # error, so the caller falls through to the local-file branch.
            return False

    @staticmethod
    def _remove_temp_file(path: str) -> None:
        """Delete a temporary file, ignoring failures.

        Cleanup is best effort: a file that is already gone or cannot be
        removed must not mask the result (or the real error) of the load.

        Args:
            path: Path of the temporary file to delete.
        """
        try:
            os.unlink(path)
        except OSError:
            pass

    @staticmethod
    def _download_from_url(url: str, kwargs: dict[str, Any]) -> str:
        """Download PDF from a URL to a temporary file and return its path.

        The file is created with ``delete=False``, so the caller owns it and
        is responsible for removing it after use. If the download or the
        write fails, any file created here is removed before raising.

        Args:
            url: The URL to download from.
            kwargs: Optional dict that may contain custom ``headers``. When
                absent, PDF-oriented default headers are sent.

        Returns:
            Path to the temporary file containing the PDF.

        Raises:
            ValueError: If the download fails.
        """
        headers = kwargs.get(
            "headers",
            {
                "Accept": "application/pdf",
                "User-Agent": "Mozilla/5.0 (compatible; crewai-tools PDFLoader)",
            },
        )

        temp_path = None
        try:
            response = safe_get(url, headers=headers, timeout=30)
            response.raise_for_status()

            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                # Record the name before writing so a failed write can still
                # be cleaned up below.
                temp_path = temp_file.name
                temp_file.write(response.content)
            return temp_path
        except Exception as e:
            if temp_path is not None:
                PDFLoader._remove_temp_file(temp_path)
            raise ValueError(f"Failed to download PDF from {url}: {e!s}") from e

    def load(self, source: SourceContent, **kwargs: Any) -> LoaderResult:  # type: ignore[override]
        """Load and extract text from a PDF file or URL.

        Each page with non-blank text contributes a ``Page N:`` section; the
        sections are joined with blank lines. A PDF without any extractable
        text yields a placeholder string instead.

        Args:
            source: The source content containing the PDF file path or URL.
            **kwargs: Extra options. ``headers`` is forwarded to the download
                when the source is a URL.

        Returns:
            LoaderResult with extracted text content.

        Raises:
            FileNotFoundError: If the PDF file doesn't exist.
            ImportError: If required PDF libraries aren't installed.
            ValueError: If the PDF cannot be read or downloaded.
        """
        try:
            import pymupdf  # type: ignore[import-untyped]
        except ImportError as e:
            raise ImportError(
                "PDF support requires pymupdf. Install with: uv add pymupdf"
            ) from e

        file_path = source.source
        is_url = self._is_url(file_path)

        if is_url:
            # A URL whose path has no final component (e.g. ends in "/") gets
            # a generic file name.
            source_name = Path(urlparse(file_path).path).name or "downloaded.pdf"
        else:
            source_name = Path(file_path).name

        text_content: list[str] = []
        metadata: dict[str, Any] = {
            "source": file_path,
            "file_name": source_name,
            "file_type": "pdf",
        }

        # Set only when this call downloaded a file that must be cleaned up.
        downloaded_path = None
        try:
            if is_url:
                downloaded_path = self._download_from_url(file_path, kwargs)
                pdf_path = downloaded_path
            else:
                if not os.path.isfile(file_path):
                    raise FileNotFoundError(f"PDF file not found: {file_path}")
                pdf_path = file_path

            doc = pymupdf.open(pdf_path)
            try:
                metadata["num_pages"] = len(doc)
                for page_num, page in enumerate(doc, 1):
                    page_text = page.get_text()
                    if page_text.strip():
                        text_content.append(f"Page {page_num}:\n{page_text}")
            finally:
                # Close even when extraction fails so the file handle is
                # released before the temporary file is deleted.
                doc.close()
        except FileNotFoundError:
            raise
        except Exception as e:
            raise ValueError(f"Error reading PDF from {file_path}: {e!s}") from e
        finally:
            if downloaded_path is not None:
                self._remove_temp_file(downloaded_path)

        if not text_content:
            content = f"[PDF file with no extractable text: {source_name}]"
        else:
            content = "\n\n".join(text_content)

        return LoaderResult(
            content=content,
            source=file_path,
            metadata=metadata,
            doc_id=self.generate_doc_id(source_ref=file_path, content=content),
        )