# crewAI/lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py

"""PDF loader for extracting text from PDF files."""

import os
from pathlib import Path
import tempfile
from typing import Any
from urllib.parse import urlparse

from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
from crewai_tools.security.safe_requests import safe_get


class PDFLoader(BaseLoader):
    """Loader that extracts per-page text from PDF files and http(s) URLs."""

    @staticmethod
    def _is_url(path: str) -> bool:
        """Return True if ``path`` parses as an ``http`` or ``https`` URL."""
        try:
            parsed = urlparse(path)
            return parsed.scheme in ("http", "https")
        except Exception:
            # Anything that cannot be parsed is treated as a local path.
            return False

    @staticmethod
    def _download_from_url(url: str, kwargs: dict[str, Any]) -> str:
        """Download PDF from a URL to a temporary file and return its path.

        The temporary file is created with ``delete=False`` so it survives
        after this function returns; the caller is responsible for removing
        it once it is no longer needed.

        Args:
            url: The URL to download from.
            kwargs: Optional dict that may contain custom headers under the
                ``"headers"`` key. When absent, a default ``Accept`` /
                ``User-Agent`` pair is sent.

        Returns:
            Path to the temporary file containing the PDF.

        Raises:
            ValueError: If the download or the write to disk fails.
        """
        headers = kwargs.get(
            "headers",
            {
                "Accept": "application/pdf",
                "User-Agent": "Mozilla/5.0 (compatible; crewai-tools PDFLoader)",
            },
        )

        temp_path: str | None = None
        try:
            response = safe_get(url, headers=headers, timeout=30)
            response.raise_for_status()

            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                temp_path = temp_file.name
                temp_file.write(response.content)
            return temp_path
        except Exception as e:
            # Do not leave a partially written file behind on failure.
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass
            raise ValueError(f"Failed to download PDF from {url}: {e!s}") from e

    def load(self, source: SourceContent, **kwargs: Any) -> LoaderResult:  # type: ignore[override]
        """Load and extract text from a PDF file or URL.

        Each page with non-blank text contributes a ``"Page N:\\n<text>"``
        section; sections are joined with blank lines. If no page yields text,
        a placeholder string naming the file is returned as the content.

        Args:
            source: The source content containing the PDF file path or URL.
            **kwargs: Optional settings. When the source is a URL, a
                ``headers`` entry is used as the request headers for the
                download.

        Returns:
            LoaderResult with extracted text content.

        Raises:
            FileNotFoundError: If the PDF file doesn't exist.
            ImportError: If required PDF libraries aren't installed.
            ValueError: If the PDF cannot be read or downloaded.
        """
        try:
            import pymupdf  # type: ignore[import-untyped]
        except ImportError as e:
            raise ImportError(
                "PDF support requires pymupdf. Install with: uv add pymupdf"
            ) from e

        file_path = source.source
        is_url = self._is_url(file_path)

        if is_url:
            # Fall back to a generic name when the URL path has no file part.
            source_name = Path(urlparse(file_path).path).name or "downloaded.pdf"
        else:
            source_name = Path(file_path).name

        text_content: list[str] = []
        metadata: dict[str, Any] = {
            "source": file_path,
            "file_name": source_name,
            "file_type": "pdf",
        }

        # Path of the downloaded temporary file, if any, so it can be removed.
        local_path: str | None = None
        try:
            if is_url:
                local_path = self._download_from_url(file_path, kwargs)
                pdf_path = local_path
            else:
                if not os.path.isfile(file_path):
                    raise FileNotFoundError(f"PDF file not found: {file_path}")
                pdf_path = file_path

            doc = pymupdf.open(pdf_path)
            try:
                metadata["num_pages"] = len(doc)

                for page_num, page in enumerate(doc, 1):
                    page_text = page.get_text()
                    # Skip pages that contain only whitespace.
                    if page_text.strip():
                        text_content.append(f"Page {page_num}:\n{page_text}")
            finally:
                # Close the document even if text extraction fails midway.
                doc.close()
        except FileNotFoundError:
            raise
        except Exception as e:
            raise ValueError(f"Error reading PDF from {file_path}: {e!s}") from e
        finally:
            # The download is only needed while parsing; remove it afterwards
            # so repeated URL loads do not accumulate temporary files.
            if local_path is not None:
                try:
                    os.unlink(local_path)
                except OSError:
                    pass

        if not text_content:
            content = f"[PDF file with no extractable text: {source_name}]"
        else:
            content = "\n\n".join(text_content)

        return LoaderResult(
            content=content,
            source=file_path,
            metadata=metadata,
            doc_id=self.generate_doc_id(source_ref=file_path, content=content),
        )