# crewAI/lib/crewai-files/src/crewai_files/processing/constraints.py

"""Provider-specific file constraints for multimodal content."""

from dataclasses import dataclass
from functools import lru_cache
from typing import Literal

from crewai_files.core.types import (
    AudioMimeType,
    ImageMimeType,
    TextContentType,
    VideoMimeType,
)


# Closed set of identifiers accepted for ``ProviderConstraints.name``.
# "openai_responses" is listed because OPENAI_RESPONSES_CONSTRAINTS is declared
# with that name; without it that declaration does not type-check against
# this Literal.
ProviderName = Literal[
    "anthropic",
    "openai",
    "openai_responses",
    "gemini",
    "bedrock",
    "azure",
]

# Image MIME types used when a provider does not declare its own list
# (this is the default for ImageConstraints.supported_formats).
DEFAULT_IMAGE_FORMATS: tuple[ImageMimeType, ...] = (
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/webp",
)

# Gemini's image list: the defaults, in the same order, followed by HEIC/HEIF.
GEMINI_IMAGE_FORMATS: tuple[ImageMimeType, ...] = (
    *DEFAULT_IMAGE_FORMATS,
    "image/heic",
    "image/heif",
)

# Audio MIME types used when a provider does not declare its own list
# (this is the default for AudioConstraints.supported_formats).
DEFAULT_AUDIO_FORMATS: tuple[AudioMimeType, ...] = (
    "audio/mp3",
    "audio/mpeg",
    "audio/wav",
    "audio/ogg",
    "audio/flac",
    "audio/aac",
    "audio/m4a",
)

# Gemini's audio list: the defaults, in the same order, followed by Opus.
GEMINI_AUDIO_FORMATS: tuple[AudioMimeType, ...] = (
    *DEFAULT_AUDIO_FORMATS,
    "audio/opus",
)

# Video MIME types used when a provider does not declare its own list
# (this is the default for VideoConstraints.supported_formats).
DEFAULT_VIDEO_FORMATS: tuple[VideoMimeType, ...] = (
    "video/mp4",
    "video/mpeg",
    "video/webm",
    "video/quicktime",
)

# Gemini's video list: the defaults, in the same order, followed by AVI and FLV.
GEMINI_VIDEO_FORMATS: tuple[VideoMimeType, ...] = (
    *DEFAULT_VIDEO_FORMATS,
    "video/x-msvideo",
    "video/x-flv",
)

# Text-like MIME types used when a provider does not declare its own list
# (this is the default for TextConstraints.supported_formats). Note that JSON
# is carried under "application/", not "text/".
DEFAULT_TEXT_FORMATS: tuple[TextContentType, ...] = (
    "text/plain",
    "text/markdown",
    "text/csv",
    "application/json",
    "text/xml",
    "text/html",
)

# Gemini's text list: every default entry plus the "application/xml" and YAML
# variants. The extra entries are interleaved with the defaults rather than
# appended, so the tuple is spelled out in full to keep this exact order.
GEMINI_TEXT_FORMATS: tuple[TextContentType, ...] = (
    "text/plain",
    "text/markdown",
    "text/csv",
    "application/json",
    "application/xml",
    "text/xml",
    "application/x-yaml",
    "text/yaml",
    "text/html",
)


@dataclass(frozen=True)
class ImageConstraints:
    """Immutable limits a provider places on image inputs.

    Only ``max_size_bytes`` is required; every other limit defaults to
    ``None`` and the format list defaults to ``DEFAULT_IMAGE_FORMATS``.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        max_width: Maximum image width in pixels.
        max_height: Maximum image height in pixels.
        max_images_per_request: Maximum number of images per request.
        supported_formats: Supported image MIME types.
    """

    max_size_bytes: int
    # NOTE(review): None on the optional limits below presumably means
    # "no limit declared" — confirm against the code that enforces them.
    max_width: int | None = None
    max_height: int | None = None
    max_images_per_request: int | None = None
    supported_formats: tuple[ImageMimeType, ...] = DEFAULT_IMAGE_FORMATS


@dataclass(frozen=True)
class PDFConstraints:
    """Immutable limits a provider places on PDF inputs.

    Only ``max_size_bytes`` is required; ``max_pages`` defaults to ``None``.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        max_pages: Maximum number of pages.
    """

    max_size_bytes: int
    # NOTE(review): None presumably means "no page limit declared" — confirm
    # against the code that enforces it.
    max_pages: int | None = None


@dataclass(frozen=True)
class AudioConstraints:
    """Immutable limits a provider places on audio inputs.

    Only ``max_size_bytes`` is required; the duration limit defaults to
    ``None`` and the format list defaults to ``DEFAULT_AUDIO_FORMATS``.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        max_duration_seconds: Maximum audio duration in seconds.
        supported_formats: Supported audio MIME types.
    """

    max_size_bytes: int
    # NOTE(review): None presumably means "no duration limit declared" —
    # confirm against the code that enforces it.
    max_duration_seconds: int | None = None
    supported_formats: tuple[AudioMimeType, ...] = DEFAULT_AUDIO_FORMATS


@dataclass(frozen=True)
class VideoConstraints:
    """Immutable limits a provider places on video inputs.

    Only ``max_size_bytes`` is required; the duration limit defaults to
    ``None`` and the format list defaults to ``DEFAULT_VIDEO_FORMATS``.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        max_duration_seconds: Maximum video duration in seconds.
        supported_formats: Supported video MIME types.
    """

    max_size_bytes: int
    # NOTE(review): None presumably means "no duration limit declared" —
    # confirm against the code that enforces it.
    max_duration_seconds: int | None = None
    supported_formats: tuple[VideoMimeType, ...] = DEFAULT_VIDEO_FORMATS


@dataclass(frozen=True)
class TextConstraints:
    """Immutable limits a provider places on text inputs.

    ``max_size_bytes`` is required; the format list defaults to
    ``DEFAULT_TEXT_FORMATS``.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        supported_formats: Supported text MIME types.
    """

    max_size_bytes: int
    supported_formats: tuple[TextContentType, ...] = DEFAULT_TEXT_FORMATS


@dataclass(frozen=True)
class ProviderConstraints:
    """Complete set of file constraints for one provider.

    Each per-category field (``image``, ``pdf``, ``audio``, ``video``,
    ``text``) defaults to ``None``; ``get_supported_content_types`` only
    reports a category when its field is set.

    The dataclass is frozen, so instances are immutable and hashable. The
    hashability is relied on by ``get_constraints_for_provider``, which is
    wrapped in ``lru_cache`` and accepts instances as its argument.

    Attributes:
        name: Provider name identifier.
        image: Image file constraints.
        pdf: PDF file constraints.
        audio: Audio file constraints.
        video: Video file constraints.
        text: Text file constraints.
        general_max_size_bytes: Maximum size for any file type.
        supports_file_upload: Whether the provider supports file upload APIs.
        file_upload_threshold_bytes: Size threshold above which to use file upload.
        supports_url_references: Whether the provider supports URL-based file references.
    """

    name: ProviderName
    # Per-category limits; a category left as None is not reported by
    # get_supported_content_types.
    image: ImageConstraints | None = None
    pdf: PDFConstraints | None = None
    audio: AudioConstraints | None = None
    video: VideoConstraints | None = None
    text: TextConstraints | None = None
    # Provider-wide settings, independent of file category.
    general_max_size_bytes: int | None = None
    supports_file_upload: bool = False
    file_upload_threshold_bytes: int | None = None
    supports_url_references: bool = False


# Anthropic: image and PDF constraints only; file upload and URL references
# are both enabled.
ANTHROPIC_CONSTRAINTS = ProviderConstraints(
    name="anthropic",
    image=ImageConstraints(
        max_size_bytes=5 * 1024 * 1024,  # 5 MB per image
        max_width=8000,
        max_height=8000,
        max_images_per_request=100,
    ),
    pdf=PDFConstraints(
        max_size_bytes=32 * 1024 * 1024,  # 32 MB request size limit
        max_pages=100,
    ),
    supports_file_upload=True,
    file_upload_threshold_bytes=5 * 1024 * 1024,  # 5 MB
    supports_url_references=True,
)

# OpenAI (Completions variant): image constraints only; file upload and URL
# references are both enabled.
OPENAI_COMPLETIONS_CONSTRAINTS = ProviderConstraints(
    name="openai",
    image=ImageConstraints(
        max_size_bytes=20 * 1024 * 1024,  # 20 MB
        max_images_per_request=10,
    ),
    supports_file_upload=True,
    file_upload_threshold_bytes=5 * 1024 * 1024,  # 5 MB
    supports_url_references=True,
)

# OpenAI (Responses variant): same image limits as the Completions variant,
# plus PDF and audio constraints.
OPENAI_RESPONSES_CONSTRAINTS = ProviderConstraints(
    name="openai_responses",
    image=ImageConstraints(
        max_size_bytes=20 * 1024 * 1024,  # 20 MB
        max_images_per_request=10,
    ),
    pdf=PDFConstraints(
        max_size_bytes=32 * 1024 * 1024,  # 32 MB total across all file inputs
        max_pages=100,
    ),
    audio=AudioConstraints(
        max_size_bytes=25 * 1024 * 1024,  # 25 MB - whisper limit
        # 25 minutes; somewhat arbitrary, taken from the transcriptions limit.
        max_duration_seconds=25 * 60,
    ),
    supports_file_upload=True,
    file_upload_threshold_bytes=5 * 1024 * 1024,  # 5 MB
    supports_url_references=True,
)

# Default OpenAI constraints: the "openai" and "gpt" keys in
# _PROVIDER_CONSTRAINTS_MAP resolve to the Completions variant through this
# alias; the Responses variant is registered under "openai_responses".
OPENAI_CONSTRAINTS = OPENAI_COMPLETIONS_CONSTRAINTS

# Gemini: the only provider here that declares all five categories, each with
# its Gemini-specific format list where one exists.
GEMINI_CONSTRAINTS = ProviderConstraints(
    name="gemini",
    image=ImageConstraints(
        max_size_bytes=100 * 1024 * 1024,  # 100 MB
        supported_formats=GEMINI_IMAGE_FORMATS,
    ),
    pdf=PDFConstraints(
        max_size_bytes=50 * 1024 * 1024,  # 50 MB
    ),
    audio=AudioConstraints(
        max_size_bytes=100 * 1024 * 1024,  # 100 MB
        max_duration_seconds=9 * 3600 + 30 * 60,  # 9.5 hours
        supported_formats=GEMINI_AUDIO_FORMATS,
    ),
    video=VideoConstraints(
        max_size_bytes=2 * 1024**3,  # 2 GB
        max_duration_seconds=60 * 60,  # 1 hour at default resolution
        supported_formats=GEMINI_VIDEO_FORMATS,
    ),
    text=TextConstraints(
        max_size_bytes=100 * 1024 * 1024,  # 100 MB
        supported_formats=GEMINI_TEXT_FORMATS,
    ),
    supports_file_upload=True,
    file_upload_threshold_bytes=20 * 1024 * 1024,  # 20 MB
    supports_url_references=True,
)

# Bedrock: image and PDF constraints only; no file-upload API is declared.
#
# The two size limits follow the AWS Bedrock Converse API documentation:
# images up to 3.75 MB (and 8,000 px per side), documents up to 4.5 MB.
# They were previously assigned the other way round (4_608_000 for images,
# 3_840_000 for PDFs), which let oversized images through validation and
# rejected PDFs the API would accept.
BEDROCK_CONSTRAINTS = ProviderConstraints(
    name="bedrock",
    image=ImageConstraints(
        max_size_bytes=3_840_000,  # 3750 * 1024 bytes (~3.75 MB) per image
        max_width=8000,
        max_height=8000,
    ),
    pdf=PDFConstraints(
        max_size_bytes=4_608_000,  # 4500 * 1024 bytes (~4.5 MB) per document
        max_pages=100,
    ),
    supports_url_references=True,  # S3 URIs supported
)

# Azure: image and audio constraints only; URL references are enabled, file
# upload is not declared.
AZURE_CONSTRAINTS = ProviderConstraints(
    name="azure",
    image=ImageConstraints(
        max_size_bytes=20 * 1024 * 1024,  # 20 MB
        max_images_per_request=10,
    ),
    audio=AudioConstraints(
        max_size_bytes=25 * 1024 * 1024,  # 25 MB - same as openai
        max_duration_seconds=25 * 60,  # 25 minutes - same as openai
    ),
    supports_url_references=True,
)


# Registry used by get_constraints_for_provider. Keys are lower-case because
# lookups lower-case the provider string first.
#
# Insertion order is significant: the substring fallback in
# get_constraints_for_provider iterates this dict in order, so think about
# precedence before reordering or inserting entries.
_PROVIDER_CONSTRAINTS_MAP: dict[str, ProviderConstraints] = {
    # Canonical provider names.
    "anthropic": ANTHROPIC_CONSTRAINTS,
    "openai": OPENAI_CONSTRAINTS,
    "openai_responses": OPENAI_RESPONSES_CONSTRAINTS,
    "gemini": GEMINI_CONSTRAINTS,
    "bedrock": BEDROCK_CONSTRAINTS,
    "azure": AZURE_CONSTRAINTS,
    # Aliases that map onto the canonical entries above.
    "claude": ANTHROPIC_CONSTRAINTS,
    "gpt": OPENAI_CONSTRAINTS,
    "google": GEMINI_CONSTRAINTS,
    "aws": BEDROCK_CONSTRAINTS,
}


@lru_cache(maxsize=32)
def get_constraints_for_provider(
    provider: str | ProviderConstraints,
) -> ProviderConstraints | None:
    """Get constraints for a provider by name or return if already ProviderConstraints.

    Resolution order:

    1. A ``ProviderConstraints`` instance is returned unchanged.
    2. The lower-cased name is looked up exactly in the registry.
    3. Otherwise the first registry key, in registry order, that occurs as a
       substring of the name wins. A matching key is skipped when a longer
       matching key contains it, so a name such as "openai_responses/..."
       resolves to the "openai_responses" entry instead of being captured
       by the shorter "openai" key that precedes it.

    Results are memoized with ``lru_cache``, so the argument must be hashable.

    Args:
        provider: Provider name string or ProviderConstraints instance.

    Returns:
        ProviderConstraints for the provider, or None if not found.
    """
    if isinstance(provider, ProviderConstraints):
        return provider

    provider_lower = provider.lower()

    exact = _PROVIDER_CONSTRAINTS_MAP.get(provider_lower)
    if exact is not None:
        return exact

    # Collect every key found in the name, preserving registry order.
    matching_keys = [key for key in _PROVIDER_CONSTRAINTS_MAP if key in provider_lower]
    for key in matching_keys:
        # Prefer the more specific key when one match is contained in another.
        shadowed = any(key != other and key in other for other in matching_keys)
        if not shadowed:
            return _PROVIDER_CONSTRAINTS_MAP[key]

    return None


def get_supported_content_types(provider: str, api: str | None = None) -> list[str]:
    """List the MIME type prefixes a provider accepts.

    One entry is reported per file category for which the resolved
    constraints declare a value, always in the order image, PDF, audio,
    video, text.

    Args:
        provider: Provider name string.
        api: Optional API variant. When it is "responses" and the provider
            name contains "openai", the Responses-variant constraints are
            used instead of the provider's default entry.

    Returns:
        List of supported MIME type prefixes (e.g., ["image/", "application/pdf"]),
        or an empty list when the provider cannot be resolved.
    """
    use_responses_variant = api == "responses" and "openai" in provider.lower()
    resolved = get_constraints_for_provider(
        "openai_responses" if use_responses_variant else provider
    )
    if not resolved:
        return []

    # NOTE(review): text formats such as "application/json" are not covered by
    # the "text/" prefix — confirm whether callers need them reported here.
    category_prefixes = (
        (resolved.image, "image/"),
        (resolved.pdf, "application/pdf"),
        (resolved.audio, "audio/"),
        (resolved.video, "video/"),
        (resolved.text, "text/"),
    )
    return [prefix for category, prefix in category_prefixes if category]