crewAI/lib/crewai-files/src/crewai_files/formatting/bedrock.py

"""Bedrock content block formatter."""

from __future__ import annotations

import base64
from typing import Any

from crewai_files.core.resolved import (
    FileReference,
    InlineBase64,
    InlineBytes,
    ResolvedFileType,
    UrlReference,
)
from crewai_files.core.types import FileInput


# Maps a document MIME type to the ``format`` value placed in a Bedrock
# ``document`` content block. Content types that are neither image/* nor
# video/* and are missing from this table produce no block (the formatter
# returns None for them).
_DOCUMENT_FORMATS: dict[str, str] = {
    "application/pdf": "pdf",
    "text/csv": "csv",
    "text/plain": "txt",
    "text/markdown": "md",
    "text/html": "html",
    # Legacy binary Office formats and their OOXML successors.
    "application/msword": "doc",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/vnd.ms-excel": "xls",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
}

_VIDEO_FORMATS: dict[str, str] = {
    "video/mp4": "mp4",
    "video/quicktime": "mov",
    "video/x-matroska": "mkv",
    "video/webm": "webm",
    "video/x-flv": "flv",
    "video/mpeg": "mpeg",
    "video/3gpp": "three_gp",
}


class BedrockFormatter:
    """Formats resolved files into Bedrock Converse API content blocks.

    Produces ``image``, ``video`` or ``document`` blocks whose ``source`` is
    either inline ``bytes`` or an ``s3Location``. Content types are matched
    case-insensitively and without MIME parameters (e.g. ``; charset=utf-8``).
    """

    def __init__(self, s3_bucket_owner: str | None = None) -> None:
        """Initialize formatter.

        Args:
            s3_bucket_owner: Optional S3 bucket owner for file references.
                When set, it is added as ``bucketOwner`` to every
                ``s3Location`` source.
        """
        self.s3_bucket_owner = s3_bucket_owner

    def format_block(
        self,
        file: FileInput,
        resolved: ResolvedFileType,
        name: str | None = None,
    ) -> dict[str, Any] | None:
        """Format a resolved file into a Bedrock content block.

        Args:
            file: Original file input with metadata; its ``content_type``
                selects the kind of block.
            resolved: Resolved file.
            name: File name used for document blocks. Falls back to
                ``"document"`` when omitted or empty.

        Returns:
            Content block dict or None if not supported.

        Raises:
            ValueError: If ``resolved`` is a ``FileReference`` without a
                ``file_uri``, or a ``UrlReference``.
            TypeError: If ``resolved`` is none of the known resolved types.
        """
        content_type = file.content_type

        if isinstance(resolved, FileReference):
            if not resolved.file_uri:
                raise ValueError("Bedrock requires file_uri for FileReference (S3 URI)")
            return self._format_s3_block(content_type, resolved.file_uri, name)

        if isinstance(resolved, InlineBytes):
            return self._format_bytes_block(content_type, resolved.data, name)

        if isinstance(resolved, InlineBase64):
            # The block carries raw bytes, so undo the base64 encoding here.
            file_bytes = base64.b64decode(resolved.data)
            return self._format_bytes_block(content_type, file_bytes, name)

        if isinstance(resolved, UrlReference):
            raise ValueError(
                "Bedrock does not support URL references - resolve to bytes first"
            )

        raise TypeError(f"Unexpected resolved type: {type(resolved).__name__}")

    def _format_s3_block(
        self,
        content_type: str,
        file_uri: str,
        name: str | None,
    ) -> dict[str, Any] | None:
        """Format block with S3 location source.

        Args:
            content_type: MIME type.
            file_uri: S3 URI.
            name: File name for documents.

        Returns:
            Content block dict or None.
        """
        s3_location: dict[str, Any] = {"uri": file_uri}
        if self.s3_bucket_owner:
            s3_location["bucketOwner"] = self.s3_bucket_owner

        return self._build_block(content_type, {"s3Location": s3_location}, name)

    def _format_bytes_block(
        self,
        content_type: str,
        file_bytes: bytes,
        name: str | None,
    ) -> dict[str, Any] | None:
        """Format block with inline bytes source.

        Args:
            content_type: MIME type.
            file_bytes: Raw file bytes.
            name: File name for documents.

        Returns:
            Content block dict or None.
        """
        return self._build_block(content_type, {"bytes": file_bytes}, name)

    def _build_block(
        self,
        content_type: str,
        source: dict[str, Any],
        name: str | None,
    ) -> dict[str, Any] | None:
        """Wrap a prepared source in the block matching the content type.

        Args:
            content_type: MIME type; parameters and letter case are ignored.
            source: Ready-made ``source`` payload for the block.
            name: File name for documents.

        Returns:
            Content block dict, or None when the content type has no
            matching video or document format.
        """
        mime_type = self._normalize_content_type(content_type)

        if mime_type.startswith("image/"):
            return {
                "image": {
                    "format": self._get_image_format(mime_type),
                    "source": source,
                }
            }

        if mime_type.startswith("video/"):
            video_format = _VIDEO_FORMATS.get(mime_type)
            if not video_format:
                return None
            return {"video": {"format": video_format, "source": source}}

        doc_format = _DOCUMENT_FORMATS.get(mime_type)
        if not doc_format:
            return None

        # NOTE(review): Bedrock restricts which characters a document name may
        # contain - confirm callers pass a compliant name.
        return {
            "document": {
                "name": name or "document",
                "format": doc_format,
                "source": source,
            }
        }

    @staticmethod
    def _normalize_content_type(content_type: str) -> str:
        """Reduce a content type to its bare, lowercase ``type/subtype``.

        Args:
            content_type: MIME type, possibly with parameters such as
                ``; charset=utf-8`` or mixed letter case.

        Returns:
            Lowercase MIME type without parameters or surrounding whitespace.
        """
        return content_type.split(";", 1)[0].strip().lower()

    @staticmethod
    def _get_image_format(content_type: str) -> str:
        """Get Bedrock image format from content type.

        Args:
            content_type: MIME type; parameters and letter case are ignored.

        Returns:
            Format string for Bedrock: the MIME subtype, with ``jpg``
            rewritten to ``jpeg``.
        """
        mime_type = BedrockFormatter._normalize_content_type(content_type)
        media_type = mime_type.split("/")[-1]
        if media_type == "jpg":
            return "jpeg"
        return media_type