feat: add multimodal support to LLM providers

- Add format_multimodal_content() to all LLM providers
- Support inline base64 and file reference formats
- Add FileResolver integration for upload caching
- Add module exports for files package
Greyson LaLonde
2026-01-21 20:05:33 -05:00
parent 50728b10e8
commit 771eccfcdf
20 changed files with 2382 additions and 15 deletions
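For reviewers, a minimal end-to-end sketch of the surface this commit adds, stitched together from the diffs below. The file path and model id are illustrative; calling llm.call() with a list of message dicts mirrors the integration tests at the bottom of this commit.

from crewai.llm import LLM
from crewai.utilities.files import normalize_input_files

# Illustrative input: a local chart image.
files = normalize_input_files(["./chart.png"])

llm = LLM(model="openai/gpt-4o-mini")
if llm.supports_multimodal():
    blocks = llm.format_multimodal_content(files)
    messages = [{
        "role": "user",
        "content": [llm.format_text_content("Describe this chart."), *blocks],
    }]
    print(llm.call(messages))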


@@ -70,6 +70,7 @@ if TYPE_CHECKING:
from crewai.llms.providers.anthropic.completion import AnthropicThinkingConfig
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
from crewai.utilities.types import LLMMessage
try:
@@ -683,7 +684,7 @@ class LLM(BaseLLM):
"temperature": self.temperature,
"top_p": self.top_p,
"n": self.n,
"stop": self.stop,
"stop": self.stop or None,
"max_tokens": self.max_tokens or self.max_completion_tokens,
"presence_penalty": self.presence_penalty,
"frequency_penalty": self.frequency_penalty,
@@ -931,7 +932,6 @@ class LLM(BaseLLM):
self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
if not tool_calls or not available_functions:
if response_model and self.is_litellm:
instructor_instance = InternalInstructor(
content=full_response,
@@ -1144,8 +1144,12 @@ class LLM(BaseLLM):
if response_model:
params["response_model"] = response_model
response = litellm.completion(**params)
if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
if (
hasattr(response, "usage")
and not isinstance(response.usage, type)
and response.usage
):
usage_info = response.usage
self._track_token_usage_internal(usage_info)
@@ -1273,7 +1277,11 @@ class LLM(BaseLLM):
params["response_model"] = response_model
response = await litellm.acompletion(**params)
if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
if (
hasattr(response, "usage")
and not isinstance(response.usage, type)
and response.usage
):
usage_info = response.usage
self._track_token_usage_internal(usage_info)
@@ -1363,7 +1371,7 @@ class LLM(BaseLLM):
"""
full_response = ""
chunk_count = 0
usage_info = None
accumulated_tool_args: defaultdict[int, AccumulatedToolArgs] = defaultdict(
@@ -2205,3 +2213,107 @@ class LLM(BaseLLM):
stop=copy.deepcopy(self.stop, memo) if self.stop else None,
**filtered_params,
)
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
For litellm, check common vision-enabled model prefixes.
Returns:
True if the model likely supports images.
"""
vision_prefixes = (
"gpt-4o",
"gpt-4-turbo",
"gpt-4-vision",
"gpt-4.1",
"claude-3",
"claude-4",
"gemini",
)
model_lower = self.model.lower()
return any(
model_lower.startswith(p) or f"/{p}" in model_lower for p in vision_prefixes
)
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported for multimodal input.
Determines supported types based on the underlying model.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
model_lower = self.model.lower()
if "gemini" in model_lower:
return ["image/", "audio/", "video/", "application/pdf", "text/"]
if "claude-3" in model_lower or "claude-4" in model_lower:
return ["image/", "application/pdf"]
return ["image/"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as multimodal content blocks for litellm.
Uses OpenAI-compatible format which litellm translates to provider format.
Uses FileResolver for consistent base64 encoding.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache (not used by litellm but kept for interface consistency).
Returns:
List of content blocks in OpenAI's expected format.
"""
import base64
from crewai.utilities.files import (
FileResolver,
FileResolverConfig,
InlineBase64,
)
if not self.supports_multimodal():
return []
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
# LiteLLM uses OpenAI-compatible format
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for file_input in files.values():
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "openai")
if isinstance(resolved, InlineBase64):
content_blocks.append(
{
"type": "image_url",
"image_url": {
"url": f"data:{resolved.content_type};base64,{resolved.data}"
},
}
)
else:
# Fallback to direct base64 encoding
data = base64.b64encode(file_input.read()).decode("ascii")
content_blocks.append(
{
"type": "image_url",
"image_url": {"url": f"data:{content_type};base64,{data}"},
}
)
return content_blocks
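A quick sanity sketch of the prefix matching above: the f"/{p}" in model_lower clause is what lets provider-prefixed ids match, not just bare model names. Model ids here are illustrative.

vision_prefixes = ("gpt-4o", "gpt-4-turbo", "gpt-4-vision", "gpt-4.1", "claude-3", "claude-4", "gemini")

def likely_vision(model: str) -> bool:
    m = model.lower()
    return any(m.startswith(p) or f"/{p}" in m for p in vision_prefixes)

assert likely_vision("gpt-4o-mini")        # bare id matches by prefix
assert likely_vision("openai/gpt-4o")      # provider-prefixed id matches via "/gpt-4o"
assert not likely_vision("gpt-3.5-turbo")  # non-vision model is rejected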


@@ -35,6 +35,7 @@ if TYPE_CHECKING:
from crewai.agent.core import Agent
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
from crewai.utilities.types import LLMMessage
@@ -280,6 +281,54 @@ class BaseLLM(ABC):
# Default implementation - subclasses should override with model-specific values
return DEFAULT_CONTEXT_WINDOW_SIZE
def supports_multimodal(self) -> bool:
"""Check if the LLM supports multimodal inputs.
Returns:
True if the LLM supports images, PDFs, audio, or video.
"""
return False
def supported_multimodal_content_types(self) -> list[str]:
"""Get the content types supported by this LLM for multimodal input.
Returns:
List of supported MIME type prefixes (e.g., ["image/", "application/pdf"]).
"""
return []
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as multimodal content blocks for the LLM.
Subclasses should override this to provide provider-specific formatting.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in the provider's expected format.
"""
return []
def format_text_content(self, text: str) -> dict[str, Any]:
"""Format text as a content block for the LLM.
Default implementation uses OpenAI/Anthropic format.
Subclasses should override for provider-specific formatting.
Args:
text: The text content to format.
Returns:
A content block in the provider's expected format.
"""
return {"type": "text", "text": text}
# Common helper methods for native SDK implementations
def _emit_call_started_event(
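Since the base class returns conservative defaults (False and empty lists), a provider only opts in by overriding the three hooks. A minimal sketch of such an override; the class name and block shape are hypothetical, the BaseLLM import path is assumed (it is not shown in this diff), and the other required BaseLLM members are elided.

from typing import Any

from crewai.llms.base_llm import BaseLLM  # import path assumed; not shown in this diff

class ExampleVisionLLM(BaseLLM):
    """Sketch of a provider opting in to multimodal support."""

    # ... __init__, call(), and other required BaseLLM members omitted ...

    def supports_multimodal(self) -> bool:
        return True

    def supported_multimodal_content_types(self) -> list[str]:
        return ["image/"]

    def format_multimodal_content(self, files, upload_cache=None) -> list[dict[str, Any]]:
        # Hypothetical block shape; the real providers below each use their own schema.
        return [
            {"type": "input_image", "bytes": f.read()}
            for f in files.values()
            if f.content_type.startswith("image/")
        ]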


@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
import json
import logging
import os
@@ -20,6 +21,9 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
DEFAULT_CACHE_TTL = "ephemeral"
try:
from anthropic import Anthropic, AsyncAnthropic
@@ -1231,3 +1235,138 @@ class AnthropicCompletion(BaseLLM):
"total_tokens": input_tokens + output_tokens,
}
return {"total_tokens": 0}
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
All Claude 3+ models support vision and PDFs.
Returns:
True if the model supports images and PDFs.
"""
return "claude-3" in self.model.lower() or "claude-4" in self.model.lower()
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Anthropic for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
return ["image/", "application/pdf"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
enable_caching: bool = True,
cache_ttl: str | None = None,
) -> list[dict[str, Any]]:
"""Format files as Anthropic multimodal content blocks.
Anthropic supports both base64 inline format and file references via Files API.
Uses FileResolver to determine the best delivery method based on file size.
Supports prompt caching to reduce costs and latency for repeated file usage.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
enable_caching: Whether to add cache_control markers (default: True).
cache_ttl: Cache TTL - "ephemeral" (5min) or "1h" (1hr for supported models).
Returns:
List of content blocks in Anthropic's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
file_list = list(files.values())
num_files = len(file_list)
for i, file_input in enumerate(file_list):
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "anthropic")
block: dict[str, Any] = {}
if isinstance(resolved, FileReference):
if content_type.startswith("image/"):
block = {
"type": "image",
"source": {
"type": "file",
"file_id": resolved.file_id,
},
}
elif content_type == "application/pdf":
block = {
"type": "document",
"source": {
"type": "file",
"file_id": resolved.file_id,
},
}
elif isinstance(resolved, InlineBase64):
if content_type.startswith("image/"):
block = {
"type": "image",
"source": {
"type": "base64",
"media_type": resolved.content_type,
"data": resolved.data,
},
}
elif content_type == "application/pdf":
block = {
"type": "document",
"source": {
"type": "base64",
"media_type": resolved.content_type,
"data": resolved.data,
},
}
else:
data = base64.b64encode(file_input.read()).decode("ascii")
if content_type.startswith("image/"):
block = {
"type": "image",
"source": {
"type": "base64",
"media_type": content_type,
"data": data,
},
}
elif content_type == "application/pdf":
block = {
"type": "document",
"source": {
"type": "base64",
"media_type": content_type,
"data": data,
},
}
if block and enable_caching and i == num_files - 1:
cache_control: dict[str, str] = {"type": cache_ttl or DEFAULT_CACHE_TTL}
block["cache_control"] = cache_control
if block:
content_blocks.append(block)
return content_blocks
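Given the logic above, an image plus a PDF resolved inline produce blocks like the following, with cache_control attached only to the final block so the whole file prefix is cacheable. Base64 payloads are elided.

blocks = [
    {
        "type": "image",
        "source": {"type": "base64", "media_type": "image/png", "data": "..."},
    },
    {
        "type": "document",
        "source": {"type": "base64", "media_type": "application/pdf", "data": "..."},
        "cache_control": {"type": "ephemeral"},  # only the last block is marked
    },
]
# format_multimodal_content(files, cache_ttl="1h") would emit {"type": "1h"} instead.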


@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
import json
import logging
import os
@@ -18,6 +19,7 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
try:
@@ -1016,3 +1018,85 @@ class AzureCompletion(BaseLLM):
async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
"""Async context manager exit."""
await self.aclose()
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
Azure OpenAI vision-enabled models include GPT-4o and GPT-4 Turbo with Vision.
Returns:
True if the model supports images.
"""
vision_models = ("gpt-4o", "gpt-4-turbo", "gpt-4-vision", "gpt-4v")
return any(self.model.lower().startswith(m) for m in vision_models)
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Azure for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
return ["image/"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as Azure OpenAI multimodal content blocks.
Azure OpenAI uses the same image_url format as OpenAI.
Uses FileResolver for consistent base64 encoding.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache (not used by Azure but kept for interface consistency).
Returns:
List of content blocks in Azure OpenAI's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
FileResolver,
FileResolverConfig,
InlineBase64,
)
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
# Azure doesn't support file uploads for images, so just use inline
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for file_input in files.values():
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "azure")
if isinstance(resolved, InlineBase64):
content_blocks.append(
{
"type": "image_url",
"image_url": {
"url": f"data:{resolved.content_type};base64,{resolved.data}"
},
}
)
else:
# Fallback to direct base64 encoding
data = base64.b64encode(file_input.read()).decode("ascii")
content_blocks.append(
{
"type": "image_url",
"image_url": {"url": f"data:{content_type};base64,{data}"},
}
)
return content_blocks
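One behavioral note on the startswith check above: Azure routes requests by deployment name, so assuming self.model carries the deployment name (as is typical for Azure OpenAI), detection only works when the deployment is named after the underlying model. A sketch:

vision_models = ("gpt-4o", "gpt-4-turbo", "gpt-4-vision", "gpt-4v")

def detected(model: str) -> bool:
    return any(model.lower().startswith(m) for m in vision_models)

assert detected("gpt-4o-eastus-deploy")      # deployment named after the model: detected
assert not detected("my-vision-deployment")  # custom deployment name: not detected,
                                             # so format_multimodal_content() returns []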


@@ -33,6 +33,7 @@ if TYPE_CHECKING:
)
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
try:
@@ -1450,3 +1451,92 @@ class BedrockCompletion(BaseLLM):
# Default context window size
return int(8192 * CONTEXT_WINDOW_USAGE_RATIO)
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
Claude 3 models on Bedrock support vision.
Returns:
True if the model supports images.
"""
vision_models = ("anthropic.claude-3",)
return any(self.model.lower().startswith(m) for m in vision_models)
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Bedrock for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
return ["image/", "application/pdf"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as Bedrock Converse API multimodal content blocks.
Bedrock Converse API uses specific formats for images and documents with raw bytes.
Uses FileResolver to get InlineBytes format for Bedrock's byte-based API.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache (not used by Bedrock but kept for interface consistency).
Returns:
List of content blocks in Bedrock's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
FileResolver,
FileResolverConfig,
InlineBytes,
)
content_blocks: list[dict[str, Any]] = []
# Bedrock uses raw bytes, configure resolver accordingly
config = FileResolverConfig(prefer_upload=False, use_bytes_for_bedrock=True)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for name, file_input in files.items():
content_type = file_input.content_type
resolved = resolver.resolve(file_input, "bedrock")
if isinstance(resolved, InlineBytes):
file_bytes = resolved.data
else:
# Fallback to reading directly
file_bytes = file_input.read()
if content_type.startswith("image/"):
media_type = content_type.split("/")[-1]
if media_type == "jpg":
media_type = "jpeg"
content_blocks.append(
{
"image": {
"format": media_type,
"source": {"bytes": file_bytes},
}
}
)
elif content_type == "application/pdf":
content_blocks.append(
{
"document": {
"name": name,
"format": "pdf",
"source": {"bytes": file_bytes},
}
}
)
return content_blocks
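The Converse API blocks produced above carry raw bytes and a short format token rather than a full MIME type; note the jpg-to-jpeg normalization. A shape sketch for one JPEG image and one PDF, with byte payloads elided:

blocks = [
    {"image": {"format": "jpeg", "source": {"bytes": b"..."}}},  # "image/jpg" is normalized to "jpeg"
    {"document": {"name": "report", "format": "pdf", "source": {"bytes": b"..."}}},
]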


@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
import json
import logging
import os
@@ -19,6 +20,10 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import (
FileInput,
UploadCache,
)
try:
@@ -516,17 +521,31 @@ class GeminiCompletion(BaseLLM):
role = message["role"]
content = message["content"]
- # Convert content to string if it's a list
+ # Build parts list from content
+ parts: list[types.Part] = []
if isinstance(content, list):
- text_content = " ".join(
- str(item.get("text", "")) if isinstance(item, dict) else str(item)
- for item in content
- )
+ for item in content:
+ if isinstance(item, dict):
+ if "text" in item:
+ parts.append(types.Part.from_text(text=str(item["text"])))
+ elif "inlineData" in item:
+ inline = item["inlineData"]
+ parts.append(
+ types.Part.from_bytes(
+ data=base64.b64decode(inline["data"]),
+ mime_type=inline["mimeType"],
+ )
+ )
+ else:
+ parts.append(types.Part.from_text(text=str(item)))
else:
- text_content = str(content) if content else ""
+ parts.append(types.Part.from_text(text=str(content) if content else ""))
if role == "system":
# Extract system instruction - Gemini handles it separately
+ text_content = " ".join(
+ p.text for p in parts if hasattr(p, "text") and p.text
+ )
if system_instruction:
system_instruction += f"\n\n{text_content}"
else:
@@ -536,9 +555,7 @@ class GeminiCompletion(BaseLLM):
gemini_role = "model" if role == "assistant" else "user"
# Create Content object
- gemini_content = types.Content(
- role=gemini_role, parts=[types.Part.from_text(text=text_content)]
- )
+ gemini_content = types.Content(role=gemini_role, parts=parts)
contents.append(gemini_content)
return contents, system_instruction
@@ -1060,3 +1077,106 @@ class GeminiCompletion(BaseLLM):
)
)
return result
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
Gemini models support images, audio, video, and PDFs.
Returns:
True if the model supports multimodal inputs.
"""
return True
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Gemini for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
return ["image/", "audio/", "video/", "application/pdf", "text/"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as Gemini multimodal content blocks.
Gemini supports both inlineData format and file references via File API.
Uses FileResolver to determine the best delivery method based on file size.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in Gemini's expected format.
"""
from crewai.utilities.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
# Create resolver with optional cache
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for file_input in files.values():
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "gemini")
if isinstance(resolved, FileReference) and resolved.file_uri:
# Use file reference format for uploaded files
content_blocks.append(
{
"fileData": {
"mimeType": resolved.content_type,
"fileUri": resolved.file_uri,
}
}
)
elif isinstance(resolved, InlineBase64):
# Use inline format for smaller files
content_blocks.append(
{
"inlineData": {
"mimeType": resolved.content_type,
"data": resolved.data,
}
}
)
else:
# Fallback to base64 encoding
data = base64.b64encode(file_input.read()).decode("ascii")
content_blocks.append(
{
"inlineData": {
"mimeType": content_type,
"data": data,
}
}
)
return content_blocks
def format_text_content(self, text: str) -> dict[str, Any]:
"""Format text as a Gemini content block.
Gemini uses {"text": "..."} format instead of {"type": "text", "text": "..."}.
Args:
text: The text content to format.
Returns:
A content block in Gemini's expected format.
"""
return {"text": text}


@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
from collections.abc import AsyncIterator
import json
import logging
@@ -30,6 +31,7 @@ if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
class OpenAICompletion(BaseLLM):
@@ -1048,3 +1050,101 @@ class OpenAICompletion(BaseLLM):
formatted_messages.append(message)
return formatted_messages
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
OpenAI vision-enabled models include GPT-4o, GPT-4.1, and o-series.
Returns:
True if the model supports images.
"""
vision_models = (
"gpt-4o",
"gpt-4.1",
"gpt-4-turbo",
"gpt-4-vision",
"o1",
"o3",
"o4",
)
return any(self.model.lower().startswith(m) for m in vision_models)
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by OpenAI for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
return ["image/"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as OpenAI multimodal content blocks.
OpenAI supports both base64 data URLs and file_id references via Files API.
Uses FileResolver to determine the best delivery method based on file size.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in OpenAI's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for file_input in files.values():
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "openai")
if isinstance(resolved, FileReference):
content_blocks.append(
{
"type": "file",
"file": {
"file_id": resolved.file_id,
},
}
)
elif isinstance(resolved, InlineBase64):
content_blocks.append(
{
"type": "image_url",
"image_url": {
"url": f"data:{resolved.content_type};base64,{resolved.data}"
},
}
)
else:
data = base64.b64encode(file_input.read()).decode("ascii")
content_blocks.append(
{
"type": "image_url",
"image_url": {"url": f"data:{content_type};base64,{data}"},
}
)
return content_blocks
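Callers that want upload reuse can pass the process-wide cache; when the resolver returns a FileReference, the block carries a file_id instead of a data URL. A sketch, assuming get_upload_cache() returns the shared UploadCache exported below; the constructor is shown with only a model id and its real signature may differ, and the file_id value is a placeholder.

from crewai.utilities.files import ImageFile, get_upload_cache

llm = OpenAICompletion(model="gpt-4o-mini")  # constructor args beyond the model id are elided
files = {"chart": ImageFile(source=b"...")}  # image bytes elided

blocks = llm.format_multimodal_content(files, upload_cache=get_upload_cache())
# Inline resolution -> {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
# Upload resolution -> {"type": "file", "file": {"file_id": "file-abc123"}}  # placeholder id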


@@ -0,0 +1,210 @@
"""File handling utilities for crewAI tasks."""
from crewai.utilities.files.cleanup import (
cleanup_expired_files,
cleanup_provider_files,
cleanup_uploaded_files,
)
from crewai.utilities.files.content_types import (
AudioContentType,
AudioExtension,
AudioFile,
BaseFile,
File,
FileMode,
ImageContentType,
ImageExtension,
ImageFile,
PDFContentType,
PDFExtension,
PDFFile,
TextContentType,
TextExtension,
TextFile,
VideoContentType,
VideoExtension,
VideoFile,
)
from crewai.utilities.files.file import (
FileBytes,
FilePath,
FileSource,
FileSourceInput,
FileStream,
RawFileInput,
)
from crewai.utilities.files.processing import (
ANTHROPIC_CONSTRAINTS,
BEDROCK_CONSTRAINTS,
GEMINI_CONSTRAINTS,
OPENAI_CONSTRAINTS,
AudioConstraints,
FileHandling,
FileProcessingError,
FileProcessor,
FileTooLargeError,
FileValidationError,
ImageConstraints,
PDFConstraints,
ProcessingDependencyError,
ProviderConstraints,
UnsupportedFileTypeError,
VideoConstraints,
get_constraints_for_provider,
)
from crewai.utilities.files.resolved import (
FileReference,
InlineBase64,
InlineBytes,
ResolvedFile,
ResolvedFileType,
UrlReference,
)
from crewai.utilities.files.resolver import (
FileResolver,
FileResolverConfig,
create_resolver,
)
from crewai.utilities.files.upload_cache import (
CachedUpload,
UploadCache,
get_upload_cache,
reset_upload_cache,
)
from crewai.utilities.files.uploaders import FileUploader, UploadResult, get_uploader
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
def wrap_file_source(source: FileSource) -> FileInput:
"""Wrap a FileSource in the appropriate typed FileInput wrapper.
Args:
source: The file source to wrap.
Returns:
Typed FileInput wrapper based on content type.
"""
content_type = source.content_type
if content_type.startswith("image/"):
return ImageFile(source=source)
if content_type.startswith("audio/"):
return AudioFile(source=source)
if content_type.startswith("video/"):
return VideoFile(source=source)
if content_type == "application/pdf":
return PDFFile(source=source)
# Default to text for anything else
return TextFile(source=source)
def normalize_input_files(
input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
"""Convert a list of file sources to a named dictionary of FileInputs.
Args:
input_files: List of file source inputs or File objects.
Returns:
Dictionary mapping names to FileInput wrappers.
"""
from pathlib import Path
result: dict[str, FileInput] = {}
for i, item in enumerate(input_files):
# If it's already a typed File wrapper, use it directly
if isinstance(item, BaseFile):
name = item.filename or f"file_{i}"
# Remove extension from name for cleaner keys
if "." in name:
name = name.rsplit(".", 1)[0]
result[name] = item
continue
file_source: FilePath | FileBytes | FileStream
if isinstance(item, (FilePath, FileBytes, FileStream)):
file_source = item
elif isinstance(item, Path):
file_source = FilePath(path=item)
elif isinstance(item, str):
file_source = FilePath(path=Path(item))
elif isinstance(item, (bytes, memoryview)):
file_source = FileBytes(data=bytes(item))
else:
continue
name = file_source.filename or f"file_{i}"
result[name] = wrap_file_source(file_source)
return result
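A quick sketch of the naming behavior above, assuming FilePath.filename yields the file's basename with its extension; exact keys depend on whether the source carries a filename, and the paths are illustrative.

from pathlib import Path

from crewai.utilities.files import ImageFile, normalize_input_files

named = normalize_input_files([
    "reports/q3.pdf",                 # str -> FilePath; key is the source filename, e.g. "q3.pdf"
    Path("images/chart.png"),         # Path -> FilePath; key "chart.png"
    b"\x89PNG...",                    # raw bytes -> FileBytes; no filename, so key "file_2"
    ImageFile(source=b"\x89PNG..."),  # already typed; key from filename (extension stripped) or "file_3"
])
# named maps each key to a typed FileInput chosen by content type.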
__all__ = [
"ANTHROPIC_CONSTRAINTS",
"BEDROCK_CONSTRAINTS",
"GEMINI_CONSTRAINTS",
"OPENAI_CONSTRAINTS",
"AudioConstraints",
"AudioContentType",
"AudioExtension",
"AudioFile",
"BaseFile",
"CachedUpload",
"File",
"FileBytes",
"FileHandling",
"FileInput",
"FileMode",
"FilePath",
"FileProcessingError",
"FileProcessor",
"FileReference",
"FileResolver",
"FileResolverConfig",
"FileSource",
"FileSourceInput",
"FileStream",
"FileTooLargeError",
"FileUploader",
"FileValidationError",
"ImageConstraints",
"ImageContentType",
"ImageExtension",
"ImageFile",
"InlineBase64",
"InlineBytes",
"PDFConstraints",
"PDFContentType",
"PDFExtension",
"PDFFile",
"ProcessingDependencyError",
"ProviderConstraints",
"RawFileInput",
"ResolvedFile",
"ResolvedFileType",
"TextContentType",
"TextExtension",
"TextFile",
"UnsupportedFileTypeError",
"UploadCache",
"UploadResult",
"UrlReference",
"VideoConstraints",
"VideoContentType",
"VideoExtension",
"VideoFile",
"cleanup_expired_files",
"cleanup_provider_files",
"cleanup_uploaded_files",
"create_resolver",
"get_constraints_for_provider",
"get_upload_cache",
"get_uploader",
"normalize_input_files",
"reset_upload_cache",
"wrap_file_source",
]


@@ -0,0 +1,62 @@
"""File processing module for multimodal content handling.
This module provides validation, transformation, and processing utilities
for files used in multimodal LLM interactions.
"""
from crewai.utilities.files.processing.constraints import (
ANTHROPIC_CONSTRAINTS,
BEDROCK_CONSTRAINTS,
GEMINI_CONSTRAINTS,
OPENAI_CONSTRAINTS,
AudioConstraints,
ImageConstraints,
PDFConstraints,
ProviderConstraints,
VideoConstraints,
get_constraints_for_provider,
)
from crewai.utilities.files.processing.enums import FileHandling
from crewai.utilities.files.processing.exceptions import (
FileProcessingError,
FileTooLargeError,
FileValidationError,
ProcessingDependencyError,
UnsupportedFileTypeError,
)
from crewai.utilities.files.processing.processor import FileProcessor
from crewai.utilities.files.processing.validators import (
validate_audio,
validate_file,
validate_image,
validate_pdf,
validate_text,
validate_video,
)
__all__ = [
"ANTHROPIC_CONSTRAINTS",
"BEDROCK_CONSTRAINTS",
"GEMINI_CONSTRAINTS",
"OPENAI_CONSTRAINTS",
"AudioConstraints",
"FileHandling",
"FileProcessingError",
"FileProcessor",
"FileTooLargeError",
"FileValidationError",
"ImageConstraints",
"PDFConstraints",
"ProcessingDependencyError",
"ProviderConstraints",
"UnsupportedFileTypeError",
"VideoConstraints",
"get_constraints_for_provider",
"validate_audio",
"validate_file",
"validate_image",
"validate_pdf",
"validate_text",
"validate_video",
]


@@ -0,0 +1,104 @@
interactions:
- request:
body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"What
type of document is this? Answer in one word."},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stream":false}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- application/json
accept-encoding:
- ACCEPT-ENCODING-XXX
anthropic-version:
- '2023-06-01'
connection:
- keep-alive
content-length:
- '748'
content-type:
- application/json
host:
- api.anthropic.com
x-api-key:
- X-API-KEY-XXX
x-stainless-arch:
- X-STAINLESS-ARCH-XXX
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- X-STAINLESS-OS-XXX
x-stainless-package-version:
- 0.71.1
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.10
x-stainless-timeout:
- NOT_GIVEN
method: POST
uri: https://api.anthropic.com/v1/messages
response:
body:
string: !!binary |
H4sIAAAAAAAA/3WQTUvEMBCG/8ucW2jr7rL25sKCKHrQiyASYjJsw6ZJzUxEKf3vTheLX3hKeJ8n
8zIZoY8WPbRgvM4Wy7NyXXbaHXPZVM2qrpoGCnBWhJ4Oqqovd/nBnt92tF1dX+z3u6t7ffO8FYff
B5wtJNIHlCBFPweayBHrwBKZGBjl1j6Oi8/4NpPT0cIdUu4RpqcCiOOgEmqKQQAGqzinAJ+A8CVj
MDIhZO8LyKfSdgQXhsyK4xEDQVtvmo3UatOhMjKMXQzqp1ItXLD9jy1v5wYcOuwxaa/W/V//i9bd
bzoVEDN/j1ayDqZXZ1CxwySLzl9ldbIwTR/rySkqnAEAAA==
headers:
CF-RAY:
- CF-RAY-XXX
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Thu, 22 Jan 2026 00:18:50 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Robots-Tag:
- none
anthropic-organization-id:
- ANTHROPIC-ORGANIZATION-ID-XXX
anthropic-ratelimit-input-tokens-limit:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
anthropic-ratelimit-input-tokens-remaining:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
anthropic-ratelimit-input-tokens-reset:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
anthropic-ratelimit-output-tokens-limit:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
anthropic-ratelimit-output-tokens-remaining:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
anthropic-ratelimit-output-tokens-reset:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
anthropic-ratelimit-requests-limit:
- '4000'
anthropic-ratelimit-requests-remaining:
- '3999'
anthropic-ratelimit-requests-reset:
- '2026-01-22T00:18:50Z'
anthropic-ratelimit-tokens-limit:
- ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
anthropic-ratelimit-tokens-remaining:
- ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
anthropic-ratelimit-tokens-reset:
- ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
cf-cache-status:
- DYNAMIC
request-id:
- REQUEST-ID-XXX
strict-transport-security:
- STS-XXX
x-envoy-upstream-service-time:
- '750'
status:
code: 200
message: OK
version: 1

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,104 @@
interactions:
- request:
body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"What
type of document is this? Answer in one word."},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stream":false}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- application/json
accept-encoding:
- ACCEPT-ENCODING-XXX
anthropic-version:
- '2023-06-01'
connection:
- keep-alive
content-length:
- '748'
content-type:
- application/json
host:
- api.anthropic.com
x-api-key:
- X-API-KEY-XXX
x-stainless-arch:
- X-STAINLESS-ARCH-XXX
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- X-STAINLESS-OS-XXX
x-stainless-package-version:
- 0.71.1
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.10
x-stainless-timeout:
- NOT_GIVEN
method: POST
uri: https://api.anthropic.com/v1/messages
response:
body:
string: !!binary |
H4sIAAAAAAAA/3WQTUvEMBCG/8ucW2hju4eeRUU97EFRFAkhGbZh06Qmk1Up/e9OF4tf7CnhfZ7J
y2SCIRh00IF2Khssz8q27JXd51JUoqkrIaAAa1gY0k5W9bbptXo7PD60l/V1f/V0J+5vxQ079DHi
YmFKaoccxOCWQKVkEylPHOngCfnWPU+rT/i+kOPRwfb8AuaXAhKFUUZUKXhO0RtJOXr4AglfM3rN
4z47V0A+NnYTWD9mkhT26BN09UZsuFPpHqXmx8gGL38r1coZm1NsnV0acOxxwKicbIf//jet+790
LiBk+hk1vA7Gg9UoyWLkRZd/MioamOdP24g1JZkBAAA=
headers:
CF-RAY:
- CF-RAY-XXX
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Thu, 22 Jan 2026 00:18:56 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Robots-Tag:
- none
anthropic-organization-id:
- ANTHROPIC-ORGANIZATION-ID-XXX
anthropic-ratelimit-input-tokens-limit:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
anthropic-ratelimit-input-tokens-remaining:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
anthropic-ratelimit-input-tokens-reset:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
anthropic-ratelimit-output-tokens-limit:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
anthropic-ratelimit-output-tokens-remaining:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
anthropic-ratelimit-output-tokens-reset:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
anthropic-ratelimit-requests-limit:
- '4000'
anthropic-ratelimit-requests-remaining:
- '3999'
anthropic-ratelimit-requests-reset:
- '2026-01-22T00:18:55Z'
anthropic-ratelimit-tokens-limit:
- ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
anthropic-ratelimit-tokens-remaining:
- ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
anthropic-ratelimit-tokens-reset:
- ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
cf-cache-status:
- DYNAMIC
request-id:
- REQUEST-ID-XXX
strict-transport-security:
- STS-XXX
x-envoy-upstream-service-time:
- '648'
status:
code: 200
message: OK
version: 1

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,329 @@
"""Integration tests for LLM multimodal functionality with cassettes.
These tests make actual API calls (recorded via VCR cassettes) to verify
multimodal content is properly sent and processed by each provider.
"""
from pathlib import Path
import pytest
from crewai.llm import LLM
from crewai.utilities.files import File, ImageFile, PDFFile, TextFile
# Path to test data files
TEST_DATA_DIR = Path(__file__).parent.parent.parent.parent.parent / "data"
TEST_IMAGE_PATH = TEST_DATA_DIR / "revenue_chart.png"
TEST_TEXT_PATH = TEST_DATA_DIR / "review_guidelines.txt"
@pytest.fixture
def test_image_bytes() -> bytes:
"""Load test image bytes."""
return TEST_IMAGE_PATH.read_bytes()
@pytest.fixture
def test_text_bytes() -> bytes:
"""Load test text bytes."""
return TEST_TEXT_PATH.read_bytes()
# Minimal PDF for testing (real PDF structure)
MINIMAL_PDF = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
196
%%EOF
"""
def _build_multimodal_message(llm: LLM, prompt: str, files: dict) -> list[dict]:
"""Build a multimodal message with text and file content."""
content_blocks = llm.format_multimodal_content(files)
return [
{
"role": "user",
"content": [
llm.format_text_content(prompt),
*content_blocks,
],
}
]
class TestOpenAIMultimodalIntegration:
"""Integration tests for OpenAI multimodal with real API calls."""
@pytest.mark.vcr()
def test_describe_image(self, test_image_bytes: bytes) -> None:
"""Test OpenAI can describe an image."""
llm = LLM(model="openai/gpt-4o-mini")
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestAnthropicMultimodalIntegration:
"""Integration tests for Anthropic multimodal with real API calls."""
@pytest.mark.vcr()
def test_describe_image(self, test_image_bytes: bytes) -> None:
"""Test Anthropic can describe an image."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_analyze_pdf(self) -> None:
"""Test Anthropic can analyze a PDF."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {"document": PDFFile(source=MINIMAL_PDF)}
messages = _build_multimodal_message(
llm,
"What type of document is this? Answer in one word.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestGeminiMultimodalIntegration:
"""Integration tests for Gemini multimodal with real API calls."""
@pytest.mark.vcr()
def test_describe_image(self, test_image_bytes: bytes) -> None:
"""Test Gemini can describe an image."""
llm = LLM(model="gemini/gemini-2.0-flash")
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_analyze_text_file(self, test_text_bytes: bytes) -> None:
"""Test Gemini can analyze a text file."""
llm = LLM(model="gemini/gemini-2.0-flash")
files = {"readme": TextFile(source=test_text_bytes)}
messages = _build_multimodal_message(
llm,
"Summarize what this text file says in one sentence.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestLiteLLMMultimodalIntegration:
"""Integration tests for LiteLLM wrapper multimodal with real API calls."""
@pytest.mark.vcr()
def test_describe_image_gpt4o(self, test_image_bytes: bytes) -> None:
"""Test LiteLLM with GPT-4o can describe an image."""
llm = LLM(model="gpt-4o-mini", is_litellm=True)
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_describe_image_claude(self, test_image_bytes: bytes) -> None:
"""Test LiteLLM with Claude can describe an image."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022", is_litellm=True)
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestMultipleFilesIntegration:
"""Integration tests for multiple files in a single request."""
@pytest.mark.vcr()
def test_multiple_images_openai(self, test_image_bytes: bytes) -> None:
"""Test OpenAI can process multiple images."""
llm = LLM(model="openai/gpt-4o-mini")
files = {
"image1": ImageFile(source=test_image_bytes),
"image2": ImageFile(source=test_image_bytes),
}
messages = _build_multimodal_message(
llm,
"How many images do you see? Answer with just the number.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert "2" in response or "two" in response.lower()
@pytest.mark.vcr()
def test_mixed_content_anthropic(self, test_image_bytes: bytes) -> None:
"""Test Anthropic can process image and PDF together."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {
"image": ImageFile(source=test_image_bytes),
"document": PDFFile(source=MINIMAL_PDF),
}
messages = _build_multimodal_message(
llm,
"What types of files did I send you? List them briefly.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestGenericFileIntegration:
"""Integration tests for the generic File class with auto-detection."""
@pytest.mark.vcr()
def test_generic_file_image_openai(self, test_image_bytes: bytes) -> None:
"""Test generic File auto-detects image and sends correct content type."""
llm = LLM(model="openai/gpt-4o-mini")
files = {"image": File(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_generic_file_pdf_anthropic(self) -> None:
"""Test generic File auto-detects PDF and sends correct content type."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {"document": File(source=MINIMAL_PDF)}
messages = _build_multimodal_message(
llm,
"What type of document is this? Answer in one word.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_generic_file_text_gemini(self, test_text_bytes: bytes) -> None:
"""Test generic File auto-detects text and sends correct content type."""
llm = LLM(model="gemini/gemini-2.0-flash")
files = {"content": File(source=test_text_bytes)}
messages = _build_multimodal_message(
llm,
"Summarize what this text says in one sentence.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_generic_file_mixed_types(self, test_image_bytes: bytes) -> None:
"""Test generic File works with multiple auto-detected types."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {
"chart": File(source=test_image_bytes),
"doc": File(source=MINIMAL_PDF),
}
messages = _build_multimodal_message(
llm,
"What types of files did I send? List them briefly.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0