feat: add multimodal support to LLM providers

- Add format_multimodal_content() to all LLM providers
- Support inline base64 and file reference formats
- Add FileResolver integration for upload caching
- Add module exports for files package
Greyson LaLonde
2026-01-21 20:05:33 -05:00
parent 50728b10e8
commit 771eccfcdf
20 changed files with 2382 additions and 15 deletions
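For reviewers, a minimal end-to-end sketch of the surface this commit adds, stitched together from the diffs below. The file path and model id are illustrative; calling llm.call() with a list of message dicts mirrors the integration tests at the bottom of this commit.

from crewai.llm import LLM
from crewai.utilities.files import normalize_input_files

# Illustrative input: a local chart image.
files = normalize_input_files(["./chart.png"])

llm = LLM(model="openai/gpt-4o-mini")
if llm.supports_multimodal():
    blocks = llm.format_multimodal_content(files)
    messages = [{
        "role": "user",
        "content": [llm.format_text_content("Describe this chart."), *blocks],
    }]
    print(llm.call(messages))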


@@ -70,6 +70,7 @@ if TYPE_CHECKING:
from crewai.llms.providers.anthropic.completion import AnthropicThinkingConfig
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
from crewai.utilities.types import LLMMessage
try:
@@ -683,7 +684,7 @@ class LLM(BaseLLM):
"temperature": self.temperature,
"top_p": self.top_p,
"n": self.n,
"stop": self.stop,
"stop": self.stop or None,
"max_tokens": self.max_tokens or self.max_completion_tokens,
"presence_penalty": self.presence_penalty,
"frequency_penalty": self.frequency_penalty,
@@ -931,7 +932,6 @@ class LLM(BaseLLM):
self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
if not tool_calls or not available_functions:
if response_model and self.is_litellm:
instructor_instance = InternalInstructor(
content=full_response,
@@ -1144,8 +1144,12 @@ class LLM(BaseLLM):
if response_model:
params["response_model"] = response_model
response = litellm.completion(**params)
if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
if (
hasattr(response, "usage")
and not isinstance(response.usage, type)
and response.usage
):
usage_info = response.usage
self._track_token_usage_internal(usage_info)
@@ -1273,7 +1277,11 @@ class LLM(BaseLLM):
params["response_model"] = response_model
response = await litellm.acompletion(**params)
if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
if (
hasattr(response, "usage")
and not isinstance(response.usage, type)
and response.usage
):
usage_info = response.usage
self._track_token_usage_internal(usage_info)
@@ -1363,7 +1371,7 @@ class LLM(BaseLLM):
"""
full_response = ""
chunk_count = 0
usage_info = None
accumulated_tool_args: defaultdict[int, AccumulatedToolArgs] = defaultdict(
@@ -2205,3 +2213,107 @@ class LLM(BaseLLM):
stop=copy.deepcopy(self.stop, memo) if self.stop else None,
**filtered_params,
)
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
For litellm, check common vision-enabled model prefixes.
Returns:
True if the model likely supports images.
"""
vision_prefixes = (
"gpt-4o",
"gpt-4-turbo",
"gpt-4-vision",
"gpt-4.1",
"claude-3",
"claude-4",
"gemini",
)
model_lower = self.model.lower()
return any(
model_lower.startswith(p) or f"/{p}" in model_lower for p in vision_prefixes
)
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported for multimodal input.
Determines supported types based on the underlying model.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
model_lower = self.model.lower()
if "gemini" in model_lower:
return ["image/", "audio/", "video/", "application/pdf", "text/"]
if "claude-3" in model_lower or "claude-4" in model_lower:
return ["image/", "application/pdf"]
return ["image/"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as multimodal content blocks for litellm.
Uses OpenAI-compatible format which litellm translates to provider format.
Uses FileResolver for consistent base64 encoding.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache (not used by litellm but kept for interface consistency).
Returns:
List of content blocks in OpenAI's expected format.
"""
import base64
from crewai.utilities.files import (
FileResolver,
FileResolverConfig,
InlineBase64,
)
if not self.supports_multimodal():
return []
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
# LiteLLM uses OpenAI-compatible format
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for file_input in files.values():
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "openai")
if isinstance(resolved, InlineBase64):
content_blocks.append(
{
"type": "image_url",
"image_url": {
"url": f"data:{resolved.content_type};base64,{resolved.data}"
},
}
)
else:
# Fallback to direct base64 encoding
data = base64.b64encode(file_input.read()).decode("ascii")
content_blocks.append(
{
"type": "image_url",
"image_url": {"url": f"data:{content_type};base64,{data}"},
}
)
return content_blocks
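A quick sanity sketch of the prefix matching above: the f"/{p}" in model_lower clause is what lets provider-prefixed ids match, not just bare model names. Model ids here are illustrative.

vision_prefixes = ("gpt-4o", "gpt-4-turbo", "gpt-4-vision", "gpt-4.1", "claude-3", "claude-4", "gemini")

def likely_vision(model: str) -> bool:
    m = model.lower()
    return any(m.startswith(p) or f"/{p}" in m for p in vision_prefixes)

assert likely_vision("gpt-4o-mini")        # bare id matches by prefix
assert likely_vision("openai/gpt-4o")      # provider-prefixed id matches via "/gpt-4o"
assert not likely_vision("gpt-3.5-turbo")  # non-vision model is rejected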


@@ -35,6 +35,7 @@ if TYPE_CHECKING:
from crewai.agent.core import Agent
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
from crewai.utilities.types import LLMMessage
@@ -280,6 +281,54 @@ class BaseLLM(ABC):
# Default implementation - subclasses should override with model-specific values
return DEFAULT_CONTEXT_WINDOW_SIZE
def supports_multimodal(self) -> bool:
"""Check if the LLM supports multimodal inputs.
Returns:
True if the LLM supports images, PDFs, audio, or video.
"""
return False
def supported_multimodal_content_types(self) -> list[str]:
"""Get the content types supported by this LLM for multimodal input.
Returns:
List of supported MIME type prefixes (e.g., ["image/", "application/pdf"]).
"""
return []
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as multimodal content blocks for the LLM.
Subclasses should override this to provide provider-specific formatting.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in the provider's expected format.
"""
return []
def format_text_content(self, text: str) -> dict[str, Any]:
"""Format text as a content block for the LLM.
Default implementation uses OpenAI/Anthropic format.
Subclasses should override for provider-specific formatting.
Args:
text: The text content to format.
Returns:
A content block in the provider's expected format.
"""
return {"type": "text", "text": text}
# Common helper methods for native SDK implementations
def _emit_call_started_event(
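Since the base class returns conservative defaults (False and empty lists), a provider only opts in by overriding the three hooks. A minimal sketch of such an override; the class name and block shape are hypothetical, the BaseLLM import path is assumed (it is not shown in this diff), and the other required BaseLLM members are elided.

from typing import Any

from crewai.llms.base_llm import BaseLLM  # import path assumed; not shown in this diff

class ExampleVisionLLM(BaseLLM):
    """Sketch of a provider opting in to multimodal support."""

    # ... __init__, call(), and other required BaseLLM members omitted ...

    def supports_multimodal(self) -> bool:
        return True

    def supported_multimodal_content_types(self) -> list[str]:
        return ["image/"]

    def format_multimodal_content(self, files, upload_cache=None) -> list[dict[str, Any]]:
        # Hypothetical block shape; the real providers below each use their own schema.
        return [
            {"type": "input_image", "bytes": f.read()}
            for f in files.values()
            if f.content_type.startswith("image/")
        ]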


@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
import json
import logging
import os
@@ -20,6 +21,9 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
DEFAULT_CACHE_TTL = "ephemeral"
try:
from anthropic import Anthropic, AsyncAnthropic
@@ -1231,3 +1235,138 @@ class AnthropicCompletion(BaseLLM):
"total_tokens": input_tokens + output_tokens,
}
return {"total_tokens": 0}
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
All Claude 3+ models support vision and PDFs.
Returns:
True if the model supports images and PDFs.
"""
return "claude-3" in self.model.lower() or "claude-4" in self.model.lower()
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Anthropic for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
return ["image/", "application/pdf"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
enable_caching: bool = True,
cache_ttl: str | None = None,
) -> list[dict[str, Any]]:
"""Format files as Anthropic multimodal content blocks.
Anthropic supports both base64 inline format and file references via Files API.
Uses FileResolver to determine the best delivery method based on file size.
Supports prompt caching to reduce costs and latency for repeated file usage.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
enable_caching: Whether to add cache_control markers (default: True).
cache_ttl: Cache TTL - "ephemeral" (5min) or "1h" (1hr for supported models).
Returns:
List of content blocks in Anthropic's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
file_list = list(files.values())
num_files = len(file_list)
for i, file_input in enumerate(file_list):
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "anthropic")
block: dict[str, Any] = {}
if isinstance(resolved, FileReference):
if content_type.startswith("image/"):
block = {
"type": "image",
"source": {
"type": "file",
"file_id": resolved.file_id,
},
}
elif content_type == "application/pdf":
block = {
"type": "document",
"source": {
"type": "file",
"file_id": resolved.file_id,
},
}
elif isinstance(resolved, InlineBase64):
if content_type.startswith("image/"):
block = {
"type": "image",
"source": {
"type": "base64",
"media_type": resolved.content_type,
"data": resolved.data,
},
}
elif content_type == "application/pdf":
block = {
"type": "document",
"source": {
"type": "base64",
"media_type": resolved.content_type,
"data": resolved.data,
},
}
else:
data = base64.b64encode(file_input.read()).decode("ascii")
if content_type.startswith("image/"):
block = {
"type": "image",
"source": {
"type": "base64",
"media_type": content_type,
"data": data,
},
}
elif content_type == "application/pdf":
block = {
"type": "document",
"source": {
"type": "base64",
"media_type": content_type,
"data": data,
},
}
if block and enable_caching and i == num_files - 1:
cache_control: dict[str, str] = {"type": cache_ttl or DEFAULT_CACHE_TTL}
block["cache_control"] = cache_control
if block:
content_blocks.append(block)
return content_blocks
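Given the logic above, an image plus a PDF resolved inline produce blocks like the following, with cache_control attached only to the final block so the whole file prefix is cacheable. Base64 payloads are elided.

blocks = [
    {
        "type": "image",
        "source": {"type": "base64", "media_type": "image/png", "data": "..."},
    },
    {
        "type": "document",
        "source": {"type": "base64", "media_type": "application/pdf", "data": "..."},
        "cache_control": {"type": "ephemeral"},  # only the last block is marked
    },
]
# format_multimodal_content(files, cache_ttl="1h") would emit {"type": "1h"} instead.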


@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
import json
import logging
import os
@@ -18,6 +19,7 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
try:
@@ -1016,3 +1018,85 @@ class AzureCompletion(BaseLLM):
async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
"""Async context manager exit."""
await self.aclose()
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
Azure OpenAI vision-enabled models include GPT-4o and GPT-4 Turbo with Vision.
Returns:
True if the model supports images.
"""
vision_models = ("gpt-4o", "gpt-4-turbo", "gpt-4-vision", "gpt-4v")
return any(self.model.lower().startswith(m) for m in vision_models)
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Azure for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
return ["image/"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as Azure OpenAI multimodal content blocks.
Azure OpenAI uses the same image_url format as OpenAI.
Uses FileResolver for consistent base64 encoding.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache (not used by Azure but kept for interface consistency).
Returns:
List of content blocks in Azure OpenAI's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
FileResolver,
FileResolverConfig,
InlineBase64,
)
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
# Azure doesn't support file uploads for images, so just use inline
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for file_input in files.values():
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "azure")
if isinstance(resolved, InlineBase64):
content_blocks.append(
{
"type": "image_url",
"image_url": {
"url": f"data:{resolved.content_type};base64,{resolved.data}"
},
}
)
else:
# Fallback to direct base64 encoding
data = base64.b64encode(file_input.read()).decode("ascii")
content_blocks.append(
{
"type": "image_url",
"image_url": {"url": f"data:{content_type};base64,{data}"},
}
)
return content_blocks
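One behavioral note on the startswith check above: Azure routes requests by deployment name, so assuming self.model carries the deployment name (as is typical for Azure OpenAI), detection only works when the deployment is named after the underlying model. A sketch:

vision_models = ("gpt-4o", "gpt-4-turbo", "gpt-4-vision", "gpt-4v")

def detected(model: str) -> bool:
    return any(model.lower().startswith(m) for m in vision_models)

assert detected("gpt-4o-eastus-deploy")      # deployment named after the model: detected
assert not detected("my-vision-deployment")  # custom deployment name: not detected,
                                             # so format_multimodal_content() returns []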


@@ -33,6 +33,7 @@ if TYPE_CHECKING:
)
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
try:
@@ -1450,3 +1451,92 @@ class BedrockCompletion(BaseLLM):
# Default context window size
return int(8192 * CONTEXT_WINDOW_USAGE_RATIO)
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
Claude 3 models on Bedrock support vision.
Returns:
True if the model supports images.
"""
vision_models = ("anthropic.claude-3",)
return any(self.model.lower().startswith(m) for m in vision_models)
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Bedrock for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
return ["image/", "application/pdf"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as Bedrock Converse API multimodal content blocks.
Bedrock Converse API uses specific formats for images and documents with raw bytes.
Uses FileResolver to get InlineBytes format for Bedrock's byte-based API.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache (not used by Bedrock but kept for interface consistency).
Returns:
List of content blocks in Bedrock's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
FileResolver,
FileResolverConfig,
InlineBytes,
)
content_blocks: list[dict[str, Any]] = []
# Bedrock uses raw bytes, configure resolver accordingly
config = FileResolverConfig(prefer_upload=False, use_bytes_for_bedrock=True)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for name, file_input in files.items():
content_type = file_input.content_type
resolved = resolver.resolve(file_input, "bedrock")
if isinstance(resolved, InlineBytes):
file_bytes = resolved.data
else:
# Fallback to reading directly
file_bytes = file_input.read()
if content_type.startswith("image/"):
media_type = content_type.split("/")[-1]
if media_type == "jpg":
media_type = "jpeg"
content_blocks.append(
{
"image": {
"format": media_type,
"source": {"bytes": file_bytes},
}
}
)
elif content_type == "application/pdf":
content_blocks.append(
{
"document": {
"name": name,
"format": "pdf",
"source": {"bytes": file_bytes},
}
}
)
return content_blocks
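The Converse API blocks produced above carry raw bytes and a short format token rather than a full MIME type; note the jpg-to-jpeg normalization. A shape sketch for one JPEG image and one PDF, with byte payloads elided:

blocks = [
    {"image": {"format": "jpeg", "source": {"bytes": b"..."}}},  # "image/jpg" is normalized to "jpeg"
    {"document": {"name": "report", "format": "pdf", "source": {"bytes": b"..."}}},
]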


@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
import json
import logging
import os
@@ -19,6 +20,10 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import (
FileInput,
UploadCache,
)
try:
@@ -516,17 +521,31 @@ class GeminiCompletion(BaseLLM):
role = message["role"]
content = message["content"]
- # Convert content to string if it's a list
+ # Build parts list from content
+ parts: list[types.Part] = []
if isinstance(content, list):
- text_content = " ".join(
- str(item.get("text", "")) if isinstance(item, dict) else str(item)
- for item in content
- )
+ for item in content:
+ if isinstance(item, dict):
+ if "text" in item:
+ parts.append(types.Part.from_text(text=str(item["text"])))
+ elif "inlineData" in item:
+ inline = item["inlineData"]
+ parts.append(
+ types.Part.from_bytes(
+ data=base64.b64decode(inline["data"]),
+ mime_type=inline["mimeType"],
+ )
+ )
+ else:
+ parts.append(types.Part.from_text(text=str(item)))
else:
- text_content = str(content) if content else ""
+ parts.append(types.Part.from_text(text=str(content) if content else ""))
if role == "system":
# Extract system instruction - Gemini handles it separately
+ text_content = " ".join(
+ p.text for p in parts if hasattr(p, "text") and p.text
+ )
if system_instruction:
system_instruction += f"\n\n{text_content}"
else:
@@ -536,9 +555,7 @@ class GeminiCompletion(BaseLLM):
gemini_role = "model" if role == "assistant" else "user"
# Create Content object
- gemini_content = types.Content(
- role=gemini_role, parts=[types.Part.from_text(text=text_content)]
- )
+ gemini_content = types.Content(role=gemini_role, parts=parts)
contents.append(gemini_content)
return contents, system_instruction
@@ -1060,3 +1077,106 @@ class GeminiCompletion(BaseLLM):
)
)
return result
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
Gemini models support images, audio, video, and PDFs.
Returns:
True if the model supports multimodal inputs.
"""
return True
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Gemini for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
return ["image/", "audio/", "video/", "application/pdf", "text/"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as Gemini multimodal content blocks.
Gemini supports both inlineData format and file references via File API.
Uses FileResolver to determine the best delivery method based on file size.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in Gemini's expected format.
"""
from crewai.utilities.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
# Create resolver with optional cache
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for file_input in files.values():
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "gemini")
if isinstance(resolved, FileReference) and resolved.file_uri:
# Use file reference format for uploaded files
content_blocks.append(
{
"fileData": {
"mimeType": resolved.content_type,
"fileUri": resolved.file_uri,
}
}
)
elif isinstance(resolved, InlineBase64):
# Use inline format for smaller files
content_blocks.append(
{
"inlineData": {
"mimeType": resolved.content_type,
"data": resolved.data,
}
}
)
else:
# Fallback to base64 encoding
data = base64.b64encode(file_input.read()).decode("ascii")
content_blocks.append(
{
"inlineData": {
"mimeType": content_type,
"data": data,
}
}
)
return content_blocks
def format_text_content(self, text: str) -> dict[str, Any]:
"""Format text as a Gemini content block.
Gemini uses {"text": "..."} format instead of {"type": "text", "text": "..."}.
Args:
text: The text content to format.
Returns:
A content block in Gemini's expected format.
"""
return {"text": text}


@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
from collections.abc import AsyncIterator
import json
import logging
@@ -30,6 +31,7 @@ if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
class OpenAICompletion(BaseLLM):
@@ -1048,3 +1050,101 @@ class OpenAICompletion(BaseLLM):
formatted_messages.append(message)
return formatted_messages
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
OpenAI vision-enabled models include GPT-4o, GPT-4.1, and o-series.
Returns:
True if the model supports images.
"""
vision_models = (
"gpt-4o",
"gpt-4.1",
"gpt-4-turbo",
"gpt-4-vision",
"o1",
"o3",
"o4",
)
return any(self.model.lower().startswith(m) for m in vision_models)
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by OpenAI for multimodal input.
Returns:
List of supported MIME type prefixes.
"""
if not self.supports_multimodal():
return []
return ["image/"]
def format_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Format files as OpenAI multimodal content blocks.
OpenAI supports both base64 data URLs and file_id references via Files API.
Uses FileResolver to determine the best delivery method based on file size.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in OpenAI's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for file_input in files.values():
content_type = file_input.content_type
if not any(content_type.startswith(t) for t in supported_types):
continue
resolved = resolver.resolve(file_input, "openai")
if isinstance(resolved, FileReference):
content_blocks.append(
{
"type": "file",
"file": {
"file_id": resolved.file_id,
},
}
)
elif isinstance(resolved, InlineBase64):
content_blocks.append(
{
"type": "image_url",
"image_url": {
"url": f"data:{resolved.content_type};base64,{resolved.data}"
},
}
)
else:
data = base64.b64encode(file_input.read()).decode("ascii")
content_blocks.append(
{
"type": "image_url",
"image_url": {"url": f"data:{content_type};base64,{data}"},
}
)
return content_blocks
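Callers that want upload reuse can pass the process-wide cache; when the resolver returns a FileReference, the block carries a file_id instead of a data URL. A sketch, assuming get_upload_cache() returns the shared UploadCache exported below; the constructor is shown with only a model id and its real signature may differ, and the file_id value is a placeholder.

from crewai.utilities.files import ImageFile, get_upload_cache

llm = OpenAICompletion(model="gpt-4o-mini")  # constructor args beyond the model id are elided
files = {"chart": ImageFile(source=b"...")}  # image bytes elided

blocks = llm.format_multimodal_content(files, upload_cache=get_upload_cache())
# Inline resolution -> {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
# Upload resolution -> {"type": "file", "file": {"file_id": "file-abc123"}}  # placeholder id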


@@ -0,0 +1,210 @@
"""File handling utilities for crewAI tasks."""
from crewai.utilities.files.cleanup import (
cleanup_expired_files,
cleanup_provider_files,
cleanup_uploaded_files,
)
from crewai.utilities.files.content_types import (
AudioContentType,
AudioExtension,
AudioFile,
BaseFile,
File,
FileMode,
ImageContentType,
ImageExtension,
ImageFile,
PDFContentType,
PDFExtension,
PDFFile,
TextContentType,
TextExtension,
TextFile,
VideoContentType,
VideoExtension,
VideoFile,
)
from crewai.utilities.files.file import (
FileBytes,
FilePath,
FileSource,
FileSourceInput,
FileStream,
RawFileInput,
)
from crewai.utilities.files.processing import (
ANTHROPIC_CONSTRAINTS,
BEDROCK_CONSTRAINTS,
GEMINI_CONSTRAINTS,
OPENAI_CONSTRAINTS,
AudioConstraints,
FileHandling,
FileProcessingError,
FileProcessor,
FileTooLargeError,
FileValidationError,
ImageConstraints,
PDFConstraints,
ProcessingDependencyError,
ProviderConstraints,
UnsupportedFileTypeError,
VideoConstraints,
get_constraints_for_provider,
)
from crewai.utilities.files.resolved import (
FileReference,
InlineBase64,
InlineBytes,
ResolvedFile,
ResolvedFileType,
UrlReference,
)
from crewai.utilities.files.resolver import (
FileResolver,
FileResolverConfig,
create_resolver,
)
from crewai.utilities.files.upload_cache import (
CachedUpload,
UploadCache,
get_upload_cache,
reset_upload_cache,
)
from crewai.utilities.files.uploaders import FileUploader, UploadResult, get_uploader
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
def wrap_file_source(source: FileSource) -> FileInput:
"""Wrap a FileSource in the appropriate typed FileInput wrapper.
Args:
source: The file source to wrap.
Returns:
Typed FileInput wrapper based on content type.
"""
content_type = source.content_type
if content_type.startswith("image/"):
return ImageFile(source=source)
if content_type.startswith("audio/"):
return AudioFile(source=source)
if content_type.startswith("video/"):
return VideoFile(source=source)
if content_type == "application/pdf":
return PDFFile(source=source)
# Default to text for anything else
return TextFile(source=source)
def normalize_input_files(
input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
"""Convert a list of file sources to a named dictionary of FileInputs.
Args:
input_files: List of file source inputs or File objects.
Returns:
Dictionary mapping names to FileInput wrappers.
"""
from pathlib import Path
result: dict[str, FileInput] = {}
for i, item in enumerate(input_files):
# If it's already a typed File wrapper, use it directly
if isinstance(item, BaseFile):
name = item.filename or f"file_{i}"
# Remove extension from name for cleaner keys
if "." in name:
name = name.rsplit(".", 1)[0]
result[name] = item
continue
file_source: FilePath | FileBytes | FileStream
if isinstance(item, (FilePath, FileBytes, FileStream)):
file_source = item
elif isinstance(item, Path):
file_source = FilePath(path=item)
elif isinstance(item, str):
file_source = FilePath(path=Path(item))
elif isinstance(item, (bytes, memoryview)):
file_source = FileBytes(data=bytes(item))
else:
continue
name = file_source.filename or f"file_{i}"
result[name] = wrap_file_source(file_source)
return result
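A quick sketch of the naming behavior above, assuming FilePath.filename yields the file's basename with its extension; exact keys depend on whether the source carries a filename, and the paths are illustrative.

from pathlib import Path

from crewai.utilities.files import ImageFile, normalize_input_files

named = normalize_input_files([
    "reports/q3.pdf",                 # str -> FilePath; key is the source filename, e.g. "q3.pdf"
    Path("images/chart.png"),         # Path -> FilePath; key "chart.png"
    b"\x89PNG...",                    # raw bytes -> FileBytes; no filename, so key "file_2"
    ImageFile(source=b"\x89PNG..."),  # already typed; key from filename (extension stripped) or "file_3"
])
# named maps each key to a typed FileInput chosen by content type.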
__all__ = [
"ANTHROPIC_CONSTRAINTS",
"BEDROCK_CONSTRAINTS",
"GEMINI_CONSTRAINTS",
"OPENAI_CONSTRAINTS",
"AudioConstraints",
"AudioContentType",
"AudioExtension",
"AudioFile",
"BaseFile",
"CachedUpload",
"File",
"FileBytes",
"FileHandling",
"FileInput",
"FileMode",
"FilePath",
"FileProcessingError",
"FileProcessor",
"FileReference",
"FileResolver",
"FileResolverConfig",
"FileSource",
"FileSourceInput",
"FileStream",
"FileTooLargeError",
"FileUploader",
"FileValidationError",
"ImageConstraints",
"ImageContentType",
"ImageExtension",
"ImageFile",
"InlineBase64",
"InlineBytes",
"PDFConstraints",
"PDFContentType",
"PDFExtension",
"PDFFile",
"ProcessingDependencyError",
"ProviderConstraints",
"RawFileInput",
"ResolvedFile",
"ResolvedFileType",
"TextContentType",
"TextExtension",
"TextFile",
"UnsupportedFileTypeError",
"UploadCache",
"UploadResult",
"UrlReference",
"VideoConstraints",
"VideoContentType",
"VideoExtension",
"VideoFile",
"cleanup_expired_files",
"cleanup_provider_files",
"cleanup_uploaded_files",
"create_resolver",
"get_constraints_for_provider",
"get_upload_cache",
"get_uploader",
"normalize_input_files",
"reset_upload_cache",
"wrap_file_source",
]


@@ -0,0 +1,62 @@
"""File processing module for multimodal content handling.
This module provides validation, transformation, and processing utilities
for files used in multimodal LLM interactions.
"""
from crewai.utilities.files.processing.constraints import (
ANTHROPIC_CONSTRAINTS,
BEDROCK_CONSTRAINTS,
GEMINI_CONSTRAINTS,
OPENAI_CONSTRAINTS,
AudioConstraints,
ImageConstraints,
PDFConstraints,
ProviderConstraints,
VideoConstraints,
get_constraints_for_provider,
)
from crewai.utilities.files.processing.enums import FileHandling
from crewai.utilities.files.processing.exceptions import (
FileProcessingError,
FileTooLargeError,
FileValidationError,
ProcessingDependencyError,
UnsupportedFileTypeError,
)
from crewai.utilities.files.processing.processor import FileProcessor
from crewai.utilities.files.processing.validators import (
validate_audio,
validate_file,
validate_image,
validate_pdf,
validate_text,
validate_video,
)
__all__ = [
"ANTHROPIC_CONSTRAINTS",
"BEDROCK_CONSTRAINTS",
"GEMINI_CONSTRAINTS",
"OPENAI_CONSTRAINTS",
"AudioConstraints",
"FileHandling",
"FileProcessingError",
"FileProcessor",
"FileTooLargeError",
"FileValidationError",
"ImageConstraints",
"PDFConstraints",
"ProcessingDependencyError",
"ProviderConstraints",
"UnsupportedFileTypeError",
"VideoConstraints",
"get_constraints_for_provider",
"validate_audio",
"validate_file",
"validate_image",
"validate_pdf",
"validate_text",
"validate_video",
]


@@ -0,0 +1,104 @@
interactions:
- request:
body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"What
type of document is this? Answer in one word."},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stream":false}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- application/json
accept-encoding:
- ACCEPT-ENCODING-XXX
anthropic-version:
- '2023-06-01'
connection:
- keep-alive
content-length:
- '748'
content-type:
- application/json
host:
- api.anthropic.com
x-api-key:
- X-API-KEY-XXX
x-stainless-arch:
- X-STAINLESS-ARCH-XXX
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- X-STAINLESS-OS-XXX
x-stainless-package-version:
- 0.71.1
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.10
x-stainless-timeout:
- NOT_GIVEN
method: POST
uri: https://api.anthropic.com/v1/messages
response:
body:
string: !!binary |
H4sIAAAAAAAA/3WQTUvEMBCG/8ucW2jr7rL25sKCKHrQiyASYjJsw6ZJzUxEKf3vTheLX3hKeJ8n
8zIZoY8WPbRgvM4Wy7NyXXbaHXPZVM2qrpoGCnBWhJ4Oqqovd/nBnt92tF1dX+z3u6t7ffO8FYff
B5wtJNIHlCBFPweayBHrwBKZGBjl1j6Oi8/4NpPT0cIdUu4RpqcCiOOgEmqKQQAGqzinAJ+A8CVj
MDIhZO8LyKfSdgQXhsyK4xEDQVtvmo3UatOhMjKMXQzqp1ItXLD9jy1v5wYcOuwxaa/W/V//i9bd
bzoVEDN/j1ayDqZXZ1CxwySLzl9ldbIwTR/rySkqnAEAAA==
headers:
CF-RAY:
- CF-RAY-XXX
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Thu, 22 Jan 2026 00:18:50 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Robots-Tag:
- none
anthropic-organization-id:
- ANTHROPIC-ORGANIZATION-ID-XXX
anthropic-ratelimit-input-tokens-limit:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
anthropic-ratelimit-input-tokens-remaining:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
anthropic-ratelimit-input-tokens-reset:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
anthropic-ratelimit-output-tokens-limit:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
anthropic-ratelimit-output-tokens-remaining:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
anthropic-ratelimit-output-tokens-reset:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
anthropic-ratelimit-requests-limit:
- '4000'
anthropic-ratelimit-requests-remaining:
- '3999'
anthropic-ratelimit-requests-reset:
- '2026-01-22T00:18:50Z'
anthropic-ratelimit-tokens-limit:
- ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
anthropic-ratelimit-tokens-remaining:
- ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
anthropic-ratelimit-tokens-reset:
- ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
cf-cache-status:
- DYNAMIC
request-id:
- REQUEST-ID-XXX
strict-transport-security:
- STS-XXX
x-envoy-upstream-service-time:
- '750'
status:
code: 200
message: OK
version: 1

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,104 @@
interactions:
- request:
body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"What
type of document is this? Answer in one word."},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stream":false}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- application/json
accept-encoding:
- ACCEPT-ENCODING-XXX
anthropic-version:
- '2023-06-01'
connection:
- keep-alive
content-length:
- '748'
content-type:
- application/json
host:
- api.anthropic.com
x-api-key:
- X-API-KEY-XXX
x-stainless-arch:
- X-STAINLESS-ARCH-XXX
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- X-STAINLESS-OS-XXX
x-stainless-package-version:
- 0.71.1
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.10
x-stainless-timeout:
- NOT_GIVEN
method: POST
uri: https://api.anthropic.com/v1/messages
response:
body:
string: !!binary |
H4sIAAAAAAAA/3WQTUvEMBCG/8ucW2hju4eeRUU97EFRFAkhGbZh06Qmk1Up/e9OF4tf7CnhfZ7J
y2SCIRh00IF2Khssz8q27JXd51JUoqkrIaAAa1gY0k5W9bbptXo7PD60l/V1f/V0J+5vxQ079DHi
YmFKaoccxOCWQKVkEylPHOngCfnWPU+rT/i+kOPRwfb8AuaXAhKFUUZUKXhO0RtJOXr4AglfM3rN
4z47V0A+NnYTWD9mkhT26BN09UZsuFPpHqXmx8gGL38r1coZm1NsnV0acOxxwKicbIf//jet+790
LiBk+hk1vA7Gg9UoyWLkRZd/MioamOdP24g1JZkBAAA=
headers:
CF-RAY:
- CF-RAY-XXX
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Thu, 22 Jan 2026 00:18:56 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Robots-Tag:
- none
anthropic-organization-id:
- ANTHROPIC-ORGANIZATION-ID-XXX
anthropic-ratelimit-input-tokens-limit:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
anthropic-ratelimit-input-tokens-remaining:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
anthropic-ratelimit-input-tokens-reset:
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
anthropic-ratelimit-output-tokens-limit:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
anthropic-ratelimit-output-tokens-remaining:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
anthropic-ratelimit-output-tokens-reset:
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
anthropic-ratelimit-requests-limit:
- '4000'
anthropic-ratelimit-requests-remaining:
- '3999'
anthropic-ratelimit-requests-reset:
- '2026-01-22T00:18:55Z'
anthropic-ratelimit-tokens-limit:
- ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
anthropic-ratelimit-tokens-remaining:
- ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
anthropic-ratelimit-tokens-reset:
- ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
cf-cache-status:
- DYNAMIC
request-id:
- REQUEST-ID-XXX
strict-transport-security:
- STS-XXX
x-envoy-upstream-service-time:
- '648'
status:
code: 200
message: OK
version: 1

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,329 @@
"""Integration tests for LLM multimodal functionality with cassettes.
These tests make actual API calls (recorded via VCR cassettes) to verify
multimodal content is properly sent and processed by each provider.
"""
from pathlib import Path
import pytest
from crewai.llm import LLM
from crewai.utilities.files import File, ImageFile, PDFFile, TextFile
# Path to test data files
TEST_DATA_DIR = Path(__file__).parent.parent.parent.parent.parent / "data"
TEST_IMAGE_PATH = TEST_DATA_DIR / "revenue_chart.png"
TEST_TEXT_PATH = TEST_DATA_DIR / "review_guidelines.txt"
@pytest.fixture
def test_image_bytes() -> bytes:
"""Load test image bytes."""
return TEST_IMAGE_PATH.read_bytes()
@pytest.fixture
def test_text_bytes() -> bytes:
"""Load test text bytes."""
return TEST_TEXT_PATH.read_bytes()
# Minimal PDF for testing (real PDF structure)
MINIMAL_PDF = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
196
%%EOF
"""
def _build_multimodal_message(llm: LLM, prompt: str, files: dict) -> list[dict]:
"""Build a multimodal message with text and file content."""
content_blocks = llm.format_multimodal_content(files)
return [
{
"role": "user",
"content": [
llm.format_text_content(prompt),
*content_blocks,
],
}
]
class TestOpenAIMultimodalIntegration:
"""Integration tests for OpenAI multimodal with real API calls."""
@pytest.mark.vcr()
def test_describe_image(self, test_image_bytes: bytes) -> None:
"""Test OpenAI can describe an image."""
llm = LLM(model="openai/gpt-4o-mini")
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestAnthropicMultimodalIntegration:
"""Integration tests for Anthropic multimodal with real API calls."""
@pytest.mark.vcr()
def test_describe_image(self, test_image_bytes: bytes) -> None:
"""Test Anthropic can describe an image."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_analyze_pdf(self) -> None:
"""Test Anthropic can analyze a PDF."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {"document": PDFFile(source=MINIMAL_PDF)}
messages = _build_multimodal_message(
llm,
"What type of document is this? Answer in one word.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestGeminiMultimodalIntegration:
"""Integration tests for Gemini multimodal with real API calls."""
@pytest.mark.vcr()
def test_describe_image(self, test_image_bytes: bytes) -> None:
"""Test Gemini can describe an image."""
llm = LLM(model="gemini/gemini-2.0-flash")
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_analyze_text_file(self, test_text_bytes: bytes) -> None:
"""Test Gemini can analyze a text file."""
llm = LLM(model="gemini/gemini-2.0-flash")
files = {"readme": TextFile(source=test_text_bytes)}
messages = _build_multimodal_message(
llm,
"Summarize what this text file says in one sentence.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestLiteLLMMultimodalIntegration:
"""Integration tests for LiteLLM wrapper multimodal with real API calls."""
@pytest.mark.vcr()
def test_describe_image_gpt4o(self, test_image_bytes: bytes) -> None:
"""Test LiteLLM with GPT-4o can describe an image."""
llm = LLM(model="gpt-4o-mini", is_litellm=True)
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_describe_image_claude(self, test_image_bytes: bytes) -> None:
"""Test LiteLLM with Claude can describe an image."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022", is_litellm=True)
files = {"image": ImageFile(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestMultipleFilesIntegration:
"""Integration tests for multiple files in a single request."""
@pytest.mark.vcr()
def test_multiple_images_openai(self, test_image_bytes: bytes) -> None:
"""Test OpenAI can process multiple images."""
llm = LLM(model="openai/gpt-4o-mini")
files = {
"image1": ImageFile(source=test_image_bytes),
"image2": ImageFile(source=test_image_bytes),
}
messages = _build_multimodal_message(
llm,
"How many images do you see? Answer with just the number.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert "2" in response or "two" in response.lower()
@pytest.mark.vcr()
def test_mixed_content_anthropic(self, test_image_bytes: bytes) -> None:
"""Test Anthropic can process image and PDF together."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {
"image": ImageFile(source=test_image_bytes),
"document": PDFFile(source=MINIMAL_PDF),
}
messages = _build_multimodal_message(
llm,
"What types of files did I send you? List them briefly.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
class TestGenericFileIntegration:
"""Integration tests for the generic File class with auto-detection."""
@pytest.mark.vcr()
def test_generic_file_image_openai(self, test_image_bytes: bytes) -> None:
"""Test generic File auto-detects image and sends correct content type."""
llm = LLM(model="openai/gpt-4o-mini")
files = {"image": File(source=test_image_bytes)}
messages = _build_multimodal_message(
llm,
"Describe this image in one sentence. Be brief.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_generic_file_pdf_anthropic(self) -> None:
"""Test generic File auto-detects PDF and sends correct content type."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {"document": File(source=MINIMAL_PDF)}
messages = _build_multimodal_message(
llm,
"What type of document is this? Answer in one word.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_generic_file_text_gemini(self, test_text_bytes: bytes) -> None:
"""Test generic File auto-detects text and sends correct content type."""
llm = LLM(model="gemini/gemini-2.0-flash")
files = {"content": File(source=test_text_bytes)}
messages = _build_multimodal_message(
llm,
"Summarize what this text says in one sentence.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.vcr()
def test_generic_file_mixed_types(self, test_image_bytes: bytes) -> None:
"""Test generic File works with multiple auto-detected types."""
llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
files = {
"chart": File(source=test_image_bytes),
"doc": File(source=MINIMAL_PDF),
}
messages = _build_multimodal_message(
llm,
"What types of files did I send? List them briefly.",
files,
)
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0