mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-23 15:18:14 +00:00
feat: add multimodal support to LLM providers
- Add format_multimodal_content() to all LLM providers - Support inline base64 and file reference formats - Add FileResolver integration for upload caching - Add module exports for files package
This commit is contained in:
@@ -70,6 +70,7 @@ if TYPE_CHECKING:
|
||||
from crewai.llms.providers.anthropic.completion import AnthropicThinkingConfig
|
||||
from crewai.task import Task
|
||||
from crewai.tools.base_tool import BaseTool
|
||||
from crewai.utilities.files import FileInput, UploadCache
|
||||
from crewai.utilities.types import LLMMessage
|
||||
|
||||
try:
|
||||
@@ -683,7 +684,7 @@ class LLM(BaseLLM):
|
||||
"temperature": self.temperature,
|
||||
"top_p": self.top_p,
|
||||
"n": self.n,
|
||||
"stop": self.stop,
|
||||
"stop": self.stop or None,
|
||||
"max_tokens": self.max_tokens or self.max_completion_tokens,
|
||||
"presence_penalty": self.presence_penalty,
|
||||
"frequency_penalty": self.frequency_penalty,
|
||||
@@ -931,7 +932,6 @@ class LLM(BaseLLM):
|
||||
self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
|
||||
|
||||
if not tool_calls or not available_functions:
|
||||
|
||||
if response_model and self.is_litellm:
|
||||
instructor_instance = InternalInstructor(
|
||||
content=full_response,
|
||||
@@ -1144,8 +1144,12 @@ class LLM(BaseLLM):
|
||||
if response_model:
|
||||
params["response_model"] = response_model
|
||||
response = litellm.completion(**params)
|
||||
|
||||
if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
|
||||
|
||||
if (
|
||||
hasattr(response, "usage")
|
||||
and not isinstance(response.usage, type)
|
||||
and response.usage
|
||||
):
|
||||
usage_info = response.usage
|
||||
self._track_token_usage_internal(usage_info)
|
||||
|
||||
@@ -1273,7 +1277,11 @@ class LLM(BaseLLM):
|
||||
params["response_model"] = response_model
|
||||
response = await litellm.acompletion(**params)
|
||||
|
||||
if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
|
||||
if (
|
||||
hasattr(response, "usage")
|
||||
and not isinstance(response.usage, type)
|
||||
and response.usage
|
||||
):
|
||||
usage_info = response.usage
|
||||
self._track_token_usage_internal(usage_info)
|
||||
|
||||
@@ -1363,7 +1371,7 @@ class LLM(BaseLLM):
|
||||
"""
|
||||
full_response = ""
|
||||
chunk_count = 0
|
||||
|
||||
|
||||
usage_info = None
|
||||
|
||||
accumulated_tool_args: defaultdict[int, AccumulatedToolArgs] = defaultdict(
|
||||
@@ -2205,3 +2213,107 @@ class LLM(BaseLLM):
|
||||
stop=copy.deepcopy(self.stop, memo) if self.stop else None,
|
||||
**filtered_params,
|
||||
)
|
||||
|
||||
def supports_multimodal(self) -> bool:
    """Check if the model supports multimodal inputs.

    For litellm, the lower-cased model name is compared against known
    vision-capable model families. A family matches when the name starts
    with it or when it appears right after a ``/`` provider prefix
    (for example ``openai/gpt-4o``).

    Claude 4 identifiers place the tier before the version
    (``claude-sonnet-4-...``, ``claude-opus-4-...``), so they are listed
    explicitly; ``claude-4`` alone does not match them.

    Returns:
        True if the model likely supports images.
    """
    vision_prefixes = (
        "gpt-4o",
        "gpt-4-turbo",
        "gpt-4-vision",
        "gpt-4.1",
        "claude-3",
        "claude-4",
        "claude-sonnet-4",
        "claude-opus-4",
        "claude-haiku-4",
        "gemini",
    )
    model_lower = self.model.lower()
    # str.startswith accepts a tuple, so the bare-name case is a single call.
    if model_lower.startswith(vision_prefixes):
        return True
    return any(f"/{p}" in model_lower for p in vision_prefixes)
|
||||
|
||||
def supported_multimodal_content_types(self) -> list[str]:
    """Get content types supported for multimodal input.

    The result depends on the underlying model name: Gemini models get the
    widest set, Claude models get images and PDFs, and every other
    multimodal model gets images only.

    Returns:
        List of supported MIME type prefixes; empty when the model is not
        multimodal.
    """
    if not self.supports_multimodal():
        return []

    model_lower = self.model.lower()

    if "gemini" in model_lower:
        return ["image/", "audio/", "video/", "application/pdf", "text/"]

    # Claude 4 ids are spelled "claude-<tier>-4-...", which "claude-4" misses.
    claude_markers = (
        "claude-3",
        "claude-4",
        "claude-sonnet-4",
        "claude-opus-4",
        "claude-haiku-4",
    )
    if any(marker in model_lower for marker in claude_markers):
        return ["image/", "application/pdf"]
    return ["image/"]
|
||||
|
||||
def format_multimodal_content(
    self,
    files: dict[str, FileInput],
    upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
    """Build OpenAI-style ``image_url`` blocks for the given files.

    Each file whose content type matches one of the supported prefixes is
    resolved through a FileResolver configured with ``prefer_upload=False``
    and emitted as a base64 data URL.

    Args:
        files: Dictionary mapping file names to FileInput objects.
        upload_cache: Optional cache; it is forwarded to the FileResolver.

    Returns:
        List of content blocks in OpenAI's expected format; empty when the
        model is not multimodal.
    """
    import base64

    from crewai.utilities.files import (
        FileResolver,
        FileResolverConfig,
        InlineBase64,
    )

    if not self.supports_multimodal():
        return []

    content_blocks: list[dict[str, Any]] = []
    accepted_prefixes = tuple(self.supported_multimodal_content_types())

    # LiteLLM uses OpenAI-compatible format
    resolver = FileResolver(
        config=FileResolverConfig(prefer_upload=False),
        upload_cache=upload_cache,
    )

    for file_input in files.values():
        content_type = file_input.content_type
        if not content_type.startswith(accepted_prefixes):
            continue

        resolved = resolver.resolve(file_input, "openai")

        if isinstance(resolved, InlineBase64):
            mime = resolved.content_type
            payload = resolved.data
        else:
            # Resolver returned something other than inline base64:
            # encode the raw file contents directly.
            payload = base64.b64encode(file_input.read()).decode("ascii")
            mime = content_type

        content_blocks.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime};base64,{payload}"},
            }
        )

    return content_blocks
|
||||
|
||||
@@ -35,6 +35,7 @@ if TYPE_CHECKING:
|
||||
from crewai.agent.core import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.tools.base_tool import BaseTool
|
||||
from crewai.utilities.files import FileInput, UploadCache
|
||||
from crewai.utilities.types import LLMMessage
|
||||
|
||||
|
||||
@@ -280,6 +281,54 @@ class BaseLLM(ABC):
|
||||
# Default implementation - subclasses should override with model-specific values
|
||||
return DEFAULT_CONTEXT_WINDOW_SIZE
|
||||
|
||||
def supports_multimodal(self) -> bool:
    """Report whether this LLM accepts multimodal inputs.

    The base implementation always answers no.

    Returns:
        False in the base class.
    """
    multimodal_capable = False
    return multimodal_capable
|
||||
|
||||
def supported_multimodal_content_types(self) -> list[str]:
    """List the MIME type prefixes this LLM accepts as multimodal input.

    Returns:
        List of supported MIME type prefixes (e.g., ["image/", "application/pdf"]).
        The base implementation returns an empty list.
    """
    no_types: list[str] = []
    return no_types
|
||||
|
||||
def format_multimodal_content(
    self,
    files: dict[str, FileInput],
    upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
    """Turn files into multimodal content blocks for the LLM.

    This base implementation ignores its arguments and produces no blocks;
    subclasses should override it with provider-specific formatting.

    Args:
        files: Dictionary mapping file names to FileInput objects.
        upload_cache: Optional cache for tracking uploaded files.

    Returns:
        List of content blocks in the provider's expected format
        (always empty here).
    """
    no_blocks: list[dict[str, Any]] = []
    return no_blocks
|
||||
|
||||
def format_text_content(self, text: str) -> dict[str, Any]:
    """Wrap plain text in a content block.

    The default shape is the OpenAI/Anthropic one; subclasses should
    override it for provider-specific formatting.

    Args:
        text: The text content to format.

    Returns:
        A dictionary with ``"type"`` set to ``"text"`` and the text under
        ``"text"``.
    """
    text_block: dict[str, Any] = {"type": "text"}
    text_block["text"] = text
    return text_block
|
||||
|
||||
# Common helper methods for native SDK implementations
|
||||
|
||||
def _emit_call_started_event(
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -20,6 +21,9 @@ from crewai.utilities.types import LLMMessage
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from crewai.llms.hooks.base import BaseInterceptor
|
||||
from crewai.utilities.files import FileInput, UploadCache
|
||||
|
||||
DEFAULT_CACHE_TTL = "ephemeral"
|
||||
|
||||
try:
|
||||
from anthropic import Anthropic, AsyncAnthropic
|
||||
@@ -1231,3 +1235,138 @@ class AnthropicCompletion(BaseLLM):
|
||||
"total_tokens": input_tokens + output_tokens,
|
||||
}
|
||||
return {"total_tokens": 0}
|
||||
|
||||
def supports_multimodal(self) -> bool:
    """Check if the model supports multimodal inputs.

    All Claude 3+ models support vision and PDFs. Claude 4 identifiers put
    the tier before the version (``claude-sonnet-4-...``,
    ``claude-opus-4-...``), so those spellings are matched explicitly in
    addition to ``claude-3`` and ``claude-4``.

    Returns:
        True if the model supports images and PDFs.
    """
    model_lower = self.model.lower()
    multimodal_markers = (
        "claude-3",
        "claude-4",
        "claude-sonnet-4",
        "claude-opus-4",
        "claude-haiku-4",
    )
    return any(marker in model_lower for marker in multimodal_markers)
|
||||
|
||||
def supported_multimodal_content_types(self) -> list[str]:
    """List the MIME type prefixes Anthropic accepts as multimodal input.

    Returns:
        ``["image/", "application/pdf"]`` for multimodal models, otherwise
        an empty list.
    """
    return ["image/", "application/pdf"] if self.supports_multimodal() else []
|
||||
|
||||
def format_multimodal_content(
    self,
    files: dict[str, FileInput],
    upload_cache: UploadCache | None = None,
    enable_caching: bool = True,
    cache_ttl: str | None = None,
) -> list[dict[str, Any]]:
    """Format files as Anthropic multimodal content blocks.

    Images become ``image`` blocks and PDFs become ``document`` blocks.
    The block source is a file reference when the FileResolver returns a
    FileReference, and base64 data otherwise.

    When caching is enabled, a ``cache_control`` marker is attached to the
    last block that was actually produced, so the marker is not lost when
    the final input file is skipped.

    Args:
        files: Dictionary mapping file names to FileInput objects.
        upload_cache: Optional cache for tracking uploaded files.
        enable_caching: Whether to add cache_control markers (default: True).
        cache_ttl: Cache TTL - "ephemeral" (5min) or "1h" (1hr for supported models).

    Returns:
        List of content blocks in Anthropic's expected format; empty when
        the model is not multimodal.
    """
    if not self.supports_multimodal():
        return []

    from crewai.utilities.files import (
        FileReference,
        FileResolver,
        FileResolverConfig,
        InlineBase64,
    )

    content_blocks: list[dict[str, Any]] = []
    supported_types = tuple(self.supported_multimodal_content_types())

    config = FileResolverConfig(prefer_upload=False)
    resolver = FileResolver(config=config, upload_cache=upload_cache)

    for file_input in files.values():
        content_type = file_input.content_type
        if not content_type.startswith(supported_types):
            continue

        # Decide the block kind first so files that cannot produce a block
        # are never resolved or read.
        if content_type.startswith("image/"):
            block_type = "image"
        elif content_type == "application/pdf":
            block_type = "document"
        else:
            continue

        resolved = resolver.resolve(file_input, "anthropic")

        source: dict[str, Any]
        if isinstance(resolved, FileReference):
            source = {"type": "file", "file_id": resolved.file_id}
        elif isinstance(resolved, InlineBase64):
            source = {
                "type": "base64",
                "media_type": resolved.content_type,
                "data": resolved.data,
            }
        else:
            # Any other resolved form: encode the raw file contents directly.
            source = {
                "type": "base64",
                "media_type": content_type,
                "data": base64.b64encode(file_input.read()).decode("ascii"),
            }

        content_blocks.append({"type": block_type, "source": source})

    if enable_caching and content_blocks:
        # The cache_control "type" is always "ephemeral"; a non-default
        # lifetime such as "1h" is carried in the separate "ttl" field.
        ttl = cache_ttl or DEFAULT_CACHE_TTL
        cache_control: dict[str, str] = {"type": "ephemeral"}
        if ttl != "ephemeral":
            cache_control["ttl"] = ttl
        content_blocks[-1]["cache_control"] = cache_control

    return content_blocks
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -18,6 +19,7 @@ from crewai.utilities.types import LLMMessage
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from crewai.llms.hooks.base import BaseInterceptor
|
||||
from crewai.utilities.files import FileInput, UploadCache
|
||||
|
||||
|
||||
try:
|
||||
@@ -1016,3 +1018,85 @@ class AzureCompletion(BaseLLM):
|
||||
async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
|
||||
"""Async context manager exit."""
|
||||
await self.aclose()
|
||||
|
||||
def supports_multimodal(self) -> bool:
    """Report whether the configured model accepts image input.

    The lower-cased model name is checked against the prefixes of the
    vision-enabled Azure OpenAI families (GPT-4o and GPT-4 Turbo with
    Vision).

    Returns:
        True if the model supports images.
    """
    vision_prefixes = ("gpt-4o", "gpt-4-turbo", "gpt-4-vision", "gpt-4v")
    model_name = self.model.lower()
    return model_name.startswith(vision_prefixes)
|
||||
|
||||
def supported_multimodal_content_types(self) -> list[str]:
    """List the MIME type prefixes Azure accepts as multimodal input.

    Returns:
        ``["image/"]`` for multimodal models, otherwise an empty list.
    """
    return ["image/"] if self.supports_multimodal() else []
|
||||
|
||||
def format_multimodal_content(
    self,
    files: dict[str, FileInput],
    upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
    """Build Azure OpenAI ``image_url`` blocks for the given files.

    Azure OpenAI uses the same image_url format as OpenAI. Each supported
    file is resolved through a FileResolver configured with
    ``prefer_upload=False`` and emitted as a base64 data URL.

    Args:
        files: Dictionary mapping file names to FileInput objects.
        upload_cache: Optional cache; it is forwarded to the FileResolver.

    Returns:
        List of content blocks in Azure OpenAI's expected format; empty
        when the model is not multimodal.
    """
    if not self.supports_multimodal():
        return []

    from crewai.utilities.files import (
        FileResolver,
        FileResolverConfig,
        InlineBase64,
    )

    content_blocks: list[dict[str, Any]] = []
    accepted_prefixes = tuple(self.supported_multimodal_content_types())

    # Azure doesn't support file uploads for images, so just use inline
    resolver = FileResolver(
        config=FileResolverConfig(prefer_upload=False),
        upload_cache=upload_cache,
    )

    for file_input in files.values():
        content_type = file_input.content_type
        if not content_type.startswith(accepted_prefixes):
            continue

        resolved = resolver.resolve(file_input, "azure")

        if isinstance(resolved, InlineBase64):
            mime = resolved.content_type
            payload = resolved.data
        else:
            # Fallback to direct base64 encoding
            payload = base64.b64encode(file_input.read()).decode("ascii")
            mime = content_type

        content_blocks.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime};base64,{payload}"},
            }
        )

    return content_blocks
|
||||
|
||||
@@ -33,6 +33,7 @@ if TYPE_CHECKING:
|
||||
)
|
||||
|
||||
from crewai.llms.hooks.base import BaseInterceptor
|
||||
from crewai.utilities.files import FileInput, UploadCache
|
||||
|
||||
|
||||
try:
|
||||
@@ -1450,3 +1451,92 @@ class BedrockCompletion(BaseLLM):
|
||||
|
||||
# Default context window size
|
||||
return int(8192 * CONTEXT_WINDOW_USAGE_RATIO)
|
||||
|
||||
def supports_multimodal(self) -> bool:
    """Check if the model supports multimodal inputs.

    Claude models on Bedrock support vision. The marker is searched
    anywhere in the lower-cased model id rather than only at the start, so
    inference-profile ids (``us.anthropic.claude-3-...``) and ARNs that
    embed the model id are recognised too. Claude 4 ids, which place the
    tier before the version, are listed explicitly.

    Returns:
        True if the model supports images.
    """
    model_lower = self.model.lower()
    vision_markers = (
        "anthropic.claude-3",
        "anthropic.claude-sonnet-4",
        "anthropic.claude-opus-4",
        "anthropic.claude-haiku-4",
    )
    return any(marker in model_lower for marker in vision_markers)
|
||||
|
||||
def supported_multimodal_content_types(self) -> list[str]:
    """List the MIME type prefixes Bedrock accepts as multimodal input.

    Returns:
        ``["image/", "application/pdf"]`` for multimodal models, otherwise
        an empty list.
    """
    return ["image/", "application/pdf"] if self.supports_multimodal() else []
|
||||
|
||||
def format_multimodal_content(
    self,
    files: dict[str, FileInput],
    upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
    """Format files as Bedrock Converse API multimodal content blocks.

    Images become ``image`` blocks and PDFs become ``document`` blocks, both
    carrying raw bytes. Files of any other content type are skipped before
    they are resolved or read.

    The document ``name`` is derived from the dictionary key and sanitised,
    because Bedrock only accepts alphanumerics, single spaces, hyphens,
    parentheses and square brackets there; a key such as ``report.pdf``
    would otherwise be rejected.

    Args:
        files: Dictionary mapping file names to FileInput objects.
        upload_cache: Optional cache; it is forwarded to the FileResolver.

    Returns:
        List of content blocks in Bedrock's expected format; empty when the
        model is not multimodal.
    """
    if not self.supports_multimodal():
        return []

    from crewai.utilities.files import (
        FileResolver,
        FileResolverConfig,
        InlineBytes,
    )

    content_blocks: list[dict[str, Any]] = []

    # Bedrock uses raw bytes, configure resolver accordingly
    config = FileResolverConfig(prefer_upload=False, use_bytes_for_bedrock=True)
    resolver = FileResolver(config=config, upload_cache=upload_cache)

    for name, file_input in files.items():
        content_type = file_input.content_type
        is_image = content_type.startswith("image/")
        is_pdf = content_type == "application/pdf"
        if not (is_image or is_pdf):
            # Nothing can be emitted for this file, so do not resolve/read it.
            continue

        resolved = resolver.resolve(file_input, "bedrock")

        if isinstance(resolved, InlineBytes):
            file_bytes = resolved.data
        else:
            # Fallback to reading directly
            file_bytes = file_input.read()

        if is_image:
            media_type = content_type.split("/")[-1]
            if media_type == "jpg":
                media_type = "jpeg"
            content_blocks.append(
                {
                    "image": {
                        "format": media_type,
                        "source": {"bytes": file_bytes},
                    }
                }
            )
        else:
            # Map whitespace to a space, keep permitted characters, and turn
            # everything else (".", "_", "/", ...) into a hyphen.
            cleaned = "".join(
                " "
                if ch.isspace()
                else ch
                if (ch.isascii() and ch.isalnum()) or ch in "-()[]"
                else "-"
                for ch in str(name)
            )
            # split()/join collapses runs of spaces and trims the ends.
            document_name = " ".join(cleaned.split()) or "document"
            content_blocks.append(
                {
                    "document": {
                        "name": document_name,
                        "format": "pdf",
                        "source": {"bytes": file_bytes},
                    }
                }
            )

    return content_blocks
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -19,6 +20,10 @@ from crewai.utilities.types import LLMMessage
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from crewai.llms.hooks.base import BaseInterceptor
|
||||
from crewai.utilities.files import (
|
||||
FileInput,
|
||||
UploadCache,
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
@@ -516,17 +521,31 @@ class GeminiCompletion(BaseLLM):
|
||||
role = message["role"]
|
||||
content = message["content"]
|
||||
|
||||
# Convert content to string if it's a list
|
||||
# Build parts list from content
|
||||
parts: list[types.Part] = []
|
||||
if isinstance(content, list):
|
||||
text_content = " ".join(
|
||||
str(item.get("text", "")) if isinstance(item, dict) else str(item)
|
||||
for item in content
|
||||
)
|
||||
for item in content:
|
||||
if isinstance(item, dict):
|
||||
if "text" in item:
|
||||
parts.append(types.Part.from_text(text=str(item["text"])))
|
||||
elif "inlineData" in item:
|
||||
inline = item["inlineData"]
|
||||
parts.append(
|
||||
types.Part.from_bytes(
|
||||
data=base64.b64decode(inline["data"]),
|
||||
mime_type=inline["mimeType"],
|
||||
)
|
||||
)
|
||||
else:
|
||||
parts.append(types.Part.from_text(text=str(item)))
|
||||
else:
|
||||
text_content = str(content) if content else ""
|
||||
parts.append(types.Part.from_text(text=str(content) if content else ""))
|
||||
|
||||
if role == "system":
|
||||
# Extract system instruction - Gemini handles it separately
|
||||
text_content = " ".join(
|
||||
p.text for p in parts if hasattr(p, "text") and p.text
|
||||
)
|
||||
if system_instruction:
|
||||
system_instruction += f"\n\n{text_content}"
|
||||
else:
|
||||
@@ -536,9 +555,7 @@ class GeminiCompletion(BaseLLM):
|
||||
gemini_role = "model" if role == "assistant" else "user"
|
||||
|
||||
# Create Content object
|
||||
gemini_content = types.Content(
|
||||
role=gemini_role, parts=[types.Part.from_text(text=text_content)]
|
||||
)
|
||||
gemini_content = types.Content(role=gemini_role, parts=parts)
|
||||
contents.append(gemini_content)
|
||||
|
||||
return contents, system_instruction
|
||||
@@ -1060,3 +1077,106 @@ class GeminiCompletion(BaseLLM):
|
||||
)
|
||||
)
|
||||
return result
|
||||
|
||||
def supports_multimodal(self) -> bool:
    """Report whether the model accepts multimodal inputs.

    This provider answers yes unconditionally; the model name is not
    inspected.

    Returns:
        True if the model supports multimodal inputs.
    """
    multimodal_capable = True
    return multimodal_capable
|
||||
|
||||
def supported_multimodal_content_types(self) -> list[str]:
    """List the MIME type prefixes Gemini accepts as multimodal input.

    Returns:
        List of supported MIME type prefixes (image, audio, video, PDF and
        text).
    """
    media_prefixes = ["image/", "audio/", "video/"]
    document_prefixes = ["application/pdf", "text/"]
    return media_prefixes + document_prefixes
|
||||
|
||||
def format_multimodal_content(
    self,
    files: dict[str, FileInput],
    upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
    """Build Gemini content blocks for the given files.

    Each supported file is resolved through a FileResolver configured with
    ``prefer_upload=False``. A FileReference carrying a ``file_uri`` becomes
    a ``fileData`` block; everything else becomes an ``inlineData`` block
    holding base64 data.

    Args:
        files: Dictionary mapping file names to FileInput objects.
        upload_cache: Optional cache for tracking uploaded files.

    Returns:
        List of content blocks in Gemini's expected format.
    """
    from crewai.utilities.files import (
        FileReference,
        FileResolver,
        FileResolverConfig,
        InlineBase64,
    )

    content_blocks: list[dict[str, Any]] = []
    accepted_prefixes = tuple(self.supported_multimodal_content_types())

    # Create resolver with optional cache
    resolver = FileResolver(
        config=FileResolverConfig(prefer_upload=False),
        upload_cache=upload_cache,
    )

    for file_input in files.values():
        content_type = file_input.content_type
        if not content_type.startswith(accepted_prefixes):
            continue

        resolved = resolver.resolve(file_input, "gemini")

        part: dict[str, Any]
        if isinstance(resolved, FileReference) and resolved.file_uri:
            # Use file reference format for uploaded files
            part = {
                "fileData": {
                    "mimeType": resolved.content_type,
                    "fileUri": resolved.file_uri,
                }
            }
        elif isinstance(resolved, InlineBase64):
            # Resolver already produced base64 data
            part = {
                "inlineData": {
                    "mimeType": resolved.content_type,
                    "data": resolved.data,
                }
            }
        else:
            # Fallback to base64 encoding
            encoded = base64.b64encode(file_input.read()).decode("ascii")
            part = {
                "inlineData": {
                    "mimeType": content_type,
                    "data": encoded,
                }
            }

        content_blocks.append(part)

    return content_blocks
|
||||
|
||||
def format_text_content(self, text: str) -> dict[str, Any]:
    """Wrap plain text in a Gemini content block.

    Gemini uses {"text": "..."} format instead of {"type": "text", "text": "..."}.

    Args:
        text: The text content to format.

    Returns:
        A single-key dictionary holding the text under ``"text"``.
    """
    text_part: dict[str, Any] = {}
    text_part["text"] = text
    return text_part
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from collections.abc import AsyncIterator
|
||||
import json
|
||||
import logging
|
||||
@@ -30,6 +31,7 @@ if TYPE_CHECKING:
|
||||
from crewai.llms.hooks.base import BaseInterceptor
|
||||
from crewai.task import Task
|
||||
from crewai.tools.base_tool import BaseTool
|
||||
from crewai.utilities.files import FileInput, UploadCache
|
||||
|
||||
|
||||
class OpenAICompletion(BaseLLM):
|
||||
@@ -1048,3 +1050,101 @@ class OpenAICompletion(BaseLLM):
|
||||
formatted_messages.append(message)
|
||||
|
||||
return formatted_messages
|
||||
|
||||
def supports_multimodal(self) -> bool:
    """Report whether the configured model accepts image input.

    The lower-cased model name is checked against the prefixes of the
    vision-enabled OpenAI families (GPT-4o, GPT-4.1, GPT-4 Turbo/Vision
    and the o-series).

    Returns:
        True if the model supports images.
    """
    vision_prefixes = (
        "gpt-4o",
        "gpt-4.1",
        "gpt-4-turbo",
        "gpt-4-vision",
        "o1",
        "o3",
        "o4",
    )
    model_name = self.model.lower()
    return model_name.startswith(vision_prefixes)
|
||||
|
||||
def supported_multimodal_content_types(self) -> list[str]:
    """List the MIME type prefixes OpenAI accepts as multimodal input.

    Returns:
        ``["image/"]`` for multimodal models, otherwise an empty list.
    """
    return ["image/"] if self.supports_multimodal() else []
|
||||
|
||||
def format_multimodal_content(
    self,
    files: dict[str, FileInput],
    upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
    """Build OpenAI content blocks for the given files.

    Each supported file is resolved through a FileResolver configured with
    ``prefer_upload=False``. A FileReference becomes a ``file`` block that
    carries its ``file_id``; anything else becomes an ``image_url`` block
    with a base64 data URL.

    Args:
        files: Dictionary mapping file names to FileInput objects.
        upload_cache: Optional cache for tracking uploaded files.

    Returns:
        List of content blocks in OpenAI's expected format; empty when the
        model is not multimodal.
    """
    if not self.supports_multimodal():
        return []

    from crewai.utilities.files import (
        FileReference,
        FileResolver,
        FileResolverConfig,
        InlineBase64,
    )

    content_blocks: list[dict[str, Any]] = []
    accepted_prefixes = tuple(self.supported_multimodal_content_types())

    resolver = FileResolver(
        config=FileResolverConfig(prefer_upload=False),
        upload_cache=upload_cache,
    )

    for file_input in files.values():
        content_type = file_input.content_type
        if not content_type.startswith(accepted_prefixes):
            continue

        resolved = resolver.resolve(file_input, "openai")

        block: dict[str, Any]
        if isinstance(resolved, FileReference):
            block = {"type": "file", "file": {"file_id": resolved.file_id}}
        elif isinstance(resolved, InlineBase64):
            block = {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{resolved.content_type};base64,{resolved.data}"
                },
            }
        else:
            # Any other resolved form: encode the raw file contents directly.
            encoded = base64.b64encode(file_input.read()).decode("ascii")
            block = {
                "type": "image_url",
                "image_url": {"url": f"data:{content_type};base64,{encoded}"},
            }

        content_blocks.append(block)

    return content_blocks
|
||||
|
||||
210
lib/crewai/src/crewai/utilities/files/__init__.py
Normal file
210
lib/crewai/src/crewai/utilities/files/__init__.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""File handling utilities for crewAI tasks."""
|
||||
|
||||
from crewai.utilities.files.cleanup import (
|
||||
cleanup_expired_files,
|
||||
cleanup_provider_files,
|
||||
cleanup_uploaded_files,
|
||||
)
|
||||
from crewai.utilities.files.content_types import (
|
||||
AudioContentType,
|
||||
AudioExtension,
|
||||
AudioFile,
|
||||
BaseFile,
|
||||
File,
|
||||
FileMode,
|
||||
ImageContentType,
|
||||
ImageExtension,
|
||||
ImageFile,
|
||||
PDFContentType,
|
||||
PDFExtension,
|
||||
PDFFile,
|
||||
TextContentType,
|
||||
TextExtension,
|
||||
TextFile,
|
||||
VideoContentType,
|
||||
VideoExtension,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.utilities.files.file import (
|
||||
FileBytes,
|
||||
FilePath,
|
||||
FileSource,
|
||||
FileSourceInput,
|
||||
FileStream,
|
||||
RawFileInput,
|
||||
)
|
||||
from crewai.utilities.files.processing import (
|
||||
ANTHROPIC_CONSTRAINTS,
|
||||
BEDROCK_CONSTRAINTS,
|
||||
GEMINI_CONSTRAINTS,
|
||||
OPENAI_CONSTRAINTS,
|
||||
AudioConstraints,
|
||||
FileHandling,
|
||||
FileProcessingError,
|
||||
FileProcessor,
|
||||
FileTooLargeError,
|
||||
FileValidationError,
|
||||
ImageConstraints,
|
||||
PDFConstraints,
|
||||
ProcessingDependencyError,
|
||||
ProviderConstraints,
|
||||
UnsupportedFileTypeError,
|
||||
VideoConstraints,
|
||||
get_constraints_for_provider,
|
||||
)
|
||||
from crewai.utilities.files.resolved import (
|
||||
FileReference,
|
||||
InlineBase64,
|
||||
InlineBytes,
|
||||
ResolvedFile,
|
||||
ResolvedFileType,
|
||||
UrlReference,
|
||||
)
|
||||
from crewai.utilities.files.resolver import (
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
create_resolver,
|
||||
)
|
||||
from crewai.utilities.files.upload_cache import (
|
||||
CachedUpload,
|
||||
UploadCache,
|
||||
get_upload_cache,
|
||||
reset_upload_cache,
|
||||
)
|
||||
from crewai.utilities.files.uploaders import FileUploader, UploadResult, get_uploader
|
||||
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
|
||||
def wrap_file_source(source: FileSource) -> FileInput:
    """Pick the typed FileInput wrapper that matches a source's content type.

    Args:
        source: The file source to wrap.

    Returns:
        An ImageFile, AudioFile or VideoFile for the matching MIME prefix,
        a PDFFile for ``application/pdf``, and a TextFile for anything else.
    """
    content_type = source.content_type

    wrappers_by_prefix = (
        ("image/", ImageFile),
        ("audio/", AudioFile),
        ("video/", VideoFile),
    )
    for prefix, wrapper in wrappers_by_prefix:
        if content_type.startswith(prefix):
            return wrapper(source=source)

    if content_type == "application/pdf":
        return PDFFile(source=source)

    # Default to text for anything else
    return TextFile(source=source)
|
||||
|
||||
|
||||
def normalize_input_files(
    input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
    """Convert a list of file sources to a named dictionary of FileInputs.

    Typed file wrappers are stored as-is under their filename with the last
    extension removed. Raw sources (FilePath/FileBytes/FileStream, ``Path``,
    ``str``, ``bytes``, ``memoryview``) are wrapped by content type and
    stored under their filename unchanged. Items without a filename are
    named ``file_<index>``; items of any other type are skipped.

    When two items map to the same key, the later one gets a numeric suffix
    (``name_2``, ``name_3``, ...) so that no file is silently replaced.

    Args:
        input_files: List of file source inputs or File objects.

    Returns:
        Dictionary mapping names to FileInput wrappers.
    """
    from pathlib import Path

    result: dict[str, FileInput] = {}

    def _unique_key(candidate: str) -> str:
        """Return candidate, or candidate with a numeric suffix if taken."""
        if candidate not in result:
            return candidate
        counter = 2
        while f"{candidate}_{counter}" in result:
            counter += 1
        return f"{candidate}_{counter}"

    for i, item in enumerate(input_files):
        # If it's already a typed File wrapper, use it directly
        if isinstance(item, BaseFile):
            name = item.filename or f"file_{i}"
            # Remove extension from name for cleaner keys
            stem = name.rsplit(".", 1)[0] if "." in name else name
            # A dot-leading name such as ".env" has an empty stem; keep the
            # full name rather than using "" as the key.
            result[_unique_key(stem or name)] = item
            continue

        file_source: FilePath | FileBytes | FileStream
        if isinstance(item, (FilePath, FileBytes, FileStream)):
            file_source = item
        elif isinstance(item, Path):
            file_source = FilePath(path=item)
        elif isinstance(item, str):
            file_source = FilePath(path=Path(item))
        elif isinstance(item, (bytes, memoryview)):
            file_source = FileBytes(data=bytes(item))
        else:
            # Unrecognised input types are ignored (best-effort).
            continue

        name = file_source.filename or f"file_{i}"
        result[_unique_key(name)] = wrap_file_source(file_source)

    return result
|
||||
|
||||
|
||||
# Names exported by ``from <package> import *``; this list is the package's
# declared public API.
__all__ = [
    "ANTHROPIC_CONSTRAINTS",
    "BEDROCK_CONSTRAINTS",
    "GEMINI_CONSTRAINTS",
    "OPENAI_CONSTRAINTS",
    "AudioConstraints",
    "AudioContentType",
    "AudioExtension",
    "AudioFile",
    "BaseFile",
    "CachedUpload",
    "File",
    "FileBytes",
    "FileHandling",
    "FileInput",
    "FileMode",
    "FilePath",
    "FileProcessingError",
    "FileProcessor",
    "FileReference",
    "FileResolver",
    "FileResolverConfig",
    "FileSource",
    "FileSourceInput",
    "FileStream",
    "FileTooLargeError",
    "FileUploader",
    "FileValidationError",
    "ImageConstraints",
    "ImageContentType",
    "ImageExtension",
    "ImageFile",
    "InlineBase64",
    "InlineBytes",
    "PDFConstraints",
    "PDFContentType",
    "PDFExtension",
    "PDFFile",
    "ProcessingDependencyError",
    "ProviderConstraints",
    "RawFileInput",
    "ResolvedFile",
    "ResolvedFileType",
    "TextContentType",
    "TextExtension",
    "TextFile",
    "UnsupportedFileTypeError",
    "UploadCache",
    "UploadResult",
    "UrlReference",
    "VideoConstraints",
    "VideoContentType",
    "VideoExtension",
    "VideoFile",
    "cleanup_expired_files",
    "cleanup_provider_files",
    "cleanup_uploaded_files",
    "create_resolver",
    "get_constraints_for_provider",
    "get_upload_cache",
    "get_uploader",
    "normalize_input_files",
    "reset_upload_cache",
    "wrap_file_source",
]
|
||||
62
lib/crewai/src/crewai/utilities/files/processing/__init__.py
Normal file
62
lib/crewai/src/crewai/utilities/files/processing/__init__.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""File processing module for multimodal content handling.
|
||||
|
||||
This module provides validation, transformation, and processing utilities
|
||||
for files used in multimodal LLM interactions.
|
||||
"""
|
||||
|
||||
from crewai.utilities.files.processing.constraints import (
|
||||
ANTHROPIC_CONSTRAINTS,
|
||||
BEDROCK_CONSTRAINTS,
|
||||
GEMINI_CONSTRAINTS,
|
||||
OPENAI_CONSTRAINTS,
|
||||
AudioConstraints,
|
||||
ImageConstraints,
|
||||
PDFConstraints,
|
||||
ProviderConstraints,
|
||||
VideoConstraints,
|
||||
get_constraints_for_provider,
|
||||
)
|
||||
from crewai.utilities.files.processing.enums import FileHandling
|
||||
from crewai.utilities.files.processing.exceptions import (
|
||||
FileProcessingError,
|
||||
FileTooLargeError,
|
||||
FileValidationError,
|
||||
ProcessingDependencyError,
|
||||
UnsupportedFileTypeError,
|
||||
)
|
||||
from crewai.utilities.files.processing.processor import FileProcessor
|
||||
from crewai.utilities.files.processing.validators import (
|
||||
validate_audio,
|
||||
validate_file,
|
||||
validate_image,
|
||||
validate_pdf,
|
||||
validate_text,
|
||||
validate_video,
|
||||
)
|
||||
|
||||
|
||||
# Names exported by ``from <package> import *``: the provider constraint
# objects, the processing error types, the processor and the validators
# imported above.
__all__ = [
    "ANTHROPIC_CONSTRAINTS",
    "BEDROCK_CONSTRAINTS",
    "GEMINI_CONSTRAINTS",
    "OPENAI_CONSTRAINTS",
    "AudioConstraints",
    "FileHandling",
    "FileProcessingError",
    "FileProcessor",
    "FileTooLargeError",
    "FileValidationError",
    "ImageConstraints",
    "PDFConstraints",
    "ProcessingDependencyError",
    "ProviderConstraints",
    "UnsupportedFileTypeError",
    "VideoConstraints",
    "get_constraints_for_provider",
    "validate_audio",
    "validate_file",
    "validate_image",
    "validate_pdf",
    "validate_text",
    "validate_video",
]
|
||||
@@ -0,0 +1,104 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"What
|
||||
type of document is this? Answer in one word."},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stream":false}'
|
||||
headers:
|
||||
User-Agent:
|
||||
- X-USER-AGENT-XXX
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- ACCEPT-ENCODING-XXX
|
||||
anthropic-version:
|
||||
- '2023-06-01'
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '748'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.anthropic.com
|
||||
x-api-key:
|
||||
- X-API-KEY-XXX
|
||||
x-stainless-arch:
|
||||
- X-STAINLESS-ARCH-XXX
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- X-STAINLESS-OS-XXX
|
||||
x-stainless-package-version:
|
||||
- 0.71.1
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.12.10
|
||||
x-stainless-timeout:
|
||||
- NOT_GIVEN
|
||||
method: POST
|
||||
uri: https://api.anthropic.com/v1/messages
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAA/3WQTUvEMBCG/8ucW2jr7rL25sKCKHrQiyASYjJsw6ZJzUxEKf3vTheLX3hKeJ8n
|
||||
8zIZoY8WPbRgvM4Wy7NyXXbaHXPZVM2qrpoGCnBWhJ4Oqqovd/nBnt92tF1dX+z3u6t7ffO8FYff
|
||||
B5wtJNIHlCBFPweayBHrwBKZGBjl1j6Oi8/4NpPT0cIdUu4RpqcCiOOgEmqKQQAGqzinAJ+A8CVj
|
||||
MDIhZO8LyKfSdgQXhsyK4xEDQVtvmo3UatOhMjKMXQzqp1ItXLD9jy1v5wYcOuwxaa/W/V//i9bd
|
||||
bzoVEDN/j1ayDqZXZ1CxwySLzl9ldbIwTR/rySkqnAEAAA==
|
||||
headers:
|
||||
CF-RAY:
|
||||
- CF-RAY-XXX
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Thu, 22 Jan 2026 00:18:50 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Robots-Tag:
|
||||
- none
|
||||
anthropic-organization-id:
|
||||
- ANTHROPIC-ORGANIZATION-ID-XXX
|
||||
anthropic-ratelimit-input-tokens-limit:
|
||||
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
|
||||
anthropic-ratelimit-input-tokens-remaining:
|
||||
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
|
||||
anthropic-ratelimit-input-tokens-reset:
|
||||
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
|
||||
anthropic-ratelimit-output-tokens-limit:
|
||||
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
|
||||
anthropic-ratelimit-output-tokens-remaining:
|
||||
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
|
||||
anthropic-ratelimit-output-tokens-reset:
|
||||
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
|
||||
anthropic-ratelimit-requests-limit:
|
||||
- '4000'
|
||||
anthropic-ratelimit-requests-remaining:
|
||||
- '3999'
|
||||
anthropic-ratelimit-requests-reset:
|
||||
- '2026-01-22T00:18:50Z'
|
||||
anthropic-ratelimit-tokens-limit:
|
||||
- ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
|
||||
anthropic-ratelimit-tokens-remaining:
|
||||
- ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
|
||||
anthropic-ratelimit-tokens-reset:
|
||||
- ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
request-id:
|
||||
- REQUEST-ID-XXX
|
||||
strict-transport-security:
|
||||
- STS-XXX
|
||||
x-envoy-upstream-service-time:
|
||||
- '750'
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,104 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"max_tokens":4096,"messages":[{"role":"user","content":[{"type":"text","text":"What
|
||||
type of document is this? Answer in one word."},{"type":"document","source":{"type":"base64","media_type":"application/pdf","data":"JVBERi0xLjQKMSAwIG9iaiA8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4gZW5kb2JqCjIgMCBvYmogPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4gZW5kb2JqCjMgMCBvYmogPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSA+PiBlbmRvYmoKeHJlZgowIDQKMDAwMDAwMDAwMCA2NTUzNSBmCjAwMDAwMDAwMDkgMDAwMDAgbgowMDAwMDAwMDU4IDAwMDAwIG4KMDAwMDAwMDExNSAwMDAwMCBuCnRyYWlsZXIgPDwgL1NpemUgNCAvUm9vdCAxIDAgUiA+PgpzdGFydHhyZWYKMTk2CiUlRU9GCg=="},"cache_control":{"type":"ephemeral"}}]}],"model":"claude-3-5-haiku-20241022","stream":false}'
|
||||
headers:
|
||||
User-Agent:
|
||||
- X-USER-AGENT-XXX
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- ACCEPT-ENCODING-XXX
|
||||
anthropic-version:
|
||||
- '2023-06-01'
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '748'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.anthropic.com
|
||||
x-api-key:
|
||||
- X-API-KEY-XXX
|
||||
x-stainless-arch:
|
||||
- X-STAINLESS-ARCH-XXX
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- X-STAINLESS-OS-XXX
|
||||
x-stainless-package-version:
|
||||
- 0.71.1
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.12.10
|
||||
x-stainless-timeout:
|
||||
- NOT_GIVEN
|
||||
method: POST
|
||||
uri: https://api.anthropic.com/v1/messages
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAA/3WQTUvEMBCG/8ucW2hju4eeRUU97EFRFAkhGbZh06Qmk1Up/e9OF4tf7CnhfZ7J
|
||||
y2SCIRh00IF2Khssz8q27JXd51JUoqkrIaAAa1gY0k5W9bbptXo7PD60l/V1f/V0J+5vxQ079DHi
|
||||
YmFKaoccxOCWQKVkEylPHOngCfnWPU+rT/i+kOPRwfb8AuaXAhKFUUZUKXhO0RtJOXr4AglfM3rN
|
||||
4z47V0A+NnYTWD9mkhT26BN09UZsuFPpHqXmx8gGL38r1coZm1NsnV0acOxxwKicbIf//jet+790
|
||||
LiBk+hk1vA7Gg9UoyWLkRZd/MioamOdP24g1JZkBAAA=
|
||||
headers:
|
||||
CF-RAY:
|
||||
- CF-RAY-XXX
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Thu, 22 Jan 2026 00:18:56 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Robots-Tag:
|
||||
- none
|
||||
anthropic-organization-id:
|
||||
- ANTHROPIC-ORGANIZATION-ID-XXX
|
||||
anthropic-ratelimit-input-tokens-limit:
|
||||
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-LIMIT-XXX
|
||||
anthropic-ratelimit-input-tokens-remaining:
|
||||
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-REMAINING-XXX
|
||||
anthropic-ratelimit-input-tokens-reset:
|
||||
- ANTHROPIC-RATELIMIT-INPUT-TOKENS-RESET-XXX
|
||||
anthropic-ratelimit-output-tokens-limit:
|
||||
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-LIMIT-XXX
|
||||
anthropic-ratelimit-output-tokens-remaining:
|
||||
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-REMAINING-XXX
|
||||
anthropic-ratelimit-output-tokens-reset:
|
||||
- ANTHROPIC-RATELIMIT-OUTPUT-TOKENS-RESET-XXX
|
||||
anthropic-ratelimit-requests-limit:
|
||||
- '4000'
|
||||
anthropic-ratelimit-requests-remaining:
|
||||
- '3999'
|
||||
anthropic-ratelimit-requests-reset:
|
||||
- '2026-01-22T00:18:55Z'
|
||||
anthropic-ratelimit-tokens-limit:
|
||||
- ANTHROPIC-RATELIMIT-TOKENS-LIMIT-XXX
|
||||
anthropic-ratelimit-tokens-remaining:
|
||||
- ANTHROPIC-RATELIMIT-TOKENS-REMAINING-XXX
|
||||
anthropic-ratelimit-tokens-reset:
|
||||
- ANTHROPIC-RATELIMIT-TOKENS-RESET-XXX
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
request-id:
|
||||
- REQUEST-ID-XXX
|
||||
strict-transport-security:
|
||||
- STS-XXX
|
||||
x-envoy-upstream-service-time:
|
||||
- '648'
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
329
lib/crewai/tests/llms/test_multimodal_integration.py
Normal file
329
lib/crewai/tests/llms/test_multimodal_integration.py
Normal file
@@ -0,0 +1,329 @@
|
||||
"""Integration tests for LLM multimodal functionality with cassettes.
|
||||
|
||||
These tests make actual API calls (recorded via VCR cassettes) to verify
|
||||
multimodal content is properly sent and processed by each provider.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.llm import LLM
|
||||
from crewai.utilities.files import File, ImageFile, PDFFile, TextFile
|
||||
|
||||
|
||||
# Sample assets live in the "data" directory reached by five ``.parent`` hops
# from this test module.
TEST_DATA_DIR = Path(__file__).parent.parent.parent.parent.parent.joinpath("data")
TEST_IMAGE_PATH = TEST_DATA_DIR.joinpath("revenue_chart.png")
TEST_TEXT_PATH = TEST_DATA_DIR.joinpath("review_guidelines.txt")
|
||||
|
||||
|
||||
@pytest.fixture
def test_image_bytes() -> bytes:
    """Provide the raw bytes of the file at ``TEST_IMAGE_PATH``."""
    with TEST_IMAGE_PATH.open("rb") as handle:
        return handle.read()
|
||||
|
||||
|
||||
@pytest.fixture
def test_text_bytes() -> bytes:
    """Provide the raw bytes of the file at ``TEST_TEXT_PATH``."""
    with TEST_TEXT_PATH.open("rb") as handle:
        return handle.read()
|
||||
|
||||
|
||||
# Minimal PDF for testing (real PDF structure): a catalog, a page tree with
# one page (612x792 MediaBox), an xref table and a trailer.
# The bytes must stay exactly as written: the recorded cassettes contain the
# base64 encoding of this literal in their request bodies.
MINIMAL_PDF = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
196
%%EOF
"""
|
||||
|
||||
|
||||
def _build_multimodal_message(llm: LLM, prompt: str, files: dict) -> list[dict]:
    """Assemble a single user turn combining the prompt with file blocks.

    Args:
        llm: Object used to format both the file content and the prompt text.
        prompt: Instruction text placed first in the message content.
        files: Mapping handed unchanged to ``llm.format_multimodal_content``.

    Returns:
        A one-element message list whose ``content`` holds the formatted
        prompt followed by every formatted file block.
    """
    # Files are formatted before the text, matching the call order the
    # message construction has always used.
    file_blocks = llm.format_multimodal_content(files)
    text_block = llm.format_text_content(prompt)
    user_turn = {"role": "user", "content": [text_block, *file_blocks]}
    return [user_turn]
|
||||
|
||||
|
||||
class TestOpenAIMultimodalIntegration:
    """OpenAI multimodal round-trips, replayed from VCR cassettes."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """An image attachment produces a non-empty string answer."""
        client = LLM(model="openai/gpt-4o-mini")
        attachments = {"image": ImageFile(source=test_image_bytes)}
        conversation = _build_multimodal_message(
            client,
            "Describe this image in one sentence. Be brief.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
|
||||
|
||||
class TestAnthropicMultimodalIntegration:
    """Anthropic multimodal round-trips, replayed from VCR cassettes."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """An image attachment produces a non-empty string answer."""
        client = LLM(model="anthropic/claude-3-5-haiku-20241022")
        attachments = {"image": ImageFile(source=test_image_bytes)}
        conversation = _build_multimodal_message(
            client,
            "Describe this image in one sentence. Be brief.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0

    @pytest.mark.vcr()
    def test_analyze_pdf(self) -> None:
        """A PDF attachment produces a non-empty string answer."""
        client = LLM(model="anthropic/claude-3-5-haiku-20241022")
        attachments = {"document": PDFFile(source=MINIMAL_PDF)}
        conversation = _build_multimodal_message(
            client,
            "What type of document is this? Answer in one word.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
|
||||
|
||||
class TestGeminiMultimodalIntegration:
    """Gemini multimodal round-trips, replayed from VCR cassettes."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """An image attachment produces a non-empty string answer."""
        client = LLM(model="gemini/gemini-2.0-flash")
        attachments = {"image": ImageFile(source=test_image_bytes)}
        conversation = _build_multimodal_message(
            client,
            "Describe this image in one sentence. Be brief.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0

    @pytest.mark.vcr()
    def test_analyze_text_file(self, test_text_bytes: bytes) -> None:
        """A text-file attachment produces a non-empty string answer."""
        client = LLM(model="gemini/gemini-2.0-flash")
        attachments = {"readme": TextFile(source=test_text_bytes)}
        conversation = _build_multimodal_message(
            client,
            "Summarize what this text file says in one sentence.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
|
||||
|
||||
class TestLiteLLMMultimodalIntegration:
    """Multimodal round-trips with ``is_litellm=True``, replayed from cassettes."""

    @pytest.mark.vcr()
    def test_describe_image_gpt4o(self, test_image_bytes: bytes) -> None:
        """``gpt-4o-mini`` with an image returns a non-empty string."""
        client = LLM(model="gpt-4o-mini", is_litellm=True)
        attachments = {"image": ImageFile(source=test_image_bytes)}
        conversation = _build_multimodal_message(
            client,
            "Describe this image in one sentence. Be brief.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0

    @pytest.mark.vcr()
    def test_describe_image_claude(self, test_image_bytes: bytes) -> None:
        """The Claude model with an image returns a non-empty string."""
        client = LLM(model="anthropic/claude-3-5-haiku-20241022", is_litellm=True)
        attachments = {"image": ImageFile(source=test_image_bytes)}
        conversation = _build_multimodal_message(
            client,
            "Describe this image in one sentence. Be brief.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
|
||||
|
||||
class TestMultipleFilesIntegration:
    """Requests that carry more than one attachment, replayed from cassettes."""

    @pytest.mark.vcr()
    def test_multiple_images_openai(self, test_image_bytes: bytes) -> None:
        """Two image attachments are both counted in the OpenAI answer."""
        client = LLM(model="openai/gpt-4o-mini")
        attachments = {
            "image1": ImageFile(source=test_image_bytes),
            "image2": ImageFile(source=test_image_bytes),
        }
        conversation = _build_multimodal_message(
            client,
            "How many images do you see? Answer with just the number.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        # Accept either the digit or the spelled-out word.
        assert "2" in answer or "two" in answer.lower()

    @pytest.mark.vcr()
    def test_mixed_content_anthropic(self, test_image_bytes: bytes) -> None:
        """An image plus a PDF in one request yields a non-empty string."""
        client = LLM(model="anthropic/claude-3-5-haiku-20241022")
        attachments = {
            "image": ImageFile(source=test_image_bytes),
            "document": PDFFile(source=MINIMAL_PDF),
        }
        conversation = _build_multimodal_message(
            client,
            "What types of files did I send you? List them briefly.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
|
||||
|
||||
class TestGenericFileIntegration:
    """Round-trips using the generic ``File`` wrapper instead of typed ones."""

    @pytest.mark.vcr()
    def test_generic_file_image_openai(self, test_image_bytes: bytes) -> None:
        """Image bytes wrapped in ``File`` yield a non-empty OpenAI answer."""
        client = LLM(model="openai/gpt-4o-mini")
        attachments = {"image": File(source=test_image_bytes)}
        conversation = _build_multimodal_message(
            client,
            "Describe this image in one sentence. Be brief.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0

    @pytest.mark.vcr()
    def test_generic_file_pdf_anthropic(self) -> None:
        """PDF bytes wrapped in ``File`` yield a non-empty Anthropic answer."""
        client = LLM(model="anthropic/claude-3-5-haiku-20241022")
        attachments = {"document": File(source=MINIMAL_PDF)}
        conversation = _build_multimodal_message(
            client,
            "What type of document is this? Answer in one word.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0

    @pytest.mark.vcr()
    def test_generic_file_text_gemini(self, test_text_bytes: bytes) -> None:
        """Text bytes wrapped in ``File`` yield a non-empty Gemini answer."""
        client = LLM(model="gemini/gemini-2.0-flash")
        attachments = {"content": File(source=test_text_bytes)}
        conversation = _build_multimodal_message(
            client,
            "Summarize what this text says in one sentence.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0

    @pytest.mark.vcr()
    def test_generic_file_mixed_types(self, test_image_bytes: bytes) -> None:
        """An image and a PDF, both wrapped in ``File``, work in one request."""
        client = LLM(model="anthropic/claude-3-5-haiku-20241022")
        attachments = {
            "chart": File(source=test_image_bytes),
            "doc": File(source=MINIMAL_PDF),
        }
        conversation = _build_multimodal_message(
            client,
            "What types of files did I send? List them briefly.",
            attachments,
        )

        answer = client.call(conversation)

        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
Reference in New Issue
Block a user