feat: native multimodal file handling; openai responses api

- add input_files parameter to Crew.kickoff(), Flow.kickoff(), Task, and Agent.kickoff()
- add provider-specific file uploaders for OpenAI, Anthropic, Gemini, and Bedrock
- add file type detection, constraint validation, and automatic format conversion
- add URL file source support for multimodal content
- add streaming uploads for large files
- add prompt caching support for Anthropic
- add OpenAI Responses API support
This commit is contained in:
Greyson LaLonde
2026-01-23 15:13:25 -05:00
committed by GitHub
parent bd4d039f63
commit c4c9208229
303 changed files with 46268 additions and 8097 deletions

View File

@@ -6,7 +6,7 @@ import openai
import pytest
from crewai.llm import LLM
from crewai.llms.providers.openai.completion import OpenAICompletion
from crewai.llms.providers.openai.completion import OpenAICompletion, ResponsesAPIResult
from crewai.crew import Crew
from crewai.agent import Agent
from crewai.task import Task
@@ -43,6 +43,7 @@ def test_openai_is_default_provider_without_explicit_llm_set_on_agent():
role="Research Assistant",
goal="Find information about the population of Tokyo",
backstory="You are a helpful research assistant.",
llm=LLM(model="gpt-4o-mini"),
)
task = Task(
description="Find information about the population of Tokyo",
@@ -52,7 +53,7 @@ def test_openai_is_default_provider_without_explicit_llm_set_on_agent():
crew = Crew(agents=[agent], tasks=[task])
crew.kickoff()
assert crew.agents[0].llm.__class__.__name__ == "OpenAICompletion"
assert crew.agents[0].llm.model == DEFAULT_LLM_MODEL
assert crew.agents[0].llm.model == "gpt-4o-mini"
@@ -621,3 +622,773 @@ def test_openai_streaming_returns_usage_metrics():
assert result.token_usage.prompt_tokens > 0
assert result.token_usage.completion_tokens > 0
assert result.token_usage.successful_requests >= 1
def test_openai_responses_api_initialization():
    """Verify OpenAICompletion accepts api='responses' plus Responses-only options."""
    completion = OpenAICompletion(
        model="gpt-5",
        api="responses",
        instructions="You are a helpful assistant.",
        store=True,
    )
    # Every constructor argument should round-trip onto the instance.
    assert completion.model == "gpt-5"
    assert completion.api == "responses"
    assert completion.instructions == "You are a helpful assistant."
    assert completion.store is True


def test_openai_responses_api_default_is_completions():
    """Verify the api mode defaults to 'completions' for backward compatibility."""
    completion = OpenAICompletion(model="gpt-4o")
    assert completion.api == "completions"
def test_openai_responses_api_prepare_params():
    """Verify _prepare_responses_params builds the expected request payload."""
    completion = OpenAICompletion(
        model="gpt-5",
        api="responses",
        instructions="Base instructions.",
        store=True,
        temperature=0.7,
    )
    conversation = [
        {"role": "system", "content": "System message."},
        {"role": "user", "content": "Hello!"},
    ]
    prepared = completion._prepare_responses_params(conversation)
    # System messages are folded into instructions alongside the base ones.
    assert "Base instructions." in prepared["instructions"]
    assert "System message." in prepared["instructions"]
    # Only non-system messages remain in the input payload.
    assert prepared["input"] == [{"role": "user", "content": "Hello!"}]
    assert prepared["model"] == "gpt-5"
    assert prepared["store"] is True
    assert prepared["temperature"] == 0.7
def test_openai_responses_api_tool_format():
    """Verify tool specs are rewritten into the flat (internally-tagged) Responses schema."""
    completion = OpenAICompletion(model="gpt-5", api="responses")
    tool_specs = [
        {
            "name": "get_weather",
            "description": "Get the weather for a location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        }
    ]
    converted = completion._convert_tools_for_responses(tool_specs)
    assert len(converted) == 1
    entry = converted[0]
    # Responses API tools carry name/description at the top level: no nested wrapper.
    assert "function" not in entry
    assert entry["type"] == "function"
    assert entry["name"] == "get_weather"
    assert entry["description"] == "Get the weather for a location"
    assert "parameters" in entry


def test_openai_completions_api_tool_format():
    """Verify tool specs keep the nested (externally-tagged) Chat Completions schema."""
    completion = OpenAICompletion(model="gpt-4o", api="completions")
    tool_specs = [
        {
            "name": "get_weather",
            "description": "Get the weather for a location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        }
    ]
    # NOTE(review): the method name reads like a typo for "_convert_tools_for_inference";
    # confirm against the provider implementation before renaming anything.
    converted = completion._convert_tools_for_interference(tool_specs)
    assert len(converted) == 1
    entry = converted[0]
    # Chat Completions tools are wrapped under a "function" key.
    assert entry["type"] == "function"
    assert "function" in entry
    assert entry["function"]["name"] == "get_weather"
    assert entry["function"]["description"] == "Get the weather for a location"
def test_openai_responses_api_structured_output_format():
    """Verify response models are surfaced via text.format for the Responses API."""
    from pydantic import BaseModel

    class Person(BaseModel):
        name: str
        age: int

    completion = OpenAICompletion(model="gpt-5", api="responses")
    prepared = completion._prepare_responses_params(
        [{"role": "user", "content": "Extract: Jane, 25"}], response_model=Person
    )
    assert "text" in prepared
    assert "format" in prepared["text"]
    fmt = prepared["text"]["format"]
    # Structured outputs translate to a strict JSON-schema format block.
    assert fmt["type"] == "json_schema"
    assert fmt["name"] == "Person"
    assert fmt["strict"] is True
def test_openai_responses_api_with_previous_response_id():
    """Verify an explicit previous_response_id is forwarded for multi-turn chats."""
    completion = OpenAICompletion(
        model="gpt-5",
        api="responses",
        previous_response_id="resp_abc123",
        store=True,
    )
    prepared = completion._prepare_responses_params(
        [{"role": "user", "content": "Continue our conversation."}]
    )
    assert prepared["store"] is True
    assert prepared["previous_response_id"] == "resp_abc123"


def test_openai_responses_api_call_routing():
    """Verify call() dispatches to the implementation matching the api mode."""
    from unittest.mock import patch, MagicMock

    via_completions = OpenAICompletion(model="gpt-4o", api="completions")
    via_responses = OpenAICompletion(model="gpt-5", api="responses")

    with patch.object(
        via_completions, "_call_completions", return_value="completions result"
    ) as completions_stub:
        outcome = via_completions.call("Hello")
    completions_stub.assert_called_once()
    assert outcome == "completions result"

    with patch.object(
        via_responses, "_call_responses", return_value="responses result"
    ) as responses_stub:
        outcome = via_responses.call("Hello")
    responses_stub.assert_called_once()
    assert outcome == "responses result"
# =============================================================================
# VCR Integration Tests for Responses API
# =============================================================================
@pytest.mark.vcr()
def test_openai_responses_api_basic_call():
    """Verify a plain text-generation round trip through the Responses API."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        instructions="You are a helpful assistant. Be concise.",
    )
    answer = completion.call("What is 2 + 2? Answer with just the number.")
    assert isinstance(answer, str)
    assert "4" in answer


@pytest.mark.vcr()
def test_openai_responses_api_with_structured_output():
    """Verify a Pydantic response_model yields a parsed model instance."""
    from pydantic import BaseModel, Field

    class MathAnswer(BaseModel):
        """Structured math answer."""

        result: int = Field(description="The numerical result")
        explanation: str = Field(description="Brief explanation")

    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
    )
    answer = completion.call("What is 5 * 7?", response_model=MathAnswer)
    assert isinstance(answer, MathAnswer)
    assert answer.result == 35


@pytest.mark.vcr()
def test_openai_responses_api_with_system_message_extraction():
    """Verify system messages are honored after extraction into instructions."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
    )
    conversation = [
        {"role": "system", "content": "You always respond in uppercase letters only."},
        {"role": "user", "content": "Say hello"},
    ]
    answer = completion.call(conversation)
    assert isinstance(answer, str)
    assert answer.isupper() or "HELLO" in answer.upper()


@pytest.mark.vcr()
def test_openai_responses_api_streaming():
    """Verify streaming mode still returns the full concatenated text."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        stream=True,
        instructions="Be very concise.",
    )
    answer = completion.call("Count from 1 to 3, separated by commas.")
    assert isinstance(answer, str)
    for digit in ("1", "2", "3"):
        assert digit in answer


@pytest.mark.vcr()
def test_openai_responses_api_returns_usage_metrics():
    """Verify token accounting is populated after a Responses API call."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
    )
    completion.call("Say hello")
    usage = completion.get_token_usage_summary()
    assert usage.prompt_tokens > 0
    assert usage.completion_tokens > 0
    assert usage.total_tokens > 0
def test_openai_responses_api_builtin_tools_param():
    """Verify builtin_tools are stored and expanded into the tools payload."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        builtin_tools=["web_search", "code_interpreter"],
    )
    assert completion.builtin_tools == ["web_search", "code_interpreter"]
    prepared = completion._prepare_responses_params([{"role": "user", "content": "Test"}])
    assert "tools" in prepared
    kinds = {tool["type"] for tool in prepared["tools"]}
    # "web_search" maps onto OpenAI's preview tool identifier.
    assert "web_search_preview" in kinds
    assert "code_interpreter" in kinds


def test_openai_responses_api_builtin_tools_with_custom_tools():
    """Verify builtin tools and custom function tools merge into one list."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        builtin_tools=["web_search"],
    )
    function_tools = [
        {
            "name": "get_weather",
            "description": "Get weather for a location",
            "parameters": {"type": "object", "properties": {}},
        }
    ]
    prepared = completion._prepare_responses_params(
        [{"role": "user", "content": "Test"}], tools=function_tools
    )
    assert len(prepared["tools"]) == 2
    kinds = {tool.get("type") for tool in prepared["tools"]}
    assert "web_search_preview" in kinds
    assert "function" in kinds
@pytest.mark.vcr()
def test_openai_responses_api_with_web_search():
    """Verify a call completes when the web_search builtin is enabled."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        builtin_tools=["web_search"],
    )
    answer = completion.call("What is the current population of Tokyo? Be brief.")
    assert isinstance(answer, str)
    assert len(answer) > 0
def test_responses_api_result_dataclass():
    """Verify ResponsesAPIResult defaults and helper predicates on a fresh instance."""
    outcome = ResponsesAPIResult(
        text="Hello, world!",
        response_id="resp_123",
    )
    assert outcome.text == "Hello, world!"
    assert outcome.response_id == "resp_123"
    # Every tool/reasoning bucket starts out empty.
    for bucket in (
        outcome.web_search_results,
        outcome.file_search_results,
        outcome.code_interpreter_results,
        outcome.computer_use_results,
        outcome.reasoning_summaries,
        outcome.function_calls,
    ):
        assert bucket == []
    assert not outcome.has_tool_outputs()
    assert not outcome.has_reasoning()


def test_responses_api_result_has_tool_outputs():
    """Verify has_tool_outputs() fires for any populated tool bucket."""
    with_web = ResponsesAPIResult(
        text="Test",
        web_search_results=[{"id": "ws_1", "status": "completed", "type": "web_search_call"}],
    )
    assert with_web.has_tool_outputs()
    with_files = ResponsesAPIResult(
        text="Test",
        file_search_results=[{"id": "fs_1", "status": "completed", "type": "file_search_call", "queries": [], "results": []}],
    )
    assert with_files.has_tool_outputs()


def test_responses_api_result_has_reasoning():
    """Verify has_reasoning() tracks the reasoning_summaries bucket."""
    with_reasoning = ResponsesAPIResult(
        text="Test",
        reasoning_summaries=[{"id": "r_1", "type": "reasoning", "summary": []}],
    )
    assert with_reasoning.has_reasoning()
    assert not ResponsesAPIResult(text="Test").has_reasoning()
def test_openai_responses_api_parse_tool_outputs_param():
    """Verify parse_tool_outputs=True is stored on the instance."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        parse_tool_outputs=True,
    )
    assert completion.parse_tool_outputs is True


def test_openai_responses_api_parse_tool_outputs_default_false():
    """Verify parse_tool_outputs stays off unless requested."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
    )
    assert completion.parse_tool_outputs is False


@pytest.mark.vcr()
def test_openai_responses_api_with_parse_tool_outputs():
    """Verify parse_tool_outputs plus built-in tools yields a ResponsesAPIResult."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        builtin_tools=["web_search"],
        parse_tool_outputs=True,
    )
    outcome = completion.call("What is the current population of Tokyo? Be very brief.")
    assert isinstance(outcome, ResponsesAPIResult)
    assert len(outcome.text) > 0
    assert outcome.response_id is not None
    # The web_search builtin should have fired for this query.
    assert len(outcome.web_search_results) > 0
    assert outcome.has_tool_outputs()


@pytest.mark.vcr()
def test_openai_responses_api_parse_tool_outputs_basic_call():
    """Verify parse_tool_outputs works with no built-in tools configured."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        parse_tool_outputs=True,
    )
    outcome = completion.call("Say hello in exactly 3 words.")
    assert isinstance(outcome, ResponsesAPIResult)
    assert len(outcome.text) > 0
    assert outcome.response_id is not None
    # With no builtins configured, no tool-output bucket gets filled.
    assert not outcome.has_tool_outputs()
# ============================================================================
# Auto-Chaining Tests (Responses API)
# ============================================================================
def test_openai_responses_api_auto_chain_param():
    """Verify auto_chain=True is stored and the chain starts empty."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
    )
    assert completion.auto_chain is True
    assert completion._last_response_id is None


def test_openai_responses_api_auto_chain_default_false():
    """Verify auto_chain stays off unless requested."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
    )
    assert completion.auto_chain is False


def test_openai_responses_api_last_response_id_property():
    """Verify last_response_id mirrors the internal tracking attribute."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
    )
    assert completion.last_response_id is None
    completion._last_response_id = "resp_test_123"
    assert completion.last_response_id == "resp_test_123"


def test_openai_responses_api_reset_chain():
    """Verify reset_chain() drops the tracked response ID."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
    )
    completion._last_response_id = "resp_test_123"
    assert completion.last_response_id == "resp_test_123"
    completion.reset_chain()
    assert completion.last_response_id is None
def test_openai_responses_api_auto_chain_prepare_params():
    """Verify the tracked response ID is injected once one exists."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
    )
    # Before any call there is nothing to chain from.
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "previous_response_id" not in prepared
    # Once a response ID is tracked, subsequent payloads carry it.
    completion._last_response_id = "resp_previous_123"
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert prepared.get("previous_response_id") == "resp_previous_123"


def test_openai_responses_api_explicit_previous_response_id_takes_precedence():
    """Verify a constructor-supplied ID wins over the auto-chained one."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
        previous_response_id="resp_explicit_456",
    )
    completion._last_response_id = "resp_auto_123"
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert prepared.get("previous_response_id") == "resp_explicit_456"


def test_openai_responses_api_auto_chain_disabled_no_tracking():
    """Verify a stale internal ID is ignored when auto_chain is off."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=False,
    )
    completion._last_response_id = "resp_should_not_use"
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "previous_response_id" not in prepared
@pytest.mark.vcr()
def test_openai_responses_api_auto_chain_integration():
    """Verify response IDs are tracked and refreshed across real calls."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        auto_chain=True,
    )
    assert completion.last_response_id is None
    first_answer = completion.call("My name is Alice. Remember this.")
    initial_id = completion.last_response_id
    assert initial_id is not None
    assert initial_id.startswith("resp_")
    second_answer = completion.call("What is my name?")
    # The second call chains from the first and records a fresh ID.
    assert completion.last_response_id is not None
    assert completion.last_response_id != initial_id
    assert isinstance(first_answer, str)
    assert isinstance(second_answer, str)


@pytest.mark.vcr()
def test_openai_responses_api_auto_chain_with_reset():
    """Verify reset_chain() severs the thread before a fresh conversation."""
    completion = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        auto_chain=True,
    )
    completion.call("My favorite color is blue.")
    original_chain_id = completion.last_response_id
    assert original_chain_id is not None
    completion.reset_chain()
    assert completion.last_response_id is None
    completion.call("Hello!")
    fresh_chain_id = completion.last_response_id
    assert fresh_chain_id is not None
    # A brand-new conversation yields a different response ID.
    assert fresh_chain_id != original_chain_id
# =============================================================================
# Encrypted Reasoning for ZDR (Zero Data Retention) Tests
# =============================================================================
def test_openai_responses_api_auto_chain_reasoning_param():
    """Verify auto_chain_reasoning=True is stored and starts with no items."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )
    assert completion.auto_chain_reasoning is True
    assert completion._last_reasoning_items is None


def test_openai_responses_api_auto_chain_reasoning_default_false():
    """Verify auto_chain_reasoning stays off unless requested."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
    )
    assert completion.auto_chain_reasoning is False


def test_openai_responses_api_last_reasoning_items_property():
    """Verify last_reasoning_items mirrors the internal attribute."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )
    assert completion.last_reasoning_items is None
    stub_items = [{"id": "rs_test_123", "type": "reasoning"}]
    completion._last_reasoning_items = stub_items
    assert completion.last_reasoning_items == stub_items


def test_openai_responses_api_reset_reasoning_chain():
    """Verify reset_reasoning_chain() drops the stored reasoning items."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )
    stub_items = [{"id": "rs_test_123", "type": "reasoning"}]
    completion._last_reasoning_items = stub_items
    assert completion.last_reasoning_items == stub_items
    completion.reset_reasoning_chain()
    assert completion.last_reasoning_items is None
def test_openai_responses_api_auto_chain_reasoning_adds_include():
    """Verify reasoning.encrypted_content is requested via include."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "include" in prepared
    assert "reasoning.encrypted_content" in prepared["include"]


def test_openai_responses_api_auto_chain_reasoning_preserves_existing_include():
    """Verify user-supplied include entries survive the reasoning addition."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
        include=["file_search_call.results"],
    )
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "include" in prepared
    assert "file_search_call.results" in prepared["include"]
    assert "reasoning.encrypted_content" in prepared["include"]


def test_openai_responses_api_auto_chain_reasoning_no_duplicate_include():
    """Verify reasoning.encrypted_content is never added twice."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
        include=["reasoning.encrypted_content"],
    )
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "include" in prepared
    # The entry must appear exactly once despite being pre-seeded.
    assert prepared["include"].count("reasoning.encrypted_content") == 1
def test_openai_responses_api_auto_chain_reasoning_prepends_to_input():
    """Verify stored reasoning items are replayed ahead of the new message."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )
    # NOTE(review): MagicMock is referenced here without a visible module-level
    # import in this hunk — confirm the file's top-level imports cover it.
    stub_reasoning = MagicMock()
    stub_reasoning.type = "reasoning"
    stub_reasoning.id = "rs_test_123"
    completion._last_reasoning_items = [stub_reasoning]
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    # The reasoning item leads, followed by the actual user message.
    assert len(prepared["input"]) == 2
    assert prepared["input"][0] == stub_reasoning
    assert prepared["input"][1]["role"] == "user"


def test_openai_responses_api_auto_chain_reasoning_disabled_no_include():
    """Verify no reasoning include entry is injected when the flag is off."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=False,
    )
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "include" not in prepared or "reasoning.encrypted_content" not in prepared.get("include", [])


def test_openai_responses_api_auto_chain_reasoning_disabled_no_prepend():
    """Verify stale reasoning items are not replayed when the flag is off."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=False,
    )
    stub_reasoning = MagicMock()
    stub_reasoning.type = "reasoning"
    completion._last_reasoning_items = [stub_reasoning]
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    # Only the user message survives; the reasoning item is dropped.
    assert len(prepared["input"]) == 1
    assert prepared["input"][0]["role"] == "user"
def test_openai_responses_api_both_auto_chains_work_together():
    """Verify auto_chain and auto_chain_reasoning compose in one payload."""
    completion = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
        auto_chain_reasoning=True,
    )
    assert completion.auto_chain is True
    assert completion.auto_chain_reasoning is True
    assert completion._last_response_id is None
    assert completion._last_reasoning_items is None
    # Seed both chaining mechanisms.
    completion._last_response_id = "resp_123"
    stub_reasoning = MagicMock()
    stub_reasoning.type = "reasoning"
    completion._last_reasoning_items = [stub_reasoning]
    prepared = completion._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    # The payload reflects both: chained ID, encrypted-reasoning include, replayed item.
    assert prepared.get("previous_response_id") == "resp_123"
    assert "reasoning.encrypted_content" in prepared["include"]
    assert len(prepared["input"]) == 2  # Reasoning item + message

View File

@@ -0,0 +1,375 @@
"""Unit tests for LLM multimodal functionality across all providers."""
import base64
import os
from unittest.mock import patch
import pytest
from crewai.llm import LLM
from crewai_files import ImageFile, PDFFile, TextFile, format_multimodal_content
# Check for optional provider dependencies
try:
from crewai.llms.providers.anthropic.completion import AnthropicCompletion
HAS_ANTHROPIC = True
except ImportError:
HAS_ANTHROPIC = False
try:
from crewai.llms.providers.azure.completion import AzureCompletion
HAS_AZURE = True
except ImportError:
HAS_AZURE = False
try:
from crewai.llms.providers.bedrock.completion import BedrockCompletion
HAS_BEDROCK = True
except ImportError:
HAS_BEDROCK = False
# Minimal valid PNG for testing: the 8-byte PNG signature, an IHDR chunk
# describing a 1x1 image, and the IEND trailer — tiny, but enough to pass
# magic-byte / mime sniffing in file-type detection.
MINIMAL_PNG = (
    b"\x89PNG\r\n\x1a\n"
    b"\x00\x00\x00\rIHDR"
    b"\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00"
    b"\x90wS\xde"
    b"\x00\x00\x00\x00IEND\xaeB`\x82"
)
# Bare PDF header plus filler text — enough for format detection in tests.
MINIMAL_PDF = b"%PDF-1.4 test content"
@pytest.fixture(autouse=True)
def mock_api_keys():
    """Inject fake credentials for every provider so no test needs real keys."""
    fake_credentials = {
        "ANTHROPIC_API_KEY": "test-key",
        "OPENAI_API_KEY": "test-key",
        "GOOGLE_API_KEY": "test-key",
        "AZURE_API_KEY": "test-key",
        "AWS_ACCESS_KEY_ID": "test-key",
        "AWS_SECRET_ACCESS_KEY": "test-key",
    }
    # patch.dict restores the original environment when the test finishes.
    with patch.dict(os.environ, fake_credentials):
        yield
class TestLiteLLMMultimodal:
    """Tests for LLM class (litellm wrapper) multimodal functionality.

    These tests use `is_litellm=True` to ensure the litellm wrapper is used
    instead of native providers.
    """

    def test_supports_multimodal_gpt4o(self) -> None:
        """GPT-4o reports multimodal support."""
        assert LLM(model="gpt-4o", is_litellm=True).supports_multimodal() is True

    def test_supports_multimodal_gpt4_turbo(self) -> None:
        """GPT-4 Turbo reports multimodal support."""
        assert LLM(model="gpt-4-turbo", is_litellm=True).supports_multimodal() is True

    def test_supports_multimodal_claude3(self) -> None:
        """Claude 3 via litellm reports multimodal support."""
        # The litellm/ prefix avoids importing the native Anthropic provider.
        assert LLM(model="litellm/claude-3-sonnet-20240229").supports_multimodal() is True

    def test_supports_multimodal_gemini(self) -> None:
        """Gemini reports multimodal support."""
        assert LLM(model="gemini/gemini-pro", is_litellm=True).supports_multimodal() is True

    def test_supports_multimodal_gpt35_does_not(self) -> None:
        """GPT-3.5 reports no multimodal support."""
        assert LLM(model="gpt-3.5-turbo", is_litellm=True).supports_multimodal() is False

    def test_format_multimodal_content_image(self) -> None:
        """Images become data-URL image_url parts."""
        llm = LLM(model="gpt-4o", is_litellm=True)
        attachments = {"chart": ImageFile(source=MINIMAL_PNG)}
        parts = format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model)
        assert len(parts) == 1
        assert parts[0]["type"] == "image_url"
        assert "data:image/png;base64," in parts[0]["image_url"]["url"]

    def test_format_multimodal_content_unsupported_type(self) -> None:
        """Content types the provider cannot take are silently dropped."""
        llm = LLM(model="gpt-4o", is_litellm=True)  # OpenAI doesn't support text files
        attachments = {"doc": TextFile(source=b"hello world")}
        assert format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model) == []
@pytest.mark.skipif(not HAS_ANTHROPIC, reason="Anthropic SDK not installed")
class TestAnthropicMultimodal:
    """Tests for Anthropic provider multimodal functionality."""

    def test_supports_multimodal_claude3(self) -> None:
        """Claude 3 reports multimodal support."""
        assert LLM(model="anthropic/claude-3-sonnet-20240229").supports_multimodal() is True

    def test_supports_multimodal_claude4(self) -> None:
        """Claude 4 reports multimodal support."""
        assert LLM(model="anthropic/claude-4-opus").supports_multimodal() is True

    def test_format_multimodal_content_image(self) -> None:
        """Images use Anthropic's base64 source envelope."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        attachments = {"chart": ImageFile(source=MINIMAL_PNG)}
        parts = format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model)
        assert len(parts) == 1
        assert parts[0]["type"] == "image"
        assert parts[0]["source"]["type"] == "base64"
        assert parts[0]["source"]["media_type"] == "image/png"
        assert "data" in parts[0]["source"]

    def test_format_multimodal_content_pdf(self) -> None:
        """PDFs use Anthropic's document envelope."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        attachments = {"doc": PDFFile(source=MINIMAL_PDF)}
        parts = format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model)
        assert len(parts) == 1
        assert parts[0]["type"] == "document"
        assert parts[0]["source"]["type"] == "base64"
        assert parts[0]["source"]["media_type"] == "application/pdf"
class TestOpenAIMultimodal:
    """Tests for OpenAI provider multimodal functionality."""

    def test_supports_multimodal_gpt4o(self) -> None:
        """GPT-4o reports multimodal support."""
        assert LLM(model="openai/gpt-4o").supports_multimodal() is True

    def test_supports_multimodal_gpt4_vision(self) -> None:
        """GPT-4 Vision reports multimodal support."""
        assert LLM(model="openai/gpt-4-vision-preview").supports_multimodal() is True

    def test_supports_multimodal_o1(self) -> None:
        """The O1 family reports multimodal support."""
        assert LLM(model="openai/o1-preview").supports_multimodal() is True

    def test_does_not_support_gpt35(self) -> None:
        """GPT-3.5 reports no multimodal support."""
        assert LLM(model="openai/gpt-3.5-turbo").supports_multimodal() is False

    def test_format_multimodal_content_image(self) -> None:
        """Images become data-URL image_url parts whose payload round-trips."""
        llm = LLM(model="openai/gpt-4o")
        attachments = {"chart": ImageFile(source=MINIMAL_PNG)}
        parts = format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model)
        assert len(parts) == 1
        assert parts[0]["type"] == "image_url"
        data_url = parts[0]["image_url"]["url"]
        assert data_url.startswith("data:image/png;base64,")
        # Decoding the base64 payload must reproduce the original bytes.
        encoded = data_url.split(",")[1]
        assert base64.b64decode(encoded) == MINIMAL_PNG
class TestGeminiMultimodal:
    """Tests for Gemini provider multimodal functionality."""

    def test_supports_multimodal_always_true(self) -> None:
        """Gemini models always report multimodal support."""
        assert LLM(model="gemini/gemini-pro").supports_multimodal() is True

    def test_format_multimodal_content_image(self) -> None:
        """Images use Gemini's inlineData envelope with a mime type."""
        llm = LLM(model="gemini/gemini-pro")
        attachments = {"chart": ImageFile(source=MINIMAL_PNG)}
        parts = format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model)
        assert len(parts) == 1
        assert "inlineData" in parts[0]
        assert parts[0]["inlineData"]["mimeType"] == "image/png"
        assert "data" in parts[0]["inlineData"]

    def test_format_text_content(self) -> None:
        """Plain text maps to a bare {"text": ...} part."""
        llm = LLM(model="gemini/gemini-pro")
        assert llm.format_text_content("Hello world") == {"text": "Hello world"}
@pytest.mark.skipif(not HAS_AZURE, reason="Azure AI Inference SDK not installed")
class TestAzureMultimodal:
    """Tests for Azure OpenAI provider multimodal functionality."""

    @pytest.fixture(autouse=True)
    def mock_azure_env(self):
        """Provide the Azure endpoint/key/version env vars every test needs."""
        azure_env = {
            "AZURE_API_KEY": "test-key",
            "AZURE_API_BASE": "https://test.openai.azure.com",
            "AZURE_API_VERSION": "2024-02-01",
        }
        with patch.dict(os.environ, azure_env):
            yield

    def test_supports_multimodal_gpt4o(self) -> None:
        """Azure GPT-4o reports multimodal support."""
        assert LLM(model="azure/gpt-4o").supports_multimodal() is True

    def test_supports_multimodal_gpt4_turbo(self) -> None:
        """Azure GPT-4 Turbo reports multimodal support."""
        assert LLM(model="azure/gpt-4-turbo").supports_multimodal() is True

    def test_does_not_support_gpt35(self) -> None:
        """Azure GPT-3.5 reports no multimodal support."""
        assert LLM(model="azure/gpt-35-turbo").supports_multimodal() is False

    def test_format_multimodal_content_image(self) -> None:
        """Azure mirrors OpenAI's image_url data-URL format."""
        llm = LLM(model="azure/gpt-4o")
        attachments = {"chart": ImageFile(source=MINIMAL_PNG)}
        parts = format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model)
        assert len(parts) == 1
        assert parts[0]["type"] == "image_url"
        assert "data:image/png;base64," in parts[0]["image_url"]["url"]
@pytest.mark.skipif(not HAS_BEDROCK, reason="AWS Bedrock SDK not installed")
class TestBedrockMultimodal:
    """Tests for AWS Bedrock provider multimodal functionality."""

    @pytest.fixture(autouse=True)
    def mock_bedrock_env(self):
        """Provide the AWS credential env vars every test needs."""
        aws_env = {
            "AWS_ACCESS_KEY_ID": "test-key",
            "AWS_SECRET_ACCESS_KEY": "test-secret",
            "AWS_DEFAULT_REGION": "us-east-1",
        }
        with patch.dict(os.environ, aws_env):
            yield

    def test_supports_multimodal_claude3(self) -> None:
        """Bedrock Claude 3 reports multimodal support."""
        assert LLM(model="bedrock/anthropic.claude-3-sonnet").supports_multimodal() is True

    def test_does_not_support_claude2(self) -> None:
        """Bedrock Claude 2 reports no multimodal support."""
        assert LLM(model="bedrock/anthropic.claude-v2").supports_multimodal() is False

    def test_format_multimodal_content_image(self) -> None:
        """Images use the Converse API image envelope with raw bytes."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        attachments = {"chart": ImageFile(source=MINIMAL_PNG)}
        parts = format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model)
        assert len(parts) == 1
        assert "image" in parts[0]
        assert parts[0]["image"]["format"] == "png"
        assert "source" in parts[0]["image"]
        assert "bytes" in parts[0]["image"]["source"]

    def test_format_multimodal_content_pdf(self) -> None:
        """PDFs use the Converse API document envelope."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        attachments = {"doc": PDFFile(source=MINIMAL_PDF)}
        parts = format_multimodal_content(attachments, getattr(llm, "provider", None) or llm.model)
        assert len(parts) == 1
        assert "document" in parts[0]
        assert parts[0]["document"]["format"] == "pdf"
        assert "source" in parts[0]["document"]
class TestBaseLLMMultimodal:
    """Default multimodal behavior inherited from BaseLLM."""

    @staticmethod
    def _make_stub():
        """Return an instance of a minimal concrete BaseLLM subclass."""
        from crewai.llms.base_llm import BaseLLM

        class _StubLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        return _StubLLM(model="test")

    def test_base_supports_multimodal_false(self) -> None:
        """The base implementation advertises no multimodal support."""
        assert self._make_stub().supports_multimodal() is False

    def test_base_format_text_content(self) -> None:
        """The base text formatter emits OpenAI/Anthropic-style blocks."""
        assert self._make_stub().format_text_content("Hello") == {
            "type": "text",
            "text": "Hello",
        }
class TestMultipleFilesFormatting:
    """Tests for formatting multiple files at once."""

    def test_format_multiple_images(self) -> None:
        """Test formatting multiple images produces one block per image."""
        llm = LLM(model="gpt-4o")
        files = {
            "chart1": ImageFile(source=MINIMAL_PNG),
            "chart2": ImageFile(source=MINIMAL_PNG),
        }
        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)
        assert len(result) == 2

    def test_format_mixed_supported_and_unsupported(self) -> None:
        """Test only supported types are formatted."""
        llm = LLM(model="gpt-4o")  # OpenAI - images only
        files = {
            "chart": ImageFile(source=MINIMAL_PNG),
            "doc": PDFFile(source=MINIMAL_PDF),  # Not supported by OpenAI
            "text": TextFile(source=b"hello"),  # Not supported
        }
        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)
        assert len(result) == 1  # Only image supported

    def test_format_empty_files_dict(self) -> None:
        """Test empty files dict returns empty list."""
        llm = LLM(model="gpt-4o")
        # Use the same provider-or-model lookup as every other test in this
        # module for consistency (was passing llm.model directly).
        result = format_multimodal_content({}, getattr(llm, "provider", None) or llm.model)
        assert result == []

# ============================================================================
# End of unit-test file. A second, new file begins below
# (diff hunk @@ -0,0 +1,782 @@): cassette-backed multimodal integration tests.
# ============================================================================
"""Integration tests for LLM multimodal functionality with cassettes.
These tests make actual API calls (recorded via VCR cassettes) to verify
multimodal content is properly sent and processed by each provider.
"""
from pathlib import Path
import pytest
from crewai.llm import LLM
from crewai_files import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
format_multimodal_content,
)
from crewai_files.resolution.resolver import FileResolver, FileResolverConfig
# Path to test data files
# Fixtures live in the sibling `crewai-files` package's test tree; the
# video/audio assets may be absent locally, in which case the fixtures
# below skip the tests that need them.
TEST_FIXTURES_DIR = Path(__file__).parent.parent.parent.parent / "crewai-files" / "tests" / "fixtures"
TEST_IMAGE_PATH = TEST_FIXTURES_DIR / "revenue_chart.png"
TEST_TEXT_PATH = TEST_FIXTURES_DIR / "review_guidelines.txt"
TEST_VIDEO_PATH = TEST_FIXTURES_DIR / "sample_video.mp4"
TEST_AUDIO_PATH = TEST_FIXTURES_DIR / "sample_audio.wav"
@pytest.fixture
def test_image_bytes() -> bytes:
    """Raw bytes of the revenue-chart PNG fixture."""
    with TEST_IMAGE_PATH.open("rb") as fh:
        return fh.read()
@pytest.fixture
def test_text_bytes() -> bytes:
    """Raw bytes of the review-guidelines text fixture."""
    with TEST_TEXT_PATH.open("rb") as fh:
        return fh.read()
@pytest.fixture
def test_video_bytes() -> bytes:
    """Raw bytes of the sample MP4 fixture; skips the test when absent."""
    if TEST_VIDEO_PATH.exists():
        return TEST_VIDEO_PATH.read_bytes()
    pytest.skip("sample_video.mp4 fixture not found")
@pytest.fixture
def test_audio_bytes() -> bytes:
    """Raw bytes of the sample WAV fixture; skips the test when absent."""
    if TEST_AUDIO_PATH.exists():
        return TEST_AUDIO_PATH.read_bytes()
    pytest.skip("sample_audio.wav fixture not found")
# Minimal PDF for testing (real PDF structure)
# A single blank page: catalog -> page tree -> page, followed by an xref
# table and trailer, so providers that actually parse PDFs accept it.
MINIMAL_PDF = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
196
%%EOF
"""
def _build_multimodal_message(llm: LLM, prompt: str, files: dict) -> list[dict]:
    """Assemble a single user message combining a text prompt with file blocks."""
    provider = getattr(llm, "provider", None) or llm.model
    content = [llm.format_text_content(prompt)]
    content.extend(format_multimodal_content(files, provider))
    return [{"role": "user", "content": content}]
class TestOpenAIMultimodalIntegration:
    """Integration tests for OpenAI multimodal with real API calls."""
    # Replays a VCR cassette recorded against gpt-4o-mini; the serialized
    # request must match the recording exactly.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test OpenAI can describe an image."""
        llm = LLM(model="openai/gpt-4o-mini")
        # Image is sent inline as a base64 data-URL content block.
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestOpenAIO4MiniMultimodalIntegration:
    """Integration tests for OpenAI o4-mini reasoning model with vision."""
    # Same flow as the gpt-4o-mini test above, against the o4-mini cassette.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test o4-mini can describe an image."""
        llm = LLM(model="openai/o4-mini")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestOpenAIGPT41MiniMultimodalIntegration:
    """Integration tests for OpenAI GPT-4.1-mini with vision."""
    # Same flow as above, against the gpt-4.1-mini cassette.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test GPT-4.1-mini can describe an image."""
        llm = LLM(model="openai/gpt-4.1-mini")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestOpenAIGPT5MultimodalIntegration:
    """Integration tests for OpenAI GPT-5 with vision."""
    # Same flow as above, against the gpt-5 cassette.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test GPT-5 can describe an image."""
        llm = LLM(model="openai/gpt-5")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestOpenAIGPT5MiniMultimodalIntegration:
    """Integration tests for OpenAI GPT-5-mini with vision."""
    # Same flow as above, against the gpt-5-mini cassette.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test GPT-5-mini can describe an image."""
        llm = LLM(model="openai/gpt-5-mini")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestOpenAIGPT5NanoMultimodalIntegration:
    """Integration tests for OpenAI GPT-5-nano with vision."""
    # Same flow as above, against the gpt-5-nano cassette.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test GPT-5-nano can describe an image."""
        llm = LLM(model="openai/gpt-5-nano")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestAnthropicMultimodalIntegration:
    """Integration tests for Anthropic multimodal with real API calls."""
    # Replays VCR cassettes recorded against claude-3-5-haiku.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can describe an image."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_analyze_pdf(self) -> None:
        """Test Anthropic can analyze a PDF."""
        # Anthropic accepts PDFs as document content blocks.
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"document": PDFFile(source=MINIMAL_PDF)}
        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestAzureMultimodalIntegration:
    """Integration tests for Azure OpenAI multimodal with real API calls."""
    # Azure reuses the OpenAI image_url content shape; cassette for gpt-4o.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Azure OpenAI can describe an image."""
        llm = LLM(model="azure/gpt-4o")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestBedrockMultimodalIntegration:
    """Integration tests for AWS Bedrock multimodal with real API calls."""
    # Bedrock uses the Converse API image/document content shapes.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Bedrock Claude can describe an image."""
        llm = LLM(model="bedrock/anthropic.claude-3-haiku-20240307-v1:0")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_analyze_pdf(self) -> None:
        """Test Bedrock Claude can analyze a PDF."""
        llm = LLM(model="bedrock/anthropic.claude-3-haiku-20240307-v1:0")
        files = {"document": PDFFile(source=MINIMAL_PDF)}
        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestGeminiMultimodalIntegration:
    """Integration tests for Gemini multimodal with real API calls."""
    # Gemini covers the widest range of modalities here: image, text file,
    # video, and audio. All tests replay cassettes for gemini-2.0-flash.
    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Gemini can describe an image."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_analyze_text_file(self, test_text_bytes: bytes) -> None:
        """Test Gemini can analyze a text file."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"readme": TextFile(source=test_text_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Summarize what this text file says in one sentence.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_analyze_video_file(self, test_video_bytes: bytes) -> None:
        """Test Gemini can analyze a video file."""
        # The test_video_bytes fixture skips this test if the mp4 is missing.
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"video": VideoFile(source=test_video_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe what you see in this video in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_analyze_audio_file(self, test_audio_bytes: bytes) -> None:
        """Test Gemini can analyze an audio file."""
        # The test_audio_bytes fixture skips this test if the wav is missing.
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"audio": AudioFile(source=test_audio_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe what you hear in this audio in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestLiteLLMMultimodalIntegration:
    """Integration tests for LiteLLM wrapper multimodal with real API calls."""
    # is_litellm=True routes through the LiteLLM wrapper instead of the
    # native provider clients.
    @pytest.mark.vcr()
    def test_describe_image_gpt4o(self, test_image_bytes: bytes) -> None:
        """Test LiteLLM with GPT-4o can describe an image."""
        llm = LLM(model="gpt-4o-mini", is_litellm=True)
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_describe_image_claude(self, test_image_bytes: bytes) -> None:
        """Test LiteLLM with Claude can describe an image."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022", is_litellm=True)
        files = {"image": ImageFile(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestMultipleFilesIntegration:
    """Integration tests for multiple files in a single request."""
    @pytest.mark.vcr()
    def test_multiple_images_openai(self, test_image_bytes: bytes) -> None:
        """Test OpenAI can process multiple images."""
        llm = LLM(model="openai/gpt-4o-mini")
        # The same image is attached twice under different keys.
        files = {
            "image1": ImageFile(source=test_image_bytes),
            "image2": ImageFile(source=test_image_bytes),
        }
        messages = _build_multimodal_message(
            llm,
            "How many images do you see? Answer with just the number.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        # Accept either digit or spelled-out form from the model.
        assert "2" in response or "two" in response.lower()
    @pytest.mark.vcr()
    def test_mixed_content_anthropic(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can process image and PDF together."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {
            "image": ImageFile(source=test_image_bytes),
            "document": PDFFile(source=MINIMAL_PDF),
        }
        messages = _build_multimodal_message(
            llm,
            "What types of files did I send you? List them briefly.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestGenericFileIntegration:
    """Integration tests for the generic File class with auto-detection."""
    # Files are passed as the untyped File class; content type is detected
    # from the bytes rather than declared by the caller.
    @pytest.mark.vcr()
    def test_generic_file_image_openai(self, test_image_bytes: bytes) -> None:
        """Test generic File auto-detects image and sends correct content type."""
        llm = LLM(model="openai/gpt-4o-mini")
        files = {"image": File(source=test_image_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_generic_file_pdf_anthropic(self) -> None:
        """Test generic File auto-detects PDF and sends correct content type."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"document": File(source=MINIMAL_PDF)}
        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_generic_file_text_gemini(self, test_text_bytes: bytes) -> None:
        """Test generic File auto-detects text and sends correct content type."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"content": File(source=test_text_bytes)}
        messages = _build_multimodal_message(
            llm,
            "Summarize what this text says in one sentence.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_generic_file_mixed_types(self, test_image_bytes: bytes) -> None:
        """Test generic File works with multiple auto-detected types."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {
            "chart": File(source=test_image_bytes),
            "doc": File(source=MINIMAL_PDF),
        }
        messages = _build_multimodal_message(
            llm,
            "What types of files did I send? List them briefly.",
            files,
        )
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
def _build_multimodal_message_with_upload(
    llm: LLM, prompt: str, files: dict
) -> tuple[list[dict], list[dict]]:
    """Build a multimodal message using file_id uploads instead of inline base64.

    Note: OpenAI Chat Completions API only supports file_id for PDFs via
    type="file", not for images. For image file_id support, OpenAI requires
    the Responses API (type="input_image"). Since crewAI uses Chat Completions,
    we test file_id uploads with Anthropic which supports file_id for all types.

    Returns:
        Tuple of (messages, content_blocks) where content_blocks can be inspected
        to verify file_id was used.
    """
    from crewai_files.formatting.anthropic import AnthropicFormatter

    resolver = FileResolver(config=FileResolverConfig(prefer_upload=True))
    formatter = AnthropicFormatter()
    content_blocks = []
    for attachment in files.values():
        formatted = formatter.format_block(
            attachment, resolver.resolve(attachment, "anthropic")
        )
        if formatted is not None:
            content_blocks.append(formatted)
    user_message = {
        "role": "user",
        "content": [llm.format_text_content(prompt), *content_blocks],
    }
    return [user_message], content_blocks
def _build_responses_message_with_upload(
    llm: LLM, prompt: str, files: dict
) -> tuple[list[dict], list[dict]]:
    """Build a Responses API message using file_id uploads.

    The Responses API supports file_id for images via type="input_image".

    Returns:
        Tuple of (messages, content_blocks) where content_blocks can be inspected
        to verify file_id was used.
    """
    from crewai_files.formatting import OpenAIResponsesFormatter

    config = FileResolverConfig(prefer_upload=True)
    resolver = FileResolver(config=config)
    content_blocks = []
    for file in files.values():
        resolved = resolver.resolve(file, "openai")
        block = OpenAIResponsesFormatter.format_block(resolved, file.content_type)
        # Skip unformattable files, mirroring _build_multimodal_message_with_upload;
        # previously a None block would have been appended into message content.
        if block is not None:
            content_blocks.append(block)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": prompt},
                *content_blocks,
            ],
        }
    ]
    return messages, content_blocks
class TestAnthropicFileUploadIntegration:
    """Integration tests for Anthropic multimodal with file_id uploads.
    We test file_id uploads with Anthropic because OpenAI Chat Completions API
    only supports file_id references for PDFs (type="file"), not images.
    OpenAI's Responses API supports image file_id (type="input_image"), but
    crewAI currently uses Chat Completions. Anthropic supports file_id for
    all content types including images.
    """
    @pytest.mark.vcr()
    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can describe an image uploaded via Files API."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages, content_blocks = _build_multimodal_message_with_upload(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        # Verify we're using file_id, not base64
        assert len(content_blocks) == 1
        source = content_blocks[0].get("source", {})
        assert source.get("type") == "file", (
            f"Expected source type 'file' for file_id upload, got '{source.get('type')}'. "
            "This test verifies file_id uploads work - if falling back to base64, "
            "check that the Anthropic Files API uploader is working correctly."
        )
        assert "file_id" in source, "Expected file_id in source for file_id upload"
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
class TestOpenAIResponsesFileUploadIntegration:
    """Integration tests for OpenAI Responses API with file_id uploads.
    The Responses API supports file_id for images via type="input_image",
    unlike Chat Completions which only supports file_id for PDFs.
    """
    @pytest.mark.vcr()
    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
        """Test OpenAI Responses API can describe an image uploaded via Files API."""
        # api="responses" switches the LLM to the Responses endpoint.
        llm = LLM(model="openai/gpt-4o-mini", api="responses")
        files = {"image": ImageFile(source=test_image_bytes)}
        messages, content_blocks = _build_responses_message_with_upload(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )
        # Verify we're using file_id with input_image type
        assert len(content_blocks) == 1
        block = content_blocks[0]
        assert block.get("type") == "input_image", (
            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'. "
            "This test verifies file_id uploads work with the Responses API."
        )
        assert "file_id" in block, "Expected file_id in block for file_id upload"
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_describe_image_via_format_api(self, test_image_bytes: bytes) -> None:
        """Test format_multimodal_content with api='responses' parameter."""
        llm = LLM(model="openai/gpt-4o-mini", api="responses")
        files = {"image": ImageFile(source=test_image_bytes)}
        content_blocks = format_multimodal_content(files, "openai", api="responses")
        # Verify content blocks use Responses API format
        assert len(content_blocks) == 1
        block = content_blocks[0]
        assert block.get("type") == "input_image", (
            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'"
        )
        # Should have image_url (base64 data URL) since we're not forcing upload
        assert "image_url" in block, "Expected image_url in block for inline image"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "Describe this image in one sentence."},
                    *content_blocks,
                ],
            }
        ]
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0
    @pytest.mark.vcr()
    def test_describe_image_via_format_api_with_upload(self, test_image_bytes: bytes) -> None:
        """Test format_multimodal_content with prefer_upload=True uploads the file."""
        llm = LLM(model="openai/gpt-4o-mini", api="responses")
        files = {"image": ImageFile(source=test_image_bytes)}
        content_blocks = format_multimodal_content(
            files, "openai", api="responses", prefer_upload=True
        )
        # Verify content blocks use file_id from upload
        assert len(content_blocks) == 1
        block = content_blocks[0]
        assert block.get("type") == "input_image", (
            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'"
        )
        assert "file_id" in block, (
            "Expected file_id in block when prefer_upload=True. "
            f"Got keys: {list(block.keys())}"
        )
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "Describe this image in one sentence."},
                    *content_blocks,
                ],
            }
        ]
        response = llm.call(messages)
        assert response
        assert isinstance(response, str)
        assert len(response) > 0