Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-05-03 16:22:49 +00:00)
feat: native multimodal file handling; openai responses api
- add input_files parameter to Crew.kickoff(), Flow.kickoff(), Task, and Agent.kickoff()
- add provider-specific file uploaders for OpenAI, Anthropic, Gemini, and Bedrock
- add file type detection, constraint validation, and automatic format conversion
- add URL file source support for multimodal content
- add streaming uploads for large files
- add prompt caching support for Anthropic
- add OpenAI Responses API support
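The `input_files` parameter and the `api="responses"` switch are the two headline entry points. Below is a minimal sketch of how they might be combined, assuming the dict-of-named-files shape used by the tests in this commit; the agent fields, file paths, and URL are illustrative, not part of the commit.

```python
from crewai import Agent, Crew, Task
from crewai.llms.providers.openai.completion import OpenAICompletion
from crewai_files import ImageFile, PDFFile

# Opt in to the OpenAI Responses API; api="completions" remains the default.
llm = OpenAICompletion(
    model="gpt-4o-mini",
    api="responses",
    instructions="You are a concise financial analyst.",
    store=True,
)

analyst = Agent(
    role="Report Analyst",
    goal="Summarize the attached quarterly materials",
    backstory="You review charts and reports for executives.",
    llm=llm,
)
task = Task(
    description="Summarize the chart and the report in three bullets.",
    expected_output="Three concise bullet points.",
    agent=analyst,
)

# input_files takes named file objects; per the commit message, sources may be
# bytes, local paths, or URLs (URL resolution is part of this change).
result = Crew(agents=[analyst], tasks=[task]).kickoff(
    input_files={
        "chart": ImageFile(source="reports/q3_revenue.png"),
        "report": PDFFile(source="https://example.com/q3_report.pdf"),
    }
)
print(result.raw)
```

Since `api` defaults to `"completions"` (asserted by the tests below), existing callers see no behavior change until they opt in. The diff below adds the Responses API tests and two new multimodal test modules.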
@@ -6,7 +6,7 @@ import openai
import pytest

from crewai.llm import LLM
-from crewai.llms.providers.openai.completion import OpenAICompletion
+from crewai.llms.providers.openai.completion import OpenAICompletion, ResponsesAPIResult
from crewai.crew import Crew
from crewai.agent import Agent
from crewai.task import Task
@@ -43,6 +43,7 @@ def test_openai_is_default_provider_without_explicit_llm_set_on_agent():
        role="Research Assistant",
        goal="Find information about the population of Tokyo",
        backstory="You are a helpful research assistant.",
+        llm=LLM(model="gpt-4o-mini"),
    )
    task = Task(
        description="Find information about the population of Tokyo",
@@ -52,7 +53,7 @@ def test_openai_is_default_provider_without_explicit_llm_set_on_agent():
    crew = Crew(agents=[agent], tasks=[task])
    crew.kickoff()
    assert crew.agents[0].llm.__class__.__name__ == "OpenAICompletion"
-    assert crew.agents[0].llm.model == DEFAULT_LLM_MODEL
+    assert crew.agents[0].llm.model == "gpt-4o-mini"
@@ -621,3 +622,773 @@ def test_openai_streaming_returns_usage_metrics():
    assert result.token_usage.prompt_tokens > 0
    assert result.token_usage.completion_tokens > 0
    assert result.token_usage.successful_requests >= 1

def test_openai_responses_api_initialization():
    """Test that OpenAI Responses API can be initialized with api='responses'."""
    llm = OpenAICompletion(
        model="gpt-5",
        api="responses",
        instructions="You are a helpful assistant.",
        store=True,
    )

    assert llm.api == "responses"
    assert llm.instructions == "You are a helpful assistant."
    assert llm.store is True
    assert llm.model == "gpt-5"


def test_openai_responses_api_default_is_completions():
    """Test that the default API is 'completions' for backward compatibility."""
    llm = OpenAICompletion(model="gpt-4o")

    assert llm.api == "completions"


def test_openai_responses_api_prepare_params():
    """Test that Responses API params are prepared correctly."""
    llm = OpenAICompletion(
        model="gpt-5",
        api="responses",
        instructions="Base instructions.",
        store=True,
        temperature=0.7,
    )

    messages = [
        {"role": "system", "content": "System message."},
        {"role": "user", "content": "Hello!"},
    ]

    params = llm._prepare_responses_params(messages)

    assert params["model"] == "gpt-5"
    assert "Base instructions." in params["instructions"]
    assert "System message." in params["instructions"]
    assert params["store"] is True
    assert params["temperature"] == 0.7
    assert params["input"] == [{"role": "user", "content": "Hello!"}]

def test_openai_responses_api_tool_format():
    """Test that tools are converted to Responses API format (internally-tagged)."""
    llm = OpenAICompletion(model="gpt-5", api="responses")

    tools = [
        {
            "name": "get_weather",
            "description": "Get the weather for a location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        }
    ]

    responses_tools = llm._convert_tools_for_responses(tools)

    assert len(responses_tools) == 1
    tool = responses_tools[0]
    assert tool["type"] == "function"
    assert tool["name"] == "get_weather"
    assert tool["description"] == "Get the weather for a location"
    assert "parameters" in tool
    assert "function" not in tool


def test_openai_completions_api_tool_format():
    """Test that tools are converted to Chat Completions API format (externally-tagged)."""
    llm = OpenAICompletion(model="gpt-4o", api="completions")

    tools = [
        {
            "name": "get_weather",
            "description": "Get the weather for a location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        }
    ]

    completions_tools = llm._convert_tools_for_interference(tools)

    assert len(completions_tools) == 1
    tool = completions_tools[0]
    assert tool["type"] == "function"
    assert "function" in tool
    assert tool["function"]["name"] == "get_weather"
    assert tool["function"]["description"] == "Get the weather for a location"

def test_openai_responses_api_structured_output_format():
    """Test that structured outputs use text.format for Responses API."""
    from pydantic import BaseModel

    class Person(BaseModel):
        name: str
        age: int

    llm = OpenAICompletion(model="gpt-5", api="responses")

    messages = [{"role": "user", "content": "Extract: Jane, 25"}]
    params = llm._prepare_responses_params(messages, response_model=Person)

    assert "text" in params
    assert "format" in params["text"]
    assert params["text"]["format"]["type"] == "json_schema"
    assert params["text"]["format"]["name"] == "Person"
    assert params["text"]["format"]["strict"] is True


def test_openai_responses_api_with_previous_response_id():
    """Test that previous_response_id is passed for multi-turn conversations."""
    llm = OpenAICompletion(
        model="gpt-5",
        api="responses",
        previous_response_id="resp_abc123",
        store=True,
    )

    messages = [{"role": "user", "content": "Continue our conversation."}]
    params = llm._prepare_responses_params(messages)

    assert params["previous_response_id"] == "resp_abc123"
    assert params["store"] is True

def test_openai_responses_api_call_routing():
    """Test that call() routes to the correct API based on the api parameter."""
    from unittest.mock import patch, MagicMock

    llm_completions = OpenAICompletion(model="gpt-4o", api="completions")
    llm_responses = OpenAICompletion(model="gpt-5", api="responses")

    with patch.object(
        llm_completions, "_call_completions", return_value="completions result"
    ) as mock_completions:
        result = llm_completions.call("Hello")
        mock_completions.assert_called_once()
        assert result == "completions result"

    with patch.object(
        llm_responses, "_call_responses", return_value="responses result"
    ) as mock_responses:
        result = llm_responses.call("Hello")
        mock_responses.assert_called_once()
        assert result == "responses result"

# =============================================================================
# VCR Integration Tests for Responses API
# =============================================================================


@pytest.mark.vcr()
def test_openai_responses_api_basic_call():
    """Test basic Responses API call with text generation."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        instructions="You are a helpful assistant. Be concise.",
    )

    result = llm.call("What is 2 + 2? Answer with just the number.")

    assert isinstance(result, str)
    assert "4" in result


@pytest.mark.vcr()
def test_openai_responses_api_with_structured_output():
    """Test Responses API with structured output using Pydantic model."""
    from pydantic import BaseModel, Field

    class MathAnswer(BaseModel):
        """Structured math answer."""

        result: int = Field(description="The numerical result")
        explanation: str = Field(description="Brief explanation")

    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
    )

    result = llm.call("What is 5 * 7?", response_model=MathAnswer)

    assert isinstance(result, MathAnswer)
    assert result.result == 35

@pytest.mark.vcr()
def test_openai_responses_api_with_system_message_extraction():
    """Test that system messages are properly extracted to instructions."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
    )

    messages = [
        {"role": "system", "content": "You always respond in uppercase letters only."},
        {"role": "user", "content": "Say hello"},
    ]

    result = llm.call(messages)

    assert isinstance(result, str)
    assert result.isupper() or "HELLO" in result.upper()


@pytest.mark.vcr()
def test_openai_responses_api_streaming():
    """Test Responses API with streaming enabled."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        stream=True,
        instructions="Be very concise.",
    )

    result = llm.call("Count from 1 to 3, separated by commas.")

    assert isinstance(result, str)
    assert "1" in result
    assert "2" in result
    assert "3" in result

@pytest.mark.vcr()
def test_openai_responses_api_returns_usage_metrics():
    """Test that Responses API calls return proper token usage metrics."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
    )

    llm.call("Say hello")

    usage = llm.get_token_usage_summary()
    assert usage.total_tokens > 0
    assert usage.prompt_tokens > 0
    assert usage.completion_tokens > 0


def test_openai_responses_api_builtin_tools_param():
    """Test that builtin_tools parameter is properly configured."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        builtin_tools=["web_search", "code_interpreter"],
    )

    assert llm.builtin_tools == ["web_search", "code_interpreter"]

    messages = [{"role": "user", "content": "Test"}]
    params = llm._prepare_responses_params(messages)

    assert "tools" in params
    tool_types = [t["type"] for t in params["tools"]]
    assert "web_search_preview" in tool_types
    assert "code_interpreter" in tool_types


def test_openai_responses_api_builtin_tools_with_custom_tools():
    """Test that builtin_tools can be combined with custom function tools."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        builtin_tools=["web_search"],
    )

    custom_tools = [
        {
            "name": "get_weather",
            "description": "Get weather for a location",
            "parameters": {"type": "object", "properties": {}},
        }
    ]

    messages = [{"role": "user", "content": "Test"}]
    params = llm._prepare_responses_params(messages, tools=custom_tools)

    assert len(params["tools"]) == 2
    tool_types = [t.get("type") for t in params["tools"]]
    assert "web_search_preview" in tool_types
    assert "function" in tool_types

@pytest.mark.vcr()
def test_openai_responses_api_with_web_search():
    """Test Responses API with web_search built-in tool."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        builtin_tools=["web_search"],
    )

    result = llm.call("What is the current population of Tokyo? Be brief.")

    assert isinstance(result, str)
    assert len(result) > 0


def test_responses_api_result_dataclass():
    """Test ResponsesAPIResult dataclass functionality."""
    result = ResponsesAPIResult(
        text="Hello, world!",
        response_id="resp_123",
    )

    assert result.text == "Hello, world!"
    assert result.response_id == "resp_123"
    assert result.web_search_results == []
    assert result.file_search_results == []
    assert result.code_interpreter_results == []
    assert result.computer_use_results == []
    assert result.reasoning_summaries == []
    assert result.function_calls == []
    assert not result.has_tool_outputs()
    assert not result.has_reasoning()


def test_responses_api_result_has_tool_outputs():
    """Test ResponsesAPIResult.has_tool_outputs() method."""
    result_with_web = ResponsesAPIResult(
        text="Test",
        web_search_results=[{"id": "ws_1", "status": "completed", "type": "web_search_call"}],
    )
    assert result_with_web.has_tool_outputs()

    result_with_file = ResponsesAPIResult(
        text="Test",
        file_search_results=[
            {"id": "fs_1", "status": "completed", "type": "file_search_call", "queries": [], "results": []}
        ],
    )
    assert result_with_file.has_tool_outputs()


def test_responses_api_result_has_reasoning():
    """Test ResponsesAPIResult.has_reasoning() method."""
    result_with_reasoning = ResponsesAPIResult(
        text="Test",
        reasoning_summaries=[{"id": "r_1", "type": "reasoning", "summary": []}],
    )
    assert result_with_reasoning.has_reasoning()

    result_without = ResponsesAPIResult(text="Test")
    assert not result_without.has_reasoning()

def test_openai_responses_api_parse_tool_outputs_param():
    """Test that parse_tool_outputs parameter is properly configured."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        parse_tool_outputs=True,
    )

    assert llm.parse_tool_outputs is True


def test_openai_responses_api_parse_tool_outputs_default_false():
    """Test that parse_tool_outputs defaults to False."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
    )

    assert llm.parse_tool_outputs is False


@pytest.mark.vcr()
def test_openai_responses_api_with_parse_tool_outputs():
    """Test Responses API with parse_tool_outputs enabled returns ResponsesAPIResult."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        builtin_tools=["web_search"],
        parse_tool_outputs=True,
    )

    result = llm.call("What is the current population of Tokyo? Be very brief.")

    assert isinstance(result, ResponsesAPIResult)
    assert len(result.text) > 0
    assert result.response_id is not None
    # Web search should have been used
    assert len(result.web_search_results) > 0
    assert result.has_tool_outputs()


@pytest.mark.vcr()
def test_openai_responses_api_parse_tool_outputs_basic_call():
    """Test Responses API with parse_tool_outputs but no built-in tools."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        parse_tool_outputs=True,
    )

    result = llm.call("Say hello in exactly 3 words.")

    assert isinstance(result, ResponsesAPIResult)
    assert len(result.text) > 0
    assert result.response_id is not None
    # No built-in tools used
    assert not result.has_tool_outputs()

# ============================================================================
# Auto-Chaining Tests (Responses API)
# ============================================================================


def test_openai_responses_api_auto_chain_param():
    """Test that auto_chain parameter is properly configured."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
    )

    assert llm.auto_chain is True
    assert llm._last_response_id is None


def test_openai_responses_api_auto_chain_default_false():
    """Test that auto_chain defaults to False."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
    )

    assert llm.auto_chain is False


def test_openai_responses_api_last_response_id_property():
    """Test last_response_id property."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
    )

    # Initially None
    assert llm.last_response_id is None

    # Simulate setting the internal value
    llm._last_response_id = "resp_test_123"
    assert llm.last_response_id == "resp_test_123"

def test_openai_responses_api_reset_chain():
    """Test reset_chain() method clears the response ID."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
    )

    # Set a response ID
    llm._last_response_id = "resp_test_123"
    assert llm.last_response_id == "resp_test_123"

    # Reset the chain
    llm.reset_chain()
    assert llm.last_response_id is None


def test_openai_responses_api_auto_chain_prepare_params():
    """Test that _prepare_responses_params uses auto-chained response ID."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
    )

    # No previous response ID yet
    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "previous_response_id" not in params

    # Set a previous response ID
    llm._last_response_id = "resp_previous_123"
    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert params.get("previous_response_id") == "resp_previous_123"


def test_openai_responses_api_explicit_previous_response_id_takes_precedence():
    """Test that explicit previous_response_id overrides auto-chained ID."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
        previous_response_id="resp_explicit_456",
    )

    # Set an auto-chained response ID
    llm._last_response_id = "resp_auto_123"

    # Explicit should take precedence
    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert params.get("previous_response_id") == "resp_explicit_456"


def test_openai_responses_api_auto_chain_disabled_no_tracking():
    """Test that response ID is not tracked when auto_chain is False."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=False,
    )

    # Even with a "previous" response ID set internally, params shouldn't use it
    llm._last_response_id = "resp_should_not_use"
    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "previous_response_id" not in params

@pytest.mark.vcr()
def test_openai_responses_api_auto_chain_integration():
    """Test auto-chaining tracks response IDs across calls."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        auto_chain=True,
    )

    # First call - should not have previous_response_id
    assert llm.last_response_id is None
    result1 = llm.call("My name is Alice. Remember this.")

    # After first call, should have a response ID
    assert llm.last_response_id is not None
    first_response_id = llm.last_response_id
    assert first_response_id.startswith("resp_")

    # Second call - should use the first response ID
    result2 = llm.call("What is my name?")

    # Response ID should be updated
    assert llm.last_response_id is not None
    assert llm.last_response_id != first_response_id  # Should be a new ID

    # The response should remember context (Alice)
    assert isinstance(result1, str)
    assert isinstance(result2, str)


@pytest.mark.vcr()
def test_openai_responses_api_auto_chain_with_reset():
    """Test that reset_chain() properly starts a new conversation."""
    llm = OpenAICompletion(
        model="gpt-4o-mini",
        api="responses",
        auto_chain=True,
    )

    # First conversation
    llm.call("My favorite color is blue.")
    first_chain_id = llm.last_response_id
    assert first_chain_id is not None

    # Reset and start new conversation
    llm.reset_chain()
    assert llm.last_response_id is None

    # New call should start fresh
    llm.call("Hello!")
    second_chain_id = llm.last_response_id
    assert second_chain_id is not None
    # New conversation, so different response ID
    assert second_chain_id != first_chain_id

# =============================================================================
# Encrypted Reasoning for ZDR (Zero Data Retention) Tests
# =============================================================================


def test_openai_responses_api_auto_chain_reasoning_param():
    """Test that auto_chain_reasoning parameter is properly configured."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )

    assert llm.auto_chain_reasoning is True
    assert llm._last_reasoning_items is None


def test_openai_responses_api_auto_chain_reasoning_default_false():
    """Test that auto_chain_reasoning defaults to False."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
    )

    assert llm.auto_chain_reasoning is False


def test_openai_responses_api_last_reasoning_items_property():
    """Test last_reasoning_items property."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )

    # Initially None
    assert llm.last_reasoning_items is None

    # Simulate setting the internal value
    mock_items = [{"id": "rs_test_123", "type": "reasoning"}]
    llm._last_reasoning_items = mock_items
    assert llm.last_reasoning_items == mock_items


def test_openai_responses_api_reset_reasoning_chain():
    """Test reset_reasoning_chain() method clears reasoning items."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )

    # Set reasoning items
    mock_items = [{"id": "rs_test_123", "type": "reasoning"}]
    llm._last_reasoning_items = mock_items
    assert llm.last_reasoning_items == mock_items

    # Reset the reasoning chain
    llm.reset_reasoning_chain()
    assert llm.last_reasoning_items is None

def test_openai_responses_api_auto_chain_reasoning_adds_include():
    """Test that auto_chain_reasoning adds reasoning.encrypted_content to include."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )

    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "include" in params
    assert "reasoning.encrypted_content" in params["include"]


def test_openai_responses_api_auto_chain_reasoning_preserves_existing_include():
    """Test that auto_chain_reasoning preserves existing include items."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
        include=["file_search_call.results"],
    )

    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "include" in params
    assert "reasoning.encrypted_content" in params["include"]
    assert "file_search_call.results" in params["include"]


def test_openai_responses_api_auto_chain_reasoning_no_duplicate_include():
    """Test that reasoning.encrypted_content is not duplicated if already in include."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
        include=["reasoning.encrypted_content"],
    )

    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    assert "include" in params
    # Should only appear once
    assert params["include"].count("reasoning.encrypted_content") == 1

def test_openai_responses_api_auto_chain_reasoning_prepends_to_input():
    """Test that stored reasoning items are prepended to input."""
    from unittest.mock import MagicMock

    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=True,
    )

    # Simulate stored reasoning items
    mock_reasoning = MagicMock()
    mock_reasoning.type = "reasoning"
    mock_reasoning.id = "rs_test_123"
    llm._last_reasoning_items = [mock_reasoning]

    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])

    # Input should have the reasoning item first, then the message
    assert len(params["input"]) == 2
    assert params["input"][0] == mock_reasoning
    assert params["input"][1]["role"] == "user"


def test_openai_responses_api_auto_chain_reasoning_disabled_no_include():
    """Test that reasoning.encrypted_content is not added when auto_chain_reasoning is False."""
    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=False,
    )

    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])
    # Should not have include at all (unless explicitly set)
    assert "include" not in params or "reasoning.encrypted_content" not in params.get("include", [])


def test_openai_responses_api_auto_chain_reasoning_disabled_no_prepend():
    """Test that reasoning items are not prepended when auto_chain_reasoning is False."""
    from unittest.mock import MagicMock

    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain_reasoning=False,
    )

    # Even with stored reasoning items, they should not be prepended
    mock_reasoning = MagicMock()
    mock_reasoning.type = "reasoning"
    llm._last_reasoning_items = [mock_reasoning]

    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])

    # Input should only have the message, not the reasoning item
    assert len(params["input"]) == 1
    assert params["input"][0]["role"] == "user"

def test_openai_responses_api_both_auto_chains_work_together():
    """Test that auto_chain and auto_chain_reasoning can be used together."""
    from unittest.mock import MagicMock

    llm = OpenAICompletion(
        model="gpt-4o",
        api="responses",
        auto_chain=True,
        auto_chain_reasoning=True,
    )

    assert llm.auto_chain is True
    assert llm.auto_chain_reasoning is True
    assert llm._last_response_id is None
    assert llm._last_reasoning_items is None

    # Set both internal values
    llm._last_response_id = "resp_123"
    mock_reasoning = MagicMock()
    mock_reasoning.type = "reasoning"
    llm._last_reasoning_items = [mock_reasoning]

    params = llm._prepare_responses_params(messages=[{"role": "user", "content": "test"}])

    # Both should be applied
    assert params.get("previous_response_id") == "resp_123"
    assert "reasoning.encrypted_content" in params["include"]
    assert len(params["input"]) == 2  # Reasoning item + message
lib/crewai/tests/llms/test_multimodal.py (new file, 375 lines)
@@ -0,0 +1,375 @@
"""Unit tests for LLM multimodal functionality across all providers."""
|
||||
|
||||
import base64
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.llm import LLM
|
||||
from crewai_files import ImageFile, PDFFile, TextFile, format_multimodal_content
|
||||
|
||||
# Check for optional provider dependencies
|
||||
try:
|
||||
from crewai.llms.providers.anthropic.completion import AnthropicCompletion
|
||||
HAS_ANTHROPIC = True
|
||||
except ImportError:
|
||||
HAS_ANTHROPIC = False
|
||||
|
||||
try:
|
||||
from crewai.llms.providers.azure.completion import AzureCompletion
|
||||
HAS_AZURE = True
|
||||
except ImportError:
|
||||
HAS_AZURE = False
|
||||
|
||||
try:
|
||||
from crewai.llms.providers.bedrock.completion import BedrockCompletion
|
||||
HAS_BEDROCK = True
|
||||
except ImportError:
|
||||
HAS_BEDROCK = False
|
||||
|
||||
|
||||
# Minimal valid PNG for testing
|
||||
MINIMAL_PNG = (
|
||||
b"\x89PNG\r\n\x1a\n"
|
||||
b"\x00\x00\x00\rIHDR"
|
||||
b"\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00"
|
||||
b"\x90wS\xde"
|
||||
b"\x00\x00\x00\x00IEND\xaeB`\x82"
|
||||
)
|
||||
|
||||
MINIMAL_PDF = b"%PDF-1.4 test content"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def mock_api_keys():
|
||||
"""Mock API keys for all providers."""
|
||||
env_vars = {
|
||||
"ANTHROPIC_API_KEY": "test-key",
|
||||
"OPENAI_API_KEY": "test-key",
|
||||
"GOOGLE_API_KEY": "test-key",
|
||||
"AZURE_API_KEY": "test-key",
|
||||
"AWS_ACCESS_KEY_ID": "test-key",
|
||||
"AWS_SECRET_ACCESS_KEY": "test-key",
|
||||
}
|
||||
with patch.dict(os.environ, env_vars):
|
||||
yield
|
||||
|
||||
|
||||
class TestLiteLLMMultimodal:
    """Tests for LLM class (litellm wrapper) multimodal functionality.

    These tests use `is_litellm=True` to ensure the litellm wrapper is used
    instead of native providers.
    """

    def test_supports_multimodal_gpt4o(self) -> None:
        """Test GPT-4o model supports multimodal."""
        llm = LLM(model="gpt-4o", is_litellm=True)
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gpt4_turbo(self) -> None:
        """Test GPT-4 Turbo model supports multimodal."""
        llm = LLM(model="gpt-4-turbo", is_litellm=True)
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_claude3(self) -> None:
        """Test Claude 3 model supports multimodal via litellm."""
        # Use litellm/ prefix to avoid native provider import
        llm = LLM(model="litellm/claude-3-sonnet-20240229")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gemini(self) -> None:
        """Test Gemini model supports multimodal."""
        llm = LLM(model="gemini/gemini-pro", is_litellm=True)
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gpt35_does_not(self) -> None:
        """Test GPT-3.5 model does not support multimodal."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False

    def test_format_multimodal_content_image(self) -> None:
        """Test formatting image content."""
        llm = LLM(model="gpt-4o", is_litellm=True)
        files = {"chart": ImageFile(source=MINIMAL_PNG)}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1
        assert result[0]["type"] == "image_url"
        assert "data:image/png;base64," in result[0]["image_url"]["url"]

    def test_format_multimodal_content_unsupported_type(self) -> None:
        """Test unsupported content type is skipped."""
        llm = LLM(model="gpt-4o", is_litellm=True)  # OpenAI doesn't support text files
        files = {"doc": TextFile(source=b"hello world")}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert result == []

@pytest.mark.skipif(not HAS_ANTHROPIC, reason="Anthropic SDK not installed")
class TestAnthropicMultimodal:
    """Tests for Anthropic provider multimodal functionality."""

    def test_supports_multimodal_claude3(self) -> None:
        """Test Claude 3 supports multimodal."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_claude4(self) -> None:
        """Test Claude 4 supports multimodal."""
        llm = LLM(model="anthropic/claude-4-opus")
        assert llm.supports_multimodal() is True

    def test_format_multimodal_content_image(self) -> None:
        """Test Anthropic image format uses source-based structure."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1
        assert result[0]["type"] == "image"
        assert result[0]["source"]["type"] == "base64"
        assert result[0]["source"]["media_type"] == "image/png"
        assert "data" in result[0]["source"]

    def test_format_multimodal_content_pdf(self) -> None:
        """Test Anthropic PDF format uses document structure."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        files = {"doc": PDFFile(source=MINIMAL_PDF)}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1
        assert result[0]["type"] == "document"
        assert result[0]["source"]["type"] == "base64"
        assert result[0]["source"]["media_type"] == "application/pdf"

class TestOpenAIMultimodal:
    """Tests for OpenAI provider multimodal functionality."""

    def test_supports_multimodal_gpt4o(self) -> None:
        """Test GPT-4o supports multimodal."""
        llm = LLM(model="openai/gpt-4o")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gpt4_vision(self) -> None:
        """Test GPT-4 Vision supports multimodal."""
        llm = LLM(model="openai/gpt-4-vision-preview")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_o1(self) -> None:
        """Test O1 model supports multimodal."""
        llm = LLM(model="openai/o1-preview")
        assert llm.supports_multimodal() is True

    def test_does_not_support_gpt35(self) -> None:
        """Test GPT-3.5 does not support multimodal."""
        llm = LLM(model="openai/gpt-3.5-turbo")
        assert llm.supports_multimodal() is False

    def test_format_multimodal_content_image(self) -> None:
        """Test OpenAI uses image_url format."""
        llm = LLM(model="openai/gpt-4o")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1
        assert result[0]["type"] == "image_url"
        url = result[0]["image_url"]["url"]
        assert url.startswith("data:image/png;base64,")
        # Verify base64 content
        b64_data = url.split(",")[1]
        assert base64.b64decode(b64_data) == MINIMAL_PNG

class TestGeminiMultimodal:
    """Tests for Gemini provider multimodal functionality."""

    def test_supports_multimodal_always_true(self) -> None:
        """Test Gemini always supports multimodal."""
        llm = LLM(model="gemini/gemini-pro")
        assert llm.supports_multimodal() is True

    def test_format_multimodal_content_image(self) -> None:
        """Test Gemini uses inlineData format."""
        llm = LLM(model="gemini/gemini-pro")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1
        assert "inlineData" in result[0]
        assert result[0]["inlineData"]["mimeType"] == "image/png"
        assert "data" in result[0]["inlineData"]

    def test_format_text_content(self) -> None:
        """Test Gemini text format uses simple text key."""
        llm = LLM(model="gemini/gemini-pro")

        result = llm.format_text_content("Hello world")

        assert result == {"text": "Hello world"}

@pytest.mark.skipif(not HAS_AZURE, reason="Azure AI Inference SDK not installed")
class TestAzureMultimodal:
    """Tests for Azure OpenAI provider multimodal functionality."""

    @pytest.fixture(autouse=True)
    def mock_azure_env(self):
        """Mock Azure-specific environment variables."""
        env_vars = {
            "AZURE_API_KEY": "test-key",
            "AZURE_API_BASE": "https://test.openai.azure.com",
            "AZURE_API_VERSION": "2024-02-01",
        }
        with patch.dict(os.environ, env_vars):
            yield

    def test_supports_multimodal_gpt4o(self) -> None:
        """Test Azure GPT-4o supports multimodal."""
        llm = LLM(model="azure/gpt-4o")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gpt4_turbo(self) -> None:
        """Test Azure GPT-4 Turbo supports multimodal."""
        llm = LLM(model="azure/gpt-4-turbo")
        assert llm.supports_multimodal() is True

    def test_does_not_support_gpt35(self) -> None:
        """Test Azure GPT-3.5 does not support multimodal."""
        llm = LLM(model="azure/gpt-35-turbo")
        assert llm.supports_multimodal() is False

    def test_format_multimodal_content_image(self) -> None:
        """Test Azure uses same format as OpenAI."""
        llm = LLM(model="azure/gpt-4o")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1
        assert result[0]["type"] == "image_url"
        assert "data:image/png;base64," in result[0]["image_url"]["url"]

@pytest.mark.skipif(not HAS_BEDROCK, reason="AWS Bedrock SDK not installed")
class TestBedrockMultimodal:
    """Tests for AWS Bedrock provider multimodal functionality."""

    @pytest.fixture(autouse=True)
    def mock_bedrock_env(self):
        """Mock AWS-specific environment variables."""
        env_vars = {
            "AWS_ACCESS_KEY_ID": "test-key",
            "AWS_SECRET_ACCESS_KEY": "test-secret",
            "AWS_DEFAULT_REGION": "us-east-1",
        }
        with patch.dict(os.environ, env_vars):
            yield

    def test_supports_multimodal_claude3(self) -> None:
        """Test Bedrock Claude 3 supports multimodal."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        assert llm.supports_multimodal() is True

    def test_does_not_support_claude2(self) -> None:
        """Test Bedrock Claude 2 does not support multimodal."""
        llm = LLM(model="bedrock/anthropic.claude-v2")
        assert llm.supports_multimodal() is False

    def test_format_multimodal_content_image(self) -> None:
        """Test Bedrock uses Converse API image format."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1
        assert "image" in result[0]
        assert result[0]["image"]["format"] == "png"
        assert "source" in result[0]["image"]
        assert "bytes" in result[0]["image"]["source"]

    def test_format_multimodal_content_pdf(self) -> None:
        """Test Bedrock uses Converse API document format."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        files = {"doc": PDFFile(source=MINIMAL_PDF)}

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1
        assert "document" in result[0]
        assert result[0]["document"]["format"] == "pdf"
        assert "source" in result[0]["document"]

class TestBaseLLMMultimodal:
    """Tests for BaseLLM default multimodal behavior."""

    def test_base_supports_multimodal_false(self) -> None:
        """Test base implementation returns False."""
        from crewai.llms.base_llm import BaseLLM

        class TestLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = TestLLM(model="test")
        assert llm.supports_multimodal() is False

    def test_base_format_text_content(self) -> None:
        """Test base text formatting uses OpenAI/Anthropic style."""
        from crewai.llms.base_llm import BaseLLM

        class TestLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = TestLLM(model="test")
        result = llm.format_text_content("Hello")
        assert result == {"type": "text", "text": "Hello"}

class TestMultipleFilesFormatting:
    """Tests for formatting multiple files at once."""

    def test_format_multiple_images(self) -> None:
        """Test formatting multiple images."""
        llm = LLM(model="gpt-4o")
        files = {
            "chart1": ImageFile(source=MINIMAL_PNG),
            "chart2": ImageFile(source=MINIMAL_PNG),
        }

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 2

    def test_format_mixed_supported_and_unsupported(self) -> None:
        """Test only supported types are formatted."""
        llm = LLM(model="gpt-4o")  # OpenAI - images only
        files = {
            "chart": ImageFile(source=MINIMAL_PNG),
            "doc": PDFFile(source=MINIMAL_PDF),  # Not supported by OpenAI
            "text": TextFile(source=b"hello"),  # Not supported
        }

        result = format_multimodal_content(files, getattr(llm, "provider", None) or llm.model)

        assert len(result) == 1  # Only image supported

    def test_format_empty_files_dict(self) -> None:
        """Test empty files dict returns empty list."""
        llm = LLM(model="gpt-4o")

        result = format_multimodal_content({}, llm.model)

        assert result == []
lib/crewai/tests/llms/test_multimodal_integration.py (new file, 782 lines)
@@ -0,0 +1,782 @@
"""Integration tests for LLM multimodal functionality with cassettes.
|
||||
|
||||
These tests make actual API calls (recorded via VCR cassettes) to verify
|
||||
multimodal content is properly sent and processed by each provider.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.llm import LLM
|
||||
from crewai_files import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
format_multimodal_content,
|
||||
)
|
||||
from crewai_files.resolution.resolver import FileResolver, FileResolverConfig
|
||||
|
||||
|
||||
# Path to test data files
|
||||
TEST_FIXTURES_DIR = Path(__file__).parent.parent.parent.parent / "crewai-files" / "tests" / "fixtures"
|
||||
TEST_IMAGE_PATH = TEST_FIXTURES_DIR / "revenue_chart.png"
|
||||
TEST_TEXT_PATH = TEST_FIXTURES_DIR / "review_guidelines.txt"
|
||||
TEST_VIDEO_PATH = TEST_FIXTURES_DIR / "sample_video.mp4"
|
||||
TEST_AUDIO_PATH = TEST_FIXTURES_DIR / "sample_audio.wav"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_image_bytes() -> bytes:
|
||||
"""Load test image bytes."""
|
||||
return TEST_IMAGE_PATH.read_bytes()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_text_bytes() -> bytes:
|
||||
"""Load test text bytes."""
|
||||
return TEST_TEXT_PATH.read_bytes()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_video_bytes() -> bytes:
|
||||
"""Load test video bytes."""
|
||||
if not TEST_VIDEO_PATH.exists():
|
||||
pytest.skip("sample_video.mp4 fixture not found")
|
||||
return TEST_VIDEO_PATH.read_bytes()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_audio_bytes() -> bytes:
|
||||
"""Load test audio bytes."""
|
||||
if not TEST_AUDIO_PATH.exists():
|
||||
pytest.skip("sample_audio.wav fixture not found")
|
||||
return TEST_AUDIO_PATH.read_bytes()
|
||||
|
||||
|
||||
# Minimal PDF for testing (real PDF structure)
|
||||
MINIMAL_PDF = b"""%PDF-1.4
|
||||
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
|
||||
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
|
||||
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
|
||||
xref
|
||||
0 4
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000058 00000 n
|
||||
0000000115 00000 n
|
||||
trailer << /Size 4 /Root 1 0 R >>
|
||||
startxref
|
||||
196
|
||||
%%EOF
|
||||
"""
|
||||
|
||||
|
||||
def _build_multimodal_message(llm: LLM, prompt: str, files: dict) -> list[dict]:
    """Build a multimodal message with text and file content."""
    provider = getattr(llm, "provider", None) or llm.model
    content_blocks = format_multimodal_content(files, provider)
    return [
        {
            "role": "user",
            "content": [
                llm.format_text_content(prompt),
                *content_blocks,
            ],
        }
    ]

class TestOpenAIMultimodalIntegration:
    """Integration tests for OpenAI multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test OpenAI can describe an image."""
        llm = LLM(model="openai/gpt-4o-mini")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestOpenAIO4MiniMultimodalIntegration:
    """Integration tests for OpenAI o4-mini reasoning model with vision."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test o4-mini can describe an image."""
        llm = LLM(model="openai/o4-mini")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestOpenAIGPT41MiniMultimodalIntegration:
    """Integration tests for OpenAI GPT-4.1-mini with vision."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test GPT-4.1-mini can describe an image."""
        llm = LLM(model="openai/gpt-4.1-mini")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

class TestOpenAIGPT5MultimodalIntegration:
    """Integration tests for OpenAI GPT-5 with vision."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test GPT-5 can describe an image."""
        llm = LLM(model="openai/gpt-5")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestOpenAIGPT5MiniMultimodalIntegration:
    """Integration tests for OpenAI GPT-5-mini with vision."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test GPT-5-mini can describe an image."""
        llm = LLM(model="openai/gpt-5-mini")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestOpenAIGPT5NanoMultimodalIntegration:
    """Integration tests for OpenAI GPT-5-nano with vision."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test GPT-5-nano can describe an image."""
        llm = LLM(model="openai/gpt-5-nano")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

class TestAnthropicMultimodalIntegration:
    """Integration tests for Anthropic multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can describe an image."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_pdf(self) -> None:
        """Test Anthropic can analyze a PDF."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"document": PDFFile(source=MINIMAL_PDF)}

        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

class TestAzureMultimodalIntegration:
    """Integration tests for Azure OpenAI multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Azure OpenAI can describe an image."""
        llm = LLM(model="azure/gpt-4o")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

class TestBedrockMultimodalIntegration:
    """Integration tests for AWS Bedrock multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Bedrock Claude can describe an image."""
        llm = LLM(model="bedrock/anthropic.claude-3-haiku-20240307-v1:0")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_pdf(self) -> None:
        """Test Bedrock Claude can analyze a PDF."""
        llm = LLM(model="bedrock/anthropic.claude-3-haiku-20240307-v1:0")
        files = {"document": PDFFile(source=MINIMAL_PDF)}

        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

class TestGeminiMultimodalIntegration:
    """Integration tests for Gemini multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Gemini can describe an image."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_text_file(self, test_text_bytes: bytes) -> None:
        """Test Gemini can analyze a text file."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"readme": TextFile(source=test_text_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Summarize what this text file says in one sentence.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_video_file(self, test_video_bytes: bytes) -> None:
        """Test Gemini can analyze a video file."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"video": VideoFile(source=test_video_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe what you see in this video in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_audio_file(self, test_audio_bytes: bytes) -> None:
        """Test Gemini can analyze an audio file."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"audio": AudioFile(source=test_audio_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe what you hear in this audio in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

class TestLiteLLMMultimodalIntegration:
|
||||
"""Integration tests for LiteLLM wrapper multimodal with real API calls."""
|
||||
|
||||
@pytest.mark.vcr()
|
||||
def test_describe_image_gpt4o(self, test_image_bytes: bytes) -> None:
|
||||
"""Test LiteLLM with GPT-4o can describe an image."""
|
||||
llm = LLM(model="gpt-4o-mini", is_litellm=True)
|
||||
files = {"image": ImageFile(source=test_image_bytes)}
|
||||
|
||||
messages = _build_multimodal_message(
|
||||
llm,
|
||||
"Describe this image in one sentence. Be brief.",
|
||||
files,
|
||||
)
|
||||
|
||||
response = llm.call(messages)
|
||||
|
||||
assert response
|
||||
assert isinstance(response, str)
|
||||
assert len(response) > 0
|
||||
|
||||
@pytest.mark.vcr()
|
||||
def test_describe_image_claude(self, test_image_bytes: bytes) -> None:
|
||||
"""Test LiteLLM with Claude can describe an image."""
|
||||
llm = LLM(model="anthropic/claude-3-5-haiku-20241022", is_litellm=True)
|
||||
files = {"image": ImageFile(source=test_image_bytes)}
|
||||
|
||||
messages = _build_multimodal_message(
|
||||
llm,
|
||||
"Describe this image in one sentence. Be brief.",
|
||||
files,
|
||||
)
|
||||
|
||||
response = llm.call(messages)
|
||||
|
||||
assert response
|
||||
assert isinstance(response, str)
|
||||
assert len(response) > 0
|
||||
|
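
# With multiple files, the user message simply carries one content block per
# file after the text block, mirroring _build_multimodal_message_with_upload
# defined further below (sketch):
#
#     {"role": "user", "content": [text_block, image_block_1, image_block_2]}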


class TestMultipleFilesIntegration:
    """Integration tests for multiple files in a single request."""

    @pytest.mark.vcr()
    def test_multiple_images_openai(self, test_image_bytes: bytes) -> None:
        """Test OpenAI can process multiple images."""
        llm = LLM(model="openai/gpt-4o-mini")
        files = {
            "image1": ImageFile(source=test_image_bytes),
            "image2": ImageFile(source=test_image_bytes),
        }

        messages = _build_multimodal_message(
            llm,
            "How many images do you see? Answer with just the number.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert "2" in response or "two" in response.lower()

    @pytest.mark.vcr()
    def test_mixed_content_anthropic(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can process image and PDF together."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {
            "image": ImageFile(source=test_image_bytes),
            "document": PDFFile(source=MINIMAL_PDF),
        }

        messages = _build_multimodal_message(
            llm,
            "What types of files did I send you? List them briefly.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0
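
# The generic File class is expected to sniff the payload (e.g. PNG or PDF
# magic bytes) and resolve to the same content type the explicit classes use,
# so the following are assumed interchangeable (illustrative, not asserted
# directly by the tests below):
#
#     File(source=test_image_bytes)   # behaves like ImageFile(source=...)
#     File(source=MINIMAL_PDF)        # behaves like PDFFile(source=...)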


class TestGenericFileIntegration:
    """Integration tests for the generic File class with auto-detection."""

    @pytest.mark.vcr()
    def test_generic_file_image_openai(self, test_image_bytes: bytes) -> None:
        """Test generic File auto-detects image and sends correct content type."""
        llm = LLM(model="openai/gpt-4o-mini")
        files = {"image": File(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_generic_file_pdf_anthropic(self) -> None:
        """Test generic File auto-detects PDF and sends correct content type."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"document": File(source=MINIMAL_PDF)}

        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_generic_file_text_gemini(self, test_text_bytes: bytes) -> None:
        """Test generic File auto-detects text and sends correct content type."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"content": File(source=test_text_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Summarize what this text says in one sentence.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_generic_file_mixed_types(self, test_image_bytes: bytes) -> None:
        """Test generic File works with multiple auto-detected types."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {
            "chart": File(source=test_image_bytes),
            "doc": File(source=MINIMAL_PDF),
        }

        messages = _build_multimodal_message(
            llm,
            "What types of files did I send? List them briefly.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


def _build_multimodal_message_with_upload(
    llm: LLM, prompt: str, files: dict
) -> tuple[list[dict], list[dict]]:
    """Build a multimodal message using file_id uploads instead of inline base64.

    Note: the OpenAI Chat Completions API only supports file_id for PDFs via
    type="file", not for images. For image file_id support, OpenAI requires
    the Responses API (type="input_image"). Since crewAI uses Chat Completions,
    we test file_id uploads with Anthropic, which supports file_id for all types.

    Returns:
        Tuple of (messages, content_blocks) where content_blocks can be inspected
        to verify file_id was used.
    """
    from crewai_files.formatting.anthropic import AnthropicFormatter

    config = FileResolverConfig(prefer_upload=True)
    resolver = FileResolver(config=config)
    formatter = AnthropicFormatter()

    content_blocks = []
    for file in files.values():
        resolved = resolver.resolve(file, "anthropic")
        block = formatter.format_block(file, resolved)
        if block is not None:
            content_blocks.append(block)

    messages = [
        {
            "role": "user",
            "content": [
                llm.format_text_content(prompt),
                *content_blocks,
            ],
        }
    ]
    return messages, content_blocks
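
# For reference: the Anthropic content-block shape the upload-path assertions
# further below expect for an image resolved to a file_id (a sketch based on
# those assertions plus Anthropic's Files API conventions, not the formatter's
# full output):
#
#     {
#         "type": "image",
#         "source": {"type": "file", "file_id": "file_..."},
#     }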


def _build_responses_message_with_upload(
    llm: LLM, prompt: str, files: dict
) -> tuple[list[dict], list[dict]]:
    """Build a Responses API message using file_id uploads.

    The Responses API supports file_id for images via type="input_image".

    Returns:
        Tuple of (messages, content_blocks) where content_blocks can be inspected
        to verify file_id was used.
    """
    from crewai_files.formatting import OpenAIResponsesFormatter

    config = FileResolverConfig(prefer_upload=True)
    resolver = FileResolver(config=config)

    content_blocks = []
    for file in files.values():
        resolved = resolver.resolve(file, "openai")
        block = OpenAIResponsesFormatter.format_block(resolved, file.content_type)
        content_blocks.append(block)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": prompt},
                *content_blocks,
            ],
        }
    ]
    return messages, content_blocks
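
# Responses API image blocks referenced by the tests below come in two forms
# (sketch: the file_id form is what prefer_upload should produce, while the
# inline form carries a base64 data URL instead):
#
#     {"type": "input_image", "file_id": "file_..."}
#     {"type": "input_image", "image_url": "data:image/png;base64,..."}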


class TestAnthropicFileUploadIntegration:
    """Integration tests for Anthropic multimodal with file_id uploads.

    We test file_id uploads with Anthropic because the OpenAI Chat Completions
    API only supports file_id references for PDFs (type="file"), not images.
    OpenAI's Responses API supports image file_id (type="input_image"), but
    crewAI currently uses Chat Completions. Anthropic supports file_id for
    all content types, including images.
    """

    @pytest.mark.vcr()
    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can describe an image uploaded via the Files API."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages, content_blocks = _build_multimodal_message_with_upload(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        # Verify we're using file_id, not base64
        assert len(content_blocks) == 1
        source = content_blocks[0].get("source", {})
        assert source.get("type") == "file", (
            f"Expected source type 'file' for file_id upload, got '{source.get('type')}'. "
            "This test verifies file_id uploads work - if falling back to base64, "
            "check that the Anthropic Files API uploader is working correctly."
        )
        assert "file_id" in source, "Expected file_id in source for file_id upload"

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestOpenAIResponsesFileUploadIntegration:
    """Integration tests for the OpenAI Responses API with file_id uploads.

    The Responses API supports file_id for images via type="input_image",
    unlike Chat Completions, which only supports file_id for PDFs.
    """

    @pytest.mark.vcr()
    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
        """Test the OpenAI Responses API can describe an image uploaded via the Files API."""
        llm = LLM(model="openai/gpt-4o-mini", api="responses")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages, content_blocks = _build_responses_message_with_upload(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        # Verify we're using file_id with input_image type
        assert len(content_blocks) == 1
        block = content_blocks[0]
        assert block.get("type") == "input_image", (
            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'. "
            "This test verifies file_id uploads work with the Responses API."
        )
        assert "file_id" in block, "Expected file_id in block for file_id upload"

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_describe_image_via_format_api(self, test_image_bytes: bytes) -> None:
        """Test format_multimodal_content with the api='responses' parameter."""
        llm = LLM(model="openai/gpt-4o-mini", api="responses")
        files = {"image": ImageFile(source=test_image_bytes)}

        content_blocks = format_multimodal_content(files, "openai", api="responses")

        # Verify content blocks use the Responses API format
        assert len(content_blocks) == 1
        block = content_blocks[0]
        assert block.get("type") == "input_image", (
            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'"
        )
        # Should have image_url (base64 data URL) since we're not forcing upload
        assert "image_url" in block, "Expected image_url in block for inline image"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "Describe this image in one sentence."},
                    *content_blocks,
                ],
            }
        ]

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_describe_image_via_format_api_with_upload(
        self, test_image_bytes: bytes
    ) -> None:
        """Test format_multimodal_content with prefer_upload=True uploads the file."""
        llm = LLM(model="openai/gpt-4o-mini", api="responses")
        files = {"image": ImageFile(source=test_image_bytes)}

        content_blocks = format_multimodal_content(
            files, "openai", api="responses", prefer_upload=True
        )

        # Verify content blocks use the file_id from the upload
        assert len(content_blocks) == 1
        block = content_blocks[0]
        assert block.get("type") == "input_image", (
            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'"
        )
        assert "file_id" in block, (
            "Expected file_id in block when prefer_upload=True. "
            f"Got keys: {list(block.keys())}"
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "Describe this image in one sentence."},
                    *content_blocks,
                ],
            }
        ]

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0