Files
crewAI/lib/crewai/tests/llms/test_multimodal.py
Greyson LaLonde a1cbb2f4e2 refactor: improve multimodal file handling architecture
- Make crewai_files an optional dependency with graceful fallbacks
- Move file formatting from executor to LLM layer (_process_message_files)
- Add files field to LLMMessage type for cleaner message passing
- Add cache_control to Anthropic content blocks for prompt caching
- Clean up formatters: static methods for OpenAI/Gemini, proper error handling
- Remove unused ContentFormatter protocol
- Move test fixtures to lib/crewai-files/tests/fixtures
- Add Azure and Bedrock multimodal integration tests
- Fix mypy errors in crew_agent_executor.py
2026-01-22 21:55:10 -05:00

453 lines
17 KiB
Python

"""Unit tests for LLM multimodal functionality across all providers."""
import base64
import os
from unittest.mock import patch
import pytest
from crewai.llm import LLM
from crewai_files import ImageFile, PDFFile, TextFile, format_multimodal_content
# Check for optional provider dependencies.
# Each HAS_* flag gates the corresponding provider test class below via
# @pytest.mark.skipif, so missing SDKs skip cleanly instead of erroring.
try:
    from crewai.llms.providers.anthropic.completion import AnthropicCompletion
    HAS_ANTHROPIC = True
except ImportError:
    HAS_ANTHROPIC = False
try:
    from crewai.llms.providers.azure.completion import AzureCompletion
    HAS_AZURE = True
except ImportError:
    HAS_AZURE = False
try:
    from crewai.llms.providers.bedrock.completion import BedrockCompletion
    HAS_BEDROCK = True
except ImportError:
    HAS_BEDROCK = False
# Minimal valid PNG for testing: signature + IHDR + IEND, no pixel data.
MINIMAL_PNG = (
    b"\x89PNG\r\n\x1a\n"  # PNG file signature
    b"\x00\x00\x00\rIHDR"  # IHDR chunk: length 13 + type
    b"\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00"  # 1x1 px, bit depth 8, color type 2 (truecolour)
    b"\x90wS\xde"  # IHDR CRC
    b"\x00\x00\x00\x00IEND\xaeB`\x82"  # empty IEND chunk + its CRC
)
# Just the PDF header marker plus filler — enough for MIME detection,
# not a parseable document.
MINIMAL_PDF = b"%PDF-1.4 test content"
@pytest.fixture(autouse=True)
def mock_api_keys():
    """Inject a placeholder API key for every provider exercised in this module."""
    key_names = (
        "ANTHROPIC_API_KEY",
        "OPENAI_API_KEY",
        "GOOGLE_API_KEY",
        "AZURE_API_KEY",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
    )
    # patch.dict restores the original environment after each test.
    with patch.dict(os.environ, {name: "test-key" for name in key_names}):
        yield
class TestLiteLLMMultimodal:
    """Multimodal behavior of the LLM class when routed through litellm.

    Every test forces the litellm wrapper — via `is_litellm=True` or a
    `litellm/` model prefix — so no native provider implementation runs.
    """

    def test_supports_multimodal_gpt4o(self) -> None:
        """GPT-4o reports multimodal support."""
        wrapper = LLM(model="gpt-4o", is_litellm=True)
        assert wrapper.supports_multimodal() is True

    def test_supports_multimodal_gpt4_turbo(self) -> None:
        """GPT-4 Turbo reports multimodal support."""
        wrapper = LLM(model="gpt-4-turbo", is_litellm=True)
        assert wrapper.supports_multimodal() is True

    def test_supports_multimodal_claude3(self) -> None:
        """Claude 3 reports multimodal support through litellm."""
        # The litellm/ prefix keeps the native Anthropic provider out of play.
        wrapper = LLM(model="litellm/claude-3-sonnet-20240229")
        assert wrapper.supports_multimodal() is True

    def test_supports_multimodal_gemini(self) -> None:
        """Gemini reports multimodal support."""
        wrapper = LLM(model="gemini/gemini-pro", is_litellm=True)
        assert wrapper.supports_multimodal() is True

    def test_supports_multimodal_gpt35_does_not(self) -> None:
        """GPT-3.5 reports no multimodal support."""
        wrapper = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert wrapper.supports_multimodal() is False

    def test_supported_content_types_openai(self) -> None:
        """OpenAI models accept images but not PDFs."""
        supported = LLM(model="gpt-4o", is_litellm=True).supported_multimodal_content_types()
        assert "image/" in supported
        assert "application/pdf" not in supported

    def test_supported_content_types_claude(self) -> None:
        """Claude models accept images and PDFs through litellm."""
        # The litellm/ prefix keeps the native Anthropic provider out of play.
        supported = LLM(model="litellm/claude-3-sonnet-20240229").supported_multimodal_content_types()
        assert "image/" in supported
        assert "application/pdf" in supported

    def test_supported_content_types_gemini(self) -> None:
        """Gemini models accept a broad range of content categories."""
        supported = LLM(model="gemini/gemini-pro", is_litellm=True).supported_multimodal_content_types()
        for expected in ("image/", "audio/", "video/", "application/pdf", "text/"):
            assert expected in supported

    def test_supported_content_types_non_multimodal(self) -> None:
        """A text-only model advertises no content types."""
        wrapper = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert wrapper.supported_multimodal_content_types() == []

    def test_format_multimodal_content_image(self) -> None:
        """An image is rendered as a single OpenAI-style image_url block."""
        wrapper = LLM(model="gpt-4o", is_litellm=True)
        target = getattr(wrapper, "provider", None) or wrapper.model
        blocks = format_multimodal_content({"chart": ImageFile(source=MINIMAL_PNG)}, target)
        assert len(blocks) == 1
        assert blocks[0]["type"] == "image_url"
        assert "data:image/png;base64," in blocks[0]["image_url"]["url"]

    def test_format_multimodal_content_unsupported_type(self) -> None:
        """Content the provider cannot handle is dropped, not raised."""
        wrapper = LLM(model="gpt-4o", is_litellm=True)  # OpenAI doesn't support PDF
        target = getattr(wrapper, "provider", None) or wrapper.model
        blocks = format_multimodal_content({"doc": PDFFile(source=MINIMAL_PDF)}, target)
        assert blocks == []
@pytest.mark.skipif(not HAS_ANTHROPIC, reason="Anthropic SDK not installed")
class TestAnthropicMultimodal:
    """Multimodal behavior of the native Anthropic provider."""

    def test_supports_multimodal_claude3(self) -> None:
        """Claude 3 reports multimodal support."""
        client = LLM(model="anthropic/claude-3-sonnet-20240229")
        assert client.supports_multimodal() is True

    def test_supports_multimodal_claude4(self) -> None:
        """Claude 4 reports multimodal support."""
        client = LLM(model="anthropic/claude-4-opus")
        assert client.supports_multimodal() is True

    def test_supported_content_types(self) -> None:
        """Anthropic accepts both images and PDF documents."""
        supported = LLM(model="anthropic/claude-3-sonnet-20240229").supported_multimodal_content_types()
        assert "image/" in supported
        assert "application/pdf" in supported

    def test_format_multimodal_content_image(self) -> None:
        """Images use Anthropic's base64 source-based structure."""
        client = LLM(model="anthropic/claude-3-sonnet-20240229")
        target = getattr(client, "provider", None) or client.model
        blocks = format_multimodal_content({"chart": ImageFile(source=MINIMAL_PNG)}, target)
        assert len(blocks) == 1
        image_block = blocks[0]
        assert image_block["type"] == "image"
        assert image_block["source"]["type"] == "base64"
        assert image_block["source"]["media_type"] == "image/png"
        assert "data" in image_block["source"]

    def test_format_multimodal_content_pdf(self) -> None:
        """PDFs use Anthropic's document content structure."""
        client = LLM(model="anthropic/claude-3-sonnet-20240229")
        target = getattr(client, "provider", None) or client.model
        blocks = format_multimodal_content({"doc": PDFFile(source=MINIMAL_PDF)}, target)
        assert len(blocks) == 1
        doc_block = blocks[0]
        assert doc_block["type"] == "document"
        assert doc_block["source"]["type"] == "base64"
        assert doc_block["source"]["media_type"] == "application/pdf"
class TestOpenAIMultimodal:
    """Multimodal behavior of the native OpenAI provider."""

    def test_supports_multimodal_gpt4o(self) -> None:
        """GPT-4o reports multimodal support."""
        client = LLM(model="openai/gpt-4o")
        assert client.supports_multimodal() is True

    def test_supports_multimodal_gpt4_vision(self) -> None:
        """GPT-4 Vision reports multimodal support."""
        client = LLM(model="openai/gpt-4-vision-preview")
        assert client.supports_multimodal() is True

    def test_supports_multimodal_o1(self) -> None:
        """The O1 model reports multimodal support."""
        client = LLM(model="openai/o1-preview")
        assert client.supports_multimodal() is True

    def test_does_not_support_gpt35(self) -> None:
        """GPT-3.5 reports no multimodal support."""
        client = LLM(model="openai/gpt-3.5-turbo")
        assert client.supports_multimodal() is False

    def test_supported_content_types(self) -> None:
        """OpenAI advertises exactly the image prefix and nothing else."""
        client = LLM(model="openai/gpt-4o")
        assert client.supported_multimodal_content_types() == ["image/"]

    def test_format_multimodal_content_image(self) -> None:
        """Images become data-URL image_url blocks whose payload round-trips."""
        client = LLM(model="openai/gpt-4o")
        target = getattr(client, "provider", None) or client.model
        blocks = format_multimodal_content({"chart": ImageFile(source=MINIMAL_PNG)}, target)
        assert len(blocks) == 1
        assert blocks[0]["type"] == "image_url"
        url = blocks[0]["image_url"]["url"]
        assert url.startswith("data:image/png;base64,")
        # Decoding the payload must reproduce the original bytes exactly.
        payload = url.split(",")[1]
        assert base64.b64decode(payload) == MINIMAL_PNG
class TestGeminiMultimodal:
    """Multimodal behavior of the native Gemini provider."""

    def test_supports_multimodal_always_true(self) -> None:
        """Gemini models always report multimodal support."""
        client = LLM(model="gemini/gemini-pro")
        assert client.supports_multimodal() is True

    def test_supported_content_types(self) -> None:
        """Gemini advertises a broad set of content categories."""
        supported = LLM(model="gemini/gemini-pro").supported_multimodal_content_types()
        for expected in ("image/", "audio/", "video/", "application/pdf", "text/"):
            assert expected in supported

    def test_format_multimodal_content_image(self) -> None:
        """Images use Gemini's inlineData structure."""
        client = LLM(model="gemini/gemini-pro")
        target = getattr(client, "provider", None) or client.model
        blocks = format_multimodal_content({"chart": ImageFile(source=MINIMAL_PNG)}, target)
        assert len(blocks) == 1
        assert "inlineData" in blocks[0]
        inline = blocks[0]["inlineData"]
        assert inline["mimeType"] == "image/png"
        assert "data" in inline

    def test_format_text_content(self) -> None:
        """Plain text is wrapped in Gemini's bare text key."""
        client = LLM(model="gemini/gemini-pro")
        assert client.format_text_content("Hello world") == {"text": "Hello world"}
@pytest.mark.skipif(not HAS_AZURE, reason="Azure AI Inference SDK not installed")
class TestAzureMultimodal:
    """Multimodal behavior of the Azure OpenAI provider."""

    @pytest.fixture(autouse=True)
    def mock_azure_env(self):
        """Provide the Azure endpoint configuration the provider reads."""
        with patch.dict(
            os.environ,
            {
                "AZURE_API_KEY": "test-key",
                "AZURE_API_BASE": "https://test.openai.azure.com",
                "AZURE_API_VERSION": "2024-02-01",
            },
        ):
            yield

    def test_supports_multimodal_gpt4o(self) -> None:
        """Azure GPT-4o reports multimodal support."""
        client = LLM(model="azure/gpt-4o")
        assert client.supports_multimodal() is True

    def test_supports_multimodal_gpt4_turbo(self) -> None:
        """Azure GPT-4 Turbo reports multimodal support."""
        client = LLM(model="azure/gpt-4-turbo")
        assert client.supports_multimodal() is True

    def test_does_not_support_gpt35(self) -> None:
        """Azure GPT-3.5 reports no multimodal support."""
        client = LLM(model="azure/gpt-35-turbo")
        assert client.supports_multimodal() is False

    def test_supported_content_types(self) -> None:
        """Azure advertises exactly the image prefix."""
        client = LLM(model="azure/gpt-4o")
        assert client.supported_multimodal_content_types() == ["image/"]

    def test_format_multimodal_content_image(self) -> None:
        """Azure formats images the same way the OpenAI provider does."""
        client = LLM(model="azure/gpt-4o")
        target = getattr(client, "provider", None) or client.model
        blocks = format_multimodal_content({"chart": ImageFile(source=MINIMAL_PNG)}, target)
        assert len(blocks) == 1
        assert blocks[0]["type"] == "image_url"
        assert "data:image/png;base64," in blocks[0]["image_url"]["url"]
@pytest.mark.skipif(not HAS_BEDROCK, reason="AWS Bedrock SDK not installed")
class TestBedrockMultimodal:
    """Multimodal behavior of the AWS Bedrock provider."""

    @pytest.fixture(autouse=True)
    def mock_bedrock_env(self):
        """Provide the AWS credential and region variables the provider reads."""
        with patch.dict(
            os.environ,
            {
                "AWS_ACCESS_KEY_ID": "test-key",
                "AWS_SECRET_ACCESS_KEY": "test-secret",
                "AWS_DEFAULT_REGION": "us-east-1",
            },
        ):
            yield

    def test_supports_multimodal_claude3(self) -> None:
        """Bedrock-hosted Claude 3 reports multimodal support."""
        client = LLM(model="bedrock/anthropic.claude-3-sonnet")
        assert client.supports_multimodal() is True

    def test_does_not_support_claude2(self) -> None:
        """Bedrock-hosted Claude 2 reports no multimodal support."""
        client = LLM(model="bedrock/anthropic.claude-v2")
        assert client.supports_multimodal() is False

    def test_supported_content_types(self) -> None:
        """Bedrock accepts image types and PDF documents."""
        supported = LLM(model="bedrock/anthropic.claude-3-sonnet").supported_multimodal_content_types()
        assert any(entry.startswith("image/") for entry in supported)
        assert "application/pdf" in supported

    def test_format_multimodal_content_image(self) -> None:
        """Images use the Converse API's image/format/source/bytes shape."""
        client = LLM(model="bedrock/anthropic.claude-3-sonnet")
        target = getattr(client, "provider", None) or client.model
        blocks = format_multimodal_content({"chart": ImageFile(source=MINIMAL_PNG)}, target)
        assert len(blocks) == 1
        assert "image" in blocks[0]
        image_block = blocks[0]["image"]
        assert image_block["format"] == "png"
        assert "source" in image_block
        assert "bytes" in image_block["source"]

    def test_format_multimodal_content_pdf(self) -> None:
        """PDFs use the Converse API's document/format/source shape."""
        client = LLM(model="bedrock/anthropic.claude-3-sonnet")
        target = getattr(client, "provider", None) or client.model
        blocks = format_multimodal_content({"doc": PDFFile(source=MINIMAL_PDF)}, target)
        assert len(blocks) == 1
        assert "document" in blocks[0]
        doc_block = blocks[0]["document"]
        assert doc_block["format"] == "pdf"
        assert "source" in doc_block
class TestBaseLLMMultimodal:
    """Tests for BaseLLM default multimodal behavior.

    The original version defined an identical concrete subclass named
    ``TestLLM`` inside each test; that is deduplicated into a single
    helper, and the stub is renamed ``_StubLLM`` so its name can never be
    mistaken for (or collected as) a test class.
    """

    @staticmethod
    def _make_stub_llm():
        """Return a minimal concrete BaseLLM whose call() returns "test"."""
        # Import locally, matching the original tests' style, so the module
        # imports even if provider extras change.
        from crewai.llms.base_llm import BaseLLM

        class _StubLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        return _StubLLM(model="test")

    def test_base_supports_multimodal_false(self) -> None:
        """The base implementation reports no multimodal support."""
        assert self._make_stub_llm().supports_multimodal() is False

    def test_base_supported_content_types_empty(self) -> None:
        """The base implementation advertises no content types."""
        assert self._make_stub_llm().supported_multimodal_content_types() == []

    def test_base_format_text_content(self) -> None:
        """Base text formatting uses the OpenAI/Anthropic content-block shape."""
        result = self._make_stub_llm().format_text_content("Hello")
        assert result == {"type": "text", "text": "Hello"}
class TestMultipleFilesFormatting:
    """Tests for formatting multiple files at once."""

    @staticmethod
    def _provider_key(llm):
        """Resolve the provider identifier passed to the formatter.

        Mirrors the expression used by every other formatting test in this
        module: prefer the LLM's `provider` attribute, fall back to the
        raw model string.
        """
        return getattr(llm, "provider", None) or llm.model

    def test_format_multiple_images(self) -> None:
        """Two images produce two content blocks."""
        llm = LLM(model="gpt-4o")
        files = {
            "chart1": ImageFile(source=MINIMAL_PNG),
            "chart2": ImageFile(source=MINIMAL_PNG),
        }
        result = format_multimodal_content(files, self._provider_key(llm))
        assert len(result) == 2

    def test_format_mixed_supported_and_unsupported(self) -> None:
        """Only provider-supported types survive formatting."""
        llm = LLM(model="gpt-4o")  # OpenAI - images only
        files = {
            "chart": ImageFile(source=MINIMAL_PNG),
            "doc": PDFFile(source=MINIMAL_PDF),  # Not supported
            "text": TextFile(source=b"hello"),  # Not supported
        }
        result = format_multimodal_content(files, self._provider_key(llm))
        assert len(result) == 1
        assert result[0]["type"] == "image_url"

    def test_format_empty_files_dict(self) -> None:
        """An empty files dict returns an empty content list."""
        llm = LLM(model="gpt-4o")
        # Consistency fix: previously passed llm.model directly, unlike all
        # sibling tests, which resolve the provider first.
        result = format_multimodal_content({}, self._provider_key(llm))
        assert result == []