"""Unit tests for LLM multimodal functionality across all providers."""
import base64
import os
from unittest.mock import patch
import pytest
from crewai.llm import LLM
from crewai.files import ImageFile, PDFFile, TextFile

# Check for optional provider dependencies
try:
    from crewai.llms.providers.anthropic.completion import AnthropicCompletion

    HAS_ANTHROPIC = True
except ImportError:
    HAS_ANTHROPIC = False

try:
    from crewai.llms.providers.azure.completion import AzureCompletion

    HAS_AZURE = True
except ImportError:
    HAS_AZURE = False

try:
    from crewai.llms.providers.bedrock.completion import BedrockCompletion

    HAS_BEDROCK = True
except ImportError:
    HAS_BEDROCK = False

# Minimal PNG for testing: signature, IHDR (1x1, 8-bit truecolor), IEND.
# There is no IDAT chunk, so it is not decodable image data, but it is
# enough for MIME detection and formatting tests.
MINIMAL_PNG = (
    b"\x89PNG\r\n\x1a\n"  # PNG signature
    b"\x00\x00\x00\rIHDR"  # IHDR chunk: length 13, chunk type
    b"\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00"  # 1x1, bit depth 8, color type 2
    b"\x90wS\xde"  # IHDR CRC
    b"\x00\x00\x00\x00IEND\xaeB`\x82"  # IEND chunk: length 0, chunk type, CRC
)

# A bare PDF header: enough for MIME detection, not a parseable document.
MINIMAL_PDF = b"%PDF-1.4 test content"


@pytest.fixture(autouse=True)
def mock_api_keys():
    """Mock API keys for all providers."""
    env_vars = {
        "ANTHROPIC_API_KEY": "test-key",
        "OPENAI_API_KEY": "test-key",
        "GOOGLE_API_KEY": "test-key",
        "AZURE_API_KEY": "test-key",
        "AWS_ACCESS_KEY_ID": "test-key",
        "AWS_SECRET_ACCESS_KEY": "test-key",
    }
    with patch.dict(os.environ, env_vars):
        yield


class TestLiteLLMMultimodal:
    """Tests for the LLM class (litellm wrapper) multimodal functionality.

    These tests use `is_litellm=True` to ensure the litellm wrapper is used
    instead of native providers.
    """

    def test_supports_multimodal_gpt4o(self) -> None:
        """Test that the GPT-4o model supports multimodal input."""
        llm = LLM(model="gpt-4o", is_litellm=True)
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gpt4_turbo(self) -> None:
        """Test that the GPT-4 Turbo model supports multimodal input."""
        llm = LLM(model="gpt-4-turbo", is_litellm=True)
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_claude3(self) -> None:
        """Test that Claude 3 supports multimodal input via litellm."""
        # Use the litellm/ prefix to avoid importing the native provider
        llm = LLM(model="litellm/claude-3-sonnet-20240229")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gemini(self) -> None:
        """Test that the Gemini model supports multimodal input."""
        llm = LLM(model="gemini/gemini-pro", is_litellm=True)
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gpt35_does_not(self) -> None:
        """Test that the GPT-3.5 model does not support multimodal input."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False

    def test_supported_content_types_openai(self) -> None:
        """Test that OpenAI models support images only."""
        llm = LLM(model="gpt-4o", is_litellm=True)
        types = llm.supported_multimodal_content_types()
        assert "image/" in types
        assert "application/pdf" not in types

    def test_supported_content_types_claude(self) -> None:
        """Test that Claude models support images and PDFs via litellm."""
        # Use the litellm/ prefix to avoid importing the native provider
        llm = LLM(model="litellm/claude-3-sonnet-20240229")
        types = llm.supported_multimodal_content_types()
        assert "image/" in types
        assert "application/pdf" in types

    def test_supported_content_types_gemini(self) -> None:
        """Test that Gemini models support a wide range of content types."""
        llm = LLM(model="gemini/gemini-pro", is_litellm=True)
        types = llm.supported_multimodal_content_types()
        assert "image/" in types
        assert "audio/" in types
        assert "video/" in types
        assert "application/pdf" in types
        assert "text/" in types

    def test_supported_content_types_non_multimodal(self) -> None:
        """Test that non-multimodal models return an empty list."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supported_multimodal_content_types() == []

    def test_format_multimodal_content_image(self) -> None:
        """Test formatting image content."""
        llm = LLM(model="gpt-4o", is_litellm=True)
        files = {"chart": ImageFile(source=MINIMAL_PNG)}
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert result[0]["type"] == "image_url"
        assert "data:image/png;base64," in result[0]["image_url"]["url"]

    def test_format_multimodal_content_non_multimodal(self) -> None:
        """Test that a non-multimodal model returns an empty list."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        files = {"chart": ImageFile(source=MINIMAL_PNG)}
        result = llm.format_multimodal_content(files)
        assert result == []

    def test_format_multimodal_content_unsupported_type(self) -> None:
        """Test that an unsupported content type is skipped."""
        llm = LLM(model="gpt-4o", is_litellm=True)  # OpenAI doesn't support PDF
        files = {"doc": PDFFile(source=MINIMAL_PDF)}
        result = llm.format_multimodal_content(files)
        assert result == []
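

# A condensed, parametrized sketch of the capability matrix exercised above.
# The model list only repeats cases already asserted individually; adding
# other model names here would be an assumption about litellm's routing.
@pytest.mark.parametrize(
    ("model", "expected"),
    [
        ("gpt-4o", True),
        ("gpt-4-turbo", True),
        ("gemini/gemini-pro", True),
        ("gpt-3.5-turbo", False),
    ],
)
def test_litellm_supports_multimodal_matrix(model: str, expected: bool) -> None:
    """Parametrized mirror of the per-model checks in TestLiteLLMMultimodal."""
    llm = LLM(model=model, is_litellm=True)
    assert llm.supports_multimodal() is expected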


@pytest.mark.skipif(not HAS_ANTHROPIC, reason="Anthropic SDK not installed")
class TestAnthropicMultimodal:
    """Tests for Anthropic provider multimodal functionality."""

    def test_supports_multimodal_claude3(self) -> None:
        """Test that Claude 3 supports multimodal input."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_claude4(self) -> None:
        """Test that Claude 4 supports multimodal input."""
        llm = LLM(model="anthropic/claude-4-opus")
        assert llm.supports_multimodal() is True

    def test_supported_content_types(self) -> None:
        """Test that Anthropic supports images and PDFs."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        types = llm.supported_multimodal_content_types()
        assert "image/" in types
        assert "application/pdf" in types

    def test_format_multimodal_content_image(self) -> None:
        """Test that the Anthropic image format uses a source-based structure."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert result[0]["type"] == "image"
        assert result[0]["source"]["type"] == "base64"
        assert result[0]["source"]["media_type"] == "image/png"
        assert "data" in result[0]["source"]

    def test_format_multimodal_content_pdf(self) -> None:
        """Test that the Anthropic PDF format uses a document structure."""
        llm = LLM(model="anthropic/claude-3-sonnet-20240229")
        files = {"doc": PDFFile(source=MINIMAL_PDF)}
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert result[0]["type"] == "document"
        assert result[0]["source"]["type"] == "base64"
        assert result[0]["source"]["media_type"] == "application/pdf"


class TestOpenAIMultimodal:
    """Tests for OpenAI provider multimodal functionality."""

    def test_supports_multimodal_gpt4o(self) -> None:
        """Test that GPT-4o supports multimodal input."""
        llm = LLM(model="openai/gpt-4o")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gpt4_vision(self) -> None:
        """Test that GPT-4 Vision supports multimodal input."""
        llm = LLM(model="openai/gpt-4-vision-preview")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_o1(self) -> None:
        """Test that the o1 model supports multimodal input."""
        llm = LLM(model="openai/o1-preview")
        assert llm.supports_multimodal() is True

    def test_does_not_support_gpt35(self) -> None:
        """Test that GPT-3.5 does not support multimodal input."""
        llm = LLM(model="openai/gpt-3.5-turbo")
        assert llm.supports_multimodal() is False

    def test_supported_content_types(self) -> None:
        """Test that OpenAI supports only images."""
        llm = LLM(model="openai/gpt-4o")
        types = llm.supported_multimodal_content_types()
        assert types == ["image/"]

    def test_format_multimodal_content_image(self) -> None:
        """Test that OpenAI uses the image_url format."""
        llm = LLM(model="openai/gpt-4o")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert result[0]["type"] == "image_url"
        url = result[0]["image_url"]["url"]
        assert url.startswith("data:image/png;base64,")
        # Verify the base64 payload decodes back to the original bytes
        b64_data = url.split(",")[1]
        assert base64.b64decode(b64_data) == MINIMAL_PNG


class TestGeminiMultimodal:
    """Tests for Gemini provider multimodal functionality."""

    def test_supports_multimodal_always_true(self) -> None:
        """Test that Gemini always supports multimodal input."""
        llm = LLM(model="gemini/gemini-pro")
        assert llm.supports_multimodal() is True

    def test_supported_content_types(self) -> None:
        """Test that Gemini supports a wide range of content types."""
        llm = LLM(model="gemini/gemini-pro")
        types = llm.supported_multimodal_content_types()
        assert "image/" in types
        assert "audio/" in types
        assert "video/" in types
        assert "application/pdf" in types
        assert "text/" in types

    def test_format_multimodal_content_image(self) -> None:
        """Test that Gemini uses the inlineData format."""
        llm = LLM(model="gemini/gemini-pro")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert "inlineData" in result[0]
        assert result[0]["inlineData"]["mimeType"] == "image/png"
        assert "data" in result[0]["inlineData"]

    def test_format_text_content(self) -> None:
        """Test that the Gemini text format uses a simple text key."""
        llm = LLM(model="gemini/gemini-pro")
        result = llm.format_text_content("Hello world")
        assert result == {"text": "Hello world"}


@pytest.mark.skipif(not HAS_AZURE, reason="Azure AI Inference SDK not installed")
class TestAzureMultimodal:
    """Tests for Azure OpenAI provider multimodal functionality."""

    @pytest.fixture(autouse=True)
    def mock_azure_env(self):
        """Mock Azure-specific environment variables."""
        env_vars = {
            "AZURE_API_KEY": "test-key",
            "AZURE_API_BASE": "https://test.openai.azure.com",
            "AZURE_API_VERSION": "2024-02-01",
        }
        with patch.dict(os.environ, env_vars):
            yield

    def test_supports_multimodal_gpt4o(self) -> None:
        """Test that Azure GPT-4o supports multimodal input."""
        llm = LLM(model="azure/gpt-4o")
        assert llm.supports_multimodal() is True

    def test_supports_multimodal_gpt4_turbo(self) -> None:
        """Test that Azure GPT-4 Turbo supports multimodal input."""
        llm = LLM(model="azure/gpt-4-turbo")
        assert llm.supports_multimodal() is True

    def test_does_not_support_gpt35(self) -> None:
        """Test that Azure GPT-3.5 does not support multimodal input."""
        llm = LLM(model="azure/gpt-35-turbo")
        assert llm.supports_multimodal() is False

    def test_supported_content_types(self) -> None:
        """Test that Azure supports only images."""
        llm = LLM(model="azure/gpt-4o")
        types = llm.supported_multimodal_content_types()
        assert types == ["image/"]

    def test_format_multimodal_content_image(self) -> None:
        """Test that Azure uses the same image_url format as OpenAI."""
        llm = LLM(model="azure/gpt-4o")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert result[0]["type"] == "image_url"
        assert "data:image/png;base64," in result[0]["image_url"]["url"]


@pytest.mark.skipif(not HAS_BEDROCK, reason="AWS Bedrock SDK not installed")
class TestBedrockMultimodal:
    """Tests for AWS Bedrock provider multimodal functionality."""

    @pytest.fixture(autouse=True)
    def mock_bedrock_env(self):
        """Mock AWS-specific environment variables."""
        env_vars = {
            "AWS_ACCESS_KEY_ID": "test-key",
            "AWS_SECRET_ACCESS_KEY": "test-secret",
            "AWS_DEFAULT_REGION": "us-east-1",
        }
        with patch.dict(os.environ, env_vars):
            yield

    def test_supports_multimodal_claude3(self) -> None:
        """Test that Bedrock Claude 3 supports multimodal input."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        assert llm.supports_multimodal() is True

    def test_does_not_support_claude2(self) -> None:
        """Test that Bedrock Claude 2 does not support multimodal input."""
        llm = LLM(model="bedrock/anthropic.claude-v2")
        assert llm.supports_multimodal() is False

    def test_supported_content_types(self) -> None:
        """Test that Bedrock supports images and PDFs."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        types = llm.supported_multimodal_content_types()
        assert "image/" in types
        assert "application/pdf" in types

    def test_format_multimodal_content_image(self) -> None:
        """Test that Bedrock uses the Converse API image format."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert "image" in result[0]
        assert result[0]["image"]["format"] == "png"
        assert "source" in result[0]["image"]
        assert "bytes" in result[0]["image"]["source"]

    def test_format_multimodal_content_pdf(self) -> None:
        """Test that Bedrock uses the Converse API document format."""
        llm = LLM(model="bedrock/anthropic.claude-3-sonnet")
        files = {"doc": PDFFile(source=MINIMAL_PDF)}
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert "document" in result[0]
        assert result[0]["document"]["format"] == "pdf"
        assert "source" in result[0]["document"]


class TestBaseLLMMultimodal:
    """Tests for BaseLLM default multimodal behavior."""

    def test_base_supports_multimodal_false(self) -> None:
        """Test that the base implementation returns False."""
        from crewai.llms.base_llm import BaseLLM

        class StubLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = StubLLM(model="test")
        assert llm.supports_multimodal() is False

    def test_base_supported_content_types_empty(self) -> None:
        """Test that the base implementation returns an empty list."""
        from crewai.llms.base_llm import BaseLLM

        class StubLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = StubLLM(model="test")
        assert llm.supported_multimodal_content_types() == []

    def test_base_format_multimodal_content_empty(self) -> None:
        """Test that the base implementation formats files to an empty list."""
        from crewai.llms.base_llm import BaseLLM

        class StubLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = StubLLM(model="test")
        files = {"chart": ImageFile(source=MINIMAL_PNG)}
        assert llm.format_multimodal_content(files) == []

    def test_base_format_text_content(self) -> None:
        """Test that base text formatting uses the OpenAI/Anthropic style."""
        from crewai.llms.base_llm import BaseLLM

        class StubLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = StubLLM(model="test")
        result = llm.format_text_content("Hello")
        assert result == {"type": "text", "text": "Hello"}


class TestMultipleFilesFormatting:
    """Tests for formatting multiple files at once."""

    def test_format_multiple_images(self) -> None:
        """Test formatting multiple images."""
        llm = LLM(model="gpt-4o")
        files = {
            "chart1": ImageFile(source=MINIMAL_PNG),
            "chart2": ImageFile(source=MINIMAL_PNG),
        }
        result = llm.format_multimodal_content(files)
        assert len(result) == 2

    def test_format_mixed_supported_and_unsupported(self) -> None:
        """Test that only supported types are formatted."""
        llm = LLM(model="gpt-4o")  # OpenAI supports images only
        files = {
            "chart": ImageFile(source=MINIMAL_PNG),
            "doc": PDFFile(source=MINIMAL_PDF),  # Not supported
            "text": TextFile(source=b"hello"),  # Not supported
        }
        result = llm.format_multimodal_content(files)
        assert len(result) == 1
        assert result[0]["type"] == "image_url"

    def test_format_empty_files_dict(self) -> None:
        """Test that an empty files dict returns an empty list."""
        llm = LLM(model="gpt-4o")
        result = llm.format_multimodal_content({})
        assert result == []
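

# For reference, a sketch of how the formatted parts would typically be
# assembled into a chat message for OpenAI-style providers. The message
# plumbing shown here is an assumption about the call path, which these
# tests deliberately do not exercise:
#
#     llm = LLM(model="gpt-4o", is_litellm=True)
#     parts = [llm.format_text_content("Describe this chart.")]
#     parts += llm.format_multimodal_content({"chart": ImageFile(source=MINIMAL_PNG)})
#     message = {"role": "user", "content": parts}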