"""Integration tests for LLM multimodal functionality with cassettes.
|
|
|
|
These tests make actual API calls (recorded via VCR cassettes) to verify
|
|
multimodal content is properly sent and processed by each provider.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from crewai.llm import LLM
|
|
from crewai_files import (
|
|
AudioFile,
|
|
File,
|
|
ImageFile,
|
|
PDFFile,
|
|
TextFile,
|
|
VideoFile,
|
|
format_multimodal_content,
|
|
)
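
# Cassette note (assumptions about the harness, not guaranteed by this file):
# @pytest.mark.vcr() typically replays recorded HTTP interactions from a
# cassettes/ directory next to this module. To re-record against live
# provider APIs, delete a cassette or use a recording mode such as
# pytest-recording's --record-mode=rewrite, which requires real credentials.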


# Path to test data files
TEST_FIXTURES_DIR = Path(__file__).parent.parent.parent.parent / "crewai-files" / "tests" / "fixtures"
TEST_IMAGE_PATH = TEST_FIXTURES_DIR / "revenue_chart.png"
TEST_TEXT_PATH = TEST_FIXTURES_DIR / "review_guidelines.txt"
TEST_VIDEO_PATH = TEST_FIXTURES_DIR / "sample_video.mp4"
TEST_AUDIO_PATH = TEST_FIXTURES_DIR / "sample_audio.wav"


@pytest.fixture
def test_image_bytes() -> bytes:
    """Load test image bytes."""
    return TEST_IMAGE_PATH.read_bytes()


@pytest.fixture
def test_text_bytes() -> bytes:
    """Load test text bytes."""
    return TEST_TEXT_PATH.read_bytes()


@pytest.fixture
def test_video_bytes() -> bytes:
    """Load test video bytes."""
    if not TEST_VIDEO_PATH.exists():
        pytest.skip("sample_video.mp4 fixture not found")
    return TEST_VIDEO_PATH.read_bytes()


@pytest.fixture
def test_audio_bytes() -> bytes:
    """Load test audio bytes."""
    if not TEST_AUDIO_PATH.exists():
        pytest.skip("sample_audio.wav fixture not found")
    return TEST_AUDIO_PATH.read_bytes()


# Minimal PDF for testing (real PDF structure)
MINIMAL_PDF = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
196
%%EOF
"""


def _build_multimodal_message(llm: LLM, prompt: str, files: dict) -> list[dict]:
    """Build a multimodal message with text and file content."""
    provider = getattr(llm, "provider", None) or llm.model
    content_blocks = format_multimodal_content(files, provider)
    return [
        {
            "role": "user",
            "content": [
                llm.format_text_content(prompt),
                *content_blocks,
            ],
        }
    ]
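
# Shape illustration (an assumption about what format_multimodal_content
# emits, not something these tests assert): for an OpenAI-style provider the
# helper is expected to return roughly
#     [{"role": "user", "content": [
#         {"type": "text", "text": "<prompt>"},
#         {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
#     ]}]
# while Anthropic-style providers receive blocks like
#     {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": "..."}}.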


class TestOpenAIMultimodalIntegration:
    """Integration tests for OpenAI multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test OpenAI can describe an image."""
        llm = LLM(model="openai/gpt-4o-mini")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestAnthropicMultimodalIntegration:
    """Integration tests for Anthropic multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can describe an image."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_pdf(self) -> None:
        """Test Anthropic can analyze a PDF."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"document": PDFFile(source=MINIMAL_PDF)}

        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestAzureMultimodalIntegration:
    """Integration tests for Azure OpenAI multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Azure OpenAI can describe an image."""
        llm = LLM(model="azure/gpt-4o")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestBedrockMultimodalIntegration:
    """Integration tests for AWS Bedrock multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Bedrock Claude can describe an image."""
        llm = LLM(model="bedrock/anthropic.claude-3-haiku-20240307-v1:0")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_pdf(self) -> None:
        """Test Bedrock Claude can analyze a PDF."""
        llm = LLM(model="bedrock/anthropic.claude-3-haiku-20240307-v1:0")
        files = {"document": PDFFile(source=MINIMAL_PDF)}

        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0
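

# Gemini is the only provider in this module exercised with video and audio
# inputs; the providers above are covered with images and PDFs only.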
class TestGeminiMultimodalIntegration:
    """Integration tests for Gemini multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image(self, test_image_bytes: bytes) -> None:
        """Test Gemini can describe an image."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_text_file(self, test_text_bytes: bytes) -> None:
        """Test Gemini can analyze a text file."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"readme": TextFile(source=test_text_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Summarize what this text file says in one sentence.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_video_file(self, test_video_bytes: bytes) -> None:
        """Test Gemini can analyze a video file."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"video": VideoFile(source=test_video_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe what you see in this video in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_analyze_audio_file(self, test_audio_bytes: bytes) -> None:
        """Test Gemini can analyze an audio file."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"audio": AudioFile(source=test_audio_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe what you hear in this audio in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0
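

# is_litellm=True is assumed to force routing through the LiteLLM wrapper
# rather than the provider-native client, re-running the same image prompt
# through that alternate code path.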
class TestLiteLLMMultimodalIntegration:
    """Integration tests for LiteLLM wrapper multimodal with real API calls."""

    @pytest.mark.vcr()
    def test_describe_image_gpt4o(self, test_image_bytes: bytes) -> None:
        """Test LiteLLM with GPT-4o-mini can describe an image."""
        llm = LLM(model="gpt-4o-mini", is_litellm=True)
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_describe_image_claude(self, test_image_bytes: bytes) -> None:
        """Test LiteLLM with Claude can describe an image."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022", is_litellm=True)
        files = {"image": ImageFile(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0


class TestMultipleFilesIntegration:
    """Integration tests for multiple files in a single request."""

    @pytest.mark.vcr()
    def test_multiple_images_openai(self, test_image_bytes: bytes) -> None:
        """Test OpenAI can process multiple images."""
        llm = LLM(model="openai/gpt-4o-mini")
        files = {
            "image1": ImageFile(source=test_image_bytes),
            "image2": ImageFile(source=test_image_bytes),
        }

        messages = _build_multimodal_message(
            llm,
            "How many images do you see? Answer with just the number.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert "2" in response or "two" in response.lower()

    @pytest.mark.vcr()
    def test_mixed_content_anthropic(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can process image and PDF together."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {
            "image": ImageFile(source=test_image_bytes),
            "document": PDFFile(source=MINIMAL_PDF),
        }

        messages = _build_multimodal_message(
            llm,
            "What types of files did I send you? List them briefly.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0
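

# File is assumed to sniff its payload (for example, via magic bytes) and
# behave like the matching concrete type (ImageFile, PDFFile, TextFile);
# the tests below only check that providers accept the auto-detected content.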
class TestGenericFileIntegration:
    """Integration tests for the generic File class with auto-detection."""

    @pytest.mark.vcr()
    def test_generic_file_image_openai(self, test_image_bytes: bytes) -> None:
        """Test generic File auto-detects image and sends correct content type."""
        llm = LLM(model="openai/gpt-4o-mini")
        files = {"image": File(source=test_image_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Describe this image in one sentence. Be brief.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_generic_file_pdf_anthropic(self) -> None:
        """Test generic File auto-detects PDF and sends correct content type."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"document": File(source=MINIMAL_PDF)}

        messages = _build_multimodal_message(
            llm,
            "What type of document is this? Answer in one word.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_generic_file_text_gemini(self, test_text_bytes: bytes) -> None:
        """Test generic File auto-detects text and sends correct content type."""
        llm = LLM(model="gemini/gemini-2.0-flash")
        files = {"content": File(source=test_text_bytes)}

        messages = _build_multimodal_message(
            llm,
            "Summarize what this text says in one sentence.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0

    @pytest.mark.vcr()
    def test_generic_file_mixed_types(self, test_image_bytes: bytes) -> None:
        """Test generic File works with multiple auto-detected types."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {
            "chart": File(source=test_image_bytes),
            "doc": File(source=MINIMAL_PDF),
        }

        messages = _build_multimodal_message(
            llm,
            "What types of files did I send? List them briefly.",
            files,
        )

        response = llm.call(messages)

        assert response
        assert isinstance(response, str)
        assert len(response) > 0