refactor: improve multimodal file handling architecture

- Make crewai_files an optional dependency with graceful fallbacks
- Move file formatting from executor to LLM layer (_process_message_files)
- Add files field to LLMMessage type for cleaner message passing
- Add cache_control to Anthropic content blocks for prompt caching
- Clean up formatters: static methods for OpenAI/Gemini, proper error handling
- Remove unused ContentFormatter protocol
- Move test fixtures to lib/crewai-files/tests/fixtures
- Add Azure and Bedrock multimodal integration tests
- Fix mypy errors in crew_agent_executor.py
2026-05-05 01:02:37 +00:00 · 2026-01-22 21:55:10 -05:00
parent dc015b14f9
commit a1cbb2f4e2
31 changed files with 320 additions and 1278 deletions
--- a/lib/crewai/tests/llms/test_multimodal_integration.py
+++ b/lib/crewai/tests/llms/test_multimodal_integration.py
@@ -9,13 +9,13 @@ from pathlib import Path
 import pytest

 from crewai.llm import LLM
-from crewai.files import File, ImageFile, PDFFile, TextFile
+from crewai_files import File, ImageFile, PDFFile, TextFile, format_multimodal_content


 # Path to test data files
-TEST_DATA_DIR = Path(__file__).parent.parent.parent.parent.parent / "data"
-TEST_IMAGE_PATH = TEST_DATA_DIR / "revenue_chart.png"
-TEST_TEXT_PATH = TEST_DATA_DIR / "review_guidelines.txt"
+TEST_FIXTURES_DIR = Path(__file__).parent.parent.parent.parent / "crewai-files" / "tests" / "fixtures"
+TEST_IMAGE_PATH = TEST_FIXTURES_DIR / "revenue_chart.png"
+TEST_TEXT_PATH = TEST_FIXTURES_DIR / "review_guidelines.txt"


@pytest.fixture
@@ -50,7 +50,8 @@ startxref

 def _build_multimodal_message(llm: LLM, prompt: str, files: dict) -> list[dict]:
    """Build a multimodal message with text and file content."""
-    content_blocks = llm.format_multimodal_content(files)
+    provider = getattr(llm, "provider", None) or llm.model
+    content_blocks = format_multimodal_content(files, provider)
    return [
        {
            "role": "user",
@@ -124,6 +125,68 @@ class TestAnthropicMultimodalIntegration:
        assert len(response) > 0


+class TestAzureMultimodalIntegration:
+    """Integration tests for Azure OpenAI multimodal with real API calls."""
+
+    @pytest.mark.vcr()
+    def test_describe_image(self, test_image_bytes: bytes) -> None:
+        """Test Azure OpenAI can describe an image."""
+        llm = LLM(model="azure/gpt-4o")
+        files = {"image": ImageFile(source=test_image_bytes)}
+
+        messages = _build_multimodal_message(
+            llm,
+            "Describe this image in one sentence. Be brief.",
+            files,
+        )
+
+        response = llm.call(messages)
+
+        assert response
+        assert isinstance(response, str)
+        assert len(response) > 0
+
+
+class TestBedrockMultimodalIntegration:
+    """Integration tests for AWS Bedrock multimodal with real API calls."""
+
+    @pytest.mark.vcr()
+    def test_describe_image(self, test_image_bytes: bytes) -> None:
+        """Test Bedrock Claude can describe an image."""
+        llm = LLM(model="bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0")
+        files = {"image": ImageFile(source=test_image_bytes)}
+
+        messages = _build_multimodal_message(
+            llm,
+            "Describe this image in one sentence. Be brief.",
+            files,
+        )
+
+        response = llm.call(messages)
+
+        assert response
+        assert isinstance(response, str)
+        assert len(response) > 0
+
+    @pytest.mark.vcr()
+    def test_analyze_pdf(self) -> None:
+        """Test Bedrock Claude can analyze a PDF."""
+        llm = LLM(model="bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0")
+        files = {"document": PDFFile(source=MINIMAL_PDF)}
+
+        messages = _build_multimodal_message(
+            llm,
+            "What type of document is this? Answer in one word.",
+            files,
+        )
+
+        response = llm.call(messages)
+
+        assert response
+        assert isinstance(response, str)
+        assert len(response) > 0
+
+
 class TestGeminiMultimodalIntegration:
    """Integration tests for Gemini multimodal with real API calls."""