feat(files): add file_id upload support and text file handling

- Add VCR patch for binary request bodies (base64 encoding fallback)
- Add generate_filename() utility for UUID-based filenames with extension
- Add OpenAIResponsesFormatter for Responses API (input_image, input_file)
- Fix OpenAI uploader to use 'vision' purpose for images
- Fix Anthropic uploader to use tuple format (filename, content, content_type)
- Add TextConstraints and text support for Gemini
- Add file_id upload integration tests for Anthropic and OpenAI Responses API
2026-05-03 08:12:39 +00:00 · 2026-01-23 01:57:29 -05:00
parent 7c9ce9ccd8
commit 4ab53c0726
14 changed files with 833 additions and 44 deletions
--- a/lib/crewai/tests/cassettes/llms/TestAnthropicFileUploadIntegration.test_describe_image_with_file_id.yaml
+++ b/lib/crewai/tests/cassettes/llms/TestAnthropicFileUploadIntegration.test_describe_image_with_file_id.yaml
--- a/lib/crewai/tests/cassettes/llms/TestGeminiMultimodalIntegration.test_analyze_text_file.yaml
+++ b/lib/crewai/tests/cassettes/llms/TestGeminiMultimodalIntegration.test_analyze_text_file.yaml
@@ -1,7 +1,8 @@
 interactions:
 - request:
    body: '{"contents": [{"parts": [{"text": "Summarize what this text file says in
-      one sentence."}], "role": "user"}], "generationConfig": {}}'
+      one sentence."}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
+      "mimeType": "text/plain"}}], "role": "user"}], "generationConfig": {}}'
    headers:
      User-Agent:
      - X-USER-AGENT-XXX
@@ -12,7 +13,7 @@ interactions:
      connection:
      - keep-alive
      content-length:
-      - '132'
+      - '976'
      content-type:
      - application/json
      host:
@@ -26,27 +27,28 @@ interactions:
  response:
    body:
      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
-        [\n          {\n            \"text\": \"Please provide the text file so I
-        can summarize it for you. I need the content of the file to be able to understand
-        and summarize it in one sentence.\\n\"\n          }\n        ],\n        \"role\":
+        [\n          {\n            \"text\": \"The text file outlines guidelines
+        for providing effective feedback, emphasizing clarity, specificity, a balance
+        of positive and constructive criticism, respect, objectivity, actionable suggestions,
+        and careful proofreading.\\n\"\n          }\n        ],\n        \"role\":
        \"model\"\n      },\n      \"finishReason\": \"STOP\",\n      \"avgLogprobs\":
-        -0.17782547979643851\n    }\n  ],\n  \"usageMetadata\": {\n    \"promptTokenCount\":
-        11,\n    \"candidatesTokenCount\": 33,\n    \"totalTokenCount\": 44,\n    \"promptTokensDetails\":
-        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 11\n
+        -0.17109338442484537\n    }\n  ],\n  \"usageMetadata\": {\n    \"promptTokenCount\":
+        136,\n    \"candidatesTokenCount\": 36,\n    \"totalTokenCount\": 172,\n    \"promptTokensDetails\":
+        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 136\n
        \     }\n    ],\n    \"candidatesTokensDetails\": [\n      {\n        \"modality\":
-        \"TEXT\",\n        \"tokenCount\": 33\n      }\n    ]\n  },\n  \"modelVersion\":
-        \"gemini-2.0-flash\",\n  \"responseId\": \"b-dyabKwN8a9jrEP7JT1yAo\"\n}\n"
+        \"TEXT\",\n        \"tokenCount\": 36\n      }\n    ]\n  },\n  \"modelVersion\":
+        \"gemini-2.0-flash\",\n  \"responseId\": \"wxZzaYaiGYG2_uMPtMjFiAw\"\n}\n"
    headers:
      Alt-Svc:
      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
      Content-Type:
      - application/json; charset=UTF-8
      Date:
-      - Fri, 23 Jan 2026 03:13:52 GMT
+      - Fri, 23 Jan 2026 06:35:48 GMT
      Server:
      - scaffolding on HTTPServer2
      Server-Timing:
-      - gfet4t7; dur=631
+      - gfet4t7; dur=675
      Transfer-Encoding:
      - chunked
      Vary:
--- a/lib/crewai/tests/cassettes/llms/TestGenericFileIntegration.test_generic_file_text_gemini.yaml
+++ b/lib/crewai/tests/cassettes/llms/TestGenericFileIntegration.test_generic_file_text_gemini.yaml
@@ -0,0 +1,67 @@
+interactions:
+- request:
+    body: '{"contents": [{"parts": [{"text": "Summarize what this text says in one
+      sentence."}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
+      "mimeType": "text/plain"}}], "role": "user"}], "generationConfig": {}}'
+    headers:
+      User-Agent:
+      - X-USER-AGENT-XXX
+      accept:
+      - '*/*'
+      accept-encoding:
+      - ACCEPT-ENCODING-XXX
+      connection:
+      - keep-alive
+      content-length:
+      - '971'
+      content-type:
+      - application/json
+      host:
+      - generativelanguage.googleapis.com
+      x-goog-api-client:
+      - google-genai-sdk/1.49.0 gl-python/3.12.10
+      x-goog-api-key:
+      - X-GOOG-API-KEY-XXX
+    method: POST
+    uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
+  response:
+    body:
+      string: "{\n  \"candidates\": [\n    {\n      \"content\": {\n        \"parts\":
+        [\n          {\n            \"text\": \"Effective review feedback should be
+        clear, specific, balanced, respectful, and constructive, focusing on behaviors
+        and outcomes with examples, objective criteria, and suggested next steps,
+        ensuring it is proofread for clarity.\\n\"\n          }\n        ],\n        \"role\":
+        \"model\"\n      },\n      \"finishReason\": \"STOP\",\n      \"avgLogprobs\":
+        -0.35489303309743\n    }\n  ],\n  \"usageMetadata\": {\n    \"promptTokenCount\":
+        135,\n    \"candidatesTokenCount\": 41,\n    \"totalTokenCount\": 176,\n    \"promptTokensDetails\":
+        [\n      {\n        \"modality\": \"TEXT\",\n        \"tokenCount\": 135\n
+        \     }\n    ],\n    \"candidatesTokensDetails\": [\n      {\n        \"modality\":
+        \"TEXT\",\n        \"tokenCount\": 41\n      }\n    ]\n  },\n  \"modelVersion\":
+        \"gemini-2.0-flash\",\n  \"responseId\": \"xBZzaY2tCsa9jrEP7JT1yAo\"\n}\n"
+    headers:
+      Alt-Svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      Content-Type:
+      - application/json; charset=UTF-8
+      Date:
+      - Fri, 23 Jan 2026 06:35:48 GMT
+      Server:
+      - scaffolding on HTTPServer2
+      Server-Timing:
+      - gfet4t7; dur=732
+      Transfer-Encoding:
+      - chunked
+      Vary:
+      - Origin
+      - X-Origin
+      - Referer
+      X-Content-Type-Options:
+      - X-CONTENT-TYPE-XXX
+      X-Frame-Options:
+      - X-FRAME-OPTIONS-XXX
+      X-XSS-Protection:
+      - '0'
+    status:
+      code: 200
+      message: OK
+version: 1
--- a/lib/crewai/tests/cassettes/llms/TestOpenAIResponsesFileUploadIntegration.test_describe_image_with_file_id.yaml
+++ b/lib/crewai/tests/cassettes/llms/TestOpenAIResponsesFileUploadIntegration.test_describe_image_with_file_id.yaml
--- a/lib/crewai/tests/llms/test_multimodal_integration.py
+++ b/lib/crewai/tests/llms/test_multimodal_integration.py
@@ -18,6 +18,7 @@ from crewai_files import (
    VideoFile,
    format_multimodal_content,
 )
+from crewai_files.resolution.resolver import FileResolver, FileResolverConfig


 # Path to test data files
@@ -559,6 +560,153 @@ class TestGenericFileIntegration:

        response = llm.call(messages)

+        assert response
+        assert isinstance(response, str)
+        assert len(response) > 0
+
+
+def _build_multimodal_message_with_upload(
+    llm: LLM, prompt: str, files: dict
+) -> tuple[list[dict], list[dict]]:
+    """Build a multimodal message using file_id uploads instead of inline base64.
+
+    Note: OpenAI Chat Completions API only supports file_id for PDFs via
+    type="file", not for images. Image file_id support on OpenAI requires the
+    Responses API (type="input_image"); see _build_responses_message_with_upload.
+    This helper targets Anthropic, which supports file_id for all types.
+
+    Returns:
+        Tuple of (messages, content_blocks) where content_blocks can be inspected
+        to verify file_id was used.
+    """
+    from crewai_files.formatting.anthropic import AnthropicFormatter
+
+    config = FileResolverConfig(prefer_upload=True)
+    resolver = FileResolver(config=config)
+    formatter = AnthropicFormatter()
+
+    content_blocks = []
+    for file in files.values():
+        resolved = resolver.resolve(file, "anthropic")
+        block = formatter.format_block(file, resolved)
+        if block is not None:
+            content_blocks.append(block)
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                llm.format_text_content(prompt),
+                *content_blocks,
+            ],
+        }
+    ]
+    return messages, content_blocks
+
+
+def _build_responses_message_with_upload(
+    llm: LLM, prompt: str, files: dict
+) -> tuple[list[dict], list[dict]]:
+    """Build a Responses API message using file_id uploads.
+
+    The Responses API supports file_id for images via type="input_image".
+
+    Returns:
+        Tuple of (messages, content_blocks) where content_blocks can be inspected
+        to verify file_id was used.
+    """
+    from crewai_files.formatting import OpenAIResponsesFormatter
+
+    config = FileResolverConfig(prefer_upload=True)
+    resolver = FileResolver(config=config)
+
+    content_blocks = []
+    for file in files.values():
+        resolved = resolver.resolve(file, "openai")
+        block = OpenAIResponsesFormatter.format_block(resolved, file.content_type)
+        content_blocks.append(block)
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "input_text", "text": prompt},
+                *content_blocks,
+            ],
+        }
+    ]
+    return messages, content_blocks
+
+
+class TestAnthropicFileUploadIntegration:
+    """Integration tests for Anthropic multimodal with file_id uploads.
+
+    We test file_id uploads with Anthropic because the OpenAI Chat Completions
+    API only supports file_id references for PDFs (type="file"), not images.
+    OpenAI's Responses API supports image file_id (type="input_image"); that
+    path is covered by TestOpenAIResponsesFileUploadIntegration. Anthropic
+    supports file_id for all content types, including images.
+    """
+
+    @pytest.mark.vcr()
+    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
+        """Test Anthropic can describe an image uploaded via Files API."""
+        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
+        files = {"image": ImageFile(source=test_image_bytes)}
+
+        messages, content_blocks = _build_multimodal_message_with_upload(
+            llm,
+            "Describe this image in one sentence. Be brief.",
+            files,
+        )
+
+        # Verify we're using file_id, not base64
+        assert len(content_blocks) == 1
+        source = content_blocks[0].get("source", {})
+        assert source.get("type") == "file", (
+            f"Expected source type 'file' for file_id upload, got '{source.get('type')}'. "
+            "This test verifies file_id uploads work - if falling back to base64, "
+            "check that the Anthropic Files API uploader is working correctly."
+        )
+        assert "file_id" in source, "Expected file_id in source for file_id upload"
+
+        response = llm.call(messages)
+
+        assert response
+        assert isinstance(response, str)
+        assert len(response) > 0
+
+
+class TestOpenAIResponsesFileUploadIntegration:
+    """Integration tests for OpenAI Responses API with file_id uploads.
+
+    The Responses API supports file_id for images via type="input_image",
+    unlike Chat Completions which only supports file_id for PDFs.
+    """
+
+    @pytest.mark.vcr()
+    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
+        """Test OpenAI Responses API can describe an image uploaded via Files API."""
+        llm = LLM(model="openai/gpt-4o-mini", api="responses")
+        files = {"image": ImageFile(source=test_image_bytes)}
+
+        messages, content_blocks = _build_responses_message_with_upload(
+            llm,
+            "Describe this image in one sentence. Be brief.",
+            files,
+        )
+
+        # Verify we're using file_id with input_image type
+        assert len(content_blocks) == 1
+        block = content_blocks[0]
+        assert block.get("type") == "input_image", (
+            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'. "
+            "This test verifies file_id uploads work with the Responses API."
+        )
+        assert "file_id" in block, "Expected file_id in block for file_id upload"
+
+        response = llm.call(messages)
+
        assert response
        assert isinstance(response, str)
        assert len(response) > 0