feat(files): add file_id upload support and text file handling

- Add VCR patch for binary request bodies (base64 encoding fallback)
- Add generate_filename() utility for UUID-based filenames with extension
- Add OpenAIResponsesFormatter for Responses API (input_image, input_file)
- Fix OpenAI uploader to use 'vision' purpose for images
- Fix Anthropic uploader to use tuple format (filename, content, content_type)
- Add TextConstraints and text support for Gemini
- Add file_id upload integration tests for Anthropic and OpenAI Responses API
This commit is contained in:
Greyson LaLonde
2026-01-23 01:57:29 -05:00
parent 7c9ce9ccd8
commit 4ab53c0726
14 changed files with 833 additions and 44 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,7 +1,8 @@
interactions:
- request:
body: '{"contents": [{"parts": [{"text": "Summarize what this text file says in
one sentence."}], "role": "user"}], "generationConfig": {}}'
one sentence."}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
"mimeType": "text/plain"}}], "role": "user"}], "generationConfig": {}}'
headers:
User-Agent:
- X-USER-AGENT-XXX
@@ -12,7 +13,7 @@ interactions:
connection:
- keep-alive
content-length:
- '132'
- '976'
content-type:
- application/json
host:
@@ -26,27 +27,28 @@ interactions:
response:
body:
string: "{\n \"candidates\": [\n {\n \"content\": {\n \"parts\":
[\n {\n \"text\": \"Please provide the text file so I
can summarize it for you. I need the content of the file to be able to understand
and summarize it in one sentence.\\n\"\n }\n ],\n \"role\":
[\n {\n \"text\": \"The text file outlines guidelines
for providing effective feedback, emphasizing clarity, specificity, a balance
of positive and constructive criticism, respect, objectivity, actionable suggestions,
and careful proofreading.\\n\"\n }\n ],\n \"role\":
\"model\"\n },\n \"finishReason\": \"STOP\",\n \"avgLogprobs\":
-0.17782547979643851\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\":
11,\n \"candidatesTokenCount\": 33,\n \"totalTokenCount\": 44,\n \"promptTokensDetails\":
[\n {\n \"modality\": \"TEXT\",\n \"tokenCount\": 11\n
-0.17109338442484537\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\":
136,\n \"candidatesTokenCount\": 36,\n \"totalTokenCount\": 172,\n \"promptTokensDetails\":
[\n {\n \"modality\": \"TEXT\",\n \"tokenCount\": 136\n
\ }\n ],\n \"candidatesTokensDetails\": [\n {\n \"modality\":
\"TEXT\",\n \"tokenCount\": 33\n }\n ]\n },\n \"modelVersion\":
\"gemini-2.0-flash\",\n \"responseId\": \"b-dyabKwN8a9jrEP7JT1yAo\"\n}\n"
\"TEXT\",\n \"tokenCount\": 36\n }\n ]\n },\n \"modelVersion\":
\"gemini-2.0-flash\",\n \"responseId\": \"wxZzaYaiGYG2_uMPtMjFiAw\"\n}\n"
headers:
Alt-Svc:
- h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
Content-Type:
- application/json; charset=UTF-8
Date:
- Fri, 23 Jan 2026 03:13:52 GMT
- Fri, 23 Jan 2026 06:35:48 GMT
Server:
- scaffolding on HTTPServer2
Server-Timing:
- gfet4t7; dur=631
- gfet4t7; dur=675
Transfer-Encoding:
- chunked
Vary:

View File

@@ -0,0 +1,67 @@
interactions:
- request:
body: '{"contents": [{"parts": [{"text": "Summarize what this text says in one
sentence."}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
"mimeType": "text/plain"}}], "role": "user"}], "generationConfig": {}}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- '*/*'
accept-encoding:
- ACCEPT-ENCODING-XXX
connection:
- keep-alive
content-length:
- '971'
content-type:
- application/json
host:
- generativelanguage.googleapis.com
x-goog-api-client:
- google-genai-sdk/1.49.0 gl-python/3.12.10
x-goog-api-key:
- X-GOOG-API-KEY-XXX
method: POST
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
response:
body:
string: "{\n \"candidates\": [\n {\n \"content\": {\n \"parts\":
[\n {\n \"text\": \"Effective review feedback should be
clear, specific, balanced, respectful, and constructive, focusing on behaviors
and outcomes with examples, objective criteria, and suggested next steps,
ensuring it is proofread for clarity.\\n\"\n }\n ],\n \"role\":
\"model\"\n },\n \"finishReason\": \"STOP\",\n \"avgLogprobs\":
-0.35489303309743\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\":
135,\n \"candidatesTokenCount\": 41,\n \"totalTokenCount\": 176,\n \"promptTokensDetails\":
[\n {\n \"modality\": \"TEXT\",\n \"tokenCount\": 135\n
\ }\n ],\n \"candidatesTokensDetails\": [\n {\n \"modality\":
\"TEXT\",\n \"tokenCount\": 41\n }\n ]\n },\n \"modelVersion\":
\"gemini-2.0-flash\",\n \"responseId\": \"xBZzaY2tCsa9jrEP7JT1yAo\"\n}\n"
headers:
Alt-Svc:
- h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
Content-Type:
- application/json; charset=UTF-8
Date:
- Fri, 23 Jan 2026 06:35:48 GMT
Server:
- scaffolding on HTTPServer2
Server-Timing:
- gfet4t7; dur=732
Transfer-Encoding:
- chunked
Vary:
- Origin
- X-Origin
- Referer
X-Content-Type-Options:
- X-CONTENT-TYPE-XXX
X-Frame-Options:
- X-FRAME-OPTIONS-XXX
X-XSS-Protection:
- '0'
status:
code: 200
message: OK
version: 1

View File

@@ -18,6 +18,7 @@ from crewai_files import (
VideoFile,
format_multimodal_content,
)
from crewai_files.resolution.resolver import FileResolver, FileResolverConfig
# Path to test data files
@@ -559,6 +560,153 @@ class TestGenericFileIntegration:
response = llm.call(messages)
assert response
assert isinstance(response, str)
assert len(response) > 0
def _build_multimodal_message_with_upload(
    llm: LLM, prompt: str, files: dict
) -> tuple[list[dict], list[dict]]:
    """Build a multimodal user message whose attachments use file_id uploads.

    Note: OpenAI Chat Completions API only supports file_id for PDFs via
    type="file", not for images. For image file_id support, OpenAI requires
    the Responses API (type="input_image"). Since crewAI uses Chat Completions,
    we test file_id uploads with Anthropic which supports file_id for all types.

    Returns:
        Tuple of (messages, content_blocks) where content_blocks can be inspected
        to verify file_id was used.
    """
    from crewai_files.formatting.anthropic import AnthropicFormatter

    resolver = FileResolver(config=FileResolverConfig(prefer_upload=True))
    formatter = AnthropicFormatter()

    # Resolve (upload when possible) and format each file, dropping any
    # files the formatter declines to produce a block for.
    content_blocks = [
        blk
        for f in files.values()
        if (blk := formatter.format_block(f, resolver.resolve(f, "anthropic")))
        is not None
    ]

    user_content = [llm.format_text_content(prompt), *content_blocks]
    messages = [{"role": "user", "content": user_content}]
    return messages, content_blocks
def _build_responses_message_with_upload(
    llm: LLM, prompt: str, files: dict
) -> tuple[list[dict], list[dict]]:
    """Build a Responses API message using file_id uploads.

    The Responses API supports file_id for images via type="input_image".

    Args:
        llm: The LLM under test (kept for signature symmetry with the
            Anthropic helper; the input_text part is built inline here).
        prompt: User prompt to include as the input_text part.
        files: Mapping of arbitrary keys to file objects to resolve/upload.

    Returns:
        Tuple of (messages, content_blocks) where content_blocks can be inspected
        to verify file_id was used.
    """
    from crewai_files.formatting import OpenAIResponsesFormatter

    config = FileResolverConfig(prefer_upload=True)
    resolver = FileResolver(config=config)
    content_blocks = []
    for file in files.values():
        resolved = resolver.resolve(file, "openai")
        block = OpenAIResponsesFormatter.format_block(resolved, file.content_type)
        # Guard against formatter misses, matching the Anthropic helper above;
        # a None entry would otherwise be sent to the API as a content part.
        if block is not None:
            content_blocks.append(block)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": prompt},
                *content_blocks,
            ],
        }
    ]
    return messages, content_blocks
class TestAnthropicFileUploadIntegration:
    """Integration tests for Anthropic multimodal with file_id uploads.

    We test file_id uploads with Anthropic because OpenAI Chat Completions API
    only supports file_id references for PDFs (type="file"), not images.
    OpenAI's Responses API supports image file_id (type="input_image"), but
    crewAI currently uses Chat Completions. Anthropic supports file_id for
    all content types including images.
    """

    @pytest.mark.vcr()
    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
        """Test Anthropic can describe an image uploaded via Files API."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        messages, content_blocks = _build_multimodal_message_with_upload(
            llm,
            "Describe this image in one sentence. Be brief.",
            {"image": ImageFile(source=test_image_bytes)},
        )

        # The upload path must yield exactly one file-backed block; a base64
        # fallback would surface here as a different source type.
        assert len(content_blocks) == 1
        source = content_blocks[0].get("source", {})
        assert source.get("type") == "file", (
            f"Expected source type 'file' for file_id upload, got '{source.get('type')}'. "
            "This test verifies file_id uploads work - if falling back to base64, "
            "check that the Anthropic Files API uploader is working correctly."
        )
        assert "file_id" in source, "Expected file_id in source for file_id upload"

        answer = llm.call(messages)
        assert answer and isinstance(answer, str) and len(answer) > 0
class TestOpenAIResponsesFileUploadIntegration:
    """Integration tests for OpenAI Responses API with file_id uploads.

    The Responses API supports file_id for images via type="input_image",
    unlike Chat Completions which only supports file_id for PDFs.
    """

    @pytest.mark.vcr()
    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
        """Test OpenAI Responses API can describe an image uploaded via Files API."""
        llm = LLM(model="openai/gpt-4o-mini", api="responses")
        messages, content_blocks = _build_responses_message_with_upload(
            llm,
            "Describe this image in one sentence. Be brief.",
            {"image": ImageFile(source=test_image_bytes)},
        )

        # Exactly one block, and it must reference the uploaded file rather
        # than carry inline base64 data.
        assert len(content_blocks) == 1
        block = content_blocks[0]
        assert block.get("type") == "input_image", (
            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'. "
            "This test verifies file_id uploads work with the Responses API."
        )
        assert "file_id" in block, "Expected file_id in block for file_id upload"

        answer = llm.call(messages)
        assert answer and isinstance(answer, str) and len(answer) > 0