mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-03 08:12:39 +00:00
feat(files): add file_id upload support and text file handling
- Add VCR patch for binary request bodies (base64 encoding fallback)
- Add generate_filename() utility for UUID-based filenames with extension
- Add OpenAIResponsesFormatter for Responses API (input_image, input_file)
- Fix OpenAI uploader to use 'vision' purpose for images
- Fix Anthropic uploader to use tuple format (filename, content, content_type)
- Add TextConstraints and text support for Gemini
- Add file_id upload integration tests for Anthropic and OpenAI Responses API
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -1,7 +1,8 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"contents": [{"parts": [{"text": "Summarize what this text file says in
|
||||
one sentence."}], "role": "user"}], "generationConfig": {}}'
|
||||
one sentence."}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
|
||||
"mimeType": "text/plain"}}], "role": "user"}], "generationConfig": {}}'
|
||||
headers:
|
||||
User-Agent:
|
||||
- X-USER-AGENT-XXX
|
||||
@@ -12,7 +13,7 @@ interactions:
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '132'
|
||||
- '976'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
@@ -26,27 +27,28 @@ interactions:
|
||||
response:
|
||||
body:
|
||||
string: "{\n \"candidates\": [\n {\n \"content\": {\n \"parts\":
|
||||
[\n {\n \"text\": \"Please provide the text file so I
|
||||
can summarize it for you. I need the content of the file to be able to understand
|
||||
and summarize it in one sentence.\\n\"\n }\n ],\n \"role\":
|
||||
[\n {\n \"text\": \"The text file outlines guidelines
|
||||
for providing effective feedback, emphasizing clarity, specificity, a balance
|
||||
of positive and constructive criticism, respect, objectivity, actionable suggestions,
|
||||
and careful proofreading.\\n\"\n }\n ],\n \"role\":
|
||||
\"model\"\n },\n \"finishReason\": \"STOP\",\n \"avgLogprobs\":
|
||||
-0.17782547979643851\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\":
|
||||
11,\n \"candidatesTokenCount\": 33,\n \"totalTokenCount\": 44,\n \"promptTokensDetails\":
|
||||
[\n {\n \"modality\": \"TEXT\",\n \"tokenCount\": 11\n
|
||||
-0.17109338442484537\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\":
|
||||
136,\n \"candidatesTokenCount\": 36,\n \"totalTokenCount\": 172,\n \"promptTokensDetails\":
|
||||
[\n {\n \"modality\": \"TEXT\",\n \"tokenCount\": 136\n
|
||||
\ }\n ],\n \"candidatesTokensDetails\": [\n {\n \"modality\":
|
||||
\"TEXT\",\n \"tokenCount\": 33\n }\n ]\n },\n \"modelVersion\":
|
||||
\"gemini-2.0-flash\",\n \"responseId\": \"b-dyabKwN8a9jrEP7JT1yAo\"\n}\n"
|
||||
\"TEXT\",\n \"tokenCount\": 36\n }\n ]\n },\n \"modelVersion\":
|
||||
\"gemini-2.0-flash\",\n \"responseId\": \"wxZzaYaiGYG2_uMPtMjFiAw\"\n}\n"
|
||||
headers:
|
||||
Alt-Svc:
|
||||
- h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
|
||||
Content-Type:
|
||||
- application/json; charset=UTF-8
|
||||
Date:
|
||||
- Fri, 23 Jan 2026 03:13:52 GMT
|
||||
- Fri, 23 Jan 2026 06:35:48 GMT
|
||||
Server:
|
||||
- scaffolding on HTTPServer2
|
||||
Server-Timing:
|
||||
- gfet4t7; dur=631
|
||||
- gfet4t7; dur=675
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
Vary:
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"contents": [{"parts": [{"text": "Summarize what this text says in one
|
||||
sentence."}, {"inlineData": {"data": "UmV2aWV3IEd1aWRlbGluZXMKCjEuIEJlIGNsZWFyIGFuZCBjb25jaXNlOiBXcml0ZSBmZWVkYmFjayB0aGF0IGlzIGVhc3kgdG8gdW5kZXJzdGFuZC4KMi4gRm9jdXMgb24gYmVoYXZpb3IgYW5kIG91dGNvbWVzOiBEZXNjcmliZSB3aGF0IGhhcHBlbmVkIGFuZCB3aHkgaXQgbWF0dGVycy4KMy4gQmUgc3BlY2lmaWM6IFByb3ZpZGUgZXhhbXBsZXMgdG8gc3VwcG9ydCB5b3VyIHBvaW50cy4KNC4gQmFsYW5jZSBwb3NpdGl2ZXMgYW5kIGltcHJvdmVtZW50czogSGlnaGxpZ2h0IHN0cmVuZ3RocyBhbmQgYXJlYXMgdG8gZ3Jvdy4KNS4gQmUgcmVzcGVjdGZ1bCBhbmQgY29uc3RydWN0aXZlOiBBc3N1bWUgcG9zaXRpdmUgaW50ZW50IGFuZCBvZmZlciBzb2x1dGlvbnMuCjYuIFVzZSBvYmplY3RpdmUgY3JpdGVyaWE6IFJlZmVyZW5jZSBnb2FscywgbWV0cmljcywgb3IgZXhwZWN0YXRpb25zIHdoZXJlIHBvc3NpYmxlLgo3LiBTdWdnZXN0IG5leHQgc3RlcHM6IFJlY29tbWVuZCBhY3Rpb25hYmxlIHdheXMgdG8gaW1wcm92ZS4KOC4gUHJvb2ZyZWFkOiBDaGVjayB0b25lLCBncmFtbWFyLCBhbmQgY2xhcml0eSBiZWZvcmUgc3VibWl0dGluZy4K",
|
||||
"mimeType": "text/plain"}}], "role": "user"}], "generationConfig": {}}'
|
||||
headers:
|
||||
User-Agent:
|
||||
- X-USER-AGENT-XXX
|
||||
accept:
|
||||
- '*/*'
|
||||
accept-encoding:
|
||||
- ACCEPT-ENCODING-XXX
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '971'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- generativelanguage.googleapis.com
|
||||
x-goog-api-client:
|
||||
- google-genai-sdk/1.49.0 gl-python/3.12.10
|
||||
x-goog-api-key:
|
||||
- X-GOOG-API-KEY-XXX
|
||||
method: POST
|
||||
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
|
||||
response:
|
||||
body:
|
||||
string: "{\n \"candidates\": [\n {\n \"content\": {\n \"parts\":
|
||||
[\n {\n \"text\": \"Effective review feedback should be
|
||||
clear, specific, balanced, respectful, and constructive, focusing on behaviors
|
||||
and outcomes with examples, objective criteria, and suggested next steps,
|
||||
ensuring it is proofread for clarity.\\n\"\n }\n ],\n \"role\":
|
||||
\"model\"\n },\n \"finishReason\": \"STOP\",\n \"avgLogprobs\":
|
||||
-0.35489303309743\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\":
|
||||
135,\n \"candidatesTokenCount\": 41,\n \"totalTokenCount\": 176,\n \"promptTokensDetails\":
|
||||
[\n {\n \"modality\": \"TEXT\",\n \"tokenCount\": 135\n
|
||||
\ }\n ],\n \"candidatesTokensDetails\": [\n {\n \"modality\":
|
||||
\"TEXT\",\n \"tokenCount\": 41\n }\n ]\n },\n \"modelVersion\":
|
||||
\"gemini-2.0-flash\",\n \"responseId\": \"xBZzaY2tCsa9jrEP7JT1yAo\"\n}\n"
|
||||
headers:
|
||||
Alt-Svc:
|
||||
- h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
|
||||
Content-Type:
|
||||
- application/json; charset=UTF-8
|
||||
Date:
|
||||
- Fri, 23 Jan 2026 06:35:48 GMT
|
||||
Server:
|
||||
- scaffolding on HTTPServer2
|
||||
Server-Timing:
|
||||
- gfet4t7; dur=732
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
Vary:
|
||||
- Origin
|
||||
- X-Origin
|
||||
- Referer
|
||||
X-Content-Type-Options:
|
||||
- X-CONTENT-TYPE-XXX
|
||||
X-Frame-Options:
|
||||
- X-FRAME-OPTIONS-XXX
|
||||
X-XSS-Protection:
|
||||
- '0'
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
File diff suppressed because one or more lines are too long
@@ -18,6 +18,7 @@ from crewai_files import (
|
||||
VideoFile,
|
||||
format_multimodal_content,
|
||||
)
|
||||
from crewai_files.resolution.resolver import FileResolver, FileResolverConfig
|
||||
|
||||
|
||||
# Path to test data files
|
||||
@@ -559,6 +560,153 @@ class TestGenericFileIntegration:
|
||||
|
||||
response = llm.call(messages)
|
||||
|
||||
assert response
|
||||
assert isinstance(response, str)
|
||||
assert len(response) > 0
|
||||
|
||||
|
||||
def _build_multimodal_message_with_upload(
    llm: LLM, prompt: str, files: dict
) -> tuple[list[dict], list[dict]]:
    """Assemble one user message whose attachments are resolved with upload preferred.

    Every value in ``files`` is resolved for the "anthropic" provider using a
    ``FileResolver`` configured with ``prefer_upload=True`` and then formatted
    by ``AnthropicFormatter``. Files for which the formatter yields ``None``
    are left out of the message.

    Note: OpenAI Chat Completions API only supports file_id for PDFs via
    type="file", not for images. For image file_id support, OpenAI requires
    the Responses API (type="input_image"). Since crewAI uses Chat Completions,
    file_id uploads are tested here with Anthropic, which supports file_id
    for all types.

    Returns:
        Tuple of (messages, content_blocks). ``content_blocks`` holds only the
        formatted file blocks so a caller can inspect them to verify that a
        file_id was used.
    """
    from crewai_files.formatting.anthropic import AnthropicFormatter

    resolver = FileResolver(config=FileResolverConfig(prefer_upload=True))
    formatter = AnthropicFormatter()

    def _as_block(item):
        # Resolve first, then format, one file at a time.
        resolved = resolver.resolve(item, "anthropic")
        return formatter.format_block(item, resolved)

    formatted = (_as_block(item) for item in files.values())
    content_blocks = [blk for blk in formatted if blk is not None]

    # The text prompt always comes first, followed by the file blocks.
    user_content = [llm.format_text_content(prompt), *content_blocks]
    messages = [{"role": "user", "content": user_content}]
    return messages, content_blocks
|
||||
|
||||
|
||||
def _build_responses_message_with_upload(
    llm: LLM, prompt: str, files: dict
) -> tuple[list[dict], list[dict]]:
    """Assemble one Responses API user message using file_id uploads.

    Every value in ``files`` is resolved for the "openai" provider using a
    ``FileResolver`` configured with ``prefer_upload=True`` and then passed,
    together with the file's ``content_type``, to
    ``OpenAIResponsesFormatter.format_block``. The prompt is sent as an
    ``input_text`` block placed ahead of the file blocks.

    The Responses API supports file_id for images via type="input_image".

    NOTE(review): ``llm`` is accepted but not referenced by this helper.

    Returns:
        Tuple of (messages, content_blocks). ``content_blocks`` holds only the
        formatted file blocks so a caller can inspect them to verify that a
        file_id was used.
    """
    from crewai_files.formatting import OpenAIResponsesFormatter

    resolver = FileResolver(config=FileResolverConfig(prefer_upload=True))

    def _as_block(item):
        # Resolve first, then format, one file at a time.
        resolved = resolver.resolve(item, "openai")
        return OpenAIResponsesFormatter.format_block(resolved, item.content_type)

    content_blocks = [_as_block(item) for item in files.values()]

    text_block = {"type": "input_text", "text": prompt}
    messages = [{"role": "user", "content": [text_block, *content_blocks]}]
    return messages, content_blocks
|
||||
|
||||
|
||||
class TestAnthropicFileUploadIntegration:
    """Integration tests covering Anthropic multimodal calls that use file_id uploads.

    Anthropic is used for these file_id tests because the OpenAI Chat
    Completions API only supports file_id references for PDFs (type="file"),
    not images. OpenAI's Responses API supports image file_id
    (type="input_image"), but crewAI currently uses Chat Completions.
    Anthropic supports file_id for all content types including images.
    """

    @pytest.mark.vcr()
    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
        """Check that Anthropic describes an image that was uploaded via the Files API."""
        prompt = "Describe this image in one sentence. Be brief."
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages, content_blocks = _build_multimodal_message_with_upload(
            llm=llm, prompt=prompt, files=files
        )

        # Exactly one block is expected, and it must reference an uploaded
        # file rather than carry inline base64 data.
        assert len(content_blocks) == 1
        source = content_blocks[0].get("source", {})
        assert source.get("type") == "file", (
            f"Expected source type 'file' for file_id upload, got '{source.get('type')}'. "
            "This test verifies file_id uploads work - if falling back to base64, "
            "check that the Anthropic Files API uploader is working correctly."
        )
        assert "file_id" in source, "Expected file_id in source for file_id upload"

        answer = llm.call(messages)

        # The model must return a non-empty string.
        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
|
||||
|
||||
class TestOpenAIResponsesFileUploadIntegration:
    """Integration tests covering OpenAI Responses API calls that use file_id uploads.

    Unlike Chat Completions, which only supports file_id for PDFs, the
    Responses API supports file_id for images via type="input_image".
    """

    @pytest.mark.vcr()
    def test_describe_image_with_file_id(self, test_image_bytes: bytes) -> None:
        """Check that the Responses API describes an image that was uploaded via the Files API."""
        prompt = "Describe this image in one sentence. Be brief."
        llm = LLM(model="openai/gpt-4o-mini", api="responses")
        files = {"image": ImageFile(source=test_image_bytes)}

        messages, content_blocks = _build_responses_message_with_upload(
            llm=llm, prompt=prompt, files=files
        )

        # Exactly one block is expected: an input_image that carries a file_id.
        assert len(content_blocks) == 1
        block = content_blocks[0]
        assert block.get("type") == "input_image", (
            f"Expected type 'input_image' for Responses API, got '{block.get('type')}'. "
            "This test verifies file_id uploads work with the Responses API."
        )
        assert "file_id" in block, "Expected file_id in block for file_id upload"

        answer = llm.call(messages)

        # The model must return a non-empty string.
        assert answer
        assert isinstance(answer, str)
        assert len(answer) > 0
|
||||
Reference in New Issue
Block a user