Files
crewAI/lib/crewai-files/src/crewai_files/uploaders/openai.py
Greyson LaLonde c4c9208229 feat: native multimodal file handling; openai responses api
- add input_files parameter to Crew.kickoff(), Flow.kickoff(), Task, and Agent.kickoff()
- add provider-specific file uploaders for OpenAI, Anthropic, Gemini, and Bedrock
- add file type detection, constraint validation, and automatic format conversion
- add URL file source support for multimodal content
- add streaming uploads for large files
- add prompt caching support for Anthropic
- add OpenAI Responses API support
2026-01-23 15:13:25 -05:00

696 lines
22 KiB
Python

"""OpenAI Files API uploader implementation."""
from __future__ import annotations
from collections.abc import AsyncIterator, Iterator
import io
import logging
import os
from typing import Any
from crewai_files.core.constants import DEFAULT_UPLOAD_CHUNK_SIZE, FILES_API_MAX_SIZE
from crewai_files.core.sources import FileBytes, FilePath, FileStream, generate_filename
from crewai_files.core.types import FileInput
from crewai_files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
classify_upload_error,
)
from crewai_files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
def _get_purpose_for_content_type(content_type: str, purpose: str | None) -> str:
    """Pick the OpenAI ``purpose`` value to send with an upload.

    An explicit ``purpose`` always wins (it is returned unchanged, even when it
    is an empty string). Otherwise the purpose is derived from the MIME type:
    anything under ``image/`` is uploaded as ``"vision"`` and everything else
    as ``"user_data"``.

    Args:
        content_type: MIME type of the file. Only consulted when ``purpose``
            is None.
        purpose: Optional explicit purpose override.

    Returns:
        The purpose string to use for the upload.
    """
    if purpose is None:
        is_image = content_type.startswith("image/")
        return "vision" if is_image else "user_data"
    return purpose
def _get_file_size(file: FileInput) -> int | None:
    """Report a file's size when that is possible without loading its content.

    Path-backed sources are sized with a ``stat()`` call and in-memory byte
    sources by the length of their buffer. Every other source kind yields
    None, signalling that the caller has to read the content to learn its size.

    Args:
        file: The file to get the size for.

    Returns:
        File size in bytes, or None if it cannot be determined without reading.
    """
    backing = file._file_source
    size: int | None = None
    if isinstance(backing, FilePath):
        size = backing.path.stat().st_size
    elif isinstance(backing, FileBytes):
        size = len(backing.data)
    return size
def _iter_file_chunks(file: FileInput, chunk_size: int) -> Iterator[bytes]:
    """Yield a file's content as consecutive byte chunks.

    Path, bytes and stream sources are delegated to their own
    ``read_chunks``. Any other source is read in full via ``file.read()`` and
    then sliced into pieces of ``chunk_size`` bytes.

    Args:
        file: The file to read.
        chunk_size: Size of each chunk in bytes.

    Yields:
        Chunks of file content, in order.
    """
    backing = file._file_source
    if not isinstance(backing, (FilePath, FileBytes, FileStream)):
        # Unknown source kind: no chunked reader available, so slice in memory.
        payload = file.read()
        total = len(payload)
        for start in range(0, total, chunk_size):
            yield payload[start : start + chunk_size]
        return
    yield from backing.read_chunks(chunk_size)
async def _aiter_file_chunks(
    file: FileInput, chunk_size: int, content: bytes | None = None
) -> AsyncIterator[bytes]:
    """Asynchronously yield a file's content as consecutive byte chunks.

    When ``content`` is supplied it is sliced directly and ``file`` is never
    touched. Otherwise path sources are read through their async
    ``aread_chunks``, bytes and stream sources through their synchronous
    ``read_chunks``, and any other source is loaded with ``file.aread()`` and
    sliced in memory.

    Args:
        file: The file to read.
        chunk_size: Size of each chunk in bytes.
        content: Optional pre-loaded content to chunk.

    Yields:
        Chunks of file content, in order.
    """
    if content is None:
        backing = file._file_source
        if isinstance(backing, FilePath):
            async for piece in backing.aread_chunks(chunk_size):
                yield piece
            return
        if isinstance(backing, (FileBytes, FileStream)):
            for piece in backing.read_chunks(chunk_size):
                yield piece
            return
        # No chunked reader for this source kind: load it, then slice below.
        content = await file.aread()

    for start in range(0, len(content), chunk_size):
        yield content[start : start + chunk_size]
class OpenAIFileUploader(FileUploader):
    """Uploader for the OpenAI Files and Uploads APIs.

    Files no larger than ``FILES_API_MAX_SIZE`` are sent in a single Files API
    request. Larger files go through the Uploads API, which receives the
    payload as a sequence of parts of at most ``chunk_size`` bytes each.

    A synchronous surface (``upload``, ``delete``, ``get_file_info``,
    ``list_files``) and an asynchronous one (``aupload``, ``adelete``) are
    provided. Each lazily creates its own OpenAI client on first use unless a
    client was injected through the constructor.
    """

    def __init__(
        self,
        api_key: str | None = None,
        chunk_size: int = DEFAULT_UPLOAD_CHUNK_SIZE,
        client: Any = None,
        async_client: Any = None,
    ) -> None:
        """Initialize the OpenAI uploader.

        Args:
            api_key: Optional OpenAI API key. If not provided, uses the
                OPENAI_API_KEY environment variable.
            chunk_size: Part size in bytes for multipart uploads. Defaults to
                ``DEFAULT_UPLOAD_CHUNK_SIZE``.
            client: Optional pre-instantiated OpenAI client.
            async_client: Optional pre-instantiated async OpenAI client.

        Raises:
            ValueError: If ``chunk_size`` is not positive. A zero or negative
                part size can never make progress through a payload.
        """
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive number of bytes, got {chunk_size}"
            )
        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self._chunk_size = chunk_size
        self._client: Any = client
        self._async_client: Any = async_client

    @property
    def provider_name(self) -> str:
        """Return the provider name."""
        return "openai"

    def _build_upload_result(self, file_id: str, content_type: str) -> UploadResult:
        """Build an UploadResult for a completed upload.

        Args:
            file_id: The uploaded file ID.
            content_type: The file's content type.

        Returns:
            UploadResult carrying the file ID, content type and provider name;
            ``file_uri`` and ``expires_at`` are always None for this provider.
        """
        return UploadResult(
            file_id=file_id,
            file_uri=None,
            content_type=content_type,
            expires_at=None,
            provider=self.provider_name,
        )

    @staticmethod
    def _file_info_to_dict(file_info: Any) -> dict[str, Any]:
        """Convert an OpenAI file object into a plain metadata dictionary."""
        return {
            "id": file_info.id,
            "filename": file_info.filename,
            "purpose": file_info.purpose,
            "bytes": file_info.bytes,
            "created_at": file_info.created_at,
            "status": file_info.status,
        }

    @staticmethod
    def _resolve_file_id(completed: Any, upload: Any) -> str:
        """Return the file ID of a completed Upload.

        NOTE(review): when the completed Upload carries no file object this
        falls back to the Upload's own ID; confirm that callers can accept an
        upload ID in place of a file ID.
        """
        return completed.file.id if completed.file else upload.id

    def _cancel_failed_upload(self, client: Any, upload_id: str) -> None:
        """Best-effort cancel of a failed Upload; never raises."""
        logger.warning(f"Multipart upload failed, cancelling upload {upload_id}")
        try:
            client.uploads.cancel(upload_id=upload_id)
        except Exception as cancel_err:
            # Cancellation is cleanup only; the original failure is what the
            # caller needs to see, so this error is logged and dropped.
            logger.debug(f"Failed to cancel upload: {cancel_err}")

    async def _acancel_failed_upload(self, client: Any, upload_id: str) -> None:
        """Async best-effort cancel of a failed Upload; never raises."""
        logger.warning(f"Multipart upload failed, cancelling upload {upload_id}")
        try:
            await client.uploads.cancel(upload_id=upload_id)
        except Exception as cancel_err:
            # Same reasoning as the sync variant: keep the original failure.
            logger.debug(f"Failed to cancel upload: {cancel_err}")

    def _get_client(self) -> Any:
        """Get or create the synchronous OpenAI client.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        if self._client is None:
            # Only the import is guarded, so that an error raised while
            # constructing the client is not mislabelled as a missing package.
            try:
                from openai import OpenAI
            except ImportError as e:
                raise ImportError(
                    "openai is required for OpenAI file uploads. "
                    "Install with: pip install openai"
                ) from e
            self._client = OpenAI(api_key=self._api_key)
        return self._client

    def _get_async_client(self) -> Any:
        """Get or create the asynchronous OpenAI client.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        if self._async_client is None:
            # Only the import is guarded; see _get_client.
            try:
                from openai import AsyncOpenAI
            except ImportError as e:
                raise ImportError(
                    "openai is required for OpenAI file uploads. "
                    "Install with: pip install openai"
                ) from e
            self._async_client = AsyncOpenAI(api_key=self._api_key)
        return self._async_client

    def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
        """Upload a file to OpenAI.

        Files whose size is known up front and exceeds ``FILES_API_MAX_SIZE``
        are streamed to the Uploads API part by part. Everything else is read
        into memory first and then routed to the Uploads API or the Files API
        depending on the size of the content.

        Args:
            file: The file to upload.
            purpose: Optional purpose for the file. When omitted it is derived
                from the content type: "vision" for images, "user_data"
                otherwise.

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            ImportError: If the ``openai`` package is not installed.
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            file_size = _get_file_size(file)
            if file_size is not None and file_size > FILES_API_MAX_SIZE:
                return self._upload_multipart_streaming(file, file_size, purpose)

            # Size unknown or small enough: the content has to be materialised.
            content = file.read()
            if len(content) > FILES_API_MAX_SIZE:
                return self._upload_multipart(file, content, purpose)
            return self._upload_simple(file, content, purpose)
        except (ImportError, TransientUploadError, PermanentUploadError):
            # Already meaningful to callers; do not re-classify.
            raise
        except Exception as e:
            raise classify_upload_error(e, file.filename) from e

    def _upload_simple(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Files API (single request).

        Args:
            file: The file to upload.
            content: File content bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)

        # The buffer's ``name`` attribute is how the filename is handed over.
        file_data = io.BytesIO(content)
        file_data.name = filename

        logger.info(
            f"Uploading file '{filename}' to OpenAI Files API ({len(content)} bytes)"
        )
        uploaded_file = client.files.create(
            file=file_data,
            purpose=file_purpose,
        )
        logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")
        return self._build_upload_result(uploaded_file.id, file.content_type)

    def _upload_multipart(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Uploads API with content already in memory.

        Args:
            file: The file to upload.
            content: File content bytes (already loaded).
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            Exception: Any error from uploading a part or completing the
                Upload is re-raised after a best-effort cancel of the Upload.
        """
        client = self._get_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)
        file_size = len(content)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )
        upload = client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        try:
            # range() guarantees forward progress and fails loudly on a zero
            # step instead of looping forever.
            offsets = range(0, file_size, self._chunk_size)
            for part_num, offset in enumerate(offsets, start=1):
                chunk = content[offset : offset + self._chunk_size]
                logger.debug(
                    f"Uploading part {part_num} ({len(chunk)} bytes, offset {offset})"
                )
                part = client.uploads.parts.create(
                    upload_id=upload.id,
                    data=io.BytesIO(chunk),
                )
                part_ids.append(part.id)

            completed = client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )
            file_id = self._resolve_file_id(completed, upload)
            logger.info(f"Completed multipart upload to OpenAI: {file_id}")
            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            self._cancel_failed_upload(client, upload.id)
            raise

    def _upload_multipart_streaming(
        self,
        file: FileInput,
        file_size: int,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Uploads API with streaming chunks.

        Chunks are pulled from the file source one at a time, so the whole
        file is not held in memory by this method.

        Args:
            file: The file to upload.
            file_size: Total file size in bytes, declared to the API up front.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            Exception: Any error from reading a chunk, uploading a part or
                completing the Upload is re-raised after a best-effort cancel.
        """
        client = self._get_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API (streaming) "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )
        upload = client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        try:
            chunks = _iter_file_chunks(file, self._chunk_size)
            for part_num, chunk in enumerate(chunks, start=1):
                logger.debug(f"Uploading part {part_num} ({len(chunk)} bytes)")
                part = client.uploads.parts.create(
                    upload_id=upload.id,
                    data=io.BytesIO(chunk),
                )
                part_ids.append(part.id)

            completed = client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )
            file_id = self._resolve_file_id(completed, upload)
            logger.info(f"Completed streaming multipart upload to OpenAI: {file_id}")
            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            self._cancel_failed_upload(client, upload.id)
            raise

    def delete(self, file_id: str) -> bool:
        """Delete an uploaded file from OpenAI.

        Any failure (including a missing ``openai`` package) is logged as a
        warning and reported through the return value rather than raised.

        Args:
            file_id: The file ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_client()
            client.files.delete(file_id)
            logger.info(f"Deleted OpenAI file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
            return False

    def get_file_info(self, file_id: str) -> dict[str, Any] | None:
        """Get information about an uploaded file.

        Args:
            file_id: The file ID.

        Returns:
            Dictionary with the keys ``id``, ``filename``, ``purpose``,
            ``bytes``, ``created_at`` and ``status``, or None if the lookup
            failed for any reason (the failure is logged at debug level).
        """
        try:
            client = self._get_client()
            return self._file_info_to_dict(client.files.retrieve(file_id))
        except Exception as e:
            logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}")
            return None

    def list_files(self) -> list[dict[str, Any]]:
        """List uploaded files.

        Only the entries in the ``data`` attribute of the list response are
        returned.

        Returns:
            List of dictionaries with file information (same keys as
            ``get_file_info``); an empty list if the request failed.
        """
        try:
            client = self._get_client()
            files = client.files.list()
            return [self._file_info_to_dict(f) for f in files.data]
        except Exception as e:
            logger.warning(f"Failed to list OpenAI files: {e}")
            return []

    async def aupload(
        self, file: FileInput, purpose: str | None = None
    ) -> UploadResult:
        """Async upload a file to OpenAI using the native async client.

        Routing mirrors ``upload``: files known to exceed
        ``FILES_API_MAX_SIZE`` are streamed to the Uploads API; everything
        else is read into memory and sent to the Uploads or Files API
        depending on the size of the content.

        Args:
            file: The file to upload.
            purpose: Optional purpose for the file. When omitted it is derived
                from the content type: "vision" for images, "user_data"
                otherwise.

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            ImportError: If the ``openai`` package is not installed.
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            file_size = _get_file_size(file)
            if file_size is not None and file_size > FILES_API_MAX_SIZE:
                return await self._aupload_multipart_streaming(file, file_size, purpose)

            # Size unknown or small enough: the content has to be materialised.
            content = await file.aread()
            if len(content) > FILES_API_MAX_SIZE:
                return await self._aupload_multipart(file, content, purpose)
            return await self._aupload_simple(file, content, purpose)
        except (ImportError, TransientUploadError, PermanentUploadError):
            # Already meaningful to callers; do not re-classify.
            raise
        except Exception as e:
            raise classify_upload_error(e, file.filename) from e

    async def _aupload_simple(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Files API (single request).

        Args:
            file: The file to upload.
            content: File content bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_async_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)

        file_data = io.BytesIO(content)
        file_data.name = filename

        # Log the resolved name (what is actually sent), not file.filename,
        # which may be None for unnamed inputs.
        logger.info(
            f"Uploading file '{filename}' to OpenAI Files API ({len(content)} bytes)"
        )
        uploaded_file = await client.files.create(
            file=file_data,
            purpose=file_purpose,
        )
        logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")
        return self._build_upload_result(uploaded_file.id, file.content_type)

    async def _aupload_multipart(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Uploads API with content already in memory.

        Args:
            file: The file to upload.
            content: File content bytes (already loaded).
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            Exception: Any error from uploading a part or completing the
                Upload is re-raised after a best-effort cancel of the Upload.
        """
        client = self._get_async_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)
        file_size = len(content)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )
        upload = await client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        try:
            # range() guarantees forward progress and fails loudly on a zero
            # step instead of looping forever.
            offsets = range(0, file_size, self._chunk_size)
            for part_num, offset in enumerate(offsets, start=1):
                chunk = content[offset : offset + self._chunk_size]
                logger.debug(
                    f"Uploading part {part_num} ({len(chunk)} bytes, offset {offset})"
                )
                part = await client.uploads.parts.create(
                    upload_id=upload.id,
                    data=io.BytesIO(chunk),
                )
                part_ids.append(part.id)

            completed = await client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )
            file_id = self._resolve_file_id(completed, upload)
            logger.info(f"Completed multipart upload to OpenAI: {file_id}")
            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            await self._acancel_failed_upload(client, upload.id)
            raise

    async def _aupload_multipart_streaming(
        self,
        file: FileInput,
        file_size: int,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Uploads API with streaming chunks.

        Chunks are pulled from the file source one at a time, so the whole
        file is not held in memory by this method.

        Args:
            file: The file to upload.
            file_size: Total file size in bytes, declared to the API up front.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            Exception: Any error from reading a chunk, uploading a part or
                completing the Upload is re-raised after a best-effort cancel.
        """
        client = self._get_async_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API (streaming) "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )
        upload = await client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        part_num = 1
        try:
            async for chunk in _aiter_file_chunks(file, self._chunk_size):
                logger.debug(f"Uploading part {part_num} ({len(chunk)} bytes)")
                part = await client.uploads.parts.create(
                    upload_id=upload.id,
                    data=io.BytesIO(chunk),
                )
                part_ids.append(part.id)
                part_num += 1

            completed = await client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )
            file_id = self._resolve_file_id(completed, upload)
            logger.info(f"Completed streaming multipart upload to OpenAI: {file_id}")
            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            await self._acancel_failed_upload(client, upload.id)
            raise

    async def adelete(self, file_id: str) -> bool:
        """Async delete an uploaded file from OpenAI.

        Any failure (including a missing ``openai`` package) is logged as a
        warning and reported through the return value rather than raised.

        Args:
            file_id: The file ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_async_client()
            await client.files.delete(file_id)
            logger.info(f"Deleted OpenAI file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
            return False