mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-24 07:38:14 +00:00
- add input_files parameter to Crew.kickoff(), Flow.kickoff(), Task, and Agent.kickoff() - add provider-specific file uploaders for OpenAI, Anthropic, Gemini, and Bedrock - add file type detection, constraint validation, and automatic format conversion - add URL file source support for multimodal content - add streaming uploads for large files - add prompt caching support for Anthropic - add OpenAI Responses API support
696 lines
22 KiB
Python
696 lines
22 KiB
Python
"""OpenAI Files API uploader implementation."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections.abc import AsyncIterator, Iterator
|
|
import io
|
|
import logging
|
|
import os
|
|
from typing import Any
|
|
|
|
from crewai_files.core.constants import DEFAULT_UPLOAD_CHUNK_SIZE, FILES_API_MAX_SIZE
|
|
from crewai_files.core.sources import FileBytes, FilePath, FileStream, generate_filename
|
|
from crewai_files.core.types import FileInput
|
|
from crewai_files.processing.exceptions import (
|
|
PermanentUploadError,
|
|
TransientUploadError,
|
|
classify_upload_error,
|
|
)
|
|
from crewai_files.uploaders.base import FileUploader, UploadResult
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _get_purpose_for_content_type(content_type: str, purpose: str | None) -> str:
|
|
"""Get the appropriate purpose for a file based on content type.
|
|
|
|
OpenAI Files API requires different purposes for different file types:
|
|
- Images (for Responses API vision): "vision"
|
|
- PDFs and other documents: "user_data"
|
|
|
|
Args:
|
|
content_type: MIME type of the file.
|
|
purpose: Optional explicit purpose override.
|
|
|
|
Returns:
|
|
The purpose string to use for upload.
|
|
"""
|
|
if purpose is not None:
|
|
return purpose
|
|
if content_type.startswith("image/"):
|
|
return "vision"
|
|
return "user_data"
|
|
|
|
|
|
def _get_file_size(file: FileInput) -> int | None:
    """Determine the file size, if possible, without reading the content.

    Only path-backed and bytes-backed sources expose a cheap size; streams
    and other sources would have to be consumed to measure them.

    Args:
        file: The file to get size for.

    Returns:
        File size in bytes, or None if size cannot be determined without reading.
    """
    src = file._file_source
    if isinstance(src, FileBytes):
        return len(src.data)
    if isinstance(src, FilePath):
        # stat() gives the on-disk size without opening the file.
        return src.path.stat().st_size
    return None
|
|
|
|
|
|
def _iter_file_chunks(file: FileInput, chunk_size: int) -> Iterator[bytes]:
    """Yield the file's content as a sequence of fixed-size chunks.

    Chunk-capable sources stream directly; any other source is read fully
    into memory and then sliced.

    Args:
        file: The file to read.
        chunk_size: Size of each chunk in bytes.

    Yields:
        Chunks of file content (the final chunk may be shorter).
    """
    src = file._file_source
    if isinstance(src, (FilePath, FileBytes, FileStream)):
        yield from src.read_chunks(chunk_size)
        return
    data = file.read()
    yield from (data[pos : pos + chunk_size] for pos in range(0, len(data), chunk_size))
|
|
|
|
|
|
async def _aiter_file_chunks(
|
|
file: FileInput, chunk_size: int, content: bytes | None = None
|
|
) -> AsyncIterator[bytes]:
|
|
"""Async iterate over file content in chunks.
|
|
|
|
Args:
|
|
file: The file to read.
|
|
chunk_size: Size of each chunk in bytes.
|
|
content: Optional pre-loaded content to chunk.
|
|
|
|
Yields:
|
|
Chunks of file content.
|
|
"""
|
|
if content is not None:
|
|
for i in range(0, len(content), chunk_size):
|
|
yield content[i : i + chunk_size]
|
|
return
|
|
|
|
source = file._file_source
|
|
if isinstance(source, FilePath):
|
|
async for chunk in source.aread_chunks(chunk_size):
|
|
yield chunk
|
|
elif isinstance(source, (FileBytes, FileStream)):
|
|
for chunk in source.read_chunks(chunk_size):
|
|
yield chunk
|
|
else:
|
|
data = await file.aread()
|
|
for i in range(0, len(data), chunk_size):
|
|
yield data[i : i + chunk_size]
|
|
|
|
|
|
class OpenAIFileUploader(FileUploader):
    """Uploader for OpenAI Files and Uploads APIs.

    Uses the Files API for files up to 512MB (single request).
    Uses the Uploads API for files larger than 512MB (multipart chunked).
    """

    def __init__(
        self,
        api_key: str | None = None,
        chunk_size: int = DEFAULT_UPLOAD_CHUNK_SIZE,
        client: Any = None,
        async_client: Any = None,
    ) -> None:
        """Initialize the OpenAI uploader.

        Args:
            api_key: Optional OpenAI API key. If not provided, uses the
                OPENAI_API_KEY environment variable.
            chunk_size: Chunk size in bytes for multipart uploads (default 64MB).
            client: Optional pre-instantiated OpenAI client.
            async_client: Optional pre-instantiated async OpenAI client.
        """
        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self._chunk_size = chunk_size
        self._client: Any = client
        self._async_client: Any = async_client

    @property
    def provider_name(self) -> str:
        """Return the provider name."""
        return "openai"

    def _build_upload_result(self, file_id: str, content_type: str) -> UploadResult:
        """Build an UploadResult for a completed upload.

        Args:
            file_id: The uploaded file ID.
            content_type: The file's content type.

        Returns:
            UploadResult with the file metadata.
        """
        # OpenAI files are referenced by ID only and do not expire,
        # so file_uri and expires_at are always None.
        return UploadResult(
            file_id=file_id,
            file_uri=None,
            content_type=content_type,
            expires_at=None,
            provider=self.provider_name,
        )

    def _get_client(self) -> Any:
        """Get or create the OpenAI client (lazy import of the SDK)."""
        if self._client is None:
            try:
                from openai import OpenAI

                self._client = OpenAI(api_key=self._api_key)
            except ImportError as e:
                raise ImportError(
                    "openai is required for OpenAI file uploads. "
                    "Install with: pip install openai"
                ) from e
        return self._client

    def _get_async_client(self) -> Any:
        """Get or create the async OpenAI client (lazy import of the SDK)."""
        if self._async_client is None:
            try:
                from openai import AsyncOpenAI

                self._async_client = AsyncOpenAI(api_key=self._api_key)
            except ImportError as e:
                raise ImportError(
                    "openai is required for OpenAI file uploads. "
                    "Install with: pip install openai"
                ) from e
        return self._async_client

    def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
        """Upload a file to OpenAI.

        Uses Files API for files <= 512MB, Uploads API for larger files.
        For large files, streams chunks to avoid loading entire file in memory.

        Args:
            file: The file to upload.
            purpose: Optional purpose for the file (default: "user_data").

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            file_size = _get_file_size(file)

            # When the size is known up front and too large for the Files API,
            # stream chunks without ever loading the whole file into memory.
            if file_size is not None and file_size > FILES_API_MAX_SIZE:
                return self._upload_multipart_streaming(file, file_size, purpose)

            content = file.read()
            if len(content) > FILES_API_MAX_SIZE:
                return self._upload_multipart(file, content, purpose)
            return self._upload_simple(file, content, purpose)
        except ImportError:
            # Missing SDK is a programmer/environment error, not an upload error.
            raise
        except (TransientUploadError, PermanentUploadError):
            raise
        except Exception as e:
            raise classify_upload_error(e, file.filename) from e

    def _upload_simple(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Files API (single request, up to 512MB).

        Args:
            file: The file to upload.
            content: File content bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)

        file_data = io.BytesIO(content)
        # The OpenAI SDK reads the filename from the file object's .name.
        file_data.name = filename

        logger.info(
            f"Uploading file '{filename}' to OpenAI Files API ({len(content)} bytes)"
        )

        uploaded_file = client.files.create(
            file=file_data,
            purpose=file_purpose,
        )

        logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")

        return self._build_upload_result(uploaded_file.id, file.content_type)

    def _upload_multipart(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Uploads API with content already in memory.

        Args:
            file: The file to upload.
            content: File content bytes (already loaded).
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)
        file_size = len(content)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )

        upload = client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        offset = 0
        part_num = 1

        try:
            while offset < file_size:
                chunk = content[offset : offset + self._chunk_size]
                chunk_io = io.BytesIO(chunk)

                logger.debug(
                    f"Uploading part {part_num} ({len(chunk)} bytes, offset {offset})"
                )

                part = client.uploads.parts.create(
                    upload_id=upload.id,
                    data=chunk_io,
                )
                part_ids.append(part.id)

                offset += self._chunk_size
                part_num += 1

            completed = client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )

            file_id = completed.file.id if completed.file else upload.id
            logger.info(f"Completed multipart upload to OpenAI: {file_id}")

            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            # Best-effort cancel so the partial upload doesn't linger server-side.
            logger.warning(f"Multipart upload failed, cancelling upload {upload.id}")
            try:
                client.uploads.cancel(upload_id=upload.id)
            except Exception as cancel_err:
                logger.debug(f"Failed to cancel upload: {cancel_err}")
            raise

    def _upload_multipart_streaming(
        self,
        file: FileInput,
        file_size: int,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Uploads API with streaming chunks.

        Streams chunks directly from the file source without loading
        the entire file into memory. Used for large files.

        Args:
            file: The file to upload.
            file_size: Total file size in bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API (streaming) "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )

        upload = client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        part_num = 1

        try:
            for chunk in _iter_file_chunks(file, self._chunk_size):
                chunk_io = io.BytesIO(chunk)

                logger.debug(f"Uploading part {part_num} ({len(chunk)} bytes)")

                part = client.uploads.parts.create(
                    upload_id=upload.id,
                    data=chunk_io,
                )
                part_ids.append(part.id)
                part_num += 1

            completed = client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )

            file_id = completed.file.id if completed.file else upload.id
            logger.info(f"Completed streaming multipart upload to OpenAI: {file_id}")

            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            # Best-effort cancel so the partial upload doesn't linger server-side.
            logger.warning(f"Multipart upload failed, cancelling upload {upload.id}")
            try:
                client.uploads.cancel(upload_id=upload.id)
            except Exception as cancel_err:
                logger.debug(f"Failed to cancel upload: {cancel_err}")
            raise

    def delete(self, file_id: str) -> bool:
        """Delete an uploaded file from OpenAI.

        Args:
            file_id: The file ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_client()
            client.files.delete(file_id)
            logger.info(f"Deleted OpenAI file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
            return False

    def get_file_info(self, file_id: str) -> dict[str, Any] | None:
        """Get information about an uploaded file.

        Args:
            file_id: The file ID.

        Returns:
            Dictionary with file information, or None if not found.
        """
        try:
            client = self._get_client()
            file_info = client.files.retrieve(file_id)
            return {
                "id": file_info.id,
                "filename": file_info.filename,
                "purpose": file_info.purpose,
                "bytes": file_info.bytes,
                "created_at": file_info.created_at,
                "status": file_info.status,
            }
        except Exception as e:
            logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}")
            return None

    def list_files(self) -> list[dict[str, Any]]:
        """List all uploaded files.

        Returns:
            List of dictionaries with file information.
        """
        try:
            client = self._get_client()
            files = client.files.list()
            return [
                {
                    "id": f.id,
                    "filename": f.filename,
                    "purpose": f.purpose,
                    "bytes": f.bytes,
                    "created_at": f.created_at,
                    "status": f.status,
                }
                for f in files.data
            ]
        except Exception as e:
            logger.warning(f"Failed to list OpenAI files: {e}")
            return []

    async def aupload(
        self, file: FileInput, purpose: str | None = None
    ) -> UploadResult:
        """Async upload a file to OpenAI using native async client.

        Uses Files API for files <= 512MB, Uploads API for larger files.
        For large files, streams chunks to avoid loading entire file in memory.

        Args:
            file: The file to upload.
            purpose: Optional purpose for the file (default: "user_data").

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            file_size = _get_file_size(file)

            # When the size is known up front and too large for the Files API,
            # stream chunks without ever loading the whole file into memory.
            if file_size is not None and file_size > FILES_API_MAX_SIZE:
                return await self._aupload_multipart_streaming(file, file_size, purpose)

            content = await file.aread()
            if len(content) > FILES_API_MAX_SIZE:
                return await self._aupload_multipart(file, content, purpose)
            return await self._aupload_simple(file, content, purpose)
        except ImportError:
            # Missing SDK is a programmer/environment error, not an upload error.
            raise
        except (TransientUploadError, PermanentUploadError):
            raise
        except Exception as e:
            raise classify_upload_error(e, file.filename) from e

    async def _aupload_simple(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Files API (single request, up to 512MB).

        Args:
            file: The file to upload.
            content: File content bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_async_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)

        file_data = io.BytesIO(content)
        # The OpenAI SDK reads the filename from the file object's .name.
        file_data.name = filename

        logger.info(
            f"Uploading file '{filename}' to OpenAI Files API ({len(content)} bytes)"
        )

        uploaded_file = await client.files.create(
            file=file_data,
            purpose=file_purpose,
        )

        logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")

        return self._build_upload_result(uploaded_file.id, file.content_type)

    async def _aupload_multipart(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Uploads API (multipart chunked, up to 8GB).

        Args:
            file: The file to upload.
            content: File content bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_async_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)
        file_size = len(content)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )

        upload = await client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        offset = 0
        part_num = 1

        try:
            while offset < file_size:
                chunk = content[offset : offset + self._chunk_size]
                chunk_io = io.BytesIO(chunk)

                logger.debug(
                    f"Uploading part {part_num} ({len(chunk)} bytes, offset {offset})"
                )

                part = await client.uploads.parts.create(
                    upload_id=upload.id,
                    data=chunk_io,
                )
                part_ids.append(part.id)

                offset += self._chunk_size
                part_num += 1

            completed = await client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )

            file_id = completed.file.id if completed.file else upload.id
            logger.info(f"Completed multipart upload to OpenAI: {file_id}")

            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            # Best-effort cancel so the partial upload doesn't linger server-side.
            logger.warning(f"Multipart upload failed, cancelling upload {upload.id}")
            try:
                await client.uploads.cancel(upload_id=upload.id)
            except Exception as cancel_err:
                logger.debug(f"Failed to cancel upload: {cancel_err}")
            raise

    async def _aupload_multipart_streaming(
        self,
        file: FileInput,
        file_size: int,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Uploads API with streaming chunks.

        Streams chunks directly from the file source without loading
        the entire file into memory. Used for large files.

        Args:
            file: The file to upload.
            file_size: Total file size in bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_async_client()
        file_purpose = _get_purpose_for_content_type(file.content_type, purpose)
        filename = file.filename or generate_filename(file.content_type)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API (streaming) "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )

        upload = await client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        part_num = 1

        try:
            async for chunk in _aiter_file_chunks(file, self._chunk_size):
                chunk_io = io.BytesIO(chunk)

                logger.debug(f"Uploading part {part_num} ({len(chunk)} bytes)")

                part = await client.uploads.parts.create(
                    upload_id=upload.id,
                    data=chunk_io,
                )
                part_ids.append(part.id)
                part_num += 1

            completed = await client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )

            file_id = completed.file.id if completed.file else upload.id
            logger.info(f"Completed streaming multipart upload to OpenAI: {file_id}")

            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            # Best-effort cancel so the partial upload doesn't linger server-side.
            logger.warning(f"Multipart upload failed, cancelling upload {upload.id}")
            try:
                await client.uploads.cancel(upload_id=upload.id)
            except Exception as cancel_err:
                logger.debug(f"Failed to cancel upload: {cancel_err}")
            raise

    async def adelete(self, file_id: str) -> bool:
        """Async delete an uploaded file from OpenAI.

        Args:
            file_id: The file ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_async_client()
            await client.files.delete(file_id)
            logger.info(f"Deleted OpenAI file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
            return False
|