feat: promote files to first-class crewai.files package

Greyson LaLonde
2026-01-22 01:39:04 -05:00
parent 204a1cece7
commit 5550c6df7e
63 changed files with 4847 additions and 1546 deletions

View File

@@ -103,6 +103,7 @@ file-processing = [
"pypdf~=4.0.0",
"python-magic>=0.4.27",
"aiocache~=0.12.3",
"aiofiles~=24.1.0",
]

View File

@@ -6,6 +6,14 @@ import warnings
from crewai.agent.core import Agent
from crewai.crew import Crew
from crewai.crews.crew_output import CrewOutput
from crewai.files import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.flow.flow import Flow
from crewai.knowledge.knowledge import Knowledge
from crewai.llm import LLM
@@ -15,14 +23,6 @@ from crewai.task import Task
from crewai.tasks.llm_guardrail import LLMGuardrail
from crewai.tasks.task_output import TaskOutput
from crewai.telemetry.telemetry import Telemetry
from crewai.utilities.files import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
def _suppress_pydantic_deprecation_warnings() -> None:
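For downstream code, the change above moves the public import path for these file types from crewai.utilities.files to crewai.files, e.g.:

from crewai.files import AudioFile, File, ImageFile, PDFFile, TextFile, VideoFile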

View File

@@ -24,6 +24,7 @@ from crewai.events.types.logging_events import (
AgentLogsExecutionEvent,
AgentLogsStartedEvent,
)
from crewai.files import FileProcessor
from crewai.hooks.llm_hooks import (
get_after_llm_call_hooks,
get_before_llm_call_hooks,
@@ -44,7 +45,6 @@ from crewai.utilities.agent_utils import (
)
from crewai.utilities.constants import TRAINING_DATA_FILE
from crewai.utilities.file_store import get_all_files
from crewai.utilities.files import FileProcessor
from crewai.utilities.i18n import I18N, get_i18n
from crewai.utilities.printer import Printer
from crewai.utilities.tool_utils import (
@@ -238,7 +238,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
processor = FileProcessor(constraints=provider)
files = processor.process_files(files)
from crewai.utilities.files import get_upload_cache
from crewai.files import get_upload_cache
upload_cache = get_upload_cache()
content_blocks = self.llm.format_multimodal_content(
@@ -258,6 +258,48 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
]
break
async def _ainject_multimodal_files(self) -> None:
"""Async inject files as multimodal content into messages.
For crews with input files and LLMs that support multimodal input,
processes files in parallel according to provider constraints,
then delegates to the LLM's aformat_multimodal_content method to
generate provider-specific content blocks with parallel file resolution.
"""
if not self.crew or not self.task:
return
if not self.llm.supports_multimodal():
return
files = get_all_files(self.crew.id, self.task.id)
if not files:
return
provider = getattr(self.llm, "provider", None) or getattr(self.llm, "model", "")
processor = FileProcessor(constraints=provider)
files = await processor.aprocess_files(files)
from crewai.files import get_upload_cache
upload_cache = get_upload_cache()
content_blocks = await self.llm.aformat_multimodal_content(
files, upload_cache=upload_cache
)
if not content_blocks:
return
for i in range(len(self.messages) - 1, -1, -1):
msg = self.messages[i]
if msg.get("role") == "user":
existing_content = msg.get("content", "")
if isinstance(existing_content, str):
msg["content"] = [
self.llm.format_text_content(existing_content),
*content_blocks,
]
break
def _invoke_loop(self) -> AgentFinish:
"""Execute agent loop until completion.
@@ -401,7 +443,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
user_prompt = self._format_prompt(self.prompt.get("prompt", ""), inputs)
self.messages.append(format_message_for_llm(user_prompt))
self._inject_multimodal_files()
await self._ainject_multimodal_files()
self._show_start_logs()
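A minimal sketch of the message rewrite performed by the injection methods above; the exact content-block shapes are provider-specific and come from format_multimodal_content / aformat_multimodal_content, so the dictionaries below are placeholders:

# Illustrative only: the trailing user message is rewritten into a content-block list.
messages = [
    {"role": "system", "content": "You are a research assistant."},
    {"role": "user", "content": "Summarize the attached report."},
]
content_blocks = [{"type": "file", "name": "report.pdf"}]  # placeholder provider blocks

for i in range(len(messages) - 1, -1, -1):
    msg = messages[i]
    if msg.get("role") == "user":
        existing = msg.get("content", "")
        if isinstance(existing, str):
            # format_text_content would normally wrap the text; a plain dict stands in here.
            msg["content"] = [{"type": "text", "text": existing}, *content_blocks]
        break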

View File

@@ -8,16 +8,16 @@ from typing import TYPE_CHECKING, Any
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.crews.crew_output import CrewOutput
from crewai.rag.embeddings.types import EmbedderConfig
from crewai.types.streaming import CrewStreamingOutput, FlowStreamingOutput
from crewai.utilities.file_store import store_files
from crewai.utilities.files import (
from crewai.files import (
AudioFile,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.rag.embeddings.types import EmbedderConfig
from crewai.types.streaming import CrewStreamingOutput, FlowStreamingOutput
from crewai.utilities.file_store import store_files
from crewai.utilities.streaming import (
StreamingState,
TaskInfo,

View File

@@ -0,0 +1,207 @@
"""File handling utilities for crewAI tasks."""
from crewai.files.cleanup import (
cleanup_expired_files,
cleanup_provider_files,
cleanup_uploaded_files,
)
from crewai.files.content_types import (
AudioContentType,
AudioExtension,
AudioFile,
BaseFile,
File,
FileMode,
ImageContentType,
ImageExtension,
ImageFile,
PDFContentType,
PDFExtension,
PDFFile,
TextContentType,
TextExtension,
TextFile,
VideoContentType,
VideoExtension,
VideoFile,
)
from crewai.files.file import (
FileBytes,
FilePath,
FileSource,
FileSourceInput,
FileStream,
RawFileInput,
)
from crewai.files.processing import (
ANTHROPIC_CONSTRAINTS,
BEDROCK_CONSTRAINTS,
GEMINI_CONSTRAINTS,
OPENAI_CONSTRAINTS,
AudioConstraints,
FileHandling,
FileProcessingError,
FileProcessor,
FileTooLargeError,
FileValidationError,
ImageConstraints,
PDFConstraints,
ProcessingDependencyError,
ProviderConstraints,
UnsupportedFileTypeError,
VideoConstraints,
get_constraints_for_provider,
)
from crewai.files.resolved import (
FileReference,
InlineBase64,
InlineBytes,
ResolvedFile,
ResolvedFileType,
UrlReference,
)
from crewai.files.resolver import (
FileResolver,
FileResolverConfig,
create_resolver,
)
from crewai.files.upload_cache import (
CachedUpload,
UploadCache,
get_upload_cache,
reset_upload_cache,
)
from crewai.files.uploaders import FileUploader, UploadResult, get_uploader
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
def wrap_file_source(source: FileSource) -> FileInput:
"""Wrap a FileSource in the appropriate typed FileInput wrapper.
Args:
source: The file source to wrap.
Returns:
Typed FileInput wrapper based on content type.
"""
content_type = source.content_type
if content_type.startswith("image/"):
return ImageFile(source=source)
if content_type.startswith("audio/"):
return AudioFile(source=source)
if content_type.startswith("video/"):
return VideoFile(source=source)
if content_type == "application/pdf":
return PDFFile(source=source)
return TextFile(source=source)
def normalize_input_files(
input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
"""Convert a list of file sources to a named dictionary of FileInputs.
Args:
input_files: List of file source inputs or File objects.
Returns:
Dictionary mapping names to FileInput wrappers.
"""
from pathlib import Path
result: dict[str, FileInput] = {}
for i, item in enumerate(input_files):
if isinstance(item, BaseFile):
name = item.filename or f"file_{i}"
if "." in name:
name = name.rsplit(".", 1)[0]
result[name] = item
continue
file_source: FilePath | FileBytes | FileStream
if isinstance(item, (FilePath, FileBytes, FileStream)):
file_source = item
elif isinstance(item, Path):
file_source = FilePath(path=item)
elif isinstance(item, str):
file_source = FilePath(path=Path(item))
elif isinstance(item, (bytes, memoryview)):
file_source = FileBytes(data=bytes(item))
else:
continue
name = file_source.filename or f"file_{i}"
result[name] = wrap_file_source(file_source)
return result
__all__ = [
"ANTHROPIC_CONSTRAINTS",
"BEDROCK_CONSTRAINTS",
"GEMINI_CONSTRAINTS",
"OPENAI_CONSTRAINTS",
"AudioConstraints",
"AudioContentType",
"AudioExtension",
"AudioFile",
"BaseFile",
"CachedUpload",
"File",
"FileBytes",
"FileHandling",
"FileInput",
"FileMode",
"FilePath",
"FileProcessingError",
"FileProcessor",
"FileReference",
"FileResolver",
"FileResolverConfig",
"FileSource",
"FileSourceInput",
"FileStream",
"FileTooLargeError",
"FileUploader",
"FileValidationError",
"ImageConstraints",
"ImageContentType",
"ImageExtension",
"ImageFile",
"InlineBase64",
"InlineBytes",
"PDFConstraints",
"PDFContentType",
"PDFExtension",
"PDFFile",
"ProcessingDependencyError",
"ProviderConstraints",
"RawFileInput",
"ResolvedFile",
"ResolvedFileType",
"TextContentType",
"TextExtension",
"TextFile",
"UnsupportedFileTypeError",
"UploadCache",
"UploadResult",
"UrlReference",
"VideoConstraints",
"VideoContentType",
"VideoExtension",
"VideoFile",
"cleanup_expired_files",
"cleanup_provider_files",
"cleanup_uploaded_files",
"create_resolver",
"get_constraints_for_provider",
"get_upload_cache",
"get_uploader",
"normalize_input_files",
"reset_upload_cache",
"wrap_file_source",
]
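A brief usage sketch for the helpers above, assuming the referenced files exist on disk:

from pathlib import Path

from crewai.files import FilePath, ImageFile, normalize_input_files, wrap_file_source

# Mixed raw inputs normalize into a name -> FileInput mapping keyed by filename.
files = normalize_input_files(
    [
        "reports/q3_summary.pdf",    # str -> FilePath -> PDFFile
        Path("charts/revenue.png"),  # Path -> FilePath -> ImageFile
        b"plain text contents",      # bytes -> FileBytes -> TextFile, named file_2
    ]
)

# wrap_file_source picks the typed wrapper from the detected content type.
image = wrap_file_source(FilePath(path=Path("charts/revenue.png")))
assert isinstance(image, ImageFile)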

View File

@@ -0,0 +1,368 @@
"""Cleanup utilities for uploaded files."""
from __future__ import annotations
import asyncio
import logging
from typing import TYPE_CHECKING
from crewai.files.upload_cache import CachedUpload, UploadCache
from crewai.files.uploaders import get_uploader
if TYPE_CHECKING:
from crewai.files.uploaders.base import FileUploader
logger = logging.getLogger(__name__)
def _safe_delete(
uploader: FileUploader,
file_id: str,
provider: str,
) -> bool:
"""Safely delete a file, logging any errors.
Args:
uploader: The file uploader to use.
file_id: The file ID to delete.
provider: Provider name for logging.
Returns:
True if deleted successfully, False otherwise.
"""
try:
if uploader.delete(file_id):
logger.debug(f"Deleted {file_id} from {provider}")
return True
logger.warning(f"Failed to delete {file_id} from {provider}")
return False
except Exception as e:
logger.warning(f"Error deleting {file_id} from {provider}: {e}")
return False
def cleanup_uploaded_files(
cache: UploadCache,
*,
delete_from_provider: bool = True,
providers: list[str] | None = None,
) -> int:
"""Clean up uploaded files from the cache and optionally from providers.
Args:
cache: The upload cache to clean up.
delete_from_provider: If True, delete files from the provider as well.
providers: Optional list of providers to clean up. If None, cleans all.
Returns:
Number of files cleaned up.
"""
cleaned = 0
provider_uploads: dict[str, list[CachedUpload]] = {}
for provider in _get_providers_from_cache(cache):
if providers is not None and provider not in providers:
continue
provider_uploads[provider] = cache.get_all_for_provider(provider)
if delete_from_provider:
for provider, uploads in provider_uploads.items():
uploader = get_uploader(provider)
if uploader is None:
logger.warning(
f"No uploader available for {provider}, skipping cleanup"
)
continue
for upload in uploads:
if _safe_delete(uploader, upload.file_id, provider):
cleaned += 1
cache.clear()
logger.info(f"Cleaned up {cleaned} uploaded files")
return cleaned
def cleanup_expired_files(
cache: UploadCache,
*,
delete_from_provider: bool = False,
) -> int:
"""Clean up expired files from the cache.
Args:
cache: The upload cache to clean up.
delete_from_provider: If True, attempt to delete from provider as well.
Note: Expired files may already be deleted by the provider.
Returns:
Number of expired entries removed from cache.
"""
expired_entries: list[CachedUpload] = []
if delete_from_provider:
for provider in _get_providers_from_cache(cache):
expired_entries.extend(
upload
for upload in cache.get_all_for_provider(provider)
if upload.is_expired()
)
removed = cache.clear_expired()
if delete_from_provider:
for upload in expired_entries:
uploader = get_uploader(upload.provider)
if uploader is not None:
try:
uploader.delete(upload.file_id)
except Exception as e:
logger.debug(f"Could not delete expired file {upload.file_id}: {e}")
return removed
def cleanup_provider_files(
provider: str,
*,
cache: UploadCache | None = None,
delete_all_from_provider: bool = False,
) -> int:
"""Clean up all files for a specific provider.
Args:
provider: Provider name to clean up.
cache: Optional upload cache to clear entries from.
delete_all_from_provider: If True, delete all files from the provider,
not just cached ones.
Returns:
Number of files deleted.
"""
deleted = 0
uploader = get_uploader(provider)
if uploader is None:
logger.warning(f"No uploader available for {provider}")
return 0
if delete_all_from_provider:
try:
files = uploader.list_files()
for file_info in files:
file_id = file_info.get("id") or file_info.get("name")
if file_id and uploader.delete(file_id):
deleted += 1
except Exception as e:
logger.warning(f"Error listing/deleting files from {provider}: {e}")
elif cache is not None:
uploads = cache.get_all_for_provider(provider)
for upload in uploads:
if _safe_delete(uploader, upload.file_id, provider):
deleted += 1
cache.remove_by_file_id(upload.file_id, provider)
logger.info(f"Deleted {deleted} files from {provider}")
return deleted
def _get_providers_from_cache(cache: UploadCache) -> set[str]:
"""Get unique provider names from cache entries.
Args:
cache: The upload cache.
Returns:
Set of provider names.
"""
return cache.get_providers()
async def _asafe_delete(
uploader: FileUploader,
file_id: str,
provider: str,
) -> bool:
"""Async safely delete a file, logging any errors.
Args:
uploader: The file uploader to use.
file_id: The file ID to delete.
provider: Provider name for logging.
Returns:
True if deleted successfully, False otherwise.
"""
try:
if await uploader.adelete(file_id):
logger.debug(f"Deleted {file_id} from {provider}")
return True
logger.warning(f"Failed to delete {file_id} from {provider}")
return False
except Exception as e:
logger.warning(f"Error deleting {file_id} from {provider}: {e}")
return False
async def acleanup_uploaded_files(
cache: UploadCache,
*,
delete_from_provider: bool = True,
providers: list[str] | None = None,
max_concurrency: int = 10,
) -> int:
"""Async clean up uploaded files from the cache and optionally from providers.
Args:
cache: The upload cache to clean up.
delete_from_provider: If True, delete files from the provider as well.
providers: Optional list of providers to clean up. If None, cleans all.
max_concurrency: Maximum number of concurrent delete operations.
Returns:
Number of files cleaned up.
"""
cleaned = 0
provider_uploads: dict[str, list[CachedUpload]] = {}
for provider in _get_providers_from_cache(cache):
if providers is not None and provider not in providers:
continue
provider_uploads[provider] = await cache.aget_all_for_provider(provider)
if delete_from_provider:
semaphore = asyncio.Semaphore(max_concurrency)
async def delete_one(uploader: FileUploader, upload: CachedUpload) -> bool:
async with semaphore:
return await _asafe_delete(uploader, upload.file_id, upload.provider)
tasks: list[asyncio.Task[bool]] = []
for provider, uploads in provider_uploads.items():
uploader = get_uploader(provider)
if uploader is None:
logger.warning(
f"No uploader available for {provider}, skipping cleanup"
)
continue
tasks.extend(
asyncio.create_task(delete_one(uploader, upload)) for upload in uploads
)
results = await asyncio.gather(*tasks, return_exceptions=True)
cleaned = sum(1 for r in results if r is True)
await cache.aclear()
logger.info(f"Cleaned up {cleaned} uploaded files")
return cleaned
async def acleanup_expired_files(
cache: UploadCache,
*,
delete_from_provider: bool = False,
max_concurrency: int = 10,
) -> int:
"""Async clean up expired files from the cache.
Args:
cache: The upload cache to clean up.
delete_from_provider: If True, attempt to delete from provider as well.
max_concurrency: Maximum number of concurrent delete operations.
Returns:
Number of expired entries removed from cache.
"""
expired_entries: list[CachedUpload] = []
if delete_from_provider:
for provider in _get_providers_from_cache(cache):
uploads = await cache.aget_all_for_provider(provider)
expired_entries.extend(upload for upload in uploads if upload.is_expired())
removed = await cache.aclear_expired()
if delete_from_provider and expired_entries:
semaphore = asyncio.Semaphore(max_concurrency)
async def delete_expired(upload: CachedUpload) -> None:
async with semaphore:
uploader = get_uploader(upload.provider)
if uploader is not None:
try:
await uploader.adelete(upload.file_id)
except Exception as e:
logger.debug(
f"Could not delete expired file {upload.file_id}: {e}"
)
await asyncio.gather(
*[delete_expired(upload) for upload in expired_entries],
return_exceptions=True,
)
return removed
async def acleanup_provider_files(
provider: str,
*,
cache: UploadCache | None = None,
delete_all_from_provider: bool = False,
max_concurrency: int = 10,
) -> int:
"""Async clean up all files for a specific provider.
Args:
provider: Provider name to clean up.
cache: Optional upload cache to clear entries from.
delete_all_from_provider: If True, delete all files from the provider.
max_concurrency: Maximum number of concurrent delete operations.
Returns:
Number of files deleted.
"""
deleted = 0
uploader = get_uploader(provider)
if uploader is None:
logger.warning(f"No uploader available for {provider}")
return 0
semaphore = asyncio.Semaphore(max_concurrency)
async def delete_file(file_id: str) -> bool:
async with semaphore:
return await uploader.adelete(file_id)
if delete_all_from_provider:
try:
files = uploader.list_files()
tasks = []
for file_info in files:
file_id = file_info.get("id") or file_info.get("name")
if file_id:
tasks.append(delete_file(file_id))
results = await asyncio.gather(*tasks, return_exceptions=True)
deleted = sum(1 for r in results if r is True)
except Exception as e:
logger.warning(f"Error listing/deleting files from {provider}: {e}")
elif cache is not None:
uploads = await cache.aget_all_for_provider(provider)
tasks = []
for upload in uploads:
tasks.append(delete_file(upload.file_id))
results = await asyncio.gather(*tasks, return_exceptions=True)
for upload, result in zip(uploads, results, strict=False):
if result is True:
deleted += 1
await cache.aremove_by_file_id(upload.file_id, provider)
logger.info(f"Deleted {deleted} files from {provider}")
return deleted
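A hedged usage sketch for the cleanup helpers above, using the process-wide cache from get_upload_cache (exported by crewai.files); the provider name is illustrative:

import asyncio

from crewai.files import cleanup_expired_files, cleanup_uploaded_files, get_upload_cache
from crewai.files.cleanup import acleanup_uploaded_files

cache = get_upload_cache()

# Drop expired cache entries; leave provider-side files in place.
cleanup_expired_files(cache, delete_from_provider=False)

# Delete cached uploads for one provider, then clear the cache.
cleanup_uploaded_files(cache, delete_from_provider=True, providers=["openai"])

# Async variant with bounded concurrency across delete calls.
asyncio.run(acleanup_uploaded_files(cache, max_concurrency=5))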

View File

@@ -11,7 +11,8 @@ from pydantic import BaseModel, Field, GetCoreSchemaHandler
from pydantic_core import CoreSchema, core_schema
from typing_extensions import TypeIs
from crewai.utilities.files.file import (
from crewai.files.file import (
AsyncFileStream,
FileBytes,
FilePath,
FileSource,
@@ -185,7 +186,18 @@ class BaseFile(ABC, BaseModel):
def read(self) -> bytes:
"""Read the file content as bytes."""
return self._file_source.read()
return self._file_source.read() # type: ignore[union-attr]
async def aread(self) -> bytes:
"""Async read the file content as bytes.
Raises:
TypeError: If the underlying source doesn't support async read.
"""
source = self._file_source
if isinstance(source, (FilePath, FileBytes, AsyncFileStream)):
return await source.aread()
raise TypeError(f"{type(source).__name__} does not support async read")
def read_text(self, encoding: str = "utf-8") -> str:
"""Read the file content as string."""

View File

@@ -0,0 +1,377 @@
"""Base file class for handling file inputs in tasks."""
from __future__ import annotations
from collections.abc import AsyncIterator, Iterator
from pathlib import Path
from typing import Annotated, Any, BinaryIO, Protocol, cast, runtime_checkable
import aiofiles
import magic
from pydantic import (
BaseModel,
BeforeValidator,
Field,
GetCoreSchemaHandler,
PrivateAttr,
model_validator,
)
from pydantic_core import CoreSchema, core_schema
@runtime_checkable
class AsyncReadable(Protocol):
"""Protocol for async readable streams."""
async def read(self, size: int = -1) -> bytes: ...
class _AsyncReadableValidator:
"""Pydantic validator for AsyncReadable types."""
@classmethod
def __get_pydantic_core_schema__(
cls, _source_type: Any, _handler: GetCoreSchemaHandler
) -> CoreSchema:
return core_schema.no_info_plain_validator_function(
cls._validate,
serialization=core_schema.plain_serializer_function_ser_schema(
lambda x: None, info_arg=False
),
)
@staticmethod
def _validate(value: Any) -> AsyncReadable:
if isinstance(value, AsyncReadable):
return value
raise ValueError("Expected an async readable object with async read() method")
ValidatedAsyncReadable = Annotated[AsyncReadable, _AsyncReadableValidator()]
DEFAULT_MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500MB
def detect_content_type(data: bytes) -> str:
"""Detect MIME type from file content.
Args:
data: Raw bytes to analyze.
Returns:
The detected MIME type.
"""
result: str = magic.from_buffer(data, mime=True)
return result
class _BinaryIOValidator:
"""Pydantic validator for BinaryIO types."""
@classmethod
def __get_pydantic_core_schema__(
cls, _source_type: Any, _handler: GetCoreSchemaHandler
) -> CoreSchema:
return core_schema.no_info_plain_validator_function(
cls._validate,
serialization=core_schema.plain_serializer_function_ser_schema(
lambda x: None, info_arg=False
),
)
@staticmethod
def _validate(value: Any) -> BinaryIO:
if hasattr(value, "read") and hasattr(value, "seek"):
return cast(BinaryIO, value)
raise ValueError("Expected a binary file-like object with read() and seek()")
ValidatedBinaryIO = Annotated[BinaryIO, _BinaryIOValidator()]
class FilePath(BaseModel):
"""File loaded from a filesystem path."""
path: Path = Field(description="Path to the file on the filesystem.")
max_size_bytes: int = Field(
default=DEFAULT_MAX_FILE_SIZE_BYTES,
exclude=True,
description="Maximum file size in bytes.",
)
_content: bytes | None = PrivateAttr(default=None)
@model_validator(mode="after")
def _validate_file_exists(self) -> FilePath:
"""Validate that the file exists, is secure, and within size limits."""
from crewai.files.processing.exceptions import FileTooLargeError
path_str = str(self.path)
if ".." in path_str:
raise ValueError(f"Path traversal not allowed: {self.path}")
if self.path.is_symlink():
resolved = self.path.resolve()
cwd = Path.cwd().resolve()
if not str(resolved).startswith(str(cwd)):
raise ValueError(f"Symlink escapes allowed directory: {self.path}")
if not self.path.exists():
raise ValueError(f"File not found: {self.path}")
if not self.path.is_file():
raise ValueError(f"Path is not a file: {self.path}")
actual_size = self.path.stat().st_size
if actual_size > self.max_size_bytes:
raise FileTooLargeError(
f"File exceeds max size ({actual_size} > {self.max_size_bytes})",
file_name=str(self.path),
actual_size=actual_size,
max_size=self.max_size_bytes,
)
return self
@property
def filename(self) -> str:
"""Get the filename from the path."""
return self.path.name
@property
def content_type(self) -> str:
"""Get the content type by reading file content."""
return detect_content_type(self.read())
def read(self) -> bytes:
"""Read the file content from disk."""
if self._content is None:
self._content = self.path.read_bytes()
return self._content
async def aread(self) -> bytes:
"""Async read the file content from disk."""
if self._content is None:
async with aiofiles.open(self.path, "rb") as f:
self._content = await f.read()
return self._content
def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
"""Stream file content in chunks without loading entirely into memory.
Args:
chunk_size: Size of each chunk in bytes.
Yields:
Chunks of file content.
"""
with open(self.path, "rb") as f:
while chunk := f.read(chunk_size):
yield chunk
async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
"""Async streaming for non-blocking I/O.
Args:
chunk_size: Size of each chunk in bytes.
Yields:
Chunks of file content.
"""
async with aiofiles.open(self.path, "rb") as f:
while chunk := await f.read(chunk_size):
yield chunk
class FileBytes(BaseModel):
"""File created from raw bytes content."""
data: bytes = Field(description="Raw bytes content of the file.")
filename: str | None = Field(default=None, description="Optional filename.")
@property
def content_type(self) -> str:
"""Get the content type from the data."""
return detect_content_type(self.data)
def read(self) -> bytes:
"""Return the bytes content."""
return self.data
async def aread(self) -> bytes:
"""Async return the bytes content (immediate, already in memory)."""
return self.data
def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
"""Stream bytes content in chunks.
Args:
chunk_size: Size of each chunk in bytes.
Yields:
Chunks of bytes content.
"""
for i in range(0, len(self.data), chunk_size):
yield self.data[i : i + chunk_size]
async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
"""Async streaming (immediate yield since already in memory).
Args:
chunk_size: Size of each chunk in bytes.
Yields:
Chunks of bytes content.
"""
for chunk in self.read_chunks(chunk_size):
yield chunk
class FileStream(BaseModel):
"""File loaded from a file-like stream."""
stream: ValidatedBinaryIO = Field(description="Binary file stream.")
filename: str | None = Field(default=None, description="Optional filename.")
_content: bytes | None = PrivateAttr(default=None)
def model_post_init(self, __context: object) -> None:
"""Extract filename from stream if not provided."""
if self.filename is None:
name = getattr(self.stream, "name", None)
if name is not None:
object.__setattr__(self, "filename", Path(name).name)
@property
def content_type(self) -> str:
"""Get the content type from stream content."""
return detect_content_type(self.read())
def read(self) -> bytes:
"""Read the stream content. Content is cached after first read."""
if self._content is None:
position = self.stream.tell()
self.stream.seek(0)
self._content = self.stream.read()
self.stream.seek(position)
return self._content
def close(self) -> None:
"""Close the underlying stream."""
self.stream.close()
def __enter__(self) -> FileStream:
"""Enter context manager."""
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: Any,
) -> None:
"""Exit context manager and close stream."""
self.close()
def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
"""Stream from underlying stream in chunks.
Args:
chunk_size: Size of each chunk in bytes.
Yields:
Chunks of stream content.
"""
position = self.stream.tell()
self.stream.seek(0)
try:
while chunk := self.stream.read(chunk_size):
yield chunk
finally:
self.stream.seek(position)
class AsyncFileStream(BaseModel):
"""File loaded from an async stream.
Use for async file handles like aiofiles objects or aiohttp response bodies.
This is an async-only type - use aread() instead of read().
Attributes:
stream: Async file-like object with async read() method.
filename: Optional filename for the stream.
"""
stream: ValidatedAsyncReadable = Field(
description="Async file stream with async read() method."
)
filename: str | None = Field(default=None, description="Optional filename.")
_content: bytes | None = PrivateAttr(default=None)
@property
def content_type(self) -> str:
"""Get the content type from stream content. Requires aread() first."""
if self._content is None:
raise RuntimeError("Call aread() first to load content")
return detect_content_type(self._content)
async def aread(self) -> bytes:
"""Async read the stream content. Content is cached after first read."""
if self._content is None:
self._content = await self.stream.read()
return self._content
async def aclose(self) -> None:
"""Async close the underlying stream."""
if hasattr(self.stream, "close"):
result = self.stream.close()
if hasattr(result, "__await__"):
await result
async def __aenter__(self) -> AsyncFileStream:
"""Async enter context manager."""
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: Any,
) -> None:
"""Async exit context manager and close stream."""
await self.aclose()
async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
"""Async stream content in chunks.
Args:
chunk_size: Size of each chunk in bytes.
Yields:
Chunks of stream content.
"""
while chunk := await self.stream.read(chunk_size):
yield chunk
FileSource = FilePath | FileBytes | FileStream | AsyncFileStream
def _normalize_source(value: Any) -> FileSource:
"""Convert raw input to appropriate source type."""
if isinstance(value, (FilePath, FileBytes, FileStream, AsyncFileStream)):
return value
if isinstance(value, Path):
return FilePath(path=value)
if isinstance(value, str):
return FilePath(path=Path(value))
if isinstance(value, bytes):
return FileBytes(data=value)
if isinstance(value, AsyncReadable):
return AsyncFileStream(stream=value)
if hasattr(value, "read") and hasattr(value, "seek"):
return FileStream(stream=value)
raise ValueError(f"Cannot convert {type(value).__name__} to file source")
RawFileInput = str | Path | bytes
FileSourceInput = Annotated[
RawFileInput | FileSource, BeforeValidator(_normalize_source)
]
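A sketch of constructing each source type above; the on-disk path is hypothetical:

import io
from pathlib import Path

from crewai.files.file import FileBytes, FilePath, FileStream

# From a path on disk (existence, size, and symlink checks run at construction).
doc = FilePath(path=Path("reports/q3_summary.pdf"))
for chunk in doc.read_chunks(chunk_size=64 * 1024):
    ...  # stream without loading the whole file into memory

# From bytes already in memory.
blob = FileBytes(data=b"hello world", filename="hello.txt")

# From any binary file-like object exposing read() and seek().
stream = FileStream(stream=io.BytesIO(b"raw bytes"), filename="raw.bin")
print(blob.content_type, stream.read())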

View File

@@ -0,0 +1,184 @@
"""Performance metrics and structured logging for file operations."""
from __future__ import annotations
from collections.abc import Generator
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
import logging
import time
from typing import Any
logger = logging.getLogger(__name__)
@dataclass
class FileOperationMetrics:
"""Metrics for a file operation.
Attributes:
operation: Name of the operation (e.g., "upload", "resolve", "process").
filename: Name of the file being operated on.
provider: Provider name if applicable.
duration_ms: Duration of the operation in milliseconds.
size_bytes: Size of the file in bytes.
success: Whether the operation succeeded.
error: Error message if operation failed.
timestamp: When the operation occurred.
metadata: Additional operation-specific metadata.
"""
operation: str
filename: str | None = None
provider: str | None = None
duration_ms: float = 0.0
size_bytes: int | None = None
success: bool = True
error: str | None = None
timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
metadata: dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
"""Convert metrics to dictionary for logging.
Returns:
Dictionary representation of metrics.
"""
result: dict[str, Any] = {
"operation": self.operation,
"duration_ms": round(self.duration_ms, 2),
"success": self.success,
"timestamp": self.timestamp.isoformat(),
}
if self.filename:
result["filename"] = self.filename
if self.provider:
result["provider"] = self.provider
if self.size_bytes is not None:
result["size_bytes"] = self.size_bytes
if self.error:
result["error"] = self.error
if self.metadata:
result.update(self.metadata)
return result
@contextmanager
def measure_operation(
operation: str,
*,
filename: str | None = None,
provider: str | None = None,
size_bytes: int | None = None,
log_level: int = logging.DEBUG,
**extra_metadata: Any,
) -> Generator[FileOperationMetrics, None, None]:
"""Context manager to measure and log operation performance.
Args:
operation: Name of the operation.
filename: Optional filename being operated on.
provider: Optional provider name.
size_bytes: Optional file size in bytes.
log_level: Log level for the result message.
**extra_metadata: Additional metadata to include.
Yields:
FileOperationMetrics object that will be populated with results.
Example:
with measure_operation("upload", filename="test.pdf", provider="openai") as metrics:
result = upload_file(file)
metrics.metadata["file_id"] = result.file_id
"""
metrics = FileOperationMetrics(
operation=operation,
filename=filename,
provider=provider,
size_bytes=size_bytes,
metadata=dict(extra_metadata),
)
start_time = time.perf_counter()
try:
yield metrics
metrics.success = True
except Exception as e:
metrics.success = False
metrics.error = str(e)
raise
finally:
metrics.duration_ms = (time.perf_counter() - start_time) * 1000
log_message = f"{operation}"
if filename:
log_message += f" [{filename}]"
if provider:
log_message += f" ({provider})"
if metrics.success:
log_message += f" completed in {metrics.duration_ms:.2f}ms"
else:
log_message += f" failed after {metrics.duration_ms:.2f}ms: {metrics.error}"
logger.log(log_level, log_message, extra=metrics.to_dict())
def log_file_operation(
operation: str,
*,
filename: str | None = None,
provider: str | None = None,
size_bytes: int | None = None,
duration_ms: float | None = None,
success: bool = True,
error: str | None = None,
level: int = logging.INFO,
**extra: Any,
) -> None:
"""Log a file operation with structured data.
Args:
operation: Name of the operation.
filename: Optional filename being operated on.
provider: Optional provider name.
size_bytes: Optional file size in bytes.
duration_ms: Optional duration in milliseconds.
success: Whether the operation succeeded.
error: Optional error message.
level: Log level to use.
**extra: Additional metadata to include.
"""
metrics = FileOperationMetrics(
operation=operation,
filename=filename,
provider=provider,
size_bytes=size_bytes,
duration_ms=duration_ms or 0.0,
success=success,
error=error,
metadata=dict(extra),
)
message = f"{operation}"
if filename:
message += f" [{filename}]"
if provider:
message += f" ({provider})"
if success:
if duration_ms:
message += f" completed in {duration_ms:.2f}ms"
else:
message += " completed"
else:
message += " failed"
if error:
message += f": {error}"
logger.log(level, message, extra=metrics.to_dict())
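A short usage sketch for the helpers above; the values are illustrative:

import logging

from crewai.files.metrics import log_file_operation, measure_operation

logging.basicConfig(level=logging.DEBUG)

# One-shot structured log entry.
log_file_operation(
    "resolve",
    filename="report.pdf",
    provider="gemini",
    size_bytes=1_048_576,
    duration_ms=42.0,
)

# Timing an arbitrary block; duration and success are filled in on exit.
with measure_operation("hash", filename="report.pdf") as metrics:
    metrics.metadata["algorithm"] = "sha256"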

View File

@@ -4,7 +4,7 @@ This module provides validation, transformation, and processing utilities
for files used in multimodal LLM interactions.
"""
from crewai.utilities.files.processing.constraints import (
from crewai.files.processing.constraints import (
ANTHROPIC_CONSTRAINTS,
BEDROCK_CONSTRAINTS,
GEMINI_CONSTRAINTS,
@@ -16,16 +16,16 @@ from crewai.utilities.files.processing.constraints import (
VideoConstraints,
get_constraints_for_provider,
)
from crewai.utilities.files.processing.enums import FileHandling
from crewai.utilities.files.processing.exceptions import (
from crewai.files.processing.enums import FileHandling
from crewai.files.processing.exceptions import (
FileProcessingError,
FileTooLargeError,
FileValidationError,
ProcessingDependencyError,
UnsupportedFileTypeError,
)
from crewai.utilities.files.processing.processor import FileProcessor
from crewai.utilities.files.processing.validators import (
from crewai.files.processing.processor import FileProcessor
from crewai.files.processing.validators import (
validate_audio,
validate_file,
validate_image,

View File

@@ -81,3 +81,23 @@ class ProcessingDependencyError(FileProcessingError):
self.dependency = dependency
self.install_command = install_command
super().__init__(message)
class TransientFileError(FileProcessingError):
"""Transient error that may succeed on retry (network, timeout)."""
class PermanentFileError(FileProcessingError):
"""Permanent error that will not succeed on retry (auth, format)."""
class UploadError(FileProcessingError):
"""Base exception for upload errors."""
class TransientUploadError(UploadError, TransientFileError):
"""Upload failed but may succeed on retry (network issues, rate limits)."""
class PermanentUploadError(UploadError, PermanentFileError):
"""Upload failed permanently (auth failure, invalid file, unsupported type)."""

View File

@@ -1,9 +1,10 @@
"""FileProcessor for validating and transforming files based on provider constraints."""
import asyncio
from collections.abc import Sequence
import logging
from crewai.utilities.files.content_types import (
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
@@ -11,18 +12,18 @@ from crewai.utilities.files.content_types import (
TextFile,
VideoFile,
)
from crewai.utilities.files.processing.constraints import (
from crewai.files.processing.constraints import (
ProviderConstraints,
get_constraints_for_provider,
)
from crewai.utilities.files.processing.enums import FileHandling
from crewai.utilities.files.processing.exceptions import (
from crewai.files.processing.enums import FileHandling
from crewai.files.processing.exceptions import (
FileProcessingError,
FileTooLargeError,
FileValidationError,
UnsupportedFileTypeError,
)
from crewai.utilities.files.processing.transformers import (
from crewai.files.processing.transformers import (
chunk_pdf,
chunk_text,
get_image_dimensions,
@@ -30,7 +31,7 @@ from crewai.utilities.files.processing.transformers import (
optimize_image,
resize_image,
)
from crewai.utilities.files.processing.validators import validate_file
from crewai.files.processing.validators import validate_file
logger = logging.getLogger(__name__)
@@ -183,6 +184,52 @@ class FileProcessor:
return result
async def aprocess_files(
self,
files: dict[str, FileInput],
max_concurrency: int = 10,
) -> dict[str, FileInput]:
"""Async process multiple files in parallel.
Args:
files: Dictionary mapping names to file inputs.
max_concurrency: Maximum number of concurrent processing tasks.
Returns:
Dictionary mapping names to processed files. If a file is chunked,
multiple entries are created with indexed names.
"""
semaphore = asyncio.Semaphore(max_concurrency)
async def process_one(
name: str, file: FileInput
) -> tuple[str, FileInput | Sequence[FileInput]]:
async with semaphore:
loop = asyncio.get_running_loop()
processed = await loop.run_in_executor(None, self.process, file)
return name, processed
tasks = [process_one(n, f) for n, f in files.items()]
results = await asyncio.gather(*tasks, return_exceptions=True)
output: dict[str, FileInput] = {}
for result in results:
if isinstance(result, BaseException):
logger.error(f"Processing failed: {result}")
continue
name, processed = result
if isinstance(processed, Sequence) and not isinstance(
processed, (str, bytes)
):
for i, chunk in enumerate(processed):
output[f"{name}_chunk_{i}"] = chunk
elif isinstance(
processed, (AudioFile, File, ImageFile, PDFFile, TextFile, VideoFile)
):
output[name] = processed
return output
def _auto_process(self, file: FileInput) -> FileInput:
"""Automatically resize/compress file to meet constraints.
@@ -272,7 +319,7 @@ class FileProcessor:
page_count = get_pdf_page_count(file)
if page_count is not None and page_count > max_pages:
try:
return chunk_pdf(file, max_pages)
return list(chunk_pdf(file, max_pages))
except Exception as e:
logger.warning(f"Failed to chunk PDF: {e}")
return file
@@ -284,7 +331,7 @@ class FileProcessor:
content = file.read()
if len(content) > max_size:
try:
return chunk_text(file, max_size)
return list(chunk_text(file, max_size))
except Exception as e:
logger.warning(f"Failed to chunk text file: {e}")
return file
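A usage sketch combining aprocess_files above with normalize_input_files; the paths and provider are hypothetical:

import asyncio

from crewai.files import FileProcessor, normalize_input_files

async def main() -> None:
    files = normalize_input_files(["reports/q3_summary.pdf", "charts/revenue.png"])
    processor = FileProcessor(constraints="openai")
    # Files are validated/transformed concurrently; chunked outputs get indexed names.
    processed = await processor.aprocess_files(files, max_concurrency=4)
    print(sorted(processed))

asyncio.run(main())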

View File

@@ -1,12 +1,12 @@
"""File transformation functions for resizing, optimizing, and chunking."""
from collections.abc import Sequence
from collections.abc import Iterator
import io
import logging
from crewai.utilities.files.content_types import ImageFile, PDFFile, TextFile
from crewai.utilities.files.file import FileBytes
from crewai.utilities.files.processing.exceptions import ProcessingDependencyError
from crewai.files.content_types import ImageFile, PDFFile, TextFile
from crewai.files.file import FileBytes
from crewai.files.processing.exceptions import ProcessingDependencyError
logger = logging.getLogger(__name__)
@@ -161,22 +161,24 @@ def chunk_pdf(
max_pages: int,
*,
overlap_pages: int = 0,
) -> Sequence[PDFFile]:
) -> Iterator[PDFFile]:
"""Split a PDF into chunks of maximum page count.
Yields chunks one at a time to minimize memory usage.
Args:
file: The PDF file to chunk.
max_pages: Maximum pages per chunk.
overlap_pages: Number of overlapping pages between chunks (for context).
Returns:
List of PDFFile objects, one per chunk.
Yields:
PDFFile objects, one per chunk.
Raises:
ProcessingDependencyError: If pypdf is not installed.
"""
try:
from pypdf import PdfReader, PdfWriter # type: ignore[import-not-found]
from pypdf import PdfReader, PdfWriter
except ImportError as e:
raise ProcessingDependencyError(
"pypdf is required for PDF chunking",
@@ -189,9 +191,9 @@ def chunk_pdf(
total_pages = len(reader.pages)
if total_pages <= max_pages:
return [file]
yield file
return
chunks: list[PDFFile] = []
filename = file.filename or "document.pdf"
base_filename = filename.rsplit(".", 1)[0]
step = max_pages - overlap_pages
@@ -211,19 +213,16 @@ def chunk_pdf(
output_bytes = output_buffer.getvalue()
chunk_filename = f"{base_filename}_chunk_{chunk_num}.pdf"
chunks.append(
PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename))
)
logger.info(
f"Created PDF chunk '{chunk_filename}' with pages {start_page + 1}-{end_page}"
)
yield PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename))
start_page += step
chunk_num += 1
return chunks
def chunk_text(
file: TextFile,
@@ -231,26 +230,28 @@ def chunk_text(
*,
overlap_chars: int = 200,
split_on_newlines: bool = True,
) -> Sequence[TextFile]:
) -> Iterator[TextFile]:
"""Split a text file into chunks of maximum character count.
Yields chunks one at a time to minimize memory usage.
Args:
file: The text file to chunk.
max_chars: Maximum characters per chunk.
overlap_chars: Number of overlapping characters between chunks.
split_on_newlines: If True, prefer splitting at newline boundaries.
Returns:
List of TextFile objects, one per chunk.
Yields:
TextFile objects, one per chunk.
"""
content = file.read()
text = content.decode("utf-8", errors="replace")
total_chars = len(text)
if total_chars <= max_chars:
return [file]
yield file
return
chunks: list[TextFile] = []
filename = file.filename or "text.txt"
base_filename = filename.rsplit(".", 1)[0]
extension = filename.rsplit(".", 1)[-1] if "." in filename else "txt"
@@ -261,29 +262,27 @@ def chunk_text(
while start_pos < total_chars:
end_pos = min(start_pos + max_chars, total_chars)
# If not at end, try to find a better split point
if end_pos < total_chars and split_on_newlines:
# Look for last newline within the chunk
last_newline = text.rfind("\n", start_pos, end_pos)
if last_newline > start_pos + max_chars // 2: # Don't split too early
if last_newline > start_pos + max_chars // 2:
end_pos = last_newline + 1
chunk_text = text[start_pos:end_pos]
chunk_bytes = chunk_text.encode("utf-8")
chunk_content = text[start_pos:end_pos]
chunk_bytes = chunk_content.encode("utf-8")
chunk_filename = f"{base_filename}_chunk_{chunk_num}.{extension}"
chunks.append(
TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename))
)
logger.info(
f"Created text chunk '{chunk_filename}' with {len(chunk_text)} characters"
f"Created text chunk '{chunk_filename}' with {len(chunk_content)} characters"
)
start_pos = end_pos - overlap_chars if end_pos < total_chars else total_chars
chunk_num += 1
yield TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename))
return chunks
if end_pos < total_chars:
start_pos = max(start_pos + 1, end_pos - overlap_chars)
else:
start_pos = total_chars
chunk_num += 1
def get_image_dimensions(file: ImageFile) -> tuple[int, int] | None:
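Since chunk_pdf and chunk_text now return iterators, callers that need all chunks must materialize them (as FileProcessor now does with list(...)); a brief sketch, assuming a large PDF on disk:

from pathlib import Path

from crewai.files import FilePath, PDFFile
from crewai.files.processing.transformers import chunk_pdf

pdf = PDFFile(source=FilePath(path=Path("reports/annual.pdf")))

# Lazily yields PDFFile chunks of at most 10 pages, with 1 page of overlap for context.
for chunk in chunk_pdf(pdf, max_pages=10, overlap_pages=1):
    print(chunk.filename)

# Or materialize every chunk at once, as FileProcessor does internally.
chunks = list(chunk_pdf(pdf, max_pages=10))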

View File

@@ -3,7 +3,7 @@
from collections.abc import Sequence
import logging
from crewai.utilities.files.content_types import (
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
@@ -11,14 +11,14 @@ from crewai.utilities.files.content_types import (
TextFile,
VideoFile,
)
from crewai.utilities.files.processing.constraints import (
from crewai.files.processing.constraints import (
AudioConstraints,
ImageConstraints,
PDFConstraints,
ProviderConstraints,
VideoConstraints,
)
from crewai.utilities.files.processing.exceptions import (
from crewai.files.processing.exceptions import (
FileTooLargeError,
FileValidationError,
UnsupportedFileTypeError,
@@ -172,7 +172,7 @@ def validate_pdf(
try:
import io
from pypdf import PdfReader # type: ignore[import-not-found]
from pypdf import PdfReader
reader = PdfReader(io.BytesIO(content))
page_count = len(reader.pages)

View File

@@ -0,0 +1,577 @@
"""FileResolver for deciding file delivery method and managing uploads."""
import asyncio
import base64
from dataclasses import dataclass, field
import hashlib
import logging
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.files.metrics import measure_operation
from crewai.files.processing.constraints import (
ProviderConstraints,
get_constraints_for_provider,
)
from crewai.files.resolved import (
FileReference,
InlineBase64,
InlineBytes,
ResolvedFile,
)
from crewai.files.upload_cache import CachedUpload, UploadCache
from crewai.files.uploaders import UploadResult, get_uploader
from crewai.files.uploaders.base import FileUploader
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
UPLOAD_MAX_RETRIES = 3
UPLOAD_RETRY_DELAY_BASE = 2
@dataclass
class FileContext:
"""Cached file metadata to avoid redundant reads.
Attributes:
content: Raw file bytes.
size: Size of the file in bytes.
content_hash: SHA-256 hash of the file content.
content_type: MIME type of the file.
"""
content: bytes
size: int
content_hash: str
content_type: str
@dataclass
class FileResolverConfig:
"""Configuration for FileResolver.
Attributes:
prefer_upload: If True, prefer uploading over inline for supported providers.
upload_threshold_bytes: Size threshold above which to use upload.
If None, uses provider-specific threshold.
use_bytes_for_bedrock: If True, use raw bytes instead of base64 for Bedrock.
"""
prefer_upload: bool = False
upload_threshold_bytes: int | None = None
use_bytes_for_bedrock: bool = True
@dataclass
class FileResolver:
"""Resolves files to their delivery format based on provider capabilities.
Decides whether to use inline base64, raw bytes, or file upload based on:
- Provider constraints and capabilities
- File size
- Configuration preferences
Caches uploaded files to avoid redundant uploads.
Attributes:
config: Resolver configuration.
upload_cache: Cache for tracking uploaded files.
"""
config: FileResolverConfig = field(default_factory=FileResolverConfig)
upload_cache: UploadCache | None = None
_uploaders: dict[str, FileUploader] = field(default_factory=dict)
def _build_file_context(self, file: FileInput) -> FileContext:
"""Build context by reading file once.
Args:
file: The file to build context for.
Returns:
FileContext with cached metadata.
"""
content = file.read()
return FileContext(
content=content,
size=len(content),
content_hash=hashlib.sha256(content).hexdigest(),
content_type=file.content_type,
)
def resolve(self, file: FileInput, provider: str) -> ResolvedFile:
"""Resolve a file to its delivery format for a provider.
Args:
file: The file to resolve.
provider: Provider name (e.g., "gemini", "anthropic", "openai").
Returns:
ResolvedFile representing the appropriate delivery format.
"""
provider_lower = provider.lower()
constraints = get_constraints_for_provider(provider)
context = self._build_file_context(file)
should_upload = self._should_upload(
file, provider_lower, constraints, context.size
)
if should_upload:
resolved = self._resolve_via_upload(file, provider_lower, context)
if resolved is not None:
return resolved
return self._resolve_inline(file, provider_lower, context)
def resolve_files(
self,
files: dict[str, FileInput],
provider: str,
) -> dict[str, ResolvedFile]:
"""Resolve multiple files for a provider.
Args:
files: Dictionary mapping names to file inputs.
provider: Provider name.
Returns:
Dictionary mapping names to resolved files.
"""
return {name: self.resolve(file, provider) for name, file in files.items()}
def _should_upload(
self,
file: FileInput,
provider: str,
constraints: ProviderConstraints | None,
file_size: int,
) -> bool:
"""Determine if a file should be uploaded rather than inlined.
Args:
file: The file to check.
provider: Provider name.
constraints: Provider constraints.
file_size: Size of the file in bytes.
Returns:
True if the file should be uploaded, False otherwise.
"""
if constraints is None or not constraints.supports_file_upload:
return False
if self.config.prefer_upload:
return True
threshold = self.config.upload_threshold_bytes
if threshold is None and constraints is not None:
threshold = constraints.file_upload_threshold_bytes
if threshold is not None and file_size > threshold:
return True
return False
def _resolve_via_upload(
self,
file: FileInput,
provider: str,
context: FileContext,
) -> ResolvedFile | None:
"""Resolve a file by uploading it.
Args:
file: The file to upload.
provider: Provider name.
context: Pre-computed file context.
Returns:
FileReference if upload succeeds, None otherwise.
"""
if self.upload_cache is not None:
cached = self.upload_cache.get_by_hash(context.content_hash, provider)
if cached is not None:
logger.debug(
f"Using cached upload for {file.filename}: {cached.file_id}"
)
return FileReference(
content_type=cached.content_type,
file_id=cached.file_id,
provider=cached.provider,
expires_at=cached.expires_at,
file_uri=cached.file_uri,
)
uploader = self._get_uploader(provider)
if uploader is None:
logger.debug(f"No uploader available for {provider}")
return None
result = self._upload_with_retry(uploader, file, provider, context.size)
if result is None:
return None
if self.upload_cache is not None:
self.upload_cache.set_by_hash(
file_hash=context.content_hash,
content_type=context.content_type,
provider=provider,
file_id=result.file_id,
file_uri=result.file_uri,
expires_at=result.expires_at,
)
return FileReference(
content_type=result.content_type,
file_id=result.file_id,
provider=result.provider,
expires_at=result.expires_at,
file_uri=result.file_uri,
)
def _upload_with_retry(
self,
uploader: FileUploader,
file: FileInput,
provider: str,
file_size: int,
) -> UploadResult | None:
"""Upload with exponential backoff retry.
Args:
uploader: The uploader to use.
file: The file to upload.
provider: Provider name for logging.
file_size: Size of the file in bytes.
Returns:
UploadResult if successful, None otherwise.
"""
import time
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
last_error: Exception | None = None
for attempt in range(UPLOAD_MAX_RETRIES):
with measure_operation(
"upload",
filename=file.filename,
provider=provider,
size_bytes=file_size,
attempt=attempt + 1,
) as metrics:
try:
result = uploader.upload(file)
metrics.metadata["file_id"] = result.file_id
return result
except PermanentUploadError as e:
metrics.metadata["error_type"] = "permanent"
logger.warning(
f"Non-retryable upload error for {file.filename}: {e}"
)
return None
except TransientUploadError as e:
metrics.metadata["error_type"] = "transient"
last_error = e
except Exception as e:
metrics.metadata["error_type"] = "unknown"
last_error = e
if attempt < UPLOAD_MAX_RETRIES - 1:
delay = UPLOAD_RETRY_DELAY_BASE**attempt
logger.debug(
f"Retrying upload for {file.filename} in {delay}s (attempt {attempt + 1})"
)
time.sleep(delay)
logger.warning(
f"Upload failed for {file.filename} to {provider} after {UPLOAD_MAX_RETRIES} attempts: {last_error}"
)
return None
def _resolve_inline(
self,
file: FileInput,
provider: str,
context: FileContext,
) -> ResolvedFile:
"""Resolve a file as inline content.
Args:
file: The file to resolve.
provider: Provider name.
context: Pre-computed file context.
Returns:
InlineBase64 or InlineBytes depending on provider.
"""
if self.config.use_bytes_for_bedrock and "bedrock" in provider:
return InlineBytes(
content_type=context.content_type,
data=context.content,
)
encoded = base64.b64encode(context.content).decode("ascii")
return InlineBase64(
content_type=context.content_type,
data=encoded,
)
async def aresolve(self, file: FileInput, provider: str) -> ResolvedFile:
"""Async resolve a file to its delivery format for a provider.
Args:
file: The file to resolve.
provider: Provider name (e.g., "gemini", "anthropic", "openai").
Returns:
ResolvedFile representing the appropriate delivery format.
"""
provider_lower = provider.lower()
constraints = get_constraints_for_provider(provider)
context = self._build_file_context(file)
should_upload = self._should_upload(
file, provider_lower, constraints, context.size
)
if should_upload:
resolved = await self._aresolve_via_upload(file, provider_lower, context)
if resolved is not None:
return resolved
return self._resolve_inline(file, provider_lower, context)
async def aresolve_files(
self,
files: dict[str, FileInput],
provider: str,
max_concurrency: int = 10,
) -> dict[str, ResolvedFile]:
"""Async resolve multiple files in parallel.
Args:
files: Dictionary mapping names to file inputs.
provider: Provider name.
max_concurrency: Maximum number of concurrent resolutions.
Returns:
Dictionary mapping names to resolved files.
"""
semaphore = asyncio.Semaphore(max_concurrency)
async def resolve_one(name: str, file: FileInput) -> tuple[str, ResolvedFile]:
async with semaphore:
resolved = await self.aresolve(file, provider)
return name, resolved
tasks = [resolve_one(n, f) for n, f in files.items()]
results = await asyncio.gather(*tasks, return_exceptions=True)
output: dict[str, ResolvedFile] = {}
for result in results:
if isinstance(result, BaseException):
logger.error(f"Resolution failed: {result}")
continue
name, resolved = result
output[name] = resolved
return output
async def _aresolve_via_upload(
self,
file: FileInput,
provider: str,
context: FileContext,
) -> ResolvedFile | None:
"""Async resolve a file by uploading it.
Args:
file: The file to upload.
provider: Provider name.
context: Pre-computed file context.
Returns:
FileReference if upload succeeds, None otherwise.
"""
if self.upload_cache is not None:
cached = await self.upload_cache.aget_by_hash(
context.content_hash, provider
)
if cached is not None:
logger.debug(
f"Using cached upload for {file.filename}: {cached.file_id}"
)
return FileReference(
content_type=cached.content_type,
file_id=cached.file_id,
provider=cached.provider,
expires_at=cached.expires_at,
file_uri=cached.file_uri,
)
uploader = self._get_uploader(provider)
if uploader is None:
logger.debug(f"No uploader available for {provider}")
return None
result = await self._aupload_with_retry(uploader, file, provider, context.size)
if result is None:
return None
if self.upload_cache is not None:
await self.upload_cache.aset_by_hash(
file_hash=context.content_hash,
content_type=context.content_type,
provider=provider,
file_id=result.file_id,
file_uri=result.file_uri,
expires_at=result.expires_at,
)
return FileReference(
content_type=result.content_type,
file_id=result.file_id,
provider=result.provider,
expires_at=result.expires_at,
file_uri=result.file_uri,
)
async def _aupload_with_retry(
self,
uploader: FileUploader,
file: FileInput,
provider: str,
file_size: int,
) -> UploadResult | None:
"""Async upload with exponential backoff retry.
Args:
uploader: The uploader to use.
file: The file to upload.
provider: Provider name for logging.
file_size: Size of the file in bytes.
Returns:
UploadResult if successful, None otherwise.
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
last_error: Exception | None = None
for attempt in range(UPLOAD_MAX_RETRIES):
with measure_operation(
"upload",
filename=file.filename,
provider=provider,
size_bytes=file_size,
attempt=attempt + 1,
) as metrics:
try:
result = await uploader.aupload(file)
metrics.metadata["file_id"] = result.file_id
return result
except PermanentUploadError as e:
metrics.metadata["error_type"] = "permanent"
logger.warning(
f"Non-retryable upload error for {file.filename}: {e}"
)
return None
except TransientUploadError as e:
metrics.metadata["error_type"] = "transient"
last_error = e
except Exception as e:
metrics.metadata["error_type"] = "unknown"
last_error = e
if attempt < UPLOAD_MAX_RETRIES - 1:
delay = UPLOAD_RETRY_DELAY_BASE**attempt
logger.debug(
f"Retrying upload for {file.filename} in {delay}s (attempt {attempt + 1})"
)
await asyncio.sleep(delay)
logger.warning(
f"Upload failed for {file.filename} to {provider} after {UPLOAD_MAX_RETRIES} attempts: {last_error}"
)
return None
def _get_uploader(self, provider: str) -> FileUploader | None:
"""Get or create an uploader for a provider.
Args:
provider: Provider name.
Returns:
FileUploader instance or None if not available.
"""
if provider not in self._uploaders:
uploader = get_uploader(provider)
if uploader is not None:
self._uploaders[provider] = uploader
else:
return None
return self._uploaders.get(provider)
def get_cached_uploads(self, provider: str) -> list[CachedUpload]:
"""Get all cached uploads for a provider.
Args:
provider: Provider name.
Returns:
List of cached uploads.
"""
if self.upload_cache is None:
return []
return self.upload_cache.get_all_for_provider(provider)
def clear_cache(self) -> None:
"""Clear the upload cache."""
if self.upload_cache is not None:
self.upload_cache.clear()
def create_resolver(
provider: str | None = None,
prefer_upload: bool = False,
upload_threshold_bytes: int | None = None,
enable_cache: bool = True,
) -> FileResolver:
"""Create a configured FileResolver.
Args:
provider: Optional provider name for provider-specific configuration.
prefer_upload: Whether to prefer upload over inline.
upload_threshold_bytes: Size threshold for using upload.
enable_cache: Whether to enable upload caching.
Returns:
Configured FileResolver instance.
"""
config = FileResolverConfig(
prefer_upload=prefer_upload,
upload_threshold_bytes=upload_threshold_bytes,
)
cache = UploadCache() if enable_cache else None
return FileResolver(config=config, upload_cache=cache)
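A short sketch of the factory above; the provider and file are illustrative:

from pathlib import Path

from crewai.files import (
    FilePath,
    FileReference,
    ImageFile,
    InlineBase64,
    create_resolver,
)

resolver = create_resolver(prefer_upload=False, upload_threshold_bytes=5 * 1024 * 1024)
image = ImageFile(source=FilePath(path=Path("charts/revenue.png")))

resolved = resolver.resolve(image, "gemini")
if isinstance(resolved, FileReference):
    print("uploaded as", resolved.file_id)  # large files go through the provider's file API
elif isinstance(resolved, InlineBase64):
    print("inlined,", len(resolved.data), "base64 chars")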

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
import asyncio
import atexit
import builtins
from collections.abc import Iterator
from dataclasses import dataclass
from datetime import datetime, timezone
import hashlib
@@ -16,7 +17,7 @@ from aiocache.serializers import PickleSerializer # type: ignore[import-untyped
if TYPE_CHECKING:
from crewai.utilities.files.content_types import (
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
@@ -31,6 +32,7 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
DEFAULT_TTL_SECONDS = 24 * 60 * 60 # 24 hours
DEFAULT_MAX_CACHE_ENTRIES = 1000
@dataclass
@@ -65,8 +67,31 @@ def _make_key(file_hash: str, provider: str) -> str:
return f"upload:{provider}:{file_hash}"
def _compute_file_hash_streaming(chunks: Iterator[bytes]) -> str:
"""Compute SHA-256 hash from streaming chunks.
Args:
chunks: Iterator of byte chunks.
Returns:
Hexadecimal hash string.
"""
hasher = hashlib.sha256()
for chunk in chunks:
hasher.update(chunk)
return hasher.hexdigest()
def _compute_file_hash(file: FileInput) -> str:
"""Compute SHA-256 hash of file content."""
"""Compute SHA-256 hash of file content.
Uses streaming for FilePath sources to avoid loading large files into memory.
"""
from crewai.files.file import FilePath
source = file._file_source
if isinstance(source, FilePath):
return _compute_file_hash_streaming(source.read_chunks(chunk_size=1024 * 1024))
content = file.read()
return hashlib.sha256(content).hexdigest()
@@ -87,6 +112,7 @@ class UploadCache:
ttl: int = DEFAULT_TTL_SECONDS,
namespace: str = "crewai_uploads",
cache_type: str = "memory",
max_entries: int | None = DEFAULT_MAX_CACHE_ENTRIES,
**cache_kwargs: Any,
) -> None:
"""Initialize the upload cache.
@@ -95,11 +121,14 @@ class UploadCache:
ttl: Default TTL in seconds.
namespace: Cache namespace.
cache_type: Backend type ("memory" or "redis").
max_entries: Maximum cache entries (None for unlimited).
**cache_kwargs: Additional args for cache backend.
"""
self.ttl = ttl
self.namespace = namespace
self.max_entries = max_entries
self._provider_keys: dict[str, set[str]] = {}
self._key_access_order: list[str] = []
if cache_type == "redis":
self._cache = Cache(
@@ -116,15 +145,60 @@ class UploadCache:
)
def _track_key(self, provider: str, key: str) -> None:
"""Track a key for a provider (for cleanup)."""
"""Track a key for a provider (for cleanup) and access order."""
if provider not in self._provider_keys:
self._provider_keys[provider] = set()
self._provider_keys[provider].add(key)
if key in self._key_access_order:
self._key_access_order.remove(key)
self._key_access_order.append(key)
def _untrack_key(self, provider: str, key: str) -> None:
"""Remove key tracking for a provider."""
if provider in self._provider_keys:
self._provider_keys[provider].discard(key)
if key in self._key_access_order:
self._key_access_order.remove(key)
async def _evict_if_needed(self) -> int:
"""Evict oldest entries if limit exceeded.
Returns:
Number of entries evicted.
"""
if self.max_entries is None:
return 0
current_count = len(self)
if current_count < self.max_entries:
return 0
to_evict = max(1, self.max_entries // 10)
return await self._evict_oldest(to_evict)
async def _evict_oldest(self, count: int) -> int:
"""Evict the oldest entries from the cache.
Args:
count: Number of entries to evict.
Returns:
Number of entries actually evicted.
"""
evicted = 0
keys_to_evict = self._key_access_order[:count]
for key in keys_to_evict:
await self._cache.delete(key)
self._key_access_order.remove(key)
for provider_keys in self._provider_keys.values():
provider_keys.discard(key)
evicted += 1
if evicted > 0:
logger.debug(f"Evicted {evicted} oldest cache entries")
return evicted
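A worked example of the eviction sizing, assuming the default DEFAULT_MAX_CACHE_ENTRIES of 1000 shown earlier in this file: once the cache holds 1000 entries, the next insert evicts max(1, 1000 // 10) = 100 of the least-recently-touched keys before adding the new one.

# Illustration of the batch size computed by _evict_if_needed for the
# default limit; not part of the cache implementation itself.
max_entries = 1000
to_evict = max(1, max_entries // 10)
assert to_evict == 100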
async def aget(self, file: FileInput, provider: str) -> CachedUpload | None:
"""Get a cached upload for a file.
@@ -214,6 +288,8 @@ class UploadCache:
Returns:
The created cache entry.
"""
await self._evict_if_needed()
key = _make_key(file_hash, provider)
now = datetime.now(timezone.utc)
@@ -331,18 +407,15 @@ class UploadCache:
return results
def _run_sync(self, coro: Any) -> Any:
"""Run an async coroutine from sync context."""
"""Run an async coroutine from sync context without blocking event loop."""
try:
loop = asyncio.get_running_loop()
except RuntimeError:
loop = None
if loop is not None and loop.is_running():
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor() as pool:
future = pool.submit(asyncio.run, coro)
return future.result()
future = asyncio.run_coroutine_threadsafe(coro, loop)
return future.result(timeout=30)
return asyncio.run(coro)
def get(self, file: FileInput, provider: str) -> CachedUpload | None:
@@ -473,7 +546,7 @@ def _cleanup_on_exit() -> None:
if _default_cache is None or len(_default_cache) == 0:
return
from crewai.utilities.files.cleanup import cleanup_uploaded_files
from crewai.files.cleanup import cleanup_uploaded_files
try:
cleanup_uploaded_files(_default_cache, delete_from_provider=True)

View File

@@ -5,7 +5,7 @@ from __future__ import annotations
import logging
from typing import Any
from crewai.utilities.files.uploaders.base import FileUploader, UploadResult
from crewai.files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
@@ -31,7 +31,7 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
if "gemini" in provider_lower or "google" in provider_lower:
try:
from crewai.utilities.files.uploaders.gemini import GeminiFileUploader
from crewai.files.uploaders.gemini import GeminiFileUploader
return GeminiFileUploader(**kwargs)
except ImportError:
@@ -42,7 +42,7 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
if "anthropic" in provider_lower or "claude" in provider_lower:
try:
from crewai.utilities.files.uploaders.anthropic import AnthropicFileUploader
from crewai.files.uploaders.anthropic import AnthropicFileUploader
return AnthropicFileUploader(**kwargs)
except ImportError:
@@ -53,12 +53,32 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
if "openai" in provider_lower or "gpt" in provider_lower:
try:
from crewai.utilities.files.uploaders.openai import OpenAIFileUploader
from crewai.files.uploaders.openai import OpenAIFileUploader
return OpenAIFileUploader(**kwargs)
except ImportError:
logger.warning("openai not installed. Install with: pip install openai")
return None
if "bedrock" in provider_lower or "aws" in provider_lower:
import os
if (
not os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
and "bucket_name" not in kwargs
):
logger.debug(
"Bedrock S3 uploader not configured. "
"Set CREWAI_BEDROCK_S3_BUCKET environment variable to enable."
)
return None
try:
from crewai.files.uploaders.bedrock import BedrockFileUploader
return BedrockFileUploader(**kwargs)
except ImportError:
logger.warning("boto3 not installed. Install with: pip install boto3")
return None
logger.debug(f"No file uploader available for provider: {provider}")
return None
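A hedged example of the registry above. The module path of get_uploader is not shown in this hunk, so the import is an assumption; the provider matching itself ("gemini"/"google", "anthropic"/"claude", "openai"/"gpt", "bedrock"/"aws") comes from the code.

# Sketch: import path assumed, since the diff omits the registry module's
# file name. get_uploader returns None when the SDK is not installed or,
# for Bedrock, when CREWAI_BEDROCK_S3_BUCKET is not configured.
from crewai.files import get_uploader  # assumed re-export

uploader = get_uploader("claude-sonnet-4")  # matches the "claude" branch
if uploader is None:
    print("No uploader available; files will be inlined instead.")
else:
    print(f"Using {uploader.provider_name} uploader")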

View File

@@ -0,0 +1,320 @@
"""Anthropic Files API uploader implementation."""
from __future__ import annotations
import io
import logging
import os
from typing import Any
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
class AnthropicFileUploader(FileUploader):
"""Uploader for Anthropic Files API.
Uses the anthropic SDK to upload files. Files are stored persistently
until explicitly deleted.
Attributes:
api_key: Optional API key (uses ANTHROPIC_API_KEY env var if not provided).
"""
def __init__(self, api_key: str | None = None) -> None:
"""Initialize the Anthropic uploader.
Args:
api_key: Optional Anthropic API key. If not provided, uses
ANTHROPIC_API_KEY environment variable.
"""
self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
self._client: Any = None
self._async_client: Any = None
@property
def provider_name(self) -> str:
"""Return the provider name."""
return "anthropic"
def _get_client(self) -> Any:
"""Get or create the Anthropic client."""
if self._client is None:
try:
import anthropic
self._client = anthropic.Anthropic(api_key=self._api_key)
except ImportError as e:
raise ImportError(
"anthropic is required for Anthropic file uploads. "
"Install with: pip install anthropic"
) from e
return self._client
def _get_async_client(self) -> Any:
"""Get or create the async Anthropic client."""
if self._async_client is None:
try:
import anthropic
self._async_client = anthropic.AsyncAnthropic(api_key=self._api_key)
except ImportError as e:
raise ImportError(
"anthropic is required for Anthropic file uploads. "
"Install with: pip install anthropic"
) from e
return self._async_client
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
"""Upload a file to Anthropic.
Args:
file: The file to upload.
purpose: Optional purpose for the file (default: "user_upload").
Returns:
UploadResult with the file ID and metadata.
Raises:
TransientUploadError: For retryable errors (network, rate limits).
PermanentUploadError: For non-retryable errors (auth, validation).
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
try:
client = self._get_client()
content = file.read()
file_purpose = purpose or "user_upload"
file_data = io.BytesIO(content)
logger.info(
f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)"
)
uploaded_file = client.files.create(
file=(file.filename, file_data, file.content_type),
purpose=file_purpose,
)
logger.info(f"Uploaded to Anthropic: {uploaded_file.id}")
return UploadResult(
file_id=uploaded_file.id,
file_uri=None,
content_type=file.content_type,
expires_at=None,
provider=self.provider_name,
)
except ImportError:
raise
except Exception as e:
error_type = type(e).__name__
if "RateLimit" in error_type or "APIConnection" in error_type:
raise TransientUploadError(
f"Transient upload error: {e}", file_name=file.filename
) from e
if "Authentication" in error_type or "Permission" in error_type:
raise PermanentUploadError(
f"Authentication/permission error: {e}", file_name=file.filename
) from e
if "BadRequest" in error_type or "InvalidRequest" in error_type:
raise PermanentUploadError(
f"Invalid request: {e}", file_name=file.filename
) from e
status_code = getattr(e, "status_code", None)
if status_code is not None:
if status_code >= 500 or status_code == 429:
raise TransientUploadError(
f"Server error ({status_code}): {e}", file_name=file.filename
) from e
if status_code in (401, 403):
raise PermanentUploadError(
f"Auth error ({status_code}): {e}", file_name=file.filename
) from e
if status_code == 400:
raise PermanentUploadError(
f"Bad request ({status_code}): {e}", file_name=file.filename
) from e
raise TransientUploadError(
f"Upload failed: {e}", file_name=file.filename
) from e
def delete(self, file_id: str) -> bool:
"""Delete an uploaded file from Anthropic.
Args:
file_id: The file ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_client()
client.files.delete(file_id=file_id)
logger.info(f"Deleted Anthropic file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete Anthropic file {file_id}: {e}")
return False
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
"""Get information about an uploaded file.
Args:
file_id: The file ID.
Returns:
Dictionary with file information, or None if not found.
"""
try:
client = self._get_client()
file_info = client.files.retrieve(file_id=file_id)
return {
"id": file_info.id,
"filename": file_info.filename,
"purpose": file_info.purpose,
"size_bytes": file_info.size_bytes,
"created_at": file_info.created_at,
}
except Exception as e:
logger.debug(f"Failed to get Anthropic file info for {file_id}: {e}")
return None
def list_files(self) -> list[dict[str, Any]]:
"""List all uploaded files.
Returns:
List of dictionaries with file information.
"""
try:
client = self._get_client()
files = client.files.list()
return [
{
"id": f.id,
"filename": f.filename,
"purpose": f.purpose,
"size_bytes": f.size_bytes,
"created_at": f.created_at,
}
for f in files.data
]
except Exception as e:
logger.warning(f"Failed to list Anthropic files: {e}")
return []
async def aupload(
self, file: FileInput, purpose: str | None = None
) -> UploadResult:
"""Async upload a file to Anthropic using native async client.
Args:
file: The file to upload.
purpose: Optional purpose for the file (default: "user_upload").
Returns:
UploadResult with the file ID and metadata.
Raises:
TransientUploadError: For retryable errors (network, rate limits).
PermanentUploadError: For non-retryable errors (auth, validation).
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
try:
client = self._get_async_client()
content = await file.aread()
file_purpose = purpose or "user_upload"
file_data = io.BytesIO(content)
logger.info(
f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)"
)
uploaded_file = await client.files.create(
file=(file.filename, file_data, file.content_type),
purpose=file_purpose,
)
logger.info(f"Uploaded to Anthropic: {uploaded_file.id}")
return UploadResult(
file_id=uploaded_file.id,
file_uri=None,
content_type=file.content_type,
expires_at=None,
provider=self.provider_name,
)
except ImportError:
raise
except Exception as e:
error_type = type(e).__name__
if "RateLimit" in error_type or "APIConnection" in error_type:
raise TransientUploadError(
f"Transient upload error: {e}", file_name=file.filename
) from e
if "Authentication" in error_type or "Permission" in error_type:
raise PermanentUploadError(
f"Authentication/permission error: {e}", file_name=file.filename
) from e
if "BadRequest" in error_type or "InvalidRequest" in error_type:
raise PermanentUploadError(
f"Invalid request: {e}", file_name=file.filename
) from e
status_code = getattr(e, "status_code", None)
if status_code is not None:
if status_code >= 500 or status_code == 429:
raise TransientUploadError(
f"Server error ({status_code}): {e}", file_name=file.filename
) from e
if status_code in (401, 403):
raise PermanentUploadError(
f"Auth error ({status_code}): {e}", file_name=file.filename
) from e
if status_code == 400:
raise PermanentUploadError(
f"Bad request ({status_code}): {e}", file_name=file.filename
) from e
raise TransientUploadError(
f"Upload failed: {e}", file_name=file.filename
) from e
async def adelete(self, file_id: str) -> bool:
"""Async delete an uploaded file from Anthropic.
Args:
file_id: The file ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_async_client()
await client.files.delete(file_id=file_id)
logger.info(f"Deleted Anthropic file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete Anthropic file {file_id}: {e}")
return False
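A usage sketch for the uploader above. The file argument stands in for any FileInput (how FileInput instances are constructed is not shown in this diff), and the API key is read from ANTHROPIC_API_KEY as documented.

# Sketch assuming `report` is an existing FileInput (e.g. a PDFFile).
from __future__ import annotations

from crewai.files import FileInput
from crewai.files.processing.exceptions import (
    PermanentUploadError,
    TransientUploadError,
)
from crewai.files.uploaders.anthropic import AnthropicFileUploader


def upload_and_cleanup(report: FileInput) -> str | None:
    """Upload one file, print its ID, then delete it again."""
    uploader = AnthropicFileUploader()  # reads ANTHROPIC_API_KEY from the environment
    try:
        result = uploader.upload(report, purpose="user_upload")
    except TransientUploadError:
        return None  # network / rate-limit problem: safe to retry later
    except PermanentUploadError:
        return None  # auth or validation problem: do not retry
    print(result.file_id, result.provider)  # e.g. "file_abc123", "anthropic"
    uploader.delete(result.file_id)  # Anthropic files persist until deleted
    return result.file_id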

View File

@@ -1,11 +1,12 @@
"""Base class for file uploaders."""
from abc import ABC, abstractmethod
import asyncio
from dataclasses import dataclass
from datetime import datetime
from typing import Any
from crewai.utilities.files.content_types import (
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
@@ -63,6 +64,24 @@ class FileUploader(ABC):
Exception: If upload fails.
"""
async def aupload(
self, file: FileInput, purpose: str | None = None
) -> UploadResult:
"""Async upload a file to the provider.
Default implementation runs sync upload in executor.
Override in subclasses for native async support.
Args:
file: The file to upload.
purpose: Optional purpose/description for the upload.
Returns:
UploadResult with the file identifier and metadata.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.upload, file, purpose)
@abstractmethod
def delete(self, file_id: str) -> bool:
"""Delete an uploaded file.
@@ -74,6 +93,21 @@ class FileUploader(ABC):
True if deletion was successful, False otherwise.
"""
async def adelete(self, file_id: str) -> bool:
"""Async delete an uploaded file.
Default implementation runs sync delete in executor.
Override in subclasses for native async support.
Args:
file_id: The file identifier to delete.
Returns:
True if deletion was successful, False otherwise.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.delete, file_id)
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
"""Get information about an uploaded file.

View File

@@ -0,0 +1,388 @@
"""AWS Bedrock S3 file uploader implementation."""
from __future__ import annotations
import hashlib
import logging
import os
from typing import Any
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
class BedrockFileUploader(FileUploader):
"""Uploader for AWS Bedrock via S3.
Uploads files to S3 and returns S3 URIs that can be used with Bedrock's
Converse API s3Location source format.
Attributes:
bucket_name: S3 bucket name for file uploads.
bucket_owner: Optional bucket owner account ID for cross-account access.
prefix: Optional S3 key prefix for uploaded files.
region: AWS region for the S3 bucket.
"""
def __init__(
self,
bucket_name: str | None = None,
bucket_owner: str | None = None,
prefix: str = "crewai-files",
region: str | None = None,
) -> None:
"""Initialize the Bedrock S3 uploader.
Args:
bucket_name: S3 bucket name. If not provided, uses
CREWAI_BEDROCK_S3_BUCKET environment variable.
bucket_owner: Optional bucket owner account ID for cross-account access.
Uses CREWAI_BEDROCK_S3_BUCKET_OWNER environment variable if not provided.
prefix: S3 key prefix for uploaded files (default: "crewai-files").
region: AWS region. Uses AWS_REGION or AWS_DEFAULT_REGION if not provided.
"""
self._bucket_name = bucket_name or os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
self._bucket_owner = bucket_owner or os.environ.get(
"CREWAI_BEDROCK_S3_BUCKET_OWNER"
)
self._prefix = prefix
self._region = region or os.environ.get(
"AWS_REGION", os.environ.get("AWS_DEFAULT_REGION")
)
self._client: Any = None
self._async_client: Any = None
@property
def provider_name(self) -> str:
"""Return the provider name."""
return "bedrock"
@property
def bucket_name(self) -> str:
"""Return the configured bucket name."""
if not self._bucket_name:
raise ValueError(
"S3 bucket name not configured. Set CREWAI_BEDROCK_S3_BUCKET "
"environment variable or pass bucket_name parameter."
)
return self._bucket_name
@property
def bucket_owner(self) -> str | None:
"""Return the configured bucket owner."""
return self._bucket_owner
def _get_client(self) -> Any:
"""Get or create the S3 client."""
if self._client is None:
try:
import boto3
self._client = boto3.client("s3", region_name=self._region)
except ImportError as e:
raise ImportError(
"boto3 is required for Bedrock S3 file uploads. "
"Install with: pip install boto3"
) from e
return self._client
def _get_async_client(self) -> Any:
"""Get or create the async aioboto3 session used for S3 calls."""
if self._async_client is None:
try:
import aioboto3  # type: ignore[import-not-found]
self._async_client = aioboto3.Session()
except ImportError as e:
raise ImportError(
"aioboto3 is required for async Bedrock S3 file uploads. "
"Install with: pip install aioboto3"
) from e
return self._async_client
def _generate_s3_key(self, file: FileInput, content: bytes) -> str:
"""Generate a unique S3 key for the file.
Args:
file: The file being uploaded.
content: The file content bytes.
Returns:
S3 key string.
"""
content_hash = hashlib.sha256(content).hexdigest()[:16]
filename = file.filename or "file"
safe_filename = "".join(
c if c.isalnum() or c in ".-_" else "_" for c in filename
)
return f"{self._prefix}/{content_hash}_{safe_filename}"
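A worked example of the key scheme above, assuming the default "crewai-files" prefix: the key is the first 16 hex characters of the content's SHA-256, an underscore, then the filename with every character outside alphanumerics and ".-_" replaced by "_".

import hashlib

# Re-derives the key format used by _generate_s3_key, for illustration only.
content = b"example bytes"
filename = "Q3 report (final).pdf"
content_hash = hashlib.sha256(content).hexdigest()[:16]
safe_filename = "".join(c if c.isalnum() or c in ".-_" else "_" for c in filename)
print(f"crewai-files/{content_hash}_{safe_filename}")
# -> crewai-files/<16 hex chars>_Q3_report__final_.pdf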
def _build_s3_uri(self, key: str) -> str:
"""Build an S3 URI from a key.
Args:
key: The S3 object key.
Returns:
S3 URI string.
"""
return f"s3://{self.bucket_name}/{key}"
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
"""Upload a file to S3 for use with Bedrock.
Args:
file: The file to upload.
purpose: Optional purpose (unused, kept for interface consistency).
Returns:
UploadResult with the S3 URI and metadata.
Raises:
TransientUploadError: For retryable errors (network, throttling).
PermanentUploadError: For non-retryable errors (auth, validation).
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
try:
client = self._get_client()
content = file.read()
s3_key = self._generate_s3_key(file, content)
logger.info(
f"Uploading file '{file.filename}' to S3 bucket "
f"'{self.bucket_name}' ({len(content)} bytes)"
)
client.put_object(
Bucket=self.bucket_name,
Key=s3_key,
Body=content,
ContentType=file.content_type,
)
s3_uri = self._build_s3_uri(s3_key)
logger.info(f"Uploaded to S3: {s3_uri}")
return UploadResult(
file_id=s3_key,
file_uri=s3_uri,
content_type=file.content_type,
expires_at=None,
provider=self.provider_name,
)
except ImportError:
raise
except Exception as e:
error_type = type(e).__name__
error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "")
if error_code in ("SlowDown", "ServiceUnavailable", "InternalError"):
raise TransientUploadError(
f"Transient S3 error: {e}", file_name=file.filename
) from e
if error_code in (
"AccessDenied",
"InvalidAccessKeyId",
"SignatureDoesNotMatch",
):
raise PermanentUploadError(
f"S3 authentication error: {e}", file_name=file.filename
) from e
if error_code in ("NoSuchBucket", "InvalidBucketName"):
raise PermanentUploadError(
f"S3 bucket error: {e}", file_name=file.filename
) from e
if "Throttl" in error_type or "Throttl" in str(e):
raise TransientUploadError(
f"S3 throttling: {e}", file_name=file.filename
) from e
raise TransientUploadError(
f"S3 upload failed: {e}", file_name=file.filename
) from e
def delete(self, file_id: str) -> bool:
"""Delete an uploaded file from S3.
Args:
file_id: The S3 key to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_client()
client.delete_object(Bucket=self.bucket_name, Key=file_id)
logger.info(f"Deleted S3 object: s3://{self.bucket_name}/{file_id}")
return True
except Exception as e:
logger.warning(
f"Failed to delete S3 object s3://{self.bucket_name}/{file_id}: {e}"
)
return False
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
"""Get information about an uploaded file.
Args:
file_id: The S3 key.
Returns:
Dictionary with file information, or None if not found.
"""
try:
client = self._get_client()
response = client.head_object(Bucket=self.bucket_name, Key=file_id)
return {
"id": file_id,
"uri": self._build_s3_uri(file_id),
"content_type": response.get("ContentType"),
"size": response.get("ContentLength"),
"last_modified": response.get("LastModified"),
"etag": response.get("ETag"),
}
except Exception as e:
logger.debug(f"Failed to get S3 object info for {file_id}: {e}")
return None
def list_files(self) -> list[dict[str, Any]]:
"""List all uploaded files in the configured prefix.
Returns:
List of dictionaries with file information.
"""
try:
client = self._get_client()
response = client.list_objects_v2(
Bucket=self.bucket_name,
Prefix=self._prefix,
)
return [
{
"id": obj["Key"],
"uri": self._build_s3_uri(obj["Key"]),
"size": obj.get("Size"),
"last_modified": obj.get("LastModified"),
"etag": obj.get("ETag"),
}
for obj in response.get("Contents", [])
]
except Exception as e:
logger.warning(f"Failed to list S3 objects: {e}")
return []
async def aupload(
self, file: FileInput, purpose: str | None = None
) -> UploadResult:
"""Async upload a file to S3 for use with Bedrock.
Args:
file: The file to upload.
purpose: Optional purpose (unused, kept for interface consistency).
Returns:
UploadResult with the S3 URI and metadata.
Raises:
TransientUploadError: For retryable errors (network, throttling).
PermanentUploadError: For non-retryable errors (auth, validation).
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
try:
session = self._get_async_client()
content = await file.aread()
s3_key = self._generate_s3_key(file, content)
logger.info(
f"Uploading file '{file.filename}' to S3 bucket "
f"'{self.bucket_name}' ({len(content)} bytes)"
)
async with session.client("s3", region_name=self._region) as client:
await client.put_object(
Bucket=self.bucket_name,
Key=s3_key,
Body=content,
ContentType=file.content_type,
)
s3_uri = self._build_s3_uri(s3_key)
logger.info(f"Uploaded to S3: {s3_uri}")
return UploadResult(
file_id=s3_key,
file_uri=s3_uri,
content_type=file.content_type,
expires_at=None,
provider=self.provider_name,
)
except ImportError:
raise
except Exception as e:
error_type = type(e).__name__
error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "")
if error_code in ("SlowDown", "ServiceUnavailable", "InternalError"):
raise TransientUploadError(
f"Transient S3 error: {e}", file_name=file.filename
) from e
if error_code in (
"AccessDenied",
"InvalidAccessKeyId",
"SignatureDoesNotMatch",
):
raise PermanentUploadError(
f"S3 authentication error: {e}", file_name=file.filename
) from e
if error_code in ("NoSuchBucket", "InvalidBucketName"):
raise PermanentUploadError(
f"S3 bucket error: {e}", file_name=file.filename
) from e
if "Throttl" in error_type or "Throttl" in str(e):
raise TransientUploadError(
f"S3 throttling: {e}", file_name=file.filename
) from e
raise TransientUploadError(
f"S3 upload failed: {e}", file_name=file.filename
) from e
async def adelete(self, file_id: str) -> bool:
"""Async delete an uploaded file from S3.
Args:
file_id: The S3 key to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
session = self._get_async_client()
async with session.client("s3", region_name=self._region) as client:
await client.delete_object(Bucket=self.bucket_name, Key=file_id)
logger.info(f"Deleted S3 object: s3://{self.bucket_name}/{file_id}")
return True
except Exception as e:
logger.warning(
f"Failed to delete S3 object s3://{self.bucket_name}/{file_id}: {e}"
)
return False
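A configuration sketch for the S3-backed uploader above. The bucket name and region are placeholders; only the constructor parameters and the s3://bucket/key URI shape come from the code.

import os

from crewai.files.uploaders.bedrock import BedrockFileUploader

# Either export CREWAI_BEDROCK_S3_BUCKET (and optionally
# CREWAI_BEDROCK_S3_BUCKET_OWNER) or pass the values explicitly.
os.environ.setdefault("CREWAI_BEDROCK_S3_BUCKET", "my-crewai-bucket")  # placeholder

uploader = BedrockFileUploader(
    prefix="crewai-files",  # default key prefix
    region="us-east-1",     # placeholder; falls back to AWS_REGION / AWS_DEFAULT_REGION
)
print(uploader.bucket_name)  # raises ValueError if no bucket is configured
# upload() returns an UploadResult whose file_uri looks like
# s3://my-crewai-bucket/crewai-files/<hash>_<filename>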

View File

@@ -0,0 +1,444 @@
"""Gemini File API uploader implementation."""
from __future__ import annotations
import asyncio
from datetime import datetime, timedelta, timezone
import io
import logging
import os
import random
import time
from typing import Any
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
GEMINI_FILE_TTL = timedelta(hours=48)
class GeminiFileUploader(FileUploader):
"""Uploader for Google Gemini File API.
Uses the google-genai SDK to upload files. Files are stored for 48 hours.
Attributes:
api_key: Optional API key (uses GOOGLE_API_KEY env var if not provided).
"""
def __init__(self, api_key: str | None = None) -> None:
"""Initialize the Gemini uploader.
Args:
api_key: Optional Google API key. If not provided, uses
GOOGLE_API_KEY environment variable.
"""
self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
self._client: Any = None
@property
def provider_name(self) -> str:
"""Return the provider name."""
return "gemini"
def _get_client(self) -> Any:
"""Get or create the Gemini client."""
if self._client is None:
try:
from google import genai
self._client = genai.Client(api_key=self._api_key)
except ImportError as e:
raise ImportError(
"google-genai is required for Gemini file uploads. "
"Install with: pip install google-genai"
) from e
return self._client
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
"""Upload a file to Gemini.
Args:
file: The file to upload.
purpose: Optional purpose/description (used as display name).
Returns:
UploadResult with the file URI and metadata.
Raises:
TransientUploadError: For retryable errors (network, rate limits).
PermanentUploadError: For non-retryable errors (auth, validation).
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
try:
client = self._get_client()
content = file.read()
display_name = purpose or file.filename
file_data = io.BytesIO(content)
file_data.name = file.filename
logger.info(
f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)"
)
uploaded_file = client.files.upload(
file=file_data,
config={
"display_name": display_name,
"mime_type": file.content_type,
},
)
if file.content_type.startswith("video/"):
if not self.wait_for_processing(uploaded_file.name):
raise PermanentUploadError(
f"Video processing failed for {file.filename}",
file_name=file.filename,
)
expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL
logger.info(
f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})"
)
return UploadResult(
file_id=uploaded_file.name,
file_uri=uploaded_file.uri,
content_type=file.content_type,
expires_at=expires_at,
provider=self.provider_name,
)
except ImportError:
raise
except (TransientUploadError, PermanentUploadError):
raise
except Exception as e:
error_msg = str(e).lower()
if "quota" in error_msg or "rate" in error_msg or "limit" in error_msg:
raise TransientUploadError(
f"Rate limit error: {e}", file_name=file.filename
) from e
if (
"auth" in error_msg
or "permission" in error_msg
or "denied" in error_msg
):
raise PermanentUploadError(
f"Authentication/permission error: {e}", file_name=file.filename
) from e
if "invalid" in error_msg or "unsupported" in error_msg:
raise PermanentUploadError(
f"Invalid request: {e}", file_name=file.filename
) from e
status_code = getattr(e, "code", None) or getattr(e, "status_code", None)
if status_code is not None:
if isinstance(status_code, int):
if status_code >= 500 or status_code == 429:
raise TransientUploadError(
f"Server error ({status_code}): {e}",
file_name=file.filename,
) from e
if status_code in (401, 403):
raise PermanentUploadError(
f"Auth error ({status_code}): {e}", file_name=file.filename
) from e
if status_code == 400:
raise PermanentUploadError(
f"Bad request ({status_code}): {e}", file_name=file.filename
) from e
raise TransientUploadError(
f"Upload failed: {e}", file_name=file.filename
) from e
async def aupload(
self, file: FileInput, purpose: str | None = None
) -> UploadResult:
"""Async upload a file to Gemini using native async client.
Uses async wait_for_processing for video files.
Args:
file: The file to upload.
purpose: Optional purpose/description (used as display name).
Returns:
UploadResult with the file URI and metadata.
Raises:
TransientUploadError: For retryable errors (network, rate limits).
PermanentUploadError: For non-retryable errors (auth, validation).
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
try:
client = self._get_client()
content = await file.aread()
display_name = purpose or file.filename
file_data = io.BytesIO(content)
file_data.name = file.filename
logger.info(
f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)"
)
uploaded_file = await client.aio.files.upload(
file=file_data,
config={
"display_name": display_name,
"mime_type": file.content_type,
},
)
if file.content_type.startswith("video/"):
if not await self.await_for_processing(uploaded_file.name):
raise PermanentUploadError(
f"Video processing failed for {file.filename}",
file_name=file.filename,
)
expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL
logger.info(
f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})"
)
return UploadResult(
file_id=uploaded_file.name,
file_uri=uploaded_file.uri,
content_type=file.content_type,
expires_at=expires_at,
provider=self.provider_name,
)
except ImportError:
raise
except (TransientUploadError, PermanentUploadError):
raise
except Exception as e:
error_msg = str(e).lower()
if "quota" in error_msg or "rate" in error_msg or "limit" in error_msg:
raise TransientUploadError(
f"Rate limit error: {e}", file_name=file.filename
) from e
if (
"auth" in error_msg
or "permission" in error_msg
or "denied" in error_msg
):
raise PermanentUploadError(
f"Authentication/permission error: {e}", file_name=file.filename
) from e
if "invalid" in error_msg or "unsupported" in error_msg:
raise PermanentUploadError(
f"Invalid request: {e}", file_name=file.filename
) from e
status_code = getattr(e, "code", None) or getattr(e, "status_code", None)
if status_code is not None and isinstance(status_code, int):
if status_code >= 500 or status_code == 429:
raise TransientUploadError(
f"Server error ({status_code}): {e}", file_name=file.filename
) from e
if status_code in (401, 403):
raise PermanentUploadError(
f"Auth error ({status_code}): {e}", file_name=file.filename
) from e
if status_code == 400:
raise PermanentUploadError(
f"Bad request ({status_code}): {e}", file_name=file.filename
) from e
raise TransientUploadError(
f"Upload failed: {e}", file_name=file.filename
) from e
def delete(self, file_id: str) -> bool:
"""Delete an uploaded file from Gemini.
Args:
file_id: The file name/ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_client()
client.files.delete(name=file_id)
logger.info(f"Deleted Gemini file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete Gemini file {file_id}: {e}")
return False
async def adelete(self, file_id: str) -> bool:
"""Async delete an uploaded file from Gemini.
Args:
file_id: The file name/ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_client()
await client.aio.files.delete(name=file_id)
logger.info(f"Deleted Gemini file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete Gemini file {file_id}: {e}")
return False
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
"""Get information about an uploaded file.
Args:
file_id: The file name/ID.
Returns:
Dictionary with file information, or None if not found.
"""
try:
client = self._get_client()
file_info = client.files.get(name=file_id)
return {
"name": file_info.name,
"uri": file_info.uri,
"display_name": file_info.display_name,
"mime_type": file_info.mime_type,
"size_bytes": file_info.size_bytes,
"state": str(file_info.state),
"create_time": file_info.create_time,
"expiration_time": file_info.expiration_time,
}
except Exception as e:
logger.debug(f"Failed to get Gemini file info for {file_id}: {e}")
return None
def list_files(self) -> list[dict[str, Any]]:
"""List all uploaded files.
Returns:
List of dictionaries with file information.
"""
try:
client = self._get_client()
files = client.files.list()
return [
{
"name": f.name,
"uri": f.uri,
"display_name": f.display_name,
"mime_type": f.mime_type,
"size_bytes": f.size_bytes,
"state": str(f.state),
}
for f in files
]
except Exception as e:
logger.warning(f"Failed to list Gemini files: {e}")
return []
def wait_for_processing(self, file_id: str, timeout_seconds: int = 300) -> bool:
"""Wait for a file to finish processing with exponential backoff.
Some files (especially videos) need time to process after upload.
Args:
file_id: The file name/ID.
timeout_seconds: Maximum time to wait.
Returns:
True if processing completed, False if timed out or failed.
"""
try:
from google.genai.types import FileState
except ImportError:
return True
client = self._get_client()
start_time = time.time()
base_delay = 1.0
max_delay = 30.0
attempt = 0
while time.time() - start_time < timeout_seconds:
file_info = client.files.get(name=file_id)
if file_info.state == FileState.ACTIVE:
return True
if file_info.state == FileState.FAILED:
logger.error(f"Gemini file processing failed: {file_id}")
return False
delay = min(base_delay * (2**attempt), max_delay)
jitter = random.uniform(0, delay * 0.1) # noqa: S311
time.sleep(delay + jitter)
attempt += 1
logger.warning(f"Timed out waiting for Gemini file processing: {file_id}")
return False
async def await_for_processing(
self, file_id: str, timeout_seconds: int = 300
) -> bool:
"""Async wait for a file to finish processing with exponential backoff.
Some files (especially videos) need time to process after upload.
Args:
file_id: The file name/ID.
timeout_seconds: Maximum time to wait.
Returns:
True if processing completed, False if timed out or failed.
"""
try:
from google.genai.types import FileState
except ImportError:
return True
client = self._get_client()
start_time = time.time()
base_delay = 1.0
max_delay = 30.0
attempt = 0
while time.time() - start_time < timeout_seconds:
file_info = await client.aio.files.get(name=file_id)
if file_info.state == FileState.ACTIVE:
return True
if file_info.state == FileState.FAILED:
logger.error(f"Gemini file processing failed: {file_id}")
return False
delay = min(base_delay * (2**attempt), max_delay)
jitter = random.uniform(0, delay * 0.1) # noqa: S311
await asyncio.sleep(delay + jitter)
attempt += 1
logger.warning(f"Timed out waiting for Gemini file processing: {file_id}")
return False
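A usage sketch for the Gemini uploader above. Files expire after 48 hours (GEMINI_FILE_TTL), and for video content types upload() blocks in wait_for_processing until the file reaches ACTIVE state; the video argument stands in for a FileInput whose construction is not shown in this diff.

from __future__ import annotations

from crewai.files import FileInput
from crewai.files.uploaders.gemini import GeminiFileUploader


def upload_video(video: FileInput) -> str | None:
    """Upload a video and return its Gemini file URI."""
    uploader = GeminiFileUploader()  # reads GOOGLE_API_KEY from the environment
    result = uploader.upload(video, purpose="demo clip")
    # For video/* content types, upload() has already waited for server-side
    # processing, so the URI is immediately usable in a generation request.
    print(result.expires_at)  # roughly now + 48h
    return result.file_uri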

View File

@@ -0,0 +1,324 @@
"""OpenAI Files API uploader implementation."""
from __future__ import annotations
import io
import logging
import os
from typing import Any
from crewai.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
class OpenAIFileUploader(FileUploader):
"""Uploader for OpenAI Files API.
Uses the OpenAI SDK to upload files. Files are stored persistently
until explicitly deleted.
Attributes:
api_key: Optional API key (uses OPENAI_API_KEY env var if not provided).
"""
def __init__(self, api_key: str | None = None) -> None:
"""Initialize the OpenAI uploader.
Args:
api_key: Optional OpenAI API key. If not provided, uses
OPENAI_API_KEY environment variable.
"""
self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
self._client: Any = None
self._async_client: Any = None
@property
def provider_name(self) -> str:
"""Return the provider name."""
return "openai"
def _get_client(self) -> Any:
"""Get or create the OpenAI client."""
if self._client is None:
try:
from openai import OpenAI
self._client = OpenAI(api_key=self._api_key)
except ImportError as e:
raise ImportError(
"openai is required for OpenAI file uploads. "
"Install with: pip install openai"
) from e
return self._client
def _get_async_client(self) -> Any:
"""Get or create the async OpenAI client."""
if self._async_client is None:
try:
from openai import AsyncOpenAI
self._async_client = AsyncOpenAI(api_key=self._api_key)
except ImportError as e:
raise ImportError(
"openai is required for OpenAI file uploads. "
"Install with: pip install openai"
) from e
return self._async_client
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
"""Upload a file to OpenAI.
Args:
file: The file to upload.
purpose: Optional purpose for the file (default: "user_data").
Returns:
UploadResult with the file ID and metadata.
Raises:
TransientUploadError: For retryable errors (network, rate limits).
PermanentUploadError: For non-retryable errors (auth, validation).
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
try:
client = self._get_client()
content = file.read()
file_purpose = purpose or "user_data"
file_data = io.BytesIO(content)
file_data.name = file.filename or "file"
logger.info(
f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)"
)
uploaded_file = client.files.create(
file=file_data,
purpose=file_purpose,
)
logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")
return UploadResult(
file_id=uploaded_file.id,
file_uri=None,
content_type=file.content_type,
expires_at=None,
provider=self.provider_name,
)
except ImportError:
raise
except Exception as e:
error_type = type(e).__name__
if "RateLimit" in error_type or "APIConnection" in error_type:
raise TransientUploadError(
f"Transient upload error: {e}", file_name=file.filename
) from e
if "Authentication" in error_type or "Permission" in error_type:
raise PermanentUploadError(
f"Authentication/permission error: {e}", file_name=file.filename
) from e
if "BadRequest" in error_type or "InvalidRequest" in error_type:
raise PermanentUploadError(
f"Invalid request: {e}", file_name=file.filename
) from e
status_code = getattr(e, "status_code", None)
if status_code is not None:
if status_code >= 500 or status_code == 429:
raise TransientUploadError(
f"Server error ({status_code}): {e}", file_name=file.filename
) from e
if status_code in (401, 403):
raise PermanentUploadError(
f"Auth error ({status_code}): {e}", file_name=file.filename
) from e
if status_code == 400:
raise PermanentUploadError(
f"Bad request ({status_code}): {e}", file_name=file.filename
) from e
raise TransientUploadError(
f"Upload failed: {e}", file_name=file.filename
) from e
def delete(self, file_id: str) -> bool:
"""Delete an uploaded file from OpenAI.
Args:
file_id: The file ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_client()
client.files.delete(file_id)
logger.info(f"Deleted OpenAI file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
return False
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
"""Get information about an uploaded file.
Args:
file_id: The file ID.
Returns:
Dictionary with file information, or None if not found.
"""
try:
client = self._get_client()
file_info = client.files.retrieve(file_id)
return {
"id": file_info.id,
"filename": file_info.filename,
"purpose": file_info.purpose,
"bytes": file_info.bytes,
"created_at": file_info.created_at,
"status": file_info.status,
}
except Exception as e:
logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}")
return None
def list_files(self) -> list[dict[str, Any]]:
"""List all uploaded files.
Returns:
List of dictionaries with file information.
"""
try:
client = self._get_client()
files = client.files.list()
return [
{
"id": f.id,
"filename": f.filename,
"purpose": f.purpose,
"bytes": f.bytes,
"created_at": f.created_at,
"status": f.status,
}
for f in files.data
]
except Exception as e:
logger.warning(f"Failed to list OpenAI files: {e}")
return []
async def aupload(
self, file: FileInput, purpose: str | None = None
) -> UploadResult:
"""Async upload a file to OpenAI using native async client.
Args:
file: The file to upload.
purpose: Optional purpose for the file (default: "user_data").
Returns:
UploadResult with the file ID and metadata.
Raises:
TransientUploadError: For retryable errors (network, rate limits).
PermanentUploadError: For non-retryable errors (auth, validation).
"""
from crewai.files.processing.exceptions import (
PermanentUploadError,
TransientUploadError,
)
try:
client = self._get_async_client()
content = await file.aread()
file_purpose = purpose or "user_data"
file_data = io.BytesIO(content)
file_data.name = file.filename or "file"
logger.info(
f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)"
)
uploaded_file = await client.files.create(
file=file_data,
purpose=file_purpose,
)
logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")
return UploadResult(
file_id=uploaded_file.id,
file_uri=None,
content_type=file.content_type,
expires_at=None,
provider=self.provider_name,
)
except ImportError:
raise
except Exception as e:
error_type = type(e).__name__
if "RateLimit" in error_type or "APIConnection" in error_type:
raise TransientUploadError(
f"Transient upload error: {e}", file_name=file.filename
) from e
if "Authentication" in error_type or "Permission" in error_type:
raise PermanentUploadError(
f"Authentication/permission error: {e}", file_name=file.filename
) from e
if "BadRequest" in error_type or "InvalidRequest" in error_type:
raise PermanentUploadError(
f"Invalid request: {e}", file_name=file.filename
) from e
status_code = getattr(e, "status_code", None)
if status_code is not None:
if status_code >= 500 or status_code == 429:
raise TransientUploadError(
f"Server error ({status_code}): {e}", file_name=file.filename
) from e
if status_code in (401, 403):
raise PermanentUploadError(
f"Auth error ({status_code}): {e}", file_name=file.filename
) from e
if status_code == 400:
raise PermanentUploadError(
f"Bad request ({status_code}): {e}", file_name=file.filename
) from e
raise TransientUploadError(
f"Upload failed: {e}", file_name=file.filename
) from e
async def adelete(self, file_id: str) -> bool:
"""Async delete an uploaded file from OpenAI.
Args:
file_id: The file ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_async_client()
await client.files.delete(file_id)
logger.info(f"Deleted OpenAI file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
return False
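An async usage sketch for the OpenAI uploader above, mirroring the sync Anthropic example; the document argument stands in for any FileInput.

from __future__ import annotations

import asyncio

from crewai.files import FileInput
from crewai.files.uploaders.openai import OpenAIFileUploader


async def upload_then_delete(document: FileInput) -> None:
    uploader = OpenAIFileUploader()  # reads OPENAI_API_KEY from the environment
    result = await uploader.aupload(document, purpose="user_data")
    print(result.file_id)  # e.g. "file-abc123"
    await uploader.adelete(result.file_id)

# asyncio.run(upload_then_delete(document)) once a FileInput is in hand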

View File

@@ -66,11 +66,11 @@ if TYPE_CHECKING:
from litellm.utils import supports_response_schema
from crewai.agent.core import Agent
from crewai.files import FileInput, UploadCache
from crewai.llms.hooks.base import BaseInterceptor
from crewai.llms.providers.anthropic.completion import AnthropicThinkingConfig
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
from crewai.utilities.types import LLMMessage
try:
@@ -2274,7 +2274,7 @@ class LLM(BaseLLM):
"""
import base64
from crewai.utilities.files import (
from crewai.files import (
FileResolver,
FileResolverConfig,
InlineBase64,

View File

@@ -33,9 +33,9 @@ from crewai.types.usage_metrics import UsageMetrics
if TYPE_CHECKING:
from crewai.agent.core import Agent
from crewai.files import FileInput, UploadCache
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
from crewai.utilities.types import LLMMessage
@@ -315,6 +315,25 @@ class BaseLLM(ABC):
"""
return []
async def aformat_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Async format files as multimodal content blocks for the LLM.
Default implementation calls the sync version. Subclasses should
override to use async file resolution for parallel processing.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in the provider's expected format.
"""
return self.format_multimodal_content(files, upload_cache)
def format_text_content(self, text: str) -> dict[str, Any]:
"""Format text as a content block for the LLM.

View File

@@ -20,8 +20,8 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.files import FileInput, UploadCache
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
DEFAULT_CACHE_TTL = "ephemeral"
@@ -1281,7 +1281,7 @@ class AnthropicCompletion(BaseLLM):
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
from crewai.files import (
FileReference,
FileResolver,
FileResolverConfig,
@@ -1370,3 +1370,107 @@ class AnthropicCompletion(BaseLLM):
content_blocks.append(block)
return content_blocks
async def aformat_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
enable_caching: bool = True,
cache_ttl: str | None = None,
) -> list[dict[str, Any]]:
"""Async format files as Anthropic multimodal content blocks.
Uses parallel file resolution for improved performance with multiple files.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
enable_caching: Whether to add cache_control markers (default: True).
cache_ttl: Cache TTL - "ephemeral" (5min) or "1h" (1hr for supported models).
Returns:
List of content blocks in Anthropic's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
supported_types = self.supported_multimodal_content_types()
supported_files = {
name: f
for name, f in files.items()
if any(f.content_type.startswith(t) for t in supported_types)
}
if not supported_files:
return []
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
resolved_files = await resolver.aresolve_files(supported_files, "anthropic")
content_blocks: list[dict[str, Any]] = []
num_files = len(resolved_files)
file_names = list(supported_files.keys())
for i, name in enumerate(file_names):
if name not in resolved_files:
continue
resolved = resolved_files[name]
file_input = supported_files[name]
content_type = file_input.content_type
block: dict[str, Any] = {}
if isinstance(resolved, FileReference):
if content_type.startswith("image/"):
block = {
"type": "image",
"source": {
"type": "file",
"file_id": resolved.file_id,
},
}
elif content_type == "application/pdf":
block = {
"type": "document",
"source": {
"type": "file",
"file_id": resolved.file_id,
},
}
elif isinstance(resolved, InlineBase64):
if content_type.startswith("image/"):
block = {
"type": "image",
"source": {
"type": "base64",
"media_type": resolved.content_type,
"data": resolved.data,
},
}
elif content_type == "application/pdf":
block = {
"type": "document",
"source": {
"type": "base64",
"media_type": resolved.content_type,
"data": resolved.data,
},
}
if block and enable_caching and i == num_files - 1:
cache_control: dict[str, str] = {"type": cache_ttl or DEFAULT_CACHE_TTL}
block["cache_control"] = cache_control
if block:
content_blocks.append(block)
return content_blocks
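For reference, the block shapes the method above produces (with placeholder values): uploaded files become "file" sources, inline files become "base64" sources, and the last block gains a cache_control marker when caching is enabled.

# Illustrative output of aformat_multimodal_content for one uploaded image
# and one inline PDF; the IDs and data are placeholders.
example_blocks = [
    {
        "type": "image",
        "source": {"type": "file", "file_id": "file_abc123"},
    },
    {
        "type": "document",
        "source": {
            "type": "base64",
            "media_type": "application/pdf",
            "data": "<base64-encoded bytes>",
        },
        "cache_control": {"type": "ephemeral"},  # added to the final block only
    },
]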

View File

@@ -18,8 +18,8 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.files import FileInput, UploadCache
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
try:
@@ -1060,7 +1060,7 @@ class AzureCompletion(BaseLLM):
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
from crewai.files import (
FileResolver,
FileResolverConfig,
InlineBase64,
@@ -1100,3 +1100,54 @@ class AzureCompletion(BaseLLM):
)
return content_blocks
async def aformat_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Async format files as Azure OpenAI multimodal content blocks.
Uses parallel file resolution for improved performance with multiple files.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache (not used by Azure but kept for interface consistency).
Returns:
List of content blocks in Azure OpenAI's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.files import (
FileResolver,
FileResolverConfig,
InlineBase64,
)
supported_types = self.supported_multimodal_content_types()
supported_files = {
name: f
for name, f in files.items()
if any(f.content_type.startswith(t) for t in supported_types)
}
if not supported_files:
return []
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
resolved_files = await resolver.aresolve_files(supported_files, "azure")
return [
{
"type": "image_url",
"image_url": {
"url": f"data:{resolved.content_type};base64,{resolved.data}"
},
}
for resolved in resolved_files.values()
if isinstance(resolved, InlineBase64)
]
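The Azure path only emits inline data-URL image blocks; an illustrative result for a single resolved PNG:

# Illustrative shape of the content block returned above;
# the base64 payload is a placeholder.
example_block = {
    "type": "image_url",
    "image_url": {"url": "data:image/png;base64,<base64-encoded bytes>"},
}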

View File

@@ -32,8 +32,8 @@ if TYPE_CHECKING:
ToolTypeDef,
)
from crewai.files import FileInput, UploadCache
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import FileInput, UploadCache
try:
@@ -1455,13 +1455,33 @@ class BedrockCompletion(BaseLLM):
def supports_multimodal(self) -> bool:
"""Check if the model supports multimodal inputs.
Claude models on Bedrock support vision.
Claude 3+ and Nova Lite/Pro/Premier on Bedrock support vision.
Returns:
True if the model supports images.
"""
vision_models = ("anthropic.claude-3",)
return any(self.model.lower().startswith(m) for m in vision_models)
model_lower = self.model.lower()
vision_models = (
"anthropic.claude-3",
"amazon.nova-lite",
"amazon.nova-pro",
"amazon.nova-premier",
"us.amazon.nova-lite",
"us.amazon.nova-pro",
"us.amazon.nova-premier",
)
return any(model_lower.startswith(m) for m in vision_models)
def _is_nova_model(self) -> bool:
"""Check if the model is an Amazon Nova model.
Only Nova models support S3 links for multimedia.
Returns:
True if the model is a Nova model.
"""
model_lower = self.model.lower()
return "amazon.nova-" in model_lower
def supported_multimodal_content_types(self) -> list[str]:
"""Get content types supported by Bedrock for multimodal input.
@@ -1471,7 +1491,78 @@ class BedrockCompletion(BaseLLM):
"""
if not self.supports_multimodal():
return []
return ["image/", "application/pdf"]
types = ["image/png", "image/jpeg", "image/gif", "image/webp"]
if self._is_nova_model():
types.extend(
[
"application/pdf",
"text/csv",
"text/plain",
"text/markdown",
"text/html",
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"video/mp4",
"video/quicktime",
"video/x-matroska",
"video/webm",
"video/x-flv",
"video/mpeg",
"video/x-ms-wmv",
"video/3gpp",
]
)
else:
types.append("application/pdf")
return types
def _get_document_format(self, content_type: str) -> str | None:
"""Map content type to Bedrock document format.
Args:
content_type: MIME type of the document.
Returns:
Bedrock format string or None if unsupported.
"""
format_map = {
"application/pdf": "pdf",
"text/csv": "csv",
"text/plain": "txt",
"text/markdown": "md",
"text/html": "html",
"application/msword": "doc",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
"application/vnd.ms-excel": "xls",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
}
return format_map.get(content_type)
def _get_video_format(self, content_type: str) -> str | None:
"""Map content type to Bedrock video format.
Args:
content_type: MIME type of the video.
Returns:
Bedrock format string or None if unsupported.
"""
format_map = {
"video/mp4": "mp4",
"video/quicktime": "mov",
"video/x-matroska": "mkv",
"video/webm": "webm",
"video/x-flv": "flv",
"video/mpeg": "mpeg",
"video/x-ms-wmv": "wmv",
"video/3gpp": "three_gp",
}
return format_map.get(content_type)
def format_multimodal_content(
self,
@@ -1480,12 +1571,12 @@ class BedrockCompletion(BaseLLM):
) -> list[dict[str, Any]]:
"""Format files as Bedrock Converse API multimodal content blocks.
Bedrock Converse API uses specific formats for images and documents with raw bytes.
Uses FileResolver to get InlineBytes format for Bedrock's byte-based API.
Bedrock Converse API supports both raw bytes and S3 URI references.
S3 uploads are only supported by Amazon Nova models.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache (not used by Bedrock but kept for interface consistency).
upload_cache: Optional cache for S3 uploads.
Returns:
List of content blocks in Bedrock's expected format.
@@ -1493,50 +1584,239 @@ class BedrockCompletion(BaseLLM):
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
import os
from crewai.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBytes,
)
content_blocks: list[dict[str, Any]] = []
is_nova = self._is_nova_model()
# Bedrock uses raw bytes, configure resolver accordingly
config = FileResolverConfig(prefer_upload=False, use_bytes_for_bedrock=True)
s3_bucket = os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
s3_bucket_owner = os.environ.get("CREWAI_BEDROCK_S3_BUCKET_OWNER")
prefer_upload = bool(s3_bucket) and is_nova
config = FileResolverConfig(
prefer_upload=prefer_upload, use_bytes_for_bedrock=True
)
resolver = FileResolver(config=config, upload_cache=upload_cache)
for name, file_input in files.items():
content_type = file_input.content_type
resolved = resolver.resolve(file_input, "bedrock")
if isinstance(resolved, InlineBytes):
file_bytes = resolved.data
else:
# Fallback to reading directly
file_bytes = file_input.read()
if isinstance(resolved, FileReference) and resolved.file_uri:
s3_location: dict[str, Any] = {"uri": resolved.file_uri}
if s3_bucket_owner:
s3_location["bucketOwner"] = s3_bucket_owner
if content_type.startswith("image/"):
media_type = content_type.split("/")[-1]
if media_type == "jpg":
media_type = "jpeg"
content_blocks.append(
{
"image": {
"format": media_type,
"source": {"bytes": file_bytes},
if content_type.startswith("image/"):
media_type = content_type.split("/")[-1]
if media_type == "jpg":
media_type = "jpeg"
content_blocks.append(
{
"image": {
"format": media_type,
"source": {"s3Location": s3_location},
}
}
}
)
elif content_type == "application/pdf":
content_blocks.append(
{
"document": {
"name": name,
"format": "pdf",
"source": {"bytes": file_bytes},
)
elif content_type.startswith("video/"):
video_format = self._get_video_format(content_type)
if video_format:
content_blocks.append(
{
"video": {
"format": video_format,
"source": {"s3Location": s3_location},
}
}
)
else:
doc_format = self._get_document_format(content_type)
if doc_format:
content_blocks.append(
{
"document": {
"name": name,
"format": doc_format,
"source": {"s3Location": s3_location},
}
}
)
else:
if isinstance(resolved, InlineBytes):
file_bytes = resolved.data
else:
file_bytes = file_input.read()
if content_type.startswith("image/"):
media_type = content_type.split("/")[-1]
if media_type == "jpg":
media_type = "jpeg"
content_blocks.append(
{
"image": {
"format": media_type,
"source": {"bytes": file_bytes},
}
}
}
)
)
elif content_type.startswith("video/"):
video_format = self._get_video_format(content_type)
if video_format:
content_blocks.append(
{
"video": {
"format": video_format,
"source": {"bytes": file_bytes},
}
}
)
else:
doc_format = self._get_document_format(content_type)
if doc_format:
content_blocks.append(
{
"document": {
"name": name,
"format": doc_format,
"source": {"bytes": file_bytes},
}
}
)
return content_blocks
async def aformat_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Async format files as Bedrock Converse API multimodal content blocks.
Uses parallel file resolution. S3 uploads are only supported by Nova models.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for S3 uploads.
Returns:
List of content blocks in Bedrock's expected format.
"""
if not self.supports_multimodal():
return []
import os
from crewai.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBytes,
)
is_nova = self._is_nova_model()
s3_bucket = os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
s3_bucket_owner = os.environ.get("CREWAI_BEDROCK_S3_BUCKET_OWNER")
prefer_upload = bool(s3_bucket) and is_nova
config = FileResolverConfig(
prefer_upload=prefer_upload, use_bytes_for_bedrock=True
)
resolver = FileResolver(config=config, upload_cache=upload_cache)
resolved_files = await resolver.aresolve_files(files, "bedrock")
content_blocks: list[dict[str, Any]] = []
for name, resolved in resolved_files.items():
file_input = files[name]
content_type = file_input.content_type
if isinstance(resolved, FileReference) and resolved.file_uri:
s3_location: dict[str, Any] = {"uri": resolved.file_uri}
if s3_bucket_owner:
s3_location["bucketOwner"] = s3_bucket_owner
if content_type.startswith("image/"):
media_type = content_type.split("/")[-1]
if media_type == "jpg":
media_type = "jpeg"
content_blocks.append(
{
"image": {
"format": media_type,
"source": {"s3Location": s3_location},
}
}
)
elif content_type.startswith("video/"):
video_format = self._get_video_format(content_type)
if video_format:
content_blocks.append(
{
"video": {
"format": video_format,
"source": {"s3Location": s3_location},
}
}
)
else:
doc_format = self._get_document_format(content_type)
if doc_format:
content_blocks.append(
{
"document": {
"name": name,
"format": doc_format,
"source": {"s3Location": s3_location},
}
}
)
else:
if isinstance(resolved, InlineBytes):
file_bytes = resolved.data
else:
file_bytes = await file_input.aread()
if content_type.startswith("image/"):
media_type = content_type.split("/")[-1]
if media_type == "jpg":
media_type = "jpeg"
content_blocks.append(
{
"image": {
"format": media_type,
"source": {"bytes": file_bytes},
}
}
)
elif content_type.startswith("video/"):
video_format = self._get_video_format(content_type)
if video_format:
content_blocks.append(
{
"video": {
"format": video_format,
"source": {"bytes": file_bytes},
}
}
)
else:
doc_format = self._get_document_format(content_type)
if doc_format:
content_blocks.append(
{
"document": {
"name": name,
"format": doc_format,
"source": {"bytes": file_bytes},
}
}
)
return content_blocks
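# Hedged usage sketch (editor note, not part of this diff): how the S3 branch above
# might be exercised. The env var names and block shapes come from the code above;
# the BedrockCompletion constructor arguments and bucket values are assumptions.
#
#   os.environ["CREWAI_BEDROCK_S3_BUCKET"] = "my-upload-bucket"      # enables prefer_upload for Nova
#   os.environ["CREWAI_BEDROCK_S3_BUCKET_OWNER"] = "123456789012"    # optional bucketOwner on s3Location
#   llm = BedrockCompletion(model="amazon.nova-pro-v1:0")            # assumed constructor signature
#   files = {"diagram": ImageFile(source=FilePath(path=Path("diagram.png")))}
#   blocks = llm.format_multimodal_content(files, upload_cache=get_upload_cache())
#   # Nova + bucket configured -> {"image": {..., "source": {"s3Location": {"uri": ...}}}}
#   # otherwise                -> {"image": {..., "source": {"bytes": b"..."}}}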

View File

@@ -19,11 +19,11 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.llms.hooks.base import BaseInterceptor
from crewai.utilities.files import (
from crewai.files import (
FileInput,
UploadCache,
)
from crewai.llms.hooks.base import BaseInterceptor
try:
@@ -1113,7 +1113,7 @@ class GeminiCompletion(BaseLLM):
Returns:
List of content blocks in Gemini's expected format.
"""
from crewai.utilities.files import (
from crewai.files import (
FileReference,
FileResolver,
FileResolverConfig,
@@ -1123,7 +1123,6 @@ class GeminiCompletion(BaseLLM):
content_blocks: list[dict[str, Any]] = []
supported_types = self.supported_multimodal_content_types()
# Create resolver with optional cache
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
@@ -1168,6 +1167,67 @@ class GeminiCompletion(BaseLLM):
return content_blocks
async def aformat_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Async format files as Gemini multimodal content blocks.
Uses parallel file resolution for improved performance with multiple files.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in Gemini's expected format.
"""
from crewai.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
supported_types = self.supported_multimodal_content_types()
supported_files = {
name: f
for name, f in files.items()
if any(f.content_type.startswith(t) for t in supported_types)
}
if not supported_files:
return []
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
resolved_files = await resolver.aresolve_files(supported_files, "gemini")
content_blocks: list[dict[str, Any]] = []
for resolved in resolved_files.values():
if isinstance(resolved, FileReference) and resolved.file_uri:
content_blocks.append(
{
"fileData": {
"mimeType": resolved.content_type,
"fileUri": resolved.file_uri,
}
}
)
elif isinstance(resolved, InlineBase64):
content_blocks.append(
{
"inlineData": {
"mimeType": resolved.content_type,
"data": resolved.data,
}
}
)
return content_blocks
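# Hedged usage sketch (editor note, not part of this diff): shape of the async Gemini
# result. The "fileData"/"inlineData" keys come from the code above; the llm and files
# objects are assumed to be a GeminiCompletion instance and a dict of File wrappers.
#
#   blocks = await llm.aformat_multimodal_content(files, upload_cache=get_upload_cache())
#   # inline (small files):  [{"inlineData": {"mimeType": "image/png", "data": "<base64>"}}]
#   # uploaded (File API):   [{"fileData": {"mimeType": "image/png", "fileUri": "https://..."}}]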
def format_text_content(self, text: str) -> dict[str, Any]:
"""Format text as a Gemini content block.

View File

@@ -28,10 +28,10 @@ from crewai.utilities.types import LLMMessage
if TYPE_CHECKING:
from crewai.agent.core import Agent
from crewai.files import FileInput, UploadCache
from crewai.llms.hooks.base import BaseInterceptor
from crewai.task import Task
from crewai.tools.base_tool import BaseTool
from crewai.utilities.files import FileInput, UploadCache
class OpenAICompletion(BaseLLM):
@@ -1100,7 +1100,7 @@ class OpenAICompletion(BaseLLM):
if not self.supports_multimodal():
return []
from crewai.utilities.files import (
from crewai.files import (
FileReference,
FileResolver,
FileResolverConfig,
@@ -1148,3 +1148,67 @@ class OpenAICompletion(BaseLLM):
)
return content_blocks
async def aformat_multimodal_content(
self,
files: dict[str, FileInput],
upload_cache: UploadCache | None = None,
) -> list[dict[str, Any]]:
"""Async format files as OpenAI multimodal content blocks.
Uses parallel file resolution for improved performance with multiple files.
Args:
files: Dictionary mapping file names to FileInput objects.
upload_cache: Optional cache for tracking uploaded files.
Returns:
List of content blocks in OpenAI's expected format.
"""
if not self.supports_multimodal():
return []
from crewai.files import (
FileReference,
FileResolver,
FileResolverConfig,
InlineBase64,
)
supported_types = self.supported_multimodal_content_types()
supported_files = {
name: f
for name, f in files.items()
if any(f.content_type.startswith(t) for t in supported_types)
}
if not supported_files:
return []
config = FileResolverConfig(prefer_upload=False)
resolver = FileResolver(config=config, upload_cache=upload_cache)
resolved_files = await resolver.aresolve_files(supported_files, "openai")
content_blocks: list[dict[str, Any]] = []
for resolved in resolved_files.values():
if isinstance(resolved, FileReference):
content_blocks.append(
{
"type": "file",
"file": {
"file_id": resolved.file_id,
},
}
)
elif isinstance(resolved, InlineBase64):
content_blocks.append(
{
"type": "image_url",
"image_url": {
"url": f"data:{resolved.content_type};base64,{resolved.data}"
},
}
)
return content_blocks
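# Hedged usage sketch (editor note, not part of this diff): shape of the async OpenAI
# result, following the two branches above. llm and files are assumed objects.
#
#   blocks = await llm.aformat_multimodal_content(files, upload_cache=get_upload_cache())
#   # uploaded file reference: [{"type": "file", "file": {"file_id": "file-abc123"}}]
#   # inline image:            [{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]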

View File

@@ -37,6 +37,12 @@ from crewai.events.types.task_events import (
TaskFailedEvent,
TaskStartedEvent,
)
from crewai.files import (
FileInput,
FilePath,
FileSourceInput,
normalize_input_files,
)
from crewai.security import Fingerprint, SecurityConfig
from crewai.tasks.output_format import OutputFormat
from crewai.tasks.task_output import TaskOutput
@@ -49,12 +55,6 @@ from crewai.utilities.file_store import (
get_all_files,
store_task_files,
)
from crewai.utilities.files import (
FileInput,
FilePath,
FileSourceInput,
normalize_input_files,
)
from crewai.utilities.guardrail import (
process_guardrail,
)

View File

@@ -11,7 +11,7 @@ from crewai.tools.base_tool import BaseTool
if TYPE_CHECKING:
from crewai.utilities.files import FileInput
from crewai.files import FileInput
class ReadFileToolSchema(BaseModel):

View File

@@ -13,7 +13,7 @@ from aiocache.serializers import PickleSerializer # type: ignore[import-untyped
if TYPE_CHECKING:
from crewai.utilities.files import FileInput
from crewai.files import FileInput
_file_store = Cache(Cache.MEMORY, serializer=PickleSerializer())

View File

@@ -1,207 +1,25 @@
"""File handling utilities for crewAI tasks."""
"""Backwards compatibility re-exports from crewai.files.
from crewai.utilities.files.cleanup import (
cleanup_expired_files,
cleanup_provider_files,
cleanup_uploaded_files,
)
from crewai.utilities.files.content_types import (
AudioContentType,
AudioExtension,
AudioFile,
BaseFile,
File,
FileMode,
ImageContentType,
ImageExtension,
ImageFile,
PDFContentType,
PDFExtension,
PDFFile,
TextContentType,
TextExtension,
TextFile,
VideoContentType,
VideoExtension,
VideoFile,
)
from crewai.utilities.files.file import (
FileBytes,
FilePath,
FileSource,
FileSourceInput,
FileStream,
RawFileInput,
)
from crewai.utilities.files.processing import (
ANTHROPIC_CONSTRAINTS,
BEDROCK_CONSTRAINTS,
GEMINI_CONSTRAINTS,
OPENAI_CONSTRAINTS,
AudioConstraints,
FileHandling,
FileProcessingError,
FileProcessor,
FileTooLargeError,
FileValidationError,
ImageConstraints,
PDFConstraints,
ProcessingDependencyError,
ProviderConstraints,
UnsupportedFileTypeError,
VideoConstraints,
get_constraints_for_provider,
)
from crewai.utilities.files.resolved import (
FileReference,
InlineBase64,
InlineBytes,
ResolvedFile,
ResolvedFileType,
UrlReference,
)
from crewai.utilities.files.resolver import (
FileResolver,
FileResolverConfig,
create_resolver,
)
from crewai.utilities.files.upload_cache import (
CachedUpload,
UploadCache,
get_upload_cache,
reset_upload_cache,
)
from crewai.utilities.files.uploaders import FileUploader, UploadResult, get_uploader
Deprecated: Import from crewai.files instead.
"""
import sys
from typing import Any
from typing_extensions import deprecated
import crewai.files as _files
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
@deprecated("crewai.utilities.files is deprecated. Import from crewai.files instead.")
class _DeprecatedModule:
"""Deprecated module wrapper."""
def __getattr__(self, name: str) -> Any:
return getattr(_files, name)
def __dir__(self) -> list[str]:
return list(_files.__all__)
def wrap_file_source(source: FileSource) -> FileInput:
"""Wrap a FileSource in the appropriate typed FileInput wrapper.
Args:
source: The file source to wrap.
Returns:
Typed FileInput wrapper based on content type.
"""
content_type = source.content_type
if content_type.startswith("image/"):
return ImageFile(source=source)
if content_type.startswith("audio/"):
return AudioFile(source=source)
if content_type.startswith("video/"):
return VideoFile(source=source)
if content_type == "application/pdf":
return PDFFile(source=source)
return TextFile(source=source)
def normalize_input_files(
input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
"""Convert a list of file sources to a named dictionary of FileInputs.
Args:
input_files: List of file source inputs or File objects.
Returns:
Dictionary mapping names to FileInput wrappers.
"""
from pathlib import Path
result: dict[str, FileInput] = {}
for i, item in enumerate(input_files):
if isinstance(item, BaseFile):
name = item.filename or f"file_{i}"
if "." in name:
name = name.rsplit(".", 1)[0]
result[name] = item
continue
file_source: FilePath | FileBytes | FileStream
if isinstance(item, (FilePath, FileBytes, FileStream)):
file_source = item
elif isinstance(item, Path):
file_source = FilePath(path=item)
elif isinstance(item, str):
file_source = FilePath(path=Path(item))
elif isinstance(item, (bytes, memoryview)):
file_source = FileBytes(data=bytes(item))
else:
continue
name = file_source.filename or f"file_{i}"
result[name] = wrap_file_source(file_source)
return result
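# Hedged sketch (editor note, not part of this diff): naming behavior of
# normalize_input_files as defined above. Raw sources keep their full filename as the
# key, typed File wrappers use the stem, and unnamed bytes fall back to "file_<index>".
# Assumes "report.pdf" exists on disk so the FilePath validator passes.
#
#   files = normalize_input_files(["report.pdf", b"plain inline text"])
#   # -> {"report.pdf": PDFFile(...), "file_1": TextFile(...)}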
__all__ = [
"ANTHROPIC_CONSTRAINTS",
"BEDROCK_CONSTRAINTS",
"GEMINI_CONSTRAINTS",
"OPENAI_CONSTRAINTS",
"AudioConstraints",
"AudioContentType",
"AudioExtension",
"AudioFile",
"BaseFile",
"CachedUpload",
"File",
"FileBytes",
"FileHandling",
"FileInput",
"FileMode",
"FilePath",
"FileProcessingError",
"FileProcessor",
"FileReference",
"FileResolver",
"FileResolverConfig",
"FileSource",
"FileSourceInput",
"FileStream",
"FileTooLargeError",
"FileUploader",
"FileValidationError",
"ImageConstraints",
"ImageContentType",
"ImageExtension",
"ImageFile",
"InlineBase64",
"InlineBytes",
"PDFConstraints",
"PDFContentType",
"PDFExtension",
"PDFFile",
"ProcessingDependencyError",
"ProviderConstraints",
"RawFileInput",
"ResolvedFile",
"ResolvedFileType",
"TextContentType",
"TextExtension",
"TextFile",
"UnsupportedFileTypeError",
"UploadCache",
"UploadResult",
"UrlReference",
"VideoConstraints",
"VideoContentType",
"VideoExtension",
"VideoFile",
"cleanup_expired_files",
"cleanup_provider_files",
"cleanup_uploaded_files",
"create_resolver",
"get_constraints_for_provider",
"get_upload_cache",
"get_uploader",
"normalize_input_files",
"reset_upload_cache",
"wrap_file_source",
]
sys.modules[__name__] = _DeprecatedModule() # type: ignore[assignment]
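# Hedged sketch (editor note, not part of this diff): with the module replacement
# above, legacy import paths keep resolving to the same objects as crewai.files.
#
#   from crewai.utilities.files import ImageFile          # served via _DeprecatedModule.__getattr__
#   from crewai.files import ImageFile as NewImageFile
#   assert ImageFile is NewImageFile                       # identical class; only the path is deprecated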

View File

@@ -0,0 +1,258 @@
"""Type stubs for backwards compatibility re-exports from crewai.files.
.. deprecated::
Import from crewai.files instead.
"""
from collections.abc import Callable
from datetime import datetime
from pathlib import Path
from typing import Any, Literal
from typing_extensions import deprecated
import crewai.files as _files
FileMode = Literal["strict", "auto", "warn", "chunk"]
ImageExtension = _files.ImageExtension
ImageContentType = _files.ImageContentType
PDFExtension = _files.PDFExtension
PDFContentType = _files.PDFContentType
TextExtension = _files.TextExtension
TextContentType = _files.TextContentType
AudioExtension = _files.AudioExtension
AudioContentType = _files.AudioContentType
VideoExtension = _files.VideoExtension
VideoContentType = _files.VideoContentType
FileInput = _files.FileInput
FileSource = _files.FileSource
FileSourceInput = _files.FileSourceInput
RawFileInput = _files.RawFileInput
ResolvedFileType = _files.ResolvedFileType
FileHandling = _files.FileHandling
# Deprecated classes
@deprecated("Import from crewai.files instead")
class BaseFile(_files.BaseFile):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class ImageFile(_files.ImageFile):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class PDFFile(_files.PDFFile):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class TextFile(_files.TextFile):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class AudioFile(_files.AudioFile):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class VideoFile(_files.VideoFile):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class File(_files.File):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FilePath(_files.FilePath):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileBytes(_files.FileBytes):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileStream(_files.FileStream):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileResolver(_files.FileResolver):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileResolverConfig(_files.FileResolverConfig):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileProcessor(_files.FileProcessor):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileUploader(_files.FileUploader):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class UploadCache(_files.UploadCache):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class CachedUpload(_files.CachedUpload):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class UploadResult(_files.UploadResult):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class ResolvedFile(_files.ResolvedFile):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileReference(_files.FileReference):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class UrlReference(_files.UrlReference):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class InlineBase64(_files.InlineBase64):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class InlineBytes(_files.InlineBytes):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class ProviderConstraints(_files.ProviderConstraints):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class ImageConstraints(_files.ImageConstraints):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class AudioConstraints(_files.AudioConstraints):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class VideoConstraints(_files.VideoConstraints):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class PDFConstraints(_files.PDFConstraints):
""".. deprecated:: Import from crewai.files instead."""
...
# Exceptions
@deprecated("Import from crewai.files instead")
class FileProcessingError(_files.FileProcessingError):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileValidationError(_files.FileValidationError):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class FileTooLargeError(_files.FileTooLargeError):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class UnsupportedFileTypeError(_files.UnsupportedFileTypeError):
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
class ProcessingDependencyError(_files.ProcessingDependencyError):
""".. deprecated:: Import from crewai.files instead."""
...
# Constants
OPENAI_CONSTRAINTS: _files.ProviderConstraints
ANTHROPIC_CONSTRAINTS: _files.ProviderConstraints
GEMINI_CONSTRAINTS: _files.ProviderConstraints
BEDROCK_CONSTRAINTS: _files.ProviderConstraints
# Deprecated functions
@deprecated("Import from crewai.files instead")
def create_resolver(
provider: str,
config: FileResolverConfig | None = None,
) -> FileResolver:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def get_upload_cache() -> UploadCache:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def reset_upload_cache() -> None:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def get_constraints_for_provider(provider: str) -> ProviderConstraints:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def cleanup_uploaded_files(provider: str | None = None) -> int:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def cleanup_expired_files() -> int:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def cleanup_provider_files(provider: str) -> int:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def normalize_input_files(
input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
""".. deprecated:: Import from crewai.files instead."""
...
@deprecated("Import from crewai.files instead")
def wrap_file_source(source: FileSource) -> FileInput:
""".. deprecated:: Import from crewai.files instead."""
...
__all__: list[str]

View File

@@ -1,180 +0,0 @@
"""Cleanup utilities for uploaded files."""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from crewai.utilities.files.upload_cache import CachedUpload, UploadCache
from crewai.utilities.files.uploaders import get_uploader
if TYPE_CHECKING:
from crewai.utilities.files.uploaders.base import FileUploader
logger = logging.getLogger(__name__)
def _safe_delete(
uploader: FileUploader,
file_id: str,
provider: str,
) -> bool:
"""Safely delete a file, logging any errors.
Args:
uploader: The file uploader to use.
file_id: The file ID to delete.
provider: Provider name for logging.
Returns:
True if deleted successfully, False otherwise.
"""
try:
if uploader.delete(file_id):
logger.debug(f"Deleted {file_id} from {provider}")
return True
logger.warning(f"Failed to delete {file_id} from {provider}")
return False
except Exception as e:
logger.warning(f"Error deleting {file_id} from {provider}: {e}")
return False
def cleanup_uploaded_files(
cache: UploadCache,
*,
delete_from_provider: bool = True,
providers: list[str] | None = None,
) -> int:
"""Clean up uploaded files from the cache and optionally from providers.
Args:
cache: The upload cache to clean up.
delete_from_provider: If True, delete files from the provider as well.
providers: Optional list of providers to clean up. If None, cleans all.
Returns:
Number of files cleaned up.
"""
cleaned = 0
provider_uploads: dict[str, list[CachedUpload]] = {}
for provider in _get_providers_from_cache(cache):
if providers is not None and provider not in providers:
continue
provider_uploads[provider] = cache.get_all_for_provider(provider)
if delete_from_provider:
for provider, uploads in provider_uploads.items():
uploader = get_uploader(provider)
if uploader is None:
logger.warning(
f"No uploader available for {provider}, skipping cleanup"
)
continue
for upload in uploads:
if _safe_delete(uploader, upload.file_id, provider):
cleaned += 1
cache.clear()
logger.info(f"Cleaned up {cleaned} uploaded files")
return cleaned
def cleanup_expired_files(
cache: UploadCache,
*,
delete_from_provider: bool = False,
) -> int:
"""Clean up expired files from the cache.
Args:
cache: The upload cache to clean up.
delete_from_provider: If True, attempt to delete from provider as well.
Note: Expired files may already be deleted by the provider.
Returns:
Number of expired entries removed from cache.
"""
expired_entries: list[CachedUpload] = []
if delete_from_provider:
for provider in _get_providers_from_cache(cache):
expired_entries.extend(
upload
for upload in cache.get_all_for_provider(provider)
if upload.is_expired()
)
removed = cache.clear_expired()
if delete_from_provider:
for upload in expired_entries:
uploader = get_uploader(upload.provider)
if uploader is not None:
try:
uploader.delete(upload.file_id)
except Exception as e:
logger.debug(f"Could not delete expired file {upload.file_id}: {e}")
return removed
def cleanup_provider_files(
provider: str,
*,
cache: UploadCache | None = None,
delete_all_from_provider: bool = False,
) -> int:
"""Clean up all files for a specific provider.
Args:
provider: Provider name to clean up.
cache: Optional upload cache to clear entries from.
delete_all_from_provider: If True, delete all files from the provider,
not just cached ones.
Returns:
Number of files deleted.
"""
deleted = 0
uploader = get_uploader(provider)
if uploader is None:
logger.warning(f"No uploader available for {provider}")
return 0
if delete_all_from_provider:
try:
files = uploader.list_files()
for file_info in files:
file_id = file_info.get("id") or file_info.get("name")
if file_id and uploader.delete(file_id):
deleted += 1
except Exception as e:
logger.warning(f"Error listing/deleting files from {provider}: {e}")
elif cache is not None:
uploads = cache.get_all_for_provider(provider)
for upload in uploads:
if _safe_delete(uploader, upload.file_id, provider):
deleted += 1
cache.remove_by_file_id(upload.file_id, provider)
logger.info(f"Deleted {deleted} files from {provider}")
return deleted
def _get_providers_from_cache(cache: UploadCache) -> set[str]:
"""Get unique provider names from cache entries.
Args:
cache: The upload cache.
Returns:
Set of provider names.
"""
return cache.get_providers()
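def _example_cleanup_flow() -> None:
    """Hedged usage sketch (editor addition, not part of this diff).
    Wires the helpers above to the shared upload cache; get_upload_cache lives in
    crewai.utilities.files.upload_cache, as re-exported by this package.
    """
    from crewai.utilities.files.upload_cache import get_upload_cache
    cache = get_upload_cache()
    cleanup_expired_files(cache)  # prune expired cache entries
    cleanup_uploaded_files(cache, providers=["gemini"])  # also delete cached Gemini uploads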

View File

@@ -1,158 +0,0 @@
"""Base file class for handling file inputs in tasks."""
from __future__ import annotations
from pathlib import Path
from typing import Annotated, Any, BinaryIO, cast
import magic
from pydantic import (
BaseModel,
BeforeValidator,
Field,
GetCoreSchemaHandler,
PrivateAttr,
model_validator,
)
from pydantic_core import CoreSchema, core_schema
def detect_content_type(data: bytes) -> str:
"""Detect MIME type from file content.
Args:
data: Raw bytes to analyze.
Returns:
The detected MIME type.
"""
return magic.from_buffer(data, mime=True)
class _BinaryIOValidator:
"""Pydantic validator for BinaryIO types."""
@classmethod
def __get_pydantic_core_schema__(
cls, source_type: Any, handler: GetCoreSchemaHandler
) -> CoreSchema:
return core_schema.no_info_plain_validator_function(
cls._validate,
serialization=core_schema.plain_serializer_function_ser_schema(
lambda x: None, info_arg=False
),
)
@staticmethod
def _validate(value: Any) -> BinaryIO:
if hasattr(value, "read") and hasattr(value, "seek"):
return cast(BinaryIO, value)
raise ValueError("Expected a binary file-like object with read() and seek()")
ValidatedBinaryIO = Annotated[BinaryIO, _BinaryIOValidator()]
class FilePath(BaseModel):
"""File loaded from a filesystem path."""
path: Path = Field(description="Path to the file on the filesystem.")
_content: bytes | None = PrivateAttr(default=None)
@model_validator(mode="after")
def _validate_file_exists(self) -> FilePath:
"""Validate that the file exists."""
if not self.path.exists():
raise ValueError(f"File not found: {self.path}")
if not self.path.is_file():
raise ValueError(f"Path is not a file: {self.path}")
return self
@property
def filename(self) -> str:
"""Get the filename from the path."""
return self.path.name
@property
def content_type(self) -> str:
"""Get the content type by reading file content."""
return detect_content_type(self.read())
def read(self) -> bytes:
"""Read the file content from disk."""
if self._content is None:
self._content = self.path.read_bytes()
return self._content
class FileBytes(BaseModel):
"""File created from raw bytes content."""
data: bytes = Field(description="Raw bytes content of the file.")
filename: str | None = Field(default=None, description="Optional filename.")
@property
def content_type(self) -> str:
"""Get the content type from the data."""
return detect_content_type(self.data)
def read(self) -> bytes:
"""Return the bytes content."""
return self.data
class FileStream(BaseModel):
"""File loaded from a file-like stream."""
stream: ValidatedBinaryIO = Field(description="Binary file stream.")
filename: str | None = Field(default=None, description="Optional filename.")
_content: bytes | None = PrivateAttr(default=None)
def model_post_init(self, __context: object) -> None:
"""Extract filename from stream if not provided."""
if self.filename is None:
name = getattr(self.stream, "name", None)
if name is not None:
object.__setattr__(self, "filename", Path(name).name)
@property
def content_type(self) -> str:
"""Get the content type from stream content."""
return detect_content_type(self.read())
def read(self) -> bytes:
"""Read the stream content. Content is cached after first read."""
if self._content is None:
position = self.stream.tell()
self.stream.seek(0)
self._content = self.stream.read()
self.stream.seek(position)
return self._content
def close(self) -> None:
"""Close the underlying stream."""
self.stream.close()
FileSource = FilePath | FileBytes | FileStream
def _normalize_source(value: Any) -> FileSource:
"""Convert raw input to appropriate source type."""
if isinstance(value, (FilePath, FileBytes, FileStream)):
return value
if isinstance(value, Path):
return FilePath(path=value)
if isinstance(value, str):
return FilePath(path=Path(value))
if isinstance(value, bytes):
return FileBytes(data=value)
if hasattr(value, "read") and hasattr(value, "seek"):
return FileStream(stream=value)
raise ValueError(f"Cannot convert {type(value).__name__} to file source")
RawFileInput = str | Path | bytes
FileSourceInput = Annotated[
RawFileInput | FileSource, BeforeValidator(_normalize_source)
]
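def _example_source_coercion() -> None:
    """Hedged usage sketch (editor addition, not part of this diff).
    Demonstrates the coercion performed by _normalize_source / FileSourceInput,
    using only sources that do not need to exist on disk.
    """
    import io
    assert isinstance(_normalize_source(b"raw bytes"), FileBytes)  # bytes -> FileBytes
    assert isinstance(_normalize_source(io.BytesIO(b"streamed")), FileStream)  # stream -> FileStream
    # str / pathlib.Path inputs become FilePath, whose validator additionally
    # requires the path to point at an existing file.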

View File

@@ -1,287 +0,0 @@
"""FileResolver for deciding file delivery method and managing uploads."""
import base64
from dataclasses import dataclass, field
import logging
from crewai.utilities.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.utilities.files.processing.constraints import (
ProviderConstraints,
get_constraints_for_provider,
)
from crewai.utilities.files.resolved import (
FileReference,
InlineBase64,
InlineBytes,
ResolvedFile,
)
from crewai.utilities.files.upload_cache import CachedUpload, UploadCache
from crewai.utilities.files.uploaders import get_uploader
from crewai.utilities.files.uploaders.base import FileUploader
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
@dataclass
class FileResolverConfig:
"""Configuration for FileResolver.
Attributes:
prefer_upload: If True, prefer uploading over inline for supported providers.
upload_threshold_bytes: Size threshold above which to use upload.
If None, uses provider-specific threshold.
use_bytes_for_bedrock: If True, use raw bytes instead of base64 for Bedrock.
"""
prefer_upload: bool = False
upload_threshold_bytes: int | None = None
use_bytes_for_bedrock: bool = True
@dataclass
class FileResolver:
"""Resolves files to their delivery format based on provider capabilities.
Decides whether to use inline base64, raw bytes, or file upload based on:
- Provider constraints and capabilities
- File size
- Configuration preferences
Caches uploaded files to avoid redundant uploads.
Attributes:
config: Resolver configuration.
upload_cache: Cache for tracking uploaded files.
"""
config: FileResolverConfig = field(default_factory=FileResolverConfig)
upload_cache: UploadCache | None = None
_uploaders: dict[str, FileUploader] = field(default_factory=dict)
def resolve(self, file: FileInput, provider: str) -> ResolvedFile:
"""Resolve a file to its delivery format for a provider.
Args:
file: The file to resolve.
provider: Provider name (e.g., "gemini", "anthropic", "openai").
Returns:
ResolvedFile representing the appropriate delivery format.
"""
provider_lower = provider.lower()
constraints = get_constraints_for_provider(provider)
file_size = len(file.read())
should_upload = self._should_upload(
file, provider_lower, constraints, file_size
)
if should_upload:
resolved = self._resolve_via_upload(file, provider_lower)
if resolved is not None:
return resolved
return self._resolve_inline(file, provider_lower)
def resolve_files(
self,
files: dict[str, FileInput],
provider: str,
) -> dict[str, ResolvedFile]:
"""Resolve multiple files for a provider.
Args:
files: Dictionary mapping names to file inputs.
provider: Provider name.
Returns:
Dictionary mapping names to resolved files.
"""
return {name: self.resolve(file, provider) for name, file in files.items()}
def _should_upload(
self,
file: FileInput,
provider: str,
constraints: ProviderConstraints | None,
file_size: int,
) -> bool:
"""Determine if a file should be uploaded rather than inlined.
Args:
file: The file to check.
provider: Provider name.
constraints: Provider constraints.
file_size: Size of the file in bytes.
Returns:
True if the file should be uploaded, False otherwise.
"""
if constraints is None or not constraints.supports_file_upload:
return False
if self.config.prefer_upload:
return True
threshold = self.config.upload_threshold_bytes
if threshold is None:
threshold = constraints.file_upload_threshold_bytes
if threshold is not None and file_size > threshold:
return True
return False
def _resolve_via_upload(
self,
file: FileInput,
provider: str,
) -> ResolvedFile | None:
"""Resolve a file by uploading it.
Args:
file: The file to upload.
provider: Provider name.
Returns:
FileReference if upload succeeds, None otherwise.
"""
if self.upload_cache is not None:
cached = self.upload_cache.get(file, provider)
if cached is not None:
logger.debug(
f"Using cached upload for {file.filename}: {cached.file_id}"
)
return FileReference(
content_type=cached.content_type,
file_id=cached.file_id,
provider=cached.provider,
expires_at=cached.expires_at,
file_uri=cached.file_uri,
)
uploader = self._get_uploader(provider)
if uploader is None:
logger.debug(f"No uploader available for {provider}")
return None
try:
result = uploader.upload(file)
if self.upload_cache is not None:
self.upload_cache.set(
file=file,
provider=provider,
file_id=result.file_id,
file_uri=result.file_uri,
expires_at=result.expires_at,
)
return FileReference(
content_type=result.content_type,
file_id=result.file_id,
provider=result.provider,
expires_at=result.expires_at,
file_uri=result.file_uri,
)
except Exception as e:
logger.warning(f"Failed to upload {file.filename} to {provider}: {e}")
return None
def _resolve_inline(self, file: FileInput, provider: str) -> ResolvedFile:
"""Resolve a file as inline content.
Args:
file: The file to resolve.
provider: Provider name.
Returns:
InlineBase64 or InlineBytes depending on provider.
"""
content = file.read()
if self.config.use_bytes_for_bedrock and "bedrock" in provider:
return InlineBytes(
content_type=file.content_type,
data=content,
)
encoded = base64.b64encode(content).decode("ascii")
return InlineBase64(
content_type=file.content_type,
data=encoded,
)
def _get_uploader(self, provider: str) -> FileUploader | None:
"""Get or create an uploader for a provider.
Args:
provider: Provider name.
Returns:
FileUploader instance or None if not available.
"""
if provider not in self._uploaders:
uploader = get_uploader(provider)
if uploader is not None:
self._uploaders[provider] = uploader
else:
return None
return self._uploaders.get(provider)
def get_cached_uploads(self, provider: str) -> list[CachedUpload]:
"""Get all cached uploads for a provider.
Args:
provider: Provider name.
Returns:
List of cached uploads.
"""
if self.upload_cache is None:
return []
return self.upload_cache.get_all_for_provider(provider)
def clear_cache(self) -> None:
"""Clear the upload cache."""
if self.upload_cache is not None:
self.upload_cache.clear()
def create_resolver(
provider: str | None = None,
prefer_upload: bool = False,
upload_threshold_bytes: int | None = None,
enable_cache: bool = True,
) -> FileResolver:
"""Create a configured FileResolver.
Args:
provider: Optional provider name for provider-specific configuration.
prefer_upload: Whether to prefer upload over inline.
upload_threshold_bytes: Size threshold for using upload.
enable_cache: Whether to enable upload caching.
Returns:
Configured FileResolver instance.
"""
config = FileResolverConfig(
prefer_upload=prefer_upload,
upload_threshold_bytes=upload_threshold_bytes,
)
cache = UploadCache() if enable_cache else None
return FileResolver(config=config, upload_cache=cache)
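def _example_resolution_flow() -> None:
    """Hedged usage sketch (editor addition, not part of this diff).
    Shows the decision made by the resolver above: small files come back inline,
    larger ones are uploaded when the provider supports a Files API. The image
    bytes and provider choice are illustrative only.
    """
    from crewai.utilities.files.file import FileBytes
    image = ImageFile(source=FileBytes(data=b"\x89PNG\r\n\x1a\n...", filename="logo.png"))
    resolver = create_resolver(provider="openai", prefer_upload=False)
    resolved = resolver.resolve(image, "openai")
    if isinstance(resolved, FileReference):
        _ = resolved.file_id  # reference an already-uploaded file
    elif isinstance(resolved, InlineBase64):
        _ = resolved.data  # embed as base64 / data: URL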

View File

@@ -1,166 +0,0 @@
"""Anthropic Files API uploader implementation."""
from __future__ import annotations
import io
import logging
import os
from typing import Any
from crewai.utilities.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.utilities.files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
class AnthropicFileUploader(FileUploader):
"""Uploader for Anthropic Files API.
Uses the anthropic SDK to upload files. Files are stored persistently
until explicitly deleted.
Attributes:
api_key: Optional API key (uses ANTHROPIC_API_KEY env var if not provided).
"""
def __init__(self, api_key: str | None = None) -> None:
"""Initialize the Anthropic uploader.
Args:
api_key: Optional Anthropic API key. If not provided, uses
ANTHROPIC_API_KEY environment variable.
"""
self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
self._client: Any = None
@property
def provider_name(self) -> str:
"""Return the provider name."""
return "anthropic"
def _get_client(self) -> Any:
"""Get or create the Anthropic client."""
if self._client is None:
try:
import anthropic
self._client = anthropic.Anthropic(api_key=self._api_key)
except ImportError as e:
raise ImportError(
"anthropic is required for Anthropic file uploads. "
"Install with: pip install anthropic"
) from e
return self._client
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
"""Upload a file to Anthropic.
Args:
file: The file to upload.
purpose: Optional purpose for the file (default: "user_upload").
Returns:
UploadResult with the file ID and metadata.
Raises:
Exception: If upload fails.
"""
client = self._get_client()
content = file.read()
file_purpose = purpose or "user_upload"
file_data = io.BytesIO(content)
logger.info(
f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)"
)
uploaded_file = client.files.create(
file=(file.filename, file_data, file.content_type),
purpose=file_purpose,
)
logger.info(f"Uploaded to Anthropic: {uploaded_file.id}")
return UploadResult(
file_id=uploaded_file.id,
file_uri=None,
content_type=file.content_type,
expires_at=None,
provider=self.provider_name,
)
def delete(self, file_id: str) -> bool:
"""Delete an uploaded file from Anthropic.
Args:
file_id: The file ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_client()
client.files.delete(file_id=file_id)
logger.info(f"Deleted Anthropic file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete Anthropic file {file_id}: {e}")
return False
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
"""Get information about an uploaded file.
Args:
file_id: The file ID.
Returns:
Dictionary with file information, or None if not found.
"""
try:
client = self._get_client()
file_info = client.files.retrieve(file_id=file_id)
return {
"id": file_info.id,
"filename": file_info.filename,
"purpose": file_info.purpose,
"size_bytes": file_info.size_bytes,
"created_at": file_info.created_at,
}
except Exception as e:
logger.debug(f"Failed to get Anthropic file info for {file_id}: {e}")
return None
def list_files(self) -> list[dict[str, Any]]:
"""List all uploaded files.
Returns:
List of dictionaries with file information.
"""
try:
client = self._get_client()
files = client.files.list()
return [
{
"id": f.id,
"filename": f.filename,
"purpose": f.purpose,
"size_bytes": f.size_bytes,
"created_at": f.created_at,
}
for f in files.data
]
except Exception as e:
logger.warning(f"Failed to list Anthropic files: {e}")
return []

View File

@@ -1,217 +0,0 @@
"""Gemini File API uploader implementation."""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
import io
import logging
import os
from typing import Any
from crewai.utilities.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.utilities.files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
GEMINI_FILE_TTL = timedelta(hours=48)
class GeminiFileUploader(FileUploader):
"""Uploader for Google Gemini File API.
Uses the google-genai SDK to upload files. Files are stored for 48 hours.
Attributes:
api_key: Optional API key (uses GOOGLE_API_KEY env var if not provided).
"""
def __init__(self, api_key: str | None = None) -> None:
"""Initialize the Gemini uploader.
Args:
api_key: Optional Google API key. If not provided, uses
GOOGLE_API_KEY environment variable.
"""
self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
self._client: Any = None
@property
def provider_name(self) -> str:
"""Return the provider name."""
return "gemini"
def _get_client(self) -> Any:
"""Get or create the Gemini client."""
if self._client is None:
try:
from google import genai
self._client = genai.Client(api_key=self._api_key)
except ImportError as e:
raise ImportError(
"google-genai is required for Gemini file uploads. "
"Install with: pip install google-genai"
) from e
return self._client
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
"""Upload a file to Gemini.
Args:
file: The file to upload.
purpose: Optional purpose/description (used as display name).
Returns:
UploadResult with the file URI and metadata.
Raises:
Exception: If upload fails.
"""
client = self._get_client()
content = file.read()
display_name = purpose or file.filename
file_data = io.BytesIO(content)
file_data.name = file.filename
logger.info(
f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)"
)
uploaded_file = client.files.upload(
file=file_data,
config={
"display_name": display_name,
"mime_type": file.content_type,
},
)
expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL
logger.info(
f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})"
)
return UploadResult(
file_id=uploaded_file.name,
file_uri=uploaded_file.uri,
content_type=file.content_type,
expires_at=expires_at,
provider=self.provider_name,
)
def delete(self, file_id: str) -> bool:
"""Delete an uploaded file from Gemini.
Args:
file_id: The file name/ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_client()
client.files.delete(name=file_id)
logger.info(f"Deleted Gemini file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete Gemini file {file_id}: {e}")
return False
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
"""Get information about an uploaded file.
Args:
file_id: The file name/ID.
Returns:
Dictionary with file information, or None if not found.
"""
try:
client = self._get_client()
file_info = client.files.get(name=file_id)
return {
"name": file_info.name,
"uri": file_info.uri,
"display_name": file_info.display_name,
"mime_type": file_info.mime_type,
"size_bytes": file_info.size_bytes,
"state": str(file_info.state),
"create_time": file_info.create_time,
"expiration_time": file_info.expiration_time,
}
except Exception as e:
logger.debug(f"Failed to get Gemini file info for {file_id}: {e}")
return None
def list_files(self) -> list[dict[str, Any]]:
"""List all uploaded files.
Returns:
List of dictionaries with file information.
"""
try:
client = self._get_client()
files = client.files.list()
return [
{
"name": f.name,
"uri": f.uri,
"display_name": f.display_name,
"mime_type": f.mime_type,
"size_bytes": f.size_bytes,
"state": str(f.state),
}
for f in files
]
except Exception as e:
logger.warning(f"Failed to list Gemini files: {e}")
return []
def wait_for_processing(self, file_id: str, timeout_seconds: int = 300) -> bool:
"""Wait for a file to finish processing.
Some files (especially videos) need time to process after upload.
Args:
file_id: The file name/ID.
timeout_seconds: Maximum time to wait.
Returns:
True if processing completed, False if timed out or failed.
"""
import time
try:
from google.genai.types import FileState
except ImportError:
return True
client = self._get_client()
start_time = time.time()
while time.time() - start_time < timeout_seconds:
file_info = client.files.get(name=file_id)
if file_info.state == FileState.ACTIVE:
return True
if file_info.state == FileState.FAILED:
logger.error(f"Gemini file processing failed: {file_id}")
return False
time.sleep(2)
logger.warning(f"Timed out waiting for Gemini file processing: {file_id}")
return False
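# Hedged usage sketch (editor note, not part of this diff): videos often need the
# processing wait above before they can be referenced. my_video is a hypothetical
# VideoFile instance.
#
#   uploader = GeminiFileUploader()
#   result = uploader.upload(my_video)
#   if uploader.wait_for_processing(result.file_id):
#       ...  # file is ACTIVE; reference it via result.file_uri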

View File

@@ -1,169 +0,0 @@
"""OpenAI Files API uploader implementation."""
from __future__ import annotations
import io
import logging
import os
from typing import Any
from crewai.utilities.files.content_types import (
AudioFile,
File,
ImageFile,
PDFFile,
TextFile,
VideoFile,
)
from crewai.utilities.files.uploaders.base import FileUploader, UploadResult
logger = logging.getLogger(__name__)
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
class OpenAIFileUploader(FileUploader):
"""Uploader for OpenAI Files API.
Uses the OpenAI SDK to upload files. Files are stored persistently
until explicitly deleted.
Attributes:
api_key: Optional API key (uses OPENAI_API_KEY env var if not provided).
"""
def __init__(self, api_key: str | None = None) -> None:
"""Initialize the OpenAI uploader.
Args:
api_key: Optional OpenAI API key. If not provided, uses
OPENAI_API_KEY environment variable.
"""
self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
self._client: Any = None
@property
def provider_name(self) -> str:
"""Return the provider name."""
return "openai"
def _get_client(self) -> Any:
"""Get or create the OpenAI client."""
if self._client is None:
try:
from openai import OpenAI
self._client = OpenAI(api_key=self._api_key)
except ImportError as e:
raise ImportError(
"openai is required for OpenAI file uploads. "
"Install with: pip install openai"
) from e
return self._client
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
"""Upload a file to OpenAI.
Args:
file: The file to upload.
purpose: Optional purpose for the file (default: "user_data").
Returns:
UploadResult with the file ID and metadata.
Raises:
Exception: If upload fails.
"""
client = self._get_client()
content = file.read()
file_purpose = purpose or "user_data"
file_data = io.BytesIO(content)
file_data.name = file.filename or "file"
logger.info(
f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)"
)
uploaded_file = client.files.create(
file=file_data,
purpose=file_purpose,
)
logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")
return UploadResult(
file_id=uploaded_file.id,
file_uri=None,
content_type=file.content_type,
expires_at=None,
provider=self.provider_name,
)
def delete(self, file_id: str) -> bool:
"""Delete an uploaded file from OpenAI.
Args:
file_id: The file ID to delete.
Returns:
True if deletion was successful, False otherwise.
"""
try:
client = self._get_client()
client.files.delete(file_id)
logger.info(f"Deleted OpenAI file: {file_id}")
return True
except Exception as e:
logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
return False
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
"""Get information about an uploaded file.
Args:
file_id: The file ID.
Returns:
Dictionary with file information, or None if not found.
"""
try:
client = self._get_client()
file_info = client.files.retrieve(file_id)
return {
"id": file_info.id,
"filename": file_info.filename,
"purpose": file_info.purpose,
"bytes": file_info.bytes,
"created_at": file_info.created_at,
"status": file_info.status,
}
except Exception as e:
logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}")
return None
def list_files(self) -> list[dict[str, Any]]:
"""List all uploaded files.
Returns:
List of dictionaries with file information.
"""
try:
client = self._get_client()
files = client.files.list()
return [
{
"id": f.id,
"filename": f.filename,
"purpose": f.purpose,
"bytes": f.bytes,
"created_at": f.created_at,
"status": f.status,
}
for f in files.data
]
except Exception as e:
logger.warning(f"Failed to list OpenAI files: {e}")
return []

View File

@@ -2,7 +2,7 @@
from typing import Any, Literal, TypedDict
from crewai.utilities.files import FileInput
from crewai.files import FileInput
class LLMMessage(TypedDict):

View File

@@ -2,7 +2,7 @@
import pytest
from crewai.utilities.files.processing.constraints import (
from crewai.files.processing.constraints import (
ANTHROPIC_CONSTRAINTS,
BEDROCK_CONSTRAINTS,
GEMINI_CONSTRAINTS,

View File

@@ -2,19 +2,19 @@
import pytest
from crewai.utilities.files import FileBytes, ImageFile, PDFFile, TextFile
from crewai.utilities.files.processing.constraints import (
from crewai.files import FileBytes, ImageFile, PDFFile, TextFile
from crewai.files.processing.constraints import (
ANTHROPIC_CONSTRAINTS,
ImageConstraints,
PDFConstraints,
ProviderConstraints,
)
from crewai.utilities.files.processing.enums import FileHandling
from crewai.utilities.files.processing.exceptions import (
from crewai.files.processing.enums import FileHandling
from crewai.files.processing.exceptions import (
FileTooLargeError,
FileValidationError,
)
from crewai.utilities.files.processing.processor import FileProcessor
from crewai.files.processing.processor import FileProcessor
# Minimal valid PNG: 8x8 pixel RGB image (valid for PIL)

View File

@@ -0,0 +1,359 @@
"""Unit tests for file transformers."""
import io
from unittest.mock import MagicMock, patch
import pytest
from crewai.files import ImageFile, PDFFile, TextFile
from crewai.files.file import FileBytes
from crewai.files.processing.exceptions import ProcessingDependencyError
from crewai.files.processing.transformers import (
chunk_pdf,
chunk_text,
get_image_dimensions,
get_pdf_page_count,
optimize_image,
resize_image,
)
def create_test_png(width: int = 100, height: int = 100) -> bytes:
"""Create a minimal valid PNG for testing."""
from PIL import Image
img = Image.new("RGB", (width, height), color="red")
buffer = io.BytesIO()
img.save(buffer, format="PNG")
return buffer.getvalue()
def create_test_pdf(num_pages: int = 1) -> bytes:
"""Create a minimal valid PDF for testing."""
from pypdf import PdfWriter
writer = PdfWriter()
for _ in range(num_pages):
writer.add_blank_page(width=612, height=792)
buffer = io.BytesIO()
writer.write(buffer)
return buffer.getvalue()
class TestResizeImage:
"""Tests for resize_image function."""
def test_resize_larger_image(self) -> None:
"""Test resizing an image larger than max dimensions."""
png_bytes = create_test_png(200, 150)
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
result = resize_image(img, max_width=100, max_height=100)
dims = get_image_dimensions(result)
assert dims is not None
width, height = dims
assert width <= 100
assert height <= 100
def test_no_resize_if_within_bounds(self) -> None:
"""Test that small images are returned unchanged."""
png_bytes = create_test_png(50, 50)
img = ImageFile(source=FileBytes(data=png_bytes, filename="small.png"))
result = resize_image(img, max_width=100, max_height=100)
assert result is img
def test_preserve_aspect_ratio(self) -> None:
"""Test that aspect ratio is preserved during resize."""
png_bytes = create_test_png(200, 100)
img = ImageFile(source=FileBytes(data=png_bytes, filename="wide.png"))
result = resize_image(img, max_width=100, max_height=100)
dims = get_image_dimensions(result)
assert dims is not None
width, height = dims
assert width == 100
assert height == 50
def test_resize_without_aspect_ratio(self) -> None:
"""Test resizing without preserving aspect ratio."""
png_bytes = create_test_png(200, 100)
img = ImageFile(source=FileBytes(data=png_bytes, filename="wide.png"))
result = resize_image(
img, max_width=50, max_height=50, preserve_aspect_ratio=False
)
dims = get_image_dimensions(result)
assert dims is not None
width, height = dims
assert width == 50
assert height == 50
def test_resize_returns_image_file(self) -> None:
"""Test that resize returns an ImageFile instance."""
png_bytes = create_test_png(200, 200)
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
result = resize_image(img, max_width=100, max_height=100)
assert isinstance(result, ImageFile)
def test_raises_without_pillow(self) -> None:
"""Test that ProcessingDependencyError is raised without Pillow."""
img = ImageFile(source=FileBytes(data=b"fake", filename="test.png"))
with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
with pytest.raises(ProcessingDependencyError) as exc_info:
# Force reimport to trigger ImportError
import importlib
import crewai.files.processing.transformers as t
importlib.reload(t)
t.resize_image(img, 100, 100)
assert "Pillow" in str(exc_info.value)
class TestOptimizeImage:
"""Tests for optimize_image function."""
def test_optimize_reduces_size(self) -> None:
"""Test that optimization reduces file size."""
png_bytes = create_test_png(500, 500)
original_size = len(png_bytes)
img = ImageFile(source=FileBytes(data=png_bytes, filename="large.png"))
result = optimize_image(img, target_size_bytes=original_size // 2)
result_size = len(result.read())
assert result_size < original_size
def test_no_optimize_if_under_target(self) -> None:
"""Test that small images are returned unchanged."""
png_bytes = create_test_png(50, 50)
img = ImageFile(source=FileBytes(data=png_bytes, filename="small.png"))
result = optimize_image(img, target_size_bytes=1024 * 1024)
assert result is img
def test_optimize_returns_image_file(self) -> None:
"""Test that optimize returns an ImageFile instance."""
png_bytes = create_test_png(200, 200)
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
result = optimize_image(img, target_size_bytes=100)
assert isinstance(result, ImageFile)
def test_optimize_respects_min_quality(self) -> None:
"""Test that optimization stops at minimum quality."""
png_bytes = create_test_png(100, 100)
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
# Request impossibly small size - should stop at min quality
result = optimize_image(img, target_size_bytes=10, min_quality=50)
assert isinstance(result, ImageFile)
assert len(result.read()) > 10
class TestChunkPdf:
"""Tests for chunk_pdf function."""
def test_chunk_splits_large_pdf(self) -> None:
"""Test that large PDFs are split into chunks."""
pdf_bytes = create_test_pdf(num_pages=10)
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="large.pdf"))
result = list(chunk_pdf(pdf, max_pages=3))
assert len(result) == 4
assert all(isinstance(chunk, PDFFile) for chunk in result)
def test_no_chunk_if_within_limit(self) -> None:
"""Test that small PDFs are returned unchanged."""
pdf_bytes = create_test_pdf(num_pages=3)
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="small.pdf"))
result = list(chunk_pdf(pdf, max_pages=5))
assert len(result) == 1
assert result[0] is pdf
def test_chunk_filenames(self) -> None:
"""Test that chunked files have indexed filenames."""
pdf_bytes = create_test_pdf(num_pages=6)
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="document.pdf"))
result = list(chunk_pdf(pdf, max_pages=2))
assert result[0].filename == "document_chunk_0.pdf"
assert result[1].filename == "document_chunk_1.pdf"
assert result[2].filename == "document_chunk_2.pdf"
def test_chunk_with_overlap(self) -> None:
"""Test chunking with overlapping pages."""
pdf_bytes = create_test_pdf(num_pages=10)
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="doc.pdf"))
result = list(chunk_pdf(pdf, max_pages=4, overlap_pages=1))
# With overlap, we get more chunks
assert len(result) >= 3
def test_chunk_page_counts(self) -> None:
"""Test that each chunk has correct page count."""
pdf_bytes = create_test_pdf(num_pages=7)
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="doc.pdf"))
result = list(chunk_pdf(pdf, max_pages=3))
page_counts = [get_pdf_page_count(chunk) for chunk in result]
assert page_counts == [3, 3, 1]
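# A minimal sketch of the windowing these page counts assume (hypothetical, not the
# library code): step = max(max_pages - overlap_pages, 1) and chunk starts at
# range(0, total_pages, step); for 7 pages with max_pages=3 and no overlap that gives
# starts 0, 3, 6 and chunk sizes 3, 3, 1.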
class TestChunkText:
"""Tests for chunk_text function."""
def test_chunk_splits_large_text(self) -> None:
"""Test that large text files are split into chunks."""
content = "Hello world. " * 100
text = TextFile(source=content.encode(), filename="large.txt")
result = list(chunk_text(text, max_chars=200, overlap_chars=0))
assert len(result) > 1
assert all(isinstance(chunk, TextFile) for chunk in result)
def test_no_chunk_if_within_limit(self) -> None:
"""Test that small text files are returned unchanged."""
content = "Short text"
text = TextFile(source=content.encode(), filename="small.txt")
result = list(chunk_text(text, max_chars=1000, overlap_chars=0))
assert len(result) == 1
assert result[0] is text
def test_chunk_filenames(self) -> None:
"""Test that chunked files have indexed filenames."""
content = "A" * 500
text = TextFile(source=FileBytes(data=content.encode(), filename="data.txt"))
result = list(chunk_text(text, max_chars=200, overlap_chars=0))
assert result[0].filename == "data_chunk_0.txt"
assert result[1].filename == "data_chunk_1.txt"
assert len(result) == 3
def test_chunk_preserves_extension(self) -> None:
"""Test that file extension is preserved in chunks."""
content = "A" * 500
text = TextFile(source=FileBytes(data=content.encode(), filename="script.py"))
result = list(chunk_text(text, max_chars=200, overlap_chars=0))
assert all(chunk.filename.endswith(".py") for chunk in result)
def test_chunk_prefers_newline_boundaries(self) -> None:
"""Test that chunking prefers to split at newlines."""
content = "Line one\nLine two\nLine three\nLine four\nLine five"
text = TextFile(source=content.encode(), filename="lines.txt")
result = list(chunk_text(text, max_chars=25, overlap_chars=0, split_on_newlines=True))
# Should split at newline boundaries
for chunk in result:
chunk_text_content = chunk.read().decode()
# Chunks should end at newlines (except possibly the last)
if chunk != result[-1]:
assert chunk_text_content.endswith("\n") or len(chunk_text_content) <= 25
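# Presumed splitting rule (an assumption, not verified against the implementation):
# the chunker scans backward from the max_chars cutoff for the nearest newline and
# falls back to a hard character split when no newline is available.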
def test_chunk_with_overlap(self) -> None:
"""Test chunking with overlapping characters."""
content = "ABCDEFGHIJ" * 10
text = TextFile(source=content.encode(), filename="data.txt")
result = list(chunk_text(text, max_chars=30, overlap_chars=5))
# With overlap, chunks should share some content
assert len(result) >= 3
def test_chunk_overlap_larger_than_max_chars(self) -> None:
"""Test that overlap > max_chars doesn't cause infinite loop."""
content = "A" * 100
text = TextFile(source=content.encode(), filename="data.txt")
# overlap_chars > max_chars should still work (just with max overlap)
result = list(chunk_text(text, max_chars=20, overlap_chars=50))
assert len(result) > 1
# Should still complete without hanging
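# The no-hang guarantee above presumably comes from clamping the advance per chunk
# to max(max_chars - overlap_chars, 1), so an overlap larger than max_chars still
# moves forward at least one character per iteration (assumption, not the actual code).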
class TestGetImageDimensions:
"""Tests for get_image_dimensions function."""
def test_get_dimensions(self) -> None:
"""Test getting image dimensions."""
png_bytes = create_test_png(150, 100)
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
dims = get_image_dimensions(img)
assert dims == (150, 100)
def test_returns_none_for_invalid_image(self) -> None:
"""Test that None is returned for invalid image data."""
img = ImageFile(source=FileBytes(data=b"not an image", filename="bad.png"))
dims = get_image_dimensions(img)
assert dims is None
def test_returns_none_without_pillow(self) -> None:
"""Test that None is returned when Pillow is not installed."""
png_bytes = create_test_png(100, 100)
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
with patch.dict("sys.modules", {"PIL": None}):
# Can't easily test this without unloading the already-imported module;
# the expectation is simply that the function returns None instead of raising.
pass
class TestGetPdfPageCount:
"""Tests for get_pdf_page_count function."""
def test_get_page_count(self) -> None:
"""Test getting PDF page count."""
pdf_bytes = create_test_pdf(num_pages=5)
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="test.pdf"))
count = get_pdf_page_count(pdf)
assert count == 5
def test_single_page(self) -> None:
"""Test page count for single page PDF."""
pdf_bytes = create_test_pdf(num_pages=1)
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="single.pdf"))
count = get_pdf_page_count(pdf)
assert count == 1
def test_returns_none_for_invalid_pdf(self) -> None:
"""Test that None is returned for invalid PDF data."""
pdf = PDFFile(source=FileBytes(data=b"not a pdf", filename="bad.pdf"))
count = get_pdf_page_count(pdf)
assert count is None

View File

@@ -2,19 +2,19 @@
import pytest
from crewai.utilities.files import FileBytes, ImageFile, PDFFile, TextFile
from crewai.utilities.files.processing.constraints import (
from crewai.files import FileBytes, ImageFile, PDFFile, TextFile
from crewai.files.processing.constraints import (
ANTHROPIC_CONSTRAINTS,
ImageConstraints,
PDFConstraints,
ProviderConstraints,
)
from crewai.utilities.files.processing.exceptions import (
from crewai.files.processing.exceptions import (
FileTooLargeError,
FileValidationError,
UnsupportedFileTypeError,
)
from crewai.utilities.files.processing.validators import (
from crewai.files.processing.validators import (
validate_file,
validate_image,
validate_pdf,

View File

@@ -4,7 +4,7 @@ from datetime import datetime, timezone
import pytest
from crewai.utilities.files.resolved import (
from crewai.files.resolved import (
FileReference,
InlineBase64,
InlineBytes,

View File

@@ -2,14 +2,14 @@
import pytest
from crewai.utilities.files import FileBytes, ImageFile
from crewai.utilities.files.resolved import InlineBase64, InlineBytes
from crewai.utilities.files.resolver import (
from crewai.files import FileBytes, ImageFile
from crewai.files.resolved import InlineBase64, InlineBytes
from crewai.files.resolver import (
FileResolver,
FileResolverConfig,
create_resolver,
)
from crewai.utilities.files.upload_cache import UploadCache
from crewai.files.upload_cache import UploadCache
# Minimal valid PNG

View File

@@ -4,8 +4,8 @@ from datetime import datetime, timedelta, timezone
import pytest
from crewai.utilities.files import FileBytes, ImageFile
from crewai.utilities.files.upload_cache import CachedUpload, UploadCache
from crewai.files import FileBytes, ImageFile
from crewai.files.upload_cache import CachedUpload, UploadCache
# Minimal valid PNG

View File

@@ -0,0 +1,5 @@
Quarter,Revenue ($M),Expenses ($M),Profit ($M)
Q1 2024,70,40,30
Q2 2024,75,42,33
Q3 2024,80,45,35
Q4 2024,75,44,31

Binary file not shown.


View File

@@ -0,0 +1,10 @@
Review Guidelines
1. Be clear and concise: Write feedback that is easy to understand.
2. Focus on behavior and outcomes: Describe what happened and why it matters.
3. Be specific: Provide examples to support your points.
4. Balance positives and improvements: Highlight strengths and areas to grow.
5. Be respectful and constructive: Assume positive intent and offer solutions.
6. Use objective criteria: Reference goals, metrics, or expectations where possible.
7. Suggest next steps: Recommend actionable ways to improve.
8. Proofread: Check tone, grammar, and clarity before submitting.

View File

@@ -7,7 +7,7 @@ from unittest.mock import patch
import pytest
from crewai.llm import LLM
from crewai.utilities.files import ImageFile, PDFFile, TextFile
from crewai.files import ImageFile, PDFFile, TextFile
# Check for optional provider dependencies
try:

View File

@@ -9,7 +9,7 @@ from pathlib import Path
import pytest
from crewai.llm import LLM
from crewai.utilities.files import File, ImageFile, PDFFile, TextFile
from crewai.files import File, ImageFile, PDFFile, TextFile
# Path to test data files

View File

@@ -5,7 +5,7 @@ import base64
import pytest
from crewai.tools.agent_tools.read_file_tool import ReadFileTool
from crewai.utilities.files import ImageFile, PDFFile, TextFile
from crewai.files import ImageFile, PDFFile, TextFile
class TestReadFileTool:

View File

@@ -13,7 +13,7 @@ from crewai.utilities.file_store import (
store_files,
store_task_files,
)
from crewai.utilities.files import TextFile
from crewai.files import TextFile
class TestFileStore:

View File

@@ -6,7 +6,7 @@ from pathlib import Path
import pytest
from crewai.utilities.files import (
from crewai.files import (
AudioFile,
File,
FileBytes,
@@ -20,7 +20,7 @@ from crewai.utilities.files import (
normalize_input_files,
wrap_file_source,
)
from crewai.utilities.files.file import detect_content_type
from crewai.files.file import detect_content_type
class TestDetectContentType:
@@ -34,7 +34,7 @@ class TestDetectContentType:
def test_detect_json(self) -> None:
"""Test detection of JSON content."""
result = detect_content_type(b'{"key": "value"}')
assert result in ("text/plain", "application/json")
assert result == "application/json"
def test_detect_png(self) -> None:
"""Test detection of PNG content."""