mirror of https://github.com/crewAIInc/crewAI.git
synced 2026-05-02 15:52:34 +00:00

refactor: extract files module to standalone crewai-files package
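
For downstream code the change is mechanical: the public file-handling names move from the crewai.files namespace to the standalone crewai_files distribution. A minimal migration sketch (illustrative; assumes the new package is installed, e.g. via the file-processing extra shown below):

# before this commit:
# from crewai.files import File, FileProcessor, get_upload_cache
# after this commit:
from crewai_files import File, FileProcessor, get_upload_cache

pdf = File(source="./document.pdf")  # content type is auto-detected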
@@ -99,13 +99,7 @@ a2a = [
    "aiocache[redis,memcached]~=0.12.3",
]
file-processing = [
    "Pillow~=10.4.0",
    "pypdf~=4.0.0",
    "python-magic>=0.4.27",
    "aiocache~=0.12.3",
    "aiofiles~=24.1.0",
    "tinytag~=1.10.0",
    "av~=13.0.0",
    "crewai-files",
]


@@ -133,6 +127,7 @@ torchvision = [
    { index = "pytorch-nightly", marker = "python_version >= '3.13'" },
    { index = "pytorch", marker = "python_version < '3.13'" },
]
crewai-files = { workspace = true }


[build-system]
@@ -3,10 +3,7 @@ from typing import Any
import urllib.request
import warnings

from crewai.agent.core import Agent
from crewai.crew import Crew
from crewai.crews.crew_output import CrewOutput
from crewai.files import (
from crewai_files import (
    AudioFile,
    File,
    ImageFile,
@@ -14,6 +11,10 @@ from crewai.files import (
    TextFile,
    VideoFile,
)

from crewai.agent.core import Agent
from crewai.crew import Crew
from crewai.crews.crew_output import CrewOutput
from crewai.flow.flow import Flow
from crewai.knowledge.knowledge import Knowledge
from crewai.llm import LLM
@@ -10,6 +10,7 @@ from collections.abc import Callable
import logging
from typing import TYPE_CHECKING, Any, Literal, cast

from crewai_files import FileProcessor
from pydantic import BaseModel, GetCoreSchemaHandler, ValidationError
from pydantic_core import CoreSchema, core_schema

@@ -24,7 +25,6 @@ from crewai.events.types.logging_events import (
    AgentLogsExecutionEvent,
    AgentLogsStartedEvent,
)
from crewai.files import FileProcessor
from crewai.hooks.llm_hooks import (
    get_after_llm_call_hooks,
    get_before_llm_call_hooks,
@@ -238,7 +238,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
        processor = FileProcessor(constraints=provider)
        files = processor.process_files(files)

        from crewai.files import get_upload_cache
        from crewai_files import get_upload_cache

        upload_cache = get_upload_cache()
        content_blocks = self.llm.format_multimodal_content(
@@ -280,7 +280,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
        processor = FileProcessor(constraints=provider)
        files = await processor.aprocess_files(files)

        from crewai.files import get_upload_cache
        from crewai_files import get_upload_cache

        upload_cache = get_upload_cache()
        content_blocks = await self.llm.aformat_multimodal_content(
@@ -6,15 +6,16 @@ import asyncio
from collections.abc import Callable, Coroutine, Iterable
from typing import TYPE_CHECKING, Any

from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.crews.crew_output import CrewOutput
from crewai.files import (
from crewai_files import (
    AudioFile,
    ImageFile,
    PDFFile,
    TextFile,
    VideoFile,
)

from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.crews.crew_output import CrewOutput
from crewai.rag.embeddings.types import EmbedderConfig
from crewai.types.streaming import CrewStreamingOutput, FlowStreamingOutput
from crewai.utilities.file_store import store_files
@@ -1,145 +0,0 @@
"""File handling utilities for crewAI tasks."""

from crewai.files.cleanup import (
    cleanup_expired_files,
    cleanup_provider_files,
    cleanup_uploaded_files,
)
from crewai.files.content_types import (
    AudioExtension,
    AudioFile,
    AudioMimeType,
    BaseFile,
    File,
    FileInput,
    FileMode,
    ImageExtension,
    ImageFile,
    ImageMimeType,
    PDFContentType,
    PDFExtension,
    PDFFile,
    TextContentType,
    TextExtension,
    TextFile,
    VideoExtension,
    VideoFile,
    VideoMimeType,
)
from crewai.files.file import (
    FileBytes,
    FilePath,
    FileSource,
    FileSourceInput,
    FileStream,
    FileUrl,
    RawFileInput,
)
from crewai.files.processing import (
    ANTHROPIC_CONSTRAINTS,
    BEDROCK_CONSTRAINTS,
    GEMINI_CONSTRAINTS,
    OPENAI_CONSTRAINTS,
    AudioConstraints,
    FileHandling,
    FileProcessingError,
    FileProcessor,
    FileTooLargeError,
    FileValidationError,
    ImageConstraints,
    PDFConstraints,
    ProcessingDependencyError,
    ProviderConstraints,
    UnsupportedFileTypeError,
    VideoConstraints,
    get_constraints_for_provider,
)
from crewai.files.resolved import (
    FileReference,
    InlineBase64,
    InlineBytes,
    ResolvedFile,
    ResolvedFileType,
    UrlReference,
)
from crewai.files.resolver import (
    FileResolver,
    FileResolverConfig,
    create_resolver,
)
from crewai.files.upload_cache import (
    CachedUpload,
    UploadCache,
    get_upload_cache,
    reset_upload_cache,
)
from crewai.files.uploaders import FileUploader, UploadResult, get_uploader
from crewai.files.utils import normalize_input_files, wrap_file_source


__all__ = [
    "ANTHROPIC_CONSTRAINTS",
    "BEDROCK_CONSTRAINTS",
    "GEMINI_CONSTRAINTS",
    "OPENAI_CONSTRAINTS",
    "AudioConstraints",
    "AudioExtension",
    "AudioFile",
    "AudioMimeType",
    "BaseFile",
    "CachedUpload",
    "File",
    "FileBytes",
    "FileHandling",
    "FileInput",
    "FileMode",
    "FilePath",
    "FileProcessingError",
    "FileProcessor",
    "FileReference",
    "FileResolver",
    "FileResolverConfig",
    "FileSource",
    "FileSourceInput",
    "FileStream",
    "FileTooLargeError",
    "FileUploader",
    "FileUrl",
    "FileValidationError",
    "ImageConstraints",
    "ImageExtension",
    "ImageFile",
    "ImageMimeType",
    "InlineBase64",
    "InlineBytes",
    "PDFConstraints",
    "PDFContentType",
    "PDFExtension",
    "PDFFile",
    "ProcessingDependencyError",
    "ProviderConstraints",
    "RawFileInput",
    "ResolvedFile",
    "ResolvedFileType",
    "TextContentType",
    "TextExtension",
    "TextFile",
    "UnsupportedFileTypeError",
    "UploadCache",
    "UploadResult",
    "UrlReference",
    "VideoConstraints",
    "VideoExtension",
    "VideoFile",
    "VideoMimeType",
    "cleanup_expired_files",
    "cleanup_provider_files",
    "cleanup_uploaded_files",
    "create_resolver",
    "get_constraints_for_provider",
    "get_upload_cache",
    "get_uploader",
    "normalize_input_files",
    "reset_upload_cache",
    "wrap_file_source",
]
@@ -1,373 +0,0 @@
"""Cleanup utilities for uploaded files."""

from __future__ import annotations

import asyncio
import logging
from typing import TYPE_CHECKING

from crewai.files.upload_cache import CachedUpload, UploadCache
from crewai.files.uploaders import get_uploader


if TYPE_CHECKING:
    from crewai.files.uploaders.base import FileUploader

logger = logging.getLogger(__name__)


def _safe_delete(
    uploader: FileUploader,
    file_id: str,
    provider: str,
) -> bool:
    """Safely delete a file, logging any errors.

    Args:
        uploader: The file uploader to use.
        file_id: The file ID to delete.
        provider: Provider name for logging.

    Returns:
        True if deleted successfully, False otherwise.
    """
    try:
        if uploader.delete(file_id):
            logger.debug(f"Deleted {file_id} from {provider}")
            return True
        logger.warning(f"Failed to delete {file_id} from {provider}")
        return False
    except Exception as e:
        logger.warning(f"Error deleting {file_id} from {provider}: {e}")
        return False


def cleanup_uploaded_files(
    cache: UploadCache,
    *,
    delete_from_provider: bool = True,
    providers: list[str] | None = None,
) -> int:
    """Clean up uploaded files from the cache and optionally from providers.

    Args:
        cache: The upload cache to clean up.
        delete_from_provider: If True, delete files from the provider as well.
        providers: Optional list of providers to clean up. If None, cleans all.

    Returns:
        Number of files cleaned up.
    """
    cleaned = 0

    provider_uploads: dict[str, list[CachedUpload]] = {}

    for provider in _get_providers_from_cache(cache):
        if providers is not None and provider not in providers:
            continue
        provider_uploads[provider] = cache.get_all_for_provider(provider)

    if delete_from_provider:
        for provider, uploads in provider_uploads.items():
            uploader = get_uploader(provider)
            if uploader is None:
                logger.warning(
                    f"No uploader available for {provider}, skipping cleanup"
                )
                continue

            for upload in uploads:
                if _safe_delete(uploader, upload.file_id, provider):
                    cleaned += 1

    cache.clear()

    logger.info(f"Cleaned up {cleaned} uploaded files")
    return cleaned


def cleanup_expired_files(
    cache: UploadCache,
    *,
    delete_from_provider: bool = False,
) -> int:
    """Clean up expired files from the cache.

    Args:
        cache: The upload cache to clean up.
        delete_from_provider: If True, attempt to delete from provider as well.
            Note: Expired files may already be deleted by the provider.

    Returns:
        Number of expired entries removed from cache.
    """
    expired_entries: list[CachedUpload] = []

    if delete_from_provider:
        for provider in _get_providers_from_cache(cache):
            expired_entries.extend(
                upload
                for upload in cache.get_all_for_provider(provider)
                if upload.is_expired()
            )

    removed = cache.clear_expired()

    if delete_from_provider:
        for upload in expired_entries:
            uploader = get_uploader(upload.provider)
            if uploader is not None:
                try:
                    uploader.delete(upload.file_id)
                except Exception as e:
                    logger.debug(f"Could not delete expired file {upload.file_id}: {e}")

    return removed


def cleanup_provider_files(
    provider: str,
    *,
    cache: UploadCache | None = None,
    delete_all_from_provider: bool = False,
) -> int:
    """Clean up all files for a specific provider.

    Args:
        provider: Provider name to clean up.
        cache: Optional upload cache to clear entries from.
        delete_all_from_provider: If True, delete all files from the provider,
            not just cached ones.

    Returns:
        Number of files deleted.
    """
    deleted = 0
    uploader = get_uploader(provider)

    if uploader is None:
        logger.warning(f"No uploader available for {provider}")
        return 0

    if delete_all_from_provider:
        try:
            files = uploader.list_files()
            for file_info in files:
                file_id = file_info.get("id") or file_info.get("name")
                if file_id and uploader.delete(file_id):
                    deleted += 1
        except Exception as e:
            logger.warning(f"Error listing/deleting files from {provider}: {e}")
    elif cache is not None:
        uploads = cache.get_all_for_provider(provider)
        for upload in uploads:
            if _safe_delete(uploader, upload.file_id, provider):
                deleted += 1
                cache.remove_by_file_id(upload.file_id, provider)

    logger.info(f"Deleted {deleted} files from {provider}")
    return deleted


def _get_providers_from_cache(cache: UploadCache) -> set[str]:
    """Get unique provider names from cache entries.

    Args:
        cache: The upload cache.

    Returns:
        Set of provider names.
    """
    return cache.get_providers()


async def _asafe_delete(
    uploader: FileUploader,
    file_id: str,
    provider: str,
) -> bool:
    """Async safely delete a file, logging any errors.

    Args:
        uploader: The file uploader to use.
        file_id: The file ID to delete.
        provider: Provider name for logging.

    Returns:
        True if deleted successfully, False otherwise.
    """
    try:
        if await uploader.adelete(file_id):
            logger.debug(f"Deleted {file_id} from {provider}")
            return True
        logger.warning(f"Failed to delete {file_id} from {provider}")
        return False
    except Exception as e:
        logger.warning(f"Error deleting {file_id} from {provider}: {e}")
        return False


async def acleanup_uploaded_files(
    cache: UploadCache,
    *,
    delete_from_provider: bool = True,
    providers: list[str] | None = None,
    max_concurrency: int = 10,
) -> int:
    """Async clean up uploaded files from the cache and optionally from providers.

    Args:
        cache: The upload cache to clean up.
        delete_from_provider: If True, delete files from the provider as well.
        providers: Optional list of providers to clean up. If None, cleans all.
        max_concurrency: Maximum number of concurrent delete operations.

    Returns:
        Number of files cleaned up.
    """
    cleaned = 0

    provider_uploads: dict[str, list[CachedUpload]] = {}

    for provider in _get_providers_from_cache(cache):
        if providers is not None and provider not in providers:
            continue
        provider_uploads[provider] = await cache.aget_all_for_provider(provider)

    if delete_from_provider:
        semaphore = asyncio.Semaphore(max_concurrency)

        async def delete_one(file_uploader: FileUploader, cached: CachedUpload) -> bool:
            """Delete a single file with semaphore limiting."""
            async with semaphore:
                return await _asafe_delete(
                    file_uploader, cached.file_id, cached.provider
                )

        tasks: list[asyncio.Task[bool]] = []
        for provider, uploads in provider_uploads.items():
            uploader = get_uploader(provider)
            if uploader is None:
                logger.warning(
                    f"No uploader available for {provider}, skipping cleanup"
                )
                continue

            tasks.extend(
                asyncio.create_task(delete_one(uploader, cached)) for cached in uploads
            )

        results = await asyncio.gather(*tasks, return_exceptions=True)
        cleaned = sum(1 for r in results if r is True)

    await cache.aclear()

    logger.info(f"Cleaned up {cleaned} uploaded files")
    return cleaned


async def acleanup_expired_files(
    cache: UploadCache,
    *,
    delete_from_provider: bool = False,
    max_concurrency: int = 10,
) -> int:
    """Async clean up expired files from the cache.

    Args:
        cache: The upload cache to clean up.
        delete_from_provider: If True, attempt to delete from provider as well.
        max_concurrency: Maximum number of concurrent delete operations.

    Returns:
        Number of expired entries removed from cache.
    """
    expired_entries: list[CachedUpload] = []

    if delete_from_provider:
        for provider in _get_providers_from_cache(cache):
            uploads = await cache.aget_all_for_provider(provider)
            expired_entries.extend(upload for upload in uploads if upload.is_expired())

    removed = await cache.aclear_expired()

    if delete_from_provider and expired_entries:
        semaphore = asyncio.Semaphore(max_concurrency)

        async def delete_expired(cached: CachedUpload) -> None:
            """Delete an expired file with semaphore limiting."""
            async with semaphore:
                file_uploader = get_uploader(cached.provider)
                if file_uploader is not None:
                    try:
                        await file_uploader.adelete(cached.file_id)
                    except Exception as e:
                        logger.debug(
                            f"Could not delete expired file {cached.file_id}: {e}"
                        )

        await asyncio.gather(
            *[delete_expired(cached) for cached in expired_entries],
            return_exceptions=True,
        )

    return removed


async def acleanup_provider_files(
    provider: str,
    *,
    cache: UploadCache | None = None,
    delete_all_from_provider: bool = False,
    max_concurrency: int = 10,
) -> int:
    """Async clean up all files for a specific provider.

    Args:
        provider: Provider name to clean up.
        cache: Optional upload cache to clear entries from.
        delete_all_from_provider: If True, delete all files from the provider.
        max_concurrency: Maximum number of concurrent delete operations.

    Returns:
        Number of files deleted.
    """
    deleted = 0
    uploader = get_uploader(provider)

    if uploader is None:
        logger.warning(f"No uploader available for {provider}")
        return 0

    semaphore = asyncio.Semaphore(max_concurrency)

    async def delete_single(target_file_id: str) -> bool:
        """Delete a single file with semaphore limiting."""
        async with semaphore:
            return await uploader.adelete(target_file_id)

    if delete_all_from_provider:
        try:
            files = uploader.list_files()
            tasks = []
            for file_info in files:
                fid = file_info.get("id") or file_info.get("name")
                if fid:
                    tasks.append(delete_single(fid))
            results = await asyncio.gather(*tasks, return_exceptions=True)
            deleted = sum(1 for r in results if r is True)
        except Exception as e:
            logger.warning(f"Error listing/deleting files from {provider}: {e}")
    elif cache is not None:
        uploads = await cache.aget_all_for_provider(provider)
        tasks = []
        for upload in uploads:
            tasks.append(delete_single(upload.file_id))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for upload, result in zip(uploads, results, strict=False):
            if result is True:
                deleted += 1
                await cache.aremove_by_file_id(upload.file_id, provider)

    logger.info(f"Deleted {deleted} files from {provider}")
    return deleted
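
As a reference point, a minimal sketch of driving the cleanup API above (names as defined in this file; get_upload_cache comes from the package's public exports):

from crewai_files import cleanup_uploaded_files, get_upload_cache

cache = get_upload_cache()
# Delete cached uploads from OpenAI only, then clear the cache entries.
removed = cleanup_uploaded_files(cache, delete_from_provider=True, providers=["openai"])
print(f"cleaned up {removed} uploads")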
@@ -1,26 +0,0 @@
"""Constants for file handling utilities."""

from datetime import timedelta
from typing import Final, Literal


DEFAULT_MAX_FILE_SIZE_BYTES: Final[Literal[524_288_000]] = 524_288_000
MAGIC_BUFFER_SIZE: Final[Literal[2048]] = 2048

UPLOAD_MAX_RETRIES: Final[Literal[3]] = 3
UPLOAD_RETRY_DELAY_BASE: Final[Literal[2]] = 2

DEFAULT_TTL_SECONDS: Final[Literal[86_400]] = 86_400
DEFAULT_MAX_CACHE_ENTRIES: Final[Literal[1000]] = 1000

GEMINI_FILE_TTL: Final[timedelta] = timedelta(hours=48)
BACKOFF_BASE_DELAY: Final[float] = 1.0
BACKOFF_MAX_DELAY: Final[float] = 30.0
BACKOFF_JITTER_FACTOR: Final[float] = 0.1

FILES_API_MAX_SIZE: Final[Literal[536_870_912]] = 536_870_912
DEFAULT_UPLOAD_CHUNK_SIZE: Final[Literal[67_108_864]] = 67_108_864

MULTIPART_THRESHOLD: Final[Literal[8_388_608]] = 8_388_608
MULTIPART_CHUNKSIZE: Final[Literal[8_388_608]] = 8_388_608
MAX_CONCURRENCY: Final[Literal[10]] = 10
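
The BACKOFF_* constants suggest the delay schedule the uploaders use between retries; a sketch of the computation they imply (illustrative only, not the actual retry loop):

import random

def _backoff_delay(attempt: int) -> float:
    # Exponential backoff: 1s, 2s, 4s, ... capped at 30s, plus up to 10% jitter.
    delay = min(BACKOFF_BASE_DELAY * (2**attempt), BACKOFF_MAX_DELAY)
    return delay + delay * BACKOFF_JITTER_FACTOR * random.random()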
@@ -1,281 +0,0 @@
"""Content-type specific file classes."""

from __future__ import annotations

from abc import ABC
from io import IOBase
from pathlib import Path
from typing import Annotated, Any, BinaryIO, Literal, Self

from pydantic import BaseModel, Field, GetCoreSchemaHandler
from pydantic_core import CoreSchema, core_schema

from crewai.files.file import (
    AsyncFileStream,
    FileBytes,
    FilePath,
    FileSource,
    FileStream,
    FileUrl,
)
from crewai.files.utils import is_file_source


FileSourceInput = str | Path | bytes | IOBase | FileSource


class _FileSourceCoercer:
    """Pydantic-compatible type that coerces various inputs to FileSource."""

    @classmethod
    def _coerce(cls, v: Any) -> FileSource:
        """Convert raw input to appropriate FileSource type."""
        if isinstance(v, (FilePath, FileBytes, FileStream, FileUrl)):
            return v
        if isinstance(v, str):
            if v.startswith(("http://", "https://")):
                return FileUrl(url=v)
            return FilePath(path=Path(v))
        if isinstance(v, Path):
            return FilePath(path=v)
        if isinstance(v, bytes):
            return FileBytes(data=v)
        if isinstance(v, (IOBase, BinaryIO)):
            return FileStream(stream=v)
        raise ValueError(f"Cannot convert {type(v).__name__} to file source")

    @classmethod
    def __get_pydantic_core_schema__(
        cls,
        _source_type: Any,
        _handler: GetCoreSchemaHandler,
    ) -> CoreSchema:
        """Generate Pydantic core schema for FileSource coercion."""
        return core_schema.no_info_plain_validator_function(
            cls._coerce,
            serialization=core_schema.plain_serializer_function_ser_schema(
                lambda v: v,
                info_arg=False,
                return_schema=core_schema.any_schema(),
            ),
        )


CoercedFileSource = Annotated[FileSourceInput, _FileSourceCoercer]

FileMode = Literal["strict", "auto", "warn", "chunk"]


ImageExtension = Literal[
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".bmp",
    ".tiff",
    ".tif",
    ".svg",
    ".heic",
    ".heif",
]
ImageMimeType = Literal[
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/webp",
    "image/bmp",
    "image/tiff",
    "image/svg+xml",
    "image/heic",
    "image/heif",
]

PDFExtension = Literal[".pdf"]
PDFContentType = Literal["application/pdf"]

TextExtension = Literal[
    ".txt",
    ".md",
    ".rst",
    ".csv",
    ".json",
    ".xml",
    ".yaml",
    ".yml",
    ".html",
    ".htm",
    ".log",
    ".ini",
    ".cfg",
    ".conf",
]
TextContentType = Literal[
    "text/plain",
    "text/markdown",
    "text/csv",
    "application/json",
    "application/xml",
    "text/xml",
    "application/x-yaml",
    "text/yaml",
    "text/html",
]

AudioExtension = Literal[
    ".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".wma", ".aiff", ".opus"
]
AudioMimeType = Literal[
    "audio/mp3",
    "audio/mpeg",
    "audio/wav",
    "audio/x-wav",
    "audio/ogg",
    "audio/flac",
    "audio/aac",
    "audio/m4a",
    "audio/mp4",
    "audio/x-ms-wma",
    "audio/aiff",
    "audio/opus",
]

VideoExtension = Literal[
    ".mp4", ".avi", ".mkv", ".mov", ".webm", ".flv", ".wmv", ".m4v", ".mpeg", ".mpg"
]
VideoMimeType = Literal[
    "video/mp4",
    "video/mpeg",
    "video/webm",
    "video/quicktime",
    "video/x-msvideo",
    "video/x-matroska",
    "video/x-flv",
    "video/x-ms-wmv",
]


class BaseFile(ABC, BaseModel):
    """Abstract base class for typed file wrappers.

    Provides common functionality for all file types including:
    - File source management
    - Content reading
    - Dict unpacking support (`**` syntax)
    - Per-file handling mode

    Can be unpacked with ** syntax: `{**ImageFile(source="./chart.png")}`
    which unpacks to: `{"chart": <ImageFile instance>}` using filename stem as key.

    Attributes:
        source: The underlying file source (path, bytes, or stream).
        mode: How to handle this file if it exceeds provider limits.
    """

    source: CoercedFileSource = Field(description="The underlying file source.")
    mode: FileMode = Field(
        default="auto",
        description="How to handle if file exceeds limits: strict, auto, warn, chunk.",
    )

    @property
    def _file_source(self) -> FileSource:
        """Get source with narrowed type (always FileSource after validation)."""
        if is_file_source(self.source):
            return self.source
        raise TypeError("source must be a FileSource after validation")

    @property
    def filename(self) -> str | None:
        """Get the filename from the source."""
        return self._file_source.filename

    @property
    def content_type(self) -> str:
        """Get the content type from the source."""
        return self._file_source.content_type

    def read(self) -> bytes:
        """Read the file content as bytes."""
        return self._file_source.read()  # type: ignore[union-attr]

    async def aread(self) -> bytes:
        """Async read the file content as bytes.

        Raises:
            TypeError: If the underlying source doesn't support async read.
        """
        source = self._file_source
        if isinstance(source, (FilePath, FileBytes, AsyncFileStream, FileUrl)):
            return await source.aread()
        raise TypeError(f"{type(source).__name__} does not support async read")

    def read_text(self, encoding: str = "utf-8") -> str:
        """Read the file content as string."""
        return self.read().decode(encoding)

    @property
    def _unpack_key(self) -> str:
        """Get the key to use when unpacking (filename stem)."""
        filename = self._file_source.filename
        if filename:
            return Path(filename).stem
        return "file"

    def keys(self) -> list[str]:
        """Return keys for dict unpacking."""
        return [self._unpack_key]

    def __getitem__(self, key: str) -> Self:
        """Return self for dict unpacking."""
        if key == self._unpack_key:
            return self
        raise KeyError(key)


class ImageFile(BaseFile):
    """File representing an image.

    Supports common image formats: PNG, JPEG, GIF, WebP, BMP, TIFF, SVG.
    """


class PDFFile(BaseFile):
    """File representing a PDF document."""


class TextFile(BaseFile):
    """File representing a text document.

    Supports common text formats: TXT, MD, RST, CSV, JSON, XML, YAML, HTML.
    """


class AudioFile(BaseFile):
    """File representing an audio file.

    Supports common audio formats: MP3, WAV, OGG, FLAC, AAC, M4A, WMA.
    """


class VideoFile(BaseFile):
    """File representing a video file.

    Supports common video formats: MP4, AVI, MKV, MOV, WebM, FLV, WMV.
    """


class File(BaseFile):
    """Generic file that auto-detects the appropriate type.

    Use this when you don't want to specify the exact file type.
    The content type is automatically detected from the file contents.

    Example:
        >>> pdf_file = File(source="./document.pdf")
        >>> image_file = File(source="./image.png")
        >>> bytes_file = File(source=b"file content")
    """


FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
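
The keys()/__getitem__ pair on BaseFile is what makes the documented ** unpacking work; a quick demonstration:

from crewai_files import ImageFile, PDFFile

inputs = {**ImageFile(source="./chart.png"), **PDFFile(source="./report.pdf")}
# -> {"chart": <ImageFile>, "report": <PDFFile>}, keyed by filename stem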
@@ -1,507 +0,0 @@
"""Base file class for handling file inputs in tasks."""

from __future__ import annotations

from collections.abc import AsyncIterator, Iterator
import mimetypes
from pathlib import Path
from typing import Annotated, Any, BinaryIO, Protocol, cast, runtime_checkable

import aiofiles
from pydantic import (
    BaseModel,
    BeforeValidator,
    Field,
    GetCoreSchemaHandler,
    PrivateAttr,
    model_validator,
)
from pydantic_core import CoreSchema, core_schema

from crewai.files.constants import DEFAULT_MAX_FILE_SIZE_BYTES, MAGIC_BUFFER_SIZE


@runtime_checkable
class AsyncReadable(Protocol):
    """Protocol for async readable streams."""

    async def read(self, size: int = -1) -> bytes:
        """Read up to size bytes from the stream."""
        ...


class _AsyncReadableValidator:
    """Pydantic validator for AsyncReadable types."""

    @classmethod
    def __get_pydantic_core_schema__(
        cls, _source_type: Any, _handler: GetCoreSchemaHandler
    ) -> CoreSchema:
        return core_schema.no_info_plain_validator_function(
            cls._validate,
            serialization=core_schema.plain_serializer_function_ser_schema(
                lambda x: None, info_arg=False
            ),
        )

    @staticmethod
    def _validate(value: Any) -> AsyncReadable:
        if isinstance(value, AsyncReadable):
            return value
        raise ValueError("Expected an async readable object with async read() method")


ValidatedAsyncReadable = Annotated[AsyncReadable, _AsyncReadableValidator()]


def _fallback_content_type(filename: str | None) -> str:
    """Get content type from filename extension or return default."""
    if filename:
        mime_type, _ = mimetypes.guess_type(filename)
        if mime_type:
            return mime_type
    return "application/octet-stream"


def detect_content_type(data: bytes, filename: str | None = None) -> str:
    """Detect MIME type from file content.

    Uses python-magic if available for accurate content-based detection,
    falls back to mimetypes module using filename extension.

    Args:
        data: Raw bytes to analyze (only first 2048 bytes are used).
        filename: Optional filename for extension-based fallback.

    Returns:
        The detected MIME type.
    """
    try:
        import magic

        result: str = magic.from_buffer(data[:MAGIC_BUFFER_SIZE], mime=True)
        return result
    except ImportError:
        return _fallback_content_type(filename)


def detect_content_type_from_path(path: Path, filename: str | None = None) -> str:
    """Detect MIME type from file path.

    Uses python-magic's from_file() for accurate detection without reading
    the entire file into memory.

    Args:
        path: Path to the file.
        filename: Optional filename for extension-based fallback.

    Returns:
        The detected MIME type.
    """
    try:
        import magic

        result: str = magic.from_file(str(path), mime=True)
        return result
    except ImportError:
        return _fallback_content_type(filename or path.name)


class _BinaryIOValidator:
    """Pydantic validator for BinaryIO types."""

    @classmethod
    def __get_pydantic_core_schema__(
        cls, _source_type: Any, _handler: GetCoreSchemaHandler
    ) -> CoreSchema:
        return core_schema.no_info_plain_validator_function(
            cls._validate,
            serialization=core_schema.plain_serializer_function_ser_schema(
                lambda x: None, info_arg=False
            ),
        )

    @staticmethod
    def _validate(value: Any) -> BinaryIO:
        if hasattr(value, "read") and hasattr(value, "seek"):
            return cast(BinaryIO, value)
        raise ValueError("Expected a binary file-like object with read() and seek()")


ValidatedBinaryIO = Annotated[BinaryIO, _BinaryIOValidator()]


class FilePath(BaseModel):
    """File loaded from a filesystem path."""

    path: Path = Field(description="Path to the file on the filesystem.")
    max_size_bytes: int = Field(
        default=DEFAULT_MAX_FILE_SIZE_BYTES,
        exclude=True,
        description="Maximum file size in bytes.",
    )
    _content: bytes | None = PrivateAttr(default=None)
    _content_type: str = PrivateAttr()

    @model_validator(mode="after")
    def _validate_file_exists(self) -> FilePath:
        """Validate that the file exists, is secure, and within size limits."""
        from crewai.files.processing.exceptions import FileTooLargeError

        path_str = str(self.path)
        if ".." in path_str:
            raise ValueError(f"Path traversal not allowed: {self.path}")

        if self.path.is_symlink():
            resolved = self.path.resolve()
            cwd = Path.cwd().resolve()
            if not str(resolved).startswith(str(cwd)):
                raise ValueError(f"Symlink escapes allowed directory: {self.path}")

        if not self.path.exists():
            raise ValueError(f"File not found: {self.path}")
        if not self.path.is_file():
            raise ValueError(f"Path is not a file: {self.path}")

        actual_size = self.path.stat().st_size
        if actual_size > self.max_size_bytes:
            raise FileTooLargeError(
                f"File exceeds max size ({actual_size} > {self.max_size_bytes})",
                file_name=str(self.path),
                actual_size=actual_size,
                max_size=self.max_size_bytes,
            )

        self._content_type = detect_content_type_from_path(self.path, self.path.name)
        return self

    @property
    def filename(self) -> str:
        """Get the filename from the path."""
        return self.path.name

    @property
    def content_type(self) -> str:
        """Get the content type."""
        return self._content_type

    def read(self) -> bytes:
        """Read the file content from disk."""
        if self._content is None:
            self._content = self.path.read_bytes()
        return self._content

    async def aread(self) -> bytes:
        """Async read the file content from disk."""
        if self._content is None:
            async with aiofiles.open(self.path, "rb") as f:
                self._content = await f.read()
        return self._content

    def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
        """Stream file content in chunks without loading entirely into memory.

        Args:
            chunk_size: Size of each chunk in bytes.

        Yields:
            Chunks of file content.
        """
        with open(self.path, "rb") as f:
            while chunk := f.read(chunk_size):
                yield chunk

    async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
        """Async streaming for non-blocking I/O.

        Args:
            chunk_size: Size of each chunk in bytes.

        Yields:
            Chunks of file content.
        """
        async with aiofiles.open(self.path, "rb") as f:
            while chunk := await f.read(chunk_size):
                yield chunk


class FileBytes(BaseModel):
    """File created from raw bytes content."""

    data: bytes = Field(description="Raw bytes content of the file.")
    filename: str | None = Field(default=None, description="Optional filename.")
    _content_type: str = PrivateAttr()

    @model_validator(mode="after")
    def _detect_content_type(self) -> FileBytes:
        """Detect and cache content type from data."""
        self._content_type = detect_content_type(self.data, self.filename)
        return self

    @property
    def content_type(self) -> str:
        """Get the content type."""
        return self._content_type

    def read(self) -> bytes:
        """Return the bytes content."""
        return self.data

    async def aread(self) -> bytes:
        """Async return the bytes content (immediate, already in memory)."""
        return self.data

    def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
        """Stream bytes content in chunks.

        Args:
            chunk_size: Size of each chunk in bytes.

        Yields:
            Chunks of bytes content.
        """
        for i in range(0, len(self.data), chunk_size):
            yield self.data[i : i + chunk_size]

    async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
        """Async streaming (immediate yield since already in memory).

        Args:
            chunk_size: Size of each chunk in bytes.

        Yields:
            Chunks of bytes content.
        """
        for chunk in self.read_chunks(chunk_size):
            yield chunk


class FileStream(BaseModel):
    """File loaded from a file-like stream."""

    stream: ValidatedBinaryIO = Field(description="Binary file stream.")
    filename: str | None = Field(default=None, description="Optional filename.")
    _content: bytes | None = PrivateAttr(default=None)
    _content_type: str = PrivateAttr()

    @model_validator(mode="after")
    def _initialize(self) -> FileStream:
        """Extract filename and detect content type."""
        if self.filename is None:
            name = getattr(self.stream, "name", None)
            if name is not None:
                self.filename = Path(name).name

        position = self.stream.tell()
        self.stream.seek(0)
        header = self.stream.read(MAGIC_BUFFER_SIZE)
        self.stream.seek(position)
        self._content_type = detect_content_type(header, self.filename)
        return self

    @property
    def content_type(self) -> str:
        """Get the content type."""
        return self._content_type

    def read(self) -> bytes:
        """Read the stream content. Content is cached after first read."""
        if self._content is None:
            position = self.stream.tell()
            self.stream.seek(0)
            self._content = self.stream.read()
            self.stream.seek(position)
        return self._content

    def close(self) -> None:
        """Close the underlying stream."""
        self.stream.close()

    def __enter__(self) -> FileStream:
        """Enter context manager."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: Any,
    ) -> None:
        """Exit context manager and close stream."""
        self.close()

    def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
        """Stream from underlying stream in chunks.

        Args:
            chunk_size: Size of each chunk in bytes.

        Yields:
            Chunks of stream content.
        """
        position = self.stream.tell()
        self.stream.seek(0)
        try:
            while chunk := self.stream.read(chunk_size):
                yield chunk
        finally:
            self.stream.seek(position)


class AsyncFileStream(BaseModel):
    """File loaded from an async stream.

    Use for async file handles like aiofiles objects or aiohttp response bodies.
    This is an async-only type - use aread() instead of read().

    Attributes:
        stream: Async file-like object with async read() method.
        filename: Optional filename for the stream.
    """

    stream: ValidatedAsyncReadable = Field(
        description="Async file stream with async read() method."
    )
    filename: str | None = Field(default=None, description="Optional filename.")
    _content: bytes | None = PrivateAttr(default=None)
    _content_type: str | None = PrivateAttr(default=None)

    @property
    def content_type(self) -> str:
        """Get the content type from stream content (cached). Requires aread() first."""
        if self._content is None:
            raise RuntimeError("Call aread() first to load content")
        if self._content_type is None:
            self._content_type = detect_content_type(self._content, self.filename)
        return self._content_type

    async def aread(self) -> bytes:
        """Async read the stream content. Content is cached after first read."""
        if self._content is None:
            self._content = await self.stream.read()
        return self._content

    async def aclose(self) -> None:
        """Async close the underlying stream."""
        if hasattr(self.stream, "close"):
            result = self.stream.close()
            if hasattr(result, "__await__"):
                await result

    async def __aenter__(self) -> AsyncFileStream:
        """Async enter context manager."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: Any,
    ) -> None:
        """Async exit context manager and close stream."""
        await self.aclose()

    async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
        """Async stream content in chunks.

        Args:
            chunk_size: Size of each chunk in bytes.

        Yields:
            Chunks of stream content.
        """
        while chunk := await self.stream.read(chunk_size):
            yield chunk


class FileUrl(BaseModel):
    """File referenced by URL.

    For providers that support URL references, the URL is passed directly.
    For providers that don't, content is fetched on demand.

    Attributes:
        url: URL where the file can be accessed.
        filename: Optional filename (extracted from URL if not provided).
    """

    url: str = Field(description="URL where the file can be accessed.")
    filename: str | None = Field(default=None, description="Optional filename.")
    _content_type: str | None = PrivateAttr(default=None)
    _content: bytes | None = PrivateAttr(default=None)

    @model_validator(mode="after")
    def _validate_url(self) -> FileUrl:
        """Validate URL format."""
        if not self.url.startswith(("http://", "https://")):
            raise ValueError(f"Invalid URL scheme: {self.url}")
        return self

    @property
    def content_type(self) -> str:
        """Get the content type, guessing from URL extension if not set."""
        if self._content_type is None:
            self._content_type = self._guess_content_type()
        return self._content_type

    def _guess_content_type(self) -> str:
        """Guess content type from URL extension."""
        from urllib.parse import urlparse

        parsed = urlparse(self.url)
        path = parsed.path
        guessed, _ = mimetypes.guess_type(path)
        return guessed or "application/octet-stream"

    def read(self) -> bytes:
        """Fetch content from URL (for providers that don't support URL references)."""
        if self._content is None:
            import httpx

            response = httpx.get(self.url, follow_redirects=True)
            response.raise_for_status()
            self._content = response.content
            if "content-type" in response.headers:
                self._content_type = response.headers["content-type"].split(";")[0]
        return self._content

    async def aread(self) -> bytes:
        """Async fetch content from URL."""
        if self._content is None:
            import httpx

            async with httpx.AsyncClient() as client:
                response = await client.get(self.url, follow_redirects=True)
                response.raise_for_status()
                self._content = response.content
                if "content-type" in response.headers:
                    self._content_type = response.headers["content-type"].split(";")[0]
        return self._content


FileSource = FilePath | FileBytes | FileStream | AsyncFileStream | FileUrl


def _normalize_source(value: Any) -> FileSource:
    """Convert raw input to appropriate source type."""
    if isinstance(value, (FilePath, FileBytes, FileStream, AsyncFileStream, FileUrl)):
        return value
    if isinstance(value, str):
        if value.startswith(("http://", "https://")):
            return FileUrl(url=value)
        return FilePath(path=Path(value))
    if isinstance(value, Path):
        return FilePath(path=value)
    if isinstance(value, bytes):
        return FileBytes(data=value)
    if isinstance(value, AsyncReadable):
        return AsyncFileStream(stream=value)
    if hasattr(value, "read") and hasattr(value, "seek"):
        return FileStream(stream=value)
    raise ValueError(f"Cannot convert {type(value).__name__} to file source")


RawFileInput = str | Path | bytes
FileSourceInput = Annotated[
    RawFileInput | FileSource, BeforeValidator(_normalize_source)
]
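
_normalize_source is the single coercion point behind FileSourceInput; an illustration of the mapping it implements (the URL and filenames here are placeholders, and paths must exist since FilePath validates them on construction):

src = _normalize_source("https://example.com/report.pdf")  # -> FileUrl
src = _normalize_source(b"raw bytes")                       # -> FileBytes
src = _normalize_source(Path("./report.pdf"))               # -> FilePath (file must exist)
src = _normalize_source(open("./report.pdf", "rb"))         # -> FileStream (has read()+seek())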
@@ -1,184 +0,0 @@
"""Performance metrics and structured logging for file operations."""

from __future__ import annotations

from collections.abc import Generator
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
import logging
import time
from typing import Any


logger = logging.getLogger(__name__)


@dataclass
class FileOperationMetrics:
    """Metrics for a file operation.

    Attributes:
        operation: Name of the operation (e.g., "upload", "resolve", "process").
        filename: Name of the file being operated on.
        provider: Provider name if applicable.
        duration_ms: Duration of the operation in milliseconds.
        size_bytes: Size of the file in bytes.
        success: Whether the operation succeeded.
        error: Error message if operation failed.
        timestamp: When the operation occurred.
        metadata: Additional operation-specific metadata.
    """

    operation: str
    filename: str | None = None
    provider: str | None = None
    duration_ms: float = 0.0
    size_bytes: int | None = None
    success: bool = True
    error: str | None = None
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert metrics to dictionary for logging.

        Returns:
            Dictionary representation of metrics.
        """
        result: dict[str, Any] = {
            "operation": self.operation,
            "duration_ms": round(self.duration_ms, 2),
            "success": self.success,
            "timestamp": self.timestamp.isoformat(),
        }

        if self.filename:
            result["filename"] = self.filename
        if self.provider:
            result["provider"] = self.provider
        if self.size_bytes is not None:
            result["size_bytes"] = self.size_bytes
        if self.error:
            result["error"] = self.error
        if self.metadata:
            result.update(self.metadata)

        return result


@contextmanager
def measure_operation(
    operation: str,
    *,
    filename: str | None = None,
    provider: str | None = None,
    size_bytes: int | None = None,
    log_level: int = logging.DEBUG,
    **extra_metadata: Any,
) -> Generator[FileOperationMetrics, None, None]:
    """Context manager to measure and log operation performance.

    Args:
        operation: Name of the operation.
        filename: Optional filename being operated on.
        provider: Optional provider name.
        size_bytes: Optional file size in bytes.
        log_level: Log level for the result message.
        **extra_metadata: Additional metadata to include.

    Yields:
        FileOperationMetrics object that will be populated with results.

    Example:
        with measure_operation("upload", filename="test.pdf", provider="openai") as metrics:
            result = upload_file(file)
            metrics.metadata["file_id"] = result.file_id
    """
    metrics = FileOperationMetrics(
        operation=operation,
        filename=filename,
        provider=provider,
        size_bytes=size_bytes,
        metadata=dict(extra_metadata),
    )

    start_time = time.perf_counter()

    try:
        yield metrics
        metrics.success = True
    except Exception as e:
        metrics.success = False
        metrics.error = str(e)
        raise
    finally:
        metrics.duration_ms = (time.perf_counter() - start_time) * 1000

        log_message = f"{operation}"
        if filename:
            log_message += f" [{filename}]"
        if provider:
            log_message += f" ({provider})"

        if metrics.success:
            log_message += f" completed in {metrics.duration_ms:.2f}ms"
        else:
            log_message += f" failed after {metrics.duration_ms:.2f}ms: {metrics.error}"

        logger.log(log_level, log_message, extra=metrics.to_dict())


def log_file_operation(
    operation: str,
    *,
    filename: str | None = None,
    provider: str | None = None,
    size_bytes: int | None = None,
    duration_ms: float | None = None,
    success: bool = True,
    error: str | None = None,
    level: int = logging.INFO,
    **extra: Any,
) -> None:
    """Log a file operation with structured data.

    Args:
        operation: Name of the operation.
        filename: Optional filename being operated on.
        provider: Optional provider name.
        size_bytes: Optional file size in bytes.
        duration_ms: Optional duration in milliseconds.
        success: Whether the operation succeeded.
        error: Optional error message.
        level: Log level to use.
        **extra: Additional metadata to include.
    """
    metrics = FileOperationMetrics(
        operation=operation,
        filename=filename,
        provider=provider,
        size_bytes=size_bytes,
        duration_ms=duration_ms or 0.0,
        success=success,
        error=error,
        metadata=dict(extra),
    )

    message = f"{operation}"
    if filename:
        message += f" [{filename}]"
    if provider:
        message += f" ({provider})"

    if success:
        if duration_ms:
            message += f" completed in {duration_ms:.2f}ms"
        else:
            message += " completed"
    else:
        message += " failed"
        if error:
            message += f": {error}"

    logger.log(level, message, extra=metrics.to_dict())
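
log_file_operation is the one-shot counterpart to measure_operation, for operations timed elsewhere; a usage sketch with illustrative values:

log_file_operation(
    "upload",
    filename="test.pdf",
    provider="openai",
    size_bytes=1_048_576,
    duration_ms=212.5,
)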
@@ -1,62 +0,0 @@
"""File processing module for multimodal content handling.

This module provides validation, transformation, and processing utilities
for files used in multimodal LLM interactions.
"""

from crewai.files.processing.constraints import (
    ANTHROPIC_CONSTRAINTS,
    BEDROCK_CONSTRAINTS,
    GEMINI_CONSTRAINTS,
    OPENAI_CONSTRAINTS,
    AudioConstraints,
    ImageConstraints,
    PDFConstraints,
    ProviderConstraints,
    VideoConstraints,
    get_constraints_for_provider,
)
from crewai.files.processing.enums import FileHandling
from crewai.files.processing.exceptions import (
    FileProcessingError,
    FileTooLargeError,
    FileValidationError,
    ProcessingDependencyError,
    UnsupportedFileTypeError,
)
from crewai.files.processing.processor import FileProcessor
from crewai.files.processing.validators import (
    validate_audio,
    validate_file,
    validate_image,
    validate_pdf,
    validate_text,
    validate_video,
)


__all__ = [
    "ANTHROPIC_CONSTRAINTS",
    "BEDROCK_CONSTRAINTS",
    "GEMINI_CONSTRAINTS",
    "OPENAI_CONSTRAINTS",
    "AudioConstraints",
    "FileHandling",
    "FileProcessingError",
    "FileProcessor",
    "FileTooLargeError",
    "FileValidationError",
    "ImageConstraints",
    "PDFConstraints",
    "ProcessingDependencyError",
    "ProviderConstraints",
    "UnsupportedFileTypeError",
    "VideoConstraints",
    "get_constraints_for_provider",
    "validate_audio",
    "validate_file",
    "validate_image",
    "validate_pdf",
    "validate_text",
    "validate_video",
]
@@ -1,285 +0,0 @@
"""Provider-specific file constraints for multimodal content."""

from dataclasses import dataclass
from functools import lru_cache
from typing import Literal

from crewai.files.content_types import (
    AudioMimeType,
    ImageMimeType,
    VideoMimeType,
)


ProviderName = Literal[
    "anthropic",
    "openai",
    "gemini",
    "bedrock",
    "azure",
]

DEFAULT_IMAGE_FORMATS: tuple[ImageMimeType, ...] = (
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/webp",
)

GEMINI_IMAGE_FORMATS: tuple[ImageMimeType, ...] = (
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/webp",
    "image/heic",
    "image/heif",
)

DEFAULT_AUDIO_FORMATS: tuple[AudioMimeType, ...] = (
    "audio/mp3",
    "audio/mpeg",
    "audio/wav",
    "audio/ogg",
    "audio/flac",
    "audio/aac",
    "audio/m4a",
)

GEMINI_AUDIO_FORMATS: tuple[AudioMimeType, ...] = (
    "audio/mp3",
    "audio/mpeg",
    "audio/wav",
    "audio/ogg",
    "audio/flac",
    "audio/aac",
    "audio/m4a",
    "audio/opus",
)

DEFAULT_VIDEO_FORMATS: tuple[VideoMimeType, ...] = (
    "video/mp4",
    "video/mpeg",
    "video/webm",
    "video/quicktime",
)

GEMINI_VIDEO_FORMATS: tuple[VideoMimeType, ...] = (
    "video/mp4",
    "video/mpeg",
    "video/webm",
    "video/quicktime",
    "video/x-msvideo",
    "video/x-flv",
)


@dataclass(frozen=True)
class ImageConstraints:
    """Constraints for image files.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        max_width: Maximum image width in pixels.
        max_height: Maximum image height in pixels.
        max_images_per_request: Maximum number of images per request.
        supported_formats: Supported image MIME types.
    """

    max_size_bytes: int
    max_width: int | None = None
    max_height: int | None = None
    max_images_per_request: int | None = None
    supported_formats: tuple[ImageMimeType, ...] = DEFAULT_IMAGE_FORMATS


@dataclass(frozen=True)
class PDFConstraints:
    """Constraints for PDF files.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        max_pages: Maximum number of pages.
    """

    max_size_bytes: int
    max_pages: int | None = None


@dataclass(frozen=True)
class AudioConstraints:
    """Constraints for audio files.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        max_duration_seconds: Maximum audio duration in seconds.
        supported_formats: Supported audio MIME types.
    """

    max_size_bytes: int
    max_duration_seconds: int | None = None
    supported_formats: tuple[AudioMimeType, ...] = DEFAULT_AUDIO_FORMATS


@dataclass(frozen=True)
class VideoConstraints:
    """Constraints for video files.

    Attributes:
        max_size_bytes: Maximum file size in bytes.
        max_duration_seconds: Maximum video duration in seconds.
        supported_formats: Supported video MIME types.
    """

    max_size_bytes: int
    max_duration_seconds: int | None = None
    supported_formats: tuple[VideoMimeType, ...] = DEFAULT_VIDEO_FORMATS


@dataclass(frozen=True)
class ProviderConstraints:
    """Complete set of constraints for a provider.

    Attributes:
        name: Provider name identifier.
        image: Image file constraints.
        pdf: PDF file constraints.
        audio: Audio file constraints.
        video: Video file constraints.
        general_max_size_bytes: Maximum size for any file type.
        supports_file_upload: Whether the provider supports file upload APIs.
        file_upload_threshold_bytes: Size threshold above which to use file upload.
        supports_url_references: Whether the provider supports URL-based file references.
    """

    name: ProviderName
    image: ImageConstraints | None = None
    pdf: PDFConstraints | None = None
    audio: AudioConstraints | None = None
    video: VideoConstraints | None = None
    general_max_size_bytes: int | None = None
    supports_file_upload: bool = False
    file_upload_threshold_bytes: int | None = None
    supports_url_references: bool = False


ANTHROPIC_CONSTRAINTS = ProviderConstraints(
    name="anthropic",
    image=ImageConstraints(
        max_size_bytes=5_242_880,  # 5 MB per image
        max_width=8000,
        max_height=8000,
        max_images_per_request=100,
    ),
    pdf=PDFConstraints(
        max_size_bytes=33_554_432,  # 32 MB request size limit
        max_pages=100,
    ),
    supports_file_upload=True,
    file_upload_threshold_bytes=5_242_880,
    supports_url_references=True,
)

OPENAI_CONSTRAINTS = ProviderConstraints(
    name="openai",
    image=ImageConstraints(
        max_size_bytes=20_971_520,
        max_images_per_request=10,
    ),
    audio=AudioConstraints(
        max_size_bytes=26_214_400,  # 25 MB - whisper limit
max_duration_seconds=1500, # 25 minutes, arbitrary-ish, this is from the transcriptions limit
|
||||
),
|
||||
supports_file_upload=True,
|
||||
file_upload_threshold_bytes=5_242_880,
|
||||
supports_url_references=True,
|
||||
)
|
||||
|
||||
GEMINI_CONSTRAINTS = ProviderConstraints(
|
||||
name="gemini",
|
||||
image=ImageConstraints(
|
||||
max_size_bytes=104_857_600,
|
||||
supported_formats=GEMINI_IMAGE_FORMATS,
|
||||
),
|
||||
pdf=PDFConstraints(
|
||||
max_size_bytes=52_428_800,
|
||||
),
|
||||
audio=AudioConstraints(
|
||||
max_size_bytes=104_857_600,
|
||||
max_duration_seconds=34200, # 9.5 hours
|
||||
supported_formats=GEMINI_AUDIO_FORMATS,
|
||||
),
|
||||
video=VideoConstraints(
|
||||
max_size_bytes=2_147_483_648,
|
||||
max_duration_seconds=3600, # 1 hour at default resolution
|
||||
supported_formats=GEMINI_VIDEO_FORMATS,
|
||||
),
|
||||
supports_file_upload=True,
|
||||
file_upload_threshold_bytes=20_971_520,
|
||||
supports_url_references=True,
|
||||
)
|
||||
|
||||
BEDROCK_CONSTRAINTS = ProviderConstraints(
|
||||
name="bedrock",
|
||||
image=ImageConstraints(
|
||||
max_size_bytes=4_608_000,
|
||||
max_width=8000,
|
||||
max_height=8000,
|
||||
),
|
||||
pdf=PDFConstraints(
|
||||
max_size_bytes=3_840_000,
|
||||
max_pages=100,
|
||||
),
|
||||
)
|
||||
|
||||
AZURE_CONSTRAINTS = ProviderConstraints(
|
||||
name="azure",
|
||||
image=ImageConstraints(
|
||||
max_size_bytes=20_971_520,
|
||||
max_images_per_request=10,
|
||||
),
|
||||
audio=AudioConstraints(
|
||||
max_size_bytes=26_214_400, # 25 MB - same as openai
|
||||
max_duration_seconds=1500, # 25 minutes - same as openai
|
||||
),
|
||||
supports_url_references=True,
|
||||
)
|
||||
|
||||
|
||||
_PROVIDER_CONSTRAINTS_MAP: dict[str, ProviderConstraints] = {
|
||||
"anthropic": ANTHROPIC_CONSTRAINTS,
|
||||
"openai": OPENAI_CONSTRAINTS,
|
||||
"gemini": GEMINI_CONSTRAINTS,
|
||||
"bedrock": BEDROCK_CONSTRAINTS,
|
||||
"azure": AZURE_CONSTRAINTS,
|
||||
"claude": ANTHROPIC_CONSTRAINTS,
|
||||
"gpt": OPENAI_CONSTRAINTS,
|
||||
"google": GEMINI_CONSTRAINTS,
|
||||
"aws": BEDROCK_CONSTRAINTS,
|
||||
}
|
||||
|
||||
|
||||
@lru_cache(maxsize=32)
|
||||
def get_constraints_for_provider(
|
||||
provider: str | ProviderConstraints,
|
||||
) -> ProviderConstraints | None:
|
||||
"""Get constraints for a provider by name or return if already ProviderConstraints.
|
||||
|
||||
Args:
|
||||
provider: Provider name string or ProviderConstraints instance.
|
||||
|
||||
Returns:
|
||||
ProviderConstraints for the provider, or None if not found.
|
||||
"""
|
||||
if isinstance(provider, ProviderConstraints):
|
||||
return provider
|
||||
|
||||
provider_lower = provider.lower()
|
||||
|
||||
if provider_lower in _PROVIDER_CONSTRAINTS_MAP:
|
||||
return _PROVIDER_CONSTRAINTS_MAP[provider_lower]
|
||||
|
||||
for key, constraints in _PROVIDER_CONSTRAINTS_MAP.items():
|
||||
if key in provider_lower:
|
||||
return constraints
|
||||
|
||||
return None
|
||||
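Lookup is by exact key first, then by substring, so full model identifiers usually resolve; a minimal usage sketch (the model string is illustrative, and the import uses the pre-move path shown above):

    from crewai.files.processing.constraints import get_constraints_for_provider

    constraints = get_constraints_for_provider("gpt-4o")  # falls through to the "gpt" substring match
    if constraints is not None and constraints.image is not None:
        print(constraints.name, constraints.image.max_size_bytes)  # openai 20971520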
@@ -1,19 +0,0 @@
"""Enums for file processing configuration."""

from enum import Enum


class FileHandling(Enum):
    """Defines how files exceeding provider limits should be handled.

    Attributes:
        STRICT: Fail with an error if the file exceeds limits.
        AUTO: Automatically resize, compress, or optimize to fit limits.
        WARN: Log a warning but attempt to process anyway.
        CHUNK: Split large files into smaller pieces.
    """

    STRICT = "strict"
    AUTO = "auto"
    WARN = "warn"
    CHUNK = "chunk"
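Because the enum values are plain strings, per-file modes can be given either way; a small sketch of how FileProcessor (further down) interprets them:

    from crewai.files.processing.enums import FileHandling

    assert FileHandling("strict") is FileHandling.STRICT  # string values round-trip
    # A file carrying mode="strict" fails fast; mode=FileHandling.CHUNK splits it instead.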
@@ -1,145 +0,0 @@
"""Exceptions for file processing operations."""


class FileProcessingError(Exception):
    """Base exception for file processing errors."""

    def __init__(self, message: str, file_name: str | None = None) -> None:
        """Initialize the exception.

        Args:
            message: Error message describing the issue.
            file_name: Optional name of the file that caused the error.
        """
        self.file_name = file_name
        super().__init__(message)


class FileValidationError(FileProcessingError):
    """Raised when file validation fails."""


class FileTooLargeError(FileValidationError):
    """Raised when a file exceeds the maximum allowed size."""

    def __init__(
        self,
        message: str,
        file_name: str | None = None,
        actual_size: int | None = None,
        max_size: int | None = None,
    ) -> None:
        """Initialize the exception.

        Args:
            message: Error message describing the issue.
            file_name: Optional name of the file that caused the error.
            actual_size: The actual size of the file in bytes.
            max_size: The maximum allowed size in bytes.
        """
        self.actual_size = actual_size
        self.max_size = max_size
        super().__init__(message, file_name)


class UnsupportedFileTypeError(FileValidationError):
    """Raised when a file type is not supported by the provider."""

    def __init__(
        self,
        message: str,
        file_name: str | None = None,
        content_type: str | None = None,
    ) -> None:
        """Initialize the exception.

        Args:
            message: Error message describing the issue.
            file_name: Optional name of the file that caused the error.
            content_type: The content type that is not supported.
        """
        self.content_type = content_type
        super().__init__(message, file_name)


class ProcessingDependencyError(FileProcessingError):
    """Raised when a required processing dependency is not installed."""

    def __init__(
        self,
        message: str,
        dependency: str,
        install_command: str | None = None,
    ) -> None:
        """Initialize the exception.

        Args:
            message: Error message describing the issue.
            dependency: Name of the missing dependency.
            install_command: Optional command to install the dependency.
        """
        self.dependency = dependency
        self.install_command = install_command
        super().__init__(message)


class TransientFileError(FileProcessingError):
    """Transient error that may succeed on retry (network, timeout)."""


class PermanentFileError(FileProcessingError):
    """Permanent error that will not succeed on retry (auth, format)."""


class UploadError(FileProcessingError):
    """Base exception for upload errors."""


class TransientUploadError(UploadError, TransientFileError):
    """Upload failed but may succeed on retry (network issues, rate limits)."""


class PermanentUploadError(UploadError, PermanentFileError):
    """Upload failed permanently (auth failure, invalid file, unsupported type)."""


def classify_upload_error(e: Exception, filename: str | None = None) -> Exception:
    """Classify an exception as a transient or permanent upload error.

    Analyzes the exception type name and status code to determine whether
    the error is likely transient (retryable) or permanent.

    Args:
        e: The exception to classify.
        filename: Optional filename for error context.

    Returns:
        A TransientUploadError or PermanentUploadError wrapping the original.
    """
    error_type = type(e).__name__

    if "RateLimit" in error_type or "APIConnection" in error_type:
        return TransientUploadError(f"Transient upload error: {e}", file_name=filename)
    if "Authentication" in error_type or "Permission" in error_type:
        return PermanentUploadError(
            f"Authentication/permission error: {e}", file_name=filename
        )
    if "BadRequest" in error_type or "InvalidRequest" in error_type:
        return PermanentUploadError(f"Invalid request: {e}", file_name=filename)

    status_code = getattr(e, "status_code", None)
    if status_code is not None:
        if status_code >= 500 or status_code == 429:
            return TransientUploadError(
                f"Server error ({status_code}): {e}", file_name=filename
            )
        if status_code in (401, 403):
            return PermanentUploadError(
                f"Auth error ({status_code}): {e}", file_name=filename
            )
        if status_code == 400:
            return PermanentUploadError(
                f"Bad request ({status_code}): {e}", file_name=filename
            )

    return TransientUploadError(f"Upload failed: {e}", file_name=filename)
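A retry wrapper built on this classification could look like the sketch below; the upload callable and retry policy are illustrative, not part of the module:

    import time

    def upload_with_retries(do_upload, filename, max_retries=3):
        for attempt in range(max_retries):
            try:
                return do_upload()
            except Exception as e:
                classified = classify_upload_error(e, filename)
                # Permanent errors (auth, bad request) are not worth retrying
                if isinstance(classified, PermanentUploadError) or attempt == max_retries - 1:
                    raise classified from e
                time.sleep(2 ** attempt)  # simple exponential backoff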
@@ -1,346 +0,0 @@
"""FileProcessor for validating and transforming files based on provider constraints."""

import asyncio
from collections.abc import Sequence
import logging

from crewai.files.content_types import (
    AudioFile,
    File,
    FileInput,
    ImageFile,
    PDFFile,
    TextFile,
    VideoFile,
)
from crewai.files.processing.constraints import (
    ProviderConstraints,
    get_constraints_for_provider,
)
from crewai.files.processing.enums import FileHandling
from crewai.files.processing.exceptions import (
    FileProcessingError,
    FileTooLargeError,
    FileValidationError,
    UnsupportedFileTypeError,
)
from crewai.files.processing.transformers import (
    chunk_pdf,
    chunk_text,
    get_image_dimensions,
    get_pdf_page_count,
    optimize_image,
    resize_image,
)
from crewai.files.processing.validators import validate_file


logger = logging.getLogger(__name__)


class FileProcessor:
    """Processes files according to provider constraints and per-file handling mode.

    Validates files against provider-specific limits and optionally transforms
    them (resize, compress, chunk) to meet those limits. Each file specifies
    its own handling mode via `file.mode`.

    Attributes:
        constraints: Provider constraints for validation.
    """

    def __init__(
        self,
        constraints: ProviderConstraints | str | None = None,
    ) -> None:
        """Initialize the FileProcessor.

        Args:
            constraints: Provider constraints or provider name string.
                If None, validation is skipped.
        """
        if isinstance(constraints, str):
            resolved = get_constraints_for_provider(constraints)
            if resolved is None:
                logger.warning(
                    f"Unknown provider '{constraints}' - validation disabled"
                )
            self.constraints = resolved
        else:
            self.constraints = constraints

    def validate(self, file: FileInput) -> Sequence[str]:
        """Validate a file against provider constraints.

        Args:
            file: The file to validate.

        Returns:
            List of validation error messages (empty if valid).

        Raises:
            FileValidationError: If file.mode is STRICT and validation fails.
        """
        if self.constraints is None:
            return []

        mode = self._get_mode(file)
        raise_on_error = mode == FileHandling.STRICT
        return validate_file(file, self.constraints, raise_on_error=raise_on_error)

    @staticmethod
    def _get_mode(file: FileInput) -> FileHandling:
        """Get the handling mode for a file.

        Args:
            file: The file to get the mode for.

        Returns:
            The file's handling mode, defaulting to AUTO.
        """
        mode = getattr(file, "mode", None)
        if mode is None:
            return FileHandling.AUTO
        if isinstance(mode, str):
            return FileHandling(mode)
        if isinstance(mode, FileHandling):
            return mode
        return FileHandling.AUTO

    def process(self, file: FileInput) -> FileInput | Sequence[FileInput]:
        """Process a single file according to constraints and its handling mode.

        Args:
            file: The file to process.

        Returns:
            The processed file (possibly transformed) or a sequence of files
            if the file was chunked.

        Raises:
            FileProcessingError: If file.mode is STRICT and processing fails.
        """
        if self.constraints is None:
            return file

        mode = self._get_mode(file)

        try:
            errors = self.validate(file)

            if not errors:
                return file

            if mode == FileHandling.STRICT:
                raise FileValidationError("; ".join(errors), file_name=file.filename)

            if mode == FileHandling.WARN:
                for error in errors:
                    logger.warning(error)
                return file

            if mode == FileHandling.AUTO:
                return self._auto_process(file)

            if mode == FileHandling.CHUNK:
                return self._chunk_process(file)

            return file

        except (FileValidationError, FileTooLargeError, UnsupportedFileTypeError):
            raise
        except Exception as e:
            logger.error(f"Error processing file '{file.filename}': {e}")
            if mode == FileHandling.STRICT:
                raise FileProcessingError(str(e), file_name=file.filename) from e
            return file

    def process_files(
        self,
        files: dict[str, FileInput],
    ) -> dict[str, FileInput]:
        """Process multiple files according to constraints.

        Args:
            files: Dictionary mapping names to file inputs.

        Returns:
            Dictionary mapping names to processed files. If a file is chunked,
            multiple entries are created with indexed names.
        """
        result: dict[str, FileInput] = {}

        for name, file in files.items():
            processed = self.process(file)

            if isinstance(processed, Sequence) and not isinstance(
                processed, (str, bytes)
            ):
                for i, chunk in enumerate(processed):
                    chunk_name = f"{name}_chunk_{i}"
                    result[chunk_name] = chunk
            else:
                result[name] = processed

        return result

    async def aprocess_files(
        self,
        files: dict[str, FileInput],
        max_concurrency: int = 10,
    ) -> dict[str, FileInput]:
        """Asynchronously process multiple files in parallel.

        Args:
            files: Dictionary mapping names to file inputs.
            max_concurrency: Maximum number of concurrent processing tasks.

        Returns:
            Dictionary mapping names to processed files. If a file is chunked,
            multiple entries are created with indexed names.
        """
        semaphore = asyncio.Semaphore(max_concurrency)

        async def process_single(
            key: str, input_file: FileInput
        ) -> tuple[str, FileInput | Sequence[FileInput]]:
            """Process a single file with semaphore limiting."""
            async with semaphore:
                loop = asyncio.get_running_loop()
                result = await loop.run_in_executor(None, self.process, input_file)
                return key, result

        tasks = [process_single(n, f) for n, f in files.items()]
        gather_results = await asyncio.gather(*tasks, return_exceptions=True)

        output: dict[str, FileInput] = {}
        for item in gather_results:
            if isinstance(item, BaseException):
                logger.error(f"Processing failed: {item}")
                continue
            entry_name, processed = item
            if isinstance(processed, Sequence) and not isinstance(
                processed, (str, bytes)
            ):
                for i, chunk in enumerate(processed):
                    output[f"{entry_name}_chunk_{i}"] = chunk
            elif isinstance(
                processed, (AudioFile, File, ImageFile, PDFFile, TextFile, VideoFile)
            ):
                output[entry_name] = processed

        return output

    def _auto_process(self, file: FileInput) -> FileInput:
        """Automatically resize/compress a file to meet constraints.

        Args:
            file: The file to process.

        Returns:
            The processed file.
        """
        if self.constraints is None:
            return file

        if isinstance(file, ImageFile) and self.constraints.image is not None:
            return self._auto_process_image(file)

        if isinstance(file, PDFFile) and self.constraints.pdf is not None:
            logger.warning(
                f"Cannot auto-compress PDF '{file.filename}'. "
                "Consider using CHUNK mode for large PDFs."
            )
            return file

        if isinstance(file, (AudioFile, VideoFile)):
            logger.warning(
                f"Auto-processing not supported for {type(file).__name__}. "
                "File will be used as-is."
            )
            return file

        return file

    def _auto_process_image(self, file: ImageFile) -> ImageFile:
        """Auto-process an image file.

        Args:
            file: The image file to process.

        Returns:
            The processed image file.
        """
        if self.constraints is None or self.constraints.image is None:
            return file

        image_constraints = self.constraints.image
        processed = file
        content = file.read()
        current_size = len(content)

        if image_constraints.max_width or image_constraints.max_height:
            dimensions = get_image_dimensions(file)
            if dimensions:
                width, height = dimensions
                max_w = image_constraints.max_width or width
                max_h = image_constraints.max_height or height

                if width > max_w or height > max_h:
                    try:
                        processed = resize_image(file, max_w, max_h)
                        content = processed.read()
                        current_size = len(content)
                    except Exception as e:
                        logger.warning(f"Failed to resize image: {e}")

        if current_size > image_constraints.max_size_bytes:
            try:
                processed = optimize_image(processed, image_constraints.max_size_bytes)
            except Exception as e:
                logger.warning(f"Failed to optimize image: {e}")

        return processed

    def _chunk_process(self, file: FileInput) -> FileInput | Sequence[FileInput]:
        """Split a file into chunks to meet constraints.

        Args:
            file: The file to chunk.

        Returns:
            The original file if chunking is not needed, or a sequence of chunked files.
        """
        if self.constraints is None:
            return file

        if isinstance(file, PDFFile) and self.constraints.pdf is not None:
            max_pages = self.constraints.pdf.max_pages
            if max_pages is not None:
                page_count = get_pdf_page_count(file)
                if page_count is not None and page_count > max_pages:
                    try:
                        return list(chunk_pdf(file, max_pages))
                    except Exception as e:
                        logger.warning(f"Failed to chunk PDF: {e}")
            return file

        if isinstance(file, TextFile):
            # Use the general max size as a character-limit approximation
            max_size = self.constraints.general_max_size_bytes
            if max_size is not None:
                content = file.read()
                if len(content) > max_size:
                    try:
                        return list(chunk_text(file, max_size))
                    except Exception as e:
                        logger.warning(f"Failed to chunk text file: {e}")
            return file

        if isinstance(file, (ImageFile, AudioFile, VideoFile)):
            logger.warning(
                f"Chunking not supported for {type(file).__name__}. "
                "Consider using AUTO mode for images."
            )

        return file
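End to end, the processor is constructed from a provider name and handed a mapping of files; a hedged usage sketch (file construction elided):

    from crewai.files.processing.file_processor import FileProcessor

    processor = FileProcessor(constraints="anthropic")
    processed = processor.process_files({"diagram": image})  # image: an ImageFile built elsewhere
    # Oversized images come back resized/optimized; chunked files appear as
    # "diagram_chunk_0", "diagram_chunk_1", ... in the returned dict.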
@@ -1,336 +0,0 @@
"""File transformation functions for resizing, optimizing, and chunking."""

from collections.abc import Iterator
import io
import logging

from crewai.files.content_types import ImageFile, PDFFile, TextFile
from crewai.files.file import FileBytes
from crewai.files.processing.exceptions import ProcessingDependencyError


logger = logging.getLogger(__name__)


def resize_image(
    file: ImageFile,
    max_width: int,
    max_height: int,
    *,
    preserve_aspect_ratio: bool = True,
) -> ImageFile:
    """Resize an image to fit within the specified dimensions.

    Args:
        file: The image file to resize.
        max_width: Maximum width in pixels.
        max_height: Maximum height in pixels.
        preserve_aspect_ratio: If True, maintain aspect ratio while fitting within bounds.

    Returns:
        A new ImageFile with the resized image data.

    Raises:
        ProcessingDependencyError: If Pillow is not installed.
    """
    try:
        from PIL import Image
    except ImportError as e:
        raise ProcessingDependencyError(
            "Pillow is required for image resizing",
            dependency="Pillow",
            install_command="pip install Pillow",
        ) from e

    content = file.read()

    with Image.open(io.BytesIO(content)) as img:
        original_width, original_height = img.size

        if original_width <= max_width and original_height <= max_height:
            return file

        if preserve_aspect_ratio:
            width_ratio = max_width / original_width
            height_ratio = max_height / original_height
            scale_factor = min(width_ratio, height_ratio)

            new_width = int(original_width * scale_factor)
            new_height = int(original_height * scale_factor)
        else:
            new_width = min(original_width, max_width)
            new_height = min(original_height, max_height)

        resized_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

        output_format = img.format or "PNG"
        if output_format.upper() == "JPEG":
            if resized_img.mode in ("RGBA", "LA", "P"):
                resized_img = resized_img.convert("RGB")

        output_buffer = io.BytesIO()
        resized_img.save(output_buffer, format=output_format)
        output_bytes = output_buffer.getvalue()

    logger.info(
        f"Resized image '{file.filename}' from {original_width}x{original_height} "
        f"to {new_width}x{new_height}"
    )

    return ImageFile(source=FileBytes(data=output_bytes, filename=file.filename))


def optimize_image(
    file: ImageFile,
    target_size_bytes: int,
    *,
    min_quality: int = 20,
    initial_quality: int = 85,
) -> ImageFile:
    """Optimize an image to fit within a target file size.

    Uses iterative quality reduction to achieve the target size.

    Args:
        file: The image file to optimize.
        target_size_bytes: Target maximum file size in bytes.
        min_quality: Minimum quality to use (prevents excessive degradation).
        initial_quality: Starting quality for optimization.

    Returns:
        A new ImageFile with the optimized image data.

    Raises:
        ProcessingDependencyError: If Pillow is not installed.
    """
    try:
        from PIL import Image
    except ImportError as e:
        raise ProcessingDependencyError(
            "Pillow is required for image optimization",
            dependency="Pillow",
            install_command="pip install Pillow",
        ) from e

    content = file.read()
    current_size = len(content)

    if current_size <= target_size_bytes:
        return file

    with Image.open(io.BytesIO(content)) as img:
        if img.mode in ("RGBA", "LA", "P"):
            img = img.convert("RGB")
            output_format = "JPEG"
        else:
            output_format = img.format or "JPEG"
            if output_format.upper() not in ("JPEG", "JPG"):
                output_format = "JPEG"

        quality = initial_quality
        output_bytes = content

        while len(output_bytes) > target_size_bytes and quality >= min_quality:
            output_buffer = io.BytesIO()
            img.save(
                output_buffer, format=output_format, quality=quality, optimize=True
            )
            output_bytes = output_buffer.getvalue()

            if len(output_bytes) > target_size_bytes:
                quality -= 5

    logger.info(
        f"Optimized image '{file.filename}' from {current_size} bytes to "
        f"{len(output_bytes)} bytes (quality={quality})"
    )

    filename = file.filename
    if (
        filename
        and output_format.upper() == "JPEG"
        and not filename.lower().endswith((".jpg", ".jpeg"))
    ):
        filename = filename.rsplit(".", 1)[0] + ".jpg"

    return ImageFile(source=FileBytes(data=output_bytes, filename=filename))


def chunk_pdf(
    file: PDFFile,
    max_pages: int,
    *,
    overlap_pages: int = 0,
) -> Iterator[PDFFile]:
    """Split a PDF into chunks of a maximum page count.

    Yields chunks one at a time to minimize memory usage.

    Args:
        file: The PDF file to chunk.
        max_pages: Maximum pages per chunk.
        overlap_pages: Number of overlapping pages between chunks (for context).

    Yields:
        PDFFile objects, one per chunk.

    Raises:
        ProcessingDependencyError: If pypdf is not installed.
    """
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError as e:
        raise ProcessingDependencyError(
            "pypdf is required for PDF chunking",
            dependency="pypdf",
            install_command="pip install pypdf",
        ) from e

    content = file.read()
    reader = PdfReader(io.BytesIO(content))
    total_pages = len(reader.pages)

    if total_pages <= max_pages:
        yield file
        return

    filename = file.filename or "document.pdf"
    base_filename = filename.rsplit(".", 1)[0]
    step = max_pages - overlap_pages

    chunk_num = 0
    start_page = 0

    while start_page < total_pages:
        end_page = min(start_page + max_pages, total_pages)

        writer = PdfWriter()
        for page_num in range(start_page, end_page):
            writer.add_page(reader.pages[page_num])

        output_buffer = io.BytesIO()
        writer.write(output_buffer)
        output_bytes = output_buffer.getvalue()

        chunk_filename = f"{base_filename}_chunk_{chunk_num}.pdf"

        logger.info(
            f"Created PDF chunk '{chunk_filename}' with pages {start_page + 1}-{end_page}"
        )

        yield PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename))

        start_page += step
        chunk_num += 1


def chunk_text(
    file: TextFile,
    max_chars: int,
    *,
    overlap_chars: int = 200,
    split_on_newlines: bool = True,
) -> Iterator[TextFile]:
    """Split a text file into chunks of a maximum character count.

    Yields chunks one at a time to minimize memory usage.

    Args:
        file: The text file to chunk.
        max_chars: Maximum characters per chunk.
        overlap_chars: Number of overlapping characters between chunks.
        split_on_newlines: If True, prefer splitting at newline boundaries.

    Yields:
        TextFile objects, one per chunk.
    """
    content = file.read()
    text = content.decode(errors="replace")
    total_chars = len(text)

    if total_chars <= max_chars:
        yield file
        return

    filename = file.filename or "text.txt"
    base_filename = filename.rsplit(".", 1)[0]
    extension = filename.rsplit(".", 1)[-1] if "." in filename else "txt"

    chunk_num = 0
    start_pos = 0

    while start_pos < total_chars:
        end_pos = min(start_pos + max_chars, total_chars)

        if end_pos < total_chars and split_on_newlines:
            last_newline = text.rfind("\n", start_pos, end_pos)
            if last_newline > start_pos + max_chars // 2:
                end_pos = last_newline + 1

        chunk_content = text[start_pos:end_pos]
        chunk_bytes = chunk_content.encode()

        chunk_filename = f"{base_filename}_chunk_{chunk_num}.{extension}"

        logger.info(
            f"Created text chunk '{chunk_filename}' with {len(chunk_content)} characters"
        )

        yield TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename))

        if end_pos < total_chars:
            start_pos = max(start_pos + 1, end_pos - overlap_chars)
        else:
            start_pos = total_chars
        chunk_num += 1


def get_image_dimensions(file: ImageFile) -> tuple[int, int] | None:
    """Get the dimensions of an image file.

    Args:
        file: The image file to measure.

    Returns:
        Tuple of (width, height) in pixels, or None if dimensions cannot be determined.
    """
    try:
        from PIL import Image
    except ImportError:
        logger.warning("Pillow not installed - cannot get image dimensions")
        return None

    content = file.read()

    try:
        with Image.open(io.BytesIO(content)) as img:
            width, height = img.size
            return width, height
    except Exception as e:
        logger.warning(f"Failed to get image dimensions: {e}")
        return None


def get_pdf_page_count(file: PDFFile) -> int | None:
    """Get the page count of a PDF file.

    Args:
        file: The PDF file to measure.

    Returns:
        Number of pages, or None if the page count cannot be determined.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        logger.warning("pypdf not installed - cannot get PDF page count")
        return None

    content = file.read()

    try:
        reader = PdfReader(io.BytesIO(content))
        return len(reader.pages)
    except Exception as e:
        logger.warning(f"Failed to get PDF page count: {e}")
        return None
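The overlap arithmetic is easiest to see with concrete numbers: with max_pages=10 and overlap_pages=2 the step is 8, so a 20-page PDF yields chunks covering pages 1-10, 9-18, and 17-20 (a sketch; big_pdf stands in for a real PDFFile):

    chunks = list(chunk_pdf(big_pdf, max_pages=10, overlap_pages=2))
    # step = 10 - 2 = 8 -> start pages 0, 8, 16 -> chunks 1-10, 9-18, 17-20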
@@ -1,564 +0,0 @@
"""File validation functions for checking against provider constraints."""

from collections.abc import Sequence
import io
import logging

from crewai.files.content_types import (
    AudioFile,
    FileInput,
    ImageFile,
    PDFFile,
    TextFile,
    VideoFile,
)
from crewai.files.processing.constraints import (
    AudioConstraints,
    ImageConstraints,
    PDFConstraints,
    ProviderConstraints,
    VideoConstraints,
)
from crewai.files.processing.exceptions import (
    FileTooLargeError,
    FileValidationError,
    UnsupportedFileTypeError,
)


logger = logging.getLogger(__name__)


def _get_image_dimensions(content: bytes) -> tuple[int, int] | None:
    """Get image dimensions using Pillow if available.

    Args:
        content: Raw image bytes.

    Returns:
        Tuple of (width, height) or None if Pillow is unavailable.
    """
    try:
        from PIL import Image

        with Image.open(io.BytesIO(content)) as img:
            width, height = img.size
            return int(width), int(height)
    except ImportError:
        logger.warning(
            "Pillow not installed - cannot validate image dimensions. "
            "Install with: pip install Pillow"
        )
        return None


def _get_pdf_page_count(content: bytes) -> int | None:
    """Get PDF page count using pypdf if available.

    Args:
        content: Raw PDF bytes.

    Returns:
        Page count or None if pypdf is unavailable.
    """
    try:
        from pypdf import PdfReader

        reader = PdfReader(io.BytesIO(content))
        return len(reader.pages)
    except ImportError:
        logger.warning(
            "pypdf not installed - cannot validate PDF page count. "
            "Install with: pip install pypdf"
        )
        return None


def _get_audio_duration(content: bytes, filename: str | None = None) -> float | None:
    """Get audio duration in seconds using tinytag if available.

    Args:
        content: Raw audio bytes.
        filename: Optional filename for format detection hint.

    Returns:
        Duration in seconds or None if tinytag is unavailable.
    """
    try:
        from tinytag import TinyTag  # type: ignore[import-untyped]
    except ImportError:
        logger.warning(
            "tinytag not installed - cannot validate audio duration. "
            "Install with: pip install tinytag"
        )
        return None

    try:
        tag = TinyTag.get(file_obj=io.BytesIO(content), filename=filename)
        duration: float | None = tag.duration
        return duration
    except Exception as e:
        logger.debug(f"Could not determine audio duration: {e}")
        return None


_VIDEO_FORMAT_MAP: dict[str, str] = {
    "video/mp4": "mp4",
    "video/webm": "webm",
    "video/x-matroska": "matroska",
    "video/quicktime": "mov",
    "video/x-msvideo": "avi",
    "video/x-flv": "flv",
}


def _get_video_duration(
    content: bytes, content_type: str | None = None
) -> float | None:
    """Get video duration in seconds using av if available.

    Args:
        content: Raw video bytes.
        content_type: Optional MIME type for format detection hint.

    Returns:
        Duration in seconds or None if av is unavailable.
    """
    try:
        import av
    except ImportError:
        logger.warning(
            "av (PyAV) not installed - cannot validate video duration. "
            "Install with: pip install av"
        )
        return None

    format_hint = _VIDEO_FORMAT_MAP.get(content_type) if content_type else None

    try:
        with av.open(io.BytesIO(content), format=format_hint) as container:  # type: ignore[attr-defined]
            duration: int | None = container.duration  # type: ignore[union-attr]
            if duration is None:
                return None
            return float(duration) / 1_000_000
    except Exception as e:
        logger.debug(f"Could not determine video duration: {e}")

    return None


def _format_size(size_bytes: int) -> str:
    """Format a byte size as a human-readable string."""
    if size_bytes >= 1024 * 1024 * 1024:
        return f"{size_bytes / (1024 * 1024 * 1024):.1f}GB"
    if size_bytes >= 1024 * 1024:
        return f"{size_bytes / (1024 * 1024):.1f}MB"
    if size_bytes >= 1024:
        return f"{size_bytes / 1024:.1f}KB"
    return f"{size_bytes}B"


def _validate_size(
    file_type: str,
    filename: str | None,
    file_size: int,
    max_size: int,
    errors: list[str],
    raise_on_error: bool,
) -> None:
    """Validate file size against the maximum.

    Args:
        file_type: Type label for error messages (e.g., "Image", "PDF").
        filename: Name of the file being validated.
        file_size: Actual file size in bytes.
        max_size: Maximum allowed size in bytes.
        errors: List to append error messages to.
        raise_on_error: If True, raise FileTooLargeError on failure.
    """
    if file_size > max_size:
        msg = (
            f"{file_type} '{filename}' size ({_format_size(file_size)}) exceeds "
            f"maximum ({_format_size(max_size)})"
        )
        errors.append(msg)
        if raise_on_error:
            raise FileTooLargeError(
                msg,
                file_name=filename,
                actual_size=file_size,
                max_size=max_size,
            )


def _validate_format(
    file_type: str,
    filename: str | None,
    content_type: str,
    supported_formats: tuple[str, ...],
    errors: list[str],
    raise_on_error: bool,
) -> None:
    """Validate content type against supported formats.

    Args:
        file_type: Type label for error messages (e.g., "Image", "Audio").
        filename: Name of the file being validated.
        content_type: MIME type of the file.
        supported_formats: Tuple of supported MIME types.
        errors: List to append error messages to.
        raise_on_error: If True, raise UnsupportedFileTypeError on failure.
    """
    if content_type not in supported_formats:
        msg = (
            f"{file_type} format '{content_type}' is not supported. "
            f"Supported: {', '.join(supported_formats)}"
        )
        errors.append(msg)
        if raise_on_error:
            raise UnsupportedFileTypeError(
                msg, file_name=filename, content_type=content_type
            )


def validate_image(
    file: ImageFile,
    constraints: ImageConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Validate an image file against constraints.

    Args:
        file: The image file to validate.
        constraints: Image constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file exceeds dimension limits.
        UnsupportedFileTypeError: If the format is not supported.
    """
    errors: list[str] = []
    content = file.read()
    file_size = len(content)
    filename = file.filename

    _validate_size(
        "Image", filename, file_size, constraints.max_size_bytes, errors, raise_on_error
    )
    _validate_format(
        "Image",
        filename,
        file.content_type,
        constraints.supported_formats,
        errors,
        raise_on_error,
    )

    if constraints.max_width is not None or constraints.max_height is not None:
        dimensions = _get_image_dimensions(content)
        if dimensions is not None:
            width, height = dimensions

            if constraints.max_width and width > constraints.max_width:
                msg = (
                    f"Image '{filename}' width ({width}px) exceeds "
                    f"maximum ({constraints.max_width}px)"
                )
                errors.append(msg)
                if raise_on_error:
                    raise FileValidationError(msg, file_name=filename)

            if constraints.max_height and height > constraints.max_height:
                msg = (
                    f"Image '{filename}' height ({height}px) exceeds "
                    f"maximum ({constraints.max_height}px)"
                )
                errors.append(msg)
                if raise_on_error:
                    raise FileValidationError(msg, file_name=filename)

    return errors


def validate_pdf(
    file: PDFFile,
    constraints: PDFConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Validate a PDF file against constraints.

    Args:
        file: The PDF file to validate.
        constraints: PDF constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file exceeds page limits.
    """
    errors: list[str] = []
    content = file.read()
    file_size = len(content)
    filename = file.filename

    _validate_size(
        "PDF", filename, file_size, constraints.max_size_bytes, errors, raise_on_error
    )

    if constraints.max_pages is not None:
        page_count = _get_pdf_page_count(content)
        if page_count is not None and page_count > constraints.max_pages:
            msg = (
                f"PDF '{filename}' page count ({page_count}) exceeds "
                f"maximum ({constraints.max_pages})"
            )
            errors.append(msg)
            if raise_on_error:
                raise FileValidationError(msg, file_name=filename)

    return errors


def validate_audio(
    file: AudioFile,
    constraints: AudioConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Validate an audio file against constraints.

    Args:
        file: The audio file to validate.
        constraints: Audio constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file exceeds duration limits.
        UnsupportedFileTypeError: If the format is not supported.
    """
    errors: list[str] = []
    content = file.read()
    file_size = len(content)
    filename = file.filename

    _validate_size(
        "Audio",
        filename,
        file_size,
        constraints.max_size_bytes,
        errors,
        raise_on_error,
    )
    _validate_format(
        "Audio",
        filename,
        file.content_type,
        constraints.supported_formats,
        errors,
        raise_on_error,
    )

    if constraints.max_duration_seconds is not None:
        duration = _get_audio_duration(content, filename)
        if duration is not None and duration > constraints.max_duration_seconds:
            msg = (
                f"Audio '{filename}' duration ({duration:.1f}s) exceeds "
                f"maximum ({constraints.max_duration_seconds}s)"
            )
            errors.append(msg)
            if raise_on_error:
                raise FileValidationError(msg, file_name=filename)

    return errors


def validate_video(
    file: VideoFile,
    constraints: VideoConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Validate a video file against constraints.

    Args:
        file: The video file to validate.
        constraints: Video constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file exceeds duration limits.
        UnsupportedFileTypeError: If the format is not supported.
    """
    errors: list[str] = []
    content = file.read()
    file_size = len(content)
    filename = file.filename

    _validate_size(
        "Video",
        filename,
        file_size,
        constraints.max_size_bytes,
        errors,
        raise_on_error,
    )
    _validate_format(
        "Video",
        filename,
        file.content_type,
        constraints.supported_formats,
        errors,
        raise_on_error,
    )

    if constraints.max_duration_seconds is not None:
        duration = _get_video_duration(content, file.content_type)
        if duration is not None and duration > constraints.max_duration_seconds:
            msg = (
                f"Video '{filename}' duration ({duration:.1f}s) exceeds "
                f"maximum ({constraints.max_duration_seconds}s)"
            )
            errors.append(msg)
            if raise_on_error:
                raise FileValidationError(msg, file_name=filename)

    return errors


def validate_text(
    file: TextFile,
    constraints: ProviderConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Validate a text file against general constraints.

    Args:
        file: The text file to validate.
        constraints: Provider constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
    """
    errors: list[str] = []

    if constraints.general_max_size_bytes is None:
        return errors

    file_size = len(file.read())
    _validate_size(
        "Text file",
        file.filename,
        file_size,
        constraints.general_max_size_bytes,
        errors,
        raise_on_error,
    )

    return errors


def _check_unsupported_type(
    file: FileInput,
    provider_name: str,
    type_name: str,
    raise_on_error: bool,
) -> Sequence[str]:
    """Check whether the file type is unsupported and handle the error.

    Args:
        file: The file being validated.
        provider_name: Name of the provider.
        type_name: Name of the file type (e.g., "images", "PDFs").
        raise_on_error: If True, raise an exception instead of returning errors.

    Returns:
        List with the error message (only returned when raise_on_error is False).

    Raises:
        UnsupportedFileTypeError: If raise_on_error is True.
    """
    msg = f"Provider '{provider_name}' does not support {type_name}"
    if raise_on_error:
        raise UnsupportedFileTypeError(
            msg, file_name=file.filename, content_type=file.content_type
        )
    return [msg]


def validate_file(
    file: FileInput,
    constraints: ProviderConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Validate a file against provider constraints.

    Dispatches to the appropriate validator based on file type.

    Args:
        file: The file to validate.
        constraints: Provider constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file fails other validation checks.
        UnsupportedFileTypeError: If the file type is not supported.
    """
    if isinstance(file, ImageFile):
        if constraints.image is None:
            return _check_unsupported_type(
                file, constraints.name, "images", raise_on_error
            )
        return validate_image(file, constraints.image, raise_on_error=raise_on_error)

    if isinstance(file, PDFFile):
        if constraints.pdf is None:
            return _check_unsupported_type(
                file, constraints.name, "PDFs", raise_on_error
            )
        return validate_pdf(file, constraints.pdf, raise_on_error=raise_on_error)

    if isinstance(file, AudioFile):
        if constraints.audio is None:
            return _check_unsupported_type(
                file, constraints.name, "audio", raise_on_error
            )
        return validate_audio(file, constraints.audio, raise_on_error=raise_on_error)

    if isinstance(file, VideoFile):
        if constraints.video is None:
            return _check_unsupported_type(
                file, constraints.name, "video", raise_on_error
            )
        return validate_video(file, constraints.video, raise_on_error=raise_on_error)

    if isinstance(file, TextFile):
        return validate_text(file, constraints, raise_on_error=raise_on_error)

    return []
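In non-raising mode the dispatcher collects problems instead of failing fast, which is how FileProcessor implements WARN; a minimal sketch (image stands in for a real ImageFile, GEMINI_CONSTRAINTS comes from the constraints module above):

    errors = validate_file(image, GEMINI_CONSTRAINTS, raise_on_error=False)
    for err in errors:
        logger.warning(err)  # report all violations rather than raising on the first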
@@ -1,84 +0,0 @@
"""Resolved file types representing different delivery methods for file content."""

from abc import ABC
from dataclasses import dataclass
from datetime import datetime


@dataclass(frozen=True)
class ResolvedFile(ABC):
    """Base class for resolved file representations.

    A ResolvedFile represents the final form of a file ready for delivery
    to an LLM provider, whether inline or via reference.

    Attributes:
        content_type: MIME type of the file content.
    """

    content_type: str


@dataclass(frozen=True)
class InlineBase64(ResolvedFile):
    """File content encoded as a base64 string.

    Used by most providers for inline file content in messages.

    Attributes:
        content_type: MIME type of the file content.
        data: Base64-encoded file content.
    """

    data: str


@dataclass(frozen=True)
class InlineBytes(ResolvedFile):
    """File content as raw bytes.

    Used by providers like Bedrock that accept raw bytes instead of base64.

    Attributes:
        content_type: MIME type of the file content.
        data: Raw file bytes.
    """

    data: bytes


@dataclass(frozen=True)
class FileReference(ResolvedFile):
    """Reference to an uploaded file.

    Used when files are uploaded via provider File APIs.

    Attributes:
        content_type: MIME type of the file content.
        file_id: Provider-specific file identifier.
        provider: Name of the provider the file was uploaded to.
        expires_at: When the uploaded file expires (if applicable).
        file_uri: Optional URI for accessing the file (used by Gemini).
    """

    file_id: str
    provider: str
    expires_at: datetime | None = None
    file_uri: str | None = None


@dataclass(frozen=True)
class UrlReference(ResolvedFile):
    """Reference to a file accessible via URL.

    Used by providers that support fetching files from URLs.

    Attributes:
        content_type: MIME type of the file content.
        url: URL where the file can be accessed.
    """

    url: str


ResolvedFileType = InlineBase64 | InlineBytes | FileReference | UrlReference
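Downstream formatters typically branch on this union; an illustrative dispatch (a sketch, not the formatter crewai ships):

    def to_content_block(resolved: ResolvedFileType) -> dict:
        match resolved:
            case InlineBase64(content_type=ct, data=data):
                return {"type": "base64", "media_type": ct, "data": data}
            case InlineBytes(content_type=ct, data=data):
                return {"type": "bytes", "media_type": ct, "data": data}
            case FileReference(file_id=file_id):
                return {"type": "file", "file_id": file_id}
            case UrlReference(url=url):
                return {"type": "url", "url": url}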
@@ -1,670 +0,0 @@
"""FileResolver for deciding file delivery method and managing uploads."""

import asyncio
import base64
from dataclasses import dataclass, field
import hashlib
import logging

from crewai.files.constants import UPLOAD_MAX_RETRIES, UPLOAD_RETRY_DELAY_BASE
from crewai.files.content_types import FileInput
from crewai.files.file import FileUrl
from crewai.files.metrics import measure_operation
from crewai.files.processing.constraints import (
    AudioConstraints,
    ImageConstraints,
    PDFConstraints,
    ProviderConstraints,
    VideoConstraints,
    get_constraints_for_provider,
)
from crewai.files.resolved import (
    FileReference,
    InlineBase64,
    InlineBytes,
    ResolvedFile,
    UrlReference,
)
from crewai.files.upload_cache import CachedUpload, UploadCache
from crewai.files.uploaders import UploadResult, get_uploader
from crewai.files.uploaders.base import FileUploader
from crewai.files.uploaders.factory import ProviderType


logger = logging.getLogger(__name__)


@dataclass
class FileContext:
    """Cached file metadata to avoid redundant reads.

    Attributes:
        content: Raw file bytes.
        size: Size of the file in bytes.
        content_hash: SHA-256 hash of the file content.
        content_type: MIME type of the file.
    """

    content: bytes
    size: int
    content_hash: str
    content_type: str


@dataclass
class FileResolverConfig:
    """Configuration for FileResolver.

    Attributes:
        prefer_upload: If True, prefer uploading over inlining for supported providers.
        upload_threshold_bytes: Size threshold above which to use upload.
            If None, uses the provider-specific threshold.
        use_bytes_for_bedrock: If True, use raw bytes instead of base64 for Bedrock.
    """

    prefer_upload: bool = False
    upload_threshold_bytes: int | None = None
    use_bytes_for_bedrock: bool = True


@dataclass
class FileResolver:
    """Resolves files to their delivery format based on provider capabilities.

    Decides whether to use inline base64, raw bytes, or file upload based on:
    - Provider constraints and capabilities
    - File size
    - Configuration preferences

    Caches uploaded files to avoid redundant uploads.

    Attributes:
        config: Resolver configuration.
        upload_cache: Cache for tracking uploaded files.
    """

    config: FileResolverConfig = field(default_factory=FileResolverConfig)
    upload_cache: UploadCache | None = None
    _uploaders: dict[str, FileUploader] = field(default_factory=dict)

    @staticmethod
    def _build_file_context(file: FileInput) -> FileContext:
        """Build context by reading the file once.

        Args:
            file: The file to build context for.

        Returns:
            FileContext with cached metadata.
        """
        content = file.read()
        return FileContext(
            content=content,
            size=len(content),
            content_hash=hashlib.sha256(content).hexdigest(),
            content_type=file.content_type,
        )

    @staticmethod
    def _is_url_source(file: FileInput) -> bool:
        """Check whether the file source is a URL.

        Args:
            file: The file to check.

        Returns:
            True if the file source is a FileUrl, False otherwise.
        """
        return isinstance(file._file_source, FileUrl)

    @staticmethod
    def _supports_url(constraints: ProviderConstraints | None) -> bool:
        """Check whether the provider supports URL references.

        Args:
            constraints: Provider constraints.

        Returns:
            True if the provider supports URL references, False otherwise.
        """
        return constraints is not None and constraints.supports_url_references

    @staticmethod
    def _resolve_as_url(file: FileInput) -> UrlReference:
        """Resolve a URL source as a UrlReference.

        Args:
            file: The file with a URL source.

        Returns:
            UrlReference with the URL and content type.
        """
        source = file._file_source
        if not isinstance(source, FileUrl):
            raise TypeError(f"Expected FileUrl source, got {type(source).__name__}")
        return UrlReference(
            content_type=file.content_type,
            url=source.url,
        )

    def resolve(self, file: FileInput, provider: ProviderType) -> ResolvedFile:
        """Resolve a file to its delivery format for a provider.

        Args:
            file: The file to resolve.
            provider: Provider name (e.g., "gemini", "anthropic", "openai").

        Returns:
            ResolvedFile representing the appropriate delivery format.
        """
        constraints = get_constraints_for_provider(provider)

        if self._is_url_source(file) and self._supports_url(constraints):
            return self._resolve_as_url(file)

        context = self._build_file_context(file)

        should_upload = self._should_upload(file, provider, constraints, context.size)

        if should_upload:
            resolved = self._resolve_via_upload(file, provider, context)
            if resolved is not None:
                return resolved

        return self._resolve_inline(file, provider, context)

    def resolve_files(
        self,
        files: dict[str, FileInput],
        provider: ProviderType,
    ) -> dict[str, ResolvedFile]:
        """Resolve multiple files for a provider.

        Args:
            files: Dictionary mapping names to file inputs.
            provider: Provider name.

        Returns:
            Dictionary mapping names to resolved files.
        """
        return {name: self.resolve(file, provider) for name, file in files.items()}

    @staticmethod
    def _get_type_constraint(
        content_type: str,
        constraints: ProviderConstraints,
    ) -> ImageConstraints | PDFConstraints | AudioConstraints | VideoConstraints | None:
        """Get the type-specific constraint for a content type.

        Args:
            content_type: MIME type of the file.
            constraints: Provider constraints.

        Returns:
            Type-specific constraint or None if not found.
        """
        if content_type.startswith("image/"):
            return constraints.image
        if content_type == "application/pdf":
            return constraints.pdf
        if content_type.startswith("audio/"):
            return constraints.audio
        if content_type.startswith("video/"):
            return constraints.video
        return None

    def _should_upload(
        self,
        file: FileInput,
        provider: str,
        constraints: ProviderConstraints | None,
        file_size: int,
    ) -> bool:
        """Determine whether a file should be uploaded rather than inlined.

        Uses type-specific constraints to make smarter decisions:
        - Checks if the file exceeds type-specific inline size limits
        - Falls back to the general threshold if no type-specific constraint exists

        Args:
            file: The file to check.
            provider: Provider name.
            constraints: Provider constraints.
            file_size: Size of the file in bytes.

        Returns:
            True if the file should be uploaded, False otherwise.
        """
        if constraints is None or not constraints.supports_file_upload:
            return False

        if self.config.prefer_upload:
            return True

        content_type = file.content_type
        type_constraint = self._get_type_constraint(content_type, constraints)

        if type_constraint is not None:
            # Check if the file exceeds the type-specific inline limit
            if file_size > type_constraint.max_size_bytes:
                logger.debug(
                    f"File {file.filename} ({file_size}B) exceeds {content_type} "
                    f"inline limit ({type_constraint.max_size_bytes}B) for {provider}"
                )
                return True

        # Fall back to the general threshold
        threshold = self.config.upload_threshold_bytes
        if threshold is None:
            threshold = constraints.file_upload_threshold_bytes

        if threshold is not None and file_size > threshold:
            return True

        return False

    def _resolve_via_upload(
        self,
        file: FileInput,
        provider: ProviderType,
        context: FileContext,
    ) -> ResolvedFile | None:
        """Resolve a file by uploading it.

        Args:
            file: The file to upload.
|
||||
provider: Provider name.
|
||||
context: Pre-computed file context.
|
||||
|
||||
Returns:
|
||||
FileReference if upload succeeds, None otherwise.
|
||||
"""
|
||||
if self.upload_cache is not None:
|
||||
cached = self.upload_cache.get_by_hash(context.content_hash, provider)
|
||||
if cached is not None:
|
||||
logger.debug(
|
||||
f"Using cached upload for {file.filename}: {cached.file_id}"
|
||||
)
|
||||
return FileReference(
|
||||
content_type=cached.content_type,
|
||||
file_id=cached.file_id,
|
||||
provider=cached.provider,
|
||||
expires_at=cached.expires_at,
|
||||
file_uri=cached.file_uri,
|
||||
)
|
||||
|
||||
uploader = self._get_uploader(provider)
|
||||
if uploader is None:
|
||||
logger.debug(f"No uploader available for {provider}")
|
||||
return None
|
||||
|
||||
result = self._upload_with_retry(uploader, file, provider, context.size)
|
||||
if result is None:
|
||||
return None
|
||||
|
||||
if self.upload_cache is not None:
|
||||
self.upload_cache.set_by_hash(
|
||||
file_hash=context.content_hash,
|
||||
content_type=context.content_type,
|
||||
provider=provider,
|
||||
file_id=result.file_id,
|
||||
file_uri=result.file_uri,
|
||||
expires_at=result.expires_at,
|
||||
)
|
||||
|
||||
return FileReference(
|
||||
content_type=result.content_type,
|
||||
file_id=result.file_id,
|
||||
provider=result.provider,
|
||||
expires_at=result.expires_at,
|
||||
file_uri=result.file_uri,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _upload_with_retry(
|
||||
uploader: FileUploader,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
file_size: int,
|
||||
) -> UploadResult | None:
|
||||
"""Upload with exponential backoff retry.
|
||||
|
||||
Args:
|
||||
uploader: The uploader to use.
|
||||
file: The file to upload.
|
||||
provider: Provider name for logging.
|
||||
file_size: Size of the file in bytes.
|
||||
|
||||
Returns:
|
||||
UploadResult if successful, None otherwise.
|
||||
"""
|
||||
import time
|
||||
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
last_error: Exception | None = None
|
||||
|
||||
for attempt in range(UPLOAD_MAX_RETRIES):
|
||||
with measure_operation(
|
||||
"upload",
|
||||
filename=file.filename,
|
||||
provider=provider,
|
||||
size_bytes=file_size,
|
||||
attempt=attempt + 1,
|
||||
) as metrics:
|
||||
try:
|
||||
result = uploader.upload(file)
|
||||
metrics.metadata["file_id"] = result.file_id
|
||||
return result
|
||||
except PermanentUploadError as e:
|
||||
metrics.metadata["error_type"] = "permanent"
|
||||
logger.warning(
|
||||
f"Non-retryable upload error for {file.filename}: {e}"
|
||||
)
|
||||
return None
|
||||
except TransientUploadError as e:
|
||||
metrics.metadata["error_type"] = "transient"
|
||||
last_error = e
|
||||
except Exception as e:
|
||||
metrics.metadata["error_type"] = "unknown"
|
||||
last_error = e
|
||||
|
||||
if attempt < UPLOAD_MAX_RETRIES - 1:
|
||||
delay = UPLOAD_RETRY_DELAY_BASE**attempt
|
||||
logger.debug(
|
||||
f"Retrying upload for {file.filename} in {delay}s (attempt {attempt + 1})"
|
||||
)
|
||||
time.sleep(delay)
|
||||
|
||||
logger.warning(
|
||||
f"Upload failed for {file.filename} to {provider} after {UPLOAD_MAX_RETRIES} attempts: {last_error}"
|
||||
)
|
||||
return None
|
||||
|
||||
def _resolve_inline(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
context: FileContext,
|
||||
) -> ResolvedFile:
|
||||
"""Resolve a file as inline content.
|
||||
|
||||
Args:
|
||||
file: The file to resolve (used for logging).
|
||||
provider: Provider name.
|
||||
context: Pre-computed file context.
|
||||
|
||||
Returns:
|
||||
InlineBase64 or InlineBytes depending on provider.
|
||||
"""
|
||||
logger.debug(f"Resolving {file.filename} as inline for {provider}")
|
||||
if self.config.use_bytes_for_bedrock and "bedrock" in provider:
|
||||
return InlineBytes(
|
||||
content_type=context.content_type,
|
||||
data=context.content,
|
||||
)
|
||||
|
||||
encoded = base64.b64encode(context.content).decode("ascii")
|
||||
return InlineBase64(
|
||||
content_type=context.content_type,
|
||||
data=encoded,
|
||||
)
|
||||
|
||||
async def aresolve(self, file: FileInput, provider: ProviderType) -> ResolvedFile:
|
||||
"""Async resolve a file to its delivery format for a provider.
|
||||
|
||||
Args:
|
||||
file: The file to resolve.
|
||||
provider: Provider name (e.g., "gemini", "anthropic", "openai").
|
||||
|
||||
Returns:
|
||||
ResolvedFile representing the appropriate delivery format.
|
||||
"""
|
||||
constraints = get_constraints_for_provider(provider)
|
||||
|
||||
if self._is_url_source(file) and self._supports_url(constraints):
|
||||
return self._resolve_as_url(file)
|
||||
|
||||
context = self._build_file_context(file)
|
||||
|
||||
should_upload = self._should_upload(file, provider, constraints, context.size)
|
||||
|
||||
if should_upload:
|
||||
resolved = await self._aresolve_via_upload(file, provider, context)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return self._resolve_inline(file, provider, context)
|
||||
|
||||
async def aresolve_files(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
provider: ProviderType,
|
||||
max_concurrency: int = 10,
|
||||
) -> dict[str, ResolvedFile]:
|
||||
"""Async resolve multiple files in parallel.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping names to file inputs.
|
||||
provider: Provider name.
|
||||
max_concurrency: Maximum number of concurrent resolutions.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping names to resolved files.
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(max_concurrency)
|
||||
|
||||
async def resolve_single(
|
||||
entry_key: str, input_file: FileInput
|
||||
) -> tuple[str, ResolvedFile]:
|
||||
"""Resolve a single file with semaphore limiting."""
|
||||
async with semaphore:
|
||||
entry_resolved = await self.aresolve(input_file, provider)
|
||||
return entry_key, entry_resolved
|
||||
|
||||
tasks = [resolve_single(n, f) for n, f in files.items()]
|
||||
gather_results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
output: dict[str, ResolvedFile] = {}
|
||||
for item in gather_results:
|
||||
if isinstance(item, BaseException):
|
||||
logger.error(f"Resolution failed: {item}")
|
||||
continue
|
||||
key, resolved = item
|
||||
output[key] = resolved
|
||||
|
||||
return output
|
||||
|
||||
async def _aresolve_via_upload(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: ProviderType,
|
||||
context: FileContext,
|
||||
) -> ResolvedFile | None:
|
||||
"""Async resolve a file by uploading it.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
provider: Provider name.
|
||||
context: Pre-computed file context.
|
||||
|
||||
Returns:
|
||||
FileReference if upload succeeds, None otherwise.
|
||||
"""
|
||||
if self.upload_cache is not None:
|
||||
cached = await self.upload_cache.aget_by_hash(
|
||||
context.content_hash, provider
|
||||
)
|
||||
if cached is not None:
|
||||
logger.debug(
|
||||
f"Using cached upload for {file.filename}: {cached.file_id}"
|
||||
)
|
||||
return FileReference(
|
||||
content_type=cached.content_type,
|
||||
file_id=cached.file_id,
|
||||
provider=cached.provider,
|
||||
expires_at=cached.expires_at,
|
||||
file_uri=cached.file_uri,
|
||||
)
|
||||
|
||||
uploader = self._get_uploader(provider)
|
||||
if uploader is None:
|
||||
logger.debug(f"No uploader available for {provider}")
|
||||
return None
|
||||
|
||||
result = await self._aupload_with_retry(uploader, file, provider, context.size)
|
||||
if result is None:
|
||||
return None
|
||||
|
||||
if self.upload_cache is not None:
|
||||
await self.upload_cache.aset_by_hash(
|
||||
file_hash=context.content_hash,
|
||||
content_type=context.content_type,
|
||||
provider=provider,
|
||||
file_id=result.file_id,
|
||||
file_uri=result.file_uri,
|
||||
expires_at=result.expires_at,
|
||||
)
|
||||
|
||||
return FileReference(
|
||||
content_type=result.content_type,
|
||||
file_id=result.file_id,
|
||||
provider=result.provider,
|
||||
expires_at=result.expires_at,
|
||||
file_uri=result.file_uri,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
async def _aupload_with_retry(
|
||||
uploader: FileUploader,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
file_size: int,
|
||||
) -> UploadResult | None:
|
||||
"""Async upload with exponential backoff retry.
|
||||
|
||||
Args:
|
||||
uploader: The uploader to use.
|
||||
file: The file to upload.
|
||||
provider: Provider name for logging.
|
||||
file_size: Size of the file in bytes.
|
||||
|
||||
Returns:
|
||||
UploadResult if successful, None otherwise.
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
last_error: Exception | None = None
|
||||
|
||||
for attempt in range(UPLOAD_MAX_RETRIES):
|
||||
with measure_operation(
|
||||
"upload",
|
||||
filename=file.filename,
|
||||
provider=provider,
|
||||
size_bytes=file_size,
|
||||
attempt=attempt + 1,
|
||||
) as metrics:
|
||||
try:
|
||||
result = await uploader.aupload(file)
|
||||
metrics.metadata["file_id"] = result.file_id
|
||||
return result
|
||||
except PermanentUploadError as e:
|
||||
metrics.metadata["error_type"] = "permanent"
|
||||
logger.warning(
|
||||
f"Non-retryable upload error for {file.filename}: {e}"
|
||||
)
|
||||
return None
|
||||
except TransientUploadError as e:
|
||||
metrics.metadata["error_type"] = "transient"
|
||||
last_error = e
|
||||
except Exception as e:
|
||||
metrics.metadata["error_type"] = "unknown"
|
||||
last_error = e
|
||||
|
||||
if attempt < UPLOAD_MAX_RETRIES - 1:
|
||||
delay = UPLOAD_RETRY_DELAY_BASE**attempt
|
||||
logger.debug(
|
||||
f"Retrying upload for {file.filename} in {delay}s (attempt {attempt + 1})"
|
||||
)
|
||||
await asyncio.sleep(delay)
|
||||
|
||||
logger.warning(
|
||||
f"Upload failed for {file.filename} to {provider} after {UPLOAD_MAX_RETRIES} attempts: {last_error}"
|
||||
)
|
||||
return None
|
||||
|
||||
def _get_uploader(self, provider: ProviderType) -> FileUploader | None:
|
||||
"""Get or create an uploader for a provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
FileUploader instance or None if not available.
|
||||
"""
|
||||
if provider not in self._uploaders:
|
||||
uploader = get_uploader(provider)
|
||||
if uploader is not None:
|
||||
self._uploaders[provider] = uploader
|
||||
else:
|
||||
return None
|
||||
|
||||
return self._uploaders.get(provider)
|
||||
|
||||
def get_cached_uploads(self, provider: str) -> list[CachedUpload]:
|
||||
"""Get all cached uploads for a provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
List of cached uploads.
|
||||
"""
|
||||
if self.upload_cache is None:
|
||||
return []
|
||||
return self.upload_cache.get_all_for_provider(provider)
|
||||
|
||||
def clear_cache(self) -> None:
|
||||
"""Clear the upload cache."""
|
||||
if self.upload_cache is not None:
|
||||
self.upload_cache.clear()
|
||||
|
||||
|
||||
def create_resolver(
|
||||
provider: str | None = None,
|
||||
prefer_upload: bool = False,
|
||||
upload_threshold_bytes: int | None = None,
|
||||
enable_cache: bool = True,
|
||||
) -> FileResolver:
|
||||
"""Create a configured FileResolver.
|
||||
|
||||
Args:
|
||||
provider: Optional provider name to load default threshold from constraints.
|
||||
prefer_upload: Whether to prefer upload over inline.
|
||||
upload_threshold_bytes: Size threshold for using upload. If None and
|
||||
provider is specified, uses provider's default threshold.
|
||||
enable_cache: Whether to enable upload caching.
|
||||
|
||||
Returns:
|
||||
Configured FileResolver instance.
|
||||
"""
|
||||
threshold = upload_threshold_bytes
|
||||
if threshold is None and provider is not None:
|
||||
constraints = get_constraints_for_provider(provider)
|
||||
if constraints is not None:
|
||||
threshold = constraints.file_upload_threshold_bytes
|
||||
|
||||
config = FileResolverConfig(
|
||||
prefer_upload=prefer_upload,
|
||||
upload_threshold_bytes=threshold,
|
||||
)
|
||||
|
||||
cache = UploadCache() if enable_cache else None
|
||||
|
||||
return FileResolver(config=config, upload_cache=cache)
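A minimal usage sketch for the resolver above (not part of the original diff); "gemini" and `some_file` (any FileInput instance) are placeholders for illustration.

    resolver = create_resolver(provider="gemini", prefer_upload=False)
    resolved = resolver.resolve(some_file, provider="gemini")
    # resolved is a UrlReference, FileReference, InlineBytes, or InlineBase64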
@@ -1,546 +0,0 @@
"""Cache for tracking uploaded files using aiocache."""

from __future__ import annotations

import asyncio
import atexit
import builtins
from collections.abc import Iterator
from dataclasses import dataclass
from datetime import datetime, timezone
import hashlib
import logging
from typing import TYPE_CHECKING, Any

from aiocache import Cache  # type: ignore[import-untyped]
from aiocache.serializers import PickleSerializer  # type: ignore[import-untyped]

from crewai.files.constants import DEFAULT_MAX_CACHE_ENTRIES, DEFAULT_TTL_SECONDS


if TYPE_CHECKING:
    from crewai.files.content_types import FileInput

logger = logging.getLogger(__name__)


@dataclass
class CachedUpload:
    """Represents a cached file upload.

    Attributes:
        file_id: Provider-specific file identifier.
        provider: Name of the provider.
        file_uri: Optional URI for accessing the file.
        content_type: MIME type of the uploaded file.
        uploaded_at: When the file was uploaded.
        expires_at: When the upload expires (if applicable).
    """

    file_id: str
    provider: str
    file_uri: str | None
    content_type: str
    uploaded_at: datetime
    expires_at: datetime | None = None

    def is_expired(self) -> bool:
        """Check if this cached upload has expired."""
        if self.expires_at is None:
            return False
        return datetime.now(timezone.utc) >= self.expires_at


def _make_key(file_hash: str, provider: str) -> str:
    """Create a cache key from file hash and provider."""
    return f"upload:{provider}:{file_hash}"


def _compute_file_hash_streaming(chunks: Iterator[bytes]) -> str:
    """Compute SHA-256 hash from streaming chunks.

    Args:
        chunks: Iterator of byte chunks.

    Returns:
        Hexadecimal hash string.
    """
    hasher = hashlib.sha256()
    for chunk in chunks:
        hasher.update(chunk)
    return hasher.hexdigest()


def _compute_file_hash(file: FileInput) -> str:
    """Compute SHA-256 hash of file content.

    Uses streaming for FilePath sources to avoid loading large files into memory.
    """
    from crewai.files.file import FilePath

    source = file._file_source
    if isinstance(source, FilePath):
        return _compute_file_hash_streaming(source.read_chunks(chunk_size=1024 * 1024))
    content = file.read()
    return hashlib.sha256(content).hexdigest()


class UploadCache:
    """Async cache for tracking uploaded files using aiocache.

    Supports in-memory caching by default, with optional Redis backend
    for distributed setups.

    Attributes:
        ttl: Default time-to-live in seconds for cached entries.
        namespace: Cache namespace for isolation.
    """

    def __init__(
        self,
        ttl: int = DEFAULT_TTL_SECONDS,
        namespace: str = "crewai_uploads",
        cache_type: str = "memory",
        max_entries: int | None = DEFAULT_MAX_CACHE_ENTRIES,
        **cache_kwargs: Any,
    ) -> None:
        """Initialize the upload cache.

        Args:
            ttl: Default TTL in seconds.
            namespace: Cache namespace.
            cache_type: Backend type ("memory" or "redis").
            max_entries: Maximum cache entries (None for unlimited).
            **cache_kwargs: Additional args for cache backend.
        """
        self.ttl = ttl
        self.namespace = namespace
        self.max_entries = max_entries
        self._provider_keys: dict[str, set[str]] = {}
        self._key_access_order: list[str] = []

        if cache_type == "redis":
            self._cache = Cache(
                Cache.REDIS,
                serializer=PickleSerializer(),
                namespace=namespace,
                **cache_kwargs,
            )
        else:
            self._cache = Cache(
                serializer=PickleSerializer(),
                namespace=namespace,
            )

    def _track_key(self, provider: str, key: str) -> None:
        """Track a key for a provider (for cleanup) and access order."""
        if provider not in self._provider_keys:
            self._provider_keys[provider] = set()
        self._provider_keys[provider].add(key)
        if key in self._key_access_order:
            self._key_access_order.remove(key)
        self._key_access_order.append(key)

    def _untrack_key(self, provider: str, key: str) -> None:
        """Remove key tracking for a provider."""
        if provider in self._provider_keys:
            self._provider_keys[provider].discard(key)
        if key in self._key_access_order:
            self._key_access_order.remove(key)

    async def _evict_if_needed(self) -> int:
        """Evict oldest entries if limit exceeded.

        Returns:
            Number of entries evicted.
        """
        if self.max_entries is None:
            return 0

        current_count = len(self)
        if current_count < self.max_entries:
            return 0

        to_evict = max(1, self.max_entries // 10)
        return await self._evict_oldest(to_evict)

    async def _evict_oldest(self, count: int) -> int:
        """Evict the oldest entries from the cache.

        Args:
            count: Number of entries to evict.

        Returns:
            Number of entries actually evicted.
        """
        evicted = 0
        keys_to_evict = self._key_access_order[:count]

        for key in keys_to_evict:
            await self._cache.delete(key)
            self._key_access_order.remove(key)
            for provider_keys in self._provider_keys.values():
                provider_keys.discard(key)
            evicted += 1

        if evicted > 0:
            logger.debug(f"Evicted {evicted} oldest cache entries")

        return evicted

    async def aget(self, file: FileInput, provider: str) -> CachedUpload | None:
        """Get a cached upload for a file.

        Args:
            file: The file to look up.
            provider: The provider name.

        Returns:
            Cached upload if found and not expired, None otherwise.
        """
        file_hash = _compute_file_hash(file)
        return await self.aget_by_hash(file_hash, provider)

    async def aget_by_hash(self, file_hash: str, provider: str) -> CachedUpload | None:
        """Get a cached upload by file hash.

        Args:
            file_hash: Hash of the file content.
            provider: The provider name.

        Returns:
            Cached upload if found and not expired, None otherwise.
        """
        key = _make_key(file_hash, provider)
        result = await self._cache.get(key)

        if result is None:
            return None
        if isinstance(result, CachedUpload):
            if result.is_expired():
                await self._cache.delete(key)
                self._untrack_key(provider, key)
                return None
            return result
        return None

    async def aset(
        self,
        file: FileInput,
        provider: str,
        file_id: str,
        file_uri: str | None = None,
        expires_at: datetime | None = None,
    ) -> CachedUpload:
        """Cache an uploaded file.

        Args:
            file: The file that was uploaded.
            provider: The provider name.
            file_id: Provider-specific file identifier.
            file_uri: Optional URI for accessing the file.
            expires_at: When the upload expires.

        Returns:
            The created cache entry.
        """
        file_hash = _compute_file_hash(file)
        return await self.aset_by_hash(
            file_hash=file_hash,
            content_type=file.content_type,
            provider=provider,
            file_id=file_id,
            file_uri=file_uri,
            expires_at=expires_at,
        )

    async def aset_by_hash(
        self,
        file_hash: str,
        content_type: str,
        provider: str,
        file_id: str,
        file_uri: str | None = None,
        expires_at: datetime | None = None,
    ) -> CachedUpload:
        """Cache an uploaded file by hash.

        Args:
            file_hash: Hash of the file content.
            content_type: MIME type of the file.
            provider: The provider name.
            file_id: Provider-specific file identifier.
            file_uri: Optional URI for accessing the file.
            expires_at: When the upload expires.

        Returns:
            The created cache entry.
        """
        await self._evict_if_needed()

        key = _make_key(file_hash, provider)
        now = datetime.now(timezone.utc)

        cached = CachedUpload(
            file_id=file_id,
            provider=provider,
            file_uri=file_uri,
            content_type=content_type,
            uploaded_at=now,
            expires_at=expires_at,
        )

        ttl = self.ttl
        if expires_at is not None:
            ttl = max(0, int((expires_at - now).total_seconds()))

        await self._cache.set(key, cached, ttl=ttl)
        self._track_key(provider, key)
        logger.debug(f"Cached upload: {file_id} for provider {provider}")
        return cached

    async def aremove(self, file: FileInput, provider: str) -> bool:
        """Remove a cached upload.

        Args:
            file: The file to remove.
            provider: The provider name.

        Returns:
            True if entry was removed, False if not found.
        """
        file_hash = _compute_file_hash(file)
        key = _make_key(file_hash, provider)

        result = await self._cache.delete(key)
        removed = bool(result > 0 if isinstance(result, int) else result)
        if removed:
            self._untrack_key(provider, key)
        return removed

    async def aremove_by_file_id(self, file_id: str, provider: str) -> bool:
        """Remove a cached upload by file ID.

        Args:
            file_id: The file ID to remove.
            provider: The provider name.

        Returns:
            True if entry was removed, False if not found.
        """
        if provider not in self._provider_keys:
            return False

        for key in list(self._provider_keys[provider]):
            cached = await self._cache.get(key)
            if isinstance(cached, CachedUpload) and cached.file_id == file_id:
                await self._cache.delete(key)
                self._untrack_key(provider, key)
                return True
        return False

    async def aclear_expired(self) -> int:
        """Remove all expired entries from the cache.

        Returns:
            Number of entries removed.
        """
        removed = 0

        for provider, keys in list(self._provider_keys.items()):
            for key in list(keys):
                cached = await self._cache.get(key)
                if cached is None or (
                    isinstance(cached, CachedUpload) and cached.is_expired()
                ):
                    await self._cache.delete(key)
                    self._untrack_key(provider, key)
                    removed += 1

        if removed > 0:
            logger.debug(f"Cleared {removed} expired cache entries")
        return removed
    async def aclear(self) -> int:
        """Clear all entries from the cache.

        Returns:
            Number of entries cleared.
        """
        count = sum(len(keys) for keys in self._provider_keys.values())
        await self._cache.clear(namespace=self.namespace)
        self._provider_keys.clear()
        # Also drop the LRU bookkeeping so stale keys don't survive a full clear.
        self._key_access_order.clear()

        if count > 0:
            logger.debug(f"Cleared {count} cache entries")
        return count

    async def aget_all_for_provider(self, provider: str) -> list[CachedUpload]:
        """Get all cached uploads for a provider.

        Args:
            provider: The provider name.

        Returns:
            List of cached uploads for the provider.
        """
        if provider not in self._provider_keys:
            return []

        results: list[CachedUpload] = []
        for key in list(self._provider_keys[provider]):
            cached = await self._cache.get(key)
            if isinstance(cached, CachedUpload) and not cached.is_expired():
                results.append(cached)
        return results

    @staticmethod
    def _run_sync(coro: Any) -> Any:
        """Run an async coroutine from sync context without blocking event loop."""
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop is not None and loop.is_running():
            future = asyncio.run_coroutine_threadsafe(coro, loop)
            return future.result(timeout=30)
        return asyncio.run(coro)

    def get(self, file: FileInput, provider: str) -> CachedUpload | None:
        """Sync wrapper for aget."""
        result: CachedUpload | None = self._run_sync(self.aget(file, provider))
        return result

    def get_by_hash(self, file_hash: str, provider: str) -> CachedUpload | None:
        """Sync wrapper for aget_by_hash."""
        result: CachedUpload | None = self._run_sync(
            self.aget_by_hash(file_hash, provider)
        )
        return result

    def set(
        self,
        file: FileInput,
        provider: str,
        file_id: str,
        file_uri: str | None = None,
        expires_at: datetime | None = None,
    ) -> CachedUpload:
        """Sync wrapper for aset."""
        result: CachedUpload = self._run_sync(
            self.aset(file, provider, file_id, file_uri, expires_at)
        )
        return result

    def set_by_hash(
        self,
        file_hash: str,
        content_type: str,
        provider: str,
        file_id: str,
        file_uri: str | None = None,
        expires_at: datetime | None = None,
    ) -> CachedUpload:
        """Sync wrapper for aset_by_hash."""
        result: CachedUpload = self._run_sync(
            self.aset_by_hash(
                file_hash, content_type, provider, file_id, file_uri, expires_at
            )
        )
        return result

    def remove(self, file: FileInput, provider: str) -> bool:
        """Sync wrapper for aremove."""
        result: bool = self._run_sync(self.aremove(file, provider))
        return result

    def remove_by_file_id(self, file_id: str, provider: str) -> bool:
        """Sync wrapper for aremove_by_file_id."""
        result: bool = self._run_sync(self.aremove_by_file_id(file_id, provider))
        return result

    def clear_expired(self) -> int:
        """Sync wrapper for aclear_expired."""
        result: int = self._run_sync(self.aclear_expired())
        return result

    def clear(self) -> int:
        """Sync wrapper for aclear."""
        result: int = self._run_sync(self.aclear())
        return result

    def get_all_for_provider(self, provider: str) -> list[CachedUpload]:
        """Sync wrapper for aget_all_for_provider."""
        result: list[CachedUpload] = self._run_sync(
            self.aget_all_for_provider(provider)
        )
        return result

    def __len__(self) -> int:
        """Return the number of cached entries."""
        return sum(len(keys) for keys in self._provider_keys.values())

    def get_providers(self) -> builtins.set[str]:
        """Get all provider names that have cached entries.

        Returns:
            Set of provider names.
        """
        return builtins.set(self._provider_keys.keys())


_default_cache: UploadCache | None = None


def get_upload_cache(
    ttl: int = DEFAULT_TTL_SECONDS,
    namespace: str = "crewai_uploads",
    cache_type: str = "memory",
    **cache_kwargs: Any,
) -> UploadCache:
    """Get or create the default upload cache.

    Args:
        ttl: Default TTL in seconds.
        namespace: Cache namespace.
        cache_type: Backend type ("memory" or "redis").
        **cache_kwargs: Additional args for cache backend.

    Returns:
        The upload cache instance.
    """
    global _default_cache
    if _default_cache is None:
        _default_cache = UploadCache(
            ttl=ttl,
            namespace=namespace,
            cache_type=cache_type,
            **cache_kwargs,
        )
    return _default_cache


def reset_upload_cache() -> None:
    """Reset the default upload cache (useful for testing)."""
    global _default_cache
    if _default_cache is not None:
        _default_cache.clear()
    _default_cache = None


def _cleanup_on_exit() -> None:
    """Clean up uploaded files on process exit."""
    global _default_cache
    if _default_cache is None or len(_default_cache) == 0:
        return

    from crewai.files.cleanup import cleanup_uploaded_files

    try:
        cleanup_uploaded_files(_default_cache)
    except Exception as e:
        logger.debug(f"Error during exit cleanup: {e}")


atexit.register(_cleanup_on_exit)
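A short usage sketch for the cache above (not part of the original diff), exercising the sync wrappers of the default in-memory backend; the hash, MIME type, and file ID are placeholder values.

    cache = get_upload_cache(ttl=3600)
    cache.set_by_hash(
        file_hash="deadbeef",
        content_type="application/pdf",
        provider="anthropic",
        file_id="file_abc123",
    )
    hit = cache.get_by_hash("deadbeef", "anthropic")  # CachedUpload or None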
@@ -1,11 +0,0 @@
"""File uploader implementations for provider File APIs."""

from crewai.files.uploaders.base import FileUploader, UploadResult
from crewai.files.uploaders.factory import get_uploader


__all__ = [
    "FileUploader",
    "UploadResult",
    "get_uploader",
]
@@ -1,241 +0,0 @@
"""Anthropic Files API uploader implementation."""

from __future__ import annotations

import io
import logging
import os
from typing import Any

from crewai.files.content_types import FileInput
from crewai.files.processing.exceptions import classify_upload_error
from crewai.files.uploaders.base import FileUploader, UploadResult


logger = logging.getLogger(__name__)


class AnthropicFileUploader(FileUploader):
    """Uploader for Anthropic Files API.

    Uses the anthropic SDK to upload files. Files are stored persistently
    until explicitly deleted.
    """

    def __init__(self, api_key: str | None = None) -> None:
        """Initialize the Anthropic uploader.

        Args:
            api_key: Optional Anthropic API key. If not provided, uses
                ANTHROPIC_API_KEY environment variable.
        """
        self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        self._client: Any = None
        self._async_client: Any = None

    @property
    def provider_name(self) -> str:
        """Return the provider name."""
        return "anthropic"

    def _get_client(self) -> Any:
        """Get or create the Anthropic client."""
        if self._client is None:
            try:
                import anthropic

                self._client = anthropic.Anthropic(api_key=self._api_key)
            except ImportError as e:
                raise ImportError(
                    "anthropic is required for Anthropic file uploads. "
                    "Install with: pip install anthropic"
                ) from e
        return self._client

    def _get_async_client(self) -> Any:
        """Get or create the async Anthropic client."""
        if self._async_client is None:
            try:
                import anthropic

                self._async_client = anthropic.AsyncAnthropic(api_key=self._api_key)
            except ImportError as e:
                raise ImportError(
                    "anthropic is required for Anthropic file uploads. "
                    "Install with: pip install anthropic"
                ) from e
        return self._async_client

    def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
        """Upload a file to Anthropic.

        Args:
            file: The file to upload.
            purpose: Optional purpose for the file (default: "user_upload").

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            client = self._get_client()

            content = file.read()
            file_purpose = purpose or "user_upload"

            file_data = io.BytesIO(content)

            logger.info(
                f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)"
            )

            uploaded_file = client.files.create(
                file=(file.filename, file_data, file.content_type),
                purpose=file_purpose,
            )

            logger.info(f"Uploaded to Anthropic: {uploaded_file.id}")

            return UploadResult(
                file_id=uploaded_file.id,
                file_uri=None,
                content_type=file.content_type,
                expires_at=None,
                provider=self.provider_name,
            )
        except ImportError:
            raise
        except Exception as e:
            raise classify_upload_error(e, file.filename) from e

    def delete(self, file_id: str) -> bool:
        """Delete an uploaded file from Anthropic.

        Args:
            file_id: The file ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_client()
            client.files.delete(file_id=file_id)
            logger.info(f"Deleted Anthropic file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete Anthropic file {file_id}: {e}")
            return False

    def get_file_info(self, file_id: str) -> dict[str, Any] | None:
        """Get information about an uploaded file.

        Args:
            file_id: The file ID.

        Returns:
            Dictionary with file information, or None if not found.
        """
        try:
            client = self._get_client()
            file_info = client.files.retrieve(file_id=file_id)
            return {
                "id": file_info.id,
                "filename": file_info.filename,
                "purpose": file_info.purpose,
                "size_bytes": file_info.size_bytes,
                "created_at": file_info.created_at,
            }
        except Exception as e:
            logger.debug(f"Failed to get Anthropic file info for {file_id}: {e}")
            return None

    def list_files(self) -> list[dict[str, Any]]:
        """List all uploaded files.

        Returns:
            List of dictionaries with file information.
        """
        try:
            client = self._get_client()
            files = client.files.list()
            return [
                {
                    "id": f.id,
                    "filename": f.filename,
                    "purpose": f.purpose,
                    "size_bytes": f.size_bytes,
                    "created_at": f.created_at,
                }
                for f in files.data
            ]
        except Exception as e:
            logger.warning(f"Failed to list Anthropic files: {e}")
            return []

    async def aupload(
        self, file: FileInput, purpose: str | None = None
    ) -> UploadResult:
        """Async upload a file to Anthropic using native async client.

        Args:
            file: The file to upload.
            purpose: Optional purpose for the file (default: "user_upload").

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            client = self._get_async_client()

            content = await file.aread()
            file_purpose = purpose or "user_upload"

            file_data = io.BytesIO(content)

            logger.info(
                f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)"
            )

            uploaded_file = await client.files.create(
                file=(file.filename, file_data, file.content_type),
                purpose=file_purpose,
            )

            logger.info(f"Uploaded to Anthropic: {uploaded_file.id}")

            return UploadResult(
                file_id=uploaded_file.id,
                file_uri=None,
                content_type=file.content_type,
                expires_at=None,
                provider=self.provider_name,
            )
        except ImportError:
            raise
        except Exception as e:
            raise classify_upload_error(e, file.filename) from e

    async def adelete(self, file_id: str) -> bool:
        """Async delete an uploaded file from Anthropic.

        Args:
            file_id: The file ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_async_client()
            await client.files.delete(file_id=file_id)
            logger.info(f"Deleted Anthropic file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete Anthropic file {file_id}: {e}")
            return False
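An illustrative round trip through the uploader above (not part of the original diff); assumes ANTHROPIC_API_KEY is set and that `some_file` is any FileInput instance.

    uploader = AnthropicFileUploader()
    result = uploader.upload(some_file)            # UploadResult
    info = uploader.get_file_info(result.file_id)  # dict or None
    uploader.delete(result.file_id)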
@@ -1,118 +0,0 @@
"""Base class for file uploaders."""

from abc import ABC, abstractmethod
import asyncio
from dataclasses import dataclass
from datetime import datetime
from typing import Any

from crewai.files.content_types import FileInput


@dataclass
class UploadResult:
    """Result of a file upload operation.

    Attributes:
        file_id: Provider-specific file identifier.
        file_uri: Optional URI for accessing the file.
        content_type: MIME type of the uploaded file.
        expires_at: When the upload expires (if applicable).
        provider: Name of the provider.
    """

    file_id: str
    provider: str
    content_type: str
    file_uri: str | None = None
    expires_at: datetime | None = None


class FileUploader(ABC):
    """Abstract base class for provider file uploaders.

    Implementations handle uploading files to provider-specific File APIs.
    """

    @property
    @abstractmethod
    def provider_name(self) -> str:
        """Return the provider name."""

    @abstractmethod
    def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
        """Upload a file to the provider.

        Args:
            file: The file to upload.
            purpose: Optional purpose/description for the upload.

        Returns:
            UploadResult with the file identifier and metadata.

        Raises:
            Exception: If upload fails.
        """

    async def aupload(
        self, file: FileInput, purpose: str | None = None
    ) -> UploadResult:
        """Async upload a file to the provider.

        Default implementation runs sync upload in executor.
        Override in subclasses for native async support.

        Args:
            file: The file to upload.
            purpose: Optional purpose/description for the upload.

        Returns:
            UploadResult with the file identifier and metadata.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.upload, file, purpose)

    @abstractmethod
    def delete(self, file_id: str) -> bool:
        """Delete an uploaded file.

        Args:
            file_id: The file identifier to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """

    async def adelete(self, file_id: str) -> bool:
        """Async delete an uploaded file.

        Default implementation runs sync delete in executor.
        Override in subclasses for native async support.

        Args:
            file_id: The file identifier to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.delete, file_id)

    def get_file_info(self, file_id: str) -> dict[str, Any] | None:
        """Get information about an uploaded file.

        Args:
            file_id: The file identifier.

        Returns:
            Dictionary with file information, or None if not found.
        """
        return None

    def list_files(self) -> list[dict[str, Any]]:
        """List all uploaded files.

        Returns:
            List of dictionaries with file information.
        """
        return []
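A minimal sketch (not part of the original diff) of what a custom subclass of the base class above must implement; "NoopUploader" and its behavior are made up for illustration.

    class NoopUploader(FileUploader):
        @property
        def provider_name(self) -> str:
            return "noop"

        def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
            # Pretend the file was stored and hand back a fake identifier.
            return UploadResult(
                file_id=f"noop-{file.filename}",
                provider=self.provider_name,
                content_type=file.content_type,
            )

        def delete(self, file_id: str) -> bool:
            return True  # nothing to delete

    # aupload/adelete fall back to the executor-based defaults above.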
@@ -1,473 +0,0 @@
"""AWS Bedrock S3 file uploader implementation."""

from __future__ import annotations

import hashlib
import logging
import os
from pathlib import Path
from typing import Any

from crewai.files.constants import (
    MAX_CONCURRENCY,
    MULTIPART_CHUNKSIZE,
    MULTIPART_THRESHOLD,
)
from crewai.files.content_types import FileInput
from crewai.files.file import FileBytes, FilePath
from crewai.files.processing.exceptions import (
    PermanentUploadError,
    TransientUploadError,
)
from crewai.files.uploaders.base import FileUploader, UploadResult


logger = logging.getLogger(__name__)


def _classify_s3_error(e: Exception, filename: str | None) -> Exception:
    """Classify an S3 exception as transient or permanent upload error.

    Args:
        e: The exception to classify.
        filename: The filename for error context.

    Returns:
        A TransientUploadError or PermanentUploadError wrapping the original.
    """
    error_type = type(e).__name__
    error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "")

    if error_code in ("SlowDown", "ServiceUnavailable", "InternalError"):
        return TransientUploadError(f"Transient S3 error: {e}", file_name=filename)
    if error_code in ("AccessDenied", "InvalidAccessKeyId", "SignatureDoesNotMatch"):
        return PermanentUploadError(f"S3 authentication error: {e}", file_name=filename)
    if error_code in ("NoSuchBucket", "InvalidBucketName"):
        return PermanentUploadError(f"S3 bucket error: {e}", file_name=filename)
    if "Throttl" in error_type or "Throttl" in str(e):
        return TransientUploadError(f"S3 throttling: {e}", file_name=filename)
    return TransientUploadError(f"S3 upload failed: {e}", file_name=filename)


def _get_file_path(file: FileInput) -> Path | None:
    """Get the filesystem path if file source is FilePath.

    Args:
        file: The file input to check.

    Returns:
        Path if source is FilePath, None otherwise.
    """
    source = file._file_source
    if isinstance(source, FilePath):
        return source.path
    return None


def _get_file_size(file: FileInput) -> int | None:
    """Get file size without reading content if possible.

    Args:
        file: The file input.

    Returns:
        Size in bytes if determinable without reading, None otherwise.
    """
    source = file._file_source
    if isinstance(source, FilePath):
        return source.path.stat().st_size
    if isinstance(source, FileBytes):
        return len(source.data)
    return None


def _compute_hash_streaming(file_path: Path) -> str:
    """Compute SHA-256 hash by streaming file content.

    Args:
        file_path: Path to the file.

    Returns:
        First 16 characters of hex digest.
    """
    hasher = hashlib.sha256()
    with open(file_path, "rb") as f:
        while chunk := f.read(1024 * 1024):
            hasher.update(chunk)
    return hasher.hexdigest()[:16]


class BedrockFileUploader(FileUploader):
    """Uploader for AWS Bedrock via S3.

    Uploads files to S3 and returns S3 URIs that can be used with Bedrock's
    Converse API s3Location source format.
    """

    def __init__(
        self,
        bucket_name: str | None = None,
        bucket_owner: str | None = None,
        prefix: str = "crewai-files",
        region: str | None = None,
    ) -> None:
        """Initialize the Bedrock S3 uploader.

        Args:
            bucket_name: S3 bucket name. If not provided, uses
                CREWAI_BEDROCK_S3_BUCKET environment variable.
            bucket_owner: Optional bucket owner account ID for cross-account access.
                Uses CREWAI_BEDROCK_S3_BUCKET_OWNER environment variable if not provided.
            prefix: S3 key prefix for uploaded files (default: "crewai-files").
            region: AWS region. Uses AWS_REGION or AWS_DEFAULT_REGION if not provided.
        """
        self._bucket_name = bucket_name or os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
        self._bucket_owner = bucket_owner or os.environ.get(
            "CREWAI_BEDROCK_S3_BUCKET_OWNER"
        )
        self._prefix = prefix
        self._region = region or os.environ.get(
            "AWS_REGION", os.environ.get("AWS_DEFAULT_REGION")
        )
        self._client: Any = None
        self._async_client: Any = None

    @property
    def provider_name(self) -> str:
        """Return the provider name."""
        return "bedrock"

    @property
    def bucket_name(self) -> str:
        """Return the configured bucket name."""
        if not self._bucket_name:
            raise ValueError(
                "S3 bucket name not configured. Set CREWAI_BEDROCK_S3_BUCKET "
                "environment variable or pass bucket_name parameter."
            )
        return self._bucket_name

    @property
    def bucket_owner(self) -> str | None:
        """Return the configured bucket owner."""
        return self._bucket_owner

    def _get_client(self) -> Any:
        """Get or create the S3 client."""
        if self._client is None:
            try:
                import boto3

                self._client = boto3.client("s3", region_name=self._region)
            except ImportError as e:
                raise ImportError(
                    "boto3 is required for Bedrock S3 file uploads. "
                    "Install with: pip install boto3"
                ) from e
        return self._client
    def _get_async_client(self) -> Any:
        """Get or create the async aioboto3 session."""
        if self._async_client is None:
            try:
                import aioboto3  # type: ignore[import-not-found]

                # Cache the session on the attribute the guard actually checks;
                # the original assigned to an uninitialized self._session, so the
                # session was recreated on every call.
                self._async_client = aioboto3.Session()
            except ImportError as e:
                raise ImportError(
                    "aioboto3 is required for async Bedrock S3 file uploads. "
                    "Install with: pip install aioboto3"
                ) from e
        return self._async_client

    def _generate_s3_key(self, file: FileInput, content: bytes | None = None) -> str:
        """Generate a unique S3 key for the file.

        For FilePath sources with no content provided, computes hash via streaming.

        Args:
            file: The file being uploaded.
            content: The file content bytes (optional for FilePath sources).

        Returns:
            S3 key string.
        """
        if content is not None:
            content_hash = hashlib.sha256(content).hexdigest()[:16]
        else:
            file_path = _get_file_path(file)
            if file_path is not None:
                content_hash = _compute_hash_streaming(file_path)
            else:
                content_hash = hashlib.sha256(file.read()).hexdigest()[:16]

        filename = file.filename or "file"
        safe_filename = "".join(
            c if c.isalnum() or c in ".-_" else "_" for c in filename
        )
        return f"{self._prefix}/{content_hash}_{safe_filename}"

    def _build_s3_uri(self, key: str) -> str:
        """Build an S3 URI from a key.

        Args:
            key: The S3 object key.

        Returns:
            S3 URI string.
        """
        return f"s3://{self.bucket_name}/{key}"

    @staticmethod
    def _get_transfer_config() -> Any:
        """Get boto3 TransferConfig for multipart uploads."""
        from boto3.s3.transfer import TransferConfig

        return TransferConfig(
            multipart_threshold=MULTIPART_THRESHOLD,
            multipart_chunksize=MULTIPART_CHUNKSIZE,
            max_concurrency=MAX_CONCURRENCY,
        )

    def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
        """Upload a file to S3 for use with Bedrock.

        Uses streaming upload with automatic multipart for large files.
        For FilePath sources, streams directly from disk without loading into memory.

        Args:
            file: The file to upload.
            purpose: Optional purpose (unused, kept for interface consistency).

        Returns:
            UploadResult with the S3 URI and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, throttling).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        import io

        try:
            client = self._get_client()
            transfer_config = self._get_transfer_config()
            file_path = _get_file_path(file)

            if file_path is not None:
                file_size = file_path.stat().st_size
                s3_key = self._generate_s3_key(file)

                logger.info(
                    f"Uploading file '{file.filename}' to S3 bucket "
                    f"'{self.bucket_name}' ({file_size} bytes, streaming)"
                )

                with open(file_path, "rb") as f:
                    client.upload_fileobj(
                        f,
                        self.bucket_name,
                        s3_key,
                        ExtraArgs={"ContentType": file.content_type},
                        Config=transfer_config,
                    )
            else:
                content = file.read()
                s3_key = self._generate_s3_key(file, content)

                logger.info(
                    f"Uploading file '{file.filename}' to S3 bucket "
                    f"'{self.bucket_name}' ({len(content)} bytes)"
                )

                client.upload_fileobj(
                    io.BytesIO(content),
                    self.bucket_name,
                    s3_key,
                    ExtraArgs={"ContentType": file.content_type},
                    Config=transfer_config,
                )

            s3_uri = self._build_s3_uri(s3_key)
            logger.info(f"Uploaded to S3: {s3_uri}")

            return UploadResult(
                file_id=s3_key,
                file_uri=s3_uri,
                content_type=file.content_type,
                expires_at=None,
                provider=self.provider_name,
            )
        except ImportError:
            raise
        except Exception as e:
            raise _classify_s3_error(e, file.filename) from e

    def delete(self, file_id: str) -> bool:
        """Delete an uploaded file from S3.

        Args:
            file_id: The S3 key to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_client()
            client.delete_object(Bucket=self.bucket_name, Key=file_id)
            logger.info(f"Deleted S3 object: s3://{self.bucket_name}/{file_id}")
            return True
        except Exception as e:
            logger.warning(
                f"Failed to delete S3 object s3://{self.bucket_name}/{file_id}: {e}"
            )
            return False

    def get_file_info(self, file_id: str) -> dict[str, Any] | None:
        """Get information about an uploaded file.

        Args:
            file_id: The S3 key.

        Returns:
            Dictionary with file information, or None if not found.
        """
        try:
            client = self._get_client()
            response = client.head_object(Bucket=self.bucket_name, Key=file_id)
            return {
                "id": file_id,
                "uri": self._build_s3_uri(file_id),
                "content_type": response.get("ContentType"),
                "size": response.get("ContentLength"),
                "last_modified": response.get("LastModified"),
                "etag": response.get("ETag"),
            }
        except Exception as e:
            logger.debug(f"Failed to get S3 object info for {file_id}: {e}")
            return None

    def list_files(self) -> list[dict[str, Any]]:
        """List all uploaded files in the configured prefix.

        Returns:
            List of dictionaries with file information.
        """
        try:
            client = self._get_client()
            response = client.list_objects_v2(
                Bucket=self.bucket_name,
                Prefix=self._prefix,
            )
            return [
                {
                    "id": obj["Key"],
                    "uri": self._build_s3_uri(obj["Key"]),
                    "size": obj.get("Size"),
                    "last_modified": obj.get("LastModified"),
                    "etag": obj.get("ETag"),
                }
                for obj in response.get("Contents", [])
            ]
        except Exception as e:
            logger.warning(f"Failed to list S3 objects: {e}")
            return []

    async def aupload(
        self, file: FileInput, purpose: str | None = None
    ) -> UploadResult:
        """Async upload a file to S3 for use with Bedrock.

        Uses streaming upload with automatic multipart for large files.
        For FilePath sources, streams directly from disk without loading into memory.

        Args:
            file: The file to upload.
            purpose: Optional purpose (unused, kept for interface consistency).

        Returns:
            UploadResult with the S3 URI and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, throttling).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        import io

        import aiofiles

        try:
            session = self._get_async_client()
            transfer_config = self._get_transfer_config()
            file_path = _get_file_path(file)

            if file_path is not None:
                file_size = file_path.stat().st_size
                s3_key = self._generate_s3_key(file)

                logger.info(
                    f"Uploading file '{file.filename}' to S3 bucket "
                    f"'{self.bucket_name}' ({file_size} bytes, streaming)"
                )

                async with session.client("s3", region_name=self._region) as client:
                    async with aiofiles.open(file_path, "rb") as f:
                        await client.upload_fileobj(
                            f,
                            self.bucket_name,
                            s3_key,
                            ExtraArgs={"ContentType": file.content_type},
                            Config=transfer_config,
                        )
            else:
                content = await file.aread()
                s3_key = self._generate_s3_key(file, content)

                logger.info(
                    f"Uploading file '{file.filename}' to S3 bucket "
                    f"'{self.bucket_name}' ({len(content)} bytes)"
                )

                async with session.client("s3", region_name=self._region) as client:
                    await client.upload_fileobj(
                        io.BytesIO(content),
                        self.bucket_name,
                        s3_key,
                        ExtraArgs={"ContentType": file.content_type},
                        Config=transfer_config,
                    )

            s3_uri = self._build_s3_uri(s3_key)
            logger.info(f"Uploaded to S3: {s3_uri}")

            return UploadResult(
                file_id=s3_key,
                file_uri=s3_uri,
                content_type=file.content_type,
                expires_at=None,
                provider=self.provider_name,
            )
        except ImportError:
            raise
        except Exception as e:
            raise _classify_s3_error(e, file.filename) from e

    async def adelete(self, file_id: str) -> bool:
        """Async delete an uploaded file from S3.

        Args:
            file_id: The S3 key to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            session = self._get_async_client()
            async with session.client("s3", region_name=self._region) as client:
                await client.delete_object(Bucket=self.bucket_name, Key=file_id)
            logger.info(f"Deleted S3 object: s3://{self.bucket_name}/{file_id}")
            return True
        except Exception as e:
            logger.warning(
                f"Failed to delete S3 object s3://{self.bucket_name}/{file_id}: {e}"
            )
            return False
|
||||
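
A minimal usage sketch for the S3-backed uploader above; the bucket name, region, and file path are illustrative placeholders, and the class is assumed importable from the new crewai-files package:

# Hypothetical example, not part of this diff.
from pathlib import Path

from crewai_files import FilePath, PDFFile

uploader = BedrockFileUploader(bucket_name="my-crewai-bucket", region="us-east-1")
result = uploader.upload(PDFFile(source=FilePath(path=Path("report.pdf"))))
print(result.file_uri)  # e.g. s3://my-crewai-bucket/crewai-files/<generated-key>
uploader.delete(result.file_id)
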
@@ -1,192 +0,0 @@
"""Factory for creating file uploaders."""

from __future__ import annotations

import logging
from typing import Literal, TypeAlias, TypedDict, overload

from typing_extensions import NotRequired, Unpack

from crewai.files.uploaders.anthropic import AnthropicFileUploader
from crewai.files.uploaders.bedrock import BedrockFileUploader
from crewai.files.uploaders.gemini import GeminiFileUploader
from crewai.files.uploaders.openai import OpenAIFileUploader


logger = logging.getLogger(__name__)


FileUploaderType: TypeAlias = (
    GeminiFileUploader
    | AnthropicFileUploader
    | BedrockFileUploader
    | OpenAIFileUploader
)

GeminiProviderType = Literal["gemini", "google"]
AnthropicProviderType = Literal["anthropic", "claude"]
OpenAIProviderType = Literal["openai", "gpt"]
BedrockProviderType = Literal["bedrock", "aws"]

ProviderType: TypeAlias = (
    GeminiProviderType
    | AnthropicProviderType
    | OpenAIProviderType
    | BedrockProviderType
)


class _BaseOpts(TypedDict):
    """Kwargs for uploader factory."""

    api_key: NotRequired[str | None]


class OpenAIOpts(_BaseOpts):
    """Kwargs for openai uploader factory."""

    chunk_size: NotRequired[int]


class GeminiOpts(_BaseOpts):
    """Kwargs for gemini uploader factory."""


class AnthropicOpts(_BaseOpts):
    """Kwargs for anthropic uploader factory."""


class BedrockOpts(TypedDict):
    """Kwargs for bedrock uploader factory."""

    bucket_name: NotRequired[str | None]
    bucket_owner: NotRequired[str | None]
    prefix: NotRequired[str]
    region: NotRequired[str | None]


class AllOptions(TypedDict):
    """Kwargs for uploader factory."""

    api_key: NotRequired[str | None]
    chunk_size: NotRequired[int]
    bucket_name: NotRequired[str | None]
    bucket_owner: NotRequired[str | None]
    prefix: NotRequired[str]
    region: NotRequired[str | None]


@overload
def get_uploader(
    provider: GeminiProviderType,
    **kwargs: Unpack[GeminiOpts],
) -> GeminiFileUploader:
    """Get Gemini file uploader."""


@overload
def get_uploader(
    provider: AnthropicProviderType,
    **kwargs: Unpack[AnthropicOpts],
) -> AnthropicFileUploader:
    """Get Anthropic file uploader."""


@overload
def get_uploader(
    provider: OpenAIProviderType,
    **kwargs: Unpack[OpenAIOpts],
) -> OpenAIFileUploader:
    """Get OpenAI file uploader."""


@overload
def get_uploader(
    provider: BedrockProviderType,
    **kwargs: Unpack[BedrockOpts],
) -> BedrockFileUploader:
    """Get Bedrock file uploader."""


@overload
def get_uploader(
    provider: ProviderType, **kwargs: Unpack[AllOptions]
) -> FileUploaderType:
    """Get any file uploader."""


def get_uploader(
    provider: ProviderType, **kwargs: Unpack[AllOptions]
) -> FileUploaderType:
    """Get a file uploader for a specific provider.

    Args:
        provider: Provider name (e.g., "gemini", "anthropic").
        **kwargs: Additional arguments passed to the uploader constructor.

    Returns:
        FileUploader instance for the provider.

    Raises:
        ValueError: If the provider is not supported or not configured.
    """
    provider_lower = provider.lower()

    if "gemini" in provider_lower or "google" in provider_lower:
        try:
            from crewai.files.uploaders.gemini import GeminiFileUploader

            return GeminiFileUploader(api_key=kwargs.get("api_key"))
        except ImportError:
            logger.warning(
                "google-genai not installed. Install with: pip install google-genai"
            )
            raise

    if "anthropic" in provider_lower or "claude" in provider_lower:
        try:
            from crewai.files.uploaders.anthropic import AnthropicFileUploader

            return AnthropicFileUploader(api_key=kwargs.get("api_key"))
        except ImportError:
            logger.warning(
                "anthropic not installed. Install with: pip install anthropic"
            )
            raise

    if "openai" in provider_lower or "gpt" in provider_lower:
        try:
            from crewai.files.uploaders.openai import OpenAIFileUploader

            return OpenAIFileUploader(
                api_key=kwargs.get("api_key"),
                chunk_size=kwargs.get("chunk_size", 67_108_864),
            )
        except ImportError:
            logger.warning("openai not installed. Install with: pip install openai")
            raise

    if "bedrock" in provider_lower or "aws" in provider_lower:
        import os

        if (
            not os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
            and "bucket_name" not in kwargs
        ):
            logger.debug(
                "Bedrock S3 uploader not configured. "
                "Set CREWAI_BEDROCK_S3_BUCKET environment variable to enable."
            )
            raise ValueError(
                "Bedrock S3 uploader not configured: set CREWAI_BEDROCK_S3_BUCKET "
                "or pass bucket_name."
            )
        try:
            from crewai.files.uploaders.bedrock import BedrockFileUploader

            return BedrockFileUploader(
                bucket_name=kwargs.get("bucket_name"),
                bucket_owner=kwargs.get("bucket_owner"),
                prefix=kwargs.get("prefix", "crewai-files"),
                region=kwargs.get("region"),
            )
        except ImportError:
            logger.warning("boto3 not installed. Install with: pip install boto3")
            raise

    logger.debug(f"No file uploader available for provider: {provider}")
    raise ValueError(f"No file uploader available for provider: {provider}")
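
For reference, a short sketch of how the overloads above resolve at the call site (bucket and region values are placeholders):

# Hypothetical example, not part of this diff.
gemini = get_uploader("gemini")  # typed as GeminiFileUploader
bedrock = get_uploader(
    "bedrock", bucket_name="my-bucket", region="us-east-1"
)  # typed as BedrockFileUploader
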
@@ -1,443 +0,0 @@
"""Gemini File API uploader implementation."""

from __future__ import annotations

import asyncio
from datetime import datetime, timezone
import io
import logging
import os
from pathlib import Path
import random
import time
from typing import Any

from crewai.files.constants import (
    BACKOFF_BASE_DELAY,
    BACKOFF_JITTER_FACTOR,
    BACKOFF_MAX_DELAY,
    GEMINI_FILE_TTL,
)
from crewai.files.content_types import FileInput
from crewai.files.file import FilePath
from crewai.files.processing.exceptions import (
    PermanentUploadError,
    TransientUploadError,
    classify_upload_error,
)
from crewai.files.uploaders.base import FileUploader, UploadResult


logger = logging.getLogger(__name__)


def _compute_backoff_delay(attempt: int) -> float:
    """Compute exponential backoff delay with jitter.

    Args:
        attempt: The current attempt number (0-indexed).

    Returns:
        Delay in seconds with jitter applied.
    """
    delay: float = min(BACKOFF_BASE_DELAY * (2**attempt), BACKOFF_MAX_DELAY)
    jitter: float = random.uniform(0, delay * BACKOFF_JITTER_FACTOR)  # noqa: S311
    return float(delay + jitter)
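
# Illustration (not in the original file): with assumed constants
# BACKOFF_BASE_DELAY=1.0, BACKOFF_MAX_DELAY=30.0, BACKOFF_JITTER_FACTOR=0.1
# (the real values live in crewai.files.constants), the schedule is:
#   attempt 0 -> min(1 * 2**0, 30) = 1s, plus up to 0.1s jitter
#   attempt 3 -> min(1 * 2**3, 30) = 8s, plus up to 0.8s jitter
#   attempt 6 -> min(1 * 2**6, 30) = 30s (capped), plus up to 3.0s jitter
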
def _classify_gemini_error(e: Exception, filename: str | None) -> Exception:
    """Classify a Gemini exception as a transient or permanent upload error.

    Checks Gemini-specific error message patterns first, then falls back
    to generic status code classification.

    Args:
        e: The exception to classify.
        filename: The filename for error context.

    Returns:
        A TransientUploadError or PermanentUploadError wrapping the original.
    """
    error_msg = str(e).lower()

    if "quota" in error_msg or "rate" in error_msg or "limit" in error_msg:
        return TransientUploadError(f"Rate limit error: {e}", file_name=filename)
    if "auth" in error_msg or "permission" in error_msg or "denied" in error_msg:
        return PermanentUploadError(
            f"Authentication/permission error: {e}", file_name=filename
        )
    if "invalid" in error_msg or "unsupported" in error_msg:
        return PermanentUploadError(f"Invalid request: {e}", file_name=filename)

    return classify_upload_error(e, filename)
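
# Illustration (not in the original file): invented error messages and how
# the pattern matching above classifies them:
#   _classify_gemini_error(RuntimeError("Quota exceeded"), "demo.mp4")
#       -> TransientUploadError ("quota" is retryable)
#   _classify_gemini_error(RuntimeError("Permission denied"), "demo.mp4")
#       -> PermanentUploadError ("denied" is not)
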
def _get_file_path(file: FileInput) -> Path | None:
    """Get the filesystem path if the file source is a FilePath.

    Args:
        file: The file input to check.

    Returns:
        Path if the source is a FilePath, None otherwise.
    """
    source = file._file_source
    if isinstance(source, FilePath):
        return source.path
    return None


class GeminiFileUploader(FileUploader):
    """Uploader for the Google Gemini File API.

    Uses the google-genai SDK to upload files. Files are stored for 48 hours.
    """

    def __init__(self, api_key: str | None = None) -> None:
        """Initialize the Gemini uploader.

        Args:
            api_key: Optional Google API key. If not provided, uses the
                GOOGLE_API_KEY environment variable.
        """
        self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
        self._client: Any = None

    @property
    def provider_name(self) -> str:
        """Return the provider name."""
        return "gemini"

    def _get_client(self) -> Any:
        """Get or create the Gemini client."""
        if self._client is None:
            try:
                from google import genai

                self._client = genai.Client(api_key=self._api_key)
            except ImportError as e:
                raise ImportError(
                    "google-genai is required for Gemini file uploads. "
                    "Install with: pip install google-genai"
                ) from e
        return self._client

    def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
        """Upload a file to Gemini.

        For FilePath sources, passes the path directly to the SDK, which handles
        streaming internally via resumable uploads, avoiding memory overhead.

        Args:
            file: The file to upload.
            purpose: Optional purpose/description (used as display name).

        Returns:
            UploadResult with the file URI and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            client = self._get_client()
            display_name = purpose or file.filename

            file_path = _get_file_path(file)
            if file_path is not None:
                file_size = file_path.stat().st_size
                logger.info(
                    f"Uploading file '{file.filename}' to Gemini via path "
                    f"({file_size} bytes, streaming)"
                )
                uploaded_file = client.files.upload(
                    file=file_path,
                    config={
                        "display_name": display_name,
                        "mime_type": file.content_type,
                    },
                )
            else:
                content = file.read()
                file_data = io.BytesIO(content)
                file_data.name = file.filename

                logger.info(
                    f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)"
                )

                uploaded_file = client.files.upload(
                    file=file_data,
                    config={
                        "display_name": display_name,
                        "mime_type": file.content_type,
                    },
                )

            if file.content_type.startswith("video/"):
                if not self.wait_for_processing(uploaded_file.name):
                    raise PermanentUploadError(
                        f"Video processing failed for {file.filename}",
                        file_name=file.filename,
                    )

            expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL

            logger.info(
                f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})"
            )

            return UploadResult(
                file_id=uploaded_file.name,
                file_uri=uploaded_file.uri,
                content_type=file.content_type,
                expires_at=expires_at,
                provider=self.provider_name,
            )
        except ImportError:
            raise
        except (TransientUploadError, PermanentUploadError):
            raise
        except Exception as e:
            raise _classify_gemini_error(e, file.filename) from e

    async def aupload(
        self, file: FileInput, purpose: str | None = None
    ) -> UploadResult:
        """Async upload a file to Gemini using the native async client.

        For FilePath sources, passes the path directly to the SDK, which handles
        streaming internally via resumable uploads, avoiding memory overhead.

        Args:
            file: The file to upload.
            purpose: Optional purpose/description (used as display name).

        Returns:
            UploadResult with the file URI and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            client = self._get_client()
            display_name = purpose or file.filename

            file_path = _get_file_path(file)
            if file_path is not None:
                file_size = file_path.stat().st_size
                logger.info(
                    f"Uploading file '{file.filename}' to Gemini via path "
                    f"({file_size} bytes, streaming)"
                )
                uploaded_file = await client.aio.files.upload(
                    file=file_path,
                    config={
                        "display_name": display_name,
                        "mime_type": file.content_type,
                    },
                )
            else:
                content = await file.aread()
                file_data = io.BytesIO(content)
                file_data.name = file.filename

                logger.info(
                    f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)"
                )

                uploaded_file = await client.aio.files.upload(
                    file=file_data,
                    config={
                        "display_name": display_name,
                        "mime_type": file.content_type,
                    },
                )

            if file.content_type.startswith("video/"):
                if not await self.await_for_processing(uploaded_file.name):
                    raise PermanentUploadError(
                        f"Video processing failed for {file.filename}",
                        file_name=file.filename,
                    )

            expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL

            logger.info(
                f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})"
            )

            return UploadResult(
                file_id=uploaded_file.name,
                file_uri=uploaded_file.uri,
                content_type=file.content_type,
                expires_at=expires_at,
                provider=self.provider_name,
            )
        except ImportError:
            raise
        except (TransientUploadError, PermanentUploadError):
            raise
        except Exception as e:
            raise _classify_gemini_error(e, file.filename) from e

    def delete(self, file_id: str) -> bool:
        """Delete an uploaded file from Gemini.

        Args:
            file_id: The file name/ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_client()
            client.files.delete(name=file_id)
            logger.info(f"Deleted Gemini file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete Gemini file {file_id}: {e}")
            return False

    async def adelete(self, file_id: str) -> bool:
        """Async delete an uploaded file from Gemini.

        Args:
            file_id: The file name/ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_client()
            await client.aio.files.delete(name=file_id)
            logger.info(f"Deleted Gemini file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete Gemini file {file_id}: {e}")
            return False

    def get_file_info(self, file_id: str) -> dict[str, Any] | None:
        """Get information about an uploaded file.

        Args:
            file_id: The file name/ID.

        Returns:
            Dictionary with file information, or None if not found.
        """
        try:
            client = self._get_client()
            file_info = client.files.get(name=file_id)
            return {
                "name": file_info.name,
                "uri": file_info.uri,
                "display_name": file_info.display_name,
                "mime_type": file_info.mime_type,
                "size_bytes": file_info.size_bytes,
                "state": str(file_info.state),
                "create_time": file_info.create_time,
                "expiration_time": file_info.expiration_time,
            }
        except Exception as e:
            logger.debug(f"Failed to get Gemini file info for {file_id}: {e}")
            return None

    def list_files(self) -> list[dict[str, Any]]:
        """List all uploaded files.

        Returns:
            List of dictionaries with file information.
        """
        try:
            client = self._get_client()
            files = client.files.list()
            return [
                {
                    "name": f.name,
                    "uri": f.uri,
                    "display_name": f.display_name,
                    "mime_type": f.mime_type,
                    "size_bytes": f.size_bytes,
                    "state": str(f.state),
                }
                for f in files
            ]
        except Exception as e:
            logger.warning(f"Failed to list Gemini files: {e}")
            return []

    def wait_for_processing(self, file_id: str, timeout_seconds: int = 300) -> bool:
        """Wait for a file to finish processing with exponential backoff.

        Some files (especially videos) need time to process after upload.

        Args:
            file_id: The file name/ID.
            timeout_seconds: Maximum time to wait.

        Returns:
            True if processing completed, False if timed out or failed.
        """
        try:
            from google.genai.types import FileState
        except ImportError:
            return True

        client = self._get_client()
        start_time = time.time()
        attempt = 0

        while time.time() - start_time < timeout_seconds:
            file_info = client.files.get(name=file_id)

            if file_info.state == FileState.ACTIVE:
                return True
            if file_info.state == FileState.FAILED:
                logger.error(f"Gemini file processing failed: {file_id}")
                return False

            time.sleep(_compute_backoff_delay(attempt))
            attempt += 1

        logger.warning(f"Timed out waiting for Gemini file processing: {file_id}")
        return False

    async def await_for_processing(
        self, file_id: str, timeout_seconds: int = 300
    ) -> bool:
        """Async wait for a file to finish processing with exponential backoff.

        Some files (especially videos) need time to process after upload.

        Args:
            file_id: The file name/ID.
            timeout_seconds: Maximum time to wait.

        Returns:
            True if processing completed, False if timed out or failed.
        """
        try:
            from google.genai.types import FileState
        except ImportError:
            return True

        client = self._get_client()
        start_time = time.time()
        attempt = 0

        while time.time() - start_time < timeout_seconds:
            file_info = await client.aio.files.get(name=file_id)

            if file_info.state == FileState.ACTIVE:
                return True
            if file_info.state == FileState.FAILED:
                logger.error(f"Gemini file processing failed: {file_id}")
                return False

            await asyncio.sleep(_compute_backoff_delay(attempt))
            attempt += 1

        logger.warning(f"Timed out waiting for Gemini file processing: {file_id}")
        return False
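
A hedged usage sketch for the Gemini uploader above (the file path is a placeholder; video uploads block on the processing wait shown above):

# Hypothetical example, not part of this diff.
from pathlib import Path

from crewai_files import FilePath, VideoFile

uploader = GeminiFileUploader()  # falls back to the GOOGLE_API_KEY env var
result = uploader.upload(VideoFile(source=FilePath(path=Path("demo.mp4"))))
print(result.file_uri, result.expires_at)  # URI plus the 48-hour expiry
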
@@ -1,669 +0,0 @@
"""OpenAI Files API uploader implementation."""

from __future__ import annotations

from collections.abc import AsyncIterator, Iterator
import io
import logging
import os
from typing import Any

from crewai.files.constants import DEFAULT_UPLOAD_CHUNK_SIZE, FILES_API_MAX_SIZE
from crewai.files.content_types import FileInput
from crewai.files.file import FileBytes, FilePath, FileStream
from crewai.files.processing.exceptions import (
    PermanentUploadError,
    TransientUploadError,
    classify_upload_error,
)
from crewai.files.uploaders.base import FileUploader, UploadResult


logger = logging.getLogger(__name__)


def _get_file_size(file: FileInput) -> int | None:
    """Get the file size without reading content, if possible.

    Args:
        file: The file to get the size for.

    Returns:
        File size in bytes, or None if the size cannot be determined without reading.
    """
    source = file._file_source
    if isinstance(source, FilePath):
        return source.path.stat().st_size
    if isinstance(source, FileBytes):
        return len(source.data)
    return None


def _iter_file_chunks(file: FileInput, chunk_size: int) -> Iterator[bytes]:
    """Iterate over file content in chunks.

    Args:
        file: The file to read.
        chunk_size: Size of each chunk in bytes.

    Yields:
        Chunks of file content.
    """
    source = file._file_source
    if isinstance(source, (FilePath, FileBytes, FileStream)):
        yield from source.read_chunks(chunk_size)
    else:
        content = file.read()
        for i in range(0, len(content), chunk_size):
            yield content[i : i + chunk_size]


async def _aiter_file_chunks(
    file: FileInput, chunk_size: int, content: bytes | None = None
) -> AsyncIterator[bytes]:
    """Async iterate over file content in chunks.

    Args:
        file: The file to read.
        chunk_size: Size of each chunk in bytes.
        content: Optional pre-loaded content to chunk.

    Yields:
        Chunks of file content.
    """
    if content is not None:
        for i in range(0, len(content), chunk_size):
            yield content[i : i + chunk_size]
        return

    source = file._file_source
    if isinstance(source, FilePath):
        async for chunk in source.aread_chunks(chunk_size):
            yield chunk
    elif isinstance(source, (FileBytes, FileStream)):
        for chunk in source.read_chunks(chunk_size):
            yield chunk
    else:
        data = await file.aread()
        for i in range(0, len(data), chunk_size):
            yield data[i : i + chunk_size]
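
# Illustration (not in the original file): the chunking arithmetic both
# iterators share. A 150 MiB payload with 64 MiB chunks yields three parts:
#   157286400 bytes -> 67108864 + 67108864 + 23068672.
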
class OpenAIFileUploader(FileUploader):
    """Uploader for the OpenAI Files and Uploads APIs.

    Uses the Files API for files up to 512MB (single request).
    Uses the Uploads API for files larger than 512MB (multipart chunked).
    """

    def __init__(
        self,
        api_key: str | None = None,
        chunk_size: int = DEFAULT_UPLOAD_CHUNK_SIZE,
    ) -> None:
        """Initialize the OpenAI uploader.

        Args:
            api_key: Optional OpenAI API key. If not provided, uses the
                OPENAI_API_KEY environment variable.
            chunk_size: Chunk size in bytes for multipart uploads (default 64MB).
        """
        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self._chunk_size = chunk_size
        self._client: Any = None
        self._async_client: Any = None

    @property
    def provider_name(self) -> str:
        """Return the provider name."""
        return "openai"

    def _build_upload_result(self, file_id: str, content_type: str) -> UploadResult:
        """Build an UploadResult for a completed upload.

        Args:
            file_id: The uploaded file ID.
            content_type: The file's content type.

        Returns:
            UploadResult with the file metadata.
        """
        return UploadResult(
            file_id=file_id,
            file_uri=None,
            content_type=content_type,
            expires_at=None,
            provider=self.provider_name,
        )

    def _get_client(self) -> Any:
        """Get or create the OpenAI client."""
        if self._client is None:
            try:
                from openai import OpenAI

                self._client = OpenAI(api_key=self._api_key)
            except ImportError as e:
                raise ImportError(
                    "openai is required for OpenAI file uploads. "
                    "Install with: pip install openai"
                ) from e
        return self._client

    def _get_async_client(self) -> Any:
        """Get or create the async OpenAI client."""
        if self._async_client is None:
            try:
                from openai import AsyncOpenAI

                self._async_client = AsyncOpenAI(api_key=self._api_key)
            except ImportError as e:
                raise ImportError(
                    "openai is required for OpenAI file uploads. "
                    "Install with: pip install openai"
                ) from e
        return self._async_client

    def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
        """Upload a file to OpenAI.

        Uses the Files API for files <= 512MB and the Uploads API for larger files.
        For large files, streams chunks to avoid loading the entire file in memory.

        Args:
            file: The file to upload.
            purpose: Optional purpose for the file (default: "user_data").

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            file_size = _get_file_size(file)

            if file_size is not None and file_size > FILES_API_MAX_SIZE:
                return self._upload_multipart_streaming(file, file_size, purpose)

            content = file.read()
            if len(content) > FILES_API_MAX_SIZE:
                return self._upload_multipart(file, content, purpose)
            return self._upload_simple(file, content, purpose)
        except ImportError:
            raise
        except (TransientUploadError, PermanentUploadError):
            raise
        except Exception as e:
            raise classify_upload_error(e, file.filename) from e

    def _upload_simple(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Files API (single request, up to 512MB).

        Args:
            file: The file to upload.
            content: File content bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_client()
        file_purpose = purpose or "user_data"

        file_data = io.BytesIO(content)
        file_data.name = file.filename or "file"

        logger.info(
            f"Uploading file '{file.filename}' to OpenAI Files API ({len(content)} bytes)"
        )

        uploaded_file = client.files.create(
            file=file_data,
            purpose=file_purpose,
        )

        logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")

        return self._build_upload_result(uploaded_file.id, file.content_type)

    def _upload_multipart(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Uploads API with content already in memory.

        Args:
            file: The file to upload.
            content: File content bytes (already loaded).
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_client()
        file_purpose = purpose or "user_data"
        filename = file.filename or "file"
        file_size = len(content)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )

        upload = client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        offset = 0
        part_num = 1

        try:
            while offset < file_size:
                chunk = content[offset : offset + self._chunk_size]
                chunk_io = io.BytesIO(chunk)

                logger.debug(
                    f"Uploading part {part_num} ({len(chunk)} bytes, offset {offset})"
                )

                part = client.uploads.parts.create(
                    upload_id=upload.id,
                    data=chunk_io,
                )
                part_ids.append(part.id)

                offset += self._chunk_size
                part_num += 1

            completed = client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )

            file_id = completed.file.id if completed.file else upload.id
            logger.info(f"Completed multipart upload to OpenAI: {file_id}")

            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            logger.warning(f"Multipart upload failed, cancelling upload {upload.id}")
            try:
                client.uploads.cancel(upload_id=upload.id)
            except Exception as cancel_err:
                logger.debug(f"Failed to cancel upload: {cancel_err}")
            raise
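
    # Illustration (not in the original file): part-count arithmetic for the
    # loop above, assuming the default 64 MiB chunk size. A 600 MiB buffer
    # splits into ceil(600 / 64) = 10 parts, and uploads.complete() stitches
    # them back together in part_ids order.
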
    def _upload_multipart_streaming(
        self,
        file: FileInput,
        file_size: int,
        purpose: str | None,
    ) -> UploadResult:
        """Upload using the Uploads API with streaming chunks.

        Streams chunks directly from the file source without loading
        the entire file into memory. Used for large files.

        Args:
            file: The file to upload.
            file_size: Total file size in bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_client()
        file_purpose = purpose or "user_data"
        filename = file.filename or "file"

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API (streaming) "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )

        upload = client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        part_num = 1

        try:
            for chunk in _iter_file_chunks(file, self._chunk_size):
                chunk_io = io.BytesIO(chunk)

                logger.debug(f"Uploading part {part_num} ({len(chunk)} bytes)")

                part = client.uploads.parts.create(
                    upload_id=upload.id,
                    data=chunk_io,
                )
                part_ids.append(part.id)
                part_num += 1

            completed = client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )

            file_id = completed.file.id if completed.file else upload.id
            logger.info(f"Completed streaming multipart upload to OpenAI: {file_id}")

            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            logger.warning(f"Multipart upload failed, cancelling upload {upload.id}")
            try:
                client.uploads.cancel(upload_id=upload.id)
            except Exception as cancel_err:
                logger.debug(f"Failed to cancel upload: {cancel_err}")
            raise

    def delete(self, file_id: str) -> bool:
        """Delete an uploaded file from OpenAI.

        Args:
            file_id: The file ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_client()
            client.files.delete(file_id)
            logger.info(f"Deleted OpenAI file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
            return False

    def get_file_info(self, file_id: str) -> dict[str, Any] | None:
        """Get information about an uploaded file.

        Args:
            file_id: The file ID.

        Returns:
            Dictionary with file information, or None if not found.
        """
        try:
            client = self._get_client()
            file_info = client.files.retrieve(file_id)
            return {
                "id": file_info.id,
                "filename": file_info.filename,
                "purpose": file_info.purpose,
                "bytes": file_info.bytes,
                "created_at": file_info.created_at,
                "status": file_info.status,
            }
        except Exception as e:
            logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}")
            return None

    def list_files(self) -> list[dict[str, Any]]:
        """List all uploaded files.

        Returns:
            List of dictionaries with file information.
        """
        try:
            client = self._get_client()
            files = client.files.list()
            return [
                {
                    "id": f.id,
                    "filename": f.filename,
                    "purpose": f.purpose,
                    "bytes": f.bytes,
                    "created_at": f.created_at,
                    "status": f.status,
                }
                for f in files.data
            ]
        except Exception as e:
            logger.warning(f"Failed to list OpenAI files: {e}")
            return []

    async def aupload(
        self, file: FileInput, purpose: str | None = None
    ) -> UploadResult:
        """Async upload a file to OpenAI using the native async client.

        Uses the Files API for files <= 512MB and the Uploads API for larger files.
        For large files, streams chunks to avoid loading the entire file in memory.

        Args:
            file: The file to upload.
            purpose: Optional purpose for the file (default: "user_data").

        Returns:
            UploadResult with the file ID and metadata.

        Raises:
            TransientUploadError: For retryable errors (network, rate limits).
            PermanentUploadError: For non-retryable errors (auth, validation).
        """
        try:
            file_size = _get_file_size(file)

            if file_size is not None and file_size > FILES_API_MAX_SIZE:
                return await self._aupload_multipart_streaming(file, file_size, purpose)

            content = await file.aread()
            if len(content) > FILES_API_MAX_SIZE:
                return await self._aupload_multipart(file, content, purpose)
            return await self._aupload_simple(file, content, purpose)
        except ImportError:
            raise
        except (TransientUploadError, PermanentUploadError):
            raise
        except Exception as e:
            raise classify_upload_error(e, file.filename) from e

    async def _aupload_simple(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Files API (single request, up to 512MB).

        Args:
            file: The file to upload.
            content: File content bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_async_client()
        file_purpose = purpose or "user_data"

        file_data = io.BytesIO(content)
        file_data.name = file.filename or "file"

        logger.info(
            f"Uploading file '{file.filename}' to OpenAI Files API ({len(content)} bytes)"
        )

        uploaded_file = await client.files.create(
            file=file_data,
            purpose=file_purpose,
        )

        logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")

        return self._build_upload_result(uploaded_file.id, file.content_type)

    async def _aupload_multipart(
        self,
        file: FileInput,
        content: bytes,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Uploads API (multipart chunked, up to 8GB).

        Args:
            file: The file to upload.
            content: File content bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_async_client()
        file_purpose = purpose or "user_data"
        filename = file.filename or "file"
        file_size = len(content)

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )

        upload = await client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        offset = 0
        part_num = 1

        try:
            while offset < file_size:
                chunk = content[offset : offset + self._chunk_size]
                chunk_io = io.BytesIO(chunk)

                logger.debug(
                    f"Uploading part {part_num} ({len(chunk)} bytes, offset {offset})"
                )

                part = await client.uploads.parts.create(
                    upload_id=upload.id,
                    data=chunk_io,
                )
                part_ids.append(part.id)

                offset += self._chunk_size
                part_num += 1

            completed = await client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )

            file_id = completed.file.id if completed.file else upload.id
            logger.info(f"Completed multipart upload to OpenAI: {file_id}")

            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            logger.warning(f"Multipart upload failed, cancelling upload {upload.id}")
            try:
                await client.uploads.cancel(upload_id=upload.id)
            except Exception as cancel_err:
                logger.debug(f"Failed to cancel upload: {cancel_err}")
            raise

    async def _aupload_multipart_streaming(
        self,
        file: FileInput,
        file_size: int,
        purpose: str | None,
    ) -> UploadResult:
        """Async upload using the Uploads API with streaming chunks.

        Streams chunks directly from the file source without loading
        the entire file into memory. Used for large files.

        Args:
            file: The file to upload.
            file_size: Total file size in bytes.
            purpose: Optional purpose for the file.

        Returns:
            UploadResult with the file ID and metadata.
        """
        client = self._get_async_client()
        file_purpose = purpose or "user_data"
        filename = file.filename or "file"

        logger.info(
            f"Uploading file '{filename}' to OpenAI Uploads API (streaming) "
            f"({file_size} bytes, {self._chunk_size} byte chunks)"
        )

        upload = await client.uploads.create(
            bytes=file_size,
            filename=filename,
            mime_type=file.content_type,
            purpose=file_purpose,
        )

        part_ids: list[str] = []
        part_num = 1

        try:
            async for chunk in _aiter_file_chunks(file, self._chunk_size):
                chunk_io = io.BytesIO(chunk)

                logger.debug(f"Uploading part {part_num} ({len(chunk)} bytes)")

                part = await client.uploads.parts.create(
                    upload_id=upload.id,
                    data=chunk_io,
                )
                part_ids.append(part.id)
                part_num += 1

            completed = await client.uploads.complete(
                upload_id=upload.id,
                part_ids=part_ids,
            )

            file_id = completed.file.id if completed.file else upload.id
            logger.info(f"Completed streaming multipart upload to OpenAI: {file_id}")

            return self._build_upload_result(file_id, file.content_type)
        except Exception:
            logger.warning(f"Multipart upload failed, cancelling upload {upload.id}")
            try:
                await client.uploads.cancel(upload_id=upload.id)
            except Exception as cancel_err:
                logger.debug(f"Failed to cancel upload: {cancel_err}")
            raise

    async def adelete(self, file_id: str) -> bool:
        """Async delete an uploaded file from OpenAI.

        Args:
            file_id: The file ID to delete.

        Returns:
            True if deletion was successful, False otherwise.
        """
        try:
            client = self._get_async_client()
            await client.files.delete(file_id)
            logger.info(f"Deleted OpenAI file: {file_id}")
            return True
        except Exception as e:
            logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
            return False
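
A hedged usage sketch for the OpenAI uploader (path and id are illustrative):

# Hypothetical example, not part of this diff. Small files take the
# single-request Files API path; anything over FILES_API_MAX_SIZE streams
# through the Uploads API automatically.
from pathlib import Path

from crewai_files import FilePath, PDFFile

uploader = OpenAIFileUploader()  # falls back to the OPENAI_API_KEY env var
result = uploader.upload(PDFFile(source=FilePath(path=Path("paper.pdf"))))
print(result.file_id)  # an OpenAI file id, e.g. "file-abc123"
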
@@ -1,95 +0,0 @@
"""Utility functions for file handling."""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

from typing_extensions import TypeIs


if TYPE_CHECKING:
    from crewai.files.content_types import FileInput
    from crewai.files.file import FileSource, FileSourceInput


def is_file_source(v: object) -> TypeIs[FileSource]:
    """Type guard to narrow input to FileSource."""
    from crewai.files.file import FileBytes, FilePath, FileStream, FileUrl

    return isinstance(v, (FilePath, FileBytes, FileStream, FileUrl))


def wrap_file_source(source: FileSource) -> FileInput:
    """Wrap a FileSource in the appropriate typed FileInput wrapper.

    Args:
        source: The file source to wrap.

    Returns:
        Typed FileInput wrapper based on content type.
    """
    from crewai.files.content_types import (
        AudioFile,
        ImageFile,
        PDFFile,
        TextFile,
        VideoFile,
    )

    content_type = source.content_type

    if content_type.startswith("image/"):
        return ImageFile(source=source)
    if content_type.startswith("audio/"):
        return AudioFile(source=source)
    if content_type.startswith("video/"):
        return VideoFile(source=source)
    if content_type == "application/pdf":
        return PDFFile(source=source)
    return TextFile(source=source)
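
# Illustration (not in the original file), assuming content types are
# sniffed from the source:
#   wrap_file_source(FilePath(path=Path("chart.png")))  -> ImageFile
#   wrap_file_source(FilePath(path=Path("memo.txt")))   -> TextFile (fallback)
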
def normalize_input_files(
    input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
    """Convert a list of file sources to a named dictionary of FileInputs.

    Args:
        input_files: List of file source inputs or File objects.

    Returns:
        Dictionary mapping names to FileInput wrappers.
    """
    from crewai.files.content_types import BaseFile
    from crewai.files.file import FileBytes, FilePath, FileStream, FileUrl

    result: dict[str, FileInput] = {}

    for i, item in enumerate(input_files):
        if isinstance(item, BaseFile):
            name = item.filename or f"file_{i}"
            if "." in name:
                name = name.rsplit(".", 1)[0]
            result[name] = item
            continue

        file_source: FilePath | FileBytes | FileStream | FileUrl
        if isinstance(item, (FilePath, FileBytes, FileStream, FileUrl)):
            file_source = item
        elif isinstance(item, Path):
            file_source = FilePath(path=item)
        elif isinstance(item, str):
            if item.startswith(("http://", "https://")):
                file_source = FileUrl(url=item)
            else:
                file_source = FilePath(path=Path(item))
        elif isinstance(item, (bytes, memoryview)):
            file_source = FileBytes(data=bytes(item))
        else:
            continue

        name = file_source.filename or f"file_{i}"
        result[name] = wrap_file_source(file_source)

    return result
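
A sketch of the normalization above with mixed inputs (the exact keys and sniffed types are assumptions):

files = normalize_input_files(["notes.txt", Path("diagram.png")])
# -> {"notes.txt": TextFile(...), "diagram.png": ImageFile(...)}
# Sources without a filename fall back to keys like "file_0", "file_1".
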
@@ -53,6 +53,7 @@ from crewai.utilities.logger_utils import suppress_warnings


if TYPE_CHECKING:
+    from crewai_files import FileInput, UploadCache
    from litellm.exceptions import ContextWindowExceededError
    from litellm.litellm_core_utils.get_supported_openai_params import (
        get_supported_openai_params,

@@ -66,7 +67,6 @@ if TYPE_CHECKING:
    from litellm.utils import supports_response_schema

    from crewai.agent.core import Agent
-    from crewai.files import FileInput, UploadCache
    from crewai.llms.hooks.base import BaseInterceptor
    from crewai.llms.providers.anthropic.completion import AnthropicThinkingConfig
    from crewai.task import Task

@@ -2274,7 +2274,7 @@ class LLM(BaseLLM):
        """
        import base64

-        from crewai.files import (
+        from crewai_files import (
            FileResolver,
            FileResolverConfig,
            InlineBase64,

@@ -32,8 +32,9 @@ from crewai.types.usage_metrics import UsageMetrics


if TYPE_CHECKING:
+    from crewai_files import FileInput, UploadCache
+
    from crewai.agent.core import Agent
-    from crewai.files import FileInput, UploadCache
    from crewai.task import Task
    from crewai.tools.base_tool import BaseTool
    from crewai.utilities.types import LLMMessage

@@ -20,7 +20,8 @@ from crewai.utilities.types import LLMMessage


if TYPE_CHECKING:
-    from crewai.files import FileInput, UploadCache
+    from crewai_files import FileInput, UploadCache
+
    from crewai.llms.hooks.base import BaseInterceptor

DEFAULT_CACHE_TTL = "ephemeral"

@@ -1281,7 +1282,7 @@ class AnthropicCompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

-        from crewai.files import (
+        from crewai_files import (
            FileReference,
            FileResolver,
            FileResolverConfig,

@@ -1394,7 +1395,7 @@ class AnthropicCompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

-        from crewai.files import (
+        from crewai_files import (
            FileReference,
            FileResolver,
            FileResolverConfig,

@@ -18,7 +18,8 @@ from crewai.utilities.types import LLMMessage


if TYPE_CHECKING:
-    from crewai.files import FileInput, UploadCache
+    from crewai_files import FileInput, UploadCache
+
    from crewai.llms.hooks.base import BaseInterceptor


@@ -1060,7 +1061,7 @@ class AzureCompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

-        from crewai.files import (
+        from crewai_files import (
            FileResolver,
            FileResolverConfig,
            InlineBase64,

@@ -1120,7 +1121,7 @@ class AzureCompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

-        from crewai.files import (
+        from crewai_files import (
            FileResolver,
            FileResolverConfig,
            InlineBase64,

@@ -20,6 +20,7 @@ from crewai.utilities.types import LLMMessage


if TYPE_CHECKING:
+    from crewai_files import FileInput, UploadCache
    from mypy_boto3_bedrock_runtime.type_defs import (
        GuardrailConfigurationTypeDef,
        GuardrailStreamConfigurationTypeDef,

@@ -32,7 +33,6 @@ if TYPE_CHECKING:
        ToolTypeDef,
    )

-    from crewai.files import FileInput, UploadCache
    from crewai.llms.hooks.base import BaseInterceptor


@@ -1586,7 +1586,7 @@ class BedrockCompletion(BaseLLM):

        import os

-        from crewai.files import (
+        from crewai_files import (
            FileReference,
            FileResolver,
            FileResolverConfig,

@@ -1714,7 +1714,7 @@ class BedrockCompletion(BaseLLM):

        import os

-        from crewai.files import (
+        from crewai_files import (
            FileReference,
            FileResolver,
            FileResolverConfig,

@@ -19,10 +19,11 @@ from crewai.utilities.types import LLMMessage


if TYPE_CHECKING:
-    from crewai.files import (
+    from crewai_files import (
        FileInput,
        UploadCache,
    )
+
    from crewai.llms.hooks.base import BaseInterceptor


@@ -1113,7 +1114,7 @@ class GeminiCompletion(BaseLLM):
        Returns:
            List of content blocks in Gemini's expected format.
        """
-        from crewai.files import (
+        from crewai_files import (
            FileReference,
            FileResolver,
            FileResolverConfig,

@@ -1183,7 +1184,7 @@ class GeminiCompletion(BaseLLM):
        Returns:
            List of content blocks in Gemini's expected format.
        """
-        from crewai.files import (
+        from crewai_files import (
            FileReference,
            FileResolver,
            FileResolverConfig,

@@ -27,8 +27,9 @@ from crewai.utilities.types import LLMMessage


if TYPE_CHECKING:
+    from crewai_files import FileInput, UploadCache
+
    from crewai.agent.core import Agent
-    from crewai.files import FileInput, UploadCache
    from crewai.llms.hooks.base import BaseInterceptor
    from crewai.task import Task
    from crewai.tools.base_tool import BaseTool

@@ -1100,7 +1101,7 @@ class OpenAICompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

-        from crewai.files import (
+        from crewai_files import (
            FileReference,
            FileResolver,
            FileResolverConfig,

@@ -1168,7 +1169,7 @@ class OpenAICompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

-        from crewai.files import (
+        from crewai_files import (
            FileReference,
            FileResolver,
            FileResolverConfig,

@@ -19,6 +19,12 @@ from typing import (
import uuid
import warnings

+from crewai_files import (
+    FileInput,
+    FilePath,
+    FileSourceInput,
+    normalize_input_files,
+)
from pydantic import (
    UUID4,
    BaseModel,
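
These imports back the new file-input handling on Task; a hedged sketch of the intended call pattern (the input_files parameter is inferred from this diff, not shown in it):

task = Task(
    description="Summarize the attached report",
    expected_output="A one-page summary",
    input_files=["report.pdf"],  # assumed; normalized via normalize_input_files
)
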
@@ -37,12 +43,6 @@ from crewai.events.types.task_events import (
    TaskFailedEvent,
    TaskStartedEvent,
)
-from crewai.files import (
-    FileInput,
-    FilePath,
-    FileSourceInput,
-    normalize_input_files,
-)
from crewai.security import Fingerprint, SecurityConfig
from crewai.tasks.output_format import OutputFormat
from crewai.tasks.task_output import TaskOutput

@@ -11,7 +11,7 @@ from crewai.tools.base_tool import BaseTool


if TYPE_CHECKING:
-    from crewai.files import FileInput
+    from crewai_files import FileInput


class ReadFileToolSchema(BaseModel):

@@ -13,7 +13,7 @@ from aiocache.serializers import PickleSerializer  # type: ignore[import-untyped


if TYPE_CHECKING:
-    from crewai.files import FileInput
+    from crewai_files import FileInput

_file_store = Cache(Cache.MEMORY, serializer=PickleSerializer())

@@ -1,25 +0,0 @@
"""Backwards compatibility re-exports from crewai.files.

Deprecated: Import from crewai.files instead.
"""

import sys
from typing import Any

from typing_extensions import deprecated

import crewai.files as _files


@deprecated("crewai.utilities.files is deprecated. Import from crewai.files instead.")
class _DeprecatedModule:
    """Deprecated module wrapper."""

    def __getattr__(self, name: str) -> Any:
        return getattr(_files, name)

    def __dir__(self) -> list[str]:
        return list(_files.__all__)


sys.modules[__name__] = _DeprecatedModule()  # type: ignore[assignment]
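
For context, the (now-removed) shim above worked by swapping the module object in sys.modules for an instance whose __getattr__ proxies to crewai.files, so old import paths kept resolving:

# Hypothetical example, not part of this diff.
from crewai.utilities.files import ImageFile  # served via _DeprecatedModule.__getattr__
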
@@ -1,258 +0,0 @@
"""Type stubs for backwards compatibility re-exports from crewai.files.

.. deprecated::
    Import from crewai.files instead.
"""

from collections.abc import Callable
from datetime import datetime
from pathlib import Path
from typing import Any, Literal

from typing_extensions import deprecated

import crewai.files as _files

FileMode = Literal["strict", "auto", "warn", "chunk"]
ImageExtension = _files.ImageExtension
ImageContentType = _files.ImageContentType
PDFExtension = _files.PDFExtension
PDFContentType = _files.PDFContentType
TextExtension = _files.TextExtension
TextContentType = _files.TextContentType
AudioExtension = _files.AudioExtension
AudioContentType = _files.AudioContentType
VideoExtension = _files.VideoExtension
VideoContentType = _files.VideoContentType
FileInput = _files.FileInput
FileSource = _files.FileSource
FileSourceInput = _files.FileSourceInput
RawFileInput = _files.RawFileInput
ResolvedFileType = _files.ResolvedFileType
FileHandling = _files.FileHandling

# Deprecated classes
@deprecated("Import from crewai.files instead")
class BaseFile(_files.BaseFile):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class ImageFile(_files.ImageFile):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class PDFFile(_files.PDFFile):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class TextFile(_files.TextFile):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class AudioFile(_files.AudioFile):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class VideoFile(_files.VideoFile):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class File(_files.File):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FilePath(_files.FilePath):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileBytes(_files.FileBytes):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileStream(_files.FileStream):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileResolver(_files.FileResolver):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileResolverConfig(_files.FileResolverConfig):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileProcessor(_files.FileProcessor):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileUploader(_files.FileUploader):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class UploadCache(_files.UploadCache):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class CachedUpload(_files.CachedUpload):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class UploadResult(_files.UploadResult):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class ResolvedFile(_files.ResolvedFile):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileReference(_files.FileReference):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class UrlReference(_files.UrlReference):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class InlineBase64(_files.InlineBase64):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class InlineBytes(_files.InlineBytes):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class ProviderConstraints(_files.ProviderConstraints):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class ImageConstraints(_files.ImageConstraints):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class AudioConstraints(_files.AudioConstraints):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class VideoConstraints(_files.VideoConstraints):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class PDFConstraints(_files.PDFConstraints):
    """.. deprecated:: Import from crewai.files instead."""
    ...

# Exceptions
@deprecated("Import from crewai.files instead")
class FileProcessingError(_files.FileProcessingError):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileValidationError(_files.FileValidationError):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class FileTooLargeError(_files.FileTooLargeError):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class UnsupportedFileTypeError(_files.UnsupportedFileTypeError):
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
class ProcessingDependencyError(_files.ProcessingDependencyError):
    """.. deprecated:: Import from crewai.files instead."""
    ...

# Constants
OPENAI_CONSTRAINTS: _files.ProviderConstraints
ANTHROPIC_CONSTRAINTS: _files.ProviderConstraints
GEMINI_CONSTRAINTS: _files.ProviderConstraints
BEDROCK_CONSTRAINTS: _files.ProviderConstraints

# Deprecated functions
@deprecated("Import from crewai.files instead")
def create_resolver(
    provider: str,
    config: FileResolverConfig | None = None,
) -> FileResolver:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def get_upload_cache() -> UploadCache:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def reset_upload_cache() -> None:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def get_constraints_for_provider(provider: str) -> ProviderConstraints:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def cleanup_uploaded_files(provider: str | None = None) -> int:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def cleanup_expired_files() -> int:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def cleanup_provider_files(provider: str) -> int:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def normalize_input_files(
    input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
    """.. deprecated:: Import from crewai.files instead."""
    ...

@deprecated("Import from crewai.files instead")
def wrap_file_source(source: FileSource) -> FileInput:
    """.. deprecated:: Import from crewai.files instead."""
    ...

__all__: list[str]
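
The stub above leans on typing_extensions.deprecated (PEP 702): static type checkers flag any use of the decorated symbol, and at runtime instantiating a decorated class also emits a DeprecationWarning. A small self-contained illustration with invented names:

import warnings

from typing_extensions import deprecated


@deprecated("OldThing is deprecated; use NewThing instead")
class OldThing:
    pass


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    OldThing()  # type checkers also flag this line statically

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
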
@@ -2,7 +2,7 @@

from typing import Any, Literal, TypedDict

from crewai.files import FileInput
from crewai_files import FileInput


class LLMMessage(TypedDict):
@@ -1 +0,0 @@
"""Tests for file processing utilities."""
@@ -1 +0,0 @@
"""Tests for file processing module."""
@@ -1,226 +0,0 @@
"""Tests for provider constraints."""

import pytest

from crewai.files.processing.constraints import (
    ANTHROPIC_CONSTRAINTS,
    BEDROCK_CONSTRAINTS,
    GEMINI_CONSTRAINTS,
    OPENAI_CONSTRAINTS,
    AudioConstraints,
    ImageConstraints,
    PDFConstraints,
    ProviderConstraints,
    VideoConstraints,
    get_constraints_for_provider,
)


class TestImageConstraints:
    """Tests for ImageConstraints dataclass."""

    def test_image_constraints_creation(self):
        """Test creating image constraints with all fields."""
        constraints = ImageConstraints(
            max_size_bytes=5 * 1024 * 1024,
            max_width=8000,
            max_height=8000,
            max_images_per_request=10,
        )

        assert constraints.max_size_bytes == 5 * 1024 * 1024
        assert constraints.max_width == 8000
        assert constraints.max_height == 8000
        assert constraints.max_images_per_request == 10

    def test_image_constraints_defaults(self):
        """Test image constraints with default values."""
        constraints = ImageConstraints(max_size_bytes=1000)

        assert constraints.max_size_bytes == 1000
        assert constraints.max_width is None
        assert constraints.max_height is None
        assert constraints.max_images_per_request is None
        assert "image/png" in constraints.supported_formats

    def test_image_constraints_frozen(self):
        """Test that image constraints are immutable."""
        constraints = ImageConstraints(max_size_bytes=1000)

        with pytest.raises(Exception):
            constraints.max_size_bytes = 2000


class TestPDFConstraints:
    """Tests for PDFConstraints dataclass."""

    def test_pdf_constraints_creation(self):
        """Test creating PDF constraints."""
        constraints = PDFConstraints(
            max_size_bytes=30 * 1024 * 1024,
            max_pages=100,
        )

        assert constraints.max_size_bytes == 30 * 1024 * 1024
        assert constraints.max_pages == 100

    def test_pdf_constraints_defaults(self):
        """Test PDF constraints with default values."""
        constraints = PDFConstraints(max_size_bytes=1000)

        assert constraints.max_size_bytes == 1000
        assert constraints.max_pages is None


class TestAudioConstraints:
    """Tests for AudioConstraints dataclass."""

    def test_audio_constraints_creation(self):
        """Test creating audio constraints."""
        constraints = AudioConstraints(
            max_size_bytes=100 * 1024 * 1024,
            max_duration_seconds=3600,
        )

        assert constraints.max_size_bytes == 100 * 1024 * 1024
        assert constraints.max_duration_seconds == 3600
        assert "audio/mp3" in constraints.supported_formats


class TestVideoConstraints:
    """Tests for VideoConstraints dataclass."""

    def test_video_constraints_creation(self):
        """Test creating video constraints."""
        constraints = VideoConstraints(
            max_size_bytes=2 * 1024 * 1024 * 1024,
            max_duration_seconds=7200,
        )

        assert constraints.max_size_bytes == 2 * 1024 * 1024 * 1024
        assert constraints.max_duration_seconds == 7200
        assert "video/mp4" in constraints.supported_formats


class TestProviderConstraints:
    """Tests for ProviderConstraints dataclass."""

    def test_provider_constraints_creation(self):
        """Test creating full provider constraints."""
        constraints = ProviderConstraints(
            name="test-provider",
            image=ImageConstraints(max_size_bytes=5 * 1024 * 1024),
            pdf=PDFConstraints(max_size_bytes=30 * 1024 * 1024),
            supports_file_upload=True,
            file_upload_threshold_bytes=10 * 1024 * 1024,
        )

        assert constraints.name == "test-provider"
        assert constraints.image is not None
        assert constraints.pdf is not None
        assert constraints.supports_file_upload is True

    def test_provider_constraints_defaults(self):
        """Test provider constraints with default values."""
        constraints = ProviderConstraints(name="test")

        assert constraints.name == "test"
        assert constraints.image is None
        assert constraints.pdf is None
        assert constraints.audio is None
        assert constraints.video is None
        assert constraints.supports_file_upload is False


class TestPredefinedConstraints:
    """Tests for predefined provider constraints."""

    def test_anthropic_constraints(self):
        """Test Anthropic constraints are properly defined."""
        assert ANTHROPIC_CONSTRAINTS.name == "anthropic"
        assert ANTHROPIC_CONSTRAINTS.image is not None
        assert ANTHROPIC_CONSTRAINTS.image.max_size_bytes == 5 * 1024 * 1024
        assert ANTHROPIC_CONSTRAINTS.image.max_width == 8000
        assert ANTHROPIC_CONSTRAINTS.pdf is not None
        assert ANTHROPIC_CONSTRAINTS.pdf.max_pages == 100
        assert ANTHROPIC_CONSTRAINTS.supports_file_upload is True

    def test_openai_constraints(self):
        """Test OpenAI constraints are properly defined."""
        assert OPENAI_CONSTRAINTS.name == "openai"
        assert OPENAI_CONSTRAINTS.image is not None
        assert OPENAI_CONSTRAINTS.image.max_size_bytes == 20 * 1024 * 1024
        assert OPENAI_CONSTRAINTS.pdf is None  # OpenAI doesn't support PDFs

    def test_gemini_constraints(self):
        """Test Gemini constraints are properly defined."""
        assert GEMINI_CONSTRAINTS.name == "gemini"
        assert GEMINI_CONSTRAINTS.image is not None
        assert GEMINI_CONSTRAINTS.pdf is not None
        assert GEMINI_CONSTRAINTS.audio is not None
        assert GEMINI_CONSTRAINTS.video is not None
        assert GEMINI_CONSTRAINTS.supports_file_upload is True

    def test_bedrock_constraints(self):
        """Test Bedrock constraints are properly defined."""
        assert BEDROCK_CONSTRAINTS.name == "bedrock"
        assert BEDROCK_CONSTRAINTS.image is not None
        assert BEDROCK_CONSTRAINTS.image.max_size_bytes == 4_608_000
        assert BEDROCK_CONSTRAINTS.pdf is not None
        assert BEDROCK_CONSTRAINTS.supports_file_upload is False


class TestGetConstraintsForProvider:
    """Tests for get_constraints_for_provider function."""

    def test_get_by_exact_name(self):
        """Test getting constraints by exact provider name."""
        result = get_constraints_for_provider("anthropic")
        assert result == ANTHROPIC_CONSTRAINTS

        result = get_constraints_for_provider("openai")
        assert result == OPENAI_CONSTRAINTS

        result = get_constraints_for_provider("gemini")
        assert result == GEMINI_CONSTRAINTS

    def test_get_by_alias(self):
        """Test getting constraints by alias name."""
        result = get_constraints_for_provider("claude")
        assert result == ANTHROPIC_CONSTRAINTS

        result = get_constraints_for_provider("gpt")
        assert result == OPENAI_CONSTRAINTS

        result = get_constraints_for_provider("google")
        assert result == GEMINI_CONSTRAINTS

    def test_get_case_insensitive(self):
        """Test case-insensitive lookup."""
        result = get_constraints_for_provider("ANTHROPIC")
        assert result == ANTHROPIC_CONSTRAINTS

        result = get_constraints_for_provider("OpenAI")
        assert result == OPENAI_CONSTRAINTS

    def test_get_with_provider_constraints_object(self):
        """Test passing ProviderConstraints object returns it unchanged."""
        custom = ProviderConstraints(name="custom")
        result = get_constraints_for_provider(custom)
        assert result is custom

    def test_get_unknown_provider(self):
        """Test unknown provider returns None."""
        result = get_constraints_for_provider("unknown-provider")
        assert result is None

    def test_get_by_partial_match(self):
        """Test partial match in provider string."""
        result = get_constraints_for_provider("claude-3-sonnet")
        assert result == ANTHROPIC_CONSTRAINTS

        result = get_constraints_for_provider("gpt-4o")
        assert result == OPENAI_CONSTRAINTS

        result = get_constraints_for_provider("gemini-pro")
        assert result == GEMINI_CONSTRAINTS
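
The lookup behavior these tests pin down (exact names, aliases, case-insensitivity, substring matches against full model strings, passthrough of already-resolved constraints, and None for unknown providers) can be implemented in a few lines. A hypothetical sketch under those assumptions, not the package's actual code:

from dataclasses import dataclass


@dataclass(frozen=True)
class Constraints:
    name: str


_KNOWN = {name: Constraints(name) for name in ("anthropic", "openai", "gemini")}
_ALIASES = {"claude": "anthropic", "gpt": "openai", "google": "gemini"}


def lookup(provider: str | Constraints) -> Constraints | None:
    if isinstance(provider, Constraints):
        return provider  # already resolved: pass through unchanged
    key = provider.lower()  # case-insensitive matching
    if key in _KNOWN:
        return _KNOWN[key]
    if key in _ALIASES:
        return _KNOWN[_ALIASES[key]]
    # Substring match handles full model strings like "claude-3-sonnet".
    for needle, canonical in list(_ALIASES.items()) + [(n, n) for n in _KNOWN]:
        if needle in key:
            return _KNOWN[canonical]
    return None  # unknown provider


assert lookup("claude-3-sonnet") is _KNOWN["anthropic"]
assert lookup("unknown-provider") is None
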
@@ -1,220 +0,0 @@
"""Tests for FileProcessor class."""

import pytest

from crewai.files import FileBytes, ImageFile, PDFFile, TextFile
from crewai.files.processing.constraints import (
    ANTHROPIC_CONSTRAINTS,
    ImageConstraints,
    PDFConstraints,
    ProviderConstraints,
)
from crewai.files.processing.enums import FileHandling
from crewai.files.processing.exceptions import (
    FileTooLargeError,
    FileValidationError,
)
from crewai.files.processing.processor import FileProcessor


# Minimal valid PNG: 8x8 pixel RGB image (valid for PIL)
MINIMAL_PNG = bytes([
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d,
    0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08,
    0x08, 0x02, 0x00, 0x00, 0x00, 0x4b, 0x6d, 0x29, 0xdc, 0x00, 0x00, 0x00,
    0x12, 0x49, 0x44, 0x41, 0x54, 0x78, 0x9c, 0x63, 0xfc, 0xcf, 0x80, 0x1d,
    0x30, 0xe1, 0x10, 0x1f, 0xa4, 0x12, 0x00, 0xcd, 0x41, 0x01, 0x0f, 0xe8,
    0x41, 0xe2, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44, 0xae,
    0x42, 0x60, 0x82,
])

# Minimal valid PDF
MINIMAL_PDF = (
    b"%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj "
    b"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj "
    b"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj "
    b"xref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n"
    b"0000000052 00000 n \n0000000101 00000 n \n"
    b"trailer<</Size 4/Root 1 0 R>>\nstartxref\n178\n%%EOF"
)


class TestFileProcessorInit:
    """Tests for FileProcessor initialization."""

    def test_init_with_constraints(self):
        """Test initialization with ProviderConstraints."""
        processor = FileProcessor(constraints=ANTHROPIC_CONSTRAINTS)

        assert processor.constraints == ANTHROPIC_CONSTRAINTS

    def test_init_with_provider_string(self):
        """Test initialization with provider name string."""
        processor = FileProcessor(constraints="anthropic")

        assert processor.constraints == ANTHROPIC_CONSTRAINTS

    def test_init_with_unknown_provider(self):
        """Test initialization with unknown provider sets constraints to None."""
        processor = FileProcessor(constraints="unknown")

        assert processor.constraints is None

    def test_init_with_none_constraints(self):
        """Test initialization with None constraints."""
        processor = FileProcessor(constraints=None)

        assert processor.constraints is None


class TestFileProcessorValidate:
    """Tests for FileProcessor.validate method."""

    def test_validate_valid_file(self):
        """Test validating a valid file returns no errors."""
        processor = FileProcessor(constraints=ANTHROPIC_CONSTRAINTS)
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        errors = processor.validate(file)

        assert len(errors) == 0

    def test_validate_without_constraints(self):
        """Test validating without constraints returns empty list."""
        processor = FileProcessor(constraints=None)
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        errors = processor.validate(file)

        assert len(errors) == 0

    def test_validate_strict_raises_on_error(self):
        """Test STRICT mode raises on validation error."""
        constraints = ProviderConstraints(
            name="test",
            image=ImageConstraints(max_size_bytes=10),
        )
        processor = FileProcessor(constraints=constraints)
        # Set mode to strict on the file
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"), mode="strict")

        with pytest.raises(FileTooLargeError):
            processor.validate(file)


class TestFileProcessorProcess:
    """Tests for FileProcessor.process method."""

    def test_process_valid_file(self):
        """Test processing a valid file returns it unchanged."""
        processor = FileProcessor(constraints=ANTHROPIC_CONSTRAINTS)
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        result = processor.process(file)

        assert result == file

    def test_process_without_constraints(self):
        """Test processing without constraints returns file unchanged."""
        processor = FileProcessor(constraints=None)
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        result = processor.process(file)

        assert result == file

    def test_process_strict_raises_on_error(self):
        """Test STRICT mode raises on processing error."""
        constraints = ProviderConstraints(
            name="test",
            image=ImageConstraints(max_size_bytes=10),
        )
        processor = FileProcessor(constraints=constraints)
        # Set mode to strict on the file
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"), mode="strict")

        with pytest.raises(FileTooLargeError):
            processor.process(file)

    def test_process_warn_returns_file(self):
        """Test WARN mode returns file with warning."""
        constraints = ProviderConstraints(
            name="test",
            image=ImageConstraints(max_size_bytes=10),
        )
        processor = FileProcessor(constraints=constraints)
        # Set mode to warn on the file
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"), mode="warn")

        result = processor.process(file)

        assert result == file


class TestFileProcessorProcessFiles:
    """Tests for FileProcessor.process_files method."""

    def test_process_files_multiple(self):
        """Test processing multiple files."""
        processor = FileProcessor(constraints=ANTHROPIC_CONSTRAINTS)
        files = {
            "image1": ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test1.png")),
            "image2": ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test2.png")),
        }

        result = processor.process_files(files)

        assert len(result) == 2
        assert "image1" in result
        assert "image2" in result

    def test_process_files_empty(self):
        """Test processing empty files dict."""
        processor = FileProcessor(constraints=ANTHROPIC_CONSTRAINTS)

        result = processor.process_files({})

        assert result == {}


class TestFileHandlingEnum:
    """Tests for FileHandling enum."""

    def test_enum_values(self):
        """Test all enum values are accessible."""
        assert FileHandling.STRICT.value == "strict"
        assert FileHandling.AUTO.value == "auto"
        assert FileHandling.WARN.value == "warn"
        assert FileHandling.CHUNK.value == "chunk"


class TestFileProcessorPerFileMode:
    """Tests for per-file mode handling."""

    def test_file_default_mode_is_auto(self):
        """Test that files default to auto mode."""
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))
        assert file.mode == "auto"

    def test_file_custom_mode(self):
        """Test setting custom mode on file."""
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"), mode="strict")
        assert file.mode == "strict"

    def test_processor_respects_file_mode(self):
        """Test processor uses each file's mode setting."""
        constraints = ProviderConstraints(
            name="test",
            image=ImageConstraints(max_size_bytes=10),
        )
        processor = FileProcessor(constraints=constraints)

        # File with strict mode should raise
        strict_file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"), mode="strict")
        with pytest.raises(FileTooLargeError):
            processor.process(strict_file)

        # File with warn mode should not raise
        warn_file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"), mode="warn")
        result = processor.process(warn_file)
        assert result == warn_file
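
The per-file mode behavior these tests assert (strict raises, warn logs and returns the file unchanged) amounts to a small dispatch on the file's own mode attribute. A hedged sketch of that shape, with invented names and duck-typed inputs rather than the package's actual implementation:

import logging

logger = logging.getLogger(__name__)


class FileValidationFailure(Exception):
    """Illustrative stand-in for the package's validation errors."""


def process_one(file, validate):
    """Validate file, honoring the file's own mode attribute."""
    errors = validate(file)  # -> list of human-readable error strings
    if not errors:
        return file
    if file.mode == "strict":
        # Strict mode: surface the first failure as an exception.
        raise FileValidationFailure(errors[0])
    if file.mode == "warn":
        logger.warning("File failed validation: %s", "; ".join(errors))
    return file  # warn (and, in this sketch, auto/chunk) returns the file unchanged
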
@@ -1,359 +0,0 @@
"""Unit tests for file transformers."""

import io
from unittest.mock import MagicMock, patch

import pytest

from crewai.files import ImageFile, PDFFile, TextFile
from crewai.files.file import FileBytes
from crewai.files.processing.exceptions import ProcessingDependencyError
from crewai.files.processing.transformers import (
    chunk_pdf,
    chunk_text,
    get_image_dimensions,
    get_pdf_page_count,
    optimize_image,
    resize_image,
)


def create_test_png(width: int = 100, height: int = 100) -> bytes:
    """Create a minimal valid PNG for testing."""
    from PIL import Image

    img = Image.new("RGB", (width, height), color="red")
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    return buffer.getvalue()


def create_test_pdf(num_pages: int = 1) -> bytes:
    """Create a minimal valid PDF for testing."""
    from pypdf import PdfWriter

    writer = PdfWriter()
    for _ in range(num_pages):
        writer.add_blank_page(width=612, height=792)

    buffer = io.BytesIO()
    writer.write(buffer)
    return buffer.getvalue()


class TestResizeImage:
    """Tests for resize_image function."""

    def test_resize_larger_image(self) -> None:
        """Test resizing an image larger than max dimensions."""
        png_bytes = create_test_png(200, 150)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))

        result = resize_image(img, max_width=100, max_height=100)

        dims = get_image_dimensions(result)
        assert dims is not None
        width, height = dims
        assert width <= 100
        assert height <= 100

    def test_no_resize_if_within_bounds(self) -> None:
        """Test that small images are returned unchanged."""
        png_bytes = create_test_png(50, 50)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="small.png"))

        result = resize_image(img, max_width=100, max_height=100)

        assert result is img

    def test_preserve_aspect_ratio(self) -> None:
        """Test that aspect ratio is preserved during resize."""
        png_bytes = create_test_png(200, 100)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="wide.png"))

        result = resize_image(img, max_width=100, max_height=100)

        dims = get_image_dimensions(result)
        assert dims is not None
        width, height = dims
        assert width == 100
        assert height == 50

    def test_resize_without_aspect_ratio(self) -> None:
        """Test resizing without preserving aspect ratio."""
        png_bytes = create_test_png(200, 100)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="wide.png"))

        result = resize_image(
            img, max_width=50, max_height=50, preserve_aspect_ratio=False
        )

        dims = get_image_dimensions(result)
        assert dims is not None
        width, height = dims
        assert width == 50
        assert height == 50

    def test_resize_returns_image_file(self) -> None:
        """Test that resize returns an ImageFile instance."""
        png_bytes = create_test_png(200, 200)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))

        result = resize_image(img, max_width=100, max_height=100)

        assert isinstance(result, ImageFile)

    def test_raises_without_pillow(self) -> None:
        """Test that ProcessingDependencyError is raised without Pillow."""
        img = ImageFile(source=FileBytes(data=b"fake", filename="test.png"))

        with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
            with pytest.raises(ProcessingDependencyError) as exc_info:
                # Force reimport to trigger ImportError
                import importlib

                import crewai.files.processing.transformers as t

                importlib.reload(t)
                t.resize_image(img, 100, 100)

        assert "Pillow" in str(exc_info.value)
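
The aspect-ratio expectations above (200x100 bounded by 100x100 becomes 100x50) follow from scaling both dimensions by the single factor that fits the image inside the bounds. A minimal illustrative sketch of that arithmetic, not the package's resize code:

def fit_within(width: int, height: int, max_w: int, max_h: int) -> tuple[int, int]:
    """Return dimensions scaled to fit inside (max_w, max_h), preserving ratio."""
    if width <= max_w and height <= max_h:
        return width, height  # already within bounds; no resize needed
    scale = min(max_w / width, max_h / height)
    return max(1, round(width * scale)), max(1, round(height * scale))


assert fit_within(200, 100, 100, 100) == (100, 50)
assert fit_within(50, 50, 100, 100) == (50, 50)
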
class TestOptimizeImage:
    """Tests for optimize_image function."""

    def test_optimize_reduces_size(self) -> None:
        """Test that optimization reduces file size."""
        png_bytes = create_test_png(500, 500)
        original_size = len(png_bytes)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="large.png"))

        result = optimize_image(img, target_size_bytes=original_size // 2)

        result_size = len(result.read())
        assert result_size < original_size

    def test_no_optimize_if_under_target(self) -> None:
        """Test that small images are returned unchanged."""
        png_bytes = create_test_png(50, 50)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="small.png"))

        result = optimize_image(img, target_size_bytes=1024 * 1024)

        assert result is img

    def test_optimize_returns_image_file(self) -> None:
        """Test that optimize returns an ImageFile instance."""
        png_bytes = create_test_png(200, 200)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))

        result = optimize_image(img, target_size_bytes=100)

        assert isinstance(result, ImageFile)

    def test_optimize_respects_min_quality(self) -> None:
        """Test that optimization stops at minimum quality."""
        png_bytes = create_test_png(100, 100)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))

        # Request impossibly small size - should stop at min quality
        result = optimize_image(img, target_size_bytes=10, min_quality=50)

        assert isinstance(result, ImageFile)
        assert len(result.read()) > 10


class TestChunkPdf:
    """Tests for chunk_pdf function."""

    def test_chunk_splits_large_pdf(self) -> None:
        """Test that large PDFs are split into chunks."""
        pdf_bytes = create_test_pdf(num_pages=10)
        pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="large.pdf"))

        result = list(chunk_pdf(pdf, max_pages=3))

        assert len(result) == 4
        assert all(isinstance(chunk, PDFFile) for chunk in result)

    def test_no_chunk_if_within_limit(self) -> None:
        """Test that small PDFs are returned unchanged."""
        pdf_bytes = create_test_pdf(num_pages=3)
        pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="small.pdf"))

        result = list(chunk_pdf(pdf, max_pages=5))

        assert len(result) == 1
        assert result[0] is pdf

    def test_chunk_filenames(self) -> None:
        """Test that chunked files have indexed filenames."""
        pdf_bytes = create_test_pdf(num_pages=6)
        pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="document.pdf"))

        result = list(chunk_pdf(pdf, max_pages=2))

        assert result[0].filename == "document_chunk_0.pdf"
        assert result[1].filename == "document_chunk_1.pdf"
        assert result[2].filename == "document_chunk_2.pdf"

    def test_chunk_with_overlap(self) -> None:
        """Test chunking with overlapping pages."""
        pdf_bytes = create_test_pdf(num_pages=10)
        pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="doc.pdf"))

        result = list(chunk_pdf(pdf, max_pages=4, overlap_pages=1))

        # With overlap, we get more chunks
        assert len(result) >= 3

    def test_chunk_page_counts(self) -> None:
        """Test that each chunk has correct page count."""
        pdf_bytes = create_test_pdf(num_pages=7)
        pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="doc.pdf"))

        result = list(chunk_pdf(pdf, max_pages=3))

        page_counts = [get_pdf_page_count(chunk) for chunk in result]
        assert page_counts == [3, 3, 1]


class TestChunkText:
    """Tests for chunk_text function."""

    def test_chunk_splits_large_text(self) -> None:
        """Test that large text files are split into chunks."""
        content = "Hello world. " * 100
        text = TextFile(source=content.encode(), filename="large.txt")

        result = list(chunk_text(text, max_chars=200, overlap_chars=0))

        assert len(result) > 1
        assert all(isinstance(chunk, TextFile) for chunk in result)

    def test_no_chunk_if_within_limit(self) -> None:
        """Test that small text files are returned unchanged."""
        content = "Short text"
        text = TextFile(source=content.encode(), filename="small.txt")

        result = list(chunk_text(text, max_chars=1000, overlap_chars=0))

        assert len(result) == 1
        assert result[0] is text

    def test_chunk_filenames(self) -> None:
        """Test that chunked files have indexed filenames."""
        content = "A" * 500
        text = TextFile(source=FileBytes(data=content.encode(), filename="data.txt"))

        result = list(chunk_text(text, max_chars=200, overlap_chars=0))

        assert result[0].filename == "data_chunk_0.txt"
        assert result[1].filename == "data_chunk_1.txt"
        assert len(result) == 3

    def test_chunk_preserves_extension(self) -> None:
        """Test that file extension is preserved in chunks."""
        content = "A" * 500
        text = TextFile(source=FileBytes(data=content.encode(), filename="script.py"))

        result = list(chunk_text(text, max_chars=200, overlap_chars=0))

        assert all(chunk.filename.endswith(".py") for chunk in result)

    def test_chunk_prefers_newline_boundaries(self) -> None:
        """Test that chunking prefers to split at newlines."""
        content = "Line one\nLine two\nLine three\nLine four\nLine five"
        text = TextFile(source=content.encode(), filename="lines.txt")

        result = list(chunk_text(text, max_chars=25, overlap_chars=0, split_on_newlines=True))

        # Should split at newline boundaries
        for chunk in result:
            chunk_text_content = chunk.read().decode()
            # Chunks should end at newlines (except possibly the last)
            if chunk != result[-1]:
                assert chunk_text_content.endswith("\n") or len(chunk_text_content) <= 25

    def test_chunk_with_overlap(self) -> None:
        """Test chunking with overlapping characters."""
        content = "ABCDEFGHIJ" * 10
        text = TextFile(source=content.encode(), filename="data.txt")

        result = list(chunk_text(text, max_chars=30, overlap_chars=5))

        # With overlap, chunks should share some content
        assert len(result) >= 3

    def test_chunk_overlap_larger_than_max_chars(self) -> None:
        """Test that overlap > max_chars doesn't cause infinite loop."""
        content = "A" * 100
        text = TextFile(source=content.encode(), filename="data.txt")

        # overlap_chars > max_chars should still work (just with max overlap)
        result = list(chunk_text(text, max_chars=20, overlap_chars=50))

        assert len(result) > 1
        # Should still complete without hanging


class TestGetImageDimensions:
    """Tests for get_image_dimensions function."""

    def test_get_dimensions(self) -> None:
        """Test getting image dimensions."""
        png_bytes = create_test_png(150, 100)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))

        dims = get_image_dimensions(img)

        assert dims == (150, 100)

    def test_returns_none_for_invalid_image(self) -> None:
        """Test that None is returned for invalid image data."""
        img = ImageFile(source=FileBytes(data=b"not an image", filename="bad.png"))

        dims = get_image_dimensions(img)

        assert dims is None

    def test_returns_none_without_pillow(self) -> None:
        """Test that None is returned when Pillow is not installed."""
        png_bytes = create_test_png(100, 100)
        img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))

        with patch.dict("sys.modules", {"PIL": None}):
            # Can't easily test this without unloading module
            # Just verify the function handles the case gracefully
            pass


class TestGetPdfPageCount:
    """Tests for get_pdf_page_count function."""

    def test_get_page_count(self) -> None:
        """Test getting PDF page count."""
        pdf_bytes = create_test_pdf(num_pages=5)
        pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="test.pdf"))

        count = get_pdf_page_count(pdf)

        assert count == 5

    def test_single_page(self) -> None:
        """Test page count for single page PDF."""
        pdf_bytes = create_test_pdf(num_pages=1)
        pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="single.pdf"))

        count = get_pdf_page_count(pdf)

        assert count == 1

    def test_returns_none_for_invalid_pdf(self) -> None:
        """Test that None is returned for invalid PDF data."""
        pdf = PDFFile(source=FileBytes(data=b"not a pdf", filename="bad.pdf"))

        count = get_pdf_page_count(pdf)

        assert count is None
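
The overlap tests above, in particular the overlap_chars > max_chars case, pin down a loop-termination guard: each step must advance by at least one character regardless of the requested overlap. A minimal text-chunking sketch consistent with that behavior (illustrative only, operating on a plain string rather than a TextFile):

def chunk_string(content: str, max_chars: int, overlap_chars: int = 0) -> list[str]:
    """Split content into fixed-size windows with optional character overlap."""
    if len(content) <= max_chars:
        return [content]
    # Clamp overlap so each step advances by at least one character,
    # preventing an infinite loop when overlap_chars >= max_chars.
    step = max(1, max_chars - overlap_chars)
    chunks = []
    start = 0
    while start < len(content):
        chunks.append(content[start : start + max_chars])
        start += step
    return chunks


assert len(chunk_string("A" * 100, max_chars=20, overlap_chars=50)) > 1
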
@@ -1,575 +0,0 @@
"""Tests for file validators."""

from unittest.mock import patch

import pytest

from crewai.files import AudioFile, FileBytes, ImageFile, PDFFile, TextFile, VideoFile
from crewai.files.processing.constraints import (
    ANTHROPIC_CONSTRAINTS,
    AudioConstraints,
    ImageConstraints,
    PDFConstraints,
    ProviderConstraints,
    VideoConstraints,
)
from crewai.files.processing.exceptions import (
    FileTooLargeError,
    FileValidationError,
    UnsupportedFileTypeError,
)
from crewai.files.processing.validators import (
    _get_audio_duration,
    _get_video_duration,
    validate_audio,
    validate_file,
    validate_image,
    validate_pdf,
    validate_text,
    validate_video,
)


# Minimal valid PNG: 8x8 pixel RGB image (valid for PIL)
MINIMAL_PNG = bytes([
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d,
    0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08,
    0x08, 0x02, 0x00, 0x00, 0x00, 0x4b, 0x6d, 0x29, 0xdc, 0x00, 0x00, 0x00,
    0x12, 0x49, 0x44, 0x41, 0x54, 0x78, 0x9c, 0x63, 0xfc, 0xcf, 0x80, 0x1d,
    0x30, 0xe1, 0x10, 0x1f, 0xa4, 0x12, 0x00, 0xcd, 0x41, 0x01, 0x0f, 0xe8,
    0x41, 0xe2, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44, 0xae,
    0x42, 0x60, 0x82,
])

# Minimal valid PDF
MINIMAL_PDF = (
    b"%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj "
    b"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj "
    b"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj "
    b"xref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n"
    b"0000000052 00000 n \n0000000101 00000 n \n"
    b"trailer<</Size 4/Root 1 0 R>>\nstartxref\n178\n%%EOF"
)


class TestValidateImage:
    """Tests for validate_image function."""

    def test_validate_valid_image(self):
        """Test validating a valid image within constraints."""
        constraints = ImageConstraints(
            max_size_bytes=10 * 1024 * 1024,
            supported_formats=("image/png",),
        )
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        errors = validate_image(file, constraints, raise_on_error=False)

        assert len(errors) == 0

    def test_validate_image_too_large(self):
        """Test validating an image that exceeds size limit."""
        constraints = ImageConstraints(
            max_size_bytes=10,  # Very small limit
            supported_formats=("image/png",),
        )
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        with pytest.raises(FileTooLargeError) as exc_info:
            validate_image(file, constraints)

        assert "exceeds" in str(exc_info.value)
        assert exc_info.value.file_name == "test.png"

    def test_validate_image_unsupported_format(self):
        """Test validating an image with unsupported format."""
        constraints = ImageConstraints(
            max_size_bytes=10 * 1024 * 1024,
            supported_formats=("image/jpeg",),  # Only JPEG
        )
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        with pytest.raises(UnsupportedFileTypeError) as exc_info:
            validate_image(file, constraints)

        assert "not supported" in str(exc_info.value)

    def test_validate_image_no_raise(self):
        """Test validating with raise_on_error=False returns errors list."""
        constraints = ImageConstraints(
            max_size_bytes=10,
            supported_formats=("image/jpeg",),
        )
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        errors = validate_image(file, constraints, raise_on_error=False)

        assert len(errors) == 2  # Size error and format error


class TestValidatePDF:
    """Tests for validate_pdf function."""

    def test_validate_valid_pdf(self):
        """Test validating a valid PDF within constraints."""
        constraints = PDFConstraints(
            max_size_bytes=10 * 1024 * 1024,
        )
        file = PDFFile(source=FileBytes(data=MINIMAL_PDF, filename="test.pdf"))

        errors = validate_pdf(file, constraints, raise_on_error=False)

        assert len(errors) == 0

    def test_validate_pdf_too_large(self):
        """Test validating a PDF that exceeds size limit."""
        constraints = PDFConstraints(
            max_size_bytes=10,  # Very small limit
        )
        file = PDFFile(source=FileBytes(data=MINIMAL_PDF, filename="test.pdf"))

        with pytest.raises(FileTooLargeError) as exc_info:
            validate_pdf(file, constraints)

        assert "exceeds" in str(exc_info.value)


class TestValidateText:
    """Tests for validate_text function."""

    def test_validate_valid_text(self):
        """Test validating a valid text file."""
        constraints = ProviderConstraints(
            name="test",
            general_max_size_bytes=10 * 1024 * 1024,
        )
        file = TextFile(source=FileBytes(data=b"Hello, World!", filename="test.txt"))

        errors = validate_text(file, constraints, raise_on_error=False)

        assert len(errors) == 0

    def test_validate_text_too_large(self):
        """Test validating text that exceeds size limit."""
        constraints = ProviderConstraints(
            name="test",
            general_max_size_bytes=5,
        )
        file = TextFile(source=FileBytes(data=b"Hello, World!", filename="test.txt"))

        with pytest.raises(FileTooLargeError):
            validate_text(file, constraints)

    def test_validate_text_no_limit(self):
        """Test validating text with no size limit."""
        constraints = ProviderConstraints(name="test")
        file = TextFile(source=FileBytes(data=b"Hello, World!", filename="test.txt"))

        errors = validate_text(file, constraints, raise_on_error=False)

        assert len(errors) == 0


class TestValidateFile:
    """Tests for validate_file function."""

    def test_validate_file_dispatches_to_image(self):
        """Test validate_file dispatches to image validator."""
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        errors = validate_file(file, ANTHROPIC_CONSTRAINTS, raise_on_error=False)

        assert len(errors) == 0

    def test_validate_file_dispatches_to_pdf(self):
        """Test validate_file dispatches to PDF validator."""
        file = PDFFile(source=FileBytes(data=MINIMAL_PDF, filename="test.pdf"))

        errors = validate_file(file, ANTHROPIC_CONSTRAINTS, raise_on_error=False)

        assert len(errors) == 0

    def test_validate_file_unsupported_type(self):
        """Test validating a file type not supported by provider."""
        constraints = ProviderConstraints(
            name="test",
            image=None,  # No image support
        )
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        with pytest.raises(UnsupportedFileTypeError) as exc_info:
            validate_file(file, constraints)

        assert "does not support images" in str(exc_info.value)

    def test_validate_file_pdf_not_supported(self):
        """Test validating PDF when provider doesn't support it."""
        constraints = ProviderConstraints(
            name="test",
            pdf=None,  # No PDF support
        )
        file = PDFFile(source=FileBytes(data=MINIMAL_PDF, filename="test.pdf"))

        with pytest.raises(UnsupportedFileTypeError) as exc_info:
            validate_file(file, constraints)

        assert "does not support PDFs" in str(exc_info.value)
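
TestValidateFile pins down a dispatch pattern: validation is routed by media kind, and a kind is rejected outright when the provider defines no constraints for it. One plausible shape for that dispatcher, with invented names and simplified types rather than the package's code:

from dataclasses import dataclass


@dataclass
class Media:
    kind: str  # "image" | "pdf"
    size: int
    name: str


@dataclass
class Provider:
    name: str
    image_max: "int | None" = None  # None means "images unsupported"
    pdf_max: "int | None" = None    # None means "PDFs unsupported"


def dispatch_validate(file: Media, provider: Provider) -> list:
    """Route a file to the size check for its media kind."""
    limits = {"image": provider.image_max, "pdf": provider.pdf_max}
    if file.kind not in limits:
        return []  # other media kinds elided in this sketch
    limit = limits[file.kind]
    if limit is None:
        raise ValueError(f"{provider.name} does not support {file.kind}s")
    if file.size > limit:
        return [f"{file.name} exceeds {limit} bytes"]
    return []


assert dispatch_validate(Media("image", 10, "a.png"), Provider("p", image_max=100)) == []
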
# Minimal audio bytes for testing (not a valid audio file, used for mocked tests)
|
||||
MINIMAL_AUDIO = b"\x00" * 100
|
||||
|
||||
# Minimal video bytes for testing (not a valid video file, used for mocked tests)
|
||||
MINIMAL_VIDEO = b"\x00" * 100
|
||||
|
||||
# Fallback content type when python-magic cannot detect
|
||||
FALLBACK_CONTENT_TYPE = "application/octet-stream"
|
||||
|
||||
|
||||
class TestValidateAudio:
|
||||
"""Tests for validate_audio function and audio duration validation."""
|
||||
|
||||
def test_validate_valid_audio(self):
|
||||
"""Test validating a valid audio file within constraints."""
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
|
||||
def test_validate_audio_too_large(self):
|
||||
"""Test validating an audio file that exceeds size limit."""
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10, # Very small limit
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
with pytest.raises(FileTooLargeError) as exc_info:
|
||||
validate_audio(file, constraints)
|
||||
|
||||
assert "exceeds" in str(exc_info.value)
|
||||
assert exc_info.value.file_name == "test.mp3"
|
||||
|
||||
def test_validate_audio_unsupported_format(self):
|
||||
"""Test validating an audio file with unsupported format."""
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
supported_formats=("audio/wav",), # Only WAV
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
with pytest.raises(UnsupportedFileTypeError) as exc_info:
|
||||
validate_audio(file, constraints)
|
||||
|
||||
assert "not supported" in str(exc_info.value)
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_passes(self, mock_get_duration):
|
||||
"""Test validating audio when duration is under limit."""
|
||||
mock_get_duration.return_value = 30.0
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
mock_get_duration.assert_called_once()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_fails(self, mock_get_duration):
|
||||
"""Test validating audio when duration exceeds limit."""
|
||||
mock_get_duration.return_value = 120.5
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
with pytest.raises(FileValidationError) as exc_info:
|
||||
validate_audio(file, constraints)
|
||||
|
||||
assert "duration" in str(exc_info.value).lower()
|
||||
assert "120.5s" in str(exc_info.value)
|
||||
assert "60s" in str(exc_info.value)
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_no_raise(self, mock_get_duration):
|
||||
"""Test audio duration validation with raise_on_error=False."""
|
||||
mock_get_duration.return_value = 120.5
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 1
|
||||
assert "duration" in errors[0].lower()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_none_skips(self, mock_get_duration):
|
||||
"""Test that duration validation is skipped when max_duration_seconds is None."""
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=None,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
mock_get_duration.assert_not_called()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_detection_returns_none(self, mock_get_duration):
|
||||
"""Test that validation passes when duration detection returns None."""
|
||||
mock_get_duration.return_value = None
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
|
||||
|
||||
class TestValidateVideo:
|
||||
"""Tests for validate_video function and video duration validation."""
|
||||
|
||||
def test_validate_valid_video(self):
|
||||
"""Test validating a valid video file within constraints."""
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
errors = validate_video(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
|
||||
def test_validate_video_too_large(self):
|
||||
"""Test validating a video file that exceeds size limit."""
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10, # Very small limit
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
with pytest.raises(FileTooLargeError) as exc_info:
|
||||
validate_video(file, constraints)
|
||||
|
||||
assert "exceeds" in str(exc_info.value)
|
||||
assert exc_info.value.file_name == "test.mp4"
|
||||
|
||||
def test_validate_video_unsupported_format(self):
|
||||
"""Test validating a video file with unsupported format."""
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
supported_formats=("video/webm",), # Only WebM
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
with pytest.raises(UnsupportedFileTypeError) as exc_info:
|
||||
validate_video(file, constraints)
|
||||
|
||||
assert "not supported" in str(exc_info.value)
|
||||
|
||||
@patch("crewai.files.processing.validators._get_video_duration")
|
||||
def test_validate_video_duration_passes(self, mock_get_duration):
|
||||
"""Test validating video when duration is under limit."""
|
||||
        mock_get_duration.return_value = 30.0
        constraints = VideoConstraints(
            max_size_bytes=10 * 1024 * 1024,
            max_duration_seconds=60,
            supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
        )
        file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))

        errors = validate_video(file, constraints, raise_on_error=False)

        assert len(errors) == 0
        mock_get_duration.assert_called_once()

    @patch("crewai.files.processing.validators._get_video_duration")
    def test_validate_video_duration_fails(self, mock_get_duration):
        """Test validating video when duration exceeds limit."""
        mock_get_duration.return_value = 180.0
        constraints = VideoConstraints(
            max_size_bytes=10 * 1024 * 1024,
            max_duration_seconds=60,
            supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
        )
        file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))

        with pytest.raises(FileValidationError) as exc_info:
            validate_video(file, constraints)

        assert "duration" in str(exc_info.value).lower()
        assert "180.0s" in str(exc_info.value)
        assert "60s" in str(exc_info.value)

    @patch("crewai.files.processing.validators._get_video_duration")
    def test_validate_video_duration_no_raise(self, mock_get_duration):
        """Test video duration validation with raise_on_error=False."""
        mock_get_duration.return_value = 180.0
        constraints = VideoConstraints(
            max_size_bytes=10 * 1024 * 1024,
            max_duration_seconds=60,
            supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
        )
        file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))

        errors = validate_video(file, constraints, raise_on_error=False)

        assert len(errors) == 1
        assert "duration" in errors[0].lower()

    @patch("crewai.files.processing.validators._get_video_duration")
    def test_validate_video_duration_none_skips(self, mock_get_duration):
        """Test that duration validation is skipped when max_duration_seconds is None."""
        constraints = VideoConstraints(
            max_size_bytes=10 * 1024 * 1024,
            max_duration_seconds=None,
            supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
        )
        file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))

        errors = validate_video(file, constraints, raise_on_error=False)

        assert len(errors) == 0
        mock_get_duration.assert_not_called()

    @patch("crewai.files.processing.validators._get_video_duration")
    def test_validate_video_duration_detection_returns_none(self, mock_get_duration):
        """Test that validation passes when duration detection returns None."""
        mock_get_duration.return_value = None
        constraints = VideoConstraints(
            max_size_bytes=10 * 1024 * 1024,
            max_duration_seconds=60,
            supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
        )
        file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))

        errors = validate_video(file, constraints, raise_on_error=False)

        assert len(errors) == 0


class TestGetAudioDuration:
    """Tests for _get_audio_duration helper function."""

    def test_get_audio_duration_corrupt_file(self):
        """Test handling of corrupt audio data."""
        corrupt_data = b"not valid audio data at all"
        result = _get_audio_duration(corrupt_data)

        assert result is None


class TestGetVideoDuration:
    """Tests for _get_video_duration helper function."""

    def test_get_video_duration_corrupt_file(self):
        """Test handling of corrupt video data."""
        corrupt_data = b"not valid video data at all"
        result = _get_video_duration(corrupt_data)

        assert result is None
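
# Illustrative sketch (not part of the diff): the contract the two helper
# tests above pin down. _get_video_duration returns the clip duration in
# seconds, or None when the bytes cannot be parsed, so validators treat an
# unknown duration as passing rather than rejecting the file. The module
# path is taken from the @patch targets used in these tests; the local
# sample file is hypothetical.
from pathlib import Path

from crewai.files.processing.validators import _get_video_duration

video_bytes = Path("sample_video.mp4").read_bytes()
duration = _get_video_duration(video_bytes, "video/mp4")
if duration is None:
    print("duration unknown (corrupt or unsupported container)")
else:
    print(f"clip runs {duration:.1f}s")
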
class TestRealVideoFile:
    """Tests using real video fixture file."""

    @pytest.fixture
    def sample_video_path(self):
        """Path to sample video fixture."""
        from pathlib import Path

        path = Path(__file__).parent.parent.parent / "fixtures" / "sample_video.mp4"
        if not path.exists():
            pytest.skip("sample_video.mp4 fixture not found")
        return path

    @pytest.fixture
    def sample_video_content(self, sample_video_path):
        """Read sample video content."""
        return sample_video_path.read_bytes()

    def test_get_video_duration_real_file(self, sample_video_content):
        """Test duration detection with real video file."""
        try:
            import av  # noqa: F401
        except ImportError:
            pytest.skip("PyAV not installed")

        duration = _get_video_duration(sample_video_content, "video/mp4")

        assert duration is not None
        assert 4.5 <= duration <= 5.5  # ~5 seconds with tolerance

    def test_get_video_duration_real_file_no_format_hint(self, sample_video_content):
        """Test duration detection without format hint."""
        try:
            import av  # noqa: F401
        except ImportError:
            pytest.skip("PyAV not installed")

        duration = _get_video_duration(sample_video_content)

        assert duration is not None
        assert 4.5 <= duration <= 5.5

    def test_validate_video_real_file_passes(self, sample_video_path):
        """Test validating real video file within constraints."""
        try:
            import av  # noqa: F401
        except ImportError:
            pytest.skip("PyAV not installed")

        constraints = VideoConstraints(
            max_size_bytes=10 * 1024 * 1024,
            max_duration_seconds=60,
            supported_formats=("video/mp4",),
        )
        file = VideoFile(source=str(sample_video_path))

        errors = validate_video(file, constraints, raise_on_error=False)

        assert len(errors) == 0

    def test_validate_video_real_file_duration_exceeded(self, sample_video_path):
        """Test validating real video file that exceeds duration limit."""
        try:
            import av  # noqa: F401
        except ImportError:
            pytest.skip("PyAV not installed")

        constraints = VideoConstraints(
            max_size_bytes=10 * 1024 * 1024,
            max_duration_seconds=2,  # Video is ~5 seconds
            supported_formats=("video/mp4",),
        )
        file = VideoFile(source=str(sample_video_path))

        with pytest.raises(FileValidationError) as exc_info:
            validate_video(file, constraints)

        assert "duration" in str(exc_info.value).lower()
        assert "2s" in str(exc_info.value)
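
# Illustrative sketch (not part of the diff): driving validate_video the way
# the tests above do. The import locations for VideoConstraints and
# FileValidationError are assumptions; the call signatures mirror the tests.
from crewai.files import VideoFile
from crewai.files.processing.constraints import VideoConstraints  # assumed location
from crewai.files.processing.validators import validate_video

constraints = VideoConstraints(
    max_size_bytes=10 * 1024 * 1024,  # reject anything over 10 MiB
    max_duration_seconds=60,          # and anything longer than a minute
    supported_formats=("video/mp4",),
)
file = VideoFile(source="clips/intro.mp4")  # hypothetical local path

# Collect problems as strings instead of raising:
errors = validate_video(file, constraints, raise_on_error=False)
if errors:
    print("rejected:", errors)

# Or let validate_video raise FileValidationError on the first violation:
validate_video(file, constraints)
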
@@ -1,312 +0,0 @@
"""Tests for FileUrl source type and URL resolution."""

from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from crewai.files import FileBytes, FileUrl, ImageFile
from crewai.files.file import _normalize_source, FilePath
from crewai.files.resolved import InlineBase64, UrlReference
from crewai.files.resolver import FileResolver


class TestFileUrl:
    """Tests for FileUrl source type."""

    def test_create_file_url(self):
        """Test creating FileUrl with valid URL."""
        url = FileUrl(url="https://example.com/image.png")

        assert url.url == "https://example.com/image.png"
        assert url.filename is None

    def test_create_file_url_with_filename(self):
        """Test creating FileUrl with custom filename."""
        url = FileUrl(url="https://example.com/image.png", filename="custom.png")

        assert url.url == "https://example.com/image.png"
        assert url.filename == "custom.png"

    def test_invalid_url_scheme_raises(self):
        """Test that non-http(s) URLs raise ValueError."""
        with pytest.raises(ValueError, match="Invalid URL scheme"):
            FileUrl(url="ftp://example.com/file.txt")

    def test_invalid_url_scheme_file_raises(self):
        """Test that file:// URLs raise ValueError."""
        with pytest.raises(ValueError, match="Invalid URL scheme"):
            FileUrl(url="file:///path/to/file.txt")

    def test_http_url_valid(self):
        """Test that HTTP URLs are valid."""
        url = FileUrl(url="http://example.com/image.jpg")

        assert url.url == "http://example.com/image.jpg"

    def test_https_url_valid(self):
        """Test that HTTPS URLs are valid."""
        url = FileUrl(url="https://example.com/image.jpg")

        assert url.url == "https://example.com/image.jpg"

    def test_content_type_guessing_png(self):
        """Test content type guessing for PNG files."""
        url = FileUrl(url="https://example.com/image.png")

        assert url.content_type == "image/png"

    def test_content_type_guessing_jpeg(self):
        """Test content type guessing for JPEG files."""
        url = FileUrl(url="https://example.com/photo.jpg")

        assert url.content_type == "image/jpeg"

    def test_content_type_guessing_pdf(self):
        """Test content type guessing for PDF files."""
        url = FileUrl(url="https://example.com/document.pdf")

        assert url.content_type == "application/pdf"

    def test_content_type_guessing_with_query_params(self):
        """Test content type guessing with URL query parameters."""
        url = FileUrl(url="https://example.com/image.png?v=123&token=abc")

        assert url.content_type == "image/png"

    def test_content_type_fallback_unknown(self):
        """Test content type falls back to octet-stream for unknown extensions."""
        url = FileUrl(url="https://example.com/file.unknownext123")

        assert url.content_type == "application/octet-stream"

    def test_content_type_no_extension(self):
        """Test content type for URL without extension."""
        url = FileUrl(url="https://example.com/file")

        assert url.content_type == "application/octet-stream"

    def test_read_fetches_content(self):
        """Test that read() fetches content from URL."""
        url = FileUrl(url="https://example.com/image.png")
        mock_response = MagicMock()
        mock_response.content = b"fake image content"
        mock_response.headers = {"content-type": "image/png"}

        with patch("httpx.get", return_value=mock_response) as mock_get:
            content = url.read()

        mock_get.assert_called_once_with(
            "https://example.com/image.png", follow_redirects=True
        )
        assert content == b"fake image content"

    def test_read_caches_content(self):
        """Test that read() caches content."""
        url = FileUrl(url="https://example.com/image.png")
        mock_response = MagicMock()
        mock_response.content = b"fake content"
        mock_response.headers = {}

        with patch("httpx.get", return_value=mock_response) as mock_get:
            content1 = url.read()
            content2 = url.read()

        mock_get.assert_called_once()
        assert content1 == content2

    def test_read_updates_content_type_from_response(self):
        """Test that read() updates content type from response headers."""
        url = FileUrl(url="https://example.com/file")
        mock_response = MagicMock()
        mock_response.content = b"fake content"
        mock_response.headers = {"content-type": "image/webp; charset=utf-8"}

        with patch("httpx.get", return_value=mock_response):
            url.read()

        assert url.content_type == "image/webp"

    @pytest.mark.asyncio
    async def test_aread_fetches_content(self):
        """Test that aread() fetches content from URL asynchronously."""
        url = FileUrl(url="https://example.com/image.png")
        mock_response = MagicMock()
        mock_response.content = b"async fake content"
        mock_response.headers = {"content-type": "image/png"}
        mock_response.raise_for_status = MagicMock()

        mock_client = MagicMock()
        mock_client.get = AsyncMock(return_value=mock_response)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=None)

        with patch("httpx.AsyncClient", return_value=mock_client):
            content = await url.aread()

        assert content == b"async fake content"

    @pytest.mark.asyncio
    async def test_aread_caches_content(self):
        """Test that aread() caches content."""
        url = FileUrl(url="https://example.com/image.png")
        mock_response = MagicMock()
        mock_response.content = b"cached content"
        mock_response.headers = {}
        mock_response.raise_for_status = MagicMock()

        mock_client = MagicMock()
        mock_client.get = AsyncMock(return_value=mock_response)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=None)

        with patch("httpx.AsyncClient", return_value=mock_client):
            content1 = await url.aread()
            content2 = await url.aread()

        mock_client.get.assert_called_once()
        assert content1 == content2
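
# Illustrative sketch (not part of the diff): the FileUrl behaviour pinned
# down by the class above. The content type is guessed from the URL extension
# at construction time, the first read() performs an httpx GET with redirects
# enabled, the bytes are cached for subsequent reads, and a content-type
# response header overrides the initial guess. The URL is a placeholder.
from crewai.files import FileUrl

url = FileUrl(url="https://example.com/image.png")
assert url.content_type == "image/png"  # guessed from the ".png" extension

data = url.read()        # network fetch happens here
data_again = url.read()  # served from the cached bytes, no second request
assert data == data_again
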
class TestNormalizeSource:
    """Tests for _normalize_source with URL detection."""

    def test_normalize_url_string(self):
        """Test that URL strings are converted to FileUrl."""
        result = _normalize_source("https://example.com/image.png")

        assert isinstance(result, FileUrl)
        assert result.url == "https://example.com/image.png"

    def test_normalize_http_url_string(self):
        """Test that HTTP URL strings are converted to FileUrl."""
        result = _normalize_source("http://example.com/file.pdf")

        assert isinstance(result, FileUrl)
        assert result.url == "http://example.com/file.pdf"

    def test_normalize_file_path_string(self, tmp_path):
        """Test that file path strings are converted to FilePath."""
        test_file = tmp_path / "test.png"
        test_file.write_bytes(b"test content")

        result = _normalize_source(str(test_file))

        assert isinstance(result, FilePath)
    def test_normalize_url_is_not_file_path(self):
        """Test that URL strings are not treated as file paths."""
        result = _normalize_source("https://example.com/file.png")

        assert isinstance(result, FileUrl)
        assert not isinstance(result, FilePath)

    def test_normalize_file_url_passthrough(self):
        """Test that FileUrl instances pass through unchanged."""
        original = FileUrl(url="https://example.com/image.png")
        result = _normalize_source(original)

        assert result is original
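
# Illustrative sketch (not part of the diff): the dispatch rule
# _normalize_source applies, as pinned down above. Strings that parse as
# http(s) URLs become FileUrl, other strings are treated as local paths and
# become FilePath, and already-normalized source objects pass through.
from crewai.files import FileUrl
from crewai.files.file import FilePath, _normalize_source

assert isinstance(_normalize_source("https://example.com/a.png"), FileUrl)
assert isinstance(_normalize_source("/tmp/a.png"), FilePath)  # assumes the file exists

original = FileUrl(url="https://example.com/a.png")
assert _normalize_source(original) is original
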
class TestResolverUrlHandling:
    """Tests for FileResolver URL handling."""

    def test_resolve_url_source_for_supported_provider(self):
        """Test URL source resolves to UrlReference for supported providers."""
        resolver = FileResolver()
        file = ImageFile(source=FileUrl(url="https://example.com/image.png"))

        resolved = resolver.resolve(file, "anthropic")

        assert isinstance(resolved, UrlReference)
        assert resolved.url == "https://example.com/image.png"
        assert resolved.content_type == "image/png"

    def test_resolve_url_source_openai(self):
        """Test URL source resolves to UrlReference for OpenAI."""
        resolver = FileResolver()
        file = ImageFile(source=FileUrl(url="https://example.com/photo.jpg"))

        resolved = resolver.resolve(file, "openai")

        assert isinstance(resolved, UrlReference)
        assert resolved.url == "https://example.com/photo.jpg"

    def test_resolve_url_source_gemini(self):
        """Test URL source resolves to UrlReference for Gemini."""
        resolver = FileResolver()
        file = ImageFile(source=FileUrl(url="https://example.com/image.webp"))

        resolved = resolver.resolve(file, "gemini")

        assert isinstance(resolved, UrlReference)
        assert resolved.url == "https://example.com/image.webp"

    def test_resolve_url_source_azure(self):
        """Test URL source resolves to UrlReference for Azure."""
        resolver = FileResolver()
        file = ImageFile(source=FileUrl(url="https://example.com/image.gif"))

        resolved = resolver.resolve(file, "azure")

        assert isinstance(resolved, UrlReference)
        assert resolved.url == "https://example.com/image.gif"

    def test_resolve_url_source_bedrock_fetches_content(self):
        """Test URL source fetches content for Bedrock (unsupported URLs)."""
        resolver = FileResolver()
        file_url = FileUrl(url="https://example.com/image.png")
        file = ImageFile(source=file_url)

        mock_response = MagicMock()
        mock_response.content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 50
        mock_response.headers = {"content-type": "image/png"}

        with patch("httpx.get", return_value=mock_response):
            resolved = resolver.resolve(file, "bedrock")

        assert not isinstance(resolved, UrlReference)

    def test_resolve_bytes_source_still_works(self):
        """Test that bytes source still resolves normally."""
        resolver = FileResolver()
        minimal_png = (
            b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x08\x00\x00\x00\x08"
            b"\x01\x00\x00\x00\x00\xf9Y\xab\xcd\x00\x00\x00\nIDATx\x9cc`\x00\x00"
            b"\x00\x02\x00\x01\xe2!\xbc3\x00\x00\x00\x00IEND\xaeB`\x82"
        )
        file = ImageFile(source=FileBytes(data=minimal_png, filename="test.png"))

        resolved = resolver.resolve(file, "anthropic")

        assert isinstance(resolved, InlineBase64)

    @pytest.mark.asyncio
    async def test_aresolve_url_source(self):
        """Test async URL resolution for supported provider."""
        resolver = FileResolver()
        file = ImageFile(source=FileUrl(url="https://example.com/image.png"))

        resolved = await resolver.aresolve(file, "anthropic")

        assert isinstance(resolved, UrlReference)
        assert resolved.url == "https://example.com/image.png"
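
# Illustrative sketch (not part of the diff): the provider split the tests
# above encode. Providers whose APIs accept remote URLs directly (anthropic,
# openai, gemini, azure) get a UrlReference and no bytes are transferred;
# bedrock does not, so resolution fetches the content and inlines it instead.
# The URL is a placeholder.
from crewai.files import FileUrl, ImageFile
from crewai.files.resolved import UrlReference
from crewai.files.resolver import FileResolver

resolver = FileResolver()
file = ImageFile(source=FileUrl(url="https://example.com/image.png"))

resolved = resolver.resolve(file, "anthropic")
assert isinstance(resolved, UrlReference)      # URL passed through as-is

resolved = resolver.resolve(file, "bedrock")   # performs an HTTP fetch
assert not isinstance(resolved, UrlReference)  # content inlined instead
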
class TestImageFileWithUrl:
    """Tests for creating ImageFile with URL source."""

    def test_image_file_from_url_string(self):
        """Test creating ImageFile from URL string."""
        file = ImageFile(source="https://example.com/image.png")

        assert isinstance(file.source, FileUrl)
        assert file.source.url == "https://example.com/image.png"

    def test_image_file_from_file_url(self):
        """Test creating ImageFile from FileUrl instance."""
        url = FileUrl(url="https://example.com/photo.jpg")
        file = ImageFile(source=url)

        assert file.source is url
        assert file.content_type == "image/jpeg"
@@ -1,135 +0,0 @@
"""Tests for resolved file types."""

from datetime import datetime, timezone

import pytest

from crewai.files.resolved import (
    FileReference,
    InlineBase64,
    InlineBytes,
    ResolvedFile,
    UrlReference,
)


class TestInlineBase64:
    """Tests for InlineBase64 resolved type."""

    def test_create_inline_base64(self):
        """Test creating InlineBase64 instance."""
        resolved = InlineBase64(
            content_type="image/png",
            data="iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==",
        )

        assert resolved.content_type == "image/png"
        assert len(resolved.data) > 0

    def test_inline_base64_is_resolved_file(self):
        """Test InlineBase64 is a ResolvedFile."""
        resolved = InlineBase64(content_type="image/png", data="abc123")

        assert isinstance(resolved, ResolvedFile)

    def test_inline_base64_frozen(self):
        """Test InlineBase64 is immutable."""
        resolved = InlineBase64(content_type="image/png", data="abc123")

        with pytest.raises(Exception):
            resolved.data = "xyz789"


class TestInlineBytes:
    """Tests for InlineBytes resolved type."""

    def test_create_inline_bytes(self):
        """Test creating InlineBytes instance."""
        data = b"\x89PNG\r\n\x1a\n"
        resolved = InlineBytes(
            content_type="image/png",
            data=data,
        )

        assert resolved.content_type == "image/png"
        assert resolved.data == data

    def test_inline_bytes_is_resolved_file(self):
        """Test InlineBytes is a ResolvedFile."""
        resolved = InlineBytes(content_type="image/png", data=b"test")

        assert isinstance(resolved, ResolvedFile)


class TestFileReference:
    """Tests for FileReference resolved type."""

    def test_create_file_reference(self):
        """Test creating FileReference instance."""
        resolved = FileReference(
            content_type="image/png",
            file_id="file-abc123",
            provider="gemini",
        )

        assert resolved.content_type == "image/png"
        assert resolved.file_id == "file-abc123"
        assert resolved.provider == "gemini"
        assert resolved.expires_at is None
        assert resolved.file_uri is None

    def test_file_reference_with_expiry(self):
        """Test FileReference with expiry time."""
        expiry = datetime.now(timezone.utc)
        resolved = FileReference(
            content_type="application/pdf",
            file_id="file-xyz789",
            provider="gemini",
            expires_at=expiry,
        )

        assert resolved.expires_at == expiry

    def test_file_reference_with_uri(self):
        """Test FileReference with URI."""
        resolved = FileReference(
            content_type="video/mp4",
            file_id="file-video123",
            provider="gemini",
            file_uri="https://generativelanguage.googleapis.com/v1/files/file-video123",
        )

        assert resolved.file_uri is not None

    def test_file_reference_is_resolved_file(self):
        """Test FileReference is a ResolvedFile."""
        resolved = FileReference(
            content_type="image/png",
            file_id="file-123",
            provider="anthropic",
        )

        assert isinstance(resolved, ResolvedFile)


class TestUrlReference:
    """Tests for UrlReference resolved type."""

    def test_create_url_reference(self):
        """Test creating UrlReference instance."""
        resolved = UrlReference(
            content_type="image/png",
            url="https://storage.googleapis.com/bucket/image.png",
        )

        assert resolved.content_type == "image/png"
        assert resolved.url == "https://storage.googleapis.com/bucket/image.png"

    def test_url_reference_is_resolved_file(self):
        """Test UrlReference is a ResolvedFile."""
        resolved = UrlReference(
            content_type="image/jpeg",
            url="https://example.com/photo.jpg",
        )

        assert isinstance(resolved, ResolvedFile)
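
# Illustrative sketch (not part of the diff): how downstream formatting code
# might branch over the ResolvedFile variants exercised above. The four
# concrete types carry mutually exclusive payloads: base64 text, raw bytes,
# a provider-side file ID, or a plain URL. describe() is a hypothetical helper.
from crewai.files.resolved import (
    FileReference,
    InlineBase64,
    InlineBytes,
    ResolvedFile,
    UrlReference,
)


def describe(resolved: ResolvedFile) -> str:
    """Summarize a resolved file for logging."""
    if isinstance(resolved, InlineBase64):
        return f"inline base64 ({resolved.content_type}, {len(resolved.data)} chars)"
    if isinstance(resolved, InlineBytes):
        return f"inline bytes ({resolved.content_type}, {len(resolved.data)} bytes)"
    if isinstance(resolved, FileReference):
        return f"uploaded file {resolved.file_id} on {resolved.provider}"
    if isinstance(resolved, UrlReference):
        return f"remote URL {resolved.url}"
    raise TypeError(f"unexpected resolved type: {type(resolved)!r}")


print(describe(InlineBase64(content_type="image/png", data="abc123")))
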
@@ -1,174 +0,0 @@
"""Tests for FileResolver."""

import pytest

from crewai.files import FileBytes, ImageFile
from crewai.files.resolved import InlineBase64, InlineBytes
from crewai.files.resolver import (
    FileResolver,
    FileResolverConfig,
    create_resolver,
)
from crewai.files.upload_cache import UploadCache


# Minimal valid PNG
MINIMAL_PNG = (
    b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x08\x00\x00\x00\x08"
    b"\x01\x00\x00\x00\x00\xf9Y\xab\xcd\x00\x00\x00\nIDATx\x9cc`\x00\x00"
    b"\x00\x02\x00\x01\xe2!\xbc3\x00\x00\x00\x00IEND\xaeB`\x82"
)


class TestFileResolverConfig:
    """Tests for FileResolverConfig."""

    def test_default_config(self):
        """Test default configuration values."""
        config = FileResolverConfig()

        assert config.prefer_upload is False
        assert config.upload_threshold_bytes is None
        assert config.use_bytes_for_bedrock is True

    def test_custom_config(self):
        """Test custom configuration values."""
        config = FileResolverConfig(
            prefer_upload=True,
            upload_threshold_bytes=1024 * 1024,
            use_bytes_for_bedrock=False,
        )

        assert config.prefer_upload is True
        assert config.upload_threshold_bytes == 1024 * 1024
        assert config.use_bytes_for_bedrock is False


class TestFileResolver:
    """Tests for FileResolver class."""

    def test_resolve_inline_base64(self):
        """Test resolving file as inline base64."""
        resolver = FileResolver()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        resolved = resolver.resolve(file, "openai")

        assert isinstance(resolved, InlineBase64)
        assert resolved.content_type == "image/png"
        assert len(resolved.data) > 0

    def test_resolve_inline_bytes_for_bedrock(self):
        """Test resolving file as inline bytes for Bedrock."""
        config = FileResolverConfig(use_bytes_for_bedrock=True)
        resolver = FileResolver(config=config)
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        resolved = resolver.resolve(file, "bedrock")

        assert isinstance(resolved, InlineBytes)
        assert resolved.content_type == "image/png"
        assert resolved.data == MINIMAL_PNG

    def test_resolve_files_multiple(self):
        """Test resolving multiple files."""
        resolver = FileResolver()
        files = {
            "image1": ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test1.png")),
            "image2": ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test2.png")),
        }

        resolved = resolver.resolve_files(files, "openai")

        assert len(resolved) == 2
        assert "image1" in resolved
        assert "image2" in resolved
        assert all(isinstance(r, InlineBase64) for r in resolved.values())

    def test_resolve_with_cache(self):
        """Test resolver uses cache."""
        cache = UploadCache()
        resolver = FileResolver(upload_cache=cache)
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        # First resolution
        resolved1 = resolver.resolve(file, "openai")
        # Second resolution (should use same base64 encoding)
        resolved2 = resolver.resolve(file, "openai")

        assert isinstance(resolved1, InlineBase64)
        assert isinstance(resolved2, InlineBase64)
        # Data should be identical
        assert resolved1.data == resolved2.data

    def test_clear_cache(self):
        """Test clearing resolver cache."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        # Add something to cache manually
        cache.set(file=file, provider="gemini", file_id="test")

        resolver = FileResolver(upload_cache=cache)
        resolver.clear_cache()

        assert len(cache) == 0

    def test_get_cached_uploads(self):
        """Test getting cached uploads from resolver."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        cache.set(file=file, provider="gemini", file_id="test-1")
        cache.set(file=file, provider="anthropic", file_id="test-2")

        resolver = FileResolver(upload_cache=cache)

        gemini_uploads = resolver.get_cached_uploads("gemini")
        anthropic_uploads = resolver.get_cached_uploads("anthropic")

        assert len(gemini_uploads) == 1
        assert len(anthropic_uploads) == 1

    def test_get_cached_uploads_empty(self):
        """Test getting cached uploads when no cache."""
        resolver = FileResolver()  # No cache

        uploads = resolver.get_cached_uploads("gemini")

        assert uploads == []


class TestCreateResolver:
    """Tests for create_resolver factory function."""

    def test_create_default_resolver(self):
        """Test creating resolver with default settings."""
        resolver = create_resolver()

        assert resolver.config.prefer_upload is False
        assert resolver.upload_cache is not None

    def test_create_resolver_with_options(self):
        """Test creating resolver with custom options."""
        resolver = create_resolver(
            prefer_upload=True,
            upload_threshold_bytes=5 * 1024 * 1024,
            enable_cache=False,
        )

        assert resolver.config.prefer_upload is True
        assert resolver.config.upload_threshold_bytes == 5 * 1024 * 1024
        assert resolver.upload_cache is None

    def test_create_resolver_cache_enabled(self):
        """Test resolver has cache when enabled."""
        resolver = create_resolver(enable_cache=True)

        assert resolver.upload_cache is not None

    def test_create_resolver_cache_disabled(self):
        """Test resolver has no cache when disabled."""
        resolver = create_resolver(enable_cache=False)

        assert resolver.upload_cache is None
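
# Illustrative sketch (not part of the diff): the two ways of building a
# resolver covered above. The defaults inline content (base64 for most
# providers, raw bytes for bedrock) and keep an upload cache; the factory
# keywords flip those knobs.
from crewai.files.resolver import FileResolver, FileResolverConfig, create_resolver

# Explicit config object:
resolver = FileResolver(config=FileResolverConfig(use_bytes_for_bedrock=True))

# Or via the factory: prefer provider-side uploads for anything over 5 MiB
# and skip the upload cache entirely.
resolver = create_resolver(
    prefer_upload=True,
    upload_threshold_bytes=5 * 1024 * 1024,
    enable_cache=False,
)
assert resolver.upload_cache is None
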
@@ -1,206 +0,0 @@
"""Tests for upload cache."""

from datetime import datetime, timedelta, timezone

import pytest

from crewai.files import FileBytes, ImageFile
from crewai.files.upload_cache import CachedUpload, UploadCache


# Minimal valid PNG
MINIMAL_PNG = (
    b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x08\x00\x00\x00\x08"
    b"\x01\x00\x00\x00\x00\xf9Y\xab\xcd\x00\x00\x00\nIDATx\x9cc`\x00\x00"
    b"\x00\x02\x00\x01\xe2!\xbc3\x00\x00\x00\x00IEND\xaeB`\x82"
)


class TestCachedUpload:
    """Tests for CachedUpload dataclass."""

    def test_cached_upload_creation(self):
        """Test creating a cached upload."""
        now = datetime.now(timezone.utc)
        cached = CachedUpload(
            file_id="file-123",
            provider="gemini",
            file_uri="files/file-123",
            content_type="image/png",
            uploaded_at=now,
            expires_at=now + timedelta(hours=48),
        )

        assert cached.file_id == "file-123"
        assert cached.provider == "gemini"
        assert cached.file_uri == "files/file-123"
        assert cached.content_type == "image/png"

    def test_is_expired_false(self):
        """Test is_expired returns False for non-expired upload."""
        future = datetime.now(timezone.utc) + timedelta(hours=24)
        cached = CachedUpload(
            file_id="file-123",
            provider="gemini",
            file_uri=None,
            content_type="image/png",
            uploaded_at=datetime.now(timezone.utc),
            expires_at=future,
        )

        assert cached.is_expired() is False

    def test_is_expired_true(self):
        """Test is_expired returns True for expired upload."""
        past = datetime.now(timezone.utc) - timedelta(hours=1)
        cached = CachedUpload(
            file_id="file-123",
            provider="gemini",
            file_uri=None,
            content_type="image/png",
            uploaded_at=datetime.now(timezone.utc) - timedelta(hours=2),
            expires_at=past,
        )

        assert cached.is_expired() is True

    def test_is_expired_no_expiry(self):
        """Test is_expired returns False when no expiry set."""
        cached = CachedUpload(
            file_id="file-123",
            provider="anthropic",
            file_uri=None,
            content_type="image/png",
            uploaded_at=datetime.now(timezone.utc),
            expires_at=None,
        )

        assert cached.is_expired() is False


class TestUploadCache:
    """Tests for UploadCache class."""

    def test_cache_creation(self):
        """Test creating an empty cache."""
        cache = UploadCache()

        assert len(cache) == 0

    def test_set_and_get(self):
        """Test setting and getting cached uploads."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        cached = cache.set(
            file=file,
            provider="gemini",
            file_id="file-123",
            file_uri="files/file-123",
        )

        result = cache.get(file, "gemini")

        assert result is not None
        assert result.file_id == "file-123"
        assert result.provider == "gemini"

    def test_get_missing(self):
        """Test getting non-existent entry returns None."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        result = cache.get(file, "gemini")

        assert result is None

    def test_get_different_provider(self):
        """Test getting with different provider returns None."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        cache.set(file=file, provider="gemini", file_id="file-123")

        result = cache.get(file, "anthropic")  # Different provider

        assert result is None

    def test_remove(self):
        """Test removing cached entry."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        cache.set(file=file, provider="gemini", file_id="file-123")
        removed = cache.remove(file, "gemini")

        assert removed is True
        assert cache.get(file, "gemini") is None

    def test_remove_missing(self):
        """Test removing non-existent entry returns False."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        removed = cache.remove(file, "gemini")

        assert removed is False

    def test_remove_by_file_id(self):
        """Test removing by file ID."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        cache.set(file=file, provider="gemini", file_id="file-123")
        removed = cache.remove_by_file_id("file-123", "gemini")

        assert removed is True
        assert len(cache) == 0

    def test_clear_expired(self):
        """Test clearing expired entries."""
        cache = UploadCache()
        file1 = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test1.png"))
        file2 = ImageFile(source=FileBytes(data=MINIMAL_PNG + b"x", filename="test2.png"))

        # Add one expired and one valid entry
        past = datetime.now(timezone.utc) - timedelta(hours=1)
        future = datetime.now(timezone.utc) + timedelta(hours=24)

        cache.set(file=file1, provider="gemini", file_id="expired", expires_at=past)
        cache.set(file=file2, provider="gemini", file_id="valid", expires_at=future)

        removed = cache.clear_expired()

        assert removed == 1
        assert len(cache) == 1
        assert cache.get(file2, "gemini") is not None

    def test_clear(self):
        """Test clearing all entries."""
        cache = UploadCache()
        file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test.png"))

        cache.set(file=file, provider="gemini", file_id="file-123")
        cache.set(file=file, provider="anthropic", file_id="file-456")

        cleared = cache.clear()

        assert cleared == 2
        assert len(cache) == 0

    def test_get_all_for_provider(self):
        """Test getting all cached uploads for a provider."""
        cache = UploadCache()
        file1 = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="test1.png"))
        file2 = ImageFile(source=FileBytes(data=MINIMAL_PNG + b"x", filename="test2.png"))
        file3 = ImageFile(source=FileBytes(data=MINIMAL_PNG + b"xx", filename="test3.png"))

        cache.set(file=file1, provider="gemini", file_id="file-1")
        cache.set(file=file2, provider="gemini", file_id="file-2")
        cache.set(file=file3, provider="anthropic", file_id="file-3")

        gemini_uploads = cache.get_all_for_provider("gemini")
        anthropic_uploads = cache.get_all_for_provider("anthropic")

        assert len(gemini_uploads) == 2
        assert len(anthropic_uploads) == 1
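
# Illustrative sketch (not part of the diff): the UploadCache lifecycle the
# tests above cover. Entries are keyed by file content plus provider, may
# carry an expiry, and clear_expired() evicts only the stale ones.
from datetime import datetime, timedelta, timezone

from crewai.files import FileBytes, ImageFile
from crewai.files.upload_cache import UploadCache

MINIMAL_PNG = (
    b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x08\x00\x00\x00\x08"
    b"\x01\x00\x00\x00\x00\xf9Y\xab\xcd\x00\x00\x00\nIDATx\x9cc`\x00\x00"
    b"\x00\x02\x00\x01\xe2!\xbc3\x00\x00\x00\x00IEND\xaeB`\x82"
)

cache = UploadCache()
file = ImageFile(source=FileBytes(data=MINIMAL_PNG, filename="a.png"))

cache.set(
    file=file,
    provider="gemini",
    file_id="file-123",
    expires_at=datetime.now(timezone.utc) + timedelta(hours=48),
)

hit = cache.get(file, "gemini")      # CachedUpload with file_id "file-123"
miss = cache.get(file, "anthropic")  # None: the provider is part of the key

cache.clear_expired()                # drops only entries past their expiry
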