diff --git a/lib/crewai/pyproject.toml b/lib/crewai/pyproject.toml index 55c68472a..baa61ccd3 100644 --- a/lib/crewai/pyproject.toml +++ b/lib/crewai/pyproject.toml @@ -103,6 +103,7 @@ file-processing = [ "pypdf~=4.0.0", "python-magic>=0.4.27", "aiocache~=0.12.3", + "aiofiles~=24.1.0", ] diff --git a/lib/crewai/src/crewai/__init__.py b/lib/crewai/src/crewai/__init__.py index fc0fc7719..e3d53bc9f 100644 --- a/lib/crewai/src/crewai/__init__.py +++ b/lib/crewai/src/crewai/__init__.py @@ -6,6 +6,14 @@ import warnings from crewai.agent.core import Agent from crewai.crew import Crew from crewai.crews.crew_output import CrewOutput +from crewai.files import ( + AudioFile, + File, + ImageFile, + PDFFile, + TextFile, + VideoFile, +) from crewai.flow.flow import Flow from crewai.knowledge.knowledge import Knowledge from crewai.llm import LLM @@ -15,14 +23,6 @@ from crewai.task import Task from crewai.tasks.llm_guardrail import LLMGuardrail from crewai.tasks.task_output import TaskOutput from crewai.telemetry.telemetry import Telemetry -from crewai.utilities.files import ( - AudioFile, - File, - ImageFile, - PDFFile, - TextFile, - VideoFile, -) def _suppress_pydantic_deprecation_warnings() -> None: diff --git a/lib/crewai/src/crewai/agents/crew_agent_executor.py b/lib/crewai/src/crewai/agents/crew_agent_executor.py index f1eeef646..4ac9a96af 100644 --- a/lib/crewai/src/crewai/agents/crew_agent_executor.py +++ b/lib/crewai/src/crewai/agents/crew_agent_executor.py @@ -24,6 +24,7 @@ from crewai.events.types.logging_events import ( AgentLogsExecutionEvent, AgentLogsStartedEvent, ) +from crewai.files import FileProcessor from crewai.hooks.llm_hooks import ( get_after_llm_call_hooks, get_before_llm_call_hooks, @@ -44,7 +45,6 @@ from crewai.utilities.agent_utils import ( ) from crewai.utilities.constants import TRAINING_DATA_FILE from crewai.utilities.file_store import get_all_files -from crewai.utilities.files import FileProcessor from crewai.utilities.i18n import I18N, get_i18n from crewai.utilities.printer import Printer from crewai.utilities.tool_utils import ( @@ -238,7 +238,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin): processor = FileProcessor(constraints=provider) files = processor.process_files(files) - from crewai.utilities.files import get_upload_cache + from crewai.files import get_upload_cache upload_cache = get_upload_cache() content_blocks = self.llm.format_multimodal_content( @@ -258,6 +258,48 @@ class CrewAgentExecutor(CrewAgentExecutorMixin): ] break + async def _ainject_multimodal_files(self) -> None: + """Async inject files as multimodal content into messages. + + For crews with input files and LLMs that support multimodal, + processes files according to provider constraints using parallel processing, + then delegates to the LLM's aformat_multimodal_content method to + generate provider-specific content blocks with parallel file resolution. 
+ """ + if not self.crew or not self.task: + return + + if not self.llm.supports_multimodal(): + return + + files = get_all_files(self.crew.id, self.task.id) + if not files: + return + + provider = getattr(self.llm, "provider", None) or getattr(self.llm, "model", "") + processor = FileProcessor(constraints=provider) + files = await processor.aprocess_files(files) + + from crewai.files import get_upload_cache + + upload_cache = get_upload_cache() + content_blocks = await self.llm.aformat_multimodal_content( + files, upload_cache=upload_cache + ) + if not content_blocks: + return + + for i in range(len(self.messages) - 1, -1, -1): + msg = self.messages[i] + if msg.get("role") == "user": + existing_content = msg.get("content", "") + if isinstance(existing_content, str): + msg["content"] = [ + self.llm.format_text_content(existing_content), + *content_blocks, + ] + break + def _invoke_loop(self) -> AgentFinish: """Execute agent loop until completion. @@ -401,7 +443,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin): user_prompt = self._format_prompt(self.prompt.get("prompt", ""), inputs) self.messages.append(format_message_for_llm(user_prompt)) - self._inject_multimodal_files() + await self._ainject_multimodal_files() self._show_start_logs() diff --git a/lib/crewai/src/crewai/crews/utils.py b/lib/crewai/src/crewai/crews/utils.py index ee1df151b..f7f955886 100644 --- a/lib/crewai/src/crewai/crews/utils.py +++ b/lib/crewai/src/crewai/crews/utils.py @@ -8,16 +8,16 @@ from typing import TYPE_CHECKING, Any from crewai.agents.agent_builder.base_agent import BaseAgent from crewai.crews.crew_output import CrewOutput -from crewai.rag.embeddings.types import EmbedderConfig -from crewai.types.streaming import CrewStreamingOutput, FlowStreamingOutput -from crewai.utilities.file_store import store_files -from crewai.utilities.files import ( +from crewai.files import ( AudioFile, ImageFile, PDFFile, TextFile, VideoFile, ) +from crewai.rag.embeddings.types import EmbedderConfig +from crewai.types.streaming import CrewStreamingOutput, FlowStreamingOutput +from crewai.utilities.file_store import store_files from crewai.utilities.streaming import ( StreamingState, TaskInfo, diff --git a/lib/crewai/src/crewai/files/__init__.py b/lib/crewai/src/crewai/files/__init__.py new file mode 100644 index 000000000..6fef629c6 --- /dev/null +++ b/lib/crewai/src/crewai/files/__init__.py @@ -0,0 +1,207 @@ +"""File handling utilities for crewAI tasks.""" + +from crewai.files.cleanup import ( + cleanup_expired_files, + cleanup_provider_files, + cleanup_uploaded_files, +) +from crewai.files.content_types import ( + AudioContentType, + AudioExtension, + AudioFile, + BaseFile, + File, + FileMode, + ImageContentType, + ImageExtension, + ImageFile, + PDFContentType, + PDFExtension, + PDFFile, + TextContentType, + TextExtension, + TextFile, + VideoContentType, + VideoExtension, + VideoFile, +) +from crewai.files.file import ( + FileBytes, + FilePath, + FileSource, + FileSourceInput, + FileStream, + RawFileInput, +) +from crewai.files.processing import ( + ANTHROPIC_CONSTRAINTS, + BEDROCK_CONSTRAINTS, + GEMINI_CONSTRAINTS, + OPENAI_CONSTRAINTS, + AudioConstraints, + FileHandling, + FileProcessingError, + FileProcessor, + FileTooLargeError, + FileValidationError, + ImageConstraints, + PDFConstraints, + ProcessingDependencyError, + ProviderConstraints, + UnsupportedFileTypeError, + VideoConstraints, + get_constraints_for_provider, +) +from crewai.files.resolved import ( + FileReference, + InlineBase64, + InlineBytes, + ResolvedFile, + 
ResolvedFileType, + UrlReference, +) +from crewai.files.resolver import ( + FileResolver, + FileResolverConfig, + create_resolver, +) +from crewai.files.upload_cache import ( + CachedUpload, + UploadCache, + get_upload_cache, + reset_upload_cache, +) +from crewai.files.uploaders import FileUploader, UploadResult, get_uploader + + +FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile + + +def wrap_file_source(source: FileSource) -> FileInput: + """Wrap a FileSource in the appropriate typed FileInput wrapper. + + Args: + source: The file source to wrap. + + Returns: + Typed FileInput wrapper based on content type. + """ + content_type = source.content_type + + if content_type.startswith("image/"): + return ImageFile(source=source) + if content_type.startswith("audio/"): + return AudioFile(source=source) + if content_type.startswith("video/"): + return VideoFile(source=source) + if content_type == "application/pdf": + return PDFFile(source=source) + return TextFile(source=source) + + +def normalize_input_files( + input_files: list[FileSourceInput | FileInput], +) -> dict[str, FileInput]: + """Convert a list of file sources to a named dictionary of FileInputs. + + Args: + input_files: List of file source inputs or File objects. + + Returns: + Dictionary mapping names to FileInput wrappers. + """ + from pathlib import Path + + result: dict[str, FileInput] = {} + + for i, item in enumerate(input_files): + if isinstance(item, BaseFile): + name = item.filename or f"file_{i}" + if "." in name: + name = name.rsplit(".", 1)[0] + result[name] = item + continue + + file_source: FilePath | FileBytes | FileStream + if isinstance(item, (FilePath, FileBytes, FileStream)): + file_source = item + elif isinstance(item, Path): + file_source = FilePath(path=item) + elif isinstance(item, str): + file_source = FilePath(path=Path(item)) + elif isinstance(item, (bytes, memoryview)): + file_source = FileBytes(data=bytes(item)) + else: + continue + + name = file_source.filename or f"file_{i}" + result[name] = wrap_file_source(file_source) + + return result + + +__all__ = [ + "ANTHROPIC_CONSTRAINTS", + "BEDROCK_CONSTRAINTS", + "GEMINI_CONSTRAINTS", + "OPENAI_CONSTRAINTS", + "AudioConstraints", + "AudioContentType", + "AudioExtension", + "AudioFile", + "BaseFile", + "CachedUpload", + "File", + "FileBytes", + "FileHandling", + "FileInput", + "FileMode", + "FilePath", + "FileProcessingError", + "FileProcessor", + "FileReference", + "FileResolver", + "FileResolverConfig", + "FileSource", + "FileSourceInput", + "FileStream", + "FileTooLargeError", + "FileUploader", + "FileValidationError", + "ImageConstraints", + "ImageContentType", + "ImageExtension", + "ImageFile", + "InlineBase64", + "InlineBytes", + "PDFConstraints", + "PDFContentType", + "PDFExtension", + "PDFFile", + "ProcessingDependencyError", + "ProviderConstraints", + "RawFileInput", + "ResolvedFile", + "ResolvedFileType", + "TextContentType", + "TextExtension", + "TextFile", + "UnsupportedFileTypeError", + "UploadCache", + "UploadResult", + "UrlReference", + "VideoConstraints", + "VideoContentType", + "VideoExtension", + "VideoFile", + "cleanup_expired_files", + "cleanup_provider_files", + "cleanup_uploaded_files", + "create_resolver", + "get_constraints_for_provider", + "get_upload_cache", + "get_uploader", + "normalize_input_files", + "reset_upload_cache", + "wrap_file_source", +] diff --git a/lib/crewai/src/crewai/files/cleanup.py b/lib/crewai/src/crewai/files/cleanup.py new file mode 100644 index 000000000..dc273a7a1 --- /dev/null +++ 
b/lib/crewai/src/crewai/files/cleanup.py @@ -0,0 +1,368 @@ +"""Cleanup utilities for uploaded files.""" + +from __future__ import annotations + +import asyncio +import logging +from typing import TYPE_CHECKING + +from crewai.files.upload_cache import CachedUpload, UploadCache +from crewai.files.uploaders import get_uploader + + +if TYPE_CHECKING: + from crewai.files.uploaders.base import FileUploader + +logger = logging.getLogger(__name__) + + +def _safe_delete( + uploader: FileUploader, + file_id: str, + provider: str, +) -> bool: + """Safely delete a file, logging any errors. + + Args: + uploader: The file uploader to use. + file_id: The file ID to delete. + provider: Provider name for logging. + + Returns: + True if deleted successfully, False otherwise. + """ + try: + if uploader.delete(file_id): + logger.debug(f"Deleted {file_id} from {provider}") + return True + logger.warning(f"Failed to delete {file_id} from {provider}") + return False + except Exception as e: + logger.warning(f"Error deleting {file_id} from {provider}: {e}") + return False + + +def cleanup_uploaded_files( + cache: UploadCache, + *, + delete_from_provider: bool = True, + providers: list[str] | None = None, +) -> int: + """Clean up uploaded files from the cache and optionally from providers. + + Args: + cache: The upload cache to clean up. + delete_from_provider: If True, delete files from the provider as well. + providers: Optional list of providers to clean up. If None, cleans all. + + Returns: + Number of files cleaned up. + """ + cleaned = 0 + + provider_uploads: dict[str, list[CachedUpload]] = {} + + for provider in _get_providers_from_cache(cache): + if providers is not None and provider not in providers: + continue + provider_uploads[provider] = cache.get_all_for_provider(provider) + + if delete_from_provider: + for provider, uploads in provider_uploads.items(): + uploader = get_uploader(provider) + if uploader is None: + logger.warning( + f"No uploader available for {provider}, skipping cleanup" + ) + continue + + for upload in uploads: + if _safe_delete(uploader, upload.file_id, provider): + cleaned += 1 + + cache.clear() + + logger.info(f"Cleaned up {cleaned} uploaded files") + return cleaned + + +def cleanup_expired_files( + cache: UploadCache, + *, + delete_from_provider: bool = False, +) -> int: + """Clean up expired files from the cache. + + Args: + cache: The upload cache to clean up. + delete_from_provider: If True, attempt to delete from provider as well. + Note: Expired files may already be deleted by the provider. + + Returns: + Number of expired entries removed from cache. + """ + expired_entries: list[CachedUpload] = [] + + if delete_from_provider: + for provider in _get_providers_from_cache(cache): + expired_entries.extend( + upload + for upload in cache.get_all_for_provider(provider) + if upload.is_expired() + ) + + removed = cache.clear_expired() + + if delete_from_provider: + for upload in expired_entries: + uploader = get_uploader(upload.provider) + if uploader is not None: + try: + uploader.delete(upload.file_id) + except Exception as e: + logger.debug(f"Could not delete expired file {upload.file_id}: {e}") + + return removed + + +def cleanup_provider_files( + provider: str, + *, + cache: UploadCache | None = None, + delete_all_from_provider: bool = False, +) -> int: + """Clean up all files for a specific provider. + + Args: + provider: Provider name to clean up. + cache: Optional upload cache to clear entries from. 
+ delete_all_from_provider: If True, delete all files from the provider, + not just cached ones. + + Returns: + Number of files deleted. + """ + deleted = 0 + uploader = get_uploader(provider) + + if uploader is None: + logger.warning(f"No uploader available for {provider}") + return 0 + + if delete_all_from_provider: + try: + files = uploader.list_files() + for file_info in files: + file_id = file_info.get("id") or file_info.get("name") + if file_id and uploader.delete(file_id): + deleted += 1 + except Exception as e: + logger.warning(f"Error listing/deleting files from {provider}: {e}") + elif cache is not None: + uploads = cache.get_all_for_provider(provider) + for upload in uploads: + if _safe_delete(uploader, upload.file_id, provider): + deleted += 1 + cache.remove_by_file_id(upload.file_id, provider) + + logger.info(f"Deleted {deleted} files from {provider}") + return deleted + + +def _get_providers_from_cache(cache: UploadCache) -> set[str]: + """Get unique provider names from cache entries. + + Args: + cache: The upload cache. + + Returns: + Set of provider names. + """ + return cache.get_providers() + + +async def _asafe_delete( + uploader: FileUploader, + file_id: str, + provider: str, +) -> bool: + """Async safely delete a file, logging any errors. + + Args: + uploader: The file uploader to use. + file_id: The file ID to delete. + provider: Provider name for logging. + + Returns: + True if deleted successfully, False otherwise. + """ + try: + if await uploader.adelete(file_id): + logger.debug(f"Deleted {file_id} from {provider}") + return True + logger.warning(f"Failed to delete {file_id} from {provider}") + return False + except Exception as e: + logger.warning(f"Error deleting {file_id} from {provider}: {e}") + return False + + +async def acleanup_uploaded_files( + cache: UploadCache, + *, + delete_from_provider: bool = True, + providers: list[str] | None = None, + max_concurrency: int = 10, +) -> int: + """Async clean up uploaded files from the cache and optionally from providers. + + Args: + cache: The upload cache to clean up. + delete_from_provider: If True, delete files from the provider as well. + providers: Optional list of providers to clean up. If None, cleans all. + max_concurrency: Maximum number of concurrent delete operations. + + Returns: + Number of files cleaned up. 
+ """ + cleaned = 0 + + provider_uploads: dict[str, list[CachedUpload]] = {} + + for provider in _get_providers_from_cache(cache): + if providers is not None and provider not in providers: + continue + provider_uploads[provider] = await cache.aget_all_for_provider(provider) + + if delete_from_provider: + semaphore = asyncio.Semaphore(max_concurrency) + + async def delete_one(uploader: FileUploader, upload: CachedUpload) -> bool: + async with semaphore: + return await _asafe_delete(uploader, upload.file_id, upload.provider) + + tasks: list[asyncio.Task[bool]] = [] + for provider, uploads in provider_uploads.items(): + uploader = get_uploader(provider) + if uploader is None: + logger.warning( + f"No uploader available for {provider}, skipping cleanup" + ) + continue + + tasks.extend( + asyncio.create_task(delete_one(uploader, upload)) for upload in uploads + ) + + results = await asyncio.gather(*tasks, return_exceptions=True) + cleaned = sum(1 for r in results if r is True) + + await cache.aclear() + + logger.info(f"Cleaned up {cleaned} uploaded files") + return cleaned + + +async def acleanup_expired_files( + cache: UploadCache, + *, + delete_from_provider: bool = False, + max_concurrency: int = 10, +) -> int: + """Async clean up expired files from the cache. + + Args: + cache: The upload cache to clean up. + delete_from_provider: If True, attempt to delete from provider as well. + max_concurrency: Maximum number of concurrent delete operations. + + Returns: + Number of expired entries removed from cache. + """ + expired_entries: list[CachedUpload] = [] + + if delete_from_provider: + for provider in _get_providers_from_cache(cache): + uploads = await cache.aget_all_for_provider(provider) + expired_entries.extend(upload for upload in uploads if upload.is_expired()) + + removed = await cache.aclear_expired() + + if delete_from_provider and expired_entries: + semaphore = asyncio.Semaphore(max_concurrency) + + async def delete_expired(upload: CachedUpload) -> None: + async with semaphore: + uploader = get_uploader(upload.provider) + if uploader is not None: + try: + await uploader.adelete(upload.file_id) + except Exception as e: + logger.debug( + f"Could not delete expired file {upload.file_id}: {e}" + ) + + await asyncio.gather( + *[delete_expired(upload) for upload in expired_entries], + return_exceptions=True, + ) + + return removed + + +async def acleanup_provider_files( + provider: str, + *, + cache: UploadCache | None = None, + delete_all_from_provider: bool = False, + max_concurrency: int = 10, +) -> int: + """Async clean up all files for a specific provider. + + Args: + provider: Provider name to clean up. + cache: Optional upload cache to clear entries from. + delete_all_from_provider: If True, delete all files from the provider. + max_concurrency: Maximum number of concurrent delete operations. + + Returns: + Number of files deleted. 
+ """ + deleted = 0 + uploader = get_uploader(provider) + + if uploader is None: + logger.warning(f"No uploader available for {provider}") + return 0 + + semaphore = asyncio.Semaphore(max_concurrency) + + async def delete_file(file_id: str) -> bool: + async with semaphore: + return await uploader.adelete(file_id) + + if delete_all_from_provider: + try: + files = uploader.list_files() + tasks = [] + for file_info in files: + file_id = file_info.get("id") or file_info.get("name") + if file_id: + tasks.append(delete_file(file_id)) + results = await asyncio.gather(*tasks, return_exceptions=True) + deleted = sum(1 for r in results if r is True) + except Exception as e: + logger.warning(f"Error listing/deleting files from {provider}: {e}") + elif cache is not None: + uploads = await cache.aget_all_for_provider(provider) + tasks = [] + for upload in uploads: + tasks.append(delete_file(upload.file_id)) + results = await asyncio.gather(*tasks, return_exceptions=True) + for upload, result in zip(uploads, results, strict=False): + if result is True: + deleted += 1 + await cache.aremove_by_file_id(upload.file_id, provider) + + logger.info(f"Deleted {deleted} files from {provider}") + return deleted diff --git a/lib/crewai/src/crewai/utilities/files/content_types.py b/lib/crewai/src/crewai/files/content_types.py similarity index 92% rename from lib/crewai/src/crewai/utilities/files/content_types.py rename to lib/crewai/src/crewai/files/content_types.py index d7ddf38cd..3aecc6e47 100644 --- a/lib/crewai/src/crewai/utilities/files/content_types.py +++ b/lib/crewai/src/crewai/files/content_types.py @@ -11,7 +11,8 @@ from pydantic import BaseModel, Field, GetCoreSchemaHandler from pydantic_core import CoreSchema, core_schema from typing_extensions import TypeIs -from crewai.utilities.files.file import ( +from crewai.files.file import ( + AsyncFileStream, FileBytes, FilePath, FileSource, @@ -185,7 +186,18 @@ class BaseFile(ABC, BaseModel): def read(self) -> bytes: """Read the file content as bytes.""" - return self._file_source.read() + return self._file_source.read() # type: ignore[union-attr] + + async def aread(self) -> bytes: + """Async read the file content as bytes. + + Raises: + TypeError: If the underlying source doesn't support async read. + """ + source = self._file_source + if isinstance(source, (FilePath, FileBytes, AsyncFileStream)): + return await source.aread() + raise TypeError(f"{type(source).__name__} does not support async read") def read_text(self, encoding: str = "utf-8") -> str: """Read the file content as string.""" diff --git a/lib/crewai/src/crewai/files/file.py b/lib/crewai/src/crewai/files/file.py new file mode 100644 index 000000000..f7669f9ec --- /dev/null +++ b/lib/crewai/src/crewai/files/file.py @@ -0,0 +1,377 @@ +"""Base file class for handling file inputs in tasks.""" + +from __future__ import annotations + +from collections.abc import AsyncIterator, Iterator +from pathlib import Path +from typing import Annotated, Any, BinaryIO, Protocol, cast, runtime_checkable + +import aiofiles +import magic +from pydantic import ( + BaseModel, + BeforeValidator, + Field, + GetCoreSchemaHandler, + PrivateAttr, + model_validator, +) +from pydantic_core import CoreSchema, core_schema + + +@runtime_checkable +class AsyncReadable(Protocol): + """Protocol for async readable streams.""" + + async def read(self, size: int = -1) -> bytes: ... 
+ + +class _AsyncReadableValidator: + """Pydantic validator for AsyncReadable types.""" + + @classmethod + def __get_pydantic_core_schema__( + cls, _source_type: Any, _handler: GetCoreSchemaHandler + ) -> CoreSchema: + return core_schema.no_info_plain_validator_function( + cls._validate, + serialization=core_schema.plain_serializer_function_ser_schema( + lambda x: None, info_arg=False + ), + ) + + @staticmethod + def _validate(value: Any) -> AsyncReadable: + if isinstance(value, AsyncReadable): + return value + raise ValueError("Expected an async readable object with async read() method") + + +ValidatedAsyncReadable = Annotated[AsyncReadable, _AsyncReadableValidator()] + +DEFAULT_MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500MB + + +def detect_content_type(data: bytes) -> str: + """Detect MIME type from file content. + + Args: + data: Raw bytes to analyze. + + Returns: + The detected MIME type. + """ + result: str = magic.from_buffer(data, mime=True) + return result + + +class _BinaryIOValidator: + """Pydantic validator for BinaryIO types.""" + + @classmethod + def __get_pydantic_core_schema__( + cls, _source_type: Any, _handler: GetCoreSchemaHandler + ) -> CoreSchema: + return core_schema.no_info_plain_validator_function( + cls._validate, + serialization=core_schema.plain_serializer_function_ser_schema( + lambda x: None, info_arg=False + ), + ) + + @staticmethod + def _validate(value: Any) -> BinaryIO: + if hasattr(value, "read") and hasattr(value, "seek"): + return cast(BinaryIO, value) + raise ValueError("Expected a binary file-like object with read() and seek()") + + +ValidatedBinaryIO = Annotated[BinaryIO, _BinaryIOValidator()] + + +class FilePath(BaseModel): + """File loaded from a filesystem path.""" + + path: Path = Field(description="Path to the file on the filesystem.") + max_size_bytes: int = Field( + default=DEFAULT_MAX_FILE_SIZE_BYTES, + exclude=True, + description="Maximum file size in bytes.", + ) + _content: bytes | None = PrivateAttr(default=None) + + @model_validator(mode="after") + def _validate_file_exists(self) -> FilePath: + """Validate that the file exists, is secure, and within size limits.""" + from crewai.files.processing.exceptions import FileTooLargeError + + path_str = str(self.path) + if ".." 
in path_str: + raise ValueError(f"Path traversal not allowed: {self.path}") + + if self.path.is_symlink(): + resolved = self.path.resolve() + cwd = Path.cwd().resolve() + if not str(resolved).startswith(str(cwd)): + raise ValueError(f"Symlink escapes allowed directory: {self.path}") + + if not self.path.exists(): + raise ValueError(f"File not found: {self.path}") + if not self.path.is_file(): + raise ValueError(f"Path is not a file: {self.path}") + + actual_size = self.path.stat().st_size + if actual_size > self.max_size_bytes: + raise FileTooLargeError( + f"File exceeds max size ({actual_size} > {self.max_size_bytes})", + file_name=str(self.path), + actual_size=actual_size, + max_size=self.max_size_bytes, + ) + + return self + + @property + def filename(self) -> str: + """Get the filename from the path.""" + return self.path.name + + @property + def content_type(self) -> str: + """Get the content type by reading file content.""" + return detect_content_type(self.read()) + + def read(self) -> bytes: + """Read the file content from disk.""" + if self._content is None: + self._content = self.path.read_bytes() + return self._content + + async def aread(self) -> bytes: + """Async read the file content from disk.""" + if self._content is None: + async with aiofiles.open(self.path, "rb") as f: + self._content = await f.read() + return self._content + + def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]: + """Stream file content in chunks without loading entirely into memory. + + Args: + chunk_size: Size of each chunk in bytes. + + Yields: + Chunks of file content. + """ + with open(self.path, "rb") as f: + while chunk := f.read(chunk_size): + yield chunk + + async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]: + """Async streaming for non-blocking I/O. + + Args: + chunk_size: Size of each chunk in bytes. + + Yields: + Chunks of file content. + """ + async with aiofiles.open(self.path, "rb") as f: + while chunk := await f.read(chunk_size): + yield chunk + + +class FileBytes(BaseModel): + """File created from raw bytes content.""" + + data: bytes = Field(description="Raw bytes content of the file.") + filename: str | None = Field(default=None, description="Optional filename.") + + @property + def content_type(self) -> str: + """Get the content type from the data.""" + return detect_content_type(self.data) + + def read(self) -> bytes: + """Return the bytes content.""" + return self.data + + async def aread(self) -> bytes: + """Async return the bytes content (immediate, already in memory).""" + return self.data + + def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]: + """Stream bytes content in chunks. + + Args: + chunk_size: Size of each chunk in bytes. + + Yields: + Chunks of bytes content. + """ + for i in range(0, len(self.data), chunk_size): + yield self.data[i : i + chunk_size] + + async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]: + """Async streaming (immediate yield since already in memory). + + Args: + chunk_size: Size of each chunk in bytes. + + Yields: + Chunks of bytes content. 
+ """ + for chunk in self.read_chunks(chunk_size): + yield chunk + + +class FileStream(BaseModel): + """File loaded from a file-like stream.""" + + stream: ValidatedBinaryIO = Field(description="Binary file stream.") + filename: str | None = Field(default=None, description="Optional filename.") + _content: bytes | None = PrivateAttr(default=None) + + def model_post_init(self, __context: object) -> None: + """Extract filename from stream if not provided.""" + if self.filename is None: + name = getattr(self.stream, "name", None) + if name is not None: + object.__setattr__(self, "filename", Path(name).name) + + @property + def content_type(self) -> str: + """Get the content type from stream content.""" + return detect_content_type(self.read()) + + def read(self) -> bytes: + """Read the stream content. Content is cached after first read.""" + if self._content is None: + position = self.stream.tell() + self.stream.seek(0) + self._content = self.stream.read() + self.stream.seek(position) + return self._content + + def close(self) -> None: + """Close the underlying stream.""" + self.stream.close() + + def __enter__(self) -> FileStream: + """Enter context manager.""" + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: Any, + ) -> None: + """Exit context manager and close stream.""" + self.close() + + def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]: + """Stream from underlying stream in chunks. + + Args: + chunk_size: Size of each chunk in bytes. + + Yields: + Chunks of stream content. + """ + position = self.stream.tell() + self.stream.seek(0) + try: + while chunk := self.stream.read(chunk_size): + yield chunk + finally: + self.stream.seek(position) + + +class AsyncFileStream(BaseModel): + """File loaded from an async stream. + + Use for async file handles like aiofiles objects or aiohttp response bodies. + This is an async-only type - use aread() instead of read(). + + Attributes: + stream: Async file-like object with async read() method. + filename: Optional filename for the stream. + """ + + stream: ValidatedAsyncReadable = Field( + description="Async file stream with async read() method." + ) + filename: str | None = Field(default=None, description="Optional filename.") + _content: bytes | None = PrivateAttr(default=None) + + @property + def content_type(self) -> str: + """Get the content type from stream content. Requires aread() first.""" + if self._content is None: + raise RuntimeError("Call aread() first to load content") + return detect_content_type(self._content) + + async def aread(self) -> bytes: + """Async read the stream content. Content is cached after first read.""" + if self._content is None: + self._content = await self.stream.read() + return self._content + + async def aclose(self) -> None: + """Async close the underlying stream.""" + if hasattr(self.stream, "close"): + result = self.stream.close() + if hasattr(result, "__await__"): + await result + + async def __aenter__(self) -> AsyncFileStream: + """Async enter context manager.""" + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: Any, + ) -> None: + """Async exit context manager and close stream.""" + await self.aclose() + + async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]: + """Async stream content in chunks. + + Args: + chunk_size: Size of each chunk in bytes. + + Yields: + Chunks of stream content. 
+ """ + while chunk := await self.stream.read(chunk_size): + yield chunk + + +FileSource = FilePath | FileBytes | FileStream | AsyncFileStream + + +def _normalize_source(value: Any) -> FileSource: + """Convert raw input to appropriate source type.""" + if isinstance(value, (FilePath, FileBytes, FileStream, AsyncFileStream)): + return value + if isinstance(value, Path): + return FilePath(path=value) + if isinstance(value, str): + return FilePath(path=Path(value)) + if isinstance(value, bytes): + return FileBytes(data=value) + if isinstance(value, AsyncReadable): + return AsyncFileStream(stream=value) + if hasattr(value, "read") and hasattr(value, "seek"): + return FileStream(stream=value) + raise ValueError(f"Cannot convert {type(value).__name__} to file source") + + +RawFileInput = str | Path | bytes +FileSourceInput = Annotated[ + RawFileInput | FileSource, BeforeValidator(_normalize_source) +] diff --git a/lib/crewai/src/crewai/files/metrics.py b/lib/crewai/src/crewai/files/metrics.py new file mode 100644 index 000000000..fa20d7e20 --- /dev/null +++ b/lib/crewai/src/crewai/files/metrics.py @@ -0,0 +1,184 @@ +"""Performance metrics and structured logging for file operations.""" + +from __future__ import annotations + +from collections.abc import Generator +from contextlib import contextmanager +from dataclasses import dataclass, field +from datetime import datetime, timezone +import logging +import time +from typing import Any + + +logger = logging.getLogger(__name__) + + +@dataclass +class FileOperationMetrics: + """Metrics for a file operation. + + Attributes: + operation: Name of the operation (e.g., "upload", "resolve", "process"). + filename: Name of the file being operated on. + provider: Provider name if applicable. + duration_ms: Duration of the operation in milliseconds. + size_bytes: Size of the file in bytes. + success: Whether the operation succeeded. + error: Error message if operation failed. + timestamp: When the operation occurred. + metadata: Additional operation-specific metadata. + """ + + operation: str + filename: str | None = None + provider: str | None = None + duration_ms: float = 0.0 + size_bytes: int | None = None + success: bool = True + error: str | None = None + timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + """Convert metrics to dictionary for logging. + + Returns: + Dictionary representation of metrics. + """ + result: dict[str, Any] = { + "operation": self.operation, + "duration_ms": round(self.duration_ms, 2), + "success": self.success, + "timestamp": self.timestamp.isoformat(), + } + + if self.filename: + result["filename"] = self.filename + if self.provider: + result["provider"] = self.provider + if self.size_bytes is not None: + result["size_bytes"] = self.size_bytes + if self.error: + result["error"] = self.error + if self.metadata: + result.update(self.metadata) + + return result + + +@contextmanager +def measure_operation( + operation: str, + *, + filename: str | None = None, + provider: str | None = None, + size_bytes: int | None = None, + log_level: int = logging.DEBUG, + **extra_metadata: Any, +) -> Generator[FileOperationMetrics, None, None]: + """Context manager to measure and log operation performance. + + Args: + operation: Name of the operation. + filename: Optional filename being operated on. + provider: Optional provider name. + size_bytes: Optional file size in bytes. 
+ log_level: Log level for the result message. + **extra_metadata: Additional metadata to include. + + Yields: + FileOperationMetrics object that will be populated with results. + + Example: + with measure_operation("upload", filename="test.pdf", provider="openai") as metrics: + result = upload_file(file) + metrics.metadata["file_id"] = result.file_id + """ + metrics = FileOperationMetrics( + operation=operation, + filename=filename, + provider=provider, + size_bytes=size_bytes, + metadata=dict(extra_metadata), + ) + + start_time = time.perf_counter() + + try: + yield metrics + metrics.success = True + except Exception as e: + metrics.success = False + metrics.error = str(e) + raise + finally: + metrics.duration_ms = (time.perf_counter() - start_time) * 1000 + + log_message = f"{operation}" + if filename: + log_message += f" [{filename}]" + if provider: + log_message += f" ({provider})" + + if metrics.success: + log_message += f" completed in {metrics.duration_ms:.2f}ms" + else: + log_message += f" failed after {metrics.duration_ms:.2f}ms: {metrics.error}" + + logger.log(log_level, log_message, extra=metrics.to_dict()) + + +def log_file_operation( + operation: str, + *, + filename: str | None = None, + provider: str | None = None, + size_bytes: int | None = None, + duration_ms: float | None = None, + success: bool = True, + error: str | None = None, + level: int = logging.INFO, + **extra: Any, +) -> None: + """Log a file operation with structured data. + + Args: + operation: Name of the operation. + filename: Optional filename being operated on. + provider: Optional provider name. + size_bytes: Optional file size in bytes. + duration_ms: Optional duration in milliseconds. + success: Whether the operation succeeded. + error: Optional error message. + level: Log level to use. + **extra: Additional metadata to include. + """ + metrics = FileOperationMetrics( + operation=operation, + filename=filename, + provider=provider, + size_bytes=size_bytes, + duration_ms=duration_ms or 0.0, + success=success, + error=error, + metadata=dict(extra), + ) + + message = f"{operation}" + if filename: + message += f" [{filename}]" + if provider: + message += f" ({provider})" + + if success: + if duration_ms: + message += f" completed in {duration_ms:.2f}ms" + else: + message += " completed" + else: + message += " failed" + if error: + message += f": {error}" + + logger.log(level, message, extra=metrics.to_dict()) diff --git a/lib/crewai/src/crewai/utilities/files/processing/__init__.py b/lib/crewai/src/crewai/files/processing/__init__.py similarity index 80% rename from lib/crewai/src/crewai/utilities/files/processing/__init__.py rename to lib/crewai/src/crewai/files/processing/__init__.py index cfe44a372..21694f180 100644 --- a/lib/crewai/src/crewai/utilities/files/processing/__init__.py +++ b/lib/crewai/src/crewai/files/processing/__init__.py @@ -4,7 +4,7 @@ This module provides validation, transformation, and processing utilities for files used in multimodal LLM interactions. 
""" -from crewai.utilities.files.processing.constraints import ( +from crewai.files.processing.constraints import ( ANTHROPIC_CONSTRAINTS, BEDROCK_CONSTRAINTS, GEMINI_CONSTRAINTS, @@ -16,16 +16,16 @@ from crewai.utilities.files.processing.constraints import ( VideoConstraints, get_constraints_for_provider, ) -from crewai.utilities.files.processing.enums import FileHandling -from crewai.utilities.files.processing.exceptions import ( +from crewai.files.processing.enums import FileHandling +from crewai.files.processing.exceptions import ( FileProcessingError, FileTooLargeError, FileValidationError, ProcessingDependencyError, UnsupportedFileTypeError, ) -from crewai.utilities.files.processing.processor import FileProcessor -from crewai.utilities.files.processing.validators import ( +from crewai.files.processing.processor import FileProcessor +from crewai.files.processing.validators import ( validate_audio, validate_file, validate_image, diff --git a/lib/crewai/src/crewai/utilities/files/processing/constraints.py b/lib/crewai/src/crewai/files/processing/constraints.py similarity index 100% rename from lib/crewai/src/crewai/utilities/files/processing/constraints.py rename to lib/crewai/src/crewai/files/processing/constraints.py diff --git a/lib/crewai/src/crewai/utilities/files/processing/enums.py b/lib/crewai/src/crewai/files/processing/enums.py similarity index 100% rename from lib/crewai/src/crewai/utilities/files/processing/enums.py rename to lib/crewai/src/crewai/files/processing/enums.py diff --git a/lib/crewai/src/crewai/utilities/files/processing/exceptions.py b/lib/crewai/src/crewai/files/processing/exceptions.py similarity index 80% rename from lib/crewai/src/crewai/utilities/files/processing/exceptions.py rename to lib/crewai/src/crewai/files/processing/exceptions.py index 803c50911..0a9442462 100644 --- a/lib/crewai/src/crewai/utilities/files/processing/exceptions.py +++ b/lib/crewai/src/crewai/files/processing/exceptions.py @@ -81,3 +81,23 @@ class ProcessingDependencyError(FileProcessingError): self.dependency = dependency self.install_command = install_command super().__init__(message) + + +class TransientFileError(FileProcessingError): + """Transient error that may succeed on retry (network, timeout).""" + + +class PermanentFileError(FileProcessingError): + """Permanent error that will not succeed on retry (auth, format).""" + + +class UploadError(FileProcessingError): + """Base exception for upload errors.""" + + +class TransientUploadError(UploadError, TransientFileError): + """Upload failed but may succeed on retry (network issues, rate limits).""" + + +class PermanentUploadError(UploadError, PermanentFileError): + """Upload failed permanently (auth failure, invalid file, unsupported type).""" diff --git a/lib/crewai/src/crewai/utilities/files/processing/processor.py b/lib/crewai/src/crewai/files/processing/processor.py similarity index 80% rename from lib/crewai/src/crewai/utilities/files/processing/processor.py rename to lib/crewai/src/crewai/files/processing/processor.py index b40087d96..c80f1ec80 100644 --- a/lib/crewai/src/crewai/utilities/files/processing/processor.py +++ b/lib/crewai/src/crewai/files/processing/processor.py @@ -1,9 +1,10 @@ """FileProcessor for validating and transforming files based on provider constraints.""" +import asyncio from collections.abc import Sequence import logging -from crewai.utilities.files.content_types import ( +from crewai.files.content_types import ( AudioFile, File, ImageFile, @@ -11,18 +12,18 @@ from 
crewai.utilities.files.content_types import ( TextFile, VideoFile, ) -from crewai.utilities.files.processing.constraints import ( +from crewai.files.processing.constraints import ( ProviderConstraints, get_constraints_for_provider, ) -from crewai.utilities.files.processing.enums import FileHandling -from crewai.utilities.files.processing.exceptions import ( +from crewai.files.processing.enums import FileHandling +from crewai.files.processing.exceptions import ( FileProcessingError, FileTooLargeError, FileValidationError, UnsupportedFileTypeError, ) -from crewai.utilities.files.processing.transformers import ( +from crewai.files.processing.transformers import ( chunk_pdf, chunk_text, get_image_dimensions, @@ -30,7 +31,7 @@ from crewai.utilities.files.processing.transformers import ( optimize_image, resize_image, ) -from crewai.utilities.files.processing.validators import validate_file +from crewai.files.processing.validators import validate_file logger = logging.getLogger(__name__) @@ -183,6 +184,52 @@ class FileProcessor: return result + async def aprocess_files( + self, + files: dict[str, FileInput], + max_concurrency: int = 10, + ) -> dict[str, FileInput]: + """Async process multiple files in parallel. + + Args: + files: Dictionary mapping names to file inputs. + max_concurrency: Maximum number of concurrent processing tasks. + + Returns: + Dictionary mapping names to processed files. If a file is chunked, + multiple entries are created with indexed names. + """ + semaphore = asyncio.Semaphore(max_concurrency) + + async def process_one( + name: str, file: FileInput + ) -> tuple[str, FileInput | Sequence[FileInput]]: + async with semaphore: + loop = asyncio.get_running_loop() + processed = await loop.run_in_executor(None, self.process, file) + return name, processed + + tasks = [process_one(n, f) for n, f in files.items()] + results = await asyncio.gather(*tasks, return_exceptions=True) + + output: dict[str, FileInput] = {} + for result in results: + if isinstance(result, BaseException): + logger.error(f"Processing failed: {result}") + continue + name, processed = result + if isinstance(processed, Sequence) and not isinstance( + processed, (str, bytes) + ): + for i, chunk in enumerate(processed): + output[f"{name}_chunk_{i}"] = chunk + elif isinstance( + processed, (AudioFile, File, ImageFile, PDFFile, TextFile, VideoFile) + ): + output[name] = processed + + return output + def _auto_process(self, file: FileInput) -> FileInput: """Automatically resize/compress file to meet constraints. 
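# Usage sketch, not part of this diff: calling the new aprocess_files method added
# in the hunk above. The provider string "openai" is a placeholder; building files
# from in-memory bytes mirrors the TextFile(source=FileBytes(...)) pattern used by
# the chunking transformers in this module, and FileProcessor(constraints=provider)
# mirrors its use in crew_agent_executor.py.
import asyncio

from crewai.files import FileBytes, FileProcessor, TextFile


async def _demo() -> None:
    processor = FileProcessor(constraints="openai")
    files = {
        "notes": TextFile(source=FileBytes(data=b"meeting notes", filename="notes.txt")),
        "summary": TextFile(source=FileBytes(data=b"q3 summary", filename="summary.txt")),
    }
    # Each file is validated/transformed in a worker thread (at most 5 at a time);
    # files that get chunked come back under indexed names like "notes_chunk_0".
    processed = await processor.aprocess_files(files, max_concurrency=5)
    print(sorted(processed))


asyncio.run(_demo())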
@@ -272,7 +319,7 @@ class FileProcessor: page_count = get_pdf_page_count(file) if page_count is not None and page_count > max_pages: try: - return chunk_pdf(file, max_pages) + return list(chunk_pdf(file, max_pages)) except Exception as e: logger.warning(f"Failed to chunk PDF: {e}") return file @@ -284,7 +331,7 @@ class FileProcessor: content = file.read() if len(content) > max_size: try: - return chunk_text(file, max_size) + return list(chunk_text(file, max_size)) except Exception as e: logger.warning(f"Failed to chunk text file: {e}") return file diff --git a/lib/crewai/src/crewai/utilities/files/processing/transformers.py b/lib/crewai/src/crewai/files/processing/transformers.py similarity index 87% rename from lib/crewai/src/crewai/utilities/files/processing/transformers.py rename to lib/crewai/src/crewai/files/processing/transformers.py index 2e4288bb7..95a489998 100644 --- a/lib/crewai/src/crewai/utilities/files/processing/transformers.py +++ b/lib/crewai/src/crewai/files/processing/transformers.py @@ -1,12 +1,12 @@ """File transformation functions for resizing, optimizing, and chunking.""" -from collections.abc import Sequence +from collections.abc import Iterator import io import logging -from crewai.utilities.files.content_types import ImageFile, PDFFile, TextFile -from crewai.utilities.files.file import FileBytes -from crewai.utilities.files.processing.exceptions import ProcessingDependencyError +from crewai.files.content_types import ImageFile, PDFFile, TextFile +from crewai.files.file import FileBytes +from crewai.files.processing.exceptions import ProcessingDependencyError logger = logging.getLogger(__name__) @@ -161,22 +161,24 @@ def chunk_pdf( max_pages: int, *, overlap_pages: int = 0, -) -> Sequence[PDFFile]: +) -> Iterator[PDFFile]: """Split a PDF into chunks of maximum page count. + Yields chunks one at a time to minimize memory usage. + Args: file: The PDF file to chunk. max_pages: Maximum pages per chunk. overlap_pages: Number of overlapping pages between chunks (for context). - Returns: - List of PDFFile objects, one per chunk. + Yields: + PDFFile objects, one per chunk. Raises: ProcessingDependencyError: If pypdf is not installed. """ try: - from pypdf import PdfReader, PdfWriter # type: ignore[import-not-found] + from pypdf import PdfReader, PdfWriter except ImportError as e: raise ProcessingDependencyError( "pypdf is required for PDF chunking", @@ -189,9 +191,9 @@ def chunk_pdf( total_pages = len(reader.pages) if total_pages <= max_pages: - return [file] + yield file + return - chunks: list[PDFFile] = [] filename = file.filename or "document.pdf" base_filename = filename.rsplit(".", 1)[0] step = max_pages - overlap_pages @@ -211,19 +213,16 @@ def chunk_pdf( output_bytes = output_buffer.getvalue() chunk_filename = f"{base_filename}_chunk_{chunk_num}.pdf" - chunks.append( - PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename)) - ) logger.info( f"Created PDF chunk '{chunk_filename}' with pages {start_page + 1}-{end_page}" ) + yield PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename)) + start_page += step chunk_num += 1 - return chunks - def chunk_text( file: TextFile, @@ -231,26 +230,28 @@ def chunk_text( *, overlap_chars: int = 200, split_on_newlines: bool = True, -) -> Sequence[TextFile]: +) -> Iterator[TextFile]: """Split a text file into chunks of maximum character count. + Yields chunks one at a time to minimize memory usage. + Args: file: The text file to chunk. max_chars: Maximum characters per chunk. 
overlap_chars: Number of overlapping characters between chunks. split_on_newlines: If True, prefer splitting at newline boundaries. - Returns: - List of TextFile objects, one per chunk. + Yields: + TextFile objects, one per chunk. """ content = file.read() text = content.decode("utf-8", errors="replace") total_chars = len(text) if total_chars <= max_chars: - return [file] + yield file + return - chunks: list[TextFile] = [] filename = file.filename or "text.txt" base_filename = filename.rsplit(".", 1)[0] extension = filename.rsplit(".", 1)[-1] if "." in filename else "txt" @@ -261,29 +262,27 @@ def chunk_text( while start_pos < total_chars: end_pos = min(start_pos + max_chars, total_chars) - # If not at end, try to find a better split point if end_pos < total_chars and split_on_newlines: - # Look for last newline within the chunk last_newline = text.rfind("\n", start_pos, end_pos) - if last_newline > start_pos + max_chars // 2: # Don't split too early + if last_newline > start_pos + max_chars // 2: end_pos = last_newline + 1 - chunk_text = text[start_pos:end_pos] - chunk_bytes = chunk_text.encode("utf-8") + chunk_content = text[start_pos:end_pos] + chunk_bytes = chunk_content.encode("utf-8") chunk_filename = f"{base_filename}_chunk_{chunk_num}.{extension}" - chunks.append( - TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename)) - ) logger.info( - f"Created text chunk '{chunk_filename}' with {len(chunk_text)} characters" + f"Created text chunk '{chunk_filename}' with {len(chunk_content)} characters" ) - start_pos = end_pos - overlap_chars if end_pos < total_chars else total_chars - chunk_num += 1 + yield TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename)) - return chunks + if end_pos < total_chars: + start_pos = max(start_pos + 1, end_pos - overlap_chars) + else: + start_pos = total_chars + chunk_num += 1 def get_image_dimensions(file: ImageFile) -> tuple[int, int] | None: diff --git a/lib/crewai/src/crewai/utilities/files/processing/validators.py b/lib/crewai/src/crewai/files/processing/validators.py similarity index 98% rename from lib/crewai/src/crewai/utilities/files/processing/validators.py rename to lib/crewai/src/crewai/files/processing/validators.py index 814df513d..74af56dd5 100644 --- a/lib/crewai/src/crewai/utilities/files/processing/validators.py +++ b/lib/crewai/src/crewai/files/processing/validators.py @@ -3,7 +3,7 @@ from collections.abc import Sequence import logging -from crewai.utilities.files.content_types import ( +from crewai.files.content_types import ( AudioFile, File, ImageFile, @@ -11,14 +11,14 @@ from crewai.utilities.files.content_types import ( TextFile, VideoFile, ) -from crewai.utilities.files.processing.constraints import ( +from crewai.files.processing.constraints import ( AudioConstraints, ImageConstraints, PDFConstraints, ProviderConstraints, VideoConstraints, ) -from crewai.utilities.files.processing.exceptions import ( +from crewai.files.processing.exceptions import ( FileTooLargeError, FileValidationError, UnsupportedFileTypeError, @@ -172,7 +172,7 @@ def validate_pdf( try: import io - from pypdf import PdfReader # type: ignore[import-not-found] + from pypdf import PdfReader reader = PdfReader(io.BytesIO(content)) page_count = len(reader.pages) diff --git a/lib/crewai/src/crewai/utilities/files/resolved.py b/lib/crewai/src/crewai/files/resolved.py similarity index 100% rename from lib/crewai/src/crewai/utilities/files/resolved.py rename to lib/crewai/src/crewai/files/resolved.py diff --git 
a/lib/crewai/src/crewai/files/resolver.py b/lib/crewai/src/crewai/files/resolver.py new file mode 100644 index 000000000..ba4f77069 --- /dev/null +++ b/lib/crewai/src/crewai/files/resolver.py @@ -0,0 +1,577 @@ +"""FileResolver for deciding file delivery method and managing uploads.""" + +import asyncio +import base64 +from dataclasses import dataclass, field +import hashlib +import logging + +from crewai.files.content_types import ( + AudioFile, + File, + ImageFile, + PDFFile, + TextFile, + VideoFile, +) +from crewai.files.metrics import measure_operation +from crewai.files.processing.constraints import ( + ProviderConstraints, + get_constraints_for_provider, +) +from crewai.files.resolved import ( + FileReference, + InlineBase64, + InlineBytes, + ResolvedFile, +) +from crewai.files.upload_cache import CachedUpload, UploadCache +from crewai.files.uploaders import UploadResult, get_uploader +from crewai.files.uploaders.base import FileUploader + + +logger = logging.getLogger(__name__) + +FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile + +UPLOAD_MAX_RETRIES = 3 +UPLOAD_RETRY_DELAY_BASE = 2 + + +@dataclass +class FileContext: + """Cached file metadata to avoid redundant reads. + + Attributes: + content: Raw file bytes. + size: Size of the file in bytes. + content_hash: SHA-256 hash of the file content. + content_type: MIME type of the file. + """ + + content: bytes + size: int + content_hash: str + content_type: str + + +@dataclass +class FileResolverConfig: + """Configuration for FileResolver. + + Attributes: + prefer_upload: If True, prefer uploading over inline for supported providers. + upload_threshold_bytes: Size threshold above which to use upload. + If None, uses provider-specific threshold. + use_bytes_for_bedrock: If True, use raw bytes instead of base64 for Bedrock. + """ + + prefer_upload: bool = False + upload_threshold_bytes: int | None = None + use_bytes_for_bedrock: bool = True + + +@dataclass +class FileResolver: + """Resolves files to their delivery format based on provider capabilities. + + Decides whether to use inline base64, raw bytes, or file upload based on: + - Provider constraints and capabilities + - File size + - Configuration preferences + + Caches uploaded files to avoid redundant uploads. + + Attributes: + config: Resolver configuration. + upload_cache: Cache for tracking uploaded files. + """ + + config: FileResolverConfig = field(default_factory=FileResolverConfig) + upload_cache: UploadCache | None = None + _uploaders: dict[str, FileUploader] = field(default_factory=dict) + + def _build_file_context(self, file: FileInput) -> FileContext: + """Build context by reading file once. + + Args: + file: The file to build context for. + + Returns: + FileContext with cached metadata. + """ + content = file.read() + return FileContext( + content=content, + size=len(content), + content_hash=hashlib.sha256(content).hexdigest(), + content_type=file.content_type, + ) + + def resolve(self, file: FileInput, provider: str) -> ResolvedFile: + """Resolve a file to its delivery format for a provider. + + Args: + file: The file to resolve. + provider: Provider name (e.g., "gemini", "anthropic", "openai"). + + Returns: + ResolvedFile representing the appropriate delivery format. 
+ """ + provider_lower = provider.lower() + constraints = get_constraints_for_provider(provider) + context = self._build_file_context(file) + + should_upload = self._should_upload( + file, provider_lower, constraints, context.size + ) + + if should_upload: + resolved = self._resolve_via_upload(file, provider_lower, context) + if resolved is not None: + return resolved + + return self._resolve_inline(file, provider_lower, context) + + def resolve_files( + self, + files: dict[str, FileInput], + provider: str, + ) -> dict[str, ResolvedFile]: + """Resolve multiple files for a provider. + + Args: + files: Dictionary mapping names to file inputs. + provider: Provider name. + + Returns: + Dictionary mapping names to resolved files. + """ + return {name: self.resolve(file, provider) for name, file in files.items()} + + def _should_upload( + self, + file: FileInput, + provider: str, + constraints: ProviderConstraints | None, + file_size: int, + ) -> bool: + """Determine if a file should be uploaded rather than inlined. + + Args: + file: The file to check. + provider: Provider name. + constraints: Provider constraints. + file_size: Size of the file in bytes. + + Returns: + True if the file should be uploaded, False otherwise. + """ + if constraints is None or not constraints.supports_file_upload: + return False + + if self.config.prefer_upload: + return True + + threshold = self.config.upload_threshold_bytes + if threshold is None and constraints is not None: + threshold = constraints.file_upload_threshold_bytes + + if threshold is not None and file_size > threshold: + return True + + return False + + def _resolve_via_upload( + self, + file: FileInput, + provider: str, + context: FileContext, + ) -> ResolvedFile | None: + """Resolve a file by uploading it. + + Args: + file: The file to upload. + provider: Provider name. + context: Pre-computed file context. + + Returns: + FileReference if upload succeeds, None otherwise. + """ + if self.upload_cache is not None: + cached = self.upload_cache.get_by_hash(context.content_hash, provider) + if cached is not None: + logger.debug( + f"Using cached upload for {file.filename}: {cached.file_id}" + ) + return FileReference( + content_type=cached.content_type, + file_id=cached.file_id, + provider=cached.provider, + expires_at=cached.expires_at, + file_uri=cached.file_uri, + ) + + uploader = self._get_uploader(provider) + if uploader is None: + logger.debug(f"No uploader available for {provider}") + return None + + result = self._upload_with_retry(uploader, file, provider, context.size) + if result is None: + return None + + if self.upload_cache is not None: + self.upload_cache.set_by_hash( + file_hash=context.content_hash, + content_type=context.content_type, + provider=provider, + file_id=result.file_id, + file_uri=result.file_uri, + expires_at=result.expires_at, + ) + + return FileReference( + content_type=result.content_type, + file_id=result.file_id, + provider=result.provider, + expires_at=result.expires_at, + file_uri=result.file_uri, + ) + + def _upload_with_retry( + self, + uploader: FileUploader, + file: FileInput, + provider: str, + file_size: int, + ) -> UploadResult | None: + """Upload with exponential backoff retry. + + Args: + uploader: The uploader to use. + file: The file to upload. + provider: Provider name for logging. + file_size: Size of the file in bytes. + + Returns: + UploadResult if successful, None otherwise. 
+ """ + import time + + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + last_error: Exception | None = None + + for attempt in range(UPLOAD_MAX_RETRIES): + with measure_operation( + "upload", + filename=file.filename, + provider=provider, + size_bytes=file_size, + attempt=attempt + 1, + ) as metrics: + try: + result = uploader.upload(file) + metrics.metadata["file_id"] = result.file_id + return result + except PermanentUploadError as e: + metrics.metadata["error_type"] = "permanent" + logger.warning( + f"Non-retryable upload error for {file.filename}: {e}" + ) + return None + except TransientUploadError as e: + metrics.metadata["error_type"] = "transient" + last_error = e + except Exception as e: + metrics.metadata["error_type"] = "unknown" + last_error = e + + if attempt < UPLOAD_MAX_RETRIES - 1: + delay = UPLOAD_RETRY_DELAY_BASE**attempt + logger.debug( + f"Retrying upload for {file.filename} in {delay}s (attempt {attempt + 1})" + ) + time.sleep(delay) + + logger.warning( + f"Upload failed for {file.filename} to {provider} after {UPLOAD_MAX_RETRIES} attempts: {last_error}" + ) + return None + + def _resolve_inline( + self, + file: FileInput, + provider: str, + context: FileContext, + ) -> ResolvedFile: + """Resolve a file as inline content. + + Args: + file: The file to resolve. + provider: Provider name. + context: Pre-computed file context. + + Returns: + InlineBase64 or InlineBytes depending on provider. + """ + if self.config.use_bytes_for_bedrock and "bedrock" in provider: + return InlineBytes( + content_type=context.content_type, + data=context.content, + ) + + encoded = base64.b64encode(context.content).decode("ascii") + return InlineBase64( + content_type=context.content_type, + data=encoded, + ) + + async def aresolve(self, file: FileInput, provider: str) -> ResolvedFile: + """Async resolve a file to its delivery format for a provider. + + Args: + file: The file to resolve. + provider: Provider name (e.g., "gemini", "anthropic", "openai"). + + Returns: + ResolvedFile representing the appropriate delivery format. + """ + provider_lower = provider.lower() + constraints = get_constraints_for_provider(provider) + context = self._build_file_context(file) + + should_upload = self._should_upload( + file, provider_lower, constraints, context.size + ) + + if should_upload: + resolved = await self._aresolve_via_upload(file, provider_lower, context) + if resolved is not None: + return resolved + + return self._resolve_inline(file, provider_lower, context) + + async def aresolve_files( + self, + files: dict[str, FileInput], + provider: str, + max_concurrency: int = 10, + ) -> dict[str, ResolvedFile]: + """Async resolve multiple files in parallel. + + Args: + files: Dictionary mapping names to file inputs. + provider: Provider name. + max_concurrency: Maximum number of concurrent resolutions. + + Returns: + Dictionary mapping names to resolved files. 
+ """ + semaphore = asyncio.Semaphore(max_concurrency) + + async def resolve_one(name: str, file: FileInput) -> tuple[str, ResolvedFile]: + async with semaphore: + resolved = await self.aresolve(file, provider) + return name, resolved + + tasks = [resolve_one(n, f) for n, f in files.items()] + results = await asyncio.gather(*tasks, return_exceptions=True) + + output: dict[str, ResolvedFile] = {} + for result in results: + if isinstance(result, BaseException): + logger.error(f"Resolution failed: {result}") + continue + name, resolved = result + output[name] = resolved + + return output + + async def _aresolve_via_upload( + self, + file: FileInput, + provider: str, + context: FileContext, + ) -> ResolvedFile | None: + """Async resolve a file by uploading it. + + Args: + file: The file to upload. + provider: Provider name. + context: Pre-computed file context. + + Returns: + FileReference if upload succeeds, None otherwise. + """ + if self.upload_cache is not None: + cached = await self.upload_cache.aget_by_hash( + context.content_hash, provider + ) + if cached is not None: + logger.debug( + f"Using cached upload for {file.filename}: {cached.file_id}" + ) + return FileReference( + content_type=cached.content_type, + file_id=cached.file_id, + provider=cached.provider, + expires_at=cached.expires_at, + file_uri=cached.file_uri, + ) + + uploader = self._get_uploader(provider) + if uploader is None: + logger.debug(f"No uploader available for {provider}") + return None + + result = await self._aupload_with_retry(uploader, file, provider, context.size) + if result is None: + return None + + if self.upload_cache is not None: + await self.upload_cache.aset_by_hash( + file_hash=context.content_hash, + content_type=context.content_type, + provider=provider, + file_id=result.file_id, + file_uri=result.file_uri, + expires_at=result.expires_at, + ) + + return FileReference( + content_type=result.content_type, + file_id=result.file_id, + provider=result.provider, + expires_at=result.expires_at, + file_uri=result.file_uri, + ) + + async def _aupload_with_retry( + self, + uploader: FileUploader, + file: FileInput, + provider: str, + file_size: int, + ) -> UploadResult | None: + """Async upload with exponential backoff retry. + + Args: + uploader: The uploader to use. + file: The file to upload. + provider: Provider name for logging. + file_size: Size of the file in bytes. + + Returns: + UploadResult if successful, None otherwise. 
+ """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + last_error: Exception | None = None + + for attempt in range(UPLOAD_MAX_RETRIES): + with measure_operation( + "upload", + filename=file.filename, + provider=provider, + size_bytes=file_size, + attempt=attempt + 1, + ) as metrics: + try: + result = await uploader.aupload(file) + metrics.metadata["file_id"] = result.file_id + return result + except PermanentUploadError as e: + metrics.metadata["error_type"] = "permanent" + logger.warning( + f"Non-retryable upload error for {file.filename}: {e}" + ) + return None + except TransientUploadError as e: + metrics.metadata["error_type"] = "transient" + last_error = e + except Exception as e: + metrics.metadata["error_type"] = "unknown" + last_error = e + + if attempt < UPLOAD_MAX_RETRIES - 1: + delay = UPLOAD_RETRY_DELAY_BASE**attempt + logger.debug( + f"Retrying upload for {file.filename} in {delay}s (attempt {attempt + 1})" + ) + await asyncio.sleep(delay) + + logger.warning( + f"Upload failed for {file.filename} to {provider} after {UPLOAD_MAX_RETRIES} attempts: {last_error}" + ) + return None + + def _get_uploader(self, provider: str) -> FileUploader | None: + """Get or create an uploader for a provider. + + Args: + provider: Provider name. + + Returns: + FileUploader instance or None if not available. + """ + if provider not in self._uploaders: + uploader = get_uploader(provider) + if uploader is not None: + self._uploaders[provider] = uploader + else: + return None + + return self._uploaders.get(provider) + + def get_cached_uploads(self, provider: str) -> list[CachedUpload]: + """Get all cached uploads for a provider. + + Args: + provider: Provider name. + + Returns: + List of cached uploads. + """ + if self.upload_cache is None: + return [] + return self.upload_cache.get_all_for_provider(provider) + + def clear_cache(self) -> None: + """Clear the upload cache.""" + if self.upload_cache is not None: + self.upload_cache.clear() + + +def create_resolver( + provider: str | None = None, + prefer_upload: bool = False, + upload_threshold_bytes: int | None = None, + enable_cache: bool = True, +) -> FileResolver: + """Create a configured FileResolver. + + Args: + provider: Optional provider name for provider-specific configuration. + prefer_upload: Whether to prefer upload over inline. + upload_threshold_bytes: Size threshold for using upload. + enable_cache: Whether to enable upload caching. + + Returns: + Configured FileResolver instance. 
+ """ + config = FileResolverConfig( + prefer_upload=prefer_upload, + upload_threshold_bytes=upload_threshold_bytes, + ) + + cache = UploadCache() if enable_cache else None + + return FileResolver(config=config, upload_cache=cache) diff --git a/lib/crewai/src/crewai/utilities/files/upload_cache.py b/lib/crewai/src/crewai/files/upload_cache.py similarity index 84% rename from lib/crewai/src/crewai/utilities/files/upload_cache.py rename to lib/crewai/src/crewai/files/upload_cache.py index 2a76542e7..4d3590683 100644 --- a/lib/crewai/src/crewai/utilities/files/upload_cache.py +++ b/lib/crewai/src/crewai/files/upload_cache.py @@ -5,6 +5,7 @@ from __future__ import annotations import asyncio import atexit import builtins +from collections.abc import Iterator from dataclasses import dataclass from datetime import datetime, timezone import hashlib @@ -16,7 +17,7 @@ from aiocache.serializers import PickleSerializer # type: ignore[import-untyped if TYPE_CHECKING: - from crewai.utilities.files.content_types import ( + from crewai.files.content_types import ( AudioFile, File, ImageFile, @@ -31,6 +32,7 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) DEFAULT_TTL_SECONDS = 24 * 60 * 60 # 24 hours +DEFAULT_MAX_CACHE_ENTRIES = 1000 @dataclass @@ -65,8 +67,31 @@ def _make_key(file_hash: str, provider: str) -> str: return f"upload:{provider}:{file_hash}" +def _compute_file_hash_streaming(chunks: Iterator[bytes]) -> str: + """Compute SHA-256 hash from streaming chunks. + + Args: + chunks: Iterator of byte chunks. + + Returns: + Hexadecimal hash string. + """ + hasher = hashlib.sha256() + for chunk in chunks: + hasher.update(chunk) + return hasher.hexdigest() + + def _compute_file_hash(file: FileInput) -> str: - """Compute SHA-256 hash of file content.""" + """Compute SHA-256 hash of file content. + + Uses streaming for FilePath sources to avoid loading large files into memory. + """ + from crewai.files.file import FilePath + + source = file._file_source + if isinstance(source, FilePath): + return _compute_file_hash_streaming(source.read_chunks(chunk_size=1024 * 1024)) content = file.read() return hashlib.sha256(content).hexdigest() @@ -87,6 +112,7 @@ class UploadCache: ttl: int = DEFAULT_TTL_SECONDS, namespace: str = "crewai_uploads", cache_type: str = "memory", + max_entries: int | None = DEFAULT_MAX_CACHE_ENTRIES, **cache_kwargs: Any, ) -> None: """Initialize the upload cache. @@ -95,11 +121,14 @@ class UploadCache: ttl: Default TTL in seconds. namespace: Cache namespace. cache_type: Backend type ("memory" or "redis"). + max_entries: Maximum cache entries (None for unlimited). **cache_kwargs: Additional args for cache backend. 
""" self.ttl = ttl self.namespace = namespace + self.max_entries = max_entries self._provider_keys: dict[str, set[str]] = {} + self._key_access_order: list[str] = [] if cache_type == "redis": self._cache = Cache( @@ -116,15 +145,60 @@ class UploadCache: ) def _track_key(self, provider: str, key: str) -> None: - """Track a key for a provider (for cleanup).""" + """Track a key for a provider (for cleanup) and access order.""" if provider not in self._provider_keys: self._provider_keys[provider] = set() self._provider_keys[provider].add(key) + if key in self._key_access_order: + self._key_access_order.remove(key) + self._key_access_order.append(key) def _untrack_key(self, provider: str, key: str) -> None: """Remove key tracking for a provider.""" if provider in self._provider_keys: self._provider_keys[provider].discard(key) + if key in self._key_access_order: + self._key_access_order.remove(key) + + async def _evict_if_needed(self) -> int: + """Evict oldest entries if limit exceeded. + + Returns: + Number of entries evicted. + """ + if self.max_entries is None: + return 0 + + current_count = len(self) + if current_count < self.max_entries: + return 0 + + to_evict = max(1, self.max_entries // 10) + return await self._evict_oldest(to_evict) + + async def _evict_oldest(self, count: int) -> int: + """Evict the oldest entries from the cache. + + Args: + count: Number of entries to evict. + + Returns: + Number of entries actually evicted. + """ + evicted = 0 + keys_to_evict = self._key_access_order[:count] + + for key in keys_to_evict: + await self._cache.delete(key) + self._key_access_order.remove(key) + for provider_keys in self._provider_keys.values(): + provider_keys.discard(key) + evicted += 1 + + if evicted > 0: + logger.debug(f"Evicted {evicted} oldest cache entries") + + return evicted async def aget(self, file: FileInput, provider: str) -> CachedUpload | None: """Get a cached upload for a file. @@ -214,6 +288,8 @@ class UploadCache: Returns: The created cache entry. 
""" + await self._evict_if_needed() + key = _make_key(file_hash, provider) now = datetime.now(timezone.utc) @@ -331,18 +407,15 @@ class UploadCache: return results def _run_sync(self, coro: Any) -> Any: - """Run an async coroutine from sync context.""" + """Run an async coroutine from sync context without blocking event loop.""" try: loop = asyncio.get_running_loop() except RuntimeError: loop = None if loop is not None and loop.is_running(): - import concurrent.futures - - with concurrent.futures.ThreadPoolExecutor() as pool: - future = pool.submit(asyncio.run, coro) - return future.result() + future = asyncio.run_coroutine_threadsafe(coro, loop) + return future.result(timeout=30) return asyncio.run(coro) def get(self, file: FileInput, provider: str) -> CachedUpload | None: @@ -473,7 +546,7 @@ def _cleanup_on_exit() -> None: if _default_cache is None or len(_default_cache) == 0: return - from crewai.utilities.files.cleanup import cleanup_uploaded_files + from crewai.files.cleanup import cleanup_uploaded_files try: cleanup_uploaded_files(_default_cache, delete_from_provider=True) diff --git a/lib/crewai/src/crewai/utilities/files/uploaders/__init__.py b/lib/crewai/src/crewai/files/uploaders/__init__.py similarity index 62% rename from lib/crewai/src/crewai/utilities/files/uploaders/__init__.py rename to lib/crewai/src/crewai/files/uploaders/__init__.py index 105500ac9..a091eb1b5 100644 --- a/lib/crewai/src/crewai/utilities/files/uploaders/__init__.py +++ b/lib/crewai/src/crewai/files/uploaders/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations import logging from typing import Any -from crewai.utilities.files.uploaders.base import FileUploader, UploadResult +from crewai.files.uploaders.base import FileUploader, UploadResult logger = logging.getLogger(__name__) @@ -31,7 +31,7 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None: if "gemini" in provider_lower or "google" in provider_lower: try: - from crewai.utilities.files.uploaders.gemini import GeminiFileUploader + from crewai.files.uploaders.gemini import GeminiFileUploader return GeminiFileUploader(**kwargs) except ImportError: @@ -42,7 +42,7 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None: if "anthropic" in provider_lower or "claude" in provider_lower: try: - from crewai.utilities.files.uploaders.anthropic import AnthropicFileUploader + from crewai.files.uploaders.anthropic import AnthropicFileUploader return AnthropicFileUploader(**kwargs) except ImportError: @@ -53,12 +53,32 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None: if "openai" in provider_lower or "gpt" in provider_lower: try: - from crewai.utilities.files.uploaders.openai import OpenAIFileUploader + from crewai.files.uploaders.openai import OpenAIFileUploader return OpenAIFileUploader(**kwargs) except ImportError: logger.warning("openai not installed. Install with: pip install openai") return None + if "bedrock" in provider_lower or "aws" in provider_lower: + import os + + if ( + not os.environ.get("CREWAI_BEDROCK_S3_BUCKET") + and "bucket_name" not in kwargs + ): + logger.debug( + "Bedrock S3 uploader not configured. " + "Set CREWAI_BEDROCK_S3_BUCKET environment variable to enable." + ) + return None + try: + from crewai.files.uploaders.bedrock import BedrockFileUploader + + return BedrockFileUploader(**kwargs) + except ImportError: + logger.warning("boto3 not installed. 
Install with: pip install boto3") + return None + logger.debug(f"No file uploader available for provider: {provider}") return None diff --git a/lib/crewai/src/crewai/files/uploaders/anthropic.py b/lib/crewai/src/crewai/files/uploaders/anthropic.py new file mode 100644 index 000000000..752ee790b --- /dev/null +++ b/lib/crewai/src/crewai/files/uploaders/anthropic.py @@ -0,0 +1,320 @@ +"""Anthropic Files API uploader implementation.""" + +from __future__ import annotations + +import io +import logging +import os +from typing import Any + +from crewai.files.content_types import ( + AudioFile, + File, + ImageFile, + PDFFile, + TextFile, + VideoFile, +) +from crewai.files.uploaders.base import FileUploader, UploadResult + + +logger = logging.getLogger(__name__) + +FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile + + +class AnthropicFileUploader(FileUploader): + """Uploader for Anthropic Files API. + + Uses the anthropic SDK to upload files. Files are stored persistently + until explicitly deleted. + + Attributes: + api_key: Optional API key (uses ANTHROPIC_API_KEY env var if not provided). + """ + + def __init__(self, api_key: str | None = None) -> None: + """Initialize the Anthropic uploader. + + Args: + api_key: Optional Anthropic API key. If not provided, uses + ANTHROPIC_API_KEY environment variable. + """ + self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY") + self._client: Any = None + self._async_client: Any = None + + @property + def provider_name(self) -> str: + """Return the provider name.""" + return "anthropic" + + def _get_client(self) -> Any: + """Get or create the Anthropic client.""" + if self._client is None: + try: + import anthropic + + self._client = anthropic.Anthropic(api_key=self._api_key) + except ImportError as e: + raise ImportError( + "anthropic is required for Anthropic file uploads. " + "Install with: pip install anthropic" + ) from e + return self._client + + def _get_async_client(self) -> Any: + """Get or create the async Anthropic client.""" + if self._async_client is None: + try: + import anthropic + + self._async_client = anthropic.AsyncAnthropic(api_key=self._api_key) + except ImportError as e: + raise ImportError( + "anthropic is required for Anthropic file uploads. " + "Install with: pip install anthropic" + ) from e + return self._async_client + + def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult: + """Upload a file to Anthropic. + + Args: + file: The file to upload. + purpose: Optional purpose for the file (default: "user_upload"). + + Returns: + UploadResult with the file ID and metadata. + + Raises: + TransientUploadError: For retryable errors (network, rate limits). + PermanentUploadError: For non-retryable errors (auth, validation). 
+ """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + try: + client = self._get_client() + + content = file.read() + file_purpose = purpose or "user_upload" + + file_data = io.BytesIO(content) + + logger.info( + f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)" + ) + + uploaded_file = client.files.create( + file=(file.filename, file_data, file.content_type), + purpose=file_purpose, + ) + + logger.info(f"Uploaded to Anthropic: {uploaded_file.id}") + + return UploadResult( + file_id=uploaded_file.id, + file_uri=None, + content_type=file.content_type, + expires_at=None, + provider=self.provider_name, + ) + except ImportError: + raise + except Exception as e: + error_type = type(e).__name__ + if "RateLimit" in error_type or "APIConnection" in error_type: + raise TransientUploadError( + f"Transient upload error: {e}", file_name=file.filename + ) from e + if "Authentication" in error_type or "Permission" in error_type: + raise PermanentUploadError( + f"Authentication/permission error: {e}", file_name=file.filename + ) from e + if "BadRequest" in error_type or "InvalidRequest" in error_type: + raise PermanentUploadError( + f"Invalid request: {e}", file_name=file.filename + ) from e + status_code = getattr(e, "status_code", None) + if status_code is not None: + if status_code >= 500 or status_code == 429: + raise TransientUploadError( + f"Server error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code in (401, 403): + raise PermanentUploadError( + f"Auth error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code == 400: + raise PermanentUploadError( + f"Bad request ({status_code}): {e}", file_name=file.filename + ) from e + raise TransientUploadError( + f"Upload failed: {e}", file_name=file.filename + ) from e + + def delete(self, file_id: str) -> bool: + """Delete an uploaded file from Anthropic. + + Args: + file_id: The file ID to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + try: + client = self._get_client() + client.files.delete(file_id=file_id) + logger.info(f"Deleted Anthropic file: {file_id}") + return True + except Exception as e: + logger.warning(f"Failed to delete Anthropic file {file_id}: {e}") + return False + + def get_file_info(self, file_id: str) -> dict[str, Any] | None: + """Get information about an uploaded file. + + Args: + file_id: The file ID. + + Returns: + Dictionary with file information, or None if not found. + """ + try: + client = self._get_client() + file_info = client.files.retrieve(file_id=file_id) + return { + "id": file_info.id, + "filename": file_info.filename, + "purpose": file_info.purpose, + "size_bytes": file_info.size_bytes, + "created_at": file_info.created_at, + } + except Exception as e: + logger.debug(f"Failed to get Anthropic file info for {file_id}: {e}") + return None + + def list_files(self) -> list[dict[str, Any]]: + """List all uploaded files. + + Returns: + List of dictionaries with file information. + """ + try: + client = self._get_client() + files = client.files.list() + return [ + { + "id": f.id, + "filename": f.filename, + "purpose": f.purpose, + "size_bytes": f.size_bytes, + "created_at": f.created_at, + } + for f in files.data + ] + except Exception as e: + logger.warning(f"Failed to list Anthropic files: {e}") + return [] + + async def aupload( + self, file: FileInput, purpose: str | None = None + ) -> UploadResult: + """Async upload a file to Anthropic using native async client. 
+ + Args: + file: The file to upload. + purpose: Optional purpose for the file (default: "user_upload"). + + Returns: + UploadResult with the file ID and metadata. + + Raises: + TransientUploadError: For retryable errors (network, rate limits). + PermanentUploadError: For non-retryable errors (auth, validation). + """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + try: + client = self._get_async_client() + + content = await file.aread() + file_purpose = purpose or "user_upload" + + file_data = io.BytesIO(content) + + logger.info( + f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)" + ) + + uploaded_file = await client.files.create( + file=(file.filename, file_data, file.content_type), + purpose=file_purpose, + ) + + logger.info(f"Uploaded to Anthropic: {uploaded_file.id}") + + return UploadResult( + file_id=uploaded_file.id, + file_uri=None, + content_type=file.content_type, + expires_at=None, + provider=self.provider_name, + ) + except ImportError: + raise + except Exception as e: + error_type = type(e).__name__ + if "RateLimit" in error_type or "APIConnection" in error_type: + raise TransientUploadError( + f"Transient upload error: {e}", file_name=file.filename + ) from e + if "Authentication" in error_type or "Permission" in error_type: + raise PermanentUploadError( + f"Authentication/permission error: {e}", file_name=file.filename + ) from e + if "BadRequest" in error_type or "InvalidRequest" in error_type: + raise PermanentUploadError( + f"Invalid request: {e}", file_name=file.filename + ) from e + status_code = getattr(e, "status_code", None) + if status_code is not None: + if status_code >= 500 or status_code == 429: + raise TransientUploadError( + f"Server error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code in (401, 403): + raise PermanentUploadError( + f"Auth error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code == 400: + raise PermanentUploadError( + f"Bad request ({status_code}): {e}", file_name=file.filename + ) from e + raise TransientUploadError( + f"Upload failed: {e}", file_name=file.filename + ) from e + + async def adelete(self, file_id: str) -> bool: + """Async delete an uploaded file from Anthropic. + + Args: + file_id: The file ID to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + try: + client = self._get_async_client() + await client.files.delete(file_id=file_id) + logger.info(f"Deleted Anthropic file: {file_id}") + return True + except Exception as e: + logger.warning(f"Failed to delete Anthropic file {file_id}: {e}") + return False diff --git a/lib/crewai/src/crewai/utilities/files/uploaders/base.py b/lib/crewai/src/crewai/files/uploaders/base.py similarity index 66% rename from lib/crewai/src/crewai/utilities/files/uploaders/base.py rename to lib/crewai/src/crewai/files/uploaders/base.py index dd9b27d70..47e7a5af3 100644 --- a/lib/crewai/src/crewai/utilities/files/uploaders/base.py +++ b/lib/crewai/src/crewai/files/uploaders/base.py @@ -1,11 +1,12 @@ """Base class for file uploaders.""" from abc import ABC, abstractmethod +import asyncio from dataclasses import dataclass from datetime import datetime from typing import Any -from crewai.utilities.files.content_types import ( +from crewai.files.content_types import ( AudioFile, File, ImageFile, @@ -63,6 +64,24 @@ class FileUploader(ABC): Exception: If upload fails. 
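A hedged sketch of the subclass contract, assuming upload and delete are the only abstract members (as the hunks here suggest); aupload/adelete fall back to the executor-based defaults added below. "LocalDiskUploader" is a made-up example, not part of crewAI:

    from pathlib import Path

    from crewai.files.uploaders.base import FileUploader, UploadResult


    class LocalDiskUploader(FileUploader):
        @property
        def provider_name(self) -> str:
            return "local"

        def upload(self, file, purpose=None) -> UploadResult:
            # Write the bytes to a local directory and hand back a file:// reference.
            target = Path("/tmp/uploads") / (file.filename or "file")
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_bytes(file.read())
            return UploadResult(
                file_id=str(target),
                file_uri=target.as_uri(),
                content_type=file.content_type,
                expires_at=None,
                provider=self.provider_name,
            )

        def delete(self, file_id: str) -> bool:
            Path(file_id).unlink(missing_ok=True)
            return True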
""" + async def aupload( + self, file: FileInput, purpose: str | None = None + ) -> UploadResult: + """Async upload a file to the provider. + + Default implementation runs sync upload in executor. + Override in subclasses for native async support. + + Args: + file: The file to upload. + purpose: Optional purpose/description for the upload. + + Returns: + UploadResult with the file identifier and metadata. + """ + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self.upload, file, purpose) + @abstractmethod def delete(self, file_id: str) -> bool: """Delete an uploaded file. @@ -74,6 +93,21 @@ class FileUploader(ABC): True if deletion was successful, False otherwise. """ + async def adelete(self, file_id: str) -> bool: + """Async delete an uploaded file. + + Default implementation runs sync delete in executor. + Override in subclasses for native async support. + + Args: + file_id: The file identifier to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self.delete, file_id) + def get_file_info(self, file_id: str) -> dict[str, Any] | None: """Get information about an uploaded file. diff --git a/lib/crewai/src/crewai/files/uploaders/bedrock.py b/lib/crewai/src/crewai/files/uploaders/bedrock.py new file mode 100644 index 000000000..e0fbbef4c --- /dev/null +++ b/lib/crewai/src/crewai/files/uploaders/bedrock.py @@ -0,0 +1,388 @@ +"""AWS Bedrock S3 file uploader implementation.""" + +from __future__ import annotations + +import hashlib +import logging +import os +from typing import Any + +from crewai.files.content_types import ( + AudioFile, + File, + ImageFile, + PDFFile, + TextFile, + VideoFile, +) +from crewai.files.uploaders.base import FileUploader, UploadResult + + +logger = logging.getLogger(__name__) + +FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile + + +class BedrockFileUploader(FileUploader): + """Uploader for AWS Bedrock via S3. + + Uploads files to S3 and returns S3 URIs that can be used with Bedrock's + Converse API s3Location source format. + + Attributes: + bucket_name: S3 bucket name for file uploads. + bucket_owner: Optional bucket owner account ID for cross-account access. + prefix: Optional S3 key prefix for uploaded files. + region: AWS region for the S3 bucket. + """ + + def __init__( + self, + bucket_name: str | None = None, + bucket_owner: str | None = None, + prefix: str = "crewai-files", + region: str | None = None, + ) -> None: + """Initialize the Bedrock S3 uploader. + + Args: + bucket_name: S3 bucket name. If not provided, uses + CREWAI_BEDROCK_S3_BUCKET environment variable. + bucket_owner: Optional bucket owner account ID for cross-account access. + Uses CREWAI_BEDROCK_S3_BUCKET_OWNER environment variable if not provided. + prefix: S3 key prefix for uploaded files (default: "crewai-files"). + region: AWS region. Uses AWS_REGION or AWS_DEFAULT_REGION if not provided. 
+ """ + self._bucket_name = bucket_name or os.environ.get("CREWAI_BEDROCK_S3_BUCKET") + self._bucket_owner = bucket_owner or os.environ.get( + "CREWAI_BEDROCK_S3_BUCKET_OWNER" + ) + self._prefix = prefix + self._region = region or os.environ.get( + "AWS_REGION", os.environ.get("AWS_DEFAULT_REGION") + ) + self._client: Any = None + self._async_client: Any = None + + @property + def provider_name(self) -> str: + """Return the provider name.""" + return "bedrock" + + @property + def bucket_name(self) -> str: + """Return the configured bucket name.""" + if not self._bucket_name: + raise ValueError( + "S3 bucket name not configured. Set CREWAI_BEDROCK_S3_BUCKET " + "environment variable or pass bucket_name parameter." + ) + return self._bucket_name + + @property + def bucket_owner(self) -> str | None: + """Return the configured bucket owner.""" + return self._bucket_owner + + def _get_client(self) -> Any: + """Get or create the S3 client.""" + if self._client is None: + try: + import boto3 + + self._client = boto3.client("s3", region_name=self._region) + except ImportError as e: + raise ImportError( + "boto3 is required for Bedrock S3 file uploads. " + "Install with: pip install boto3" + ) from e + return self._client + + def _get_async_client(self) -> Any: + """Get or create the async S3 client.""" + if self._async_client is None: + try: + import aioboto3 # type: ignore[import-not-found] + + self._session = aioboto3.Session() + except ImportError as e: + raise ImportError( + "aioboto3 is required for async Bedrock S3 file uploads. " + "Install with: pip install aioboto3" + ) from e + return self._session + + def _generate_s3_key(self, file: FileInput, content: bytes) -> str: + """Generate a unique S3 key for the file. + + Args: + file: The file being uploaded. + content: The file content bytes. + + Returns: + S3 key string. + """ + content_hash = hashlib.sha256(content).hexdigest()[:16] + filename = file.filename or "file" + + safe_filename = "".join( + c if c.isalnum() or c in ".-_" else "_" for c in filename + ) + return f"{self._prefix}/{content_hash}_{safe_filename}" + + def _build_s3_uri(self, key: str) -> str: + """Build an S3 URI from a key. + + Args: + key: The S3 object key. + + Returns: + S3 URI string. + """ + return f"s3://{self.bucket_name}/{key}" + + def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult: + """Upload a file to S3 for use with Bedrock. + + Args: + file: The file to upload. + purpose: Optional purpose (unused, kept for interface consistency). + + Returns: + UploadResult with the S3 URI and metadata. + + Raises: + TransientUploadError: For retryable errors (network, throttling). + PermanentUploadError: For non-retryable errors (auth, validation). 
+ """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + try: + client = self._get_client() + content = file.read() + s3_key = self._generate_s3_key(file, content) + + logger.info( + f"Uploading file '{file.filename}' to S3 bucket " + f"'{self.bucket_name}' ({len(content)} bytes)" + ) + + client.put_object( + Bucket=self.bucket_name, + Key=s3_key, + Body=content, + ContentType=file.content_type, + ) + + s3_uri = self._build_s3_uri(s3_key) + logger.info(f"Uploaded to S3: {s3_uri}") + + return UploadResult( + file_id=s3_key, + file_uri=s3_uri, + content_type=file.content_type, + expires_at=None, + provider=self.provider_name, + ) + except ImportError: + raise + except Exception as e: + error_type = type(e).__name__ + error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "") + + if error_code in ("SlowDown", "ServiceUnavailable", "InternalError"): + raise TransientUploadError( + f"Transient S3 error: {e}", file_name=file.filename + ) from e + if error_code in ( + "AccessDenied", + "InvalidAccessKeyId", + "SignatureDoesNotMatch", + ): + raise PermanentUploadError( + f"S3 authentication error: {e}", file_name=file.filename + ) from e + if error_code in ("NoSuchBucket", "InvalidBucketName"): + raise PermanentUploadError( + f"S3 bucket error: {e}", file_name=file.filename + ) from e + if "Throttl" in error_type or "Throttl" in str(e): + raise TransientUploadError( + f"S3 throttling: {e}", file_name=file.filename + ) from e + raise TransientUploadError( + f"S3 upload failed: {e}", file_name=file.filename + ) from e + + def delete(self, file_id: str) -> bool: + """Delete an uploaded file from S3. + + Args: + file_id: The S3 key to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + try: + client = self._get_client() + client.delete_object(Bucket=self.bucket_name, Key=file_id) + logger.info(f"Deleted S3 object: s3://{self.bucket_name}/{file_id}") + return True + except Exception as e: + logger.warning( + f"Failed to delete S3 object s3://{self.bucket_name}/{file_id}: {e}" + ) + return False + + def get_file_info(self, file_id: str) -> dict[str, Any] | None: + """Get information about an uploaded file. + + Args: + file_id: The S3 key. + + Returns: + Dictionary with file information, or None if not found. + """ + try: + client = self._get_client() + response = client.head_object(Bucket=self.bucket_name, Key=file_id) + return { + "id": file_id, + "uri": self._build_s3_uri(file_id), + "content_type": response.get("ContentType"), + "size": response.get("ContentLength"), + "last_modified": response.get("LastModified"), + "etag": response.get("ETag"), + } + except Exception as e: + logger.debug(f"Failed to get S3 object info for {file_id}: {e}") + return None + + def list_files(self) -> list[dict[str, Any]]: + """List all uploaded files in the configured prefix. + + Returns: + List of dictionaries with file information. + """ + try: + client = self._get_client() + response = client.list_objects_v2( + Bucket=self.bucket_name, + Prefix=self._prefix, + ) + return [ + { + "id": obj["Key"], + "uri": self._build_s3_uri(obj["Key"]), + "size": obj.get("Size"), + "last_modified": obj.get("LastModified"), + "etag": obj.get("ETag"), + } + for obj in response.get("Contents", []) + ] + except Exception as e: + logger.warning(f"Failed to list S3 objects: {e}") + return [] + + async def aupload( + self, file: FileInput, purpose: str | None = None + ) -> UploadResult: + """Async upload a file to S3 for use with Bedrock. 
+ + Args: + file: The file to upload. + purpose: Optional purpose (unused, kept for interface consistency). + + Returns: + UploadResult with the S3 URI and metadata. + + Raises: + TransientUploadError: For retryable errors (network, throttling). + PermanentUploadError: For non-retryable errors (auth, validation). + """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + try: + session = self._get_async_client() + content = await file.aread() + s3_key = self._generate_s3_key(file, content) + + logger.info( + f"Uploading file '{file.filename}' to S3 bucket " + f"'{self.bucket_name}' ({len(content)} bytes)" + ) + + async with session.client("s3", region_name=self._region) as client: + await client.put_object( + Bucket=self.bucket_name, + Key=s3_key, + Body=content, + ContentType=file.content_type, + ) + + s3_uri = self._build_s3_uri(s3_key) + logger.info(f"Uploaded to S3: {s3_uri}") + + return UploadResult( + file_id=s3_key, + file_uri=s3_uri, + content_type=file.content_type, + expires_at=None, + provider=self.provider_name, + ) + except ImportError: + raise + except Exception as e: + error_type = type(e).__name__ + error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "") + + if error_code in ("SlowDown", "ServiceUnavailable", "InternalError"): + raise TransientUploadError( + f"Transient S3 error: {e}", file_name=file.filename + ) from e + if error_code in ( + "AccessDenied", + "InvalidAccessKeyId", + "SignatureDoesNotMatch", + ): + raise PermanentUploadError( + f"S3 authentication error: {e}", file_name=file.filename + ) from e + if error_code in ("NoSuchBucket", "InvalidBucketName"): + raise PermanentUploadError( + f"S3 bucket error: {e}", file_name=file.filename + ) from e + if "Throttl" in error_type or "Throttl" in str(e): + raise TransientUploadError( + f"S3 throttling: {e}", file_name=file.filename + ) from e + raise TransientUploadError( + f"S3 upload failed: {e}", file_name=file.filename + ) from e + + async def adelete(self, file_id: str) -> bool: + """Async delete an uploaded file from S3. + + Args: + file_id: The S3 key to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + try: + session = self._get_async_client() + async with session.client("s3", region_name=self._region) as client: + await client.delete_object(Bucket=self.bucket_name, Key=file_id) + logger.info(f"Deleted S3 object: s3://{self.bucket_name}/{file_id}") + return True + except Exception as e: + logger.warning( + f"Failed to delete S3 object s3://{self.bucket_name}/{file_id}: {e}" + ) + return False diff --git a/lib/crewai/src/crewai/files/uploaders/gemini.py b/lib/crewai/src/crewai/files/uploaders/gemini.py new file mode 100644 index 000000000..eb6d82215 --- /dev/null +++ b/lib/crewai/src/crewai/files/uploaders/gemini.py @@ -0,0 +1,444 @@ +"""Gemini File API uploader implementation.""" + +from __future__ import annotations + +import asyncio +from datetime import datetime, timedelta, timezone +import io +import logging +import os +import random +import time +from typing import Any + +from crewai.files.content_types import ( + AudioFile, + File, + ImageFile, + PDFFile, + TextFile, + VideoFile, +) +from crewai.files.uploaders.base import FileUploader, UploadResult + + +logger = logging.getLogger(__name__) + +FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile + +GEMINI_FILE_TTL = timedelta(hours=48) + + +class GeminiFileUploader(FileUploader): + """Uploader for Google Gemini File API. 
+ + Uses the google-genai SDK to upload files. Files are stored for 48 hours. + + Attributes: + api_key: Optional API key (uses GOOGLE_API_KEY env var if not provided). + """ + + def __init__(self, api_key: str | None = None) -> None: + """Initialize the Gemini uploader. + + Args: + api_key: Optional Google API key. If not provided, uses + GOOGLE_API_KEY environment variable. + """ + self._api_key = api_key or os.environ.get("GOOGLE_API_KEY") + self._client: Any = None + + @property + def provider_name(self) -> str: + """Return the provider name.""" + return "gemini" + + def _get_client(self) -> Any: + """Get or create the Gemini client.""" + if self._client is None: + try: + from google import genai + + self._client = genai.Client(api_key=self._api_key) + except ImportError as e: + raise ImportError( + "google-genai is required for Gemini file uploads. " + "Install with: pip install google-genai" + ) from e + return self._client + + def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult: + """Upload a file to Gemini. + + Args: + file: The file to upload. + purpose: Optional purpose/description (used as display name). + + Returns: + UploadResult with the file URI and metadata. + + Raises: + TransientUploadError: For retryable errors (network, rate limits). + PermanentUploadError: For non-retryable errors (auth, validation). + """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + try: + client = self._get_client() + + content = file.read() + display_name = purpose or file.filename + + file_data = io.BytesIO(content) + file_data.name = file.filename + + logger.info( + f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)" + ) + + uploaded_file = client.files.upload( + file=file_data, + config={ + "display_name": display_name, + "mime_type": file.content_type, + }, + ) + + if file.content_type.startswith("video/"): + if not self.wait_for_processing(uploaded_file.name): + raise PermanentUploadError( + f"Video processing failed for {file.filename}", + file_name=file.filename, + ) + + expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL + + logger.info( + f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})" + ) + + return UploadResult( + file_id=uploaded_file.name, + file_uri=uploaded_file.uri, + content_type=file.content_type, + expires_at=expires_at, + provider=self.provider_name, + ) + except ImportError: + raise + except (TransientUploadError, PermanentUploadError): + raise + except Exception as e: + error_msg = str(e).lower() + if "quota" in error_msg or "rate" in error_msg or "limit" in error_msg: + raise TransientUploadError( + f"Rate limit error: {e}", file_name=file.filename + ) from e + if ( + "auth" in error_msg + or "permission" in error_msg + or "denied" in error_msg + ): + raise PermanentUploadError( + f"Authentication/permission error: {e}", file_name=file.filename + ) from e + if "invalid" in error_msg or "unsupported" in error_msg: + raise PermanentUploadError( + f"Invalid request: {e}", file_name=file.filename + ) from e + status_code = getattr(e, "code", None) or getattr(e, "status_code", None) + if status_code is not None: + if isinstance(status_code, int): + if status_code >= 500 or status_code == 429: + raise TransientUploadError( + f"Server error ({status_code}): {e}", + file_name=file.filename, + ) from e + if status_code in (401, 403): + raise PermanentUploadError( + f"Auth error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code == 
400: + raise PermanentUploadError( + f"Bad request ({status_code}): {e}", file_name=file.filename + ) from e + raise TransientUploadError( + f"Upload failed: {e}", file_name=file.filename + ) from e + + async def aupload( + self, file: FileInput, purpose: str | None = None + ) -> UploadResult: + """Async upload a file to Gemini using native async client. + + Uses async wait_for_processing for video files. + + Args: + file: The file to upload. + purpose: Optional purpose/description (used as display name). + + Returns: + UploadResult with the file URI and metadata. + + Raises: + TransientUploadError: For retryable errors (network, rate limits). + PermanentUploadError: For non-retryable errors (auth, validation). + """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + try: + client = self._get_client() + + content = await file.aread() + display_name = purpose or file.filename + + file_data = io.BytesIO(content) + file_data.name = file.filename + + logger.info( + f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)" + ) + + uploaded_file = await client.aio.files.upload( + file=file_data, + config={ + "display_name": display_name, + "mime_type": file.content_type, + }, + ) + + if file.content_type.startswith("video/"): + if not await self.await_for_processing(uploaded_file.name): + raise PermanentUploadError( + f"Video processing failed for {file.filename}", + file_name=file.filename, + ) + + expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL + + logger.info( + f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})" + ) + + return UploadResult( + file_id=uploaded_file.name, + file_uri=uploaded_file.uri, + content_type=file.content_type, + expires_at=expires_at, + provider=self.provider_name, + ) + except ImportError: + raise + except (TransientUploadError, PermanentUploadError): + raise + except Exception as e: + error_msg = str(e).lower() + if "quota" in error_msg or "rate" in error_msg or "limit" in error_msg: + raise TransientUploadError( + f"Rate limit error: {e}", file_name=file.filename + ) from e + if ( + "auth" in error_msg + or "permission" in error_msg + or "denied" in error_msg + ): + raise PermanentUploadError( + f"Authentication/permission error: {e}", file_name=file.filename + ) from e + if "invalid" in error_msg or "unsupported" in error_msg: + raise PermanentUploadError( + f"Invalid request: {e}", file_name=file.filename + ) from e + status_code = getattr(e, "code", None) or getattr(e, "status_code", None) + if status_code is not None and isinstance(status_code, int): + if status_code >= 500 or status_code == 429: + raise TransientUploadError( + f"Server error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code in (401, 403): + raise PermanentUploadError( + f"Auth error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code == 400: + raise PermanentUploadError( + f"Bad request ({status_code}): {e}", file_name=file.filename + ) from e + raise TransientUploadError( + f"Upload failed: {e}", file_name=file.filename + ) from e + + def delete(self, file_id: str) -> bool: + """Delete an uploaded file from Gemini. + + Args: + file_id: The file name/ID to delete. + + Returns: + True if deletion was successful, False otherwise. 
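A usage sketch for the Gemini uploader; VideoFile(path=...) is an assumed constructor and GOOGLE_API_KEY must be set:

    from crewai.files import VideoFile
    from crewai.files.uploaders.gemini import GeminiFileUploader

    uploader = GeminiFileUploader()
    result = uploader.upload(VideoFile(path="demo.mp4"))
    # Video uploads block in wait_for_processing() until the file reaches ACTIVE;
    # the returned URI expires roughly 48 hours later (GEMINI_FILE_TTL).
    print(result.file_uri, result.expires_at)
    uploader.delete(result.file_id)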
+ """ + try: + client = self._get_client() + client.files.delete(name=file_id) + logger.info(f"Deleted Gemini file: {file_id}") + return True + except Exception as e: + logger.warning(f"Failed to delete Gemini file {file_id}: {e}") + return False + + async def adelete(self, file_id: str) -> bool: + """Async delete an uploaded file from Gemini. + + Args: + file_id: The file name/ID to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + try: + client = self._get_client() + await client.aio.files.delete(name=file_id) + logger.info(f"Deleted Gemini file: {file_id}") + return True + except Exception as e: + logger.warning(f"Failed to delete Gemini file {file_id}: {e}") + return False + + def get_file_info(self, file_id: str) -> dict[str, Any] | None: + """Get information about an uploaded file. + + Args: + file_id: The file name/ID. + + Returns: + Dictionary with file information, or None if not found. + """ + try: + client = self._get_client() + file_info = client.files.get(name=file_id) + return { + "name": file_info.name, + "uri": file_info.uri, + "display_name": file_info.display_name, + "mime_type": file_info.mime_type, + "size_bytes": file_info.size_bytes, + "state": str(file_info.state), + "create_time": file_info.create_time, + "expiration_time": file_info.expiration_time, + } + except Exception as e: + logger.debug(f"Failed to get Gemini file info for {file_id}: {e}") + return None + + def list_files(self) -> list[dict[str, Any]]: + """List all uploaded files. + + Returns: + List of dictionaries with file information. + """ + try: + client = self._get_client() + files = client.files.list() + return [ + { + "name": f.name, + "uri": f.uri, + "display_name": f.display_name, + "mime_type": f.mime_type, + "size_bytes": f.size_bytes, + "state": str(f.state), + } + for f in files + ] + except Exception as e: + logger.warning(f"Failed to list Gemini files: {e}") + return [] + + def wait_for_processing(self, file_id: str, timeout_seconds: int = 300) -> bool: + """Wait for a file to finish processing with exponential backoff. + + Some files (especially videos) need time to process after upload. + + Args: + file_id: The file name/ID. + timeout_seconds: Maximum time to wait. + + Returns: + True if processing completed, False if timed out or failed. + """ + try: + from google.genai.types import FileState + except ImportError: + return True + + client = self._get_client() + start_time = time.time() + base_delay = 1.0 + max_delay = 30.0 + attempt = 0 + + while time.time() - start_time < timeout_seconds: + file_info = client.files.get(name=file_id) + + if file_info.state == FileState.ACTIVE: + return True + + if file_info.state == FileState.FAILED: + logger.error(f"Gemini file processing failed: {file_id}") + return False + + delay = min(base_delay * (2**attempt), max_delay) + jitter = random.uniform(0, delay * 0.1) # noqa: S311 + time.sleep(delay + jitter) + attempt += 1 + + logger.warning(f"Timed out waiting for Gemini file processing: {file_id}") + return False + + async def await_for_processing( + self, file_id: str, timeout_seconds: int = 300 + ) -> bool: + """Async wait for a file to finish processing with exponential backoff. + + Some files (especially videos) need time to process after upload. + + Args: + file_id: The file name/ID. + timeout_seconds: Maximum time to wait. + + Returns: + True if processing completed, False if timed out or failed. 
+ """ + try: + from google.genai.types import FileState + except ImportError: + return True + + client = self._get_client() + start_time = time.time() + base_delay = 1.0 + max_delay = 30.0 + attempt = 0 + + while time.time() - start_time < timeout_seconds: + file_info = await client.aio.files.get(name=file_id) + + if file_info.state == FileState.ACTIVE: + return True + + if file_info.state == FileState.FAILED: + logger.error(f"Gemini file processing failed: {file_id}") + return False + + delay = min(base_delay * (2**attempt), max_delay) + jitter = random.uniform(0, delay * 0.1) # noqa: S311 + await asyncio.sleep(delay + jitter) + attempt += 1 + + logger.warning(f"Timed out waiting for Gemini file processing: {file_id}") + return False diff --git a/lib/crewai/src/crewai/files/uploaders/openai.py b/lib/crewai/src/crewai/files/uploaders/openai.py new file mode 100644 index 000000000..1bc47cc6f --- /dev/null +++ b/lib/crewai/src/crewai/files/uploaders/openai.py @@ -0,0 +1,324 @@ +"""OpenAI Files API uploader implementation.""" + +from __future__ import annotations + +import io +import logging +import os +from typing import Any + +from crewai.files.content_types import ( + AudioFile, + File, + ImageFile, + PDFFile, + TextFile, + VideoFile, +) +from crewai.files.uploaders.base import FileUploader, UploadResult + + +logger = logging.getLogger(__name__) + +FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile + + +class OpenAIFileUploader(FileUploader): + """Uploader for OpenAI Files API. + + Uses the OpenAI SDK to upload files. Files are stored persistently + until explicitly deleted. + + Attributes: + api_key: Optional API key (uses OPENAI_API_KEY env var if not provided). + """ + + def __init__(self, api_key: str | None = None) -> None: + """Initialize the OpenAI uploader. + + Args: + api_key: Optional OpenAI API key. If not provided, uses + OPENAI_API_KEY environment variable. + """ + self._api_key = api_key or os.environ.get("OPENAI_API_KEY") + self._client: Any = None + self._async_client: Any = None + + @property + def provider_name(self) -> str: + """Return the provider name.""" + return "openai" + + def _get_client(self) -> Any: + """Get or create the OpenAI client.""" + if self._client is None: + try: + from openai import OpenAI + + self._client = OpenAI(api_key=self._api_key) + except ImportError as e: + raise ImportError( + "openai is required for OpenAI file uploads. " + "Install with: pip install openai" + ) from e + return self._client + + def _get_async_client(self) -> Any: + """Get or create the async OpenAI client.""" + if self._async_client is None: + try: + from openai import AsyncOpenAI + + self._async_client = AsyncOpenAI(api_key=self._api_key) + except ImportError as e: + raise ImportError( + "openai is required for OpenAI file uploads. " + "Install with: pip install openai" + ) from e + return self._async_client + + def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult: + """Upload a file to OpenAI. + + Args: + file: The file to upload. + purpose: Optional purpose for the file (default: "user_data"). + + Returns: + UploadResult with the file ID and metadata. + + Raises: + TransientUploadError: For retryable errors (network, rate limits). + PermanentUploadError: For non-retryable errors (auth, validation). 
+ """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + try: + client = self._get_client() + + content = file.read() + file_purpose = purpose or "user_data" + + file_data = io.BytesIO(content) + file_data.name = file.filename or "file" + + logger.info( + f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)" + ) + + uploaded_file = client.files.create( + file=file_data, + purpose=file_purpose, + ) + + logger.info(f"Uploaded to OpenAI: {uploaded_file.id}") + + return UploadResult( + file_id=uploaded_file.id, + file_uri=None, + content_type=file.content_type, + expires_at=None, + provider=self.provider_name, + ) + except ImportError: + raise + except Exception as e: + error_type = type(e).__name__ + if "RateLimit" in error_type or "APIConnection" in error_type: + raise TransientUploadError( + f"Transient upload error: {e}", file_name=file.filename + ) from e + if "Authentication" in error_type or "Permission" in error_type: + raise PermanentUploadError( + f"Authentication/permission error: {e}", file_name=file.filename + ) from e + if "BadRequest" in error_type or "InvalidRequest" in error_type: + raise PermanentUploadError( + f"Invalid request: {e}", file_name=file.filename + ) from e + status_code = getattr(e, "status_code", None) + if status_code is not None: + if status_code >= 500 or status_code == 429: + raise TransientUploadError( + f"Server error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code in (401, 403): + raise PermanentUploadError( + f"Auth error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code == 400: + raise PermanentUploadError( + f"Bad request ({status_code}): {e}", file_name=file.filename + ) from e + raise TransientUploadError( + f"Upload failed: {e}", file_name=file.filename + ) from e + + def delete(self, file_id: str) -> bool: + """Delete an uploaded file from OpenAI. + + Args: + file_id: The file ID to delete. + + Returns: + True if deletion was successful, False otherwise. + """ + try: + client = self._get_client() + client.files.delete(file_id) + logger.info(f"Deleted OpenAI file: {file_id}") + return True + except Exception as e: + logger.warning(f"Failed to delete OpenAI file {file_id}: {e}") + return False + + def get_file_info(self, file_id: str) -> dict[str, Any] | None: + """Get information about an uploaded file. + + Args: + file_id: The file ID. + + Returns: + Dictionary with file information, or None if not found. + """ + try: + client = self._get_client() + file_info = client.files.retrieve(file_id) + return { + "id": file_info.id, + "filename": file_info.filename, + "purpose": file_info.purpose, + "bytes": file_info.bytes, + "created_at": file_info.created_at, + "status": file_info.status, + } + except Exception as e: + logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}") + return None + + def list_files(self) -> list[dict[str, Any]]: + """List all uploaded files. + + Returns: + List of dictionaries with file information. + """ + try: + client = self._get_client() + files = client.files.list() + return [ + { + "id": f.id, + "filename": f.filename, + "purpose": f.purpose, + "bytes": f.bytes, + "created_at": f.created_at, + "status": f.status, + } + for f in files.data + ] + except Exception as e: + logger.warning(f"Failed to list OpenAI files: {e}") + return [] + + async def aupload( + self, file: FileInput, purpose: str | None = None + ) -> UploadResult: + """Async upload a file to OpenAI using native async client. 
+ + Args: + file: The file to upload. + purpose: Optional purpose for the file (default: "user_data"). + + Returns: + UploadResult with the file ID and metadata. + + Raises: + TransientUploadError: For retryable errors (network, rate limits). + PermanentUploadError: For non-retryable errors (auth, validation). + """ + from crewai.files.processing.exceptions import ( + PermanentUploadError, + TransientUploadError, + ) + + try: + client = self._get_async_client() + + content = await file.aread() + file_purpose = purpose or "user_data" + + file_data = io.BytesIO(content) + file_data.name = file.filename or "file" + + logger.info( + f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)" + ) + + uploaded_file = await client.files.create( + file=file_data, + purpose=file_purpose, + ) + + logger.info(f"Uploaded to OpenAI: {uploaded_file.id}") + + return UploadResult( + file_id=uploaded_file.id, + file_uri=None, + content_type=file.content_type, + expires_at=None, + provider=self.provider_name, + ) + except ImportError: + raise + except Exception as e: + error_type = type(e).__name__ + if "RateLimit" in error_type or "APIConnection" in error_type: + raise TransientUploadError( + f"Transient upload error: {e}", file_name=file.filename + ) from e + if "Authentication" in error_type or "Permission" in error_type: + raise PermanentUploadError( + f"Authentication/permission error: {e}", file_name=file.filename + ) from e + if "BadRequest" in error_type or "InvalidRequest" in error_type: + raise PermanentUploadError( + f"Invalid request: {e}", file_name=file.filename + ) from e + status_code = getattr(e, "status_code", None) + if status_code is not None: + if status_code >= 500 or status_code == 429: + raise TransientUploadError( + f"Server error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code in (401, 403): + raise PermanentUploadError( + f"Auth error ({status_code}): {e}", file_name=file.filename + ) from e + if status_code == 400: + raise PermanentUploadError( + f"Bad request ({status_code}): {e}", file_name=file.filename + ) from e + raise TransientUploadError( + f"Upload failed: {e}", file_name=file.filename + ) from e + + async def adelete(self, file_id: str) -> bool: + """Async delete an uploaded file from OpenAI. + + Args: + file_id: The file ID to delete. + + Returns: + True if deletion was successful, False otherwise. 
+ """ + try: + client = self._get_async_client() + await client.files.delete(file_id) + logger.info(f"Deleted OpenAI file: {file_id}") + return True + except Exception as e: + logger.warning(f"Failed to delete OpenAI file {file_id}: {e}") + return False diff --git a/lib/crewai/src/crewai/llm.py b/lib/crewai/src/crewai/llm.py index 4a7984b95..9a6314126 100644 --- a/lib/crewai/src/crewai/llm.py +++ b/lib/crewai/src/crewai/llm.py @@ -66,11 +66,11 @@ if TYPE_CHECKING: from litellm.utils import supports_response_schema from crewai.agent.core import Agent + from crewai.files import FileInput, UploadCache from crewai.llms.hooks.base import BaseInterceptor from crewai.llms.providers.anthropic.completion import AnthropicThinkingConfig from crewai.task import Task from crewai.tools.base_tool import BaseTool - from crewai.utilities.files import FileInput, UploadCache from crewai.utilities.types import LLMMessage try: @@ -2274,7 +2274,7 @@ class LLM(BaseLLM): """ import base64 - from crewai.utilities.files import ( + from crewai.files import ( FileResolver, FileResolverConfig, InlineBase64, diff --git a/lib/crewai/src/crewai/llms/base_llm.py b/lib/crewai/src/crewai/llms/base_llm.py index a3ed8a547..a87bfc70c 100644 --- a/lib/crewai/src/crewai/llms/base_llm.py +++ b/lib/crewai/src/crewai/llms/base_llm.py @@ -33,9 +33,9 @@ from crewai.types.usage_metrics import UsageMetrics if TYPE_CHECKING: from crewai.agent.core import Agent + from crewai.files import FileInput, UploadCache from crewai.task import Task from crewai.tools.base_tool import BaseTool - from crewai.utilities.files import FileInput, UploadCache from crewai.utilities.types import LLMMessage @@ -315,6 +315,25 @@ class BaseLLM(ABC): """ return [] + async def aformat_multimodal_content( + self, + files: dict[str, FileInput], + upload_cache: UploadCache | None = None, + ) -> list[dict[str, Any]]: + """Async format files as multimodal content blocks for the LLM. + + Default implementation calls the sync version. Subclasses should + override to use async file resolution for parallel processing. + + Args: + files: Dictionary mapping file names to FileInput objects. + upload_cache: Optional cache for tracking uploaded files. + + Returns: + List of content blocks in the provider's expected format. + """ + return self.format_multimodal_content(files, upload_cache) + def format_text_content(self, text: str) -> dict[str, Any]: """Format text as a content block for the LLM. 
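With the BaseLLM default in place, callers can always await aformat_multimodal_content regardless of provider: subclasses that override it (see the provider diffs below) resolve files in parallel, while the base class falls back to the sync formatter. A hedged usage sketch, where llm stands for any BaseLLM subclass and files for a dict of FileInput objects; both are placeholders, and in practice the crew executor performs this wiring:

from crewai.files import get_upload_cache


async def build_multimodal_blocks(llm, files):
    # Skip providers that cannot accept file content at all.
    if not llm.supports_multimodal():
        return []
    # Provider overrides resolve files concurrently; the BaseLLM default
    # simply delegates to format_multimodal_content under the hood.
    return await llm.aformat_multimodal_content(files, upload_cache=get_upload_cache())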
diff --git a/lib/crewai/src/crewai/llms/providers/anthropic/completion.py b/lib/crewai/src/crewai/llms/providers/anthropic/completion.py index 658d3fc66..1a62dca0e 100644 --- a/lib/crewai/src/crewai/llms/providers/anthropic/completion.py +++ b/lib/crewai/src/crewai/llms/providers/anthropic/completion.py @@ -20,8 +20,8 @@ from crewai.utilities.types import LLMMessage if TYPE_CHECKING: + from crewai.files import FileInput, UploadCache from crewai.llms.hooks.base import BaseInterceptor - from crewai.utilities.files import FileInput, UploadCache DEFAULT_CACHE_TTL = "ephemeral" @@ -1281,7 +1281,7 @@ class AnthropicCompletion(BaseLLM): if not self.supports_multimodal(): return [] - from crewai.utilities.files import ( + from crewai.files import ( FileReference, FileResolver, FileResolverConfig, @@ -1370,3 +1370,107 @@ class AnthropicCompletion(BaseLLM): content_blocks.append(block) return content_blocks + + async def aformat_multimodal_content( + self, + files: dict[str, FileInput], + upload_cache: UploadCache | None = None, + enable_caching: bool = True, + cache_ttl: str | None = None, + ) -> list[dict[str, Any]]: + """Async format files as Anthropic multimodal content blocks. + + Uses parallel file resolution for improved performance with multiple files. + + Args: + files: Dictionary mapping file names to FileInput objects. + upload_cache: Optional cache for tracking uploaded files. + enable_caching: Whether to add cache_control markers (default: True). + cache_ttl: Cache TTL - "ephemeral" (5min) or "1h" (1hr for supported models). + + Returns: + List of content blocks in Anthropic's expected format. + """ + if not self.supports_multimodal(): + return [] + + from crewai.files import ( + FileReference, + FileResolver, + FileResolverConfig, + InlineBase64, + ) + + supported_types = self.supported_multimodal_content_types() + + supported_files = { + name: f + for name, f in files.items() + if any(f.content_type.startswith(t) for t in supported_types) + } + + if not supported_files: + return [] + + config = FileResolverConfig(prefer_upload=False) + resolver = FileResolver(config=config, upload_cache=upload_cache) + resolved_files = await resolver.aresolve_files(supported_files, "anthropic") + + content_blocks: list[dict[str, Any]] = [] + num_files = len(resolved_files) + file_names = list(supported_files.keys()) + + for i, name in enumerate(file_names): + if name not in resolved_files: + continue + + resolved = resolved_files[name] + file_input = supported_files[name] + content_type = file_input.content_type + block: dict[str, Any] = {} + + if isinstance(resolved, FileReference): + if content_type.startswith("image/"): + block = { + "type": "image", + "source": { + "type": "file", + "file_id": resolved.file_id, + }, + } + elif content_type == "application/pdf": + block = { + "type": "document", + "source": { + "type": "file", + "file_id": resolved.file_id, + }, + } + elif isinstance(resolved, InlineBase64): + if content_type.startswith("image/"): + block = { + "type": "image", + "source": { + "type": "base64", + "media_type": resolved.content_type, + "data": resolved.data, + }, + } + elif content_type == "application/pdf": + block = { + "type": "document", + "source": { + "type": "base64", + "media_type": resolved.content_type, + "data": resolved.data, + }, + } + + if block and enable_caching and i == num_files - 1: + cache_control: dict[str, str] = {"type": cache_ttl or DEFAULT_CACHE_TTL} + block["cache_control"] = cache_control + + if block: + content_blocks.append(block) + + return 
content_blocks diff --git a/lib/crewai/src/crewai/llms/providers/azure/completion.py b/lib/crewai/src/crewai/llms/providers/azure/completion.py index 8efd83405..e14e0b42c 100644 --- a/lib/crewai/src/crewai/llms/providers/azure/completion.py +++ b/lib/crewai/src/crewai/llms/providers/azure/completion.py @@ -18,8 +18,8 @@ from crewai.utilities.types import LLMMessage if TYPE_CHECKING: + from crewai.files import FileInput, UploadCache from crewai.llms.hooks.base import BaseInterceptor - from crewai.utilities.files import FileInput, UploadCache try: @@ -1060,7 +1060,7 @@ class AzureCompletion(BaseLLM): if not self.supports_multimodal(): return [] - from crewai.utilities.files import ( + from crewai.files import ( FileResolver, FileResolverConfig, InlineBase64, @@ -1100,3 +1100,54 @@ class AzureCompletion(BaseLLM): ) return content_blocks + + async def aformat_multimodal_content( + self, + files: dict[str, FileInput], + upload_cache: UploadCache | None = None, + ) -> list[dict[str, Any]]: + """Async format files as Azure OpenAI multimodal content blocks. + + Uses parallel file resolution for improved performance with multiple files. + + Args: + files: Dictionary mapping file names to FileInput objects. + upload_cache: Optional cache (not used by Azure but kept for interface consistency). + + Returns: + List of content blocks in Azure OpenAI's expected format. + """ + if not self.supports_multimodal(): + return [] + + from crewai.files import ( + FileResolver, + FileResolverConfig, + InlineBase64, + ) + + supported_types = self.supported_multimodal_content_types() + + supported_files = { + name: f + for name, f in files.items() + if any(f.content_type.startswith(t) for t in supported_types) + } + + if not supported_files: + return [] + + config = FileResolverConfig(prefer_upload=False) + resolver = FileResolver(config=config, upload_cache=upload_cache) + resolved_files = await resolver.aresolve_files(supported_files, "azure") + + return [ + { + "type": "image_url", + "image_url": { + "url": f"data:{resolved.content_type};base64,{resolved.data}" + }, + } + for resolved in resolved_files.values() + if isinstance(resolved, InlineBase64) + ] diff --git a/lib/crewai/src/crewai/llms/providers/bedrock/completion.py b/lib/crewai/src/crewai/llms/providers/bedrock/completion.py index 6ffddb791..f62652efd 100644 --- a/lib/crewai/src/crewai/llms/providers/bedrock/completion.py +++ b/lib/crewai/src/crewai/llms/providers/bedrock/completion.py @@ -32,8 +32,8 @@ if TYPE_CHECKING: ToolTypeDef, ) + from crewai.files import FileInput, UploadCache from crewai.llms.hooks.base import BaseInterceptor - from crewai.utilities.files import FileInput, UploadCache try: @@ -1455,13 +1455,33 @@ class BedrockCompletion(BaseLLM): def supports_multimodal(self) -> bool: """Check if the model supports multimodal inputs. - Claude models on Bedrock support vision. + Claude 3+ and Nova Lite/Pro/Premier on Bedrock support vision. Returns: True if the model supports images. """ - vision_models = ("anthropic.claude-3",) - return any(self.model.lower().startswith(m) for m in vision_models) + model_lower = self.model.lower() + vision_models = ( + "anthropic.claude-3", + "amazon.nova-lite", + "amazon.nova-pro", + "amazon.nova-premier", + "us.amazon.nova-lite", + "us.amazon.nova-pro", + "us.amazon.nova-premier", + ) + return any(model_lower.startswith(m) for m in vision_models) + + def _is_nova_model(self) -> bool: + """Check if the model is an Amazon Nova model. + + Only Nova models support S3 links for multimedia. 
+ + Returns: + True if the model is a Nova model. + """ + model_lower = self.model.lower() + return "amazon.nova-" in model_lower def supported_multimodal_content_types(self) -> list[str]: """Get content types supported by Bedrock for multimodal input. @@ -1471,7 +1491,78 @@ class BedrockCompletion(BaseLLM): """ if not self.supports_multimodal(): return [] - return ["image/", "application/pdf"] + + types = ["image/png", "image/jpeg", "image/gif", "image/webp"] + + if self._is_nova_model(): + types.extend( + [ + "application/pdf", + "text/csv", + "text/plain", + "text/markdown", + "text/html", + "application/msword", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "video/mp4", + "video/quicktime", + "video/x-matroska", + "video/webm", + "video/x-flv", + "video/mpeg", + "video/x-ms-wmv", + "video/3gpp", + ] + ) + else: + types.append("application/pdf") + + return types + + def _get_document_format(self, content_type: str) -> str | None: + """Map content type to Bedrock document format. + + Args: + content_type: MIME type of the document. + + Returns: + Bedrock format string or None if unsupported. + """ + format_map = { + "application/pdf": "pdf", + "text/csv": "csv", + "text/plain": "txt", + "text/markdown": "md", + "text/html": "html", + "application/msword": "doc", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx", + "application/vnd.ms-excel": "xls", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx", + } + return format_map.get(content_type) + + def _get_video_format(self, content_type: str) -> str | None: + """Map content type to Bedrock video format. + + Args: + content_type: MIME type of the video. + + Returns: + Bedrock format string or None if unsupported. + """ + format_map = { + "video/mp4": "mp4", + "video/quicktime": "mov", + "video/x-matroska": "mkv", + "video/webm": "webm", + "video/x-flv": "flv", + "video/mpeg": "mpeg", + "video/x-ms-wmv": "wmv", + "video/3gpp": "three_gp", + } + return format_map.get(content_type) def format_multimodal_content( self, @@ -1480,12 +1571,12 @@ class BedrockCompletion(BaseLLM): ) -> list[dict[str, Any]]: """Format files as Bedrock Converse API multimodal content blocks. - Bedrock Converse API uses specific formats for images and documents with raw bytes. - Uses FileResolver to get InlineBytes format for Bedrock's byte-based API. + Bedrock Converse API supports both raw bytes and S3 URI references. + S3 uploads are only supported by Amazon Nova models. Args: files: Dictionary mapping file names to FileInput objects. - upload_cache: Optional cache (not used by Bedrock but kept for interface consistency). + upload_cache: Optional cache for S3 uploads. Returns: List of content blocks in Bedrock's expected format. 
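The hunk that follows wires in optional S3 delivery: when CREWAI_BEDROCK_S3_BUCKET is set and the model is a Nova variant, the resolver prefers uploads and the formatter emits s3Location sources instead of raw bytes. A hedged sketch of the expected configuration and resulting block shape; the bucket name, account ID, and file URI are hypothetical:

import os

# Illustrative values only; both variables are read inside the formatter.
os.environ["CREWAI_BEDROCK_S3_BUCKET"] = "my-crew-files"
os.environ["CREWAI_BEDROCK_S3_BUCKET_OWNER"] = "123456789012"  # optional cross-account owner

# For a Nova model (e.g. "amazon.nova-pro") and an uploaded MP4, the content
# block is expected to carry an S3 reference rather than inline bytes:
expected_block = {
    "video": {
        "format": "mp4",
        "source": {
            "s3Location": {
                "uri": "s3://my-crew-files/clip.mp4",
                "bucketOwner": "123456789012",
            }
        },
    }
}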
@@ -1493,50 +1584,239 @@ class BedrockCompletion(BaseLLM): if not self.supports_multimodal(): return [] - from crewai.utilities.files import ( + import os + + from crewai.files import ( + FileReference, FileResolver, FileResolverConfig, InlineBytes, ) content_blocks: list[dict[str, Any]] = [] + is_nova = self._is_nova_model() - # Bedrock uses raw bytes, configure resolver accordingly - config = FileResolverConfig(prefer_upload=False, use_bytes_for_bedrock=True) + s3_bucket = os.environ.get("CREWAI_BEDROCK_S3_BUCKET") + s3_bucket_owner = os.environ.get("CREWAI_BEDROCK_S3_BUCKET_OWNER") + prefer_upload = bool(s3_bucket) and is_nova + + config = FileResolverConfig( + prefer_upload=prefer_upload, use_bytes_for_bedrock=True + ) resolver = FileResolver(config=config, upload_cache=upload_cache) for name, file_input in files.items(): content_type = file_input.content_type - resolved = resolver.resolve(file_input, "bedrock") - if isinstance(resolved, InlineBytes): - file_bytes = resolved.data - else: - # Fallback to reading directly - file_bytes = file_input.read() + if isinstance(resolved, FileReference) and resolved.file_uri: + s3_location: dict[str, Any] = {"uri": resolved.file_uri} + if s3_bucket_owner: + s3_location["bucketOwner"] = s3_bucket_owner - if content_type.startswith("image/"): - media_type = content_type.split("/")[-1] - if media_type == "jpg": - media_type = "jpeg" - content_blocks.append( - { - "image": { - "format": media_type, - "source": {"bytes": file_bytes}, + if content_type.startswith("image/"): + media_type = content_type.split("/")[-1] + if media_type == "jpg": + media_type = "jpeg" + content_blocks.append( + { + "image": { + "format": media_type, + "source": {"s3Location": s3_location}, + } } - } - ) - elif content_type == "application/pdf": - content_blocks.append( - { - "document": { - "name": name, - "format": "pdf", - "source": {"bytes": file_bytes}, + ) + elif content_type.startswith("video/"): + video_format = self._get_video_format(content_type) + if video_format: + content_blocks.append( + { + "video": { + "format": video_format, + "source": {"s3Location": s3_location}, + } + } + ) + else: + doc_format = self._get_document_format(content_type) + if doc_format: + content_blocks.append( + { + "document": { + "name": name, + "format": doc_format, + "source": {"s3Location": s3_location}, + } + } + ) + else: + if isinstance(resolved, InlineBytes): + file_bytes = resolved.data + else: + file_bytes = file_input.read() + + if content_type.startswith("image/"): + media_type = content_type.split("/")[-1] + if media_type == "jpg": + media_type = "jpeg" + content_blocks.append( + { + "image": { + "format": media_type, + "source": {"bytes": file_bytes}, + } } - } - ) + ) + elif content_type.startswith("video/"): + video_format = self._get_video_format(content_type) + if video_format: + content_blocks.append( + { + "video": { + "format": video_format, + "source": {"bytes": file_bytes}, + } + } + ) + else: + doc_format = self._get_document_format(content_type) + if doc_format: + content_blocks.append( + { + "document": { + "name": name, + "format": doc_format, + "source": {"bytes": file_bytes}, + } + } + ) + + return content_blocks + + async def aformat_multimodal_content( + self, + files: dict[str, FileInput], + upload_cache: UploadCache | None = None, + ) -> list[dict[str, Any]]: + """Async format files as Bedrock Converse API multimodal content blocks. + + Uses parallel file resolution. S3 uploads are only supported by Nova models. 
+ + Args: + files: Dictionary mapping file names to FileInput objects. + upload_cache: Optional cache for S3 uploads. + + Returns: + List of content blocks in Bedrock's expected format. + """ + if not self.supports_multimodal(): + return [] + + import os + + from crewai.files import ( + FileReference, + FileResolver, + FileResolverConfig, + InlineBytes, + ) + + is_nova = self._is_nova_model() + s3_bucket = os.environ.get("CREWAI_BEDROCK_S3_BUCKET") + s3_bucket_owner = os.environ.get("CREWAI_BEDROCK_S3_BUCKET_OWNER") + prefer_upload = bool(s3_bucket) and is_nova + + config = FileResolverConfig( + prefer_upload=prefer_upload, use_bytes_for_bedrock=True + ) + resolver = FileResolver(config=config, upload_cache=upload_cache) + resolved_files = await resolver.aresolve_files(files, "bedrock") + + content_blocks: list[dict[str, Any]] = [] + for name, resolved in resolved_files.items(): + file_input = files[name] + content_type = file_input.content_type + + if isinstance(resolved, FileReference) and resolved.file_uri: + s3_location: dict[str, Any] = {"uri": resolved.file_uri} + if s3_bucket_owner: + s3_location["bucketOwner"] = s3_bucket_owner + + if content_type.startswith("image/"): + media_type = content_type.split("/")[-1] + if media_type == "jpg": + media_type = "jpeg" + content_blocks.append( + { + "image": { + "format": media_type, + "source": {"s3Location": s3_location}, + } + } + ) + elif content_type.startswith("video/"): + video_format = self._get_video_format(content_type) + if video_format: + content_blocks.append( + { + "video": { + "format": video_format, + "source": {"s3Location": s3_location}, + } + } + ) + else: + doc_format = self._get_document_format(content_type) + if doc_format: + content_blocks.append( + { + "document": { + "name": name, + "format": doc_format, + "source": {"s3Location": s3_location}, + } + } + ) + else: + if isinstance(resolved, InlineBytes): + file_bytes = resolved.data + else: + file_bytes = await file_input.aread() + + if content_type.startswith("image/"): + media_type = content_type.split("/")[-1] + if media_type == "jpg": + media_type = "jpeg" + content_blocks.append( + { + "image": { + "format": media_type, + "source": {"bytes": file_bytes}, + } + } + ) + elif content_type.startswith("video/"): + video_format = self._get_video_format(content_type) + if video_format: + content_blocks.append( + { + "video": { + "format": video_format, + "source": {"bytes": file_bytes}, + } + } + ) + else: + doc_format = self._get_document_format(content_type) + if doc_format: + content_blocks.append( + { + "document": { + "name": name, + "format": doc_format, + "source": {"bytes": file_bytes}, + } + } + ) return content_blocks diff --git a/lib/crewai/src/crewai/llms/providers/gemini/completion.py b/lib/crewai/src/crewai/llms/providers/gemini/completion.py index d5758248a..dc7feb917 100644 --- a/lib/crewai/src/crewai/llms/providers/gemini/completion.py +++ b/lib/crewai/src/crewai/llms/providers/gemini/completion.py @@ -19,11 +19,11 @@ from crewai.utilities.types import LLMMessage if TYPE_CHECKING: - from crewai.llms.hooks.base import BaseInterceptor - from crewai.utilities.files import ( + from crewai.files import ( FileInput, UploadCache, ) + from crewai.llms.hooks.base import BaseInterceptor try: @@ -1113,7 +1113,7 @@ class GeminiCompletion(BaseLLM): Returns: List of content blocks in Gemini's expected format. 
""" - from crewai.utilities.files import ( + from crewai.files import ( FileReference, FileResolver, FileResolverConfig, @@ -1123,7 +1123,6 @@ class GeminiCompletion(BaseLLM): content_blocks: list[dict[str, Any]] = [] supported_types = self.supported_multimodal_content_types() - # Create resolver with optional cache config = FileResolverConfig(prefer_upload=False) resolver = FileResolver(config=config, upload_cache=upload_cache) @@ -1168,6 +1167,67 @@ class GeminiCompletion(BaseLLM): return content_blocks + async def aformat_multimodal_content( + self, + files: dict[str, FileInput], + upload_cache: UploadCache | None = None, + ) -> list[dict[str, Any]]: + """Async format files as Gemini multimodal content blocks. + + Uses parallel file resolution for improved performance with multiple files. + + Args: + files: Dictionary mapping file names to FileInput objects. + upload_cache: Optional cache for tracking uploaded files. + + Returns: + List of content blocks in Gemini's expected format. + """ + from crewai.files import ( + FileReference, + FileResolver, + FileResolverConfig, + InlineBase64, + ) + + supported_types = self.supported_multimodal_content_types() + + supported_files = { + name: f + for name, f in files.items() + if any(f.content_type.startswith(t) for t in supported_types) + } + + if not supported_files: + return [] + + config = FileResolverConfig(prefer_upload=False) + resolver = FileResolver(config=config, upload_cache=upload_cache) + resolved_files = await resolver.aresolve_files(supported_files, "gemini") + + content_blocks: list[dict[str, Any]] = [] + for resolved in resolved_files.values(): + if isinstance(resolved, FileReference) and resolved.file_uri: + content_blocks.append( + { + "fileData": { + "mimeType": resolved.content_type, + "fileUri": resolved.file_uri, + } + } + ) + elif isinstance(resolved, InlineBase64): + content_blocks.append( + { + "inlineData": { + "mimeType": resolved.content_type, + "data": resolved.data, + } + } + ) + + return content_blocks + def format_text_content(self, text: str) -> dict[str, Any]: """Format text as a Gemini content block. diff --git a/lib/crewai/src/crewai/llms/providers/openai/completion.py b/lib/crewai/src/crewai/llms/providers/openai/completion.py index 78946a782..53e8c6d6d 100644 --- a/lib/crewai/src/crewai/llms/providers/openai/completion.py +++ b/lib/crewai/src/crewai/llms/providers/openai/completion.py @@ -28,10 +28,10 @@ from crewai.utilities.types import LLMMessage if TYPE_CHECKING: from crewai.agent.core import Agent + from crewai.files import FileInput, UploadCache from crewai.llms.hooks.base import BaseInterceptor from crewai.task import Task from crewai.tools.base_tool import BaseTool - from crewai.utilities.files import FileInput, UploadCache class OpenAICompletion(BaseLLM): @@ -1100,7 +1100,7 @@ class OpenAICompletion(BaseLLM): if not self.supports_multimodal(): return [] - from crewai.utilities.files import ( + from crewai.files import ( FileReference, FileResolver, FileResolverConfig, @@ -1148,3 +1148,67 @@ class OpenAICompletion(BaseLLM): ) return content_blocks + + async def aformat_multimodal_content( + self, + files: dict[str, FileInput], + upload_cache: UploadCache | None = None, + ) -> list[dict[str, Any]]: + """Async format files as OpenAI multimodal content blocks. + + Uses parallel file resolution for improved performance with multiple files. + + Args: + files: Dictionary mapping file names to FileInput objects. + upload_cache: Optional cache for tracking uploaded files. 
+ + Returns: + List of content blocks in OpenAI's expected format. + """ + if not self.supports_multimodal(): + return [] + + from crewai.files import ( + FileReference, + FileResolver, + FileResolverConfig, + InlineBase64, + ) + + supported_types = self.supported_multimodal_content_types() + + supported_files = { + name: f + for name, f in files.items() + if any(f.content_type.startswith(t) for t in supported_types) + } + + if not supported_files: + return [] + + config = FileResolverConfig(prefer_upload=False) + resolver = FileResolver(config=config, upload_cache=upload_cache) + resolved_files = await resolver.aresolve_files(supported_files, "openai") + + content_blocks: list[dict[str, Any]] = [] + for resolved in resolved_files.values(): + if isinstance(resolved, FileReference): + content_blocks.append( + { + "type": "file", + "file": { + "file_id": resolved.file_id, + }, + } + ) + elif isinstance(resolved, InlineBase64): + content_blocks.append( + { + "type": "image_url", + "image_url": { + "url": f"data:{resolved.content_type};base64,{resolved.data}" + }, + } + ) + + return content_blocks diff --git a/lib/crewai/src/crewai/task.py b/lib/crewai/src/crewai/task.py index 7d8fc5ecb..8c4b26608 100644 --- a/lib/crewai/src/crewai/task.py +++ b/lib/crewai/src/crewai/task.py @@ -37,6 +37,12 @@ from crewai.events.types.task_events import ( TaskFailedEvent, TaskStartedEvent, ) +from crewai.files import ( + FileInput, + FilePath, + FileSourceInput, + normalize_input_files, +) from crewai.security import Fingerprint, SecurityConfig from crewai.tasks.output_format import OutputFormat from crewai.tasks.task_output import TaskOutput @@ -49,12 +55,6 @@ from crewai.utilities.file_store import ( get_all_files, store_task_files, ) -from crewai.utilities.files import ( - FileInput, - FilePath, - FileSourceInput, - normalize_input_files, -) from crewai.utilities.guardrail import ( process_guardrail, ) diff --git a/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py b/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py index e74ee23c7..43d47529a 100644 --- a/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py +++ b/lib/crewai/src/crewai/tools/agent_tools/read_file_tool.py @@ -11,7 +11,7 @@ from crewai.tools.base_tool import BaseTool if TYPE_CHECKING: - from crewai.utilities.files import FileInput + from crewai.files import FileInput class ReadFileToolSchema(BaseModel): diff --git a/lib/crewai/src/crewai/utilities/file_store.py b/lib/crewai/src/crewai/utilities/file_store.py index 814ee486c..dedb46079 100644 --- a/lib/crewai/src/crewai/utilities/file_store.py +++ b/lib/crewai/src/crewai/utilities/file_store.py @@ -13,7 +13,7 @@ from aiocache.serializers import PickleSerializer # type: ignore[import-untyped if TYPE_CHECKING: - from crewai.utilities.files import FileInput + from crewai.files import FileInput _file_store = Cache(Cache.MEMORY, serializer=PickleSerializer()) diff --git a/lib/crewai/src/crewai/utilities/files/__init__.py b/lib/crewai/src/crewai/utilities/files/__init__.py index 6d08e2d73..8e7bb3972 100644 --- a/lib/crewai/src/crewai/utilities/files/__init__.py +++ b/lib/crewai/src/crewai/utilities/files/__init__.py @@ -1,207 +1,25 @@ -"""File handling utilities for crewAI tasks.""" +"""Backwards compatibility re-exports from crewai.files. 
-from crewai.utilities.files.cleanup import ( - cleanup_expired_files, - cleanup_provider_files, - cleanup_uploaded_files, -) -from crewai.utilities.files.content_types import ( - AudioContentType, - AudioExtension, - AudioFile, - BaseFile, - File, - FileMode, - ImageContentType, - ImageExtension, - ImageFile, - PDFContentType, - PDFExtension, - PDFFile, - TextContentType, - TextExtension, - TextFile, - VideoContentType, - VideoExtension, - VideoFile, -) -from crewai.utilities.files.file import ( - FileBytes, - FilePath, - FileSource, - FileSourceInput, - FileStream, - RawFileInput, -) -from crewai.utilities.files.processing import ( - ANTHROPIC_CONSTRAINTS, - BEDROCK_CONSTRAINTS, - GEMINI_CONSTRAINTS, - OPENAI_CONSTRAINTS, - AudioConstraints, - FileHandling, - FileProcessingError, - FileProcessor, - FileTooLargeError, - FileValidationError, - ImageConstraints, - PDFConstraints, - ProcessingDependencyError, - ProviderConstraints, - UnsupportedFileTypeError, - VideoConstraints, - get_constraints_for_provider, -) -from crewai.utilities.files.resolved import ( - FileReference, - InlineBase64, - InlineBytes, - ResolvedFile, - ResolvedFileType, - UrlReference, -) -from crewai.utilities.files.resolver import ( - FileResolver, - FileResolverConfig, - create_resolver, -) -from crewai.utilities.files.upload_cache import ( - CachedUpload, - UploadCache, - get_upload_cache, - reset_upload_cache, -) -from crewai.utilities.files.uploaders import FileUploader, UploadResult, get_uploader +Deprecated: Import from crewai.files instead. +""" + +import sys +from typing import Any + +from typing_extensions import deprecated + +import crewai.files as _files -FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile +@deprecated("crewai.utilities.files is deprecated. Import from crewai.files instead.") +class _DeprecatedModule: + """Deprecated module wrapper.""" + + def __getattr__(self, name: str) -> Any: + return getattr(_files, name) + + def __dir__(self) -> list[str]: + return list(_files.__all__) -def wrap_file_source(source: FileSource) -> FileInput: - """Wrap a FileSource in the appropriate typed FileInput wrapper. - - Args: - source: The file source to wrap. - - Returns: - Typed FileInput wrapper based on content type. - """ - content_type = source.content_type - - if content_type.startswith("image/"): - return ImageFile(source=source) - if content_type.startswith("audio/"): - return AudioFile(source=source) - if content_type.startswith("video/"): - return VideoFile(source=source) - if content_type == "application/pdf": - return PDFFile(source=source) - return TextFile(source=source) - - -def normalize_input_files( - input_files: list[FileSourceInput | FileInput], -) -> dict[str, FileInput]: - """Convert a list of file sources to a named dictionary of FileInputs. - - Args: - input_files: List of file source inputs or File objects. - - Returns: - Dictionary mapping names to FileInput wrappers. - """ - from pathlib import Path - - result: dict[str, FileInput] = {} - - for i, item in enumerate(input_files): - if isinstance(item, BaseFile): - name = item.filename or f"file_{i}" - if "." 
in name: - name = name.rsplit(".", 1)[0] - result[name] = item - continue - - file_source: FilePath | FileBytes | FileStream - if isinstance(item, (FilePath, FileBytes, FileStream)): - file_source = item - elif isinstance(item, Path): - file_source = FilePath(path=item) - elif isinstance(item, str): - file_source = FilePath(path=Path(item)) - elif isinstance(item, (bytes, memoryview)): - file_source = FileBytes(data=bytes(item)) - else: - continue - - name = file_source.filename or f"file_{i}" - result[name] = wrap_file_source(file_source) - - return result - - -__all__ = [ - "ANTHROPIC_CONSTRAINTS", - "BEDROCK_CONSTRAINTS", - "GEMINI_CONSTRAINTS", - "OPENAI_CONSTRAINTS", - "AudioConstraints", - "AudioContentType", - "AudioExtension", - "AudioFile", - "BaseFile", - "CachedUpload", - "File", - "FileBytes", - "FileHandling", - "FileInput", - "FileMode", - "FilePath", - "FileProcessingError", - "FileProcessor", - "FileReference", - "FileResolver", - "FileResolverConfig", - "FileSource", - "FileSourceInput", - "FileStream", - "FileTooLargeError", - "FileUploader", - "FileValidationError", - "ImageConstraints", - "ImageContentType", - "ImageExtension", - "ImageFile", - "InlineBase64", - "InlineBytes", - "PDFConstraints", - "PDFContentType", - "PDFExtension", - "PDFFile", - "ProcessingDependencyError", - "ProviderConstraints", - "RawFileInput", - "ResolvedFile", - "ResolvedFileType", - "TextContentType", - "TextExtension", - "TextFile", - "UnsupportedFileTypeError", - "UploadCache", - "UploadResult", - "UrlReference", - "VideoConstraints", - "VideoContentType", - "VideoExtension", - "VideoFile", - "cleanup_expired_files", - "cleanup_provider_files", - "cleanup_uploaded_files", - "create_resolver", - "get_constraints_for_provider", - "get_upload_cache", - "get_uploader", - "normalize_input_files", - "reset_upload_cache", - "wrap_file_source", -] +sys.modules[__name__] = _DeprecatedModule() # type: ignore[assignment] diff --git a/lib/crewai/src/crewai/utilities/files/__init__.pyi b/lib/crewai/src/crewai/utilities/files/__init__.pyi new file mode 100644 index 000000000..872245260 --- /dev/null +++ b/lib/crewai/src/crewai/utilities/files/__init__.pyi @@ -0,0 +1,258 @@ +"""Type stubs for backwards compatibility re-exports from crewai.files. + +.. deprecated:: + Import from crewai.files instead. +""" + +from collections.abc import Callable +from datetime import datetime +from pathlib import Path +from typing import Any, Literal + +from typing_extensions import deprecated + +import crewai.files as _files + +FileMode = Literal["strict", "auto", "warn", "chunk"] +ImageExtension = _files.ImageExtension +ImageContentType = _files.ImageContentType +PDFExtension = _files.PDFExtension +PDFContentType = _files.PDFContentType +TextExtension = _files.TextExtension +TextContentType = _files.TextContentType +AudioExtension = _files.AudioExtension +AudioContentType = _files.AudioContentType +VideoExtension = _files.VideoExtension +VideoContentType = _files.VideoContentType +FileInput = _files.FileInput +FileSource = _files.FileSource +FileSourceInput = _files.FileSourceInput +RawFileInput = _files.RawFileInput +ResolvedFileType = _files.ResolvedFileType +FileHandling = _files.FileHandling + +# Deprecated classes +@deprecated("Import from crewai.files instead") +class BaseFile(_files.BaseFile): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class ImageFile(_files.ImageFile): + """.. deprecated:: Import from crewai.files instead.""" + ... 
+ +@deprecated("Import from crewai.files instead") +class PDFFile(_files.PDFFile): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class TextFile(_files.TextFile): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class AudioFile(_files.AudioFile): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class VideoFile(_files.VideoFile): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class File(_files.File): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FilePath(_files.FilePath): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileBytes(_files.FileBytes): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileStream(_files.FileStream): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileResolver(_files.FileResolver): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileResolverConfig(_files.FileResolverConfig): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileProcessor(_files.FileProcessor): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileUploader(_files.FileUploader): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class UploadCache(_files.UploadCache): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class CachedUpload(_files.CachedUpload): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class UploadResult(_files.UploadResult): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class ResolvedFile(_files.ResolvedFile): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileReference(_files.FileReference): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class UrlReference(_files.UrlReference): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class InlineBase64(_files.InlineBase64): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class InlineBytes(_files.InlineBytes): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class ProviderConstraints(_files.ProviderConstraints): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class ImageConstraints(_files.ImageConstraints): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class AudioConstraints(_files.AudioConstraints): + """.. 
deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class VideoConstraints(_files.VideoConstraints): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class PDFConstraints(_files.PDFConstraints): + """.. deprecated:: Import from crewai.files instead.""" + ... + +# Exceptions +@deprecated("Import from crewai.files instead") +class FileProcessingError(_files.FileProcessingError): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileValidationError(_files.FileValidationError): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class FileTooLargeError(_files.FileTooLargeError): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class UnsupportedFileTypeError(_files.UnsupportedFileTypeError): + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +class ProcessingDependencyError(_files.ProcessingDependencyError): + """.. deprecated:: Import from crewai.files instead.""" + ... + +# Constants +OPENAI_CONSTRAINTS: _files.ProviderConstraints +ANTHROPIC_CONSTRAINTS: _files.ProviderConstraints +GEMINI_CONSTRAINTS: _files.ProviderConstraints +BEDROCK_CONSTRAINTS: _files.ProviderConstraints + +# Deprecated functions +@deprecated("Import from crewai.files instead") +def create_resolver( + provider: str, + config: FileResolverConfig | None = None, +) -> FileResolver: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def get_upload_cache() -> UploadCache: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def reset_upload_cache() -> None: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def get_constraints_for_provider(provider: str) -> ProviderConstraints: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def cleanup_uploaded_files(provider: str | None = None) -> int: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def cleanup_expired_files() -> int: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def cleanup_provider_files(provider: str) -> int: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def normalize_input_files( + input_files: list[FileSourceInput | FileInput], +) -> dict[str, FileInput]: + """.. deprecated:: Import from crewai.files instead.""" + ... + +@deprecated("Import from crewai.files instead") +def wrap_file_source(source: FileSource) -> FileInput: + """.. deprecated:: Import from crewai.files instead.""" + ... 
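The stub file and the runtime wrapper above keep the old import path importable while steering users toward the new package. A small sketch of what that means in practice, assuming the wrapper forwards attribute access to crewai.files as shown:

# Preferred path after this change.
from crewai.files import ImageFile

# Legacy path still resolves via the deprecated wrapper (may warn at import time).
from crewai.utilities.files import ImageFile as LegacyImageFile

# Both names refer to the same class object.
assert ImageFile is LegacyImageFile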
+ +__all__: list[str] \ No newline at end of file diff --git a/lib/crewai/src/crewai/utilities/files/cleanup.py b/lib/crewai/src/crewai/utilities/files/cleanup.py deleted file mode 100644 index 1444d1a80..000000000 --- a/lib/crewai/src/crewai/utilities/files/cleanup.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Cleanup utilities for uploaded files.""" - -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING - -from crewai.utilities.files.upload_cache import CachedUpload, UploadCache -from crewai.utilities.files.uploaders import get_uploader - - -if TYPE_CHECKING: - from crewai.utilities.files.uploaders.base import FileUploader - -logger = logging.getLogger(__name__) - - -def _safe_delete( - uploader: FileUploader, - file_id: str, - provider: str, -) -> bool: - """Safely delete a file, logging any errors. - - Args: - uploader: The file uploader to use. - file_id: The file ID to delete. - provider: Provider name for logging. - - Returns: - True if deleted successfully, False otherwise. - """ - try: - if uploader.delete(file_id): - logger.debug(f"Deleted {file_id} from {provider}") - return True - logger.warning(f"Failed to delete {file_id} from {provider}") - return False - except Exception as e: - logger.warning(f"Error deleting {file_id} from {provider}: {e}") - return False - - -def cleanup_uploaded_files( - cache: UploadCache, - *, - delete_from_provider: bool = True, - providers: list[str] | None = None, -) -> int: - """Clean up uploaded files from the cache and optionally from providers. - - Args: - cache: The upload cache to clean up. - delete_from_provider: If True, delete files from the provider as well. - providers: Optional list of providers to clean up. If None, cleans all. - - Returns: - Number of files cleaned up. - """ - cleaned = 0 - - provider_uploads: dict[str, list[CachedUpload]] = {} - - for provider in _get_providers_from_cache(cache): - if providers is not None and provider not in providers: - continue - provider_uploads[provider] = cache.get_all_for_provider(provider) - - if delete_from_provider: - for provider, uploads in provider_uploads.items(): - uploader = get_uploader(provider) - if uploader is None: - logger.warning( - f"No uploader available for {provider}, skipping cleanup" - ) - continue - - for upload in uploads: - if _safe_delete(uploader, upload.file_id, provider): - cleaned += 1 - - cache.clear() - - logger.info(f"Cleaned up {cleaned} uploaded files") - return cleaned - - -def cleanup_expired_files( - cache: UploadCache, - *, - delete_from_provider: bool = False, -) -> int: - """Clean up expired files from the cache. - - Args: - cache: The upload cache to clean up. - delete_from_provider: If True, attempt to delete from provider as well. - Note: Expired files may already be deleted by the provider. - - Returns: - Number of expired entries removed from cache. 
- """ - expired_entries: list[CachedUpload] = [] - - if delete_from_provider: - for provider in _get_providers_from_cache(cache): - expired_entries.extend( - upload - for upload in cache.get_all_for_provider(provider) - if upload.is_expired() - ) - - removed = cache.clear_expired() - - if delete_from_provider: - for upload in expired_entries: - uploader = get_uploader(upload.provider) - if uploader is not None: - try: - uploader.delete(upload.file_id) - except Exception as e: - logger.debug(f"Could not delete expired file {upload.file_id}: {e}") - - return removed - - -def cleanup_provider_files( - provider: str, - *, - cache: UploadCache | None = None, - delete_all_from_provider: bool = False, -) -> int: - """Clean up all files for a specific provider. - - Args: - provider: Provider name to clean up. - cache: Optional upload cache to clear entries from. - delete_all_from_provider: If True, delete all files from the provider, - not just cached ones. - - Returns: - Number of files deleted. - """ - deleted = 0 - uploader = get_uploader(provider) - - if uploader is None: - logger.warning(f"No uploader available for {provider}") - return 0 - - if delete_all_from_provider: - try: - files = uploader.list_files() - for file_info in files: - file_id = file_info.get("id") or file_info.get("name") - if file_id and uploader.delete(file_id): - deleted += 1 - except Exception as e: - logger.warning(f"Error listing/deleting files from {provider}: {e}") - elif cache is not None: - uploads = cache.get_all_for_provider(provider) - for upload in uploads: - if _safe_delete(uploader, upload.file_id, provider): - deleted += 1 - cache.remove_by_file_id(upload.file_id, provider) - - logger.info(f"Deleted {deleted} files from {provider}") - return deleted - - -def _get_providers_from_cache(cache: UploadCache) -> set[str]: - """Get unique provider names from cache entries. - - Args: - cache: The upload cache. - - Returns: - Set of provider names. - """ - return cache.get_providers() diff --git a/lib/crewai/src/crewai/utilities/files/file.py b/lib/crewai/src/crewai/utilities/files/file.py deleted file mode 100644 index bd8a43618..000000000 --- a/lib/crewai/src/crewai/utilities/files/file.py +++ /dev/null @@ -1,158 +0,0 @@ -"""Base file class for handling file inputs in tasks.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Annotated, Any, BinaryIO, cast - -import magic -from pydantic import ( - BaseModel, - BeforeValidator, - Field, - GetCoreSchemaHandler, - PrivateAttr, - model_validator, -) -from pydantic_core import CoreSchema, core_schema - - -def detect_content_type(data: bytes) -> str: - """Detect MIME type from file content. - - Args: - data: Raw bytes to analyze. - - Returns: - The detected MIME type. 
- """ - return magic.from_buffer(data, mime=True) - - -class _BinaryIOValidator: - """Pydantic validator for BinaryIO types.""" - - @classmethod - def __get_pydantic_core_schema__( - cls, source_type: Any, handler: GetCoreSchemaHandler - ) -> CoreSchema: - return core_schema.no_info_plain_validator_function( - cls._validate, - serialization=core_schema.plain_serializer_function_ser_schema( - lambda x: None, info_arg=False - ), - ) - - @staticmethod - def _validate(value: Any) -> BinaryIO: - if hasattr(value, "read") and hasattr(value, "seek"): - return cast(BinaryIO, value) - raise ValueError("Expected a binary file-like object with read() and seek()") - - -ValidatedBinaryIO = Annotated[BinaryIO, _BinaryIOValidator()] - - -class FilePath(BaseModel): - """File loaded from a filesystem path.""" - - path: Path = Field(description="Path to the file on the filesystem.") - _content: bytes | None = PrivateAttr(default=None) - - @model_validator(mode="after") - def _validate_file_exists(self) -> FilePath: - """Validate that the file exists.""" - if not self.path.exists(): - raise ValueError(f"File not found: {self.path}") - if not self.path.is_file(): - raise ValueError(f"Path is not a file: {self.path}") - return self - - @property - def filename(self) -> str: - """Get the filename from the path.""" - return self.path.name - - @property - def content_type(self) -> str: - """Get the content type by reading file content.""" - return detect_content_type(self.read()) - - def read(self) -> bytes: - """Read the file content from disk.""" - if self._content is None: - self._content = self.path.read_bytes() - return self._content - - -class FileBytes(BaseModel): - """File created from raw bytes content.""" - - data: bytes = Field(description="Raw bytes content of the file.") - filename: str | None = Field(default=None, description="Optional filename.") - - @property - def content_type(self) -> str: - """Get the content type from the data.""" - return detect_content_type(self.data) - - def read(self) -> bytes: - """Return the bytes content.""" - return self.data - - -class FileStream(BaseModel): - """File loaded from a file-like stream.""" - - stream: ValidatedBinaryIO = Field(description="Binary file stream.") - filename: str | None = Field(default=None, description="Optional filename.") - _content: bytes | None = PrivateAttr(default=None) - - def model_post_init(self, __context: object) -> None: - """Extract filename from stream if not provided.""" - if self.filename is None: - name = getattr(self.stream, "name", None) - if name is not None: - object.__setattr__(self, "filename", Path(name).name) - - @property - def content_type(self) -> str: - """Get the content type from stream content.""" - return detect_content_type(self.read()) - - def read(self) -> bytes: - """Read the stream content. 
Content is cached after first read.""" - if self._content is None: - position = self.stream.tell() - self.stream.seek(0) - self._content = self.stream.read() - self.stream.seek(position) - return self._content - - def close(self) -> None: - """Close the underlying stream.""" - self.stream.close() - - -FileSource = FilePath | FileBytes | FileStream - - -def _normalize_source(value: Any) -> FileSource: - """Convert raw input to appropriate source type.""" - if isinstance(value, (FilePath, FileBytes, FileStream)): - return value - if isinstance(value, Path): - return FilePath(path=value) - if isinstance(value, str): - return FilePath(path=Path(value)) - if isinstance(value, bytes): - return FileBytes(data=value) - if hasattr(value, "read") and hasattr(value, "seek"): - return FileStream(stream=value) - raise ValueError(f"Cannot convert {type(value).__name__} to file source") - - -RawFileInput = str | Path | bytes -FileSourceInput = Annotated[ - RawFileInput | FileSource, BeforeValidator(_normalize_source) -] diff --git a/lib/crewai/src/crewai/utilities/files/resolver.py b/lib/crewai/src/crewai/utilities/files/resolver.py deleted file mode 100644 index b8ee9460d..000000000 --- a/lib/crewai/src/crewai/utilities/files/resolver.py +++ /dev/null @@ -1,287 +0,0 @@ -"""FileResolver for deciding file delivery method and managing uploads.""" - -import base64 -from dataclasses import dataclass, field -import logging - -from crewai.utilities.files.content_types import ( - AudioFile, - File, - ImageFile, - PDFFile, - TextFile, - VideoFile, -) -from crewai.utilities.files.processing.constraints import ( - ProviderConstraints, - get_constraints_for_provider, -) -from crewai.utilities.files.resolved import ( - FileReference, - InlineBase64, - InlineBytes, - ResolvedFile, -) -from crewai.utilities.files.upload_cache import CachedUpload, UploadCache -from crewai.utilities.files.uploaders import get_uploader -from crewai.utilities.files.uploaders.base import FileUploader - - -logger = logging.getLogger(__name__) - -FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile - - -@dataclass -class FileResolverConfig: - """Configuration for FileResolver. - - Attributes: - prefer_upload: If True, prefer uploading over inline for supported providers. - upload_threshold_bytes: Size threshold above which to use upload. - If None, uses provider-specific threshold. - use_bytes_for_bedrock: If True, use raw bytes instead of base64 for Bedrock. - """ - - prefer_upload: bool = False - upload_threshold_bytes: int | None = None - use_bytes_for_bedrock: bool = True - - -@dataclass -class FileResolver: - """Resolves files to their delivery format based on provider capabilities. - - Decides whether to use inline base64, raw bytes, or file upload based on: - - Provider constraints and capabilities - - File size - - Configuration preferences - - Caches uploaded files to avoid redundant uploads. - - Attributes: - config: Resolver configuration. - upload_cache: Cache for tracking uploaded files. - """ - - config: FileResolverConfig = field(default_factory=FileResolverConfig) - upload_cache: UploadCache | None = None - _uploaders: dict[str, FileUploader] = field(default_factory=dict) - - def resolve(self, file: FileInput, provider: str) -> ResolvedFile: - """Resolve a file to its delivery format for a provider. - - Args: - file: The file to resolve. - provider: Provider name (e.g., "gemini", "anthropic", "openai"). - - Returns: - ResolvedFile representing the appropriate delivery format. 
- """ - provider_lower = provider.lower() - constraints = get_constraints_for_provider(provider) - file_size = len(file.read()) - - should_upload = self._should_upload( - file, provider_lower, constraints, file_size - ) - - if should_upload: - resolved = self._resolve_via_upload(file, provider_lower) - if resolved is not None: - return resolved - - return self._resolve_inline(file, provider_lower) - - def resolve_files( - self, - files: dict[str, FileInput], - provider: str, - ) -> dict[str, ResolvedFile]: - """Resolve multiple files for a provider. - - Args: - files: Dictionary mapping names to file inputs. - provider: Provider name. - - Returns: - Dictionary mapping names to resolved files. - """ - return {name: self.resolve(file, provider) for name, file in files.items()} - - def _should_upload( - self, - file: FileInput, - provider: str, - constraints: ProviderConstraints | None, - file_size: int, - ) -> bool: - """Determine if a file should be uploaded rather than inlined. - - Args: - file: The file to check. - provider: Provider name. - constraints: Provider constraints. - file_size: Size of the file in bytes. - - Returns: - True if the file should be uploaded, False otherwise. - """ - if constraints is None or not constraints.supports_file_upload: - return False - - if self.config.prefer_upload: - return True - - threshold = self.config.upload_threshold_bytes - if threshold is None and constraints is not None: - threshold = constraints.file_upload_threshold_bytes - - if threshold is not None and file_size > threshold: - return True - - return False - - def _resolve_via_upload( - self, - file: FileInput, - provider: str, - ) -> ResolvedFile | None: - """Resolve a file by uploading it. - - Args: - file: The file to upload. - provider: Provider name. - - Returns: - FileReference if upload succeeds, None otherwise. - """ - if self.upload_cache is not None: - cached = self.upload_cache.get(file, provider) - if cached is not None: - logger.debug( - f"Using cached upload for {file.filename}: {cached.file_id}" - ) - return FileReference( - content_type=cached.content_type, - file_id=cached.file_id, - provider=cached.provider, - expires_at=cached.expires_at, - file_uri=cached.file_uri, - ) - - uploader = self._get_uploader(provider) - if uploader is None: - logger.debug(f"No uploader available for {provider}") - return None - - try: - result = uploader.upload(file) - - if self.upload_cache is not None: - self.upload_cache.set( - file=file, - provider=provider, - file_id=result.file_id, - file_uri=result.file_uri, - expires_at=result.expires_at, - ) - - return FileReference( - content_type=result.content_type, - file_id=result.file_id, - provider=result.provider, - expires_at=result.expires_at, - file_uri=result.file_uri, - ) - - except Exception as e: - logger.warning(f"Failed to upload {file.filename} to {provider}: {e}") - return None - - def _resolve_inline(self, file: FileInput, provider: str) -> ResolvedFile: - """Resolve a file as inline content. - - Args: - file: The file to resolve. - provider: Provider name. - - Returns: - InlineBase64 or InlineBytes depending on provider. 
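The resolver being deleted here lives on in crewai.files with the same public surface. A hedged sketch of resolving one file for two providers, assuming the relocated classes keep the constructors shown in this module and that report.pdf exists on disk:

from crewai.files import (
    FilePath,
    FileResolver,
    FileResolverConfig,
    InlineBase64,
    InlineBytes,
    PDFFile,
    UploadCache,
)

resolver = FileResolver(
    config=FileResolverConfig(prefer_upload=False),
    upload_cache=UploadCache(),
)
report = PDFFile(source=FilePath(path="report.pdf"))  # hypothetical file

for provider in ("openai", "bedrock"):
    resolved = resolver.resolve(report, provider)
    if isinstance(resolved, InlineBytes):
        # Bedrock defaults to raw bytes via use_bytes_for_bedrock.
        print(provider, "-> inline bytes:", len(resolved.data))
    elif isinstance(resolved, InlineBase64):
        print(provider, "-> inline base64:", resolved.content_type)
    else:
        print(provider, "-> uploaded reference:", resolved)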
- """ - content = file.read() - - if self.config.use_bytes_for_bedrock and "bedrock" in provider: - return InlineBytes( - content_type=file.content_type, - data=content, - ) - - encoded = base64.b64encode(content).decode("ascii") - return InlineBase64( - content_type=file.content_type, - data=encoded, - ) - - def _get_uploader(self, provider: str) -> FileUploader | None: - """Get or create an uploader for a provider. - - Args: - provider: Provider name. - - Returns: - FileUploader instance or None if not available. - """ - if provider not in self._uploaders: - uploader = get_uploader(provider) - if uploader is not None: - self._uploaders[provider] = uploader - else: - return None - - return self._uploaders.get(provider) - - def get_cached_uploads(self, provider: str) -> list[CachedUpload]: - """Get all cached uploads for a provider. - - Args: - provider: Provider name. - - Returns: - List of cached uploads. - """ - if self.upload_cache is None: - return [] - return self.upload_cache.get_all_for_provider(provider) - - def clear_cache(self) -> None: - """Clear the upload cache.""" - if self.upload_cache is not None: - self.upload_cache.clear() - - -def create_resolver( - provider: str | None = None, - prefer_upload: bool = False, - upload_threshold_bytes: int | None = None, - enable_cache: bool = True, -) -> FileResolver: - """Create a configured FileResolver. - - Args: - provider: Optional provider name for provider-specific configuration. - prefer_upload: Whether to prefer upload over inline. - upload_threshold_bytes: Size threshold for using upload. - enable_cache: Whether to enable upload caching. - - Returns: - Configured FileResolver instance. - """ - config = FileResolverConfig( - prefer_upload=prefer_upload, - upload_threshold_bytes=upload_threshold_bytes, - ) - - cache = UploadCache() if enable_cache else None - - return FileResolver(config=config, upload_cache=cache) diff --git a/lib/crewai/src/crewai/utilities/files/uploaders/anthropic.py b/lib/crewai/src/crewai/utilities/files/uploaders/anthropic.py deleted file mode 100644 index c7bf64010..000000000 --- a/lib/crewai/src/crewai/utilities/files/uploaders/anthropic.py +++ /dev/null @@ -1,166 +0,0 @@ -"""Anthropic Files API uploader implementation.""" - -from __future__ import annotations - -import io -import logging -import os -from typing import Any - -from crewai.utilities.files.content_types import ( - AudioFile, - File, - ImageFile, - PDFFile, - TextFile, - VideoFile, -) -from crewai.utilities.files.uploaders.base import FileUploader, UploadResult - - -logger = logging.getLogger(__name__) - -FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile - - -class AnthropicFileUploader(FileUploader): - """Uploader for Anthropic Files API. - - Uses the anthropic SDK to upload files. Files are stored persistently - until explicitly deleted. - - Attributes: - api_key: Optional API key (uses ANTHROPIC_API_KEY env var if not provided). - """ - - def __init__(self, api_key: str | None = None) -> None: - """Initialize the Anthropic uploader. - - Args: - api_key: Optional Anthropic API key. If not provided, uses - ANTHROPIC_API_KEY environment variable. 
- """ - self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY") - self._client: Any = None - - @property - def provider_name(self) -> str: - """Return the provider name.""" - return "anthropic" - - def _get_client(self) -> Any: - """Get or create the Anthropic client.""" - if self._client is None: - try: - import anthropic - - self._client = anthropic.Anthropic(api_key=self._api_key) - except ImportError as e: - raise ImportError( - "anthropic is required for Anthropic file uploads. " - "Install with: pip install anthropic" - ) from e - return self._client - - def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult: - """Upload a file to Anthropic. - - Args: - file: The file to upload. - purpose: Optional purpose for the file (default: "user_upload"). - - Returns: - UploadResult with the file ID and metadata. - - Raises: - Exception: If upload fails. - """ - client = self._get_client() - - content = file.read() - file_purpose = purpose or "user_upload" - - file_data = io.BytesIO(content) - - logger.info( - f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)" - ) - - uploaded_file = client.files.create( - file=(file.filename, file_data, file.content_type), - purpose=file_purpose, - ) - - logger.info(f"Uploaded to Anthropic: {uploaded_file.id}") - - return UploadResult( - file_id=uploaded_file.id, - file_uri=None, - content_type=file.content_type, - expires_at=None, - provider=self.provider_name, - ) - - def delete(self, file_id: str) -> bool: - """Delete an uploaded file from Anthropic. - - Args: - file_id: The file ID to delete. - - Returns: - True if deletion was successful, False otherwise. - """ - try: - client = self._get_client() - client.files.delete(file_id=file_id) - logger.info(f"Deleted Anthropic file: {file_id}") - return True - except Exception as e: - logger.warning(f"Failed to delete Anthropic file {file_id}: {e}") - return False - - def get_file_info(self, file_id: str) -> dict[str, Any] | None: - """Get information about an uploaded file. - - Args: - file_id: The file ID. - - Returns: - Dictionary with file information, or None if not found. - """ - try: - client = self._get_client() - file_info = client.files.retrieve(file_id=file_id) - return { - "id": file_info.id, - "filename": file_info.filename, - "purpose": file_info.purpose, - "size_bytes": file_info.size_bytes, - "created_at": file_info.created_at, - } - except Exception as e: - logger.debug(f"Failed to get Anthropic file info for {file_id}: {e}") - return None - - def list_files(self) -> list[dict[str, Any]]: - """List all uploaded files. - - Returns: - List of dictionaries with file information. 
- """ - try: - client = self._get_client() - files = client.files.list() - return [ - { - "id": f.id, - "filename": f.filename, - "purpose": f.purpose, - "size_bytes": f.size_bytes, - "created_at": f.created_at, - } - for f in files.data - ] - except Exception as e: - logger.warning(f"Failed to list Anthropic files: {e}") - return [] diff --git a/lib/crewai/src/crewai/utilities/files/uploaders/gemini.py b/lib/crewai/src/crewai/utilities/files/uploaders/gemini.py deleted file mode 100644 index c4a53db38..000000000 --- a/lib/crewai/src/crewai/utilities/files/uploaders/gemini.py +++ /dev/null @@ -1,217 +0,0 @@ -"""Gemini File API uploader implementation.""" - -from __future__ import annotations - -from datetime import datetime, timedelta, timezone -import io -import logging -import os -from typing import Any - -from crewai.utilities.files.content_types import ( - AudioFile, - File, - ImageFile, - PDFFile, - TextFile, - VideoFile, -) -from crewai.utilities.files.uploaders.base import FileUploader, UploadResult - - -logger = logging.getLogger(__name__) - -FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile - -GEMINI_FILE_TTL = timedelta(hours=48) - - -class GeminiFileUploader(FileUploader): - """Uploader for Google Gemini File API. - - Uses the google-genai SDK to upload files. Files are stored for 48 hours. - - Attributes: - api_key: Optional API key (uses GOOGLE_API_KEY env var if not provided). - """ - - def __init__(self, api_key: str | None = None) -> None: - """Initialize the Gemini uploader. - - Args: - api_key: Optional Google API key. If not provided, uses - GOOGLE_API_KEY environment variable. - """ - self._api_key = api_key or os.environ.get("GOOGLE_API_KEY") - self._client: Any = None - - @property - def provider_name(self) -> str: - """Return the provider name.""" - return "gemini" - - def _get_client(self) -> Any: - """Get or create the Gemini client.""" - if self._client is None: - try: - from google import genai - - self._client = genai.Client(api_key=self._api_key) - except ImportError as e: - raise ImportError( - "google-genai is required for Gemini file uploads. " - "Install with: pip install google-genai" - ) from e - return self._client - - def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult: - """Upload a file to Gemini. - - Args: - file: The file to upload. - purpose: Optional purpose/description (used as display name). - - Returns: - UploadResult with the file URI and metadata. - - Raises: - Exception: If upload fails. - """ - client = self._get_client() - - content = file.read() - display_name = purpose or file.filename - - file_data = io.BytesIO(content) - file_data.name = file.filename - - logger.info( - f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)" - ) - - uploaded_file = client.files.upload( - file=file_data, - config={ - "display_name": display_name, - "mime_type": file.content_type, - }, - ) - - expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL - - logger.info( - f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})" - ) - - return UploadResult( - file_id=uploaded_file.name, - file_uri=uploaded_file.uri, - content_type=file.content_type, - expires_at=expires_at, - provider=self.provider_name, - ) - - def delete(self, file_id: str) -> bool: - """Delete an uploaded file from Gemini. - - Args: - file_id: The file name/ID to delete. - - Returns: - True if deletion was successful, False otherwise. 
- """ - try: - client = self._get_client() - client.files.delete(name=file_id) - logger.info(f"Deleted Gemini file: {file_id}") - return True - except Exception as e: - logger.warning(f"Failed to delete Gemini file {file_id}: {e}") - return False - - def get_file_info(self, file_id: str) -> dict[str, Any] | None: - """Get information about an uploaded file. - - Args: - file_id: The file name/ID. - - Returns: - Dictionary with file information, or None if not found. - """ - try: - client = self._get_client() - file_info = client.files.get(name=file_id) - return { - "name": file_info.name, - "uri": file_info.uri, - "display_name": file_info.display_name, - "mime_type": file_info.mime_type, - "size_bytes": file_info.size_bytes, - "state": str(file_info.state), - "create_time": file_info.create_time, - "expiration_time": file_info.expiration_time, - } - except Exception as e: - logger.debug(f"Failed to get Gemini file info for {file_id}: {e}") - return None - - def list_files(self) -> list[dict[str, Any]]: - """List all uploaded files. - - Returns: - List of dictionaries with file information. - """ - try: - client = self._get_client() - files = client.files.list() - return [ - { - "name": f.name, - "uri": f.uri, - "display_name": f.display_name, - "mime_type": f.mime_type, - "size_bytes": f.size_bytes, - "state": str(f.state), - } - for f in files - ] - except Exception as e: - logger.warning(f"Failed to list Gemini files: {e}") - return [] - - def wait_for_processing(self, file_id: str, timeout_seconds: int = 300) -> bool: - """Wait for a file to finish processing. - - Some files (especially videos) need time to process after upload. - - Args: - file_id: The file name/ID. - timeout_seconds: Maximum time to wait. - - Returns: - True if processing completed, False if timed out or failed. - """ - import time - - try: - from google.genai.types import FileState - except ImportError: - return True - - client = self._get_client() - start_time = time.time() - - while time.time() - start_time < timeout_seconds: - file_info = client.files.get(name=file_id) - - if file_info.state == FileState.ACTIVE: - return True - - if file_info.state == FileState.FAILED: - logger.error(f"Gemini file processing failed: {file_id}") - return False - - time.sleep(2) - - logger.warning(f"Timed out waiting for Gemini file processing: {file_id}") - return False diff --git a/lib/crewai/src/crewai/utilities/files/uploaders/openai.py b/lib/crewai/src/crewai/utilities/files/uploaders/openai.py deleted file mode 100644 index f94905316..000000000 --- a/lib/crewai/src/crewai/utilities/files/uploaders/openai.py +++ /dev/null @@ -1,169 +0,0 @@ -"""OpenAI Files API uploader implementation.""" - -from __future__ import annotations - -import io -import logging -import os -from typing import Any - -from crewai.utilities.files.content_types import ( - AudioFile, - File, - ImageFile, - PDFFile, - TextFile, - VideoFile, -) -from crewai.utilities.files.uploaders.base import FileUploader, UploadResult - - -logger = logging.getLogger(__name__) - -FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile - - -class OpenAIFileUploader(FileUploader): - """Uploader for OpenAI Files API. - - Uses the OpenAI SDK to upload files. Files are stored persistently - until explicitly deleted. - - Attributes: - api_key: Optional API key (uses OPENAI_API_KEY env var if not provided). - """ - - def __init__(self, api_key: str | None = None) -> None: - """Initialize the OpenAI uploader. - - Args: - api_key: Optional OpenAI API key. 
If not provided, uses - OPENAI_API_KEY environment variable. - """ - self._api_key = api_key or os.environ.get("OPENAI_API_KEY") - self._client: Any = None - - @property - def provider_name(self) -> str: - """Return the provider name.""" - return "openai" - - def _get_client(self) -> Any: - """Get or create the OpenAI client.""" - if self._client is None: - try: - from openai import OpenAI - - self._client = OpenAI(api_key=self._api_key) - except ImportError as e: - raise ImportError( - "openai is required for OpenAI file uploads. " - "Install with: pip install openai" - ) from e - return self._client - - def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult: - """Upload a file to OpenAI. - - Args: - file: The file to upload. - purpose: Optional purpose for the file (default: "user_data"). - - Returns: - UploadResult with the file ID and metadata. - - Raises: - Exception: If upload fails. - """ - client = self._get_client() - - content = file.read() - file_purpose = purpose or "user_data" - - file_data = io.BytesIO(content) - file_data.name = file.filename or "file" - - logger.info( - f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)" - ) - - uploaded_file = client.files.create( - file=file_data, - purpose=file_purpose, - ) - - logger.info(f"Uploaded to OpenAI: {uploaded_file.id}") - - return UploadResult( - file_id=uploaded_file.id, - file_uri=None, - content_type=file.content_type, - expires_at=None, - provider=self.provider_name, - ) - - def delete(self, file_id: str) -> bool: - """Delete an uploaded file from OpenAI. - - Args: - file_id: The file ID to delete. - - Returns: - True if deletion was successful, False otherwise. - """ - try: - client = self._get_client() - client.files.delete(file_id) - logger.info(f"Deleted OpenAI file: {file_id}") - return True - except Exception as e: - logger.warning(f"Failed to delete OpenAI file {file_id}: {e}") - return False - - def get_file_info(self, file_id: str) -> dict[str, Any] | None: - """Get information about an uploaded file. - - Args: - file_id: The file ID. - - Returns: - Dictionary with file information, or None if not found. - """ - try: - client = self._get_client() - file_info = client.files.retrieve(file_id) - return { - "id": file_info.id, - "filename": file_info.filename, - "purpose": file_info.purpose, - "bytes": file_info.bytes, - "created_at": file_info.created_at, - "status": file_info.status, - } - except Exception as e: - logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}") - return None - - def list_files(self) -> list[dict[str, Any]]: - """List all uploaded files. - - Returns: - List of dictionaries with file information. 
- """ - try: - client = self._get_client() - files = client.files.list() - return [ - { - "id": f.id, - "filename": f.filename, - "purpose": f.purpose, - "bytes": f.bytes, - "created_at": f.created_at, - "status": f.status, - } - for f in files.data - ] - except Exception as e: - logger.warning(f"Failed to list OpenAI files: {e}") - return [] diff --git a/lib/crewai/src/crewai/utilities/types.py b/lib/crewai/src/crewai/utilities/types.py index 044d3ed73..9f616c1c8 100644 --- a/lib/crewai/src/crewai/utilities/types.py +++ b/lib/crewai/src/crewai/utilities/types.py @@ -2,7 +2,7 @@ from typing import Any, Literal, TypedDict -from crewai.utilities.files import FileInput +from crewai.files import FileInput class LLMMessage(TypedDict): diff --git a/lib/crewai/tests/utilities/files/__init__.py b/lib/crewai/tests/files/__init__.py similarity index 100% rename from lib/crewai/tests/utilities/files/__init__.py rename to lib/crewai/tests/files/__init__.py diff --git a/lib/crewai/tests/utilities/files/processing/__init__.py b/lib/crewai/tests/files/processing/__init__.py similarity index 100% rename from lib/crewai/tests/utilities/files/processing/__init__.py rename to lib/crewai/tests/files/processing/__init__.py diff --git a/lib/crewai/tests/utilities/files/processing/test_constraints.py b/lib/crewai/tests/files/processing/test_constraints.py similarity index 99% rename from lib/crewai/tests/utilities/files/processing/test_constraints.py rename to lib/crewai/tests/files/processing/test_constraints.py index fd487b680..e23434829 100644 --- a/lib/crewai/tests/utilities/files/processing/test_constraints.py +++ b/lib/crewai/tests/files/processing/test_constraints.py @@ -2,7 +2,7 @@ import pytest -from crewai.utilities.files.processing.constraints import ( +from crewai.files.processing.constraints import ( ANTHROPIC_CONSTRAINTS, BEDROCK_CONSTRAINTS, GEMINI_CONSTRAINTS, diff --git a/lib/crewai/tests/utilities/files/processing/test_processor.py b/lib/crewai/tests/files/processing/test_processor.py similarity index 95% rename from lib/crewai/tests/utilities/files/processing/test_processor.py rename to lib/crewai/tests/files/processing/test_processor.py index e590c8d8d..2454a44f3 100644 --- a/lib/crewai/tests/utilities/files/processing/test_processor.py +++ b/lib/crewai/tests/files/processing/test_processor.py @@ -2,19 +2,19 @@ import pytest -from crewai.utilities.files import FileBytes, ImageFile, PDFFile, TextFile -from crewai.utilities.files.processing.constraints import ( +from crewai.files import FileBytes, ImageFile, PDFFile, TextFile +from crewai.files.processing.constraints import ( ANTHROPIC_CONSTRAINTS, ImageConstraints, PDFConstraints, ProviderConstraints, ) -from crewai.utilities.files.processing.enums import FileHandling -from crewai.utilities.files.processing.exceptions import ( +from crewai.files.processing.enums import FileHandling +from crewai.files.processing.exceptions import ( FileTooLargeError, FileValidationError, ) -from crewai.utilities.files.processing.processor import FileProcessor +from crewai.files.processing.processor import FileProcessor # Minimal valid PNG: 8x8 pixel RGB image (valid for PIL) diff --git a/lib/crewai/tests/files/processing/test_transformers.py b/lib/crewai/tests/files/processing/test_transformers.py new file mode 100644 index 000000000..c40cd412f --- /dev/null +++ b/lib/crewai/tests/files/processing/test_transformers.py @@ -0,0 +1,359 @@ +"""Unit tests for file transformers.""" + +import io +from unittest.mock import MagicMock, patch + +import pytest + +from 
crewai.files import ImageFile, PDFFile, TextFile +from crewai.files.file import FileBytes +from crewai.files.processing.exceptions import ProcessingDependencyError +from crewai.files.processing.transformers import ( + chunk_pdf, + chunk_text, + get_image_dimensions, + get_pdf_page_count, + optimize_image, + resize_image, +) + + +def create_test_png(width: int = 100, height: int = 100) -> bytes: + """Create a minimal valid PNG for testing.""" + from PIL import Image + + img = Image.new("RGB", (width, height), color="red") + buffer = io.BytesIO() + img.save(buffer, format="PNG") + return buffer.getvalue() + + +def create_test_pdf(num_pages: int = 1) -> bytes: + """Create a minimal valid PDF for testing.""" + from pypdf import PdfWriter + + writer = PdfWriter() + for _ in range(num_pages): + writer.add_blank_page(width=612, height=792) + + buffer = io.BytesIO() + writer.write(buffer) + return buffer.getvalue() + + +class TestResizeImage: + """Tests for resize_image function.""" + + def test_resize_larger_image(self) -> None: + """Test resizing an image larger than max dimensions.""" + png_bytes = create_test_png(200, 150) + img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png")) + + result = resize_image(img, max_width=100, max_height=100) + + dims = get_image_dimensions(result) + assert dims is not None + width, height = dims + assert width <= 100 + assert height <= 100 + + def test_no_resize_if_within_bounds(self) -> None: + """Test that small images are returned unchanged.""" + png_bytes = create_test_png(50, 50) + img = ImageFile(source=FileBytes(data=png_bytes, filename="small.png")) + + result = resize_image(img, max_width=100, max_height=100) + + assert result is img + + def test_preserve_aspect_ratio(self) -> None: + """Test that aspect ratio is preserved during resize.""" + png_bytes = create_test_png(200, 100) + img = ImageFile(source=FileBytes(data=png_bytes, filename="wide.png")) + + result = resize_image(img, max_width=100, max_height=100) + + dims = get_image_dimensions(result) + assert dims is not None + width, height = dims + assert width == 100 + assert height == 50 + + def test_resize_without_aspect_ratio(self) -> None: + """Test resizing without preserving aspect ratio.""" + png_bytes = create_test_png(200, 100) + img = ImageFile(source=FileBytes(data=png_bytes, filename="wide.png")) + + result = resize_image( + img, max_width=50, max_height=50, preserve_aspect_ratio=False + ) + + dims = get_image_dimensions(result) + assert dims is not None + width, height = dims + assert width == 50 + assert height == 50 + + def test_resize_returns_image_file(self) -> None: + """Test that resize returns an ImageFile instance.""" + png_bytes = create_test_png(200, 200) + img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png")) + + result = resize_image(img, max_width=100, max_height=100) + + assert isinstance(result, ImageFile) + + def test_raises_without_pillow(self) -> None: + """Test that ProcessingDependencyError is raised without Pillow.""" + img = ImageFile(source=FileBytes(data=b"fake", filename="test.png")) + + with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}): + with pytest.raises(ProcessingDependencyError) as exc_info: + # Force reimport to trigger ImportError + import importlib + + import crewai.files.processing.transformers as t + + importlib.reload(t) + t.resize_image(img, 100, 100) + + assert "Pillow" in str(exc_info.value) + + +class TestOptimizeImage: + """Tests for optimize_image function.""" + + def 
test_optimize_reduces_size(self) -> None: + """Test that optimization reduces file size.""" + png_bytes = create_test_png(500, 500) + original_size = len(png_bytes) + img = ImageFile(source=FileBytes(data=png_bytes, filename="large.png")) + + result = optimize_image(img, target_size_bytes=original_size // 2) + + result_size = len(result.read()) + assert result_size < original_size + + def test_no_optimize_if_under_target(self) -> None: + """Test that small images are returned unchanged.""" + png_bytes = create_test_png(50, 50) + img = ImageFile(source=FileBytes(data=png_bytes, filename="small.png")) + + result = optimize_image(img, target_size_bytes=1024 * 1024) + + assert result is img + + def test_optimize_returns_image_file(self) -> None: + """Test that optimize returns an ImageFile instance.""" + png_bytes = create_test_png(200, 200) + img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png")) + + result = optimize_image(img, target_size_bytes=100) + + assert isinstance(result, ImageFile) + + def test_optimize_respects_min_quality(self) -> None: + """Test that optimization stops at minimum quality.""" + png_bytes = create_test_png(100, 100) + img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png")) + + # Request impossibly small size - should stop at min quality + result = optimize_image(img, target_size_bytes=10, min_quality=50) + + assert isinstance(result, ImageFile) + assert len(result.read()) > 10 + + +class TestChunkPdf: + """Tests for chunk_pdf function.""" + + def test_chunk_splits_large_pdf(self) -> None: + """Test that large PDFs are split into chunks.""" + pdf_bytes = create_test_pdf(num_pages=10) + pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="large.pdf")) + + result = list(chunk_pdf(pdf, max_pages=3)) + + assert len(result) == 4 + assert all(isinstance(chunk, PDFFile) for chunk in result) + + def test_no_chunk_if_within_limit(self) -> None: + """Test that small PDFs are returned unchanged.""" + pdf_bytes = create_test_pdf(num_pages=3) + pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="small.pdf")) + + result = list(chunk_pdf(pdf, max_pages=5)) + + assert len(result) == 1 + assert result[0] is pdf + + def test_chunk_filenames(self) -> None: + """Test that chunked files have indexed filenames.""" + pdf_bytes = create_test_pdf(num_pages=6) + pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="document.pdf")) + + result = list(chunk_pdf(pdf, max_pages=2)) + + assert result[0].filename == "document_chunk_0.pdf" + assert result[1].filename == "document_chunk_1.pdf" + assert result[2].filename == "document_chunk_2.pdf" + + def test_chunk_with_overlap(self) -> None: + """Test chunking with overlapping pages.""" + pdf_bytes = create_test_pdf(num_pages=10) + pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="doc.pdf")) + + result = list(chunk_pdf(pdf, max_pages=4, overlap_pages=1)) + + # With overlap, we get more chunks + assert len(result) >= 3 + + def test_chunk_page_counts(self) -> None: + """Test that each chunk has correct page count.""" + pdf_bytes = create_test_pdf(num_pages=7) + pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="doc.pdf")) + + result = list(chunk_pdf(pdf, max_pages=3)) + + page_counts = [get_pdf_page_count(chunk) for chunk in result] + assert page_counts == [3, 3, 1] + + +class TestChunkText: + """Tests for chunk_text function.""" + + def test_chunk_splits_large_text(self) -> None: + """Test that large text files are split into chunks.""" + content = "Hello world. 
" * 100 + text = TextFile(source=content.encode(), filename="large.txt") + + result = list(chunk_text(text, max_chars=200, overlap_chars=0)) + + assert len(result) > 1 + assert all(isinstance(chunk, TextFile) for chunk in result) + + def test_no_chunk_if_within_limit(self) -> None: + """Test that small text files are returned unchanged.""" + content = "Short text" + text = TextFile(source=content.encode(), filename="small.txt") + + result = list(chunk_text(text, max_chars=1000, overlap_chars=0)) + + assert len(result) == 1 + assert result[0] is text + + def test_chunk_filenames(self) -> None: + """Test that chunked files have indexed filenames.""" + content = "A" * 500 + text = TextFile(source=FileBytes(data=content.encode(), filename="data.txt")) + + result = list(chunk_text(text, max_chars=200, overlap_chars=0)) + + assert result[0].filename == "data_chunk_0.txt" + assert result[1].filename == "data_chunk_1.txt" + assert len(result) == 3 + + def test_chunk_preserves_extension(self) -> None: + """Test that file extension is preserved in chunks.""" + content = "A" * 500 + text = TextFile(source=FileBytes(data=content.encode(), filename="script.py")) + + result = list(chunk_text(text, max_chars=200, overlap_chars=0)) + + assert all(chunk.filename.endswith(".py") for chunk in result) + + def test_chunk_prefers_newline_boundaries(self) -> None: + """Test that chunking prefers to split at newlines.""" + content = "Line one\nLine two\nLine three\nLine four\nLine five" + text = TextFile(source=content.encode(), filename="lines.txt") + + result = list(chunk_text(text, max_chars=25, overlap_chars=0, split_on_newlines=True)) + + # Should split at newline boundaries + for chunk in result: + chunk_text_content = chunk.read().decode() + # Chunks should end at newlines (except possibly the last) + if chunk != result[-1]: + assert chunk_text_content.endswith("\n") or len(chunk_text_content) <= 25 + + def test_chunk_with_overlap(self) -> None: + """Test chunking with overlapping characters.""" + content = "ABCDEFGHIJ" * 10 + text = TextFile(source=content.encode(), filename="data.txt") + + result = list(chunk_text(text, max_chars=30, overlap_chars=5)) + + # With overlap, chunks should share some content + assert len(result) >= 3 + + def test_chunk_overlap_larger_than_max_chars(self) -> None: + """Test that overlap > max_chars doesn't cause infinite loop.""" + content = "A" * 100 + text = TextFile(source=content.encode(), filename="data.txt") + + # overlap_chars > max_chars should still work (just with max overlap) + result = list(chunk_text(text, max_chars=20, overlap_chars=50)) + + assert len(result) > 1 + # Should still complete without hanging + + +class TestGetImageDimensions: + """Tests for get_image_dimensions function.""" + + def test_get_dimensions(self) -> None: + """Test getting image dimensions.""" + png_bytes = create_test_png(150, 100) + img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png")) + + dims = get_image_dimensions(img) + + assert dims == (150, 100) + + def test_returns_none_for_invalid_image(self) -> None: + """Test that None is returned for invalid image data.""" + img = ImageFile(source=FileBytes(data=b"not an image", filename="bad.png")) + + dims = get_image_dimensions(img) + + assert dims is None + + def test_returns_none_without_pillow(self) -> None: + """Test that None is returned when Pillow is not installed.""" + png_bytes = create_test_png(100, 100) + img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png")) + + with patch.dict("sys.modules", 
{"PIL": None}): + # Can't easily test this without unloading module + # Just verify the function handles the case gracefully + pass + + +class TestGetPdfPageCount: + """Tests for get_pdf_page_count function.""" + + def test_get_page_count(self) -> None: + """Test getting PDF page count.""" + pdf_bytes = create_test_pdf(num_pages=5) + pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="test.pdf")) + + count = get_pdf_page_count(pdf) + + assert count == 5 + + def test_single_page(self) -> None: + """Test page count for single page PDF.""" + pdf_bytes = create_test_pdf(num_pages=1) + pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="single.pdf")) + + count = get_pdf_page_count(pdf) + + assert count == 1 + + def test_returns_none_for_invalid_pdf(self) -> None: + """Test that None is returned for invalid PDF data.""" + pdf = PDFFile(source=FileBytes(data=b"not a pdf", filename="bad.pdf")) + + count = get_pdf_page_count(pdf) + + assert count is None \ No newline at end of file diff --git a/lib/crewai/tests/utilities/files/processing/test_validators.py b/lib/crewai/tests/files/processing/test_validators.py similarity index 96% rename from lib/crewai/tests/utilities/files/processing/test_validators.py rename to lib/crewai/tests/files/processing/test_validators.py index e5f12840e..4b9d32294 100644 --- a/lib/crewai/tests/utilities/files/processing/test_validators.py +++ b/lib/crewai/tests/files/processing/test_validators.py @@ -2,19 +2,19 @@ import pytest -from crewai.utilities.files import FileBytes, ImageFile, PDFFile, TextFile -from crewai.utilities.files.processing.constraints import ( +from crewai.files import FileBytes, ImageFile, PDFFile, TextFile +from crewai.files.processing.constraints import ( ANTHROPIC_CONSTRAINTS, ImageConstraints, PDFConstraints, ProviderConstraints, ) -from crewai.utilities.files.processing.exceptions import ( +from crewai.files.processing.exceptions import ( FileTooLargeError, FileValidationError, UnsupportedFileTypeError, ) -from crewai.utilities.files.processing.validators import ( +from crewai.files.processing.validators import ( validate_file, validate_image, validate_pdf, diff --git a/lib/crewai/tests/utilities/files/test_resolved.py b/lib/crewai/tests/files/test_resolved.py similarity index 98% rename from lib/crewai/tests/utilities/files/test_resolved.py rename to lib/crewai/tests/files/test_resolved.py index 4a69184c6..d5101d2a1 100644 --- a/lib/crewai/tests/utilities/files/test_resolved.py +++ b/lib/crewai/tests/files/test_resolved.py @@ -4,7 +4,7 @@ from datetime import datetime, timezone import pytest -from crewai.utilities.files.resolved import ( +from crewai.files.resolved import ( FileReference, InlineBase64, InlineBytes, diff --git a/lib/crewai/tests/utilities/files/test_resolver.py b/lib/crewai/tests/files/test_resolver.py similarity index 96% rename from lib/crewai/tests/utilities/files/test_resolver.py rename to lib/crewai/tests/files/test_resolver.py index 643952e9b..dcc890848 100644 --- a/lib/crewai/tests/utilities/files/test_resolver.py +++ b/lib/crewai/tests/files/test_resolver.py @@ -2,14 +2,14 @@ import pytest -from crewai.utilities.files import FileBytes, ImageFile -from crewai.utilities.files.resolved import InlineBase64, InlineBytes -from crewai.utilities.files.resolver import ( +from crewai.files import FileBytes, ImageFile +from crewai.files.resolved import InlineBase64, InlineBytes +from crewai.files.resolver import ( FileResolver, FileResolverConfig, create_resolver, ) -from crewai.utilities.files.upload_cache import 
UploadCache +from crewai.files.upload_cache import UploadCache # Minimal valid PNG diff --git a/lib/crewai/tests/utilities/files/test_upload_cache.py b/lib/crewai/tests/files/test_upload_cache.py similarity index 98% rename from lib/crewai/tests/utilities/files/test_upload_cache.py rename to lib/crewai/tests/files/test_upload_cache.py index e3b8ebe72..7dd92268c 100644 --- a/lib/crewai/tests/utilities/files/test_upload_cache.py +++ b/lib/crewai/tests/files/test_upload_cache.py @@ -4,8 +4,8 @@ from datetime import datetime, timedelta, timezone import pytest -from crewai.utilities.files import FileBytes, ImageFile -from crewai.utilities.files.upload_cache import CachedUpload, UploadCache +from crewai.files import FileBytes, ImageFile +from crewai.files.upload_cache import CachedUpload, UploadCache # Minimal valid PNG diff --git a/lib/crewai/tests/fixtures/quarterly_report.csv b/lib/crewai/tests/fixtures/quarterly_report.csv new file mode 100644 index 000000000..20197ee2f --- /dev/null +++ b/lib/crewai/tests/fixtures/quarterly_report.csv @@ -0,0 +1,5 @@ +Quarter,Revenue ($M),Expenses ($M),Profit ($M) +Q1 2024,70,40,30 +Q2 2024,75,42,33 +Q3 2024,80,45,35 +Q4 2024,75,44,31 diff --git a/lib/crewai/tests/fixtures/revenue_chart.png b/lib/crewai/tests/fixtures/revenue_chart.png new file mode 100644 index 000000000..0c6606adb Binary files /dev/null and b/lib/crewai/tests/fixtures/revenue_chart.png differ diff --git a/lib/crewai/tests/fixtures/review_guidelines.txt b/lib/crewai/tests/fixtures/review_guidelines.txt new file mode 100644 index 000000000..e7c52116c --- /dev/null +++ b/lib/crewai/tests/fixtures/review_guidelines.txt @@ -0,0 +1,10 @@ +Review Guidelines + +1. Be clear and concise: Write feedback that is easy to understand. +2. Focus on behavior and outcomes: Describe what happened and why it matters. +3. Be specific: Provide examples to support your points. +4. Balance positives and improvements: Highlight strengths and areas to grow. +5. Be respectful and constructive: Assume positive intent and offer solutions. +6. Use objective criteria: Reference goals, metrics, or expectations where possible. +7. Suggest next steps: Recommend actionable ways to improve. +8. Proofread: Check tone, grammar, and clarity before submitting. 
diff --git a/lib/crewai/tests/llms/test_multimodal.py b/lib/crewai/tests/llms/test_multimodal.py index f84f6a270..7dfcfaabf 100644 --- a/lib/crewai/tests/llms/test_multimodal.py +++ b/lib/crewai/tests/llms/test_multimodal.py @@ -7,7 +7,7 @@ from unittest.mock import patch import pytest from crewai.llm import LLM -from crewai.utilities.files import ImageFile, PDFFile, TextFile +from crewai.files import ImageFile, PDFFile, TextFile # Check for optional provider dependencies try: diff --git a/lib/crewai/tests/llms/test_multimodal_integration.py b/lib/crewai/tests/llms/test_multimodal_integration.py index ec66365ac..db5d890c4 100644 --- a/lib/crewai/tests/llms/test_multimodal_integration.py +++ b/lib/crewai/tests/llms/test_multimodal_integration.py @@ -9,7 +9,7 @@ from pathlib import Path import pytest from crewai.llm import LLM -from crewai.utilities.files import File, ImageFile, PDFFile, TextFile +from crewai.files import File, ImageFile, PDFFile, TextFile # Path to test data files diff --git a/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py b/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py index 5f521f974..1e77618b2 100644 --- a/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py +++ b/lib/crewai/tests/tools/agent_tools/test_read_file_tool.py @@ -5,7 +5,7 @@ import base64 import pytest from crewai.tools.agent_tools.read_file_tool import ReadFileTool -from crewai.utilities.files import ImageFile, PDFFile, TextFile +from crewai.files import ImageFile, PDFFile, TextFile class TestReadFileTool: diff --git a/lib/crewai/tests/utilities/test_file_store.py b/lib/crewai/tests/utilities/test_file_store.py index 30049f3cb..8d248b820 100644 --- a/lib/crewai/tests/utilities/test_file_store.py +++ b/lib/crewai/tests/utilities/test_file_store.py @@ -13,7 +13,7 @@ from crewai.utilities.file_store import ( store_files, store_task_files, ) -from crewai.utilities.files import TextFile +from crewai.files import TextFile class TestFileStore: diff --git a/lib/crewai/tests/utilities/test_files.py b/lib/crewai/tests/utilities/test_files.py index addd8bcaf..6306b1de6 100644 --- a/lib/crewai/tests/utilities/test_files.py +++ b/lib/crewai/tests/utilities/test_files.py @@ -6,7 +6,7 @@ from pathlib import Path import pytest -from crewai.utilities.files import ( +from crewai.files import ( AudioFile, File, FileBytes, @@ -20,7 +20,7 @@ from crewai.utilities.files import ( normalize_input_files, wrap_file_source, ) -from crewai.utilities.files.file import detect_content_type +from crewai.files.file import detect_content_type class TestDetectContentType: @@ -34,7 +34,7 @@ class TestDetectContentType: def test_detect_json(self) -> None: """Test detection of JSON content.""" result = detect_content_type(b'{"key": "value"}') - assert result in ("text/plain", "application/json") + assert result == "application/json" def test_detect_png(self) -> None: """Test detection of PNG content.""" diff --git a/pyproject.toml b/pyproject.toml index de3f03ecb..975e05b80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dev = [ "boto3-stubs[bedrock-runtime]==1.40.54", "types-psycopg2==2.9.21.20251012", "types-pymysql==1.1.0.20250916", + "types-aiofiles~=24.1.0", ] diff --git a/uv.lock b/uv.lock index c1bf877c5..7bf50ac53 100644 --- a/uv.lock +++ b/uv.lock @@ -50,6 +50,7 @@ dev = [ { name = "pytest-timeout", specifier = "==2.4.0" }, { name = "pytest-xdist", specifier = "==3.8.0" }, { name = "ruff", specifier = "==0.14.7" }, + { name = "types-aiofiles", specifier = "~=24.1.0" }, { name = 
"types-appdirs", specifier = "==1.4.*" }, { name = "types-psycopg2", specifier = "==2.9.21.20251012" }, { name = "types-pymysql", specifier = "==1.1.0.20250916" }, @@ -131,11 +132,11 @@ redis = [ [[package]] name = "aiofiles" -version = "25.1.0" +version = "24.1.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/41/c3/534eac40372d8ee36ef40df62ec129bee4fdb5ad9706e58a29be53b2c970/aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2", size = 46354, upload-time = "2025-10-09T20:51:04.358Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/03/a88171e277e8caa88a4c77808c20ebb04ba74cc4681bf1e9416c862de237/aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c", size = 30247, upload-time = "2024-06-24T11:02:03.584Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bc/8a/340a1555ae33d7354dbca4faa54948d76d89a27ceef032c8c3bc661d003e/aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695", size = 14668, upload-time = "2025-10-09T20:51:03.174Z" }, + { url = "https://files.pythonhosted.org/packages/a5/45/30bb92d442636f570cb5651bc661f52b610e2eec3f891a5dc3a4c3667db0/aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5", size = 15896, upload-time = "2024-06-24T11:02:01.529Z" }, ] [[package]] @@ -1201,6 +1202,7 @@ embeddings = [ ] file-processing = [ { name = "aiocache" }, + { name = "aiofiles" }, { name = "pillow" }, { name = "pypdf" }, { name = "python-magic" }, @@ -1239,6 +1241,7 @@ requires-dist = [ { name = "aiobotocore", marker = "extra == 'aws'", specifier = "~=2.25.2" }, { name = "aiocache", marker = "extra == 'file-processing'", specifier = "~=0.12.3" }, { name = "aiocache", extras = ["memcached", "redis"], marker = "extra == 'a2a'", specifier = "~=0.12.3" }, + { name = "aiofiles", marker = "extra == 'file-processing'", specifier = "~=24.1.0" }, { name = "aiosqlite", specifier = "~=0.21.0" }, { name = "anthropic", marker = "extra == 'anthropic'", specifier = "~=0.71.0" }, { name = "appdirs", specifier = "~=1.4.4" }, @@ -8210,6 +8213,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/22/35617eee79080a5d071d0f14ad698d325ee6b3bf824fc0467c03b30e7fa8/typer-0.19.2-py3-none-any.whl", hash = "sha256:755e7e19670ffad8283db353267cb81ef252f595aa6834a0d1ca9312d9326cb9", size = 46748, upload-time = "2025-09-23T09:47:46.777Z" }, ] +[[package]] +name = "types-aiofiles" +version = "24.1.0.20250822" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/48/c64471adac9206cc844afb33ed311ac5a65d2f59df3d861e0f2d0cad7414/types_aiofiles-24.1.0.20250822.tar.gz", hash = "sha256:9ab90d8e0c307fe97a7cf09338301e3f01a163e39f3b529ace82466355c84a7b", size = 14484, upload-time = "2025-08-22T03:02:23.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/8e/5e6d2215e1d8f7c2a94c6e9d0059ae8109ce0f5681956d11bb0a228cef04/types_aiofiles-24.1.0.20250822-py3-none-any.whl", hash = "sha256:0ec8f8909e1a85a5a79aed0573af7901f53120dd2a29771dd0b3ef48e12328b0", size = 14322, upload-time = "2025-08-22T03:02:21.918Z" }, +] + [[package]] name = "types-appdirs" version = "1.4.3.5"