Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-27 09:08:14 +00:00

feat: promote files to first-class crewai.files package
@@ -103,6 +103,7 @@ file-processing = [
    "pypdf~=4.0.0",
    "python-magic>=0.4.27",
    "aiocache~=0.12.3",
    "aiofiles~=24.1.0",
]
@@ -6,6 +6,14 @@ import warnings
from crewai.agent.core import Agent
from crewai.crew import Crew
from crewai.crews.crew_output import CrewOutput
from crewai.files import (
    AudioFile,
    File,
    ImageFile,
    PDFFile,
    TextFile,
    VideoFile,
)
from crewai.flow.flow import Flow
from crewai.knowledge.knowledge import Knowledge
from crewai.llm import LLM
@@ -15,14 +23,6 @@ from crewai.task import Task
from crewai.tasks.llm_guardrail import LLMGuardrail
from crewai.tasks.task_output import TaskOutput
from crewai.telemetry.telemetry import Telemetry
from crewai.utilities.files import (
    AudioFile,
    File,
    ImageFile,
    PDFFile,
    TextFile,
    VideoFile,
)


def _suppress_pydantic_deprecation_warnings() -> None:
@@ -24,6 +24,7 @@ from crewai.events.types.logging_events import (
    AgentLogsExecutionEvent,
    AgentLogsStartedEvent,
)
from crewai.files import FileProcessor
from crewai.hooks.llm_hooks import (
    get_after_llm_call_hooks,
    get_before_llm_call_hooks,
@@ -44,7 +45,6 @@ from crewai.utilities.agent_utils import (
)
from crewai.utilities.constants import TRAINING_DATA_FILE
from crewai.utilities.file_store import get_all_files
from crewai.utilities.files import FileProcessor
from crewai.utilities.i18n import I18N, get_i18n
from crewai.utilities.printer import Printer
from crewai.utilities.tool_utils import (
@@ -238,7 +238,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
        processor = FileProcessor(constraints=provider)
        files = processor.process_files(files)

        from crewai.utilities.files import get_upload_cache
        from crewai.files import get_upload_cache

        upload_cache = get_upload_cache()
        content_blocks = self.llm.format_multimodal_content(
@@ -258,6 +258,48 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                    ]
                break

    async def _ainject_multimodal_files(self) -> None:
        """Async inject files as multimodal content into messages.

        For crews with input files and LLMs that support multimodal,
        processes files according to provider constraints using parallel processing,
        then delegates to the LLM's aformat_multimodal_content method to
        generate provider-specific content blocks with parallel file resolution.
        """
        if not self.crew or not self.task:
            return

        if not self.llm.supports_multimodal():
            return

        files = get_all_files(self.crew.id, self.task.id)
        if not files:
            return

        provider = getattr(self.llm, "provider", None) or getattr(self.llm, "model", "")
        processor = FileProcessor(constraints=provider)
        files = await processor.aprocess_files(files)

        from crewai.files import get_upload_cache

        upload_cache = get_upload_cache()
        content_blocks = await self.llm.aformat_multimodal_content(
            files, upload_cache=upload_cache
        )
        if not content_blocks:
            return

        for i in range(len(self.messages) - 1, -1, -1):
            msg = self.messages[i]
            if msg.get("role") == "user":
                existing_content = msg.get("content", "")
                if isinstance(existing_content, str):
                    msg["content"] = [
                        self.llm.format_text_content(existing_content),
                        *content_blocks,
                    ]
                break

    def _invoke_loop(self) -> AgentFinish:
        """Execute agent loop until completion.

@@ -401,7 +443,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
        user_prompt = self._format_prompt(self.prompt.get("prompt", ""), inputs)
        self.messages.append(format_message_for_llm(user_prompt))

        self._inject_multimodal_files()
        await self._ainject_multimodal_files()

        self._show_start_logs()

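As a rough usage sketch (not part of this diff), the async injection path boils down to: process the input files against the provider's constraints, then let the LLM format them into content blocks using the shared upload cache. The helper name below is hypothetical; the imported names come from the new crewai.files package shown in this commit.

from crewai.files import FileProcessor, get_upload_cache

async def build_content_blocks(llm, files: dict) -> list:
    # Pick the provider key the same way the executor does.
    provider = getattr(llm, "provider", None) or getattr(llm, "model", "")
    processor = FileProcessor(constraints=provider)
    # Files are validated/resized/chunked in parallel before formatting.
    processed = await processor.aprocess_files(files)
    upload_cache = get_upload_cache()
    return await llm.aformat_multimodal_content(processed, upload_cache=upload_cache)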
@@ -8,16 +8,16 @@ from typing import TYPE_CHECKING, Any

from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.crews.crew_output import CrewOutput
from crewai.rag.embeddings.types import EmbedderConfig
from crewai.types.streaming import CrewStreamingOutput, FlowStreamingOutput
from crewai.utilities.file_store import store_files
from crewai.utilities.files import (
from crewai.files import (
    AudioFile,
    ImageFile,
    PDFFile,
    TextFile,
    VideoFile,
)
from crewai.rag.embeddings.types import EmbedderConfig
from crewai.types.streaming import CrewStreamingOutput, FlowStreamingOutput
from crewai.utilities.file_store import store_files
from crewai.utilities.streaming import (
    StreamingState,
    TaskInfo,
lib/crewai/src/crewai/files/__init__.py (new file, 207 lines)
@@ -0,0 +1,207 @@
"""File handling utilities for crewAI tasks."""

from crewai.files.cleanup import (
    cleanup_expired_files,
    cleanup_provider_files,
    cleanup_uploaded_files,
)
from crewai.files.content_types import (
    AudioContentType,
    AudioExtension,
    AudioFile,
    BaseFile,
    File,
    FileMode,
    ImageContentType,
    ImageExtension,
    ImageFile,
    PDFContentType,
    PDFExtension,
    PDFFile,
    TextContentType,
    TextExtension,
    TextFile,
    VideoContentType,
    VideoExtension,
    VideoFile,
)
from crewai.files.file import (
    FileBytes,
    FilePath,
    FileSource,
    FileSourceInput,
    FileStream,
    RawFileInput,
)
from crewai.files.processing import (
    ANTHROPIC_CONSTRAINTS,
    BEDROCK_CONSTRAINTS,
    GEMINI_CONSTRAINTS,
    OPENAI_CONSTRAINTS,
    AudioConstraints,
    FileHandling,
    FileProcessingError,
    FileProcessor,
    FileTooLargeError,
    FileValidationError,
    ImageConstraints,
    PDFConstraints,
    ProcessingDependencyError,
    ProviderConstraints,
    UnsupportedFileTypeError,
    VideoConstraints,
    get_constraints_for_provider,
)
from crewai.files.resolved import (
    FileReference,
    InlineBase64,
    InlineBytes,
    ResolvedFile,
    ResolvedFileType,
    UrlReference,
)
from crewai.files.resolver import (
    FileResolver,
    FileResolverConfig,
    create_resolver,
)
from crewai.files.upload_cache import (
    CachedUpload,
    UploadCache,
    get_upload_cache,
    reset_upload_cache,
)
from crewai.files.uploaders import FileUploader, UploadResult, get_uploader


FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile


def wrap_file_source(source: FileSource) -> FileInput:
    """Wrap a FileSource in the appropriate typed FileInput wrapper.

    Args:
        source: The file source to wrap.

    Returns:
        Typed FileInput wrapper based on content type.
    """
    content_type = source.content_type

    if content_type.startswith("image/"):
        return ImageFile(source=source)
    if content_type.startswith("audio/"):
        return AudioFile(source=source)
    if content_type.startswith("video/"):
        return VideoFile(source=source)
    if content_type == "application/pdf":
        return PDFFile(source=source)
    return TextFile(source=source)


def normalize_input_files(
    input_files: list[FileSourceInput | FileInput],
) -> dict[str, FileInput]:
    """Convert a list of file sources to a named dictionary of FileInputs.

    Args:
        input_files: List of file source inputs or File objects.

    Returns:
        Dictionary mapping names to FileInput wrappers.
    """
    from pathlib import Path

    result: dict[str, FileInput] = {}

    for i, item in enumerate(input_files):
        if isinstance(item, BaseFile):
            name = item.filename or f"file_{i}"
            if "." in name:
                name = name.rsplit(".", 1)[0]
            result[name] = item
            continue

        file_source: FilePath | FileBytes | FileStream
        if isinstance(item, (FilePath, FileBytes, FileStream)):
            file_source = item
        elif isinstance(item, Path):
            file_source = FilePath(path=item)
        elif isinstance(item, str):
            file_source = FilePath(path=Path(item))
        elif isinstance(item, (bytes, memoryview)):
            file_source = FileBytes(data=bytes(item))
        else:
            continue

        name = file_source.filename or f"file_{i}"
        result[name] = wrap_file_source(file_source)

    return result


__all__ = [
    "ANTHROPIC_CONSTRAINTS",
    "BEDROCK_CONSTRAINTS",
    "GEMINI_CONSTRAINTS",
    "OPENAI_CONSTRAINTS",
    "AudioConstraints",
    "AudioContentType",
    "AudioExtension",
    "AudioFile",
    "BaseFile",
    "CachedUpload",
    "File",
    "FileBytes",
    "FileHandling",
    "FileInput",
    "FileMode",
    "FilePath",
    "FileProcessingError",
    "FileProcessor",
    "FileReference",
    "FileResolver",
    "FileResolverConfig",
    "FileSource",
    "FileSourceInput",
    "FileStream",
    "FileTooLargeError",
    "FileUploader",
    "FileValidationError",
    "ImageConstraints",
    "ImageContentType",
    "ImageExtension",
    "ImageFile",
    "InlineBase64",
    "InlineBytes",
    "PDFConstraints",
    "PDFContentType",
    "PDFExtension",
    "PDFFile",
    "ProcessingDependencyError",
    "ProviderConstraints",
    "RawFileInput",
    "ResolvedFile",
    "ResolvedFileType",
    "TextContentType",
    "TextExtension",
    "TextFile",
    "UnsupportedFileTypeError",
    "UploadCache",
    "UploadResult",
    "UrlReference",
    "VideoConstraints",
    "VideoContentType",
    "VideoExtension",
    "VideoFile",
    "cleanup_expired_files",
    "cleanup_provider_files",
    "cleanup_uploaded_files",
    "create_resolver",
    "get_constraints_for_provider",
    "get_upload_cache",
    "get_uploader",
    "normalize_input_files",
    "reset_upload_cache",
    "wrap_file_source",
]
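A short usage sketch of the public helpers exported above (not part of this diff). The file paths are placeholders and must exist on disk, since FilePath validates existence and python-magic inspects the bytes to pick a wrapper type.

from pathlib import Path

from crewai.files import normalize_input_files

# Mixed raw inputs: a path string, a Path object, and raw bytes.
files = normalize_input_files(
    ["report.pdf", Path("diagram.png"), b"plain text payload"]
)
# Keys come from the filename with its extension stripped, or fall back to "file_{i}".
for name, wrapped in files.items():
    print(name, type(wrapped).__name__)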
lib/crewai/src/crewai/files/cleanup.py (new file, 368 lines)
@@ -0,0 +1,368 @@
|
||||
"""Cleanup utilities for uploaded files."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from crewai.files.upload_cache import CachedUpload, UploadCache
|
||||
from crewai.files.uploaders import get_uploader
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from crewai.files.uploaders.base import FileUploader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _safe_delete(
|
||||
uploader: FileUploader,
|
||||
file_id: str,
|
||||
provider: str,
|
||||
) -> bool:
|
||||
"""Safely delete a file, logging any errors.
|
||||
|
||||
Args:
|
||||
uploader: The file uploader to use.
|
||||
file_id: The file ID to delete.
|
||||
provider: Provider name for logging.
|
||||
|
||||
Returns:
|
||||
True if deleted successfully, False otherwise.
|
||||
"""
|
||||
try:
|
||||
if uploader.delete(file_id):
|
||||
logger.debug(f"Deleted {file_id} from {provider}")
|
||||
return True
|
||||
logger.warning(f"Failed to delete {file_id} from {provider}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"Error deleting {file_id} from {provider}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def cleanup_uploaded_files(
|
||||
cache: UploadCache,
|
||||
*,
|
||||
delete_from_provider: bool = True,
|
||||
providers: list[str] | None = None,
|
||||
) -> int:
|
||||
"""Clean up uploaded files from the cache and optionally from providers.
|
||||
|
||||
Args:
|
||||
cache: The upload cache to clean up.
|
||||
delete_from_provider: If True, delete files from the provider as well.
|
||||
providers: Optional list of providers to clean up. If None, cleans all.
|
||||
|
||||
Returns:
|
||||
Number of files cleaned up.
|
||||
"""
|
||||
cleaned = 0
|
||||
|
||||
provider_uploads: dict[str, list[CachedUpload]] = {}
|
||||
|
||||
for provider in _get_providers_from_cache(cache):
|
||||
if providers is not None and provider not in providers:
|
||||
continue
|
||||
provider_uploads[provider] = cache.get_all_for_provider(provider)
|
||||
|
||||
if delete_from_provider:
|
||||
for provider, uploads in provider_uploads.items():
|
||||
uploader = get_uploader(provider)
|
||||
if uploader is None:
|
||||
logger.warning(
|
||||
f"No uploader available for {provider}, skipping cleanup"
|
||||
)
|
||||
continue
|
||||
|
||||
for upload in uploads:
|
||||
if _safe_delete(uploader, upload.file_id, provider):
|
||||
cleaned += 1
|
||||
|
||||
cache.clear()
|
||||
|
||||
logger.info(f"Cleaned up {cleaned} uploaded files")
|
||||
return cleaned
|
||||
|
||||
|
||||
def cleanup_expired_files(
|
||||
cache: UploadCache,
|
||||
*,
|
||||
delete_from_provider: bool = False,
|
||||
) -> int:
|
||||
"""Clean up expired files from the cache.
|
||||
|
||||
Args:
|
||||
cache: The upload cache to clean up.
|
||||
delete_from_provider: If True, attempt to delete from provider as well.
|
||||
Note: Expired files may already be deleted by the provider.
|
||||
|
||||
Returns:
|
||||
Number of expired entries removed from cache.
|
||||
"""
|
||||
expired_entries: list[CachedUpload] = []
|
||||
|
||||
if delete_from_provider:
|
||||
for provider in _get_providers_from_cache(cache):
|
||||
expired_entries.extend(
|
||||
upload
|
||||
for upload in cache.get_all_for_provider(provider)
|
||||
if upload.is_expired()
|
||||
)
|
||||
|
||||
removed = cache.clear_expired()
|
||||
|
||||
if delete_from_provider:
|
||||
for upload in expired_entries:
|
||||
uploader = get_uploader(upload.provider)
|
||||
if uploader is not None:
|
||||
try:
|
||||
uploader.delete(upload.file_id)
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not delete expired file {upload.file_id}: {e}")
|
||||
|
||||
return removed
|
||||
|
||||
|
||||
def cleanup_provider_files(
|
||||
provider: str,
|
||||
*,
|
||||
cache: UploadCache | None = None,
|
||||
delete_all_from_provider: bool = False,
|
||||
) -> int:
|
||||
"""Clean up all files for a specific provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name to clean up.
|
||||
cache: Optional upload cache to clear entries from.
|
||||
delete_all_from_provider: If True, delete all files from the provider,
|
||||
not just cached ones.
|
||||
|
||||
Returns:
|
||||
Number of files deleted.
|
||||
"""
|
||||
deleted = 0
|
||||
uploader = get_uploader(provider)
|
||||
|
||||
if uploader is None:
|
||||
logger.warning(f"No uploader available for {provider}")
|
||||
return 0
|
||||
|
||||
if delete_all_from_provider:
|
||||
try:
|
||||
files = uploader.list_files()
|
||||
for file_info in files:
|
||||
file_id = file_info.get("id") or file_info.get("name")
|
||||
if file_id and uploader.delete(file_id):
|
||||
deleted += 1
|
||||
except Exception as e:
|
||||
logger.warning(f"Error listing/deleting files from {provider}: {e}")
|
||||
elif cache is not None:
|
||||
uploads = cache.get_all_for_provider(provider)
|
||||
for upload in uploads:
|
||||
if _safe_delete(uploader, upload.file_id, provider):
|
||||
deleted += 1
|
||||
cache.remove_by_file_id(upload.file_id, provider)
|
||||
|
||||
logger.info(f"Deleted {deleted} files from {provider}")
|
||||
return deleted
|
||||
|
||||
|
||||
def _get_providers_from_cache(cache: UploadCache) -> set[str]:
|
||||
"""Get unique provider names from cache entries.
|
||||
|
||||
Args:
|
||||
cache: The upload cache.
|
||||
|
||||
Returns:
|
||||
Set of provider names.
|
||||
"""
|
||||
return cache.get_providers()
|
||||
|
||||
|
||||
async def _asafe_delete(
|
||||
uploader: FileUploader,
|
||||
file_id: str,
|
||||
provider: str,
|
||||
) -> bool:
|
||||
"""Async safely delete a file, logging any errors.
|
||||
|
||||
Args:
|
||||
uploader: The file uploader to use.
|
||||
file_id: The file ID to delete.
|
||||
provider: Provider name for logging.
|
||||
|
||||
Returns:
|
||||
True if deleted successfully, False otherwise.
|
||||
"""
|
||||
try:
|
||||
if await uploader.adelete(file_id):
|
||||
logger.debug(f"Deleted {file_id} from {provider}")
|
||||
return True
|
||||
logger.warning(f"Failed to delete {file_id} from {provider}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"Error deleting {file_id} from {provider}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def acleanup_uploaded_files(
|
||||
cache: UploadCache,
|
||||
*,
|
||||
delete_from_provider: bool = True,
|
||||
providers: list[str] | None = None,
|
||||
max_concurrency: int = 10,
|
||||
) -> int:
|
||||
"""Async clean up uploaded files from the cache and optionally from providers.
|
||||
|
||||
Args:
|
||||
cache: The upload cache to clean up.
|
||||
delete_from_provider: If True, delete files from the provider as well.
|
||||
providers: Optional list of providers to clean up. If None, cleans all.
|
||||
max_concurrency: Maximum number of concurrent delete operations.
|
||||
|
||||
Returns:
|
||||
Number of files cleaned up.
|
||||
"""
|
||||
cleaned = 0
|
||||
|
||||
provider_uploads: dict[str, list[CachedUpload]] = {}
|
||||
|
||||
for provider in _get_providers_from_cache(cache):
|
||||
if providers is not None and provider not in providers:
|
||||
continue
|
||||
provider_uploads[provider] = await cache.aget_all_for_provider(provider)
|
||||
|
||||
if delete_from_provider:
|
||||
semaphore = asyncio.Semaphore(max_concurrency)
|
||||
|
||||
async def delete_one(uploader: FileUploader, upload: CachedUpload) -> bool:
|
||||
async with semaphore:
|
||||
return await _asafe_delete(uploader, upload.file_id, upload.provider)
|
||||
|
||||
tasks: list[asyncio.Task[bool]] = []
|
||||
for provider, uploads in provider_uploads.items():
|
||||
uploader = get_uploader(provider)
|
||||
if uploader is None:
|
||||
logger.warning(
|
||||
f"No uploader available for {provider}, skipping cleanup"
|
||||
)
|
||||
continue
|
||||
|
||||
tasks.extend(
|
||||
asyncio.create_task(delete_one(uploader, upload)) for upload in uploads
|
||||
)
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
cleaned = sum(1 for r in results if r is True)
|
||||
|
||||
await cache.aclear()
|
||||
|
||||
logger.info(f"Cleaned up {cleaned} uploaded files")
|
||||
return cleaned
|
||||
|
||||
|
||||
async def acleanup_expired_files(
|
||||
cache: UploadCache,
|
||||
*,
|
||||
delete_from_provider: bool = False,
|
||||
max_concurrency: int = 10,
|
||||
) -> int:
|
||||
"""Async clean up expired files from the cache.
|
||||
|
||||
Args:
|
||||
cache: The upload cache to clean up.
|
||||
delete_from_provider: If True, attempt to delete from provider as well.
|
||||
max_concurrency: Maximum number of concurrent delete operations.
|
||||
|
||||
Returns:
|
||||
Number of expired entries removed from cache.
|
||||
"""
|
||||
expired_entries: list[CachedUpload] = []
|
||||
|
||||
if delete_from_provider:
|
||||
for provider in _get_providers_from_cache(cache):
|
||||
uploads = await cache.aget_all_for_provider(provider)
|
||||
expired_entries.extend(upload for upload in uploads if upload.is_expired())
|
||||
|
||||
removed = await cache.aclear_expired()
|
||||
|
||||
if delete_from_provider and expired_entries:
|
||||
semaphore = asyncio.Semaphore(max_concurrency)
|
||||
|
||||
async def delete_expired(upload: CachedUpload) -> None:
|
||||
async with semaphore:
|
||||
uploader = get_uploader(upload.provider)
|
||||
if uploader is not None:
|
||||
try:
|
||||
await uploader.adelete(upload.file_id)
|
||||
except Exception as e:
|
||||
logger.debug(
|
||||
f"Could not delete expired file {upload.file_id}: {e}"
|
||||
)
|
||||
|
||||
await asyncio.gather(
|
||||
*[delete_expired(upload) for upload in expired_entries],
|
||||
return_exceptions=True,
|
||||
)
|
||||
|
||||
return removed
|
||||
|
||||
|
||||
async def acleanup_provider_files(
|
||||
provider: str,
|
||||
*,
|
||||
cache: UploadCache | None = None,
|
||||
delete_all_from_provider: bool = False,
|
||||
max_concurrency: int = 10,
|
||||
) -> int:
|
||||
"""Async clean up all files for a specific provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name to clean up.
|
||||
cache: Optional upload cache to clear entries from.
|
||||
delete_all_from_provider: If True, delete all files from the provider.
|
||||
max_concurrency: Maximum number of concurrent delete operations.
|
||||
|
||||
Returns:
|
||||
Number of files deleted.
|
||||
"""
|
||||
deleted = 0
|
||||
uploader = get_uploader(provider)
|
||||
|
||||
if uploader is None:
|
||||
logger.warning(f"No uploader available for {provider}")
|
||||
return 0
|
||||
|
||||
semaphore = asyncio.Semaphore(max_concurrency)
|
||||
|
||||
async def delete_file(file_id: str) -> bool:
|
||||
async with semaphore:
|
||||
return await uploader.adelete(file_id)
|
||||
|
||||
if delete_all_from_provider:
|
||||
try:
|
||||
files = uploader.list_files()
|
||||
tasks = []
|
||||
for file_info in files:
|
||||
file_id = file_info.get("id") or file_info.get("name")
|
||||
if file_id:
|
||||
tasks.append(delete_file(file_id))
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
deleted = sum(1 for r in results if r is True)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error listing/deleting files from {provider}: {e}")
|
||||
elif cache is not None:
|
||||
uploads = await cache.aget_all_for_provider(provider)
|
||||
tasks = []
|
||||
for upload in uploads:
|
||||
tasks.append(delete_file(upload.file_id))
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
for upload, result in zip(uploads, results, strict=False):
|
||||
if result is True:
|
||||
deleted += 1
|
||||
await cache.aremove_by_file_id(upload.file_id, provider)
|
||||
|
||||
logger.info(f"Deleted {deleted} files from {provider}")
|
||||
return deleted
|
||||
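A minimal sketch (not part of this diff) of driving the cleanup helpers above against the process-wide cache; whether anything is actually deleted depends on what has been uploaded and which provider uploaders are available.

from crewai.files import cleanup_expired_files, cleanup_uploaded_files, get_upload_cache

cache = get_upload_cache()

# Drop cache entries whose provider-side uploads have expired.
cleanup_expired_files(cache)

# Delete every cached upload from its provider and clear the cache.
removed = cleanup_uploaded_files(cache, delete_from_provider=True)
print(f"removed {removed} uploads")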
@@ -11,7 +11,8 @@ from pydantic import BaseModel, Field, GetCoreSchemaHandler
from pydantic_core import CoreSchema, core_schema
from typing_extensions import TypeIs

from crewai.utilities.files.file import (
from crewai.files.file import (
    AsyncFileStream,
    FileBytes,
    FilePath,
    FileSource,
@@ -185,7 +186,18 @@ class BaseFile(ABC, BaseModel):

    def read(self) -> bytes:
        """Read the file content as bytes."""
        return self._file_source.read()
        return self._file_source.read()  # type: ignore[union-attr]

    async def aread(self) -> bytes:
        """Async read the file content as bytes.

        Raises:
            TypeError: If the underlying source doesn't support async read.
        """
        source = self._file_source
        if isinstance(source, (FilePath, FileBytes, AsyncFileStream)):
            return await source.aread()
        raise TypeError(f"{type(source).__name__} does not support async read")

    def read_text(self, encoding: str = "utf-8") -> str:
        """Read the file content as string."""
lib/crewai/src/crewai/files/file.py (new file, 377 lines)
@@ -0,0 +1,377 @@
|
||||
"""Base file class for handling file inputs in tasks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import AsyncIterator, Iterator
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, BinaryIO, Protocol, cast, runtime_checkable
|
||||
|
||||
import aiofiles
|
||||
import magic
|
||||
from pydantic import (
|
||||
BaseModel,
|
||||
BeforeValidator,
|
||||
Field,
|
||||
GetCoreSchemaHandler,
|
||||
PrivateAttr,
|
||||
model_validator,
|
||||
)
|
||||
from pydantic_core import CoreSchema, core_schema
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class AsyncReadable(Protocol):
|
||||
"""Protocol for async readable streams."""
|
||||
|
||||
async def read(self, size: int = -1) -> bytes: ...
|
||||
|
||||
|
||||
class _AsyncReadableValidator:
|
||||
"""Pydantic validator for AsyncReadable types."""
|
||||
|
||||
@classmethod
|
||||
def __get_pydantic_core_schema__(
|
||||
cls, _source_type: Any, _handler: GetCoreSchemaHandler
|
||||
) -> CoreSchema:
|
||||
return core_schema.no_info_plain_validator_function(
|
||||
cls._validate,
|
||||
serialization=core_schema.plain_serializer_function_ser_schema(
|
||||
lambda x: None, info_arg=False
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _validate(value: Any) -> AsyncReadable:
|
||||
if isinstance(value, AsyncReadable):
|
||||
return value
|
||||
raise ValueError("Expected an async readable object with async read() method")
|
||||
|
||||
|
||||
ValidatedAsyncReadable = Annotated[AsyncReadable, _AsyncReadableValidator()]
|
||||
|
||||
DEFAULT_MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500MB
|
||||
|
||||
|
||||
def detect_content_type(data: bytes) -> str:
|
||||
"""Detect MIME type from file content.
|
||||
|
||||
Args:
|
||||
data: Raw bytes to analyze.
|
||||
|
||||
Returns:
|
||||
The detected MIME type.
|
||||
"""
|
||||
result: str = magic.from_buffer(data, mime=True)
|
||||
return result
|
||||
|
||||
|
||||
class _BinaryIOValidator:
|
||||
"""Pydantic validator for BinaryIO types."""
|
||||
|
||||
@classmethod
|
||||
def __get_pydantic_core_schema__(
|
||||
cls, _source_type: Any, _handler: GetCoreSchemaHandler
|
||||
) -> CoreSchema:
|
||||
return core_schema.no_info_plain_validator_function(
|
||||
cls._validate,
|
||||
serialization=core_schema.plain_serializer_function_ser_schema(
|
||||
lambda x: None, info_arg=False
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _validate(value: Any) -> BinaryIO:
|
||||
if hasattr(value, "read") and hasattr(value, "seek"):
|
||||
return cast(BinaryIO, value)
|
||||
raise ValueError("Expected a binary file-like object with read() and seek()")
|
||||
|
||||
|
||||
ValidatedBinaryIO = Annotated[BinaryIO, _BinaryIOValidator()]
|
||||
|
||||
|
||||
class FilePath(BaseModel):
|
||||
"""File loaded from a filesystem path."""
|
||||
|
||||
path: Path = Field(description="Path to the file on the filesystem.")
|
||||
max_size_bytes: int = Field(
|
||||
default=DEFAULT_MAX_FILE_SIZE_BYTES,
|
||||
exclude=True,
|
||||
description="Maximum file size in bytes.",
|
||||
)
|
||||
_content: bytes | None = PrivateAttr(default=None)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _validate_file_exists(self) -> FilePath:
|
||||
"""Validate that the file exists, is secure, and within size limits."""
|
||||
from crewai.files.processing.exceptions import FileTooLargeError
|
||||
|
||||
path_str = str(self.path)
|
||||
if ".." in path_str:
|
||||
raise ValueError(f"Path traversal not allowed: {self.path}")
|
||||
|
||||
if self.path.is_symlink():
|
||||
resolved = self.path.resolve()
|
||||
cwd = Path.cwd().resolve()
|
||||
if not str(resolved).startswith(str(cwd)):
|
||||
raise ValueError(f"Symlink escapes allowed directory: {self.path}")
|
||||
|
||||
if not self.path.exists():
|
||||
raise ValueError(f"File not found: {self.path}")
|
||||
if not self.path.is_file():
|
||||
raise ValueError(f"Path is not a file: {self.path}")
|
||||
|
||||
actual_size = self.path.stat().st_size
|
||||
if actual_size > self.max_size_bytes:
|
||||
raise FileTooLargeError(
|
||||
f"File exceeds max size ({actual_size} > {self.max_size_bytes})",
|
||||
file_name=str(self.path),
|
||||
actual_size=actual_size,
|
||||
max_size=self.max_size_bytes,
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
@property
|
||||
def filename(self) -> str:
|
||||
"""Get the filename from the path."""
|
||||
return self.path.name
|
||||
|
||||
@property
|
||||
def content_type(self) -> str:
|
||||
"""Get the content type by reading file content."""
|
||||
return detect_content_type(self.read())
|
||||
|
||||
def read(self) -> bytes:
|
||||
"""Read the file content from disk."""
|
||||
if self._content is None:
|
||||
self._content = self.path.read_bytes()
|
||||
return self._content
|
||||
|
||||
async def aread(self) -> bytes:
|
||||
"""Async read the file content from disk."""
|
||||
if self._content is None:
|
||||
async with aiofiles.open(self.path, "rb") as f:
|
||||
self._content = await f.read()
|
||||
return self._content
|
||||
|
||||
def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
|
||||
"""Stream file content in chunks without loading entirely into memory.
|
||||
|
||||
Args:
|
||||
chunk_size: Size of each chunk in bytes.
|
||||
|
||||
Yields:
|
||||
Chunks of file content.
|
||||
"""
|
||||
with open(self.path, "rb") as f:
|
||||
while chunk := f.read(chunk_size):
|
||||
yield chunk
|
||||
|
||||
async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
|
||||
"""Async streaming for non-blocking I/O.
|
||||
|
||||
Args:
|
||||
chunk_size: Size of each chunk in bytes.
|
||||
|
||||
Yields:
|
||||
Chunks of file content.
|
||||
"""
|
||||
async with aiofiles.open(self.path, "rb") as f:
|
||||
while chunk := await f.read(chunk_size):
|
||||
yield chunk
|
||||
|
||||
|
||||
class FileBytes(BaseModel):
|
||||
"""File created from raw bytes content."""
|
||||
|
||||
data: bytes = Field(description="Raw bytes content of the file.")
|
||||
filename: str | None = Field(default=None, description="Optional filename.")
|
||||
|
||||
@property
|
||||
def content_type(self) -> str:
|
||||
"""Get the content type from the data."""
|
||||
return detect_content_type(self.data)
|
||||
|
||||
def read(self) -> bytes:
|
||||
"""Return the bytes content."""
|
||||
return self.data
|
||||
|
||||
async def aread(self) -> bytes:
|
||||
"""Async return the bytes content (immediate, already in memory)."""
|
||||
return self.data
|
||||
|
||||
def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
|
||||
"""Stream bytes content in chunks.
|
||||
|
||||
Args:
|
||||
chunk_size: Size of each chunk in bytes.
|
||||
|
||||
Yields:
|
||||
Chunks of bytes content.
|
||||
"""
|
||||
for i in range(0, len(self.data), chunk_size):
|
||||
yield self.data[i : i + chunk_size]
|
||||
|
||||
async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
|
||||
"""Async streaming (immediate yield since already in memory).
|
||||
|
||||
Args:
|
||||
chunk_size: Size of each chunk in bytes.
|
||||
|
||||
Yields:
|
||||
Chunks of bytes content.
|
||||
"""
|
||||
for chunk in self.read_chunks(chunk_size):
|
||||
yield chunk
|
||||
|
||||
|
||||
class FileStream(BaseModel):
|
||||
"""File loaded from a file-like stream."""
|
||||
|
||||
stream: ValidatedBinaryIO = Field(description="Binary file stream.")
|
||||
filename: str | None = Field(default=None, description="Optional filename.")
|
||||
_content: bytes | None = PrivateAttr(default=None)
|
||||
|
||||
def model_post_init(self, __context: object) -> None:
|
||||
"""Extract filename from stream if not provided."""
|
||||
if self.filename is None:
|
||||
name = getattr(self.stream, "name", None)
|
||||
if name is not None:
|
||||
object.__setattr__(self, "filename", Path(name).name)
|
||||
|
||||
@property
|
||||
def content_type(self) -> str:
|
||||
"""Get the content type from stream content."""
|
||||
return detect_content_type(self.read())
|
||||
|
||||
def read(self) -> bytes:
|
||||
"""Read the stream content. Content is cached after first read."""
|
||||
if self._content is None:
|
||||
position = self.stream.tell()
|
||||
self.stream.seek(0)
|
||||
self._content = self.stream.read()
|
||||
self.stream.seek(position)
|
||||
return self._content
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close the underlying stream."""
|
||||
self.stream.close()
|
||||
|
||||
def __enter__(self) -> FileStream:
|
||||
"""Enter context manager."""
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: Any,
|
||||
) -> None:
|
||||
"""Exit context manager and close stream."""
|
||||
self.close()
|
||||
|
||||
def read_chunks(self, chunk_size: int = 65536) -> Iterator[bytes]:
|
||||
"""Stream from underlying stream in chunks.
|
||||
|
||||
Args:
|
||||
chunk_size: Size of each chunk in bytes.
|
||||
|
||||
Yields:
|
||||
Chunks of stream content.
|
||||
"""
|
||||
position = self.stream.tell()
|
||||
self.stream.seek(0)
|
||||
try:
|
||||
while chunk := self.stream.read(chunk_size):
|
||||
yield chunk
|
||||
finally:
|
||||
self.stream.seek(position)
|
||||
|
||||
|
||||
class AsyncFileStream(BaseModel):
|
||||
"""File loaded from an async stream.
|
||||
|
||||
Use for async file handles like aiofiles objects or aiohttp response bodies.
|
||||
This is an async-only type - use aread() instead of read().
|
||||
|
||||
Attributes:
|
||||
stream: Async file-like object with async read() method.
|
||||
filename: Optional filename for the stream.
|
||||
"""
|
||||
|
||||
stream: ValidatedAsyncReadable = Field(
|
||||
description="Async file stream with async read() method."
|
||||
)
|
||||
filename: str | None = Field(default=None, description="Optional filename.")
|
||||
_content: bytes | None = PrivateAttr(default=None)
|
||||
|
||||
@property
|
||||
def content_type(self) -> str:
|
||||
"""Get the content type from stream content. Requires aread() first."""
|
||||
if self._content is None:
|
||||
raise RuntimeError("Call aread() first to load content")
|
||||
return detect_content_type(self._content)
|
||||
|
||||
async def aread(self) -> bytes:
|
||||
"""Async read the stream content. Content is cached after first read."""
|
||||
if self._content is None:
|
||||
self._content = await self.stream.read()
|
||||
return self._content
|
||||
|
||||
async def aclose(self) -> None:
|
||||
"""Async close the underlying stream."""
|
||||
if hasattr(self.stream, "close"):
|
||||
result = self.stream.close()
|
||||
if hasattr(result, "__await__"):
|
||||
await result
|
||||
|
||||
async def __aenter__(self) -> AsyncFileStream:
|
||||
"""Async enter context manager."""
|
||||
return self
|
||||
|
||||
async def __aexit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: Any,
|
||||
) -> None:
|
||||
"""Async exit context manager and close stream."""
|
||||
await self.aclose()
|
||||
|
||||
async def aread_chunks(self, chunk_size: int = 65536) -> AsyncIterator[bytes]:
|
||||
"""Async stream content in chunks.
|
||||
|
||||
Args:
|
||||
chunk_size: Size of each chunk in bytes.
|
||||
|
||||
Yields:
|
||||
Chunks of stream content.
|
||||
"""
|
||||
while chunk := await self.stream.read(chunk_size):
|
||||
yield chunk
|
||||
|
||||
|
||||
FileSource = FilePath | FileBytes | FileStream | AsyncFileStream
|
||||
|
||||
|
||||
def _normalize_source(value: Any) -> FileSource:
|
||||
"""Convert raw input to appropriate source type."""
|
||||
if isinstance(value, (FilePath, FileBytes, FileStream, AsyncFileStream)):
|
||||
return value
|
||||
if isinstance(value, Path):
|
||||
return FilePath(path=value)
|
||||
if isinstance(value, str):
|
||||
return FilePath(path=Path(value))
|
||||
if isinstance(value, bytes):
|
||||
return FileBytes(data=value)
|
||||
if isinstance(value, AsyncReadable):
|
||||
return AsyncFileStream(stream=value)
|
||||
if hasattr(value, "read") and hasattr(value, "seek"):
|
||||
return FileStream(stream=value)
|
||||
raise ValueError(f"Cannot convert {type(value).__name__} to file source")
|
||||
|
||||
|
||||
RawFileInput = str | Path | bytes
|
||||
FileSourceInput = Annotated[
|
||||
RawFileInput | FileSource, BeforeValidator(_normalize_source)
|
||||
]
|
||||
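A sketch (not part of this diff) of how FileSourceInput normalization behaves when used as a Pydantic field; the model name below is hypothetical and the bytes are an illustrative payload, so no file needs to exist.

from pydantic import BaseModel

from crewai.files import FileBytes, FileSourceInput


class Attachment(BaseModel):
    source: FileSourceInput


# Raw bytes are coerced to FileBytes by the BeforeValidator shown above.
att = Attachment(source=b"%PDF-1.7 ...")
assert isinstance(att.source, FileBytes)

# A str or Path would instead become FilePath, which validates existence and size.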
lib/crewai/src/crewai/files/metrics.py (new file, 184 lines)
@@ -0,0 +1,184 @@
|
||||
"""Performance metrics and structured logging for file operations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Generator
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileOperationMetrics:
|
||||
"""Metrics for a file operation.
|
||||
|
||||
Attributes:
|
||||
operation: Name of the operation (e.g., "upload", "resolve", "process").
|
||||
filename: Name of the file being operated on.
|
||||
provider: Provider name if applicable.
|
||||
duration_ms: Duration of the operation in milliseconds.
|
||||
size_bytes: Size of the file in bytes.
|
||||
success: Whether the operation succeeded.
|
||||
error: Error message if operation failed.
|
||||
timestamp: When the operation occurred.
|
||||
metadata: Additional operation-specific metadata.
|
||||
"""
|
||||
|
||||
operation: str
|
||||
filename: str | None = None
|
||||
provider: str | None = None
|
||||
duration_ms: float = 0.0
|
||||
size_bytes: int | None = None
|
||||
success: bool = True
|
||||
error: str | None = None
|
||||
timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Convert metrics to dictionary for logging.
|
||||
|
||||
Returns:
|
||||
Dictionary representation of metrics.
|
||||
"""
|
||||
result: dict[str, Any] = {
|
||||
"operation": self.operation,
|
||||
"duration_ms": round(self.duration_ms, 2),
|
||||
"success": self.success,
|
||||
"timestamp": self.timestamp.isoformat(),
|
||||
}
|
||||
|
||||
if self.filename:
|
||||
result["filename"] = self.filename
|
||||
if self.provider:
|
||||
result["provider"] = self.provider
|
||||
if self.size_bytes is not None:
|
||||
result["size_bytes"] = self.size_bytes
|
||||
if self.error:
|
||||
result["error"] = self.error
|
||||
if self.metadata:
|
||||
result.update(self.metadata)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@contextmanager
|
||||
def measure_operation(
|
||||
operation: str,
|
||||
*,
|
||||
filename: str | None = None,
|
||||
provider: str | None = None,
|
||||
size_bytes: int | None = None,
|
||||
log_level: int = logging.DEBUG,
|
||||
**extra_metadata: Any,
|
||||
) -> Generator[FileOperationMetrics, None, None]:
|
||||
"""Context manager to measure and log operation performance.
|
||||
|
||||
Args:
|
||||
operation: Name of the operation.
|
||||
filename: Optional filename being operated on.
|
||||
provider: Optional provider name.
|
||||
size_bytes: Optional file size in bytes.
|
||||
log_level: Log level for the result message.
|
||||
**extra_metadata: Additional metadata to include.
|
||||
|
||||
Yields:
|
||||
FileOperationMetrics object that will be populated with results.
|
||||
|
||||
Example:
|
||||
with measure_operation("upload", filename="test.pdf", provider="openai") as metrics:
|
||||
result = upload_file(file)
|
||||
metrics.metadata["file_id"] = result.file_id
|
||||
"""
|
||||
metrics = FileOperationMetrics(
|
||||
operation=operation,
|
||||
filename=filename,
|
||||
provider=provider,
|
||||
size_bytes=size_bytes,
|
||||
metadata=dict(extra_metadata),
|
||||
)
|
||||
|
||||
start_time = time.perf_counter()
|
||||
|
||||
try:
|
||||
yield metrics
|
||||
metrics.success = True
|
||||
except Exception as e:
|
||||
metrics.success = False
|
||||
metrics.error = str(e)
|
||||
raise
|
||||
finally:
|
||||
metrics.duration_ms = (time.perf_counter() - start_time) * 1000
|
||||
|
||||
log_message = f"{operation}"
|
||||
if filename:
|
||||
log_message += f" [{filename}]"
|
||||
if provider:
|
||||
log_message += f" ({provider})"
|
||||
|
||||
if metrics.success:
|
||||
log_message += f" completed in {metrics.duration_ms:.2f}ms"
|
||||
else:
|
||||
log_message += f" failed after {metrics.duration_ms:.2f}ms: {metrics.error}"
|
||||
|
||||
logger.log(log_level, log_message, extra=metrics.to_dict())
|
||||
|
||||
|
||||
def log_file_operation(
|
||||
operation: str,
|
||||
*,
|
||||
filename: str | None = None,
|
||||
provider: str | None = None,
|
||||
size_bytes: int | None = None,
|
||||
duration_ms: float | None = None,
|
||||
success: bool = True,
|
||||
error: str | None = None,
|
||||
level: int = logging.INFO,
|
||||
**extra: Any,
|
||||
) -> None:
|
||||
"""Log a file operation with structured data.
|
||||
|
||||
Args:
|
||||
operation: Name of the operation.
|
||||
filename: Optional filename being operated on.
|
||||
provider: Optional provider name.
|
||||
size_bytes: Optional file size in bytes.
|
||||
duration_ms: Optional duration in milliseconds.
|
||||
success: Whether the operation succeeded.
|
||||
error: Optional error message.
|
||||
level: Log level to use.
|
||||
**extra: Additional metadata to include.
|
||||
"""
|
||||
metrics = FileOperationMetrics(
|
||||
operation=operation,
|
||||
filename=filename,
|
||||
provider=provider,
|
||||
size_bytes=size_bytes,
|
||||
duration_ms=duration_ms or 0.0,
|
||||
success=success,
|
||||
error=error,
|
||||
metadata=dict(extra),
|
||||
)
|
||||
|
||||
message = f"{operation}"
|
||||
if filename:
|
||||
message += f" [{filename}]"
|
||||
if provider:
|
||||
message += f" ({provider})"
|
||||
|
||||
if success:
|
||||
if duration_ms:
|
||||
message += f" completed in {duration_ms:.2f}ms"
|
||||
else:
|
||||
message += " completed"
|
||||
else:
|
||||
message += " failed"
|
||||
if error:
|
||||
message += f": {error}"
|
||||
|
||||
logger.log(level, message, extra=metrics.to_dict())
|
||||
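For completeness, a small sketch (not part of this diff) of the structured-logging helper defined above; the values are illustrative only.

import logging

from crewai.files.metrics import log_file_operation

logging.basicConfig(level=logging.INFO)

# Emits one log record with the structured fields attached via `extra`.
log_file_operation(
    "upload",
    filename="report.pdf",
    provider="openai",
    size_bytes=1_234_567,
    duration_ms=842.5,
    success=True,
)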
@@ -4,7 +4,7 @@ This module provides validation, transformation, and processing utilities
for files used in multimodal LLM interactions.
"""

from crewai.utilities.files.processing.constraints import (
from crewai.files.processing.constraints import (
    ANTHROPIC_CONSTRAINTS,
    BEDROCK_CONSTRAINTS,
    GEMINI_CONSTRAINTS,
@@ -16,16 +16,16 @@ from crewai.utilities.files.processing.constraints import (
    VideoConstraints,
    get_constraints_for_provider,
)
from crewai.utilities.files.processing.enums import FileHandling
from crewai.utilities.files.processing.exceptions import (
from crewai.files.processing.enums import FileHandling
from crewai.files.processing.exceptions import (
    FileProcessingError,
    FileTooLargeError,
    FileValidationError,
    ProcessingDependencyError,
    UnsupportedFileTypeError,
)
from crewai.utilities.files.processing.processor import FileProcessor
from crewai.utilities.files.processing.validators import (
from crewai.files.processing.processor import FileProcessor
from crewai.files.processing.validators import (
    validate_audio,
    validate_file,
    validate_image,
@@ -81,3 +81,23 @@ class ProcessingDependencyError(FileProcessingError):
        self.dependency = dependency
        self.install_command = install_command
        super().__init__(message)


class TransientFileError(FileProcessingError):
    """Transient error that may succeed on retry (network, timeout)."""


class PermanentFileError(FileProcessingError):
    """Permanent error that will not succeed on retry (auth, format)."""


class UploadError(FileProcessingError):
    """Base exception for upload errors."""


class TransientUploadError(UploadError, TransientFileError):
    """Upload failed but may succeed on retry (network issues, rate limits)."""


class PermanentUploadError(UploadError, PermanentFileError):
    """Upload failed permanently (auth failure, invalid file, unsupported type)."""
@@ -1,9 +1,10 @@
"""FileProcessor for validating and transforming files based on provider constraints."""

import asyncio
from collections.abc import Sequence
import logging

from crewai.utilities.files.content_types import (
from crewai.files.content_types import (
    AudioFile,
    File,
    ImageFile,
@@ -11,18 +12,18 @@ from crewai.utilities.files.content_types import (
    TextFile,
    VideoFile,
)
from crewai.utilities.files.processing.constraints import (
from crewai.files.processing.constraints import (
    ProviderConstraints,
    get_constraints_for_provider,
)
from crewai.utilities.files.processing.enums import FileHandling
from crewai.utilities.files.processing.exceptions import (
from crewai.files.processing.enums import FileHandling
from crewai.files.processing.exceptions import (
    FileProcessingError,
    FileTooLargeError,
    FileValidationError,
    UnsupportedFileTypeError,
)
from crewai.utilities.files.processing.transformers import (
from crewai.files.processing.transformers import (
    chunk_pdf,
    chunk_text,
    get_image_dimensions,
@@ -30,7 +31,7 @@ from crewai.utilities.files.processing.transformers import (
    optimize_image,
    resize_image,
)
from crewai.utilities.files.processing.validators import validate_file
from crewai.files.processing.validators import validate_file


logger = logging.getLogger(__name__)
@@ -183,6 +184,52 @@ class FileProcessor:

        return result

    async def aprocess_files(
        self,
        files: dict[str, FileInput],
        max_concurrency: int = 10,
    ) -> dict[str, FileInput]:
        """Async process multiple files in parallel.

        Args:
            files: Dictionary mapping names to file inputs.
            max_concurrency: Maximum number of concurrent processing tasks.

        Returns:
            Dictionary mapping names to processed files. If a file is chunked,
            multiple entries are created with indexed names.
        """
        semaphore = asyncio.Semaphore(max_concurrency)

        async def process_one(
            name: str, file: FileInput
        ) -> tuple[str, FileInput | Sequence[FileInput]]:
            async with semaphore:
                loop = asyncio.get_running_loop()
                processed = await loop.run_in_executor(None, self.process, file)
                return name, processed

        tasks = [process_one(n, f) for n, f in files.items()]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        output: dict[str, FileInput] = {}
        for result in results:
            if isinstance(result, BaseException):
                logger.error(f"Processing failed: {result}")
                continue
            name, processed = result
            if isinstance(processed, Sequence) and not isinstance(
                processed, (str, bytes)
            ):
                for i, chunk in enumerate(processed):
                    output[f"{name}_chunk_{i}"] = chunk
            elif isinstance(
                processed, (AudioFile, File, ImageFile, PDFFile, TextFile, VideoFile)
            ):
                output[name] = processed

        return output

    def _auto_process(self, file: FileInput) -> FileInput:
        """Automatically resize/compress file to meet constraints.

@@ -272,7 +319,7 @@ class FileProcessor:
            page_count = get_pdf_page_count(file)
            if page_count is not None and page_count > max_pages:
                try:
                    return chunk_pdf(file, max_pages)
                    return list(chunk_pdf(file, max_pages))
                except Exception as e:
                    logger.warning(f"Failed to chunk PDF: {e}")
        return file
@@ -284,7 +331,7 @@ class FileProcessor:
            content = file.read()
            if len(content) > max_size:
                try:
                    return chunk_text(file, max_size)
                    return list(chunk_text(file, max_size))
                except Exception as e:
                    logger.warning(f"Failed to chunk text file: {e}")
        return file
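A sketch (not part of this diff) of calling the new async entry point; chunked results come back under indexed names, as implemented above. The input path is a placeholder and must exist, and processing may require the optional file-processing dependencies.

import asyncio

from crewai.files import FileProcessor, normalize_input_files


async def main() -> None:
    processor = FileProcessor(constraints="anthropic")
    files = normalize_input_files(["big_document.pdf"])  # placeholder path
    processed = await processor.aprocess_files(files, max_concurrency=4)
    # A PDF over the page limit shows up as big_document_chunk_0, big_document_chunk_1, ...
    print(sorted(processed))


asyncio.run(main())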
@@ -1,12 +1,12 @@
"""File transformation functions for resizing, optimizing, and chunking."""

from collections.abc import Sequence
from collections.abc import Iterator
import io
import logging

from crewai.utilities.files.content_types import ImageFile, PDFFile, TextFile
from crewai.utilities.files.file import FileBytes
from crewai.utilities.files.processing.exceptions import ProcessingDependencyError
from crewai.files.content_types import ImageFile, PDFFile, TextFile
from crewai.files.file import FileBytes
from crewai.files.processing.exceptions import ProcessingDependencyError


logger = logging.getLogger(__name__)
@@ -161,22 +161,24 @@ def chunk_pdf(
    max_pages: int,
    *,
    overlap_pages: int = 0,
) -> Sequence[PDFFile]:
) -> Iterator[PDFFile]:
    """Split a PDF into chunks of maximum page count.

    Yields chunks one at a time to minimize memory usage.

    Args:
        file: The PDF file to chunk.
        max_pages: Maximum pages per chunk.
        overlap_pages: Number of overlapping pages between chunks (for context).

    Returns:
        List of PDFFile objects, one per chunk.
    Yields:
        PDFFile objects, one per chunk.

    Raises:
        ProcessingDependencyError: If pypdf is not installed.
    """
    try:
        from pypdf import PdfReader, PdfWriter  # type: ignore[import-not-found]
        from pypdf import PdfReader, PdfWriter
    except ImportError as e:
        raise ProcessingDependencyError(
            "pypdf is required for PDF chunking",
@@ -189,9 +191,9 @@
    total_pages = len(reader.pages)

    if total_pages <= max_pages:
        return [file]
        yield file
        return

    chunks: list[PDFFile] = []
    filename = file.filename or "document.pdf"
    base_filename = filename.rsplit(".", 1)[0]
    step = max_pages - overlap_pages
@@ -211,19 +213,16 @@ def chunk_pdf(
        output_bytes = output_buffer.getvalue()

        chunk_filename = f"{base_filename}_chunk_{chunk_num}.pdf"
        chunks.append(
            PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename))
        )

        logger.info(
            f"Created PDF chunk '{chunk_filename}' with pages {start_page + 1}-{end_page}"
        )

        yield PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename))

        start_page += step
        chunk_num += 1

    return chunks


def chunk_text(
    file: TextFile,
@@ -231,26 +230,28 @@
    *,
    overlap_chars: int = 200,
    split_on_newlines: bool = True,
) -> Sequence[TextFile]:
) -> Iterator[TextFile]:
    """Split a text file into chunks of maximum character count.

    Yields chunks one at a time to minimize memory usage.

    Args:
        file: The text file to chunk.
        max_chars: Maximum characters per chunk.
        overlap_chars: Number of overlapping characters between chunks.
        split_on_newlines: If True, prefer splitting at newline boundaries.

    Returns:
        List of TextFile objects, one per chunk.
    Yields:
        TextFile objects, one per chunk.
    """
    content = file.read()
    text = content.decode("utf-8", errors="replace")
    total_chars = len(text)

    if total_chars <= max_chars:
        return [file]
        yield file
        return

    chunks: list[TextFile] = []
    filename = file.filename or "text.txt"
    base_filename = filename.rsplit(".", 1)[0]
    extension = filename.rsplit(".", 1)[-1] if "." in filename else "txt"
@@ -261,29 +262,27 @@ def chunk_text(
    while start_pos < total_chars:
        end_pos = min(start_pos + max_chars, total_chars)

        # If not at end, try to find a better split point
        if end_pos < total_chars and split_on_newlines:
            # Look for last newline within the chunk
            last_newline = text.rfind("\n", start_pos, end_pos)
            if last_newline > start_pos + max_chars // 2:  # Don't split too early
            if last_newline > start_pos + max_chars // 2:
                end_pos = last_newline + 1

        chunk_text = text[start_pos:end_pos]
        chunk_bytes = chunk_text.encode("utf-8")
        chunk_content = text[start_pos:end_pos]
        chunk_bytes = chunk_content.encode("utf-8")

        chunk_filename = f"{base_filename}_chunk_{chunk_num}.{extension}"
        chunks.append(
            TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename))
        )

        logger.info(
            f"Created text chunk '{chunk_filename}' with {len(chunk_text)} characters"
            f"Created text chunk '{chunk_filename}' with {len(chunk_content)} characters"
        )

        start_pos = end_pos - overlap_chars if end_pos < total_chars else total_chars
        chunk_num += 1
        yield TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename))

    return chunks
        if end_pos < total_chars:
            start_pos = max(start_pos + 1, end_pos - overlap_chars)
        else:
            start_pos = total_chars
        chunk_num += 1


def get_image_dimensions(file: ImageFile) -> tuple[int, int] | None:
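Since chunk_pdf and chunk_text now return iterators, callers that want every chunk at once must materialize them, which is exactly what the FileProcessor change above does. A brief sketch (not part of this diff), using an in-memory text file so nothing needs to exist on disk:

from crewai.files import FileBytes, TextFile
from crewai.files.processing.transformers import chunk_text

text_file = TextFile(source=FileBytes(data=b"line\n" * 10_000, filename="notes.txt"))

# Chunks can be consumed lazily, one at a time...
first = next(chunk_text(text_file, 4_000))

# ...or materialized, as FileProcessor now does internally.
chunks = list(chunk_text(text_file, 4_000))
print(len(chunks))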
@@ -3,7 +3,7 @@
from collections.abc import Sequence
import logging

from crewai.utilities.files.content_types import (
from crewai.files.content_types import (
    AudioFile,
    File,
    ImageFile,
@@ -11,14 +11,14 @@ from crewai.utilities.files.content_types import (
    TextFile,
    VideoFile,
)
from crewai.utilities.files.processing.constraints import (
from crewai.files.processing.constraints import (
    AudioConstraints,
    ImageConstraints,
    PDFConstraints,
    ProviderConstraints,
    VideoConstraints,
)
from crewai.utilities.files.processing.exceptions import (
from crewai.files.processing.exceptions import (
    FileTooLargeError,
    FileValidationError,
    UnsupportedFileTypeError,
@@ -172,7 +172,7 @@ def validate_pdf(
    try:
        import io

        from pypdf import PdfReader  # type: ignore[import-not-found]
        from pypdf import PdfReader

        reader = PdfReader(io.BytesIO(content))
        page_count = len(reader.pages)
lib/crewai/src/crewai/files/resolver.py (new file, 577 lines)
@@ -0,0 +1,577 @@
|
||||
"""FileResolver for deciding file delivery method and managing uploads."""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
from dataclasses import dataclass, field
|
||||
import hashlib
|
||||
import logging
|
||||
|
||||
from crewai.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.files.metrics import measure_operation
|
||||
from crewai.files.processing.constraints import (
|
||||
ProviderConstraints,
|
||||
get_constraints_for_provider,
|
||||
)
|
||||
from crewai.files.resolved import (
|
||||
FileReference,
|
||||
InlineBase64,
|
||||
InlineBytes,
|
||||
ResolvedFile,
|
||||
)
|
||||
from crewai.files.upload_cache import CachedUpload, UploadCache
|
||||
from crewai.files.uploaders import UploadResult, get_uploader
|
||||
from crewai.files.uploaders.base import FileUploader
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
UPLOAD_MAX_RETRIES = 3
|
||||
UPLOAD_RETRY_DELAY_BASE = 2
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileContext:
|
||||
"""Cached file metadata to avoid redundant reads.
|
||||
|
||||
Attributes:
|
||||
content: Raw file bytes.
|
||||
size: Size of the file in bytes.
|
||||
content_hash: SHA-256 hash of the file content.
|
||||
content_type: MIME type of the file.
|
||||
"""
|
||||
|
||||
content: bytes
|
||||
size: int
|
||||
content_hash: str
|
||||
content_type: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileResolverConfig:
|
||||
"""Configuration for FileResolver.
|
||||
|
||||
Attributes:
|
||||
prefer_upload: If True, prefer uploading over inline for supported providers.
|
||||
upload_threshold_bytes: Size threshold above which to use upload.
|
||||
If None, uses provider-specific threshold.
|
||||
use_bytes_for_bedrock: If True, use raw bytes instead of base64 for Bedrock.
|
||||
"""
|
||||
|
||||
prefer_upload: bool = False
|
||||
upload_threshold_bytes: int | None = None
|
||||
use_bytes_for_bedrock: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileResolver:
|
||||
"""Resolves files to their delivery format based on provider capabilities.
|
||||
|
||||
Decides whether to use inline base64, raw bytes, or file upload based on:
|
||||
- Provider constraints and capabilities
|
||||
- File size
|
||||
- Configuration preferences
|
||||
|
||||
Caches uploaded files to avoid redundant uploads.
|
||||
|
||||
Attributes:
|
||||
config: Resolver configuration.
|
||||
upload_cache: Cache for tracking uploaded files.
|
||||
"""
|
||||
|
||||
config: FileResolverConfig = field(default_factory=FileResolverConfig)
|
||||
upload_cache: UploadCache | None = None
|
||||
_uploaders: dict[str, FileUploader] = field(default_factory=dict)
|
||||
|
||||
def _build_file_context(self, file: FileInput) -> FileContext:
|
||||
"""Build context by reading file once.
|
||||
|
||||
Args:
|
||||
file: The file to build context for.
|
||||
|
||||
Returns:
|
||||
FileContext with cached metadata.
|
||||
"""
|
||||
content = file.read()
|
||||
return FileContext(
|
||||
content=content,
|
||||
size=len(content),
|
||||
content_hash=hashlib.sha256(content).hexdigest(),
|
||||
content_type=file.content_type,
|
||||
)
|
||||
|
||||
def resolve(self, file: FileInput, provider: str) -> ResolvedFile:
|
||||
"""Resolve a file to its delivery format for a provider.
|
||||
|
||||
Args:
|
||||
file: The file to resolve.
|
||||
provider: Provider name (e.g., "gemini", "anthropic", "openai").
|
||||
|
||||
Returns:
|
||||
ResolvedFile representing the appropriate delivery format.
|
||||
"""
|
||||
provider_lower = provider.lower()
|
||||
constraints = get_constraints_for_provider(provider)
|
||||
context = self._build_file_context(file)
|
||||
|
||||
should_upload = self._should_upload(
|
||||
file, provider_lower, constraints, context.size
|
||||
)
|
||||
|
||||
if should_upload:
|
||||
resolved = self._resolve_via_upload(file, provider_lower, context)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return self._resolve_inline(file, provider_lower, context)
|
||||
|
||||
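A minimal usage sketch for the sync path above, assuming a file built from raw bytes (the FileBytes import location is assumed from the FilePath import shown elsewhere in this diff, and the bytes/filename are placeholders):

# Sketch only: resolve one file for a provider and branch on the result type.
from crewai.files import ImageFile
from crewai.files.file import FileBytes  # location assumed from this diff
from crewai.files.resolved import FileReference
from crewai.files.resolver import FileResolver, FileResolverConfig

resolver = FileResolver(config=FileResolverConfig(prefer_upload=False))
image = ImageFile(source=FileBytes(data=b"\x89PNG...", filename="diagram.png"))

resolved = resolver.resolve(image, provider="anthropic")
if isinstance(resolved, FileReference):
    print("uploaded, file_id:", resolved.file_id)
else:
    print("inlined as", resolved.content_type)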
def resolve_files(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
provider: str,
|
||||
) -> dict[str, ResolvedFile]:
|
||||
"""Resolve multiple files for a provider.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping names to file inputs.
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping names to resolved files.
|
||||
"""
|
||||
return {name: self.resolve(file, provider) for name, file in files.items()}
|
||||
|
||||
def _should_upload(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
constraints: ProviderConstraints | None,
|
||||
file_size: int,
|
||||
) -> bool:
|
||||
"""Determine if a file should be uploaded rather than inlined.
|
||||
|
||||
Args:
|
||||
file: The file to check.
|
||||
provider: Provider name.
|
||||
constraints: Provider constraints.
|
||||
file_size: Size of the file in bytes.
|
||||
|
||||
Returns:
|
||||
True if the file should be uploaded, False otherwise.
|
||||
"""
|
||||
if constraints is None or not constraints.supports_file_upload:
|
||||
return False
|
||||
|
||||
if self.config.prefer_upload:
|
||||
return True
|
||||
|
||||
threshold = self.config.upload_threshold_bytes
|
||||
if threshold is None and constraints is not None:
|
||||
threshold = constraints.file_upload_threshold_bytes
|
||||
|
||||
if threshold is not None and file_size > threshold:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
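Concretely, with prefer_upload left at its default the decision above reduces to a size check against the provider threshold; a rough sketch of the equivalent logic with assumed numbers (not the real constraint values):

# Illustrative decision with assumed sizes.
prefer_upload = False
threshold_bytes = 20 * 1024 * 1024   # assume a provider threshold of 20 MB
file_size = 35 * 1024 * 1024         # a 35 MB video

should_upload = prefer_upload or (threshold_bytes is not None and file_size > threshold_bytes)
print(should_upload)  # True: the large file goes through the upload path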
def _resolve_via_upload(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
context: FileContext,
|
||||
) -> ResolvedFile | None:
|
||||
"""Resolve a file by uploading it.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
provider: Provider name.
|
||||
context: Pre-computed file context.
|
||||
|
||||
Returns:
|
||||
FileReference if upload succeeds, None otherwise.
|
||||
"""
|
||||
if self.upload_cache is not None:
|
||||
cached = self.upload_cache.get_by_hash(context.content_hash, provider)
|
||||
if cached is not None:
|
||||
logger.debug(
|
||||
f"Using cached upload for {file.filename}: {cached.file_id}"
|
||||
)
|
||||
return FileReference(
|
||||
content_type=cached.content_type,
|
||||
file_id=cached.file_id,
|
||||
provider=cached.provider,
|
||||
expires_at=cached.expires_at,
|
||||
file_uri=cached.file_uri,
|
||||
)
|
||||
|
||||
uploader = self._get_uploader(provider)
|
||||
if uploader is None:
|
||||
logger.debug(f"No uploader available for {provider}")
|
||||
return None
|
||||
|
||||
result = self._upload_with_retry(uploader, file, provider, context.size)
|
||||
if result is None:
|
||||
return None
|
||||
|
||||
if self.upload_cache is not None:
|
||||
self.upload_cache.set_by_hash(
|
||||
file_hash=context.content_hash,
|
||||
content_type=context.content_type,
|
||||
provider=provider,
|
||||
file_id=result.file_id,
|
||||
file_uri=result.file_uri,
|
||||
expires_at=result.expires_at,
|
||||
)
|
||||
|
||||
return FileReference(
|
||||
content_type=result.content_type,
|
||||
file_id=result.file_id,
|
||||
provider=result.provider,
|
||||
expires_at=result.expires_at,
|
||||
file_uri=result.file_uri,
|
||||
)
|
||||
|
||||
def _upload_with_retry(
|
||||
self,
|
||||
uploader: FileUploader,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
file_size: int,
|
||||
) -> UploadResult | None:
|
||||
"""Upload with exponential backoff retry.
|
||||
|
||||
Args:
|
||||
uploader: The uploader to use.
|
||||
file: The file to upload.
|
||||
provider: Provider name for logging.
|
||||
file_size: Size of the file in bytes.
|
||||
|
||||
Returns:
|
||||
UploadResult if successful, None otherwise.
|
||||
"""
|
||||
import time
|
||||
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
last_error: Exception | None = None
|
||||
|
||||
for attempt in range(UPLOAD_MAX_RETRIES):
|
||||
with measure_operation(
|
||||
"upload",
|
||||
filename=file.filename,
|
||||
provider=provider,
|
||||
size_bytes=file_size,
|
||||
attempt=attempt + 1,
|
||||
) as metrics:
|
||||
try:
|
||||
result = uploader.upload(file)
|
||||
metrics.metadata["file_id"] = result.file_id
|
||||
return result
|
||||
except PermanentUploadError as e:
|
||||
metrics.metadata["error_type"] = "permanent"
|
||||
logger.warning(
|
||||
f"Non-retryable upload error for {file.filename}: {e}"
|
||||
)
|
||||
return None
|
||||
except TransientUploadError as e:
|
||||
metrics.metadata["error_type"] = "transient"
|
||||
last_error = e
|
||||
except Exception as e:
|
||||
metrics.metadata["error_type"] = "unknown"
|
||||
last_error = e
|
||||
|
||||
if attempt < UPLOAD_MAX_RETRIES - 1:
|
||||
delay = UPLOAD_RETRY_DELAY_BASE**attempt
|
||||
logger.debug(
|
||||
f"Retrying upload for {file.filename} in {delay}s (attempt {attempt + 1})"
|
||||
)
|
||||
time.sleep(delay)
|
||||
|
||||
logger.warning(
|
||||
f"Upload failed for {file.filename} to {provider} after {UPLOAD_MAX_RETRIES} attempts: {last_error}"
|
||||
)
|
||||
return None
|
||||
|
||||
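With UPLOAD_MAX_RETRIES = 3 and UPLOAD_RETRY_DELAY_BASE = 2, the retry loop above sleeps between attempts as follows (a sketch of the arithmetic only):

UPLOAD_MAX_RETRIES = 3
UPLOAD_RETRY_DELAY_BASE = 2

# A sleep happens after every failed attempt except the last one.
delays = [UPLOAD_RETRY_DELAY_BASE**attempt for attempt in range(UPLOAD_MAX_RETRIES - 1)]
print(delays)  # [1, 2] -> wait 1s after attempt 1, 2s after attempt 2, attempt 3 is final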
def _resolve_inline(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
context: FileContext,
|
||||
) -> ResolvedFile:
|
||||
"""Resolve a file as inline content.
|
||||
|
||||
Args:
|
||||
file: The file to resolve.
|
||||
provider: Provider name.
|
||||
context: Pre-computed file context.
|
||||
|
||||
Returns:
|
||||
InlineBase64 or InlineBytes depending on provider.
|
||||
"""
|
||||
if self.config.use_bytes_for_bedrock and "bedrock" in provider:
|
||||
return InlineBytes(
|
||||
content_type=context.content_type,
|
||||
data=context.content,
|
||||
)
|
||||
|
||||
encoded = base64.b64encode(context.content).decode("ascii")
|
||||
return InlineBase64(
|
||||
content_type=context.content_type,
|
||||
data=encoded,
|
||||
)
|
||||
|
||||
async def aresolve(self, file: FileInput, provider: str) -> ResolvedFile:
|
||||
"""Async resolve a file to its delivery format for a provider.
|
||||
|
||||
Args:
|
||||
file: The file to resolve.
|
||||
provider: Provider name (e.g., "gemini", "anthropic", "openai").
|
||||
|
||||
Returns:
|
||||
ResolvedFile representing the appropriate delivery format.
|
||||
"""
|
||||
provider_lower = provider.lower()
|
||||
constraints = get_constraints_for_provider(provider)
|
||||
context = self._build_file_context(file)
|
||||
|
||||
should_upload = self._should_upload(
|
||||
file, provider_lower, constraints, context.size
|
||||
)
|
||||
|
||||
if should_upload:
|
||||
resolved = await self._aresolve_via_upload(file, provider_lower, context)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return self._resolve_inline(file, provider_lower, context)
|
||||
|
||||
async def aresolve_files(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
provider: str,
|
||||
max_concurrency: int = 10,
|
||||
) -> dict[str, ResolvedFile]:
|
||||
"""Async resolve multiple files in parallel.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping names to file inputs.
|
||||
provider: Provider name.
|
||||
max_concurrency: Maximum number of concurrent resolutions.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping names to resolved files.
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(max_concurrency)
|
||||
|
||||
async def resolve_one(name: str, file: FileInput) -> tuple[str, ResolvedFile]:
|
||||
async with semaphore:
|
||||
resolved = await self.aresolve(file, provider)
|
||||
return name, resolved
|
||||
|
||||
tasks = [resolve_one(n, f) for n, f in files.items()]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
output: dict[str, ResolvedFile] = {}
|
||||
for result in results:
|
||||
if isinstance(result, BaseException):
|
||||
logger.error(f"Resolution failed: {result}")
|
||||
continue
|
||||
name, resolved = result
|
||||
output[name] = resolved
|
||||
|
||||
return output
|
||||
|
||||
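A minimal async sketch for resolving several files concurrently, assuming placeholder bytes and that FileBytes is importable as noted earlier:

# Sketch only: bounded-concurrency resolution of a small batch of files.
import asyncio

from crewai.files import ImageFile, PDFFile
from crewai.files.file import FileBytes  # location assumed from this diff
from crewai.files.resolver import FileResolver


async def main() -> None:
    resolver = FileResolver()
    files = {
        "diagram": ImageFile(source=FileBytes(data=b"...", filename="diagram.png")),
        "report": PDFFile(source=FileBytes(data=b"...", filename="report.pdf")),
    }
    resolved = await resolver.aresolve_files(files, provider="gemini", max_concurrency=5)
    for name, item in resolved.items():
        print(name, type(item).__name__)


asyncio.run(main())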
async def _aresolve_via_upload(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
context: FileContext,
|
||||
) -> ResolvedFile | None:
|
||||
"""Async resolve a file by uploading it.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
provider: Provider name.
|
||||
context: Pre-computed file context.
|
||||
|
||||
Returns:
|
||||
FileReference if upload succeeds, None otherwise.
|
||||
"""
|
||||
if self.upload_cache is not None:
|
||||
cached = await self.upload_cache.aget_by_hash(
|
||||
context.content_hash, provider
|
||||
)
|
||||
if cached is not None:
|
||||
logger.debug(
|
||||
f"Using cached upload for {file.filename}: {cached.file_id}"
|
||||
)
|
||||
return FileReference(
|
||||
content_type=cached.content_type,
|
||||
file_id=cached.file_id,
|
||||
provider=cached.provider,
|
||||
expires_at=cached.expires_at,
|
||||
file_uri=cached.file_uri,
|
||||
)
|
||||
|
||||
uploader = self._get_uploader(provider)
|
||||
if uploader is None:
|
||||
logger.debug(f"No uploader available for {provider}")
|
||||
return None
|
||||
|
||||
result = await self._aupload_with_retry(uploader, file, provider, context.size)
|
||||
if result is None:
|
||||
return None
|
||||
|
||||
if self.upload_cache is not None:
|
||||
await self.upload_cache.aset_by_hash(
|
||||
file_hash=context.content_hash,
|
||||
content_type=context.content_type,
|
||||
provider=provider,
|
||||
file_id=result.file_id,
|
||||
file_uri=result.file_uri,
|
||||
expires_at=result.expires_at,
|
||||
)
|
||||
|
||||
return FileReference(
|
||||
content_type=result.content_type,
|
||||
file_id=result.file_id,
|
||||
provider=result.provider,
|
||||
expires_at=result.expires_at,
|
||||
file_uri=result.file_uri,
|
||||
)
|
||||
|
||||
async def _aupload_with_retry(
|
||||
self,
|
||||
uploader: FileUploader,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
file_size: int,
|
||||
) -> UploadResult | None:
|
||||
"""Async upload with exponential backoff retry.
|
||||
|
||||
Args:
|
||||
uploader: The uploader to use.
|
||||
file: The file to upload.
|
||||
provider: Provider name for logging.
|
||||
file_size: Size of the file in bytes.
|
||||
|
||||
Returns:
|
||||
UploadResult if successful, None otherwise.
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
last_error: Exception | None = None
|
||||
|
||||
for attempt in range(UPLOAD_MAX_RETRIES):
|
||||
with measure_operation(
|
||||
"upload",
|
||||
filename=file.filename,
|
||||
provider=provider,
|
||||
size_bytes=file_size,
|
||||
attempt=attempt + 1,
|
||||
) as metrics:
|
||||
try:
|
||||
result = await uploader.aupload(file)
|
||||
metrics.metadata["file_id"] = result.file_id
|
||||
return result
|
||||
except PermanentUploadError as e:
|
||||
metrics.metadata["error_type"] = "permanent"
|
||||
logger.warning(
|
||||
f"Non-retryable upload error for {file.filename}: {e}"
|
||||
)
|
||||
return None
|
||||
except TransientUploadError as e:
|
||||
metrics.metadata["error_type"] = "transient"
|
||||
last_error = e
|
||||
except Exception as e:
|
||||
metrics.metadata["error_type"] = "unknown"
|
||||
last_error = e
|
||||
|
||||
if attempt < UPLOAD_MAX_RETRIES - 1:
|
||||
delay = UPLOAD_RETRY_DELAY_BASE**attempt
|
||||
logger.debug(
|
||||
f"Retrying upload for {file.filename} in {delay}s (attempt {attempt + 1})"
|
||||
)
|
||||
await asyncio.sleep(delay)
|
||||
|
||||
logger.warning(
|
||||
f"Upload failed for {file.filename} to {provider} after {UPLOAD_MAX_RETRIES} attempts: {last_error}"
|
||||
)
|
||||
return None
|
||||
|
||||
def _get_uploader(self, provider: str) -> FileUploader | None:
|
||||
"""Get or create an uploader for a provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
FileUploader instance or None if not available.
|
||||
"""
|
||||
if provider not in self._uploaders:
|
||||
uploader = get_uploader(provider)
|
||||
if uploader is not None:
|
||||
self._uploaders[provider] = uploader
|
||||
else:
|
||||
return None
|
||||
|
||||
return self._uploaders.get(provider)
|
||||
|
||||
def get_cached_uploads(self, provider: str) -> list[CachedUpload]:
|
||||
"""Get all cached uploads for a provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
List of cached uploads.
|
||||
"""
|
||||
if self.upload_cache is None:
|
||||
return []
|
||||
return self.upload_cache.get_all_for_provider(provider)
|
||||
|
||||
def clear_cache(self) -> None:
|
||||
"""Clear the upload cache."""
|
||||
if self.upload_cache is not None:
|
||||
self.upload_cache.clear()
|
||||
|
||||
|
||||
def create_resolver(
|
||||
provider: str | None = None,
|
||||
prefer_upload: bool = False,
|
||||
upload_threshold_bytes: int | None = None,
|
||||
enable_cache: bool = True,
|
||||
) -> FileResolver:
|
||||
"""Create a configured FileResolver.
|
||||
|
||||
Args:
|
||||
provider: Optional provider name for provider-specific configuration.
|
||||
prefer_upload: Whether to prefer upload over inline.
|
||||
upload_threshold_bytes: Size threshold for using upload.
|
||||
enable_cache: Whether to enable upload caching.
|
||||
|
||||
Returns:
|
||||
Configured FileResolver instance.
|
||||
"""
|
||||
config = FileResolverConfig(
|
||||
prefer_upload=prefer_upload,
|
||||
upload_threshold_bytes=upload_threshold_bytes,
|
||||
)
|
||||
|
||||
cache = UploadCache() if enable_cache else None
|
||||
|
||||
return FileResolver(config=config, upload_cache=cache)
|
||||
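A short sketch of the factory above; in the body shown here the provider argument does not alter the config, so the knobs that matter are the upload preference, the threshold, and the cache (the 5 MB threshold is an assumed example):

# Sketch only: build a resolver that uploads anything over ~5 MB.
from crewai.files.resolver import create_resolver

resolver = create_resolver(
    prefer_upload=False,
    upload_threshold_bytes=5 * 1024 * 1024,  # assumed threshold for illustration
    enable_cache=True,
)
print(resolver.config.upload_threshold_bytes)  # 5242880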
@@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import atexit
|
||||
import builtins
|
||||
from collections.abc import Iterator
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
import hashlib
|
||||
@@ -16,7 +17,7 @@ from aiocache.serializers import PickleSerializer # type: ignore[import-untyped
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from crewai.utilities.files.content_types import (
|
||||
from crewai.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
@@ -31,6 +32,7 @@ if TYPE_CHECKING:
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_TTL_SECONDS = 24 * 60 * 60 # 24 hours
|
||||
DEFAULT_MAX_CACHE_ENTRIES = 1000
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -65,8 +67,31 @@ def _make_key(file_hash: str, provider: str) -> str:
|
||||
return f"upload:{provider}:{file_hash}"
|
||||
|
||||
|
||||
def _compute_file_hash_streaming(chunks: Iterator[bytes]) -> str:
|
||||
"""Compute SHA-256 hash from streaming chunks.
|
||||
|
||||
Args:
|
||||
chunks: Iterator of byte chunks.
|
||||
|
||||
Returns:
|
||||
Hexadecimal hash string.
|
||||
"""
|
||||
hasher = hashlib.sha256()
|
||||
for chunk in chunks:
|
||||
hasher.update(chunk)
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def _compute_file_hash(file: FileInput) -> str:
|
||||
"""Compute SHA-256 hash of file content."""
|
||||
"""Compute SHA-256 hash of file content.
|
||||
|
||||
Uses streaming for FilePath sources to avoid loading large files into memory.
|
||||
"""
|
||||
from crewai.files.file import FilePath
|
||||
|
||||
source = file._file_source
|
||||
if isinstance(source, FilePath):
|
||||
return _compute_file_hash_streaming(source.read_chunks(chunk_size=1024 * 1024))
|
||||
content = file.read()
|
||||
return hashlib.sha256(content).hexdigest()
|
||||
|
||||
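The streaming and whole-file hashes above are interchangeable; a small stdlib-only check of that property:

import hashlib

data = b"example file content" * 1000

# Hash computed in one shot.
whole = hashlib.sha256(data).hexdigest()

# Hash computed from chunks (smaller than 1 MiB here, purely for illustration).
hasher = hashlib.sha256()
for start in range(0, len(data), 4096):
    hasher.update(data[start : start + 4096])

assert whole == hasher.hexdigest()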
@@ -87,6 +112,7 @@ class UploadCache:
|
||||
ttl: int = DEFAULT_TTL_SECONDS,
|
||||
namespace: str = "crewai_uploads",
|
||||
cache_type: str = "memory",
|
||||
max_entries: int | None = DEFAULT_MAX_CACHE_ENTRIES,
|
||||
**cache_kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize the upload cache.
|
||||
@@ -95,11 +121,14 @@ class UploadCache:
|
||||
ttl: Default TTL in seconds.
|
||||
namespace: Cache namespace.
|
||||
cache_type: Backend type ("memory" or "redis").
|
||||
max_entries: Maximum cache entries (None for unlimited).
|
||||
**cache_kwargs: Additional args for cache backend.
|
||||
"""
|
||||
self.ttl = ttl
|
||||
self.namespace = namespace
|
||||
self.max_entries = max_entries
|
||||
self._provider_keys: dict[str, set[str]] = {}
|
||||
self._key_access_order: list[str] = []
|
||||
|
||||
if cache_type == "redis":
|
||||
self._cache = Cache(
|
||||
@@ -116,15 +145,60 @@ class UploadCache:
|
||||
)
|
||||
|
||||
def _track_key(self, provider: str, key: str) -> None:
|
||||
"""Track a key for a provider (for cleanup)."""
|
||||
"""Track a key for a provider (for cleanup) and access order."""
|
||||
if provider not in self._provider_keys:
|
||||
self._provider_keys[provider] = set()
|
||||
self._provider_keys[provider].add(key)
|
||||
if key in self._key_access_order:
|
||||
self._key_access_order.remove(key)
|
||||
self._key_access_order.append(key)
|
||||
|
||||
def _untrack_key(self, provider: str, key: str) -> None:
|
||||
"""Remove key tracking for a provider."""
|
||||
if provider in self._provider_keys:
|
||||
self._provider_keys[provider].discard(key)
|
||||
if key in self._key_access_order:
|
||||
self._key_access_order.remove(key)
|
||||
|
||||
async def _evict_if_needed(self) -> int:
|
||||
"""Evict oldest entries if limit exceeded.
|
||||
|
||||
Returns:
|
||||
Number of entries evicted.
|
||||
"""
|
||||
if self.max_entries is None:
|
||||
return 0
|
||||
|
||||
current_count = len(self)
|
||||
if current_count < self.max_entries:
|
||||
return 0
|
||||
|
||||
to_evict = max(1, self.max_entries // 10)
|
||||
return await self._evict_oldest(to_evict)
|
||||
|
||||
async def _evict_oldest(self, count: int) -> int:
|
||||
"""Evict the oldest entries from the cache.
|
||||
|
||||
Args:
|
||||
count: Number of entries to evict.
|
||||
|
||||
Returns:
|
||||
Number of entries actually evicted.
|
||||
"""
|
||||
evicted = 0
|
||||
keys_to_evict = self._key_access_order[:count]
|
||||
|
||||
for key in keys_to_evict:
|
||||
await self._cache.delete(key)
|
||||
self._key_access_order.remove(key)
|
||||
for provider_keys in self._provider_keys.values():
|
||||
provider_keys.discard(key)
|
||||
evicted += 1
|
||||
|
||||
if evicted > 0:
|
||||
logger.debug(f"Evicted {evicted} oldest cache entries")
|
||||
|
||||
return evicted
|
||||
|
||||
async def aget(self, file: FileInput, provider: str) -> CachedUpload | None:
|
||||
"""Get a cached upload for a file.
|
||||
@@ -214,6 +288,8 @@ class UploadCache:
|
||||
Returns:
|
||||
The created cache entry.
|
||||
"""
|
||||
await self._evict_if_needed()
|
||||
|
||||
key = _make_key(file_hash, provider)
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
@@ -331,18 +407,15 @@ class UploadCache:
|
||||
return results
|
||||
|
||||
def _run_sync(self, coro: Any) -> Any:
|
||||
"""Run an async coroutine from sync context."""
|
||||
"""Run an async coroutine from sync context without blocking event loop."""
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
except RuntimeError:
|
||||
loop = None
|
||||
|
||||
if loop is not None and loop.is_running():
|
||||
import concurrent.futures
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as pool:
|
||||
future = pool.submit(asyncio.run, coro)
|
||||
return future.result()
|
||||
future = asyncio.run_coroutine_threadsafe(coro, loop)
|
||||
return future.result(timeout=30)
|
||||
return asyncio.run(coro)
|
||||
|
||||
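For the common case, where the sync wrappers below are called with no event loop running in the current thread, the bridge above reduces to asyncio.run; a trivial stdlib-only sketch of that case:

import asyncio


async def fetch_value() -> int:
    await asyncio.sleep(0)
    return 42


def fetch_value_sync() -> int:
    # No running loop in this thread, so asyncio.run drives the coroutine directly.
    return asyncio.run(fetch_value())


print(fetch_value_sync())  # 42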
def get(self, file: FileInput, provider: str) -> CachedUpload | None:
|
||||
@@ -473,7 +546,7 @@ def _cleanup_on_exit() -> None:
|
||||
if _default_cache is None or len(_default_cache) == 0:
|
||||
return
|
||||
|
||||
from crewai.utilities.files.cleanup import cleanup_uploaded_files
|
||||
from crewai.files.cleanup import cleanup_uploaded_files
|
||||
|
||||
try:
|
||||
cleanup_uploaded_files(_default_cache, delete_from_provider=True)
|
||||
@@ -5,7 +5,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from crewai.utilities.files.uploaders.base import FileUploader, UploadResult
|
||||
from crewai.files.uploaders.base import FileUploader, UploadResult
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -31,7 +31,7 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
|
||||
|
||||
if "gemini" in provider_lower or "google" in provider_lower:
|
||||
try:
|
||||
from crewai.utilities.files.uploaders.gemini import GeminiFileUploader
|
||||
from crewai.files.uploaders.gemini import GeminiFileUploader
|
||||
|
||||
return GeminiFileUploader(**kwargs)
|
||||
except ImportError:
|
||||
@@ -42,7 +42,7 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
|
||||
|
||||
if "anthropic" in provider_lower or "claude" in provider_lower:
|
||||
try:
|
||||
from crewai.utilities.files.uploaders.anthropic import AnthropicFileUploader
|
||||
from crewai.files.uploaders.anthropic import AnthropicFileUploader
|
||||
|
||||
return AnthropicFileUploader(**kwargs)
|
||||
except ImportError:
|
||||
@@ -53,12 +53,32 @@ def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
|
||||
|
||||
if "openai" in provider_lower or "gpt" in provider_lower:
|
||||
try:
|
||||
from crewai.utilities.files.uploaders.openai import OpenAIFileUploader
|
||||
from crewai.files.uploaders.openai import OpenAIFileUploader
|
||||
|
||||
return OpenAIFileUploader(**kwargs)
|
||||
except ImportError:
|
||||
logger.warning("openai not installed. Install with: pip install openai")
|
||||
return None
|
||||
|
||||
if "bedrock" in provider_lower or "aws" in provider_lower:
|
||||
import os
|
||||
|
||||
if (
|
||||
not os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
|
||||
and "bucket_name" not in kwargs
|
||||
):
|
||||
logger.debug(
|
||||
"Bedrock S3 uploader not configured. "
|
||||
"Set CREWAI_BEDROCK_S3_BUCKET environment variable to enable."
|
||||
)
|
||||
return None
|
||||
try:
|
||||
from crewai.files.uploaders.bedrock import BedrockFileUploader
|
||||
|
||||
return BedrockFileUploader(**kwargs)
|
||||
except ImportError:
|
||||
logger.warning("boto3 not installed. Install with: pip install boto3")
|
||||
return None
|
||||
|
||||
logger.debug(f"No file uploader available for provider: {provider}")
|
||||
return None
|
||||
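A small sketch of the factory above; which uploader (if any) comes back depends on the provider/model string and on which optional SDKs are installed (the model string is a placeholder):

# Sketch only: pick an uploader for a provider string, falling back to inline delivery.
from crewai.files.uploaders import get_uploader

uploader = get_uploader("gemini/gemini-2.0-flash")
if uploader is None:
    print("no uploader available; files would be sent inline")
else:
    print("using uploader for:", uploader.provider_name)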
320
lib/crewai/src/crewai/files/uploaders/anthropic.py
Normal file
@@ -0,0 +1,320 @@
|
||||
"""Anthropic Files API uploader implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from crewai.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.files.uploaders.base import FileUploader, UploadResult
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
|
||||
class AnthropicFileUploader(FileUploader):
|
||||
"""Uploader for Anthropic Files API.
|
||||
|
||||
Uses the anthropic SDK to upload files. Files are stored persistently
|
||||
until explicitly deleted.
|
||||
|
||||
Attributes:
|
||||
api_key: Optional API key (uses ANTHROPIC_API_KEY env var if not provided).
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: str | None = None) -> None:
|
||||
"""Initialize the Anthropic uploader.
|
||||
|
||||
Args:
|
||||
api_key: Optional Anthropic API key. If not provided, uses
|
||||
ANTHROPIC_API_KEY environment variable.
|
||||
"""
|
||||
self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
||||
self._client: Any = None
|
||||
self._async_client: Any = None
|
||||
|
||||
@property
|
||||
def provider_name(self) -> str:
|
||||
"""Return the provider name."""
|
||||
return "anthropic"
|
||||
|
||||
def _get_client(self) -> Any:
|
||||
"""Get or create the Anthropic client."""
|
||||
if self._client is None:
|
||||
try:
|
||||
import anthropic
|
||||
|
||||
self._client = anthropic.Anthropic(api_key=self._api_key)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"anthropic is required for Anthropic file uploads. "
|
||||
"Install with: pip install anthropic"
|
||||
) from e
|
||||
return self._client
|
||||
|
||||
def _get_async_client(self) -> Any:
|
||||
"""Get or create the async Anthropic client."""
|
||||
if self._async_client is None:
|
||||
try:
|
||||
import anthropic
|
||||
|
||||
self._async_client = anthropic.AsyncAnthropic(api_key=self._api_key)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"anthropic is required for Anthropic file uploads. "
|
||||
"Install with: pip install anthropic"
|
||||
) from e
|
||||
return self._async_client
|
||||
|
||||
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
|
||||
"""Upload a file to Anthropic.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose for the file (default: "user_upload").
|
||||
|
||||
Returns:
|
||||
UploadResult with the file ID and metadata.
|
||||
|
||||
Raises:
|
||||
TransientUploadError: For retryable errors (network, rate limits).
|
||||
PermanentUploadError: For non-retryable errors (auth, validation).
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
try:
|
||||
client = self._get_client()
|
||||
|
||||
content = file.read()
|
||||
file_purpose = purpose or "user_upload"
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = client.files.create(
|
||||
file=(file.filename, file_data, file.content_type),
|
||||
purpose=file_purpose,
|
||||
)
|
||||
|
||||
logger.info(f"Uploaded to Anthropic: {uploaded_file.id}")
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.id,
|
||||
file_uri=None,
|
||||
content_type=file.content_type,
|
||||
expires_at=None,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
except ImportError:
|
||||
raise
|
||||
except Exception as e:
|
||||
error_type = type(e).__name__
|
||||
if "RateLimit" in error_type or "APIConnection" in error_type:
|
||||
raise TransientUploadError(
|
||||
f"Transient upload error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "Authentication" in error_type or "Permission" in error_type:
|
||||
raise PermanentUploadError(
|
||||
f"Authentication/permission error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "BadRequest" in error_type or "InvalidRequest" in error_type:
|
||||
raise PermanentUploadError(
|
||||
f"Invalid request: {e}", file_name=file.filename
|
||||
) from e
|
||||
status_code = getattr(e, "status_code", None)
|
||||
if status_code is not None:
|
||||
if status_code >= 500 or status_code == 429:
|
||||
raise TransientUploadError(
|
||||
f"Server error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code in (401, 403):
|
||||
raise PermanentUploadError(
|
||||
f"Auth error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code == 400:
|
||||
raise PermanentUploadError(
|
||||
f"Bad request ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
raise TransientUploadError(
|
||||
f"Upload failed: {e}", file_name=file.filename
|
||||
) from e
|
||||
|
||||
def delete(self, file_id: str) -> bool:
|
||||
"""Delete an uploaded file from Anthropic.
|
||||
|
||||
Args:
|
||||
file_id: The file ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
client.files.delete(file_id=file_id)
|
||||
logger.info(f"Deleted Anthropic file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete Anthropic file {file_id}: {e}")
|
||||
return False
|
||||
|
||||
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
|
||||
"""Get information about an uploaded file.
|
||||
|
||||
Args:
|
||||
file_id: The file ID.
|
||||
|
||||
Returns:
|
||||
Dictionary with file information, or None if not found.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
file_info = client.files.retrieve(file_id=file_id)
|
||||
return {
|
||||
"id": file_info.id,
|
||||
"filename": file_info.filename,
|
||||
"purpose": file_info.purpose,
|
||||
"size_bytes": file_info.size_bytes,
|
||||
"created_at": file_info.created_at,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get Anthropic file info for {file_id}: {e}")
|
||||
return None
|
||||
|
||||
def list_files(self) -> list[dict[str, Any]]:
|
||||
"""List all uploaded files.
|
||||
|
||||
Returns:
|
||||
List of dictionaries with file information.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
files = client.files.list()
|
||||
return [
|
||||
{
|
||||
"id": f.id,
|
||||
"filename": f.filename,
|
||||
"purpose": f.purpose,
|
||||
"size_bytes": f.size_bytes,
|
||||
"created_at": f.created_at,
|
||||
}
|
||||
for f in files.data
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to list Anthropic files: {e}")
|
||||
return []
|
||||
|
||||
async def aupload(
|
||||
self, file: FileInput, purpose: str | None = None
|
||||
) -> UploadResult:
|
||||
"""Async upload a file to Anthropic using native async client.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose for the file (default: "user_upload").
|
||||
|
||||
Returns:
|
||||
UploadResult with the file ID and metadata.
|
||||
|
||||
Raises:
|
||||
TransientUploadError: For retryable errors (network, rate limits).
|
||||
PermanentUploadError: For non-retryable errors (auth, validation).
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
try:
|
||||
client = self._get_async_client()
|
||||
|
||||
content = await file.aread()
|
||||
file_purpose = purpose or "user_upload"
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = await client.files.create(
|
||||
file=(file.filename, file_data, file.content_type),
|
||||
purpose=file_purpose,
|
||||
)
|
||||
|
||||
logger.info(f"Uploaded to Anthropic: {uploaded_file.id}")
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.id,
|
||||
file_uri=None,
|
||||
content_type=file.content_type,
|
||||
expires_at=None,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
except ImportError:
|
||||
raise
|
||||
except Exception as e:
|
||||
error_type = type(e).__name__
|
||||
if "RateLimit" in error_type or "APIConnection" in error_type:
|
||||
raise TransientUploadError(
|
||||
f"Transient upload error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "Authentication" in error_type or "Permission" in error_type:
|
||||
raise PermanentUploadError(
|
||||
f"Authentication/permission error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "BadRequest" in error_type or "InvalidRequest" in error_type:
|
||||
raise PermanentUploadError(
|
||||
f"Invalid request: {e}", file_name=file.filename
|
||||
) from e
|
||||
status_code = getattr(e, "status_code", None)
|
||||
if status_code is not None:
|
||||
if status_code >= 500 or status_code == 429:
|
||||
raise TransientUploadError(
|
||||
f"Server error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code in (401, 403):
|
||||
raise PermanentUploadError(
|
||||
f"Auth error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code == 400:
|
||||
raise PermanentUploadError(
|
||||
f"Bad request ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
raise TransientUploadError(
|
||||
f"Upload failed: {e}", file_name=file.filename
|
||||
) from e
|
||||
|
||||
async def adelete(self, file_id: str) -> bool:
|
||||
"""Async delete an uploaded file from Anthropic.
|
||||
|
||||
Args:
|
||||
file_id: The file ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_async_client()
|
||||
await client.files.delete(file_id=file_id)
|
||||
logger.info(f"Deleted Anthropic file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete Anthropic file {file_id}: {e}")
|
||||
return False
|
||||
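A minimal sketch of driving the uploader above directly, assuming ANTHROPIC_API_KEY is set, the anthropic SDK is installed, and that FileBytes is importable as noted earlier (bytes and filename are placeholders):

# Sketch only: upload a PDF, inspect it, then clean it up.
from crewai.files import PDFFile
from crewai.files.file import FileBytes  # location assumed from this diff
from crewai.files.uploaders.anthropic import AnthropicFileUploader

uploader = AnthropicFileUploader()
pdf = PDFFile(source=FileBytes(data=b"%PDF-1.4 ...", filename="report.pdf"))

result = uploader.upload(pdf)
print("uploaded:", result.file_id)

info = uploader.get_file_info(result.file_id)
print("size on server:", info["size_bytes"] if info else "unknown")

uploader.delete(result.file_id)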
@@ -1,11 +1,12 @@
|
||||
"""Base class for file uploaders."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from crewai.utilities.files.content_types import (
|
||||
from crewai.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
@@ -63,6 +64,24 @@ class FileUploader(ABC):
|
||||
Exception: If upload fails.
|
||||
"""
|
||||
|
||||
async def aupload(
|
||||
self, file: FileInput, purpose: str | None = None
|
||||
) -> UploadResult:
|
||||
"""Async upload a file to the provider.
|
||||
|
||||
Default implementation runs sync upload in executor.
|
||||
Override in subclasses for native async support.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose/description for the upload.
|
||||
|
||||
Returns:
|
||||
UploadResult with the file identifier and metadata.
|
||||
"""
|
||||
loop = asyncio.get_running_loop()
|
||||
return await loop.run_in_executor(None, self.upload, file, purpose)
|
||||
|
||||
@abstractmethod
|
||||
def delete(self, file_id: str) -> bool:
|
||||
"""Delete an uploaded file.
|
||||
@@ -74,6 +93,21 @@ class FileUploader(ABC):
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
|
||||
async def adelete(self, file_id: str) -> bool:
|
||||
"""Async delete an uploaded file.
|
||||
|
||||
Default implementation runs sync delete in executor.
|
||||
Override in subclasses for native async support.
|
||||
|
||||
Args:
|
||||
file_id: The file identifier to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
loop = asyncio.get_running_loop()
|
||||
return await loop.run_in_executor(None, self.delete, file_id)
|
||||
|
||||
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
|
||||
"""Get information about an uploaded file.
|
||||
|
||||
388
lib/crewai/src/crewai/files/uploaders/bedrock.py
Normal file
@@ -0,0 +1,388 @@
|
||||
"""AWS Bedrock S3 file uploader implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from crewai.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.files.uploaders.base import FileUploader, UploadResult
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
|
||||
class BedrockFileUploader(FileUploader):
|
||||
"""Uploader for AWS Bedrock via S3.
|
||||
|
||||
Uploads files to S3 and returns S3 URIs that can be used with Bedrock's
|
||||
Converse API s3Location source format.
|
||||
|
||||
Attributes:
|
||||
bucket_name: S3 bucket name for file uploads.
|
||||
bucket_owner: Optional bucket owner account ID for cross-account access.
|
||||
prefix: Optional S3 key prefix for uploaded files.
|
||||
region: AWS region for the S3 bucket.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
bucket_name: str | None = None,
|
||||
bucket_owner: str | None = None,
|
||||
prefix: str = "crewai-files",
|
||||
region: str | None = None,
|
||||
) -> None:
|
||||
"""Initialize the Bedrock S3 uploader.
|
||||
|
||||
Args:
|
||||
bucket_name: S3 bucket name. If not provided, uses
|
||||
CREWAI_BEDROCK_S3_BUCKET environment variable.
|
||||
bucket_owner: Optional bucket owner account ID for cross-account access.
|
||||
Uses CREWAI_BEDROCK_S3_BUCKET_OWNER environment variable if not provided.
|
||||
prefix: S3 key prefix for uploaded files (default: "crewai-files").
|
||||
region: AWS region. Uses AWS_REGION or AWS_DEFAULT_REGION if not provided.
|
||||
"""
|
||||
self._bucket_name = bucket_name or os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
|
||||
self._bucket_owner = bucket_owner or os.environ.get(
|
||||
"CREWAI_BEDROCK_S3_BUCKET_OWNER"
|
||||
)
|
||||
self._prefix = prefix
|
||||
self._region = region or os.environ.get(
|
||||
"AWS_REGION", os.environ.get("AWS_DEFAULT_REGION")
|
||||
)
|
||||
self._client: Any = None
|
||||
self._async_client: Any = None
|
||||
|
||||
@property
|
||||
def provider_name(self) -> str:
|
||||
"""Return the provider name."""
|
||||
return "bedrock"
|
||||
|
||||
@property
|
||||
def bucket_name(self) -> str:
|
||||
"""Return the configured bucket name."""
|
||||
if not self._bucket_name:
|
||||
raise ValueError(
|
||||
"S3 bucket name not configured. Set CREWAI_BEDROCK_S3_BUCKET "
|
||||
"environment variable or pass bucket_name parameter."
|
||||
)
|
||||
return self._bucket_name
|
||||
|
||||
@property
|
||||
def bucket_owner(self) -> str | None:
|
||||
"""Return the configured bucket owner."""
|
||||
return self._bucket_owner
|
||||
|
||||
def _get_client(self) -> Any:
|
||||
"""Get or create the S3 client."""
|
||||
if self._client is None:
|
||||
try:
|
||||
import boto3
|
||||
|
||||
self._client = boto3.client("s3", region_name=self._region)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"boto3 is required for Bedrock S3 file uploads. "
|
||||
"Install with: pip install boto3"
|
||||
) from e
|
||||
return self._client
|
||||
|
||||
def _get_async_client(self) -> Any:
|
||||
"""Get or create the async S3 client."""
|
||||
if self._async_client is None:
|
||||
try:
|
||||
import aioboto3 # type: ignore[import-not-found]
|
||||
|
||||
self._async_client = aioboto3.Session()
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"aioboto3 is required for async Bedrock S3 file uploads. "
|
||||
"Install with: pip install aioboto3"
|
||||
) from e
|
||||
return self._async_client
|
||||
|
||||
def _generate_s3_key(self, file: FileInput, content: bytes) -> str:
|
||||
"""Generate a unique S3 key for the file.
|
||||
|
||||
Args:
|
||||
file: The file being uploaded.
|
||||
content: The file content bytes.
|
||||
|
||||
Returns:
|
||||
S3 key string.
|
||||
"""
|
||||
content_hash = hashlib.sha256(content).hexdigest()[:16]
|
||||
filename = file.filename or "file"
|
||||
|
||||
safe_filename = "".join(
|
||||
c if c.isalnum() or c in ".-_" else "_" for c in filename
|
||||
)
|
||||
return f"{self._prefix}/{content_hash}_{safe_filename}"
|
||||
|
||||
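The key layout above is prefix/hash-prefix_sanitized-name; a small stdlib-only sketch of the same transformation with an assumed filename:

import hashlib

prefix = "crewai-files"
filename = "Q3 report (final).pdf"   # assumed example name
content = b"fake pdf bytes"

content_hash = hashlib.sha256(content).hexdigest()[:16]
safe_filename = "".join(c if c.isalnum() or c in ".-_" else "_" for c in filename)

print(f"{prefix}/{content_hash}_{safe_filename}")
# e.g. crewai-files/<16 hex chars>_Q3_report__final_.pdf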
def _build_s3_uri(self, key: str) -> str:
|
||||
"""Build an S3 URI from a key.
|
||||
|
||||
Args:
|
||||
key: The S3 object key.
|
||||
|
||||
Returns:
|
||||
S3 URI string.
|
||||
"""
|
||||
return f"s3://{self.bucket_name}/{key}"
|
||||
|
||||
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
|
||||
"""Upload a file to S3 for use with Bedrock.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose (unused, kept for interface consistency).
|
||||
|
||||
Returns:
|
||||
UploadResult with the S3 URI and metadata.
|
||||
|
||||
Raises:
|
||||
TransientUploadError: For retryable errors (network, throttling).
|
||||
PermanentUploadError: For non-retryable errors (auth, validation).
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
try:
|
||||
client = self._get_client()
|
||||
content = file.read()
|
||||
s3_key = self._generate_s3_key(file, content)
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to S3 bucket "
|
||||
f"'{self.bucket_name}' ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
client.put_object(
|
||||
Bucket=self.bucket_name,
|
||||
Key=s3_key,
|
||||
Body=content,
|
||||
ContentType=file.content_type,
|
||||
)
|
||||
|
||||
s3_uri = self._build_s3_uri(s3_key)
|
||||
logger.info(f"Uploaded to S3: {s3_uri}")
|
||||
|
||||
return UploadResult(
|
||||
file_id=s3_key,
|
||||
file_uri=s3_uri,
|
||||
content_type=file.content_type,
|
||||
expires_at=None,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
except ImportError:
|
||||
raise
|
||||
except Exception as e:
|
||||
error_type = type(e).__name__
|
||||
error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "")
|
||||
|
||||
if error_code in ("SlowDown", "ServiceUnavailable", "InternalError"):
|
||||
raise TransientUploadError(
|
||||
f"Transient S3 error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if error_code in (
|
||||
"AccessDenied",
|
||||
"InvalidAccessKeyId",
|
||||
"SignatureDoesNotMatch",
|
||||
):
|
||||
raise PermanentUploadError(
|
||||
f"S3 authentication error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if error_code in ("NoSuchBucket", "InvalidBucketName"):
|
||||
raise PermanentUploadError(
|
||||
f"S3 bucket error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "Throttl" in error_type or "Throttl" in str(e):
|
||||
raise TransientUploadError(
|
||||
f"S3 throttling: {e}", file_name=file.filename
|
||||
) from e
|
||||
raise TransientUploadError(
|
||||
f"S3 upload failed: {e}", file_name=file.filename
|
||||
) from e
|
||||
|
||||
def delete(self, file_id: str) -> bool:
|
||||
"""Delete an uploaded file from S3.
|
||||
|
||||
Args:
|
||||
file_id: The S3 key to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
client.delete_object(Bucket=self.bucket_name, Key=file_id)
|
||||
logger.info(f"Deleted S3 object: s3://{self.bucket_name}/{file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to delete S3 object s3://{self.bucket_name}/{file_id}: {e}"
|
||||
)
|
||||
return False
|
||||
|
||||
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
|
||||
"""Get information about an uploaded file.
|
||||
|
||||
Args:
|
||||
file_id: The S3 key.
|
||||
|
||||
Returns:
|
||||
Dictionary with file information, or None if not found.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
response = client.head_object(Bucket=self.bucket_name, Key=file_id)
|
||||
return {
|
||||
"id": file_id,
|
||||
"uri": self._build_s3_uri(file_id),
|
||||
"content_type": response.get("ContentType"),
|
||||
"size": response.get("ContentLength"),
|
||||
"last_modified": response.get("LastModified"),
|
||||
"etag": response.get("ETag"),
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get S3 object info for {file_id}: {e}")
|
||||
return None
|
||||
|
||||
def list_files(self) -> list[dict[str, Any]]:
|
||||
"""List all uploaded files in the configured prefix.
|
||||
|
||||
Returns:
|
||||
List of dictionaries with file information.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
response = client.list_objects_v2(
|
||||
Bucket=self.bucket_name,
|
||||
Prefix=self._prefix,
|
||||
)
|
||||
return [
|
||||
{
|
||||
"id": obj["Key"],
|
||||
"uri": self._build_s3_uri(obj["Key"]),
|
||||
"size": obj.get("Size"),
|
||||
"last_modified": obj.get("LastModified"),
|
||||
"etag": obj.get("ETag"),
|
||||
}
|
||||
for obj in response.get("Contents", [])
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to list S3 objects: {e}")
|
||||
return []
|
||||
|
||||
async def aupload(
|
||||
self, file: FileInput, purpose: str | None = None
|
||||
) -> UploadResult:
|
||||
"""Async upload a file to S3 for use with Bedrock.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose (unused, kept for interface consistency).
|
||||
|
||||
Returns:
|
||||
UploadResult with the S3 URI and metadata.
|
||||
|
||||
Raises:
|
||||
TransientUploadError: For retryable errors (network, throttling).
|
||||
PermanentUploadError: For non-retryable errors (auth, validation).
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
try:
|
||||
session = self._get_async_client()
|
||||
content = await file.aread()
|
||||
s3_key = self._generate_s3_key(file, content)
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to S3 bucket "
|
||||
f"'{self.bucket_name}' ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
async with session.client("s3", region_name=self._region) as client:
|
||||
await client.put_object(
|
||||
Bucket=self.bucket_name,
|
||||
Key=s3_key,
|
||||
Body=content,
|
||||
ContentType=file.content_type,
|
||||
)
|
||||
|
||||
s3_uri = self._build_s3_uri(s3_key)
|
||||
logger.info(f"Uploaded to S3: {s3_uri}")
|
||||
|
||||
return UploadResult(
|
||||
file_id=s3_key,
|
||||
file_uri=s3_uri,
|
||||
content_type=file.content_type,
|
||||
expires_at=None,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
except ImportError:
|
||||
raise
|
||||
except Exception as e:
|
||||
error_type = type(e).__name__
|
||||
error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "")
|
||||
|
||||
if error_code in ("SlowDown", "ServiceUnavailable", "InternalError"):
|
||||
raise TransientUploadError(
|
||||
f"Transient S3 error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if error_code in (
|
||||
"AccessDenied",
|
||||
"InvalidAccessKeyId",
|
||||
"SignatureDoesNotMatch",
|
||||
):
|
||||
raise PermanentUploadError(
|
||||
f"S3 authentication error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if error_code in ("NoSuchBucket", "InvalidBucketName"):
|
||||
raise PermanentUploadError(
|
||||
f"S3 bucket error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "Throttl" in error_type or "Throttl" in str(e):
|
||||
raise TransientUploadError(
|
||||
f"S3 throttling: {e}", file_name=file.filename
|
||||
) from e
|
||||
raise TransientUploadError(
|
||||
f"S3 upload failed: {e}", file_name=file.filename
|
||||
) from e
|
||||
|
||||
async def adelete(self, file_id: str) -> bool:
|
||||
"""Async delete an uploaded file from S3.
|
||||
|
||||
Args:
|
||||
file_id: The S3 key to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
session = self._get_async_client()
|
||||
async with session.client("s3", region_name=self._region) as client:
|
||||
await client.delete_object(Bucket=self.bucket_name, Key=file_id)
|
||||
logger.info(f"Deleted S3 object: s3://{self.bucket_name}/{file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to delete S3 object s3://{self.bucket_name}/{file_id}: {e}"
|
||||
)
|
||||
return False
|
||||
444
lib/crewai/src/crewai/files/uploaders/gemini.py
Normal file
@@ -0,0 +1,444 @@
|
||||
"""Gemini File API uploader implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from crewai.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.files.uploaders.base import FileUploader, UploadResult
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
GEMINI_FILE_TTL = timedelta(hours=48)
|
||||
|
||||
|
||||
class GeminiFileUploader(FileUploader):
|
||||
"""Uploader for Google Gemini File API.
|
||||
|
||||
Uses the google-genai SDK to upload files. Files are stored for 48 hours.
|
||||
|
||||
Attributes:
|
||||
api_key: Optional API key (uses GOOGLE_API_KEY env var if not provided).
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: str | None = None) -> None:
|
||||
"""Initialize the Gemini uploader.
|
||||
|
||||
Args:
|
||||
api_key: Optional Google API key. If not provided, uses
|
||||
GOOGLE_API_KEY environment variable.
|
||||
"""
|
||||
self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
|
||||
self._client: Any = None
|
||||
|
||||
@property
|
||||
def provider_name(self) -> str:
|
||||
"""Return the provider name."""
|
||||
return "gemini"
|
||||
|
||||
def _get_client(self) -> Any:
|
||||
"""Get or create the Gemini client."""
|
||||
if self._client is None:
|
||||
try:
|
||||
from google import genai
|
||||
|
||||
self._client = genai.Client(api_key=self._api_key)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"google-genai is required for Gemini file uploads. "
|
||||
"Install with: pip install google-genai"
|
||||
) from e
|
||||
return self._client
|
||||
|
||||
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
|
||||
"""Upload a file to Gemini.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose/description (used as display name).
|
||||
|
||||
Returns:
|
||||
UploadResult with the file URI and metadata.
|
||||
|
||||
Raises:
|
||||
TransientUploadError: For retryable errors (network, rate limits).
|
||||
PermanentUploadError: For non-retryable errors (auth, validation).
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
try:
|
||||
client = self._get_client()
|
||||
|
||||
content = file.read()
|
||||
display_name = purpose or file.filename
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
file_data.name = file.filename
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = client.files.upload(
|
||||
file=file_data,
|
||||
config={
|
||||
"display_name": display_name,
|
||||
"mime_type": file.content_type,
|
||||
},
|
||||
)
|
||||
|
||||
if file.content_type.startswith("video/"):
|
||||
if not self.wait_for_processing(uploaded_file.name):
|
||||
raise PermanentUploadError(
|
||||
f"Video processing failed for {file.filename}",
|
||||
file_name=file.filename,
|
||||
)
|
||||
|
||||
expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL
|
||||
|
||||
logger.info(
|
||||
f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})"
|
||||
)
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.name,
|
||||
file_uri=uploaded_file.uri,
|
||||
content_type=file.content_type,
|
||||
expires_at=expires_at,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
except ImportError:
|
||||
raise
|
||||
except (TransientUploadError, PermanentUploadError):
|
||||
raise
|
||||
except Exception as e:
|
||||
error_msg = str(e).lower()
|
||||
if "quota" in error_msg or "rate" in error_msg or "limit" in error_msg:
|
||||
raise TransientUploadError(
|
||||
f"Rate limit error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if (
|
||||
"auth" in error_msg
|
||||
or "permission" in error_msg
|
||||
or "denied" in error_msg
|
||||
):
|
||||
raise PermanentUploadError(
|
||||
f"Authentication/permission error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "invalid" in error_msg or "unsupported" in error_msg:
|
||||
raise PermanentUploadError(
|
||||
f"Invalid request: {e}", file_name=file.filename
|
||||
) from e
|
||||
status_code = getattr(e, "code", None) or getattr(e, "status_code", None)
|
||||
if status_code is not None:
|
||||
if isinstance(status_code, int):
|
||||
if status_code >= 500 or status_code == 429:
|
||||
raise TransientUploadError(
|
||||
f"Server error ({status_code}): {e}",
|
||||
file_name=file.filename,
|
||||
) from e
|
||||
if status_code in (401, 403):
|
||||
raise PermanentUploadError(
|
||||
f"Auth error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code == 400:
|
||||
raise PermanentUploadError(
|
||||
f"Bad request ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
raise TransientUploadError(
|
||||
f"Upload failed: {e}", file_name=file.filename
|
||||
) from e
|
||||
|
||||
async def aupload(
|
||||
self, file: FileInput, purpose: str | None = None
|
||||
) -> UploadResult:
|
||||
"""Async upload a file to Gemini using native async client.
|
||||
|
||||
Uses async wait_for_processing for video files.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose/description (used as display name).
|
||||
|
||||
Returns:
|
||||
UploadResult with the file URI and metadata.
|
||||
|
||||
Raises:
|
||||
TransientUploadError: For retryable errors (network, rate limits).
|
||||
PermanentUploadError: For non-retryable errors (auth, validation).
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
try:
|
||||
client = self._get_client()
|
||||
|
||||
content = await file.aread()
|
||||
display_name = purpose or file.filename
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
file_data.name = file.filename
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = await client.aio.files.upload(
|
||||
file=file_data,
|
||||
config={
|
||||
"display_name": display_name,
|
||||
"mime_type": file.content_type,
|
||||
},
|
||||
)
|
||||
|
||||
if file.content_type.startswith("video/"):
|
||||
if not await self.await_for_processing(uploaded_file.name):
|
||||
raise PermanentUploadError(
|
||||
f"Video processing failed for {file.filename}",
|
||||
file_name=file.filename,
|
||||
)
|
||||
|
||||
expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL
|
||||
|
||||
logger.info(
|
||||
f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})"
|
||||
)
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.name,
|
||||
file_uri=uploaded_file.uri,
|
||||
content_type=file.content_type,
|
||||
expires_at=expires_at,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
except ImportError:
|
||||
raise
|
||||
except (TransientUploadError, PermanentUploadError):
|
||||
raise
|
||||
except Exception as e:
|
||||
error_msg = str(e).lower()
|
||||
if "quota" in error_msg or "rate" in error_msg or "limit" in error_msg:
|
||||
raise TransientUploadError(
|
||||
f"Rate limit error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if (
|
||||
"auth" in error_msg
|
||||
or "permission" in error_msg
|
||||
or "denied" in error_msg
|
||||
):
|
||||
raise PermanentUploadError(
|
||||
f"Authentication/permission error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "invalid" in error_msg or "unsupported" in error_msg:
|
||||
raise PermanentUploadError(
|
||||
f"Invalid request: {e}", file_name=file.filename
|
||||
) from e
|
||||
status_code = getattr(e, "code", None) or getattr(e, "status_code", None)
|
||||
if status_code is not None and isinstance(status_code, int):
|
||||
if status_code >= 500 or status_code == 429:
|
||||
raise TransientUploadError(
|
||||
f"Server error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code in (401, 403):
|
||||
raise PermanentUploadError(
|
||||
f"Auth error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code == 400:
|
||||
raise PermanentUploadError(
|
||||
f"Bad request ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
raise TransientUploadError(
|
||||
f"Upload failed: {e}", file_name=file.filename
|
||||
) from e
|
||||
|
||||
def delete(self, file_id: str) -> bool:
|
||||
"""Delete an uploaded file from Gemini.
|
||||
|
||||
Args:
|
||||
file_id: The file name/ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
client.files.delete(name=file_id)
|
||||
logger.info(f"Deleted Gemini file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete Gemini file {file_id}: {e}")
|
||||
return False
|
||||
|
||||
async def adelete(self, file_id: str) -> bool:
|
||||
"""Async delete an uploaded file from Gemini.
|
||||
|
||||
Args:
|
||||
file_id: The file name/ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
await client.aio.files.delete(name=file_id)
|
||||
logger.info(f"Deleted Gemini file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete Gemini file {file_id}: {e}")
|
||||
return False
|
||||
|
||||
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
|
||||
"""Get information about an uploaded file.
|
||||
|
||||
Args:
|
||||
file_id: The file name/ID.
|
||||
|
||||
Returns:
|
||||
Dictionary with file information, or None if not found.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
file_info = client.files.get(name=file_id)
|
||||
return {
|
||||
"name": file_info.name,
|
||||
"uri": file_info.uri,
|
||||
"display_name": file_info.display_name,
|
||||
"mime_type": file_info.mime_type,
|
||||
"size_bytes": file_info.size_bytes,
|
||||
"state": str(file_info.state),
|
||||
"create_time": file_info.create_time,
|
||||
"expiration_time": file_info.expiration_time,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get Gemini file info for {file_id}: {e}")
|
||||
return None
|
||||
|
||||
def list_files(self) -> list[dict[str, Any]]:
|
||||
"""List all uploaded files.
|
||||
|
||||
Returns:
|
||||
List of dictionaries with file information.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
files = client.files.list()
|
||||
return [
|
||||
{
|
||||
"name": f.name,
|
||||
"uri": f.uri,
|
||||
"display_name": f.display_name,
|
||||
"mime_type": f.mime_type,
|
||||
"size_bytes": f.size_bytes,
|
||||
"state": str(f.state),
|
||||
}
|
||||
for f in files
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to list Gemini files: {e}")
|
||||
return []
|
||||
|
||||
def wait_for_processing(self, file_id: str, timeout_seconds: int = 300) -> bool:
|
||||
"""Wait for a file to finish processing with exponential backoff.
|
||||
|
||||
Some files (especially videos) need time to process after upload.
|
||||
|
||||
Args:
|
||||
file_id: The file name/ID.
|
||||
timeout_seconds: Maximum time to wait.
|
||||
|
||||
Returns:
|
||||
True if processing completed, False if timed out or failed.
|
||||
"""
|
||||
try:
|
||||
from google.genai.types import FileState
|
||||
except ImportError:
|
||||
return True
|
||||
|
||||
client = self._get_client()
|
||||
start_time = time.time()
|
||||
base_delay = 1.0
|
||||
max_delay = 30.0
|
||||
attempt = 0
|
||||
|
||||
while time.time() - start_time < timeout_seconds:
|
||||
file_info = client.files.get(name=file_id)
|
||||
|
||||
if file_info.state == FileState.ACTIVE:
|
||||
return True
|
||||
|
||||
if file_info.state == FileState.FAILED:
|
||||
logger.error(f"Gemini file processing failed: {file_id}")
|
||||
return False
|
||||
|
||||
delay = min(base_delay * (2**attempt), max_delay)
|
||||
jitter = random.uniform(0, delay * 0.1) # noqa: S311
|
||||
time.sleep(delay + jitter)
|
||||
attempt += 1
|
||||
|
||||
logger.warning(f"Timed out waiting for Gemini file processing: {file_id}")
|
||||
return False
|
||||
|
||||
async def await_for_processing(
|
||||
self, file_id: str, timeout_seconds: int = 300
|
||||
) -> bool:
|
||||
"""Async wait for a file to finish processing with exponential backoff.
|
||||
|
||||
Some files (especially videos) need time to process after upload.
|
||||
|
||||
Args:
|
||||
file_id: The file name/ID.
|
||||
timeout_seconds: Maximum time to wait.
|
||||
|
||||
Returns:
|
||||
True if processing completed, False if timed out or failed.
|
||||
"""
|
||||
try:
|
||||
from google.genai.types import FileState
|
||||
except ImportError:
|
||||
return True
|
||||
|
||||
client = self._get_client()
|
||||
start_time = time.time()
|
||||
base_delay = 1.0
|
||||
max_delay = 30.0
|
||||
attempt = 0
|
||||
|
||||
while time.time() - start_time < timeout_seconds:
|
||||
file_info = await client.aio.files.get(name=file_id)
|
||||
|
||||
if file_info.state == FileState.ACTIVE:
|
||||
return True
|
||||
|
||||
if file_info.state == FileState.FAILED:
|
||||
logger.error(f"Gemini file processing failed: {file_id}")
|
||||
return False
|
||||
|
||||
delay = min(base_delay * (2**attempt), max_delay)
|
||||
jitter = random.uniform(0, delay * 0.1) # noqa: S311
|
||||
await asyncio.sleep(delay + jitter)
|
||||
attempt += 1
|
||||
|
||||
logger.warning(f"Timed out waiting for Gemini file processing: {file_id}")
|
||||
return False
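Both wait helpers above poll files.get with capped exponential backoff plus jitter. A minimal standalone sketch of that delay schedule, using only the standard library (the function name is illustrative, not part of the crewAI API):

import random

def backoff_delays(base: float = 1.0, cap: float = 30.0, attempts: int = 8) -> list[float]:
    # min(base * 2**n, cap) plus up to 10% jitter, mirroring the polling loops above
    return [min(base * 2**n, cap) * (1 + random.uniform(0, 0.1)) for n in range(attempts)]

print([round(d, 1) for d in backoff_delays()])  # roughly 1, 2, 4, 8, 16, 30, 30, 30 seconds before jitter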
324
lib/crewai/src/crewai/files/uploaders/openai.py
Normal file
@@ -0,0 +1,324 @@
|
||||
"""OpenAI Files API uploader implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from crewai.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.files.uploaders.base import FileUploader, UploadResult
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
|
||||
class OpenAIFileUploader(FileUploader):
|
||||
"""Uploader for OpenAI Files API.
|
||||
|
||||
Uses the OpenAI SDK to upload files. Files are stored persistently
|
||||
until explicitly deleted.
|
||||
|
||||
Attributes:
|
||||
api_key: Optional API key (uses OPENAI_API_KEY env var if not provided).
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: str | None = None) -> None:
|
||||
"""Initialize the OpenAI uploader.
|
||||
|
||||
Args:
|
||||
api_key: Optional OpenAI API key. If not provided, uses
|
||||
OPENAI_API_KEY environment variable.
|
||||
"""
|
||||
self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||
self._client: Any = None
|
||||
self._async_client: Any = None
|
||||
|
||||
@property
|
||||
def provider_name(self) -> str:
|
||||
"""Return the provider name."""
|
||||
return "openai"
|
||||
|
||||
def _get_client(self) -> Any:
|
||||
"""Get or create the OpenAI client."""
|
||||
if self._client is None:
|
||||
try:
|
||||
from openai import OpenAI
|
||||
|
||||
self._client = OpenAI(api_key=self._api_key)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"openai is required for OpenAI file uploads. "
|
||||
"Install with: pip install openai"
|
||||
) from e
|
||||
return self._client
|
||||
|
||||
def _get_async_client(self) -> Any:
|
||||
"""Get or create the async OpenAI client."""
|
||||
if self._async_client is None:
|
||||
try:
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
self._async_client = AsyncOpenAI(api_key=self._api_key)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"openai is required for OpenAI file uploads. "
|
||||
"Install with: pip install openai"
|
||||
) from e
|
||||
return self._async_client
|
||||
|
||||
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
|
||||
"""Upload a file to OpenAI.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose for the file (default: "user_data").
|
||||
|
||||
Returns:
|
||||
UploadResult with the file ID and metadata.
|
||||
|
||||
Raises:
|
||||
TransientUploadError: For retryable errors (network, rate limits).
|
||||
PermanentUploadError: For non-retryable errors (auth, validation).
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
try:
|
||||
client = self._get_client()
|
||||
|
||||
content = file.read()
|
||||
file_purpose = purpose or "user_data"
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
file_data.name = file.filename or "file"
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = client.files.create(
|
||||
file=file_data,
|
||||
purpose=file_purpose,
|
||||
)
|
||||
|
||||
logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.id,
|
||||
file_uri=None,
|
||||
content_type=file.content_type,
|
||||
expires_at=None,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
except ImportError:
|
||||
raise
|
||||
except Exception as e:
|
||||
error_type = type(e).__name__
|
||||
if "RateLimit" in error_type or "APIConnection" in error_type:
|
||||
raise TransientUploadError(
|
||||
f"Transient upload error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "Authentication" in error_type or "Permission" in error_type:
|
||||
raise PermanentUploadError(
|
||||
f"Authentication/permission error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "BadRequest" in error_type or "InvalidRequest" in error_type:
|
||||
raise PermanentUploadError(
|
||||
f"Invalid request: {e}", file_name=file.filename
|
||||
) from e
|
||||
status_code = getattr(e, "status_code", None)
|
||||
if status_code is not None:
|
||||
if status_code >= 500 or status_code == 429:
|
||||
raise TransientUploadError(
|
||||
f"Server error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code in (401, 403):
|
||||
raise PermanentUploadError(
|
||||
f"Auth error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code == 400:
|
||||
raise PermanentUploadError(
|
||||
f"Bad request ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
raise TransientUploadError(
|
||||
f"Upload failed: {e}", file_name=file.filename
|
||||
) from e
|
||||
|
||||
def delete(self, file_id: str) -> bool:
|
||||
"""Delete an uploaded file from OpenAI.
|
||||
|
||||
Args:
|
||||
file_id: The file ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
client.files.delete(file_id)
|
||||
logger.info(f"Deleted OpenAI file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
|
||||
return False
|
||||
|
||||
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
|
||||
"""Get information about an uploaded file.
|
||||
|
||||
Args:
|
||||
file_id: The file ID.
|
||||
|
||||
Returns:
|
||||
Dictionary with file information, or None if not found.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
file_info = client.files.retrieve(file_id)
|
||||
return {
|
||||
"id": file_info.id,
|
||||
"filename": file_info.filename,
|
||||
"purpose": file_info.purpose,
|
||||
"bytes": file_info.bytes,
|
||||
"created_at": file_info.created_at,
|
||||
"status": file_info.status,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}")
|
||||
return None
|
||||
|
||||
def list_files(self) -> list[dict[str, Any]]:
|
||||
"""List all uploaded files.
|
||||
|
||||
Returns:
|
||||
List of dictionaries with file information.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
files = client.files.list()
|
||||
return [
|
||||
{
|
||||
"id": f.id,
|
||||
"filename": f.filename,
|
||||
"purpose": f.purpose,
|
||||
"bytes": f.bytes,
|
||||
"created_at": f.created_at,
|
||||
"status": f.status,
|
||||
}
|
||||
for f in files.data
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to list OpenAI files: {e}")
|
||||
return []
|
||||
|
||||
async def aupload(
|
||||
self, file: FileInput, purpose: str | None = None
|
||||
) -> UploadResult:
|
||||
"""Async upload a file to OpenAI using native async client.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose for the file (default: "user_data").
|
||||
|
||||
Returns:
|
||||
UploadResult with the file ID and metadata.
|
||||
|
||||
Raises:
|
||||
TransientUploadError: For retryable errors (network, rate limits).
|
||||
PermanentUploadError: For non-retryable errors (auth, validation).
|
||||
"""
|
||||
from crewai.files.processing.exceptions import (
|
||||
PermanentUploadError,
|
||||
TransientUploadError,
|
||||
)
|
||||
|
||||
try:
|
||||
client = self._get_async_client()
|
||||
|
||||
content = await file.aread()
|
||||
file_purpose = purpose or "user_data"
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
file_data.name = file.filename or "file"
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = await client.files.create(
|
||||
file=file_data,
|
||||
purpose=file_purpose,
|
||||
)
|
||||
|
||||
logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.id,
|
||||
file_uri=None,
|
||||
content_type=file.content_type,
|
||||
expires_at=None,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
except ImportError:
|
||||
raise
|
||||
except Exception as e:
|
||||
error_type = type(e).__name__
|
||||
if "RateLimit" in error_type or "APIConnection" in error_type:
|
||||
raise TransientUploadError(
|
||||
f"Transient upload error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "Authentication" in error_type or "Permission" in error_type:
|
||||
raise PermanentUploadError(
|
||||
f"Authentication/permission error: {e}", file_name=file.filename
|
||||
) from e
|
||||
if "BadRequest" in error_type or "InvalidRequest" in error_type:
|
||||
raise PermanentUploadError(
|
||||
f"Invalid request: {e}", file_name=file.filename
|
||||
) from e
|
||||
status_code = getattr(e, "status_code", None)
|
||||
if status_code is not None:
|
||||
if status_code >= 500 or status_code == 429:
|
||||
raise TransientUploadError(
|
||||
f"Server error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code in (401, 403):
|
||||
raise PermanentUploadError(
|
||||
f"Auth error ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
if status_code == 400:
|
||||
raise PermanentUploadError(
|
||||
f"Bad request ({status_code}): {e}", file_name=file.filename
|
||||
) from e
|
||||
raise TransientUploadError(
|
||||
f"Upload failed: {e}", file_name=file.filename
|
||||
) from e
|
||||
|
||||
async def adelete(self, file_id: str) -> bool:
|
||||
"""Async delete an uploaded file from OpenAI.
|
||||
|
||||
Args:
|
||||
file_id: The file ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_async_client()
|
||||
await client.files.delete(file_id)
|
||||
logger.info(f"Deleted OpenAI file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
|
||||
return False
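A small usage sketch for the uploader defined in this file, assuming OPENAI_API_KEY is set and a local report.pdf exists (both are assumptions, not part of the diff):

from pathlib import Path

from crewai.files import FilePath, PDFFile
from crewai.files.uploaders.openai import OpenAIFileUploader

uploader = OpenAIFileUploader()
pdf = PDFFile(source=FilePath(path=Path("report.pdf")))  # assumed local file

result = uploader.upload(pdf, purpose="user_data")
print(result.file_id, result.provider)   # e.g. "file-abc123" "openai"

uploader.delete(result.file_id)          # returns True on success, False otherwise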
|
||||
@@ -66,11 +66,11 @@ if TYPE_CHECKING:
    from litellm.utils import supports_response_schema

    from crewai.agent.core import Agent
    from crewai.files import FileInput, UploadCache
    from crewai.llms.hooks.base import BaseInterceptor
    from crewai.llms.providers.anthropic.completion import AnthropicThinkingConfig
    from crewai.task import Task
    from crewai.tools.base_tool import BaseTool
    from crewai.utilities.files import FileInput, UploadCache
    from crewai.utilities.types import LLMMessage

try:
@@ -2274,7 +2274,7 @@ class LLM(BaseLLM):
        """
        import base64

        from crewai.utilities.files import (
        from crewai.files import (
            FileResolver,
            FileResolverConfig,
            InlineBase64,
@@ -33,9 +33,9 @@ from crewai.types.usage_metrics import UsageMetrics

if TYPE_CHECKING:
    from crewai.agent.core import Agent
    from crewai.files import FileInput, UploadCache
    from crewai.task import Task
    from crewai.tools.base_tool import BaseTool
    from crewai.utilities.files import FileInput, UploadCache
    from crewai.utilities.types import LLMMessage

@@ -315,6 +315,25 @@ class BaseLLM(ABC):
|
||||
"""
|
||||
return []
|
||||
|
||||
async def aformat_multimodal_content(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
upload_cache: UploadCache | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Async format files as multimodal content blocks for the LLM.
|
||||
|
||||
Default implementation calls the sync version. Subclasses should
|
||||
override to use async file resolution for parallel processing.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping file names to FileInput objects.
|
||||
upload_cache: Optional cache for tracking uploaded files.
|
||||
|
||||
Returns:
|
||||
List of content blocks in the provider's expected format.
|
||||
"""
|
||||
return self.format_multimodal_content(files, upload_cache)
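Providers override this default to resolve files concurrently; the following sketch shows the general shape such an override takes for an inline-only provider (the "openai" provider key and the standalone function wrapper are illustrative, not the shipped code):

from typing import Any

from crewai.files import FileResolver, FileResolverConfig, InlineBase64

async def aformat_inline_images(files, upload_cache=None) -> list[dict[str, Any]]:
    # Resolve every file concurrently, then keep only inline base64 results.
    resolver = FileResolver(config=FileResolverConfig(prefer_upload=False), upload_cache=upload_cache)
    resolved = await resolver.aresolve_files(files, "openai")
    return [
        {"type": "image_url", "image_url": {"url": f"data:{r.content_type};base64,{r.data}"}}
        for r in resolved.values()
        if isinstance(r, InlineBase64)
    ]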
|
||||
|
||||
def format_text_content(self, text: str) -> dict[str, Any]:
|
||||
"""Format text as a content block for the LLM.
|
||||
|
||||
|
||||
@@ -20,8 +20,8 @@ from crewai.utilities.types import LLMMessage


if TYPE_CHECKING:
    from crewai.files import FileInput, UploadCache
    from crewai.llms.hooks.base import BaseInterceptor
    from crewai.utilities.files import FileInput, UploadCache

DEFAULT_CACHE_TTL = "ephemeral"

@@ -1281,7 +1281,7 @@ class AnthropicCompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

        from crewai.utilities.files import (
        from crewai.files import (
            FileReference,
            FileResolver,
            FileResolverConfig,
@@ -1370,3 +1370,107 @@ class AnthropicCompletion(BaseLLM):
|
||||
content_blocks.append(block)
|
||||
|
||||
return content_blocks
|
||||
|
||||
async def aformat_multimodal_content(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
upload_cache: UploadCache | None = None,
|
||||
enable_caching: bool = True,
|
||||
cache_ttl: str | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Async format files as Anthropic multimodal content blocks.
|
||||
|
||||
Uses parallel file resolution for improved performance with multiple files.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping file names to FileInput objects.
|
||||
upload_cache: Optional cache for tracking uploaded files.
|
||||
enable_caching: Whether to add cache_control markers (default: True).
|
||||
cache_ttl: Cache TTL - "ephemeral" (5min) or "1h" (1hr for supported models).
|
||||
|
||||
Returns:
|
||||
List of content blocks in Anthropic's expected format.
|
||||
"""
|
||||
if not self.supports_multimodal():
|
||||
return []
|
||||
|
||||
from crewai.files import (
|
||||
FileReference,
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
InlineBase64,
|
||||
)
|
||||
|
||||
supported_types = self.supported_multimodal_content_types()
|
||||
|
||||
supported_files = {
|
||||
name: f
|
||||
for name, f in files.items()
|
||||
if any(f.content_type.startswith(t) for t in supported_types)
|
||||
}
|
||||
|
||||
if not supported_files:
|
||||
return []
|
||||
|
||||
config = FileResolverConfig(prefer_upload=False)
|
||||
resolver = FileResolver(config=config, upload_cache=upload_cache)
|
||||
resolved_files = await resolver.aresolve_files(supported_files, "anthropic")
|
||||
|
||||
content_blocks: list[dict[str, Any]] = []
|
||||
num_files = len(resolved_files)
|
||||
file_names = list(supported_files.keys())
|
||||
|
||||
for i, name in enumerate(file_names):
|
||||
if name not in resolved_files:
|
||||
continue
|
||||
|
||||
resolved = resolved_files[name]
|
||||
file_input = supported_files[name]
|
||||
content_type = file_input.content_type
|
||||
block: dict[str, Any] = {}
|
||||
|
||||
if isinstance(resolved, FileReference):
|
||||
if content_type.startswith("image/"):
|
||||
block = {
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "file",
|
||||
"file_id": resolved.file_id,
|
||||
},
|
||||
}
|
||||
elif content_type == "application/pdf":
|
||||
block = {
|
||||
"type": "document",
|
||||
"source": {
|
||||
"type": "file",
|
||||
"file_id": resolved.file_id,
|
||||
},
|
||||
}
|
||||
elif isinstance(resolved, InlineBase64):
|
||||
if content_type.startswith("image/"):
|
||||
block = {
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": resolved.content_type,
|
||||
"data": resolved.data,
|
||||
},
|
||||
}
|
||||
elif content_type == "application/pdf":
|
||||
block = {
|
||||
"type": "document",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": resolved.content_type,
|
||||
"data": resolved.data,
|
||||
},
|
||||
}
|
||||
|
||||
if block and enable_caching and i == num_files - 1:
|
||||
cache_control: dict[str, str] = {"type": cache_ttl or DEFAULT_CACHE_TTL}
|
||||
block["cache_control"] = cache_control
|
||||
|
||||
if block:
|
||||
content_blocks.append(block)
|
||||
|
||||
return content_blocks
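For two inline files the method above yields blocks shaped roughly as follows; only the final block carries cache_control (base64 payloads shortened here):

blocks = [
    {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": "iVBORw0KGg..."}},
    {
        "type": "document",
        "source": {"type": "base64", "media_type": "application/pdf", "data": "JVBERi0xLj..."},
        "cache_control": {"type": "ephemeral"},  # added to the last block when enable_caching is True
    },
]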
|
||||
|
||||
@@ -18,8 +18,8 @@ from crewai.utilities.types import LLMMessage


if TYPE_CHECKING:
    from crewai.files import FileInput, UploadCache
    from crewai.llms.hooks.base import BaseInterceptor
    from crewai.utilities.files import FileInput, UploadCache


try:
@@ -1060,7 +1060,7 @@ class AzureCompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

        from crewai.utilities.files import (
        from crewai.files import (
            FileResolver,
            FileResolverConfig,
            InlineBase64,
@@ -1100,3 +1100,54 @@ class AzureCompletion(BaseLLM):
|
||||
)
|
||||
|
||||
return content_blocks
|
||||
|
||||
async def aformat_multimodal_content(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
upload_cache: UploadCache | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Async format files as Azure OpenAI multimodal content blocks.
|
||||
|
||||
Uses parallel file resolution for improved performance with multiple files.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping file names to FileInput objects.
|
||||
upload_cache: Optional cache (not used by Azure but kept for interface consistency).
|
||||
|
||||
Returns:
|
||||
List of content blocks in Azure OpenAI's expected format.
|
||||
"""
|
||||
if not self.supports_multimodal():
|
||||
return []
|
||||
|
||||
from crewai.files import (
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
InlineBase64,
|
||||
)
|
||||
|
||||
supported_types = self.supported_multimodal_content_types()
|
||||
|
||||
supported_files = {
|
||||
name: f
|
||||
for name, f in files.items()
|
||||
if any(f.content_type.startswith(t) for t in supported_types)
|
||||
}
|
||||
|
||||
if not supported_files:
|
||||
return []
|
||||
|
||||
config = FileResolverConfig(prefer_upload=False)
|
||||
resolver = FileResolver(config=config, upload_cache=upload_cache)
|
||||
resolved_files = await resolver.aresolve_files(supported_files, "azure")
|
||||
|
||||
return [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:{resolved.content_type};base64,{resolved.data}"
|
||||
},
|
||||
}
|
||||
for resolved in resolved_files.values()
|
||||
if isinstance(resolved, InlineBase64)
|
||||
]
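Azure only receives inline data URLs; building one of these blocks by hand looks like this (chart.png is an assumed local file):

import base64
from pathlib import Path

png_bytes = Path("chart.png").read_bytes()
block = {
    "type": "image_url",
    "image_url": {"url": f"data:image/png;base64,{base64.b64encode(png_bytes).decode()}"},
}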
|
||||
|
||||
@@ -32,8 +32,8 @@ if TYPE_CHECKING:
        ToolTypeDef,
    )

    from crewai.files import FileInput, UploadCache
    from crewai.llms.hooks.base import BaseInterceptor
    from crewai.utilities.files import FileInput, UploadCache


try:
@@ -1455,13 +1455,33 @@ class BedrockCompletion(BaseLLM):
|
||||
def supports_multimodal(self) -> bool:
|
||||
"""Check if the model supports multimodal inputs.
|
||||
|
||||
Claude models on Bedrock support vision.
|
||||
Claude 3+ and Nova Lite/Pro/Premier on Bedrock support vision.
|
||||
|
||||
Returns:
|
||||
True if the model supports images.
|
||||
"""
|
||||
vision_models = ("anthropic.claude-3",)
|
||||
return any(self.model.lower().startswith(m) for m in vision_models)
|
||||
model_lower = self.model.lower()
|
||||
vision_models = (
|
||||
"anthropic.claude-3",
|
||||
"amazon.nova-lite",
|
||||
"amazon.nova-pro",
|
||||
"amazon.nova-premier",
|
||||
"us.amazon.nova-lite",
|
||||
"us.amazon.nova-pro",
|
||||
"us.amazon.nova-premier",
|
||||
)
|
||||
return any(model_lower.startswith(m) for m in vision_models)
|
||||
|
||||
def _is_nova_model(self) -> bool:
|
||||
"""Check if the model is an Amazon Nova model.
|
||||
|
||||
Only Nova models support S3 links for multimedia.
|
||||
|
||||
Returns:
|
||||
True if the model is a Nova model.
|
||||
"""
|
||||
model_lower = self.model.lower()
|
||||
return "amazon.nova-" in model_lower
|
||||
|
||||
def supported_multimodal_content_types(self) -> list[str]:
|
||||
"""Get content types supported by Bedrock for multimodal input.
|
||||
@@ -1471,7 +1491,78 @@ class BedrockCompletion(BaseLLM):
|
||||
"""
|
||||
if not self.supports_multimodal():
|
||||
return []
|
||||
return ["image/", "application/pdf"]
|
||||
|
||||
types = ["image/png", "image/jpeg", "image/gif", "image/webp"]
|
||||
|
||||
if self._is_nova_model():
|
||||
types.extend(
|
||||
[
|
||||
"application/pdf",
|
||||
"text/csv",
|
||||
"text/plain",
|
||||
"text/markdown",
|
||||
"text/html",
|
||||
"application/msword",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.ms-excel",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"video/mp4",
|
||||
"video/quicktime",
|
||||
"video/x-matroska",
|
||||
"video/webm",
|
||||
"video/x-flv",
|
||||
"video/mpeg",
|
||||
"video/x-ms-wmv",
|
||||
"video/3gpp",
|
||||
]
|
||||
)
|
||||
else:
|
||||
types.append("application/pdf")
|
||||
|
||||
return types
|
||||
|
||||
def _get_document_format(self, content_type: str) -> str | None:
|
||||
"""Map content type to Bedrock document format.
|
||||
|
||||
Args:
|
||||
content_type: MIME type of the document.
|
||||
|
||||
Returns:
|
||||
Bedrock format string or None if unsupported.
|
||||
"""
|
||||
format_map = {
|
||||
"application/pdf": "pdf",
|
||||
"text/csv": "csv",
|
||||
"text/plain": "txt",
|
||||
"text/markdown": "md",
|
||||
"text/html": "html",
|
||||
"application/msword": "doc",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
||||
"application/vnd.ms-excel": "xls",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
||||
}
|
||||
return format_map.get(content_type)
|
||||
|
||||
def _get_video_format(self, content_type: str) -> str | None:
|
||||
"""Map content type to Bedrock video format.
|
||||
|
||||
Args:
|
||||
content_type: MIME type of the video.
|
||||
|
||||
Returns:
|
||||
Bedrock format string or None if unsupported.
|
||||
"""
|
||||
format_map = {
|
||||
"video/mp4": "mp4",
|
||||
"video/quicktime": "mov",
|
||||
"video/x-matroska": "mkv",
|
||||
"video/webm": "webm",
|
||||
"video/x-flv": "flv",
|
||||
"video/mpeg": "mpeg",
|
||||
"video/x-ms-wmv": "wmv",
|
||||
"video/3gpp": "three_gp",
|
||||
}
|
||||
return format_map.get(content_type)
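A standalone illustration of the video-format lookup above, with values copied from the map (unsupported types return None and the file is skipped):

BEDROCK_VIDEO_FORMATS = {
    "video/mp4": "mp4",
    "video/quicktime": "mov",
    "video/webm": "webm",
    "video/3gpp": "three_gp",
}

def video_format(content_type: str) -> str | None:
    # None means Bedrock has no format string for this type, so no block is emitted.
    return BEDROCK_VIDEO_FORMATS.get(content_type)

print(video_format("video/quicktime"))  # mov
print(video_format("video/avi"))        # None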
|
||||
|
||||
def format_multimodal_content(
|
||||
self,
|
||||
@@ -1480,12 +1571,12 @@ class BedrockCompletion(BaseLLM):
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Format files as Bedrock Converse API multimodal content blocks.
|
||||
|
||||
Bedrock Converse API uses specific formats for images and documents with raw bytes.
|
||||
Uses FileResolver to get InlineBytes format for Bedrock's byte-based API.
|
||||
Bedrock Converse API supports both raw bytes and S3 URI references.
|
||||
S3 uploads are only supported by Amazon Nova models.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping file names to FileInput objects.
|
||||
upload_cache: Optional cache (not used by Bedrock but kept for interface consistency).
|
||||
upload_cache: Optional cache for S3 uploads.
|
||||
|
||||
Returns:
|
||||
List of content blocks in Bedrock's expected format.
|
||||
@@ -1493,50 +1584,239 @@ class BedrockCompletion(BaseLLM):
|
||||
if not self.supports_multimodal():
|
||||
return []
|
||||
|
||||
from crewai.utilities.files import (
|
||||
import os
|
||||
|
||||
from crewai.files import (
|
||||
FileReference,
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
InlineBytes,
|
||||
)
|
||||
|
||||
content_blocks: list[dict[str, Any]] = []
|
||||
is_nova = self._is_nova_model()
|
||||
|
||||
# Bedrock uses raw bytes, configure resolver accordingly
|
||||
config = FileResolverConfig(prefer_upload=False, use_bytes_for_bedrock=True)
|
||||
s3_bucket = os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
|
||||
s3_bucket_owner = os.environ.get("CREWAI_BEDROCK_S3_BUCKET_OWNER")
|
||||
prefer_upload = bool(s3_bucket) and is_nova
|
||||
|
||||
config = FileResolverConfig(
|
||||
prefer_upload=prefer_upload, use_bytes_for_bedrock=True
|
||||
)
|
||||
resolver = FileResolver(config=config, upload_cache=upload_cache)
|
||||
|
||||
for name, file_input in files.items():
|
||||
content_type = file_input.content_type
|
||||
|
||||
resolved = resolver.resolve(file_input, "bedrock")
|
||||
|
||||
if isinstance(resolved, InlineBytes):
|
||||
file_bytes = resolved.data
|
||||
else:
|
||||
# Fallback to reading directly
|
||||
file_bytes = file_input.read()
|
||||
if isinstance(resolved, FileReference) and resolved.file_uri:
|
||||
s3_location: dict[str, Any] = {"uri": resolved.file_uri}
|
||||
if s3_bucket_owner:
|
||||
s3_location["bucketOwner"] = s3_bucket_owner
|
||||
|
||||
if content_type.startswith("image/"):
|
||||
media_type = content_type.split("/")[-1]
|
||||
if media_type == "jpg":
|
||||
media_type = "jpeg"
|
||||
content_blocks.append(
|
||||
{
|
||||
"image": {
|
||||
"format": media_type,
|
||||
"source": {"bytes": file_bytes},
|
||||
if content_type.startswith("image/"):
|
||||
media_type = content_type.split("/")[-1]
|
||||
if media_type == "jpg":
|
||||
media_type = "jpeg"
|
||||
content_blocks.append(
|
||||
{
|
||||
"image": {
|
||||
"format": media_type,
|
||||
"source": {"s3Location": s3_location},
|
||||
}
|
||||
}
|
||||
}
|
||||
)
|
||||
elif content_type == "application/pdf":
|
||||
content_blocks.append(
|
||||
{
|
||||
"document": {
|
||||
"name": name,
|
||||
"format": "pdf",
|
||||
"source": {"bytes": file_bytes},
|
||||
)
|
||||
elif content_type.startswith("video/"):
|
||||
video_format = self._get_video_format(content_type)
|
||||
if video_format:
|
||||
content_blocks.append(
|
||||
{
|
||||
"video": {
|
||||
"format": video_format,
|
||||
"source": {"s3Location": s3_location},
|
||||
}
|
||||
}
|
||||
)
|
||||
else:
|
||||
doc_format = self._get_document_format(content_type)
|
||||
if doc_format:
|
||||
content_blocks.append(
|
||||
{
|
||||
"document": {
|
||||
"name": name,
|
||||
"format": doc_format,
|
||||
"source": {"s3Location": s3_location},
|
||||
}
|
||||
}
|
||||
)
|
||||
else:
|
||||
if isinstance(resolved, InlineBytes):
|
||||
file_bytes = resolved.data
|
||||
else:
|
||||
file_bytes = file_input.read()
|
||||
|
||||
if content_type.startswith("image/"):
|
||||
media_type = content_type.split("/")[-1]
|
||||
if media_type == "jpg":
|
||||
media_type = "jpeg"
|
||||
content_blocks.append(
|
||||
{
|
||||
"image": {
|
||||
"format": media_type,
|
||||
"source": {"bytes": file_bytes},
|
||||
}
|
||||
}
|
||||
}
|
||||
)
|
||||
)
|
||||
elif content_type.startswith("video/"):
|
||||
video_format = self._get_video_format(content_type)
|
||||
if video_format:
|
||||
content_blocks.append(
|
||||
{
|
||||
"video": {
|
||||
"format": video_format,
|
||||
"source": {"bytes": file_bytes},
|
||||
}
|
||||
}
|
||||
)
|
||||
else:
|
||||
doc_format = self._get_document_format(content_type)
|
||||
if doc_format:
|
||||
content_blocks.append(
|
||||
{
|
||||
"document": {
|
||||
"name": name,
|
||||
"format": doc_format,
|
||||
"source": {"bytes": file_bytes},
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
return content_blocks
|
||||
|
||||
async def aformat_multimodal_content(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
upload_cache: UploadCache | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Async format files as Bedrock Converse API multimodal content blocks.
|
||||
|
||||
Uses parallel file resolution. S3 uploads are only supported by Nova models.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping file names to FileInput objects.
|
||||
upload_cache: Optional cache for S3 uploads.
|
||||
|
||||
Returns:
|
||||
List of content blocks in Bedrock's expected format.
|
||||
"""
|
||||
if not self.supports_multimodal():
|
||||
return []
|
||||
|
||||
import os
|
||||
|
||||
from crewai.files import (
|
||||
FileReference,
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
InlineBytes,
|
||||
)
|
||||
|
||||
is_nova = self._is_nova_model()
|
||||
s3_bucket = os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
|
||||
s3_bucket_owner = os.environ.get("CREWAI_BEDROCK_S3_BUCKET_OWNER")
|
||||
prefer_upload = bool(s3_bucket) and is_nova
|
||||
|
||||
config = FileResolverConfig(
|
||||
prefer_upload=prefer_upload, use_bytes_for_bedrock=True
|
||||
)
|
||||
resolver = FileResolver(config=config, upload_cache=upload_cache)
|
||||
resolved_files = await resolver.aresolve_files(files, "bedrock")
|
||||
|
||||
content_blocks: list[dict[str, Any]] = []
|
||||
for name, resolved in resolved_files.items():
|
||||
file_input = files[name]
|
||||
content_type = file_input.content_type
|
||||
|
||||
if isinstance(resolved, FileReference) and resolved.file_uri:
|
||||
s3_location: dict[str, Any] = {"uri": resolved.file_uri}
|
||||
if s3_bucket_owner:
|
||||
s3_location["bucketOwner"] = s3_bucket_owner
|
||||
|
||||
if content_type.startswith("image/"):
|
||||
media_type = content_type.split("/")[-1]
|
||||
if media_type == "jpg":
|
||||
media_type = "jpeg"
|
||||
content_blocks.append(
|
||||
{
|
||||
"image": {
|
||||
"format": media_type,
|
||||
"source": {"s3Location": s3_location},
|
||||
}
|
||||
}
|
||||
)
|
||||
elif content_type.startswith("video/"):
|
||||
video_format = self._get_video_format(content_type)
|
||||
if video_format:
|
||||
content_blocks.append(
|
||||
{
|
||||
"video": {
|
||||
"format": video_format,
|
||||
"source": {"s3Location": s3_location},
|
||||
}
|
||||
}
|
||||
)
|
||||
else:
|
||||
doc_format = self._get_document_format(content_type)
|
||||
if doc_format:
|
||||
content_blocks.append(
|
||||
{
|
||||
"document": {
|
||||
"name": name,
|
||||
"format": doc_format,
|
||||
"source": {"s3Location": s3_location},
|
||||
}
|
||||
}
|
||||
)
|
||||
else:
|
||||
if isinstance(resolved, InlineBytes):
|
||||
file_bytes = resolved.data
|
||||
else:
|
||||
file_bytes = await file_input.aread()
|
||||
|
||||
if content_type.startswith("image/"):
|
||||
media_type = content_type.split("/")[-1]
|
||||
if media_type == "jpg":
|
||||
media_type = "jpeg"
|
||||
content_blocks.append(
|
||||
{
|
||||
"image": {
|
||||
"format": media_type,
|
||||
"source": {"bytes": file_bytes},
|
||||
}
|
||||
}
|
||||
)
|
||||
elif content_type.startswith("video/"):
|
||||
video_format = self._get_video_format(content_type)
|
||||
if video_format:
|
||||
content_blocks.append(
|
||||
{
|
||||
"video": {
|
||||
"format": video_format,
|
||||
"source": {"bytes": file_bytes},
|
||||
}
|
||||
}
|
||||
)
|
||||
else:
|
||||
doc_format = self._get_document_format(content_type)
|
||||
if doc_format:
|
||||
content_blocks.append(
|
||||
{
|
||||
"document": {
|
||||
"name": name,
|
||||
"format": doc_format,
|
||||
"source": {"bytes": file_bytes},
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
return content_blocks
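Whether the Bedrock formatters emit s3Location references or raw bytes depends on the model family and two environment variables; a sketch of opting in (bucket name and account id are placeholders):

import os

os.environ["CREWAI_BEDROCK_S3_BUCKET"] = "my-crew-files"        # enables prefer_upload, Nova models only
os.environ["CREWAI_BEDROCK_S3_BUCKET_OWNER"] = "123456789012"   # optional; adds bucketOwner to s3Location
# Without the bucket, or on non-Nova models, files are sent as raw bytes blocks instead.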
|
||||
|
||||
@@ -19,11 +19,11 @@ from crewai.utilities.types import LLMMessage


if TYPE_CHECKING:
    from crewai.llms.hooks.base import BaseInterceptor
    from crewai.utilities.files import (
    from crewai.files import (
        FileInput,
        UploadCache,
    )
    from crewai.llms.hooks.base import BaseInterceptor


try:
@@ -1113,7 +1113,7 @@ class GeminiCompletion(BaseLLM):
        Returns:
            List of content blocks in Gemini's expected format.
        """
        from crewai.utilities.files import (
        from crewai.files import (
            FileReference,
            FileResolver,
            FileResolverConfig,
@@ -1123,7 +1123,6 @@ class GeminiCompletion(BaseLLM):
        content_blocks: list[dict[str, Any]] = []
        supported_types = self.supported_multimodal_content_types()

        # Create resolver with optional cache
        config = FileResolverConfig(prefer_upload=False)
        resolver = FileResolver(config=config, upload_cache=upload_cache)

@@ -1168,6 +1167,67 @@ class GeminiCompletion(BaseLLM):
|
||||
|
||||
return content_blocks
|
||||
|
||||
async def aformat_multimodal_content(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
upload_cache: UploadCache | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Async format files as Gemini multimodal content blocks.
|
||||
|
||||
Uses parallel file resolution for improved performance with multiple files.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping file names to FileInput objects.
|
||||
upload_cache: Optional cache for tracking uploaded files.
|
||||
|
||||
Returns:
|
||||
List of content blocks in Gemini's expected format.
|
||||
"""
|
||||
from crewai.files import (
|
||||
FileReference,
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
InlineBase64,
|
||||
)
|
||||
|
||||
supported_types = self.supported_multimodal_content_types()
|
||||
|
||||
supported_files = {
|
||||
name: f
|
||||
for name, f in files.items()
|
||||
if any(f.content_type.startswith(t) for t in supported_types)
|
||||
}
|
||||
|
||||
if not supported_files:
|
||||
return []
|
||||
|
||||
config = FileResolverConfig(prefer_upload=False)
|
||||
resolver = FileResolver(config=config, upload_cache=upload_cache)
|
||||
resolved_files = await resolver.aresolve_files(supported_files, "gemini")
|
||||
|
||||
content_blocks: list[dict[str, Any]] = []
|
||||
for resolved in resolved_files.values():
|
||||
if isinstance(resolved, FileReference) and resolved.file_uri:
|
||||
content_blocks.append(
|
||||
{
|
||||
"fileData": {
|
||||
"mimeType": resolved.content_type,
|
||||
"fileUri": resolved.file_uri,
|
||||
}
|
||||
}
|
||||
)
|
||||
elif isinstance(resolved, InlineBase64):
|
||||
content_blocks.append(
|
||||
{
|
||||
"inlineData": {
|
||||
"mimeType": resolved.content_type,
|
||||
"data": resolved.data,
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
return content_blocks
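The two block shapes produced above, depending on whether the resolver returned an uploaded file reference or inline data (URI and payload shortened, illustrative only):

file_data_block = {
    "fileData": {"mimeType": "video/mp4", "fileUri": "https://generativelanguage.googleapis.com/v1beta/files/abc123"}
}
inline_data_block = {
    "inlineData": {"mimeType": "image/png", "data": "iVBORw0KGg..."}
}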
|
||||
|
||||
def format_text_content(self, text: str) -> dict[str, Any]:
|
||||
"""Format text as a Gemini content block.
|
||||
|
||||
|
||||
@@ -28,10 +28,10 @@ from crewai.utilities.types import LLMMessage

if TYPE_CHECKING:
    from crewai.agent.core import Agent
    from crewai.files import FileInput, UploadCache
    from crewai.llms.hooks.base import BaseInterceptor
    from crewai.task import Task
    from crewai.tools.base_tool import BaseTool
    from crewai.utilities.files import FileInput, UploadCache


class OpenAICompletion(BaseLLM):
@@ -1100,7 +1100,7 @@ class OpenAICompletion(BaseLLM):
        if not self.supports_multimodal():
            return []

        from crewai.utilities.files import (
        from crewai.files import (
            FileReference,
            FileResolver,
            FileResolverConfig,
@@ -1148,3 +1148,67 @@ class OpenAICompletion(BaseLLM):
|
||||
)
|
||||
|
||||
return content_blocks
|
||||
|
||||
async def aformat_multimodal_content(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
upload_cache: UploadCache | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Async format files as OpenAI multimodal content blocks.
|
||||
|
||||
Uses parallel file resolution for improved performance with multiple files.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping file names to FileInput objects.
|
||||
upload_cache: Optional cache for tracking uploaded files.
|
||||
|
||||
Returns:
|
||||
List of content blocks in OpenAI's expected format.
|
||||
"""
|
||||
if not self.supports_multimodal():
|
||||
return []
|
||||
|
||||
from crewai.files import (
|
||||
FileReference,
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
InlineBase64,
|
||||
)
|
||||
|
||||
supported_types = self.supported_multimodal_content_types()
|
||||
|
||||
supported_files = {
|
||||
name: f
|
||||
for name, f in files.items()
|
||||
if any(f.content_type.startswith(t) for t in supported_types)
|
||||
}
|
||||
|
||||
if not supported_files:
|
||||
return []
|
||||
|
||||
config = FileResolverConfig(prefer_upload=False)
|
||||
resolver = FileResolver(config=config, upload_cache=upload_cache)
|
||||
resolved_files = await resolver.aresolve_files(supported_files, "openai")
|
||||
|
||||
content_blocks: list[dict[str, Any]] = []
|
||||
for resolved in resolved_files.values():
|
||||
if isinstance(resolved, FileReference):
|
||||
content_blocks.append(
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": resolved.file_id,
|
||||
},
|
||||
}
|
||||
)
|
||||
elif isinstance(resolved, InlineBase64):
|
||||
content_blocks.append(
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:{resolved.content_type};base64,{resolved.data}"
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
return content_blocks
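And the corresponding OpenAI shapes, an uploaded-file reference versus an inline data URL (values illustrative):

file_block = {"type": "file", "file": {"file_id": "file-abc123"}}
image_block = {
    "type": "image_url",
    "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRg..."},
}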
|
||||
|
||||
@@ -37,6 +37,12 @@ from crewai.events.types.task_events import (
    TaskFailedEvent,
    TaskStartedEvent,
)
from crewai.files import (
    FileInput,
    FilePath,
    FileSourceInput,
    normalize_input_files,
)
from crewai.security import Fingerprint, SecurityConfig
from crewai.tasks.output_format import OutputFormat
from crewai.tasks.task_output import TaskOutput
@@ -49,12 +55,6 @@ from crewai.utilities.file_store import (
    get_all_files,
    store_task_files,
)
from crewai.utilities.files import (
    FileInput,
    FilePath,
    FileSourceInput,
    normalize_input_files,
)
from crewai.utilities.guardrail import (
    process_guardrail,
)

@@ -11,7 +11,7 @@ from crewai.tools.base_tool import BaseTool


if TYPE_CHECKING:
    from crewai.utilities.files import FileInput
    from crewai.files import FileInput


class ReadFileToolSchema(BaseModel):

@@ -13,7 +13,7 @@ from aiocache.serializers import PickleSerializer  # type: ignore[import-untyped]


if TYPE_CHECKING:
    from crewai.utilities.files import FileInput
    from crewai.files import FileInput

_file_store = Cache(Cache.MEMORY, serializer=PickleSerializer())

@@ -1,207 +1,25 @@
|
||||
"""File handling utilities for crewAI tasks."""
|
||||
"""Backwards compatibility re-exports from crewai.files.
|
||||
|
||||
from crewai.utilities.files.cleanup import (
|
||||
cleanup_expired_files,
|
||||
cleanup_provider_files,
|
||||
cleanup_uploaded_files,
|
||||
)
|
||||
from crewai.utilities.files.content_types import (
|
||||
AudioContentType,
|
||||
AudioExtension,
|
||||
AudioFile,
|
||||
BaseFile,
|
||||
File,
|
||||
FileMode,
|
||||
ImageContentType,
|
||||
ImageExtension,
|
||||
ImageFile,
|
||||
PDFContentType,
|
||||
PDFExtension,
|
||||
PDFFile,
|
||||
TextContentType,
|
||||
TextExtension,
|
||||
TextFile,
|
||||
VideoContentType,
|
||||
VideoExtension,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.utilities.files.file import (
|
||||
FileBytes,
|
||||
FilePath,
|
||||
FileSource,
|
||||
FileSourceInput,
|
||||
FileStream,
|
||||
RawFileInput,
|
||||
)
|
||||
from crewai.utilities.files.processing import (
|
||||
ANTHROPIC_CONSTRAINTS,
|
||||
BEDROCK_CONSTRAINTS,
|
||||
GEMINI_CONSTRAINTS,
|
||||
OPENAI_CONSTRAINTS,
|
||||
AudioConstraints,
|
||||
FileHandling,
|
||||
FileProcessingError,
|
||||
FileProcessor,
|
||||
FileTooLargeError,
|
||||
FileValidationError,
|
||||
ImageConstraints,
|
||||
PDFConstraints,
|
||||
ProcessingDependencyError,
|
||||
ProviderConstraints,
|
||||
UnsupportedFileTypeError,
|
||||
VideoConstraints,
|
||||
get_constraints_for_provider,
|
||||
)
|
||||
from crewai.utilities.files.resolved import (
|
||||
FileReference,
|
||||
InlineBase64,
|
||||
InlineBytes,
|
||||
ResolvedFile,
|
||||
ResolvedFileType,
|
||||
UrlReference,
|
||||
)
|
||||
from crewai.utilities.files.resolver import (
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
create_resolver,
|
||||
)
|
||||
from crewai.utilities.files.upload_cache import (
|
||||
CachedUpload,
|
||||
UploadCache,
|
||||
get_upload_cache,
|
||||
reset_upload_cache,
|
||||
)
|
||||
from crewai.utilities.files.uploaders import FileUploader, UploadResult, get_uploader
|
||||
Deprecated: Import from crewai.files instead.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
from typing_extensions import deprecated
|
||||
|
||||
import crewai.files as _files
|
||||
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
@deprecated("crewai.utilities.files is deprecated. Import from crewai.files instead.")
|
||||
class _DeprecatedModule:
|
||||
"""Deprecated module wrapper."""
|
||||
|
||||
def __getattr__(self, name: str) -> Any:
|
||||
return getattr(_files, name)
|
||||
|
||||
def __dir__(self) -> list[str]:
|
||||
return list(_files.__all__)
|
||||
|
||||
|
||||
def wrap_file_source(source: FileSource) -> FileInput:
|
||||
"""Wrap a FileSource in the appropriate typed FileInput wrapper.
|
||||
|
||||
Args:
|
||||
source: The file source to wrap.
|
||||
|
||||
Returns:
|
||||
Typed FileInput wrapper based on content type.
|
||||
"""
|
||||
content_type = source.content_type
|
||||
|
||||
if content_type.startswith("image/"):
|
||||
return ImageFile(source=source)
|
||||
if content_type.startswith("audio/"):
|
||||
return AudioFile(source=source)
|
||||
if content_type.startswith("video/"):
|
||||
return VideoFile(source=source)
|
||||
if content_type == "application/pdf":
|
||||
return PDFFile(source=source)
|
||||
return TextFile(source=source)
|
||||
|
||||
|
||||
def normalize_input_files(
|
||||
input_files: list[FileSourceInput | FileInput],
|
||||
) -> dict[str, FileInput]:
|
||||
"""Convert a list of file sources to a named dictionary of FileInputs.
|
||||
|
||||
Args:
|
||||
input_files: List of file source inputs or File objects.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping names to FileInput wrappers.
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
result: dict[str, FileInput] = {}
|
||||
|
||||
for i, item in enumerate(input_files):
|
||||
if isinstance(item, BaseFile):
|
||||
name = item.filename or f"file_{i}"
|
||||
if "." in name:
|
||||
name = name.rsplit(".", 1)[0]
|
||||
result[name] = item
|
||||
continue
|
||||
|
||||
file_source: FilePath | FileBytes | FileStream
|
||||
if isinstance(item, (FilePath, FileBytes, FileStream)):
|
||||
file_source = item
|
||||
elif isinstance(item, Path):
|
||||
file_source = FilePath(path=item)
|
||||
elif isinstance(item, str):
|
||||
file_source = FilePath(path=Path(item))
|
||||
elif isinstance(item, (bytes, memoryview)):
|
||||
file_source = FileBytes(data=bytes(item))
|
||||
else:
|
||||
continue
|
||||
|
||||
name = file_source.filename or f"file_{i}"
|
||||
result[name] = wrap_file_source(file_source)
|
||||
|
||||
return result
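This helper, now exported from crewai.files, turns a mixed list of paths and raw bytes into the named mapping the rest of the pipeline consumes; a usage sketch (the file paths are assumptions, and keys follow the filename / file_<index> fallback described above):

from pathlib import Path

from crewai.files import normalize_input_files

files = normalize_input_files([
    "diagram.png",                # str path, wrapped as an ImageFile
    Path("reports/summary.pdf"),  # Path, wrapped as a PDFFile
    b"plain text payload",        # raw bytes, keyed "file_2" since no filename is available
])
for name, f in files.items():
    print(name, f.content_type)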
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ANTHROPIC_CONSTRAINTS",
|
||||
"BEDROCK_CONSTRAINTS",
|
||||
"GEMINI_CONSTRAINTS",
|
||||
"OPENAI_CONSTRAINTS",
|
||||
"AudioConstraints",
|
||||
"AudioContentType",
|
||||
"AudioExtension",
|
||||
"AudioFile",
|
||||
"BaseFile",
|
||||
"CachedUpload",
|
||||
"File",
|
||||
"FileBytes",
|
||||
"FileHandling",
|
||||
"FileInput",
|
||||
"FileMode",
|
||||
"FilePath",
|
||||
"FileProcessingError",
|
||||
"FileProcessor",
|
||||
"FileReference",
|
||||
"FileResolver",
|
||||
"FileResolverConfig",
|
||||
"FileSource",
|
||||
"FileSourceInput",
|
||||
"FileStream",
|
||||
"FileTooLargeError",
|
||||
"FileUploader",
|
||||
"FileValidationError",
|
||||
"ImageConstraints",
|
||||
"ImageContentType",
|
||||
"ImageExtension",
|
||||
"ImageFile",
|
||||
"InlineBase64",
|
||||
"InlineBytes",
|
||||
"PDFConstraints",
|
||||
"PDFContentType",
|
||||
"PDFExtension",
|
||||
"PDFFile",
|
||||
"ProcessingDependencyError",
|
||||
"ProviderConstraints",
|
||||
"RawFileInput",
|
||||
"ResolvedFile",
|
||||
"ResolvedFileType",
|
||||
"TextContentType",
|
||||
"TextExtension",
|
||||
"TextFile",
|
||||
"UnsupportedFileTypeError",
|
||||
"UploadCache",
|
||||
"UploadResult",
|
||||
"UrlReference",
|
||||
"VideoConstraints",
|
||||
"VideoContentType",
|
||||
"VideoExtension",
|
||||
"VideoFile",
|
||||
"cleanup_expired_files",
|
||||
"cleanup_provider_files",
|
||||
"cleanup_uploaded_files",
|
||||
"create_resolver",
|
||||
"get_constraints_for_provider",
|
||||
"get_upload_cache",
|
||||
"get_uploader",
|
||||
"normalize_input_files",
|
||||
"reset_upload_cache",
|
||||
"wrap_file_source",
|
||||
]
|
||||
sys.modules[__name__] = _DeprecatedModule() # type: ignore[assignment]
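With the module object swapped out, old import paths keep resolving to the same objects as the new package; a quick check (sketch):

from crewai.files import ImageFile as new_image_file
from crewai.utilities.files import ImageFile as old_image_file  # served via _DeprecatedModule.__getattr__

assert old_image_file is new_image_file  # identical class; only the import path is deprecated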

258
lib/crewai/src/crewai/utilities/files/__init__.pyi
Normal file
@@ -0,0 +1,258 @@
|
||||
"""Type stubs for backwards compatibility re-exports from crewai.files.
|
||||
|
||||
.. deprecated::
|
||||
Import from crewai.files instead.
|
||||
"""
|
||||
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
from typing_extensions import deprecated
|
||||
|
||||
import crewai.files as _files
|
||||
|
||||
FileMode = Literal["strict", "auto", "warn", "chunk"]
|
||||
ImageExtension = _files.ImageExtension
|
||||
ImageContentType = _files.ImageContentType
|
||||
PDFExtension = _files.PDFExtension
|
||||
PDFContentType = _files.PDFContentType
|
||||
TextExtension = _files.TextExtension
|
||||
TextContentType = _files.TextContentType
|
||||
AudioExtension = _files.AudioExtension
|
||||
AudioContentType = _files.AudioContentType
|
||||
VideoExtension = _files.VideoExtension
|
||||
VideoContentType = _files.VideoContentType
|
||||
FileInput = _files.FileInput
|
||||
FileSource = _files.FileSource
|
||||
FileSourceInput = _files.FileSourceInput
|
||||
RawFileInput = _files.RawFileInput
|
||||
ResolvedFileType = _files.ResolvedFileType
|
||||
FileHandling = _files.FileHandling
|
||||
|
||||
# Deprecated classes
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class BaseFile(_files.BaseFile):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class ImageFile(_files.ImageFile):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class PDFFile(_files.PDFFile):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class TextFile(_files.TextFile):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class AudioFile(_files.AudioFile):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class VideoFile(_files.VideoFile):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class File(_files.File):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FilePath(_files.FilePath):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileBytes(_files.FileBytes):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileStream(_files.FileStream):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileResolver(_files.FileResolver):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileResolverConfig(_files.FileResolverConfig):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileProcessor(_files.FileProcessor):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileUploader(_files.FileUploader):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class UploadCache(_files.UploadCache):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class CachedUpload(_files.CachedUpload):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class UploadResult(_files.UploadResult):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class ResolvedFile(_files.ResolvedFile):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileReference(_files.FileReference):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class UrlReference(_files.UrlReference):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class InlineBase64(_files.InlineBase64):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class InlineBytes(_files.InlineBytes):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class ProviderConstraints(_files.ProviderConstraints):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class ImageConstraints(_files.ImageConstraints):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class AudioConstraints(_files.AudioConstraints):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class VideoConstraints(_files.VideoConstraints):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class PDFConstraints(_files.PDFConstraints):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
# Exceptions
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileProcessingError(_files.FileProcessingError):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileValidationError(_files.FileValidationError):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class FileTooLargeError(_files.FileTooLargeError):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class UnsupportedFileTypeError(_files.UnsupportedFileTypeError):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
class ProcessingDependencyError(_files.ProcessingDependencyError):
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
# Constants
OPENAI_CONSTRAINTS: _files.ProviderConstraints
ANTHROPIC_CONSTRAINTS: _files.ProviderConstraints
GEMINI_CONSTRAINTS: _files.ProviderConstraints
BEDROCK_CONSTRAINTS: _files.ProviderConstraints

# Deprecated functions
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def create_resolver(
|
||||
provider: str,
|
||||
config: FileResolverConfig | None = None,
|
||||
) -> FileResolver:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def get_upload_cache() -> UploadCache:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def reset_upload_cache() -> None:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def get_constraints_for_provider(provider: str) -> ProviderConstraints:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def cleanup_uploaded_files(provider: str | None = None) -> int:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def cleanup_expired_files() -> int:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def cleanup_provider_files(provider: str) -> int:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def normalize_input_files(
|
||||
input_files: list[FileSourceInput | FileInput],
|
||||
) -> dict[str, FileInput]:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
@deprecated("Import from crewai.files instead")
|
||||
def wrap_file_source(source: FileSource) -> FileInput:
|
||||
""".. deprecated:: Import from crewai.files instead."""
|
||||
...
|
||||
|
||||
__all__: list[str]
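
The stub above only re-exports the public names with a deprecation notice, so migrating is a one-line import change. A minimal sketch, mirroring the constructor patterns used in the tests further down (the bytes are placeholders):

# Deprecated, still works through the crewai.utilities.files shim:
#   from crewai.utilities.files import ImageFile, TextFile, get_upload_cache
# Preferred import path after this change:
from crewai.files import FileBytes, ImageFile, TextFile, get_upload_cache

notes = TextFile(source=b"Q3 revenue grew 8%", filename="notes.txt")
chart = ImageFile(source=FileBytes(data=b"\x89PNG\r\n\x1a\n...", filename="chart.png"))  # placeholder PNG bytes

cache = get_upload_cache()  # shared UploadCache used when a provider supports file uploads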
|
||||
@@ -1,180 +0,0 @@
|
||||
"""Cleanup utilities for uploaded files."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from crewai.utilities.files.upload_cache import CachedUpload, UploadCache
|
||||
from crewai.utilities.files.uploaders import get_uploader
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from crewai.utilities.files.uploaders.base import FileUploader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _safe_delete(
|
||||
uploader: FileUploader,
|
||||
file_id: str,
|
||||
provider: str,
|
||||
) -> bool:
|
||||
"""Safely delete a file, logging any errors.
|
||||
|
||||
Args:
|
||||
uploader: The file uploader to use.
|
||||
file_id: The file ID to delete.
|
||||
provider: Provider name for logging.
|
||||
|
||||
Returns:
|
||||
True if deleted successfully, False otherwise.
|
||||
"""
|
||||
try:
|
||||
if uploader.delete(file_id):
|
||||
logger.debug(f"Deleted {file_id} from {provider}")
|
||||
return True
|
||||
logger.warning(f"Failed to delete {file_id} from {provider}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"Error deleting {file_id} from {provider}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def cleanup_uploaded_files(
|
||||
cache: UploadCache,
|
||||
*,
|
||||
delete_from_provider: bool = True,
|
||||
providers: list[str] | None = None,
|
||||
) -> int:
|
||||
"""Clean up uploaded files from the cache and optionally from providers.
|
||||
|
||||
Args:
|
||||
cache: The upload cache to clean up.
|
||||
delete_from_provider: If True, delete files from the provider as well.
|
||||
providers: Optional list of providers to clean up. If None, cleans all.
|
||||
|
||||
Returns:
|
||||
Number of files cleaned up.
|
||||
"""
|
||||
cleaned = 0
|
||||
|
||||
provider_uploads: dict[str, list[CachedUpload]] = {}
|
||||
|
||||
for provider in _get_providers_from_cache(cache):
|
||||
if providers is not None and provider not in providers:
|
||||
continue
|
||||
provider_uploads[provider] = cache.get_all_for_provider(provider)
|
||||
|
||||
if delete_from_provider:
|
||||
for provider, uploads in provider_uploads.items():
|
||||
uploader = get_uploader(provider)
|
||||
if uploader is None:
|
||||
logger.warning(
|
||||
f"No uploader available for {provider}, skipping cleanup"
|
||||
)
|
||||
continue
|
||||
|
||||
for upload in uploads:
|
||||
if _safe_delete(uploader, upload.file_id, provider):
|
||||
cleaned += 1
|
||||
|
||||
cache.clear()
|
||||
|
||||
logger.info(f"Cleaned up {cleaned} uploaded files")
|
||||
return cleaned
|
||||
|
||||
|
||||
def cleanup_expired_files(
|
||||
cache: UploadCache,
|
||||
*,
|
||||
delete_from_provider: bool = False,
|
||||
) -> int:
|
||||
"""Clean up expired files from the cache.
|
||||
|
||||
Args:
|
||||
cache: The upload cache to clean up.
|
||||
delete_from_provider: If True, attempt to delete from provider as well.
|
||||
Note: Expired files may already be deleted by the provider.
|
||||
|
||||
Returns:
|
||||
Number of expired entries removed from cache.
|
||||
"""
|
||||
expired_entries: list[CachedUpload] = []
|
||||
|
||||
if delete_from_provider:
|
||||
for provider in _get_providers_from_cache(cache):
|
||||
expired_entries.extend(
|
||||
upload
|
||||
for upload in cache.get_all_for_provider(provider)
|
||||
if upload.is_expired()
|
||||
)
|
||||
|
||||
removed = cache.clear_expired()
|
||||
|
||||
if delete_from_provider:
|
||||
for upload in expired_entries:
|
||||
uploader = get_uploader(upload.provider)
|
||||
if uploader is not None:
|
||||
try:
|
||||
uploader.delete(upload.file_id)
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not delete expired file {upload.file_id}: {e}")
|
||||
|
||||
return removed
|
||||
|
||||
|
||||
def cleanup_provider_files(
|
||||
provider: str,
|
||||
*,
|
||||
cache: UploadCache | None = None,
|
||||
delete_all_from_provider: bool = False,
|
||||
) -> int:
|
||||
"""Clean up all files for a specific provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name to clean up.
|
||||
cache: Optional upload cache to clear entries from.
|
||||
delete_all_from_provider: If True, delete all files from the provider,
|
||||
not just cached ones.
|
||||
|
||||
Returns:
|
||||
Number of files deleted.
|
||||
"""
|
||||
deleted = 0
|
||||
uploader = get_uploader(provider)
|
||||
|
||||
if uploader is None:
|
||||
logger.warning(f"No uploader available for {provider}")
|
||||
return 0
|
||||
|
||||
if delete_all_from_provider:
|
||||
try:
|
||||
files = uploader.list_files()
|
||||
for file_info in files:
|
||||
file_id = file_info.get("id") or file_info.get("name")
|
||||
if file_id and uploader.delete(file_id):
|
||||
deleted += 1
|
||||
except Exception as e:
|
||||
logger.warning(f"Error listing/deleting files from {provider}: {e}")
|
||||
elif cache is not None:
|
||||
uploads = cache.get_all_for_provider(provider)
|
||||
for upload in uploads:
|
||||
if _safe_delete(uploader, upload.file_id, provider):
|
||||
deleted += 1
|
||||
cache.remove_by_file_id(upload.file_id, provider)
|
||||
|
||||
logger.info(f"Deleted {deleted} files from {provider}")
|
||||
return deleted
|
||||
|
||||
|
||||
def _get_providers_from_cache(cache: UploadCache) -> set[str]:
|
||||
"""Get unique provider names from cache entries.
|
||||
|
||||
Args:
|
||||
cache: The upload cache.
|
||||
|
||||
Returns:
|
||||
Set of provider names.
|
||||
"""
|
||||
return cache.get_providers()
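
As a usage note on the helpers above: this module works directly against an UploadCache, while the new package keeps simpler module-level wrappers whose signatures appear in the deprecation stub earlier. A hedged sketch of those package-level calls:

from crewai.files import cleanup_expired_files, cleanup_uploaded_files

# ... after a crew run that uploaded input files to a provider ...
expired = cleanup_expired_files()                    # prune cache entries past their expiry
deleted = cleanup_uploaded_files(provider="gemini")  # delete this run's Gemini uploads
print(f"{expired} expired entries removed, {deleted} files deleted")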
|
||||
@@ -1,158 +0,0 @@
|
||||
"""Base file class for handling file inputs in tasks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, BinaryIO, cast
|
||||
|
||||
import magic
|
||||
from pydantic import (
|
||||
BaseModel,
|
||||
BeforeValidator,
|
||||
Field,
|
||||
GetCoreSchemaHandler,
|
||||
PrivateAttr,
|
||||
model_validator,
|
||||
)
|
||||
from pydantic_core import CoreSchema, core_schema
|
||||
|
||||
|
||||
def detect_content_type(data: bytes) -> str:
|
||||
"""Detect MIME type from file content.
|
||||
|
||||
Args:
|
||||
data: Raw bytes to analyze.
|
||||
|
||||
Returns:
|
||||
The detected MIME type.
|
||||
"""
|
||||
return magic.from_buffer(data, mime=True)
|
||||
|
||||
|
||||
class _BinaryIOValidator:
|
||||
"""Pydantic validator for BinaryIO types."""
|
||||
|
||||
@classmethod
|
||||
def __get_pydantic_core_schema__(
|
||||
cls, source_type: Any, handler: GetCoreSchemaHandler
|
||||
) -> CoreSchema:
|
||||
return core_schema.no_info_plain_validator_function(
|
||||
cls._validate,
|
||||
serialization=core_schema.plain_serializer_function_ser_schema(
|
||||
lambda x: None, info_arg=False
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _validate(value: Any) -> BinaryIO:
|
||||
if hasattr(value, "read") and hasattr(value, "seek"):
|
||||
return cast(BinaryIO, value)
|
||||
raise ValueError("Expected a binary file-like object with read() and seek()")
|
||||
|
||||
|
||||
ValidatedBinaryIO = Annotated[BinaryIO, _BinaryIOValidator()]
|
||||
|
||||
|
||||
class FilePath(BaseModel):
|
||||
"""File loaded from a filesystem path."""
|
||||
|
||||
path: Path = Field(description="Path to the file on the filesystem.")
|
||||
_content: bytes | None = PrivateAttr(default=None)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _validate_file_exists(self) -> FilePath:
|
||||
"""Validate that the file exists."""
|
||||
if not self.path.exists():
|
||||
raise ValueError(f"File not found: {self.path}")
|
||||
if not self.path.is_file():
|
||||
raise ValueError(f"Path is not a file: {self.path}")
|
||||
return self
|
||||
|
||||
@property
|
||||
def filename(self) -> str:
|
||||
"""Get the filename from the path."""
|
||||
return self.path.name
|
||||
|
||||
@property
|
||||
def content_type(self) -> str:
|
||||
"""Get the content type by reading file content."""
|
||||
return detect_content_type(self.read())
|
||||
|
||||
def read(self) -> bytes:
|
||||
"""Read the file content from disk."""
|
||||
if self._content is None:
|
||||
self._content = self.path.read_bytes()
|
||||
return self._content
|
||||
|
||||
|
||||
class FileBytes(BaseModel):
|
||||
"""File created from raw bytes content."""
|
||||
|
||||
data: bytes = Field(description="Raw bytes content of the file.")
|
||||
filename: str | None = Field(default=None, description="Optional filename.")
|
||||
|
||||
@property
|
||||
def content_type(self) -> str:
|
||||
"""Get the content type from the data."""
|
||||
return detect_content_type(self.data)
|
||||
|
||||
def read(self) -> bytes:
|
||||
"""Return the bytes content."""
|
||||
return self.data
|
||||
|
||||
|
||||
class FileStream(BaseModel):
|
||||
"""File loaded from a file-like stream."""
|
||||
|
||||
stream: ValidatedBinaryIO = Field(description="Binary file stream.")
|
||||
filename: str | None = Field(default=None, description="Optional filename.")
|
||||
_content: bytes | None = PrivateAttr(default=None)
|
||||
|
||||
def model_post_init(self, __context: object) -> None:
|
||||
"""Extract filename from stream if not provided."""
|
||||
if self.filename is None:
|
||||
name = getattr(self.stream, "name", None)
|
||||
if name is not None:
|
||||
object.__setattr__(self, "filename", Path(name).name)
|
||||
|
||||
@property
|
||||
def content_type(self) -> str:
|
||||
"""Get the content type from stream content."""
|
||||
return detect_content_type(self.read())
|
||||
|
||||
def read(self) -> bytes:
|
||||
"""Read the stream content. Content is cached after first read."""
|
||||
if self._content is None:
|
||||
position = self.stream.tell()
|
||||
self.stream.seek(0)
|
||||
self._content = self.stream.read()
|
||||
self.stream.seek(position)
|
||||
return self._content
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close the underlying stream."""
|
||||
self.stream.close()
|
||||
|
||||
|
||||
FileSource = FilePath | FileBytes | FileStream
|
||||
|
||||
|
||||
def _normalize_source(value: Any) -> FileSource:
|
||||
"""Convert raw input to appropriate source type."""
|
||||
if isinstance(value, (FilePath, FileBytes, FileStream)):
|
||||
return value
|
||||
if isinstance(value, Path):
|
||||
return FilePath(path=value)
|
||||
if isinstance(value, str):
|
||||
return FilePath(path=Path(value))
|
||||
if isinstance(value, bytes):
|
||||
return FileBytes(data=value)
|
||||
if hasattr(value, "read") and hasattr(value, "seek"):
|
||||
return FileStream(stream=value)
|
||||
raise ValueError(f"Cannot convert {type(value).__name__} to file source")
|
||||
|
||||
|
||||
RawFileInput = str | Path | bytes
|
||||
FileSourceInput = Annotated[
|
||||
RawFileInput | FileSource, BeforeValidator(_normalize_source)
|
||||
]
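
A short sketch of the three source types above, using the crewai.files.file module path that the tests below import from; FilePath validates existence at construction, so it is pointed at a file that is guaranteed to exist:

import io
from pathlib import Path

from crewai.files.file import FileBytes, FilePath, FileStream

raw = FileBytes(data=b"hello world", filename="hello.txt")
stream = FileStream(stream=io.BytesIO(b"hello"), filename="hello.bin")
on_disk = FilePath(path=Path(__file__))  # any existing file works here

print(raw.content_type)   # MIME type sniffed from the bytes via python-magic
print(stream.read())      # cached after the first read; the stream position is restored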
|
||||
@@ -1,287 +0,0 @@
|
||||
"""FileResolver for deciding file delivery method and managing uploads."""
|
||||
|
||||
import base64
|
||||
from dataclasses import dataclass, field
|
||||
import logging
|
||||
|
||||
from crewai.utilities.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.utilities.files.processing.constraints import (
|
||||
ProviderConstraints,
|
||||
get_constraints_for_provider,
|
||||
)
|
||||
from crewai.utilities.files.resolved import (
|
||||
FileReference,
|
||||
InlineBase64,
|
||||
InlineBytes,
|
||||
ResolvedFile,
|
||||
)
|
||||
from crewai.utilities.files.upload_cache import CachedUpload, UploadCache
|
||||
from crewai.utilities.files.uploaders import get_uploader
|
||||
from crewai.utilities.files.uploaders.base import FileUploader
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileResolverConfig:
|
||||
"""Configuration for FileResolver.
|
||||
|
||||
Attributes:
|
||||
prefer_upload: If True, prefer uploading over inline for supported providers.
|
||||
upload_threshold_bytes: Size threshold above which to use upload.
|
||||
If None, uses provider-specific threshold.
|
||||
use_bytes_for_bedrock: If True, use raw bytes instead of base64 for Bedrock.
|
||||
"""
|
||||
|
||||
prefer_upload: bool = False
|
||||
upload_threshold_bytes: int | None = None
|
||||
use_bytes_for_bedrock: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileResolver:
|
||||
"""Resolves files to their delivery format based on provider capabilities.
|
||||
|
||||
Decides whether to use inline base64, raw bytes, or file upload based on:
|
||||
- Provider constraints and capabilities
|
||||
- File size
|
||||
- Configuration preferences
|
||||
|
||||
Caches uploaded files to avoid redundant uploads.
|
||||
|
||||
Attributes:
|
||||
config: Resolver configuration.
|
||||
upload_cache: Cache for tracking uploaded files.
|
||||
"""
|
||||
|
||||
config: FileResolverConfig = field(default_factory=FileResolverConfig)
|
||||
upload_cache: UploadCache | None = None
|
||||
_uploaders: dict[str, FileUploader] = field(default_factory=dict)
|
||||
|
||||
def resolve(self, file: FileInput, provider: str) -> ResolvedFile:
|
||||
"""Resolve a file to its delivery format for a provider.
|
||||
|
||||
Args:
|
||||
file: The file to resolve.
|
||||
provider: Provider name (e.g., "gemini", "anthropic", "openai").
|
||||
|
||||
Returns:
|
||||
ResolvedFile representing the appropriate delivery format.
|
||||
"""
|
||||
provider_lower = provider.lower()
|
||||
constraints = get_constraints_for_provider(provider)
|
||||
file_size = len(file.read())
|
||||
|
||||
should_upload = self._should_upload(
|
||||
file, provider_lower, constraints, file_size
|
||||
)
|
||||
|
||||
if should_upload:
|
||||
resolved = self._resolve_via_upload(file, provider_lower)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return self._resolve_inline(file, provider_lower)
|
||||
|
||||
def resolve_files(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
provider: str,
|
||||
) -> dict[str, ResolvedFile]:
|
||||
"""Resolve multiple files for a provider.
|
||||
|
||||
Args:
|
||||
files: Dictionary mapping names to file inputs.
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping names to resolved files.
|
||||
"""
|
||||
return {name: self.resolve(file, provider) for name, file in files.items()}
|
||||
|
||||
def _should_upload(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
constraints: ProviderConstraints | None,
|
||||
file_size: int,
|
||||
) -> bool:
|
||||
"""Determine if a file should be uploaded rather than inlined.
|
||||
|
||||
Args:
|
||||
file: The file to check.
|
||||
provider: Provider name.
|
||||
constraints: Provider constraints.
|
||||
file_size: Size of the file in bytes.
|
||||
|
||||
Returns:
|
||||
True if the file should be uploaded, False otherwise.
|
||||
"""
|
||||
if constraints is None or not constraints.supports_file_upload:
|
||||
return False
|
||||
|
||||
if self.config.prefer_upload:
|
||||
return True
|
||||
|
||||
threshold = self.config.upload_threshold_bytes
|
||||
if threshold is None and constraints is not None:
|
||||
threshold = constraints.file_upload_threshold_bytes
|
||||
|
||||
if threshold is not None and file_size > threshold:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _resolve_via_upload(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
) -> ResolvedFile | None:
|
||||
"""Resolve a file by uploading it.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
FileReference if upload succeeds, None otherwise.
|
||||
"""
|
||||
if self.upload_cache is not None:
|
||||
cached = self.upload_cache.get(file, provider)
|
||||
if cached is not None:
|
||||
logger.debug(
|
||||
f"Using cached upload for {file.filename}: {cached.file_id}"
|
||||
)
|
||||
return FileReference(
|
||||
content_type=cached.content_type,
|
||||
file_id=cached.file_id,
|
||||
provider=cached.provider,
|
||||
expires_at=cached.expires_at,
|
||||
file_uri=cached.file_uri,
|
||||
)
|
||||
|
||||
uploader = self._get_uploader(provider)
|
||||
if uploader is None:
|
||||
logger.debug(f"No uploader available for {provider}")
|
||||
return None
|
||||
|
||||
try:
|
||||
result = uploader.upload(file)
|
||||
|
||||
if self.upload_cache is not None:
|
||||
self.upload_cache.set(
|
||||
file=file,
|
||||
provider=provider,
|
||||
file_id=result.file_id,
|
||||
file_uri=result.file_uri,
|
||||
expires_at=result.expires_at,
|
||||
)
|
||||
|
||||
return FileReference(
|
||||
content_type=result.content_type,
|
||||
file_id=result.file_id,
|
||||
provider=result.provider,
|
||||
expires_at=result.expires_at,
|
||||
file_uri=result.file_uri,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to upload {file.filename} to {provider}: {e}")
|
||||
return None
|
||||
|
||||
def _resolve_inline(self, file: FileInput, provider: str) -> ResolvedFile:
|
||||
"""Resolve a file as inline content.
|
||||
|
||||
Args:
|
||||
file: The file to resolve.
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
InlineBase64 or InlineBytes depending on provider.
|
||||
"""
|
||||
content = file.read()
|
||||
|
||||
if self.config.use_bytes_for_bedrock and "bedrock" in provider:
|
||||
return InlineBytes(
|
||||
content_type=file.content_type,
|
||||
data=content,
|
||||
)
|
||||
|
||||
encoded = base64.b64encode(content).decode("ascii")
|
||||
return InlineBase64(
|
||||
content_type=file.content_type,
|
||||
data=encoded,
|
||||
)
|
||||
|
||||
def _get_uploader(self, provider: str) -> FileUploader | None:
|
||||
"""Get or create an uploader for a provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
FileUploader instance or None if not available.
|
||||
"""
|
||||
if provider not in self._uploaders:
|
||||
uploader = get_uploader(provider)
|
||||
if uploader is not None:
|
||||
self._uploaders[provider] = uploader
|
||||
else:
|
||||
return None
|
||||
|
||||
return self._uploaders.get(provider)
|
||||
|
||||
def get_cached_uploads(self, provider: str) -> list[CachedUpload]:
|
||||
"""Get all cached uploads for a provider.
|
||||
|
||||
Args:
|
||||
provider: Provider name.
|
||||
|
||||
Returns:
|
||||
List of cached uploads.
|
||||
"""
|
||||
if self.upload_cache is None:
|
||||
return []
|
||||
return self.upload_cache.get_all_for_provider(provider)
|
||||
|
||||
def clear_cache(self) -> None:
|
||||
"""Clear the upload cache."""
|
||||
if self.upload_cache is not None:
|
||||
self.upload_cache.clear()
|
||||
|
||||
|
||||
def create_resolver(
|
||||
provider: str | None = None,
|
||||
prefer_upload: bool = False,
|
||||
upload_threshold_bytes: int | None = None,
|
||||
enable_cache: bool = True,
|
||||
) -> FileResolver:
|
||||
"""Create a configured FileResolver.
|
||||
|
||||
Args:
|
||||
provider: Optional provider name for provider-specific configuration.
|
||||
prefer_upload: Whether to prefer upload over inline.
|
||||
upload_threshold_bytes: Size threshold for using upload.
|
||||
enable_cache: Whether to enable upload caching.
|
||||
|
||||
Returns:
|
||||
Configured FileResolver instance.
|
||||
"""
|
||||
config = FileResolverConfig(
|
||||
prefer_upload=prefer_upload,
|
||||
upload_threshold_bytes=upload_threshold_bytes,
|
||||
)
|
||||
|
||||
cache = UploadCache() if enable_cache else None
|
||||
|
||||
return FileResolver(config=config, upload_cache=cache)
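
A minimal sketch of the decision flow above, using the new module path that the tests below import from; the provider and bytes are illustrative:

from crewai.files import FileBytes, ImageFile
from crewai.files.resolver import create_resolver

image = ImageFile(source=FileBytes(data=b"\x89PNG\r\n\x1a\n...", filename="chart.png"))

resolver = create_resolver(provider="anthropic", prefer_upload=False)
resolved = resolver.resolve(image, provider="anthropic")

# Small files stay inline (InlineBase64, or InlineBytes on Bedrock); files above the
# provider's upload threshold are uploaded and come back as a FileReference with a file_id.
print(type(resolved).__name__)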
|
||||
@@ -1,166 +0,0 @@
|
||||
"""Anthropic Files API uploader implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from crewai.utilities.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.utilities.files.uploaders.base import FileUploader, UploadResult
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
|
||||
class AnthropicFileUploader(FileUploader):
|
||||
"""Uploader for Anthropic Files API.
|
||||
|
||||
Uses the anthropic SDK to upload files. Files are stored persistently
|
||||
until explicitly deleted.
|
||||
|
||||
Attributes:
|
||||
api_key: Optional API key (uses ANTHROPIC_API_KEY env var if not provided).
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: str | None = None) -> None:
|
||||
"""Initialize the Anthropic uploader.
|
||||
|
||||
Args:
|
||||
api_key: Optional Anthropic API key. If not provided, uses
|
||||
ANTHROPIC_API_KEY environment variable.
|
||||
"""
|
||||
self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
||||
self._client: Any = None
|
||||
|
||||
@property
|
||||
def provider_name(self) -> str:
|
||||
"""Return the provider name."""
|
||||
return "anthropic"
|
||||
|
||||
def _get_client(self) -> Any:
|
||||
"""Get or create the Anthropic client."""
|
||||
if self._client is None:
|
||||
try:
|
||||
import anthropic
|
||||
|
||||
self._client = anthropic.Anthropic(api_key=self._api_key)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"anthropic is required for Anthropic file uploads. "
|
||||
"Install with: pip install anthropic"
|
||||
) from e
|
||||
return self._client
|
||||
|
||||
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
|
||||
"""Upload a file to Anthropic.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose for the file (default: "user_upload").
|
||||
|
||||
Returns:
|
||||
UploadResult with the file ID and metadata.
|
||||
|
||||
Raises:
|
||||
Exception: If upload fails.
|
||||
"""
|
||||
client = self._get_client()
|
||||
|
||||
content = file.read()
|
||||
file_purpose = purpose or "user_upload"
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to Anthropic ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = client.files.create(
|
||||
file=(file.filename, file_data, file.content_type),
|
||||
purpose=file_purpose,
|
||||
)
|
||||
|
||||
logger.info(f"Uploaded to Anthropic: {uploaded_file.id}")
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.id,
|
||||
file_uri=None,
|
||||
content_type=file.content_type,
|
||||
expires_at=None,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
|
||||
def delete(self, file_id: str) -> bool:
|
||||
"""Delete an uploaded file from Anthropic.
|
||||
|
||||
Args:
|
||||
file_id: The file ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
client.files.delete(file_id=file_id)
|
||||
logger.info(f"Deleted Anthropic file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete Anthropic file {file_id}: {e}")
|
||||
return False
|
||||
|
||||
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
|
||||
"""Get information about an uploaded file.
|
||||
|
||||
Args:
|
||||
file_id: The file ID.
|
||||
|
||||
Returns:
|
||||
Dictionary with file information, or None if not found.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
file_info = client.files.retrieve(file_id=file_id)
|
||||
return {
|
||||
"id": file_info.id,
|
||||
"filename": file_info.filename,
|
||||
"purpose": file_info.purpose,
|
||||
"size_bytes": file_info.size_bytes,
|
||||
"created_at": file_info.created_at,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get Anthropic file info for {file_id}: {e}")
|
||||
return None
|
||||
|
||||
def list_files(self) -> list[dict[str, Any]]:
|
||||
"""List all uploaded files.
|
||||
|
||||
Returns:
|
||||
List of dictionaries with file information.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
files = client.files.list()
|
||||
return [
|
||||
{
|
||||
"id": f.id,
|
||||
"filename": f.filename,
|
||||
"purpose": f.purpose,
|
||||
"size_bytes": f.size_bytes,
|
||||
"created_at": f.created_at,
|
||||
}
|
||||
for f in files.data
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to list Anthropic files: {e}")
|
||||
return []
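
An upload lifecycle sketch for the class above. It needs the anthropic package and ANTHROPIC_API_KEY, and the module path after the move is assumed here, so treat it as illustrative rather than something to paste into tests:

from crewai.files import FileBytes, PDFFile
from crewai.files.uploaders.anthropic import AnthropicFileUploader  # assumed new location

uploader = AnthropicFileUploader()
pdf = PDFFile(source=FileBytes(data=b"%PDF-1.4 ...", filename="report.pdf"))  # placeholder bytes

result = uploader.upload(pdf)                  # UploadResult with file_id and provider="anthropic"
info = uploader.get_file_info(result.file_id)  # filename, purpose, size_bytes, created_at
uploader.delete(result.file_id)                # True on success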
|
||||
@@ -1,217 +0,0 @@
|
||||
"""Gemini File API uploader implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from crewai.utilities.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.utilities.files.uploaders.base import FileUploader, UploadResult
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
GEMINI_FILE_TTL = timedelta(hours=48)
|
||||
|
||||
|
||||
class GeminiFileUploader(FileUploader):
|
||||
"""Uploader for Google Gemini File API.
|
||||
|
||||
Uses the google-genai SDK to upload files. Files are stored for 48 hours.
|
||||
|
||||
Attributes:
|
||||
api_key: Optional API key (uses GOOGLE_API_KEY env var if not provided).
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: str | None = None) -> None:
|
||||
"""Initialize the Gemini uploader.
|
||||
|
||||
Args:
|
||||
api_key: Optional Google API key. If not provided, uses
|
||||
GOOGLE_API_KEY environment variable.
|
||||
"""
|
||||
self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
|
||||
self._client: Any = None
|
||||
|
||||
@property
|
||||
def provider_name(self) -> str:
|
||||
"""Return the provider name."""
|
||||
return "gemini"
|
||||
|
||||
def _get_client(self) -> Any:
|
||||
"""Get or create the Gemini client."""
|
||||
if self._client is None:
|
||||
try:
|
||||
from google import genai
|
||||
|
||||
self._client = genai.Client(api_key=self._api_key)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"google-genai is required for Gemini file uploads. "
|
||||
"Install with: pip install google-genai"
|
||||
) from e
|
||||
return self._client
|
||||
|
||||
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
|
||||
"""Upload a file to Gemini.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose/description (used as display name).
|
||||
|
||||
Returns:
|
||||
UploadResult with the file URI and metadata.
|
||||
|
||||
Raises:
|
||||
Exception: If upload fails.
|
||||
"""
|
||||
client = self._get_client()
|
||||
|
||||
content = file.read()
|
||||
display_name = purpose or file.filename
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
file_data.name = file.filename
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to Gemini ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = client.files.upload(
|
||||
file=file_data,
|
||||
config={
|
||||
"display_name": display_name,
|
||||
"mime_type": file.content_type,
|
||||
},
|
||||
)
|
||||
|
||||
expires_at = datetime.now(timezone.utc) + GEMINI_FILE_TTL
|
||||
|
||||
logger.info(
|
||||
f"Uploaded to Gemini: {uploaded_file.name} (URI: {uploaded_file.uri})"
|
||||
)
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.name,
|
||||
file_uri=uploaded_file.uri,
|
||||
content_type=file.content_type,
|
||||
expires_at=expires_at,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
|
||||
def delete(self, file_id: str) -> bool:
|
||||
"""Delete an uploaded file from Gemini.
|
||||
|
||||
Args:
|
||||
file_id: The file name/ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
client.files.delete(name=file_id)
|
||||
logger.info(f"Deleted Gemini file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete Gemini file {file_id}: {e}")
|
||||
return False
|
||||
|
||||
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
|
||||
"""Get information about an uploaded file.
|
||||
|
||||
Args:
|
||||
file_id: The file name/ID.
|
||||
|
||||
Returns:
|
||||
Dictionary with file information, or None if not found.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
file_info = client.files.get(name=file_id)
|
||||
return {
|
||||
"name": file_info.name,
|
||||
"uri": file_info.uri,
|
||||
"display_name": file_info.display_name,
|
||||
"mime_type": file_info.mime_type,
|
||||
"size_bytes": file_info.size_bytes,
|
||||
"state": str(file_info.state),
|
||||
"create_time": file_info.create_time,
|
||||
"expiration_time": file_info.expiration_time,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get Gemini file info for {file_id}: {e}")
|
||||
return None
|
||||
|
||||
def list_files(self) -> list[dict[str, Any]]:
|
||||
"""List all uploaded files.
|
||||
|
||||
Returns:
|
||||
List of dictionaries with file information.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
files = client.files.list()
|
||||
return [
|
||||
{
|
||||
"name": f.name,
|
||||
"uri": f.uri,
|
||||
"display_name": f.display_name,
|
||||
"mime_type": f.mime_type,
|
||||
"size_bytes": f.size_bytes,
|
||||
"state": str(f.state),
|
||||
}
|
||||
for f in files
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to list Gemini files: {e}")
|
||||
return []
|
||||
|
||||
def wait_for_processing(self, file_id: str, timeout_seconds: int = 300) -> bool:
|
||||
"""Wait for a file to finish processing.
|
||||
|
||||
Some files (especially videos) need time to process after upload.
|
||||
|
||||
Args:
|
||||
file_id: The file name/ID.
|
||||
timeout_seconds: Maximum time to wait.
|
||||
|
||||
Returns:
|
||||
True if processing completed, False if timed out or failed.
|
||||
"""
|
||||
import time
|
||||
|
||||
try:
|
||||
from google.genai.types import FileState
|
||||
except ImportError:
|
||||
return True
|
||||
|
||||
client = self._get_client()
|
||||
start_time = time.time()
|
||||
|
||||
while time.time() - start_time < timeout_seconds:
|
||||
file_info = client.files.get(name=file_id)
|
||||
|
||||
if file_info.state == FileState.ACTIVE:
|
||||
return True
|
||||
|
||||
if file_info.state == FileState.FAILED:
|
||||
logger.error(f"Gemini file processing failed: {file_id}")
|
||||
return False
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
logger.warning(f"Timed out waiting for Gemini file processing: {file_id}")
|
||||
return False
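
Gemini is the one provider above where uploads expire (48 hours) and where large media may need to finish processing before use; wait_for_processing is specific to GeminiFileUploader. A hedged sketch, requiring google-genai and GOOGLE_API_KEY:

from crewai.files import FileBytes, VideoFile, get_uploader

uploader = get_uploader("gemini")
if uploader is not None:
    video = VideoFile(source=FileBytes(data=b"\x00\x00\x00\x18ftyp...", filename="demo.mp4"))  # placeholder bytes
    result = uploader.upload(video)
    if uploader.wait_for_processing(result.file_id, timeout_seconds=120):
        print("ready:", result.file_uri)  # the URI is what goes into the multimodal content block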
|
||||
@@ -1,169 +0,0 @@
|
||||
"""OpenAI Files API uploader implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from crewai.utilities.files.content_types import (
|
||||
AudioFile,
|
||||
File,
|
||||
ImageFile,
|
||||
PDFFile,
|
||||
TextFile,
|
||||
VideoFile,
|
||||
)
|
||||
from crewai.utilities.files.uploaders.base import FileUploader, UploadResult
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FileInput = AudioFile | File | ImageFile | PDFFile | TextFile | VideoFile
|
||||
|
||||
|
||||
class OpenAIFileUploader(FileUploader):
|
||||
"""Uploader for OpenAI Files API.
|
||||
|
||||
Uses the OpenAI SDK to upload files. Files are stored persistently
|
||||
until explicitly deleted.
|
||||
|
||||
Attributes:
|
||||
api_key: Optional API key (uses OPENAI_API_KEY env var if not provided).
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: str | None = None) -> None:
|
||||
"""Initialize the OpenAI uploader.
|
||||
|
||||
Args:
|
||||
api_key: Optional OpenAI API key. If not provided, uses
|
||||
OPENAI_API_KEY environment variable.
|
||||
"""
|
||||
self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||
self._client: Any = None
|
||||
|
||||
@property
|
||||
def provider_name(self) -> str:
|
||||
"""Return the provider name."""
|
||||
return "openai"
|
||||
|
||||
def _get_client(self) -> Any:
|
||||
"""Get or create the OpenAI client."""
|
||||
if self._client is None:
|
||||
try:
|
||||
from openai import OpenAI
|
||||
|
||||
self._client = OpenAI(api_key=self._api_key)
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"openai is required for OpenAI file uploads. "
|
||||
"Install with: pip install openai"
|
||||
) from e
|
||||
return self._client
|
||||
|
||||
def upload(self, file: FileInput, purpose: str | None = None) -> UploadResult:
|
||||
"""Upload a file to OpenAI.
|
||||
|
||||
Args:
|
||||
file: The file to upload.
|
||||
purpose: Optional purpose for the file (default: "user_data").
|
||||
|
||||
Returns:
|
||||
UploadResult with the file ID and metadata.
|
||||
|
||||
Raises:
|
||||
Exception: If upload fails.
|
||||
"""
|
||||
client = self._get_client()
|
||||
|
||||
content = file.read()
|
||||
file_purpose = purpose or "user_data"
|
||||
|
||||
file_data = io.BytesIO(content)
|
||||
file_data.name = file.filename or "file"
|
||||
|
||||
logger.info(
|
||||
f"Uploading file '{file.filename}' to OpenAI ({len(content)} bytes)"
|
||||
)
|
||||
|
||||
uploaded_file = client.files.create(
|
||||
file=file_data,
|
||||
purpose=file_purpose,
|
||||
)
|
||||
|
||||
logger.info(f"Uploaded to OpenAI: {uploaded_file.id}")
|
||||
|
||||
return UploadResult(
|
||||
file_id=uploaded_file.id,
|
||||
file_uri=None,
|
||||
content_type=file.content_type,
|
||||
expires_at=None,
|
||||
provider=self.provider_name,
|
||||
)
|
||||
|
||||
def delete(self, file_id: str) -> bool:
|
||||
"""Delete an uploaded file from OpenAI.
|
||||
|
||||
Args:
|
||||
file_id: The file ID to delete.
|
||||
|
||||
Returns:
|
||||
True if deletion was successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
client.files.delete(file_id)
|
||||
logger.info(f"Deleted OpenAI file: {file_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete OpenAI file {file_id}: {e}")
|
||||
return False
|
||||
|
||||
def get_file_info(self, file_id: str) -> dict[str, Any] | None:
|
||||
"""Get information about an uploaded file.
|
||||
|
||||
Args:
|
||||
file_id: The file ID.
|
||||
|
||||
Returns:
|
||||
Dictionary with file information, or None if not found.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
file_info = client.files.retrieve(file_id)
|
||||
return {
|
||||
"id": file_info.id,
|
||||
"filename": file_info.filename,
|
||||
"purpose": file_info.purpose,
|
||||
"bytes": file_info.bytes,
|
||||
"created_at": file_info.created_at,
|
||||
"status": file_info.status,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get OpenAI file info for {file_id}: {e}")
|
||||
return None
|
||||
|
||||
def list_files(self) -> list[dict[str, Any]]:
|
||||
"""List all uploaded files.
|
||||
|
||||
Returns:
|
||||
List of dictionaries with file information.
|
||||
"""
|
||||
try:
|
||||
client = self._get_client()
|
||||
files = client.files.list()
|
||||
return [
|
||||
{
|
||||
"id": f.id,
|
||||
"filename": f.filename,
|
||||
"purpose": f.purpose,
|
||||
"bytes": f.bytes,
|
||||
"created_at": f.created_at,
|
||||
"status": f.status,
|
||||
}
|
||||
for f in files.data
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to list OpenAI files: {e}")
|
||||
return []
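
All three uploaders implement the same FileUploader interface, so provider-agnostic code can go through the get_uploader factory from the stub earlier; a sketch assuming the openai SDK and OPENAI_API_KEY are available:

from crewai.files import FileBytes, TextFile, get_uploader

uploader = get_uploader("openai")
if uploader is None:
    print("no uploader registered for openai (SDK not installed?)")
else:
    doc = TextFile(source=FileBytes(data=b"hello", filename="hello.txt"))
    result = uploader.upload(doc, purpose="user_data")
    print(result.file_id, uploader.provider_name)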
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Any, Literal, TypedDict
|
||||
|
||||
from crewai.utilities.files import FileInput
|
||||
from crewai.files import FileInput
|
||||
|
||||
|
||||
class LLMMessage(TypedDict):
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.utilities.files.processing.constraints import (
|
||||
from crewai.files.processing.constraints import (
|
||||
ANTHROPIC_CONSTRAINTS,
|
||||
BEDROCK_CONSTRAINTS,
|
||||
GEMINI_CONSTRAINTS,
|
||||
@@ -2,19 +2,19 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.utilities.files import FileBytes, ImageFile, PDFFile, TextFile
|
||||
from crewai.utilities.files.processing.constraints import (
|
||||
from crewai.files import FileBytes, ImageFile, PDFFile, TextFile
|
||||
from crewai.files.processing.constraints import (
|
||||
ANTHROPIC_CONSTRAINTS,
|
||||
ImageConstraints,
|
||||
PDFConstraints,
|
||||
ProviderConstraints,
|
||||
)
|
||||
from crewai.utilities.files.processing.enums import FileHandling
|
||||
from crewai.utilities.files.processing.exceptions import (
|
||||
from crewai.files.processing.enums import FileHandling
|
||||
from crewai.files.processing.exceptions import (
|
||||
FileTooLargeError,
|
||||
FileValidationError,
|
||||
)
|
||||
from crewai.utilities.files.processing.processor import FileProcessor
|
||||
from crewai.files.processing.processor import FileProcessor
|
||||
|
||||
|
||||
# Minimal valid PNG: 8x8 pixel RGB image (valid for PIL)
|
||||
lib/crewai/tests/files/processing/test_transformers.py (new file, 359 lines)
@@ -0,0 +1,359 @@
|
||||
"""Unit tests for file transformers."""
|
||||
|
||||
import io
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.files import ImageFile, PDFFile, TextFile
|
||||
from crewai.files.file import FileBytes
|
||||
from crewai.files.processing.exceptions import ProcessingDependencyError
|
||||
from crewai.files.processing.transformers import (
|
||||
chunk_pdf,
|
||||
chunk_text,
|
||||
get_image_dimensions,
|
||||
get_pdf_page_count,
|
||||
optimize_image,
|
||||
resize_image,
|
||||
)
|
||||
|
||||
|
||||
def create_test_png(width: int = 100, height: int = 100) -> bytes:
|
||||
"""Create a minimal valid PNG for testing."""
|
||||
from PIL import Image
|
||||
|
||||
img = Image.new("RGB", (width, height), color="red")
|
||||
buffer = io.BytesIO()
|
||||
img.save(buffer, format="PNG")
|
||||
return buffer.getvalue()
|
||||
|
||||
|
||||
def create_test_pdf(num_pages: int = 1) -> bytes:
|
||||
"""Create a minimal valid PDF for testing."""
|
||||
from pypdf import PdfWriter
|
||||
|
||||
writer = PdfWriter()
|
||||
for _ in range(num_pages):
|
||||
writer.add_blank_page(width=612, height=792)
|
||||
|
||||
buffer = io.BytesIO()
|
||||
writer.write(buffer)
|
||||
return buffer.getvalue()
|
||||
|
||||
|
||||
class TestResizeImage:
|
||||
"""Tests for resize_image function."""
|
||||
|
||||
def test_resize_larger_image(self) -> None:
|
||||
"""Test resizing an image larger than max dimensions."""
|
||||
png_bytes = create_test_png(200, 150)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
|
||||
|
||||
result = resize_image(img, max_width=100, max_height=100)
|
||||
|
||||
dims = get_image_dimensions(result)
|
||||
assert dims is not None
|
||||
width, height = dims
|
||||
assert width <= 100
|
||||
assert height <= 100
|
||||
|
||||
def test_no_resize_if_within_bounds(self) -> None:
|
||||
"""Test that small images are returned unchanged."""
|
||||
png_bytes = create_test_png(50, 50)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="small.png"))
|
||||
|
||||
result = resize_image(img, max_width=100, max_height=100)
|
||||
|
||||
assert result is img
|
||||
|
||||
def test_preserve_aspect_ratio(self) -> None:
|
||||
"""Test that aspect ratio is preserved during resize."""
|
||||
png_bytes = create_test_png(200, 100)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="wide.png"))
|
||||
|
||||
result = resize_image(img, max_width=100, max_height=100)
|
||||
|
||||
dims = get_image_dimensions(result)
|
||||
assert dims is not None
|
||||
width, height = dims
|
||||
assert width == 100
|
||||
assert height == 50
|
||||
|
||||
def test_resize_without_aspect_ratio(self) -> None:
|
||||
"""Test resizing without preserving aspect ratio."""
|
||||
png_bytes = create_test_png(200, 100)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="wide.png"))
|
||||
|
||||
result = resize_image(
|
||||
img, max_width=50, max_height=50, preserve_aspect_ratio=False
|
||||
)
|
||||
|
||||
dims = get_image_dimensions(result)
|
||||
assert dims is not None
|
||||
width, height = dims
|
||||
assert width == 50
|
||||
assert height == 50
|
||||
|
||||
def test_resize_returns_image_file(self) -> None:
|
||||
"""Test that resize returns an ImageFile instance."""
|
||||
png_bytes = create_test_png(200, 200)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
|
||||
|
||||
result = resize_image(img, max_width=100, max_height=100)
|
||||
|
||||
assert isinstance(result, ImageFile)
|
||||
|
||||
def test_raises_without_pillow(self) -> None:
|
||||
"""Test that ProcessingDependencyError is raised without Pillow."""
|
||||
img = ImageFile(source=FileBytes(data=b"fake", filename="test.png"))
|
||||
|
||||
with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
|
||||
with pytest.raises(ProcessingDependencyError) as exc_info:
|
||||
# Force reimport to trigger ImportError
|
||||
import importlib
|
||||
|
||||
import crewai.files.processing.transformers as t
|
||||
|
||||
importlib.reload(t)
|
||||
t.resize_image(img, 100, 100)
|
||||
|
||||
assert "Pillow" in str(exc_info.value)
|
||||
|
||||
|
||||
class TestOptimizeImage:
|
||||
"""Tests for optimize_image function."""
|
||||
|
||||
def test_optimize_reduces_size(self) -> None:
|
||||
"""Test that optimization reduces file size."""
|
||||
png_bytes = create_test_png(500, 500)
|
||||
original_size = len(png_bytes)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="large.png"))
|
||||
|
||||
result = optimize_image(img, target_size_bytes=original_size // 2)
|
||||
|
||||
result_size = len(result.read())
|
||||
assert result_size < original_size
|
||||
|
||||
def test_no_optimize_if_under_target(self) -> None:
|
||||
"""Test that small images are returned unchanged."""
|
||||
png_bytes = create_test_png(50, 50)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="small.png"))
|
||||
|
||||
result = optimize_image(img, target_size_bytes=1024 * 1024)
|
||||
|
||||
assert result is img
|
||||
|
||||
def test_optimize_returns_image_file(self) -> None:
|
||||
"""Test that optimize returns an ImageFile instance."""
|
||||
png_bytes = create_test_png(200, 200)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
|
||||
|
||||
result = optimize_image(img, target_size_bytes=100)
|
||||
|
||||
assert isinstance(result, ImageFile)
|
||||
|
||||
def test_optimize_respects_min_quality(self) -> None:
|
||||
"""Test that optimization stops at minimum quality."""
|
||||
png_bytes = create_test_png(100, 100)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
|
||||
|
||||
# Request impossibly small size - should stop at min quality
|
||||
result = optimize_image(img, target_size_bytes=10, min_quality=50)
|
||||
|
||||
assert isinstance(result, ImageFile)
|
||||
assert len(result.read()) > 10
|
||||
|
||||
|
||||
class TestChunkPdf:
|
||||
"""Tests for chunk_pdf function."""
|
||||
|
||||
def test_chunk_splits_large_pdf(self) -> None:
|
||||
"""Test that large PDFs are split into chunks."""
|
||||
pdf_bytes = create_test_pdf(num_pages=10)
|
||||
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="large.pdf"))
|
||||
|
||||
result = list(chunk_pdf(pdf, max_pages=3))
|
||||
|
||||
assert len(result) == 4
|
||||
assert all(isinstance(chunk, PDFFile) for chunk in result)
|
||||
|
||||
def test_no_chunk_if_within_limit(self) -> None:
|
||||
"""Test that small PDFs are returned unchanged."""
|
||||
pdf_bytes = create_test_pdf(num_pages=3)
|
||||
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="small.pdf"))
|
||||
|
||||
result = list(chunk_pdf(pdf, max_pages=5))
|
||||
|
||||
assert len(result) == 1
|
||||
assert result[0] is pdf
|
||||
|
||||
def test_chunk_filenames(self) -> None:
|
||||
"""Test that chunked files have indexed filenames."""
|
||||
pdf_bytes = create_test_pdf(num_pages=6)
|
||||
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="document.pdf"))
|
||||
|
||||
result = list(chunk_pdf(pdf, max_pages=2))
|
||||
|
||||
assert result[0].filename == "document_chunk_0.pdf"
|
||||
assert result[1].filename == "document_chunk_1.pdf"
|
||||
assert result[2].filename == "document_chunk_2.pdf"
|
||||
|
||||
def test_chunk_with_overlap(self) -> None:
|
||||
"""Test chunking with overlapping pages."""
|
||||
pdf_bytes = create_test_pdf(num_pages=10)
|
||||
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="doc.pdf"))
|
||||
|
||||
result = list(chunk_pdf(pdf, max_pages=4, overlap_pages=1))
|
||||
|
||||
# With overlap, we get more chunks
|
||||
assert len(result) >= 3
|
||||
|
||||
def test_chunk_page_counts(self) -> None:
|
||||
"""Test that each chunk has correct page count."""
|
||||
pdf_bytes = create_test_pdf(num_pages=7)
|
||||
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="doc.pdf"))
|
||||
|
||||
result = list(chunk_pdf(pdf, max_pages=3))
|
||||
|
||||
page_counts = [get_pdf_page_count(chunk) for chunk in result]
|
||||
assert page_counts == [3, 3, 1]
|
||||
|
||||
|
||||
class TestChunkText:
|
||||
"""Tests for chunk_text function."""
|
||||
|
||||
def test_chunk_splits_large_text(self) -> None:
|
||||
"""Test that large text files are split into chunks."""
|
||||
content = "Hello world. " * 100
|
||||
text = TextFile(source=content.encode(), filename="large.txt")
|
||||
|
||||
result = list(chunk_text(text, max_chars=200, overlap_chars=0))
|
||||
|
||||
assert len(result) > 1
|
||||
assert all(isinstance(chunk, TextFile) for chunk in result)
|
||||
|
||||
def test_no_chunk_if_within_limit(self) -> None:
|
||||
"""Test that small text files are returned unchanged."""
|
||||
content = "Short text"
|
||||
text = TextFile(source=content.encode(), filename="small.txt")
|
||||
|
||||
result = list(chunk_text(text, max_chars=1000, overlap_chars=0))
|
||||
|
||||
assert len(result) == 1
|
||||
assert result[0] is text
|
||||
|
||||
def test_chunk_filenames(self) -> None:
|
||||
"""Test that chunked files have indexed filenames."""
|
||||
content = "A" * 500
|
||||
text = TextFile(source=FileBytes(data=content.encode(), filename="data.txt"))
|
||||
|
||||
result = list(chunk_text(text, max_chars=200, overlap_chars=0))
|
||||
|
||||
assert result[0].filename == "data_chunk_0.txt"
|
||||
assert result[1].filename == "data_chunk_1.txt"
|
||||
assert len(result) == 3
|
||||
|
||||
def test_chunk_preserves_extension(self) -> None:
|
||||
"""Test that file extension is preserved in chunks."""
|
||||
content = "A" * 500
|
||||
text = TextFile(source=FileBytes(data=content.encode(), filename="script.py"))
|
||||
|
||||
result = list(chunk_text(text, max_chars=200, overlap_chars=0))
|
||||
|
||||
assert all(chunk.filename.endswith(".py") for chunk in result)
|
||||
|
||||
def test_chunk_prefers_newline_boundaries(self) -> None:
|
||||
"""Test that chunking prefers to split at newlines."""
|
||||
content = "Line one\nLine two\nLine three\nLine four\nLine five"
|
||||
text = TextFile(source=content.encode(), filename="lines.txt")
|
||||
|
||||
result = list(chunk_text(text, max_chars=25, overlap_chars=0, split_on_newlines=True))
|
||||
|
||||
# Should split at newline boundaries
|
||||
for chunk in result:
|
||||
chunk_text_content = chunk.read().decode()
|
||||
# Chunks should end at newlines (except possibly the last)
|
||||
if chunk != result[-1]:
|
||||
assert chunk_text_content.endswith("\n") or len(chunk_text_content) <= 25
|
||||
|
||||
def test_chunk_with_overlap(self) -> None:
|
||||
"""Test chunking with overlapping characters."""
|
||||
content = "ABCDEFGHIJ" * 10
|
||||
text = TextFile(source=content.encode(), filename="data.txt")
|
||||
|
||||
result = list(chunk_text(text, max_chars=30, overlap_chars=5))
|
||||
|
||||
# With overlap, chunks should share some content
|
||||
assert len(result) >= 3
|
||||
|
||||
def test_chunk_overlap_larger_than_max_chars(self) -> None:
|
||||
"""Test that overlap > max_chars doesn't cause infinite loop."""
|
||||
content = "A" * 100
|
||||
text = TextFile(source=content.encode(), filename="data.txt")
|
||||
|
||||
# overlap_chars > max_chars should still work (just with max overlap)
|
||||
result = list(chunk_text(text, max_chars=20, overlap_chars=50))
|
||||
|
||||
assert len(result) > 1
|
||||
# Should still complete without hanging
|
||||
|
||||
|
||||
class TestGetImageDimensions:
|
||||
"""Tests for get_image_dimensions function."""
|
||||
|
||||
def test_get_dimensions(self) -> None:
|
||||
"""Test getting image dimensions."""
|
||||
png_bytes = create_test_png(150, 100)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
|
||||
|
||||
dims = get_image_dimensions(img)
|
||||
|
||||
assert dims == (150, 100)
|
||||
|
||||
def test_returns_none_for_invalid_image(self) -> None:
|
||||
"""Test that None is returned for invalid image data."""
|
||||
img = ImageFile(source=FileBytes(data=b"not an image", filename="bad.png"))
|
||||
|
||||
dims = get_image_dimensions(img)
|
||||
|
||||
assert dims is None
|
||||
|
||||
def test_returns_none_without_pillow(self) -> None:
|
||||
"""Test that None is returned when Pillow is not installed."""
|
||||
png_bytes = create_test_png(100, 100)
|
||||
img = ImageFile(source=FileBytes(data=png_bytes, filename="test.png"))
|
||||
|
||||
with patch.dict("sys.modules", {"PIL": None}):
|
||||
# Can't easily test this without unloading module
|
||||
# Just verify the function handles the case gracefully
|
||||
pass
|
||||
|
||||
|
||||
class TestGetPdfPageCount:
|
||||
"""Tests for get_pdf_page_count function."""
|
||||
|
||||
def test_get_page_count(self) -> None:
|
||||
"""Test getting PDF page count."""
|
||||
pdf_bytes = create_test_pdf(num_pages=5)
|
||||
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="test.pdf"))
|
||||
|
||||
count = get_pdf_page_count(pdf)
|
||||
|
||||
assert count == 5
|
||||
|
||||
def test_single_page(self) -> None:
|
||||
"""Test page count for single page PDF."""
|
||||
pdf_bytes = create_test_pdf(num_pages=1)
|
||||
pdf = PDFFile(source=FileBytes(data=pdf_bytes, filename="single.pdf"))
|
||||
|
||||
count = get_pdf_page_count(pdf)
|
||||
|
||||
assert count == 1
|
||||
|
||||
def test_returns_none_for_invalid_pdf(self) -> None:
|
||||
"""Test that None is returned for invalid PDF data."""
|
||||
pdf = PDFFile(source=FileBytes(data=b"not a pdf", filename="bad.pdf"))
|
||||
|
||||
count = get_pdf_page_count(pdf)
|
||||
|
||||
assert count is None
|
||||
@@ -2,19 +2,19 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.utilities.files import FileBytes, ImageFile, PDFFile, TextFile
|
||||
from crewai.utilities.files.processing.constraints import (
|
||||
from crewai.files import FileBytes, ImageFile, PDFFile, TextFile
|
||||
from crewai.files.processing.constraints import (
|
||||
ANTHROPIC_CONSTRAINTS,
|
||||
ImageConstraints,
|
||||
PDFConstraints,
|
||||
ProviderConstraints,
|
||||
)
|
||||
from crewai.utilities.files.processing.exceptions import (
|
||||
from crewai.files.processing.exceptions import (
|
||||
FileTooLargeError,
|
||||
FileValidationError,
|
||||
UnsupportedFileTypeError,
|
||||
)
|
||||
from crewai.utilities.files.processing.validators import (
|
||||
from crewai.files.processing.validators import (
|
||||
validate_file,
|
||||
validate_image,
|
||||
validate_pdf,
|
||||
@@ -4,7 +4,7 @@ from datetime import datetime, timezone
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.utilities.files.resolved import (
|
||||
from crewai.files.resolved import (
|
||||
FileReference,
|
||||
InlineBase64,
|
||||
InlineBytes,
|
||||
@@ -2,14 +2,14 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.utilities.files import FileBytes, ImageFile
|
||||
from crewai.utilities.files.resolved import InlineBase64, InlineBytes
|
||||
from crewai.utilities.files.resolver import (
|
||||
from crewai.files import FileBytes, ImageFile
|
||||
from crewai.files.resolved import InlineBase64, InlineBytes
|
||||
from crewai.files.resolver import (
|
||||
FileResolver,
|
||||
FileResolverConfig,
|
||||
create_resolver,
|
||||
)
|
||||
from crewai.utilities.files.upload_cache import UploadCache
|
||||
from crewai.files.upload_cache import UploadCache
|
||||
|
||||
|
||||
# Minimal valid PNG
|
||||
@@ -4,8 +4,8 @@ from datetime import datetime, timedelta, timezone

import pytest

from crewai.utilities.files import FileBytes, ImageFile
from crewai.utilities.files.upload_cache import CachedUpload, UploadCache
from crewai.files import FileBytes, ImageFile
from crewai.files.upload_cache import CachedUpload, UploadCache


# Minimal valid PNG
lib/crewai/tests/fixtures/quarterly_report.csv (new file, 5 lines)
@@ -0,0 +1,5 @@
Quarter,Revenue ($M),Expenses ($M),Profit ($M)
Q1 2024,70,40,30
Q2 2024,75,42,33
Q3 2024,80,45,35
Q4 2024,75,44,31
BIN lib/crewai/tests/fixtures/revenue_chart.png (new binary file, 27 KiB; binary file not shown)
lib/crewai/tests/fixtures/review_guidelines.txt (new file, 10 lines)
@@ -0,0 +1,10 @@
Review Guidelines

1. Be clear and concise: Write feedback that is easy to understand.
2. Focus on behavior and outcomes: Describe what happened and why it matters.
3. Be specific: Provide examples to support your points.
4. Balance positives and improvements: Highlight strengths and areas to grow.
5. Be respectful and constructive: Assume positive intent and offer solutions.
6. Use objective criteria: Reference goals, metrics, or expectations where possible.
7. Suggest next steps: Recommend actionable ways to improve.
8. Proofread: Check tone, grammar, and clarity before submitting.
@@ -7,7 +7,7 @@ from unittest.mock import patch

import pytest

from crewai.llm import LLM
from crewai.utilities.files import ImageFile, PDFFile, TextFile
from crewai.files import ImageFile, PDFFile, TextFile

# Check for optional provider dependencies
try:

@@ -9,7 +9,7 @@ from pathlib import Path

import pytest

from crewai.llm import LLM
from crewai.utilities.files import File, ImageFile, PDFFile, TextFile
from crewai.files import File, ImageFile, PDFFile, TextFile


# Path to test data files
@@ -5,7 +5,7 @@ import base64

import pytest

from crewai.tools.agent_tools.read_file_tool import ReadFileTool
from crewai.utilities.files import ImageFile, PDFFile, TextFile
from crewai.files import ImageFile, PDFFile, TextFile


class TestReadFileTool:

@@ -13,7 +13,7 @@ from crewai.utilities.file_store import (
    store_files,
    store_task_files,
)
from crewai.utilities.files import TextFile
from crewai.files import TextFile


class TestFileStore:
@@ -6,7 +6,7 @@ from pathlib import Path

import pytest

from crewai.utilities.files import (
from crewai.files import (
    AudioFile,
    File,
    FileBytes,
@@ -20,7 +20,7 @@ from crewai.utilities.files import (
    normalize_input_files,
    wrap_file_source,
)
from crewai.utilities.files.file import detect_content_type
from crewai.files.file import detect_content_type


class TestDetectContentType:
@@ -34,7 +34,7 @@ class TestDetectContentType:
    def test_detect_json(self) -> None:
        """Test detection of JSON content."""
        result = detect_content_type(b'{"key": "value"}')
        assert result in ("text/plain", "application/json")
        assert result == "application/json"

    def test_detect_png(self) -> None:
        """Test detection of PNG content."""