Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-22 22:58:13 +00:00)

feat: add format hints to audio/video duration detection
@@ -104,6 +104,8 @@ file-processing = [
     "python-magic>=0.4.27",
     "aiocache~=0.12.3",
     "aiofiles~=24.1.0",
+    "tinytag~=1.10.0",
+    "av~=13.0.0",
 ]

@@ -32,6 +32,7 @@ from crewai.files.file import (
     FileSource,
     FileSourceInput,
     FileStream,
+    FileUrl,
     RawFileInput,
 )
 from crewai.files.processing import (

@@ -103,6 +104,7 @@ __all__ = [
     "FileStream",
     "FileTooLargeError",
     "FileUploader",
+    "FileUrl",
     "FileValidationError",
     "ImageConstraints",
     "ImageExtension",

@@ -16,6 +16,7 @@ from crewai.files.file import (
     FilePath,
     FileSource,
     FileStream,
+    FileUrl,
 )
 from crewai.files.utils import is_file_source

@@ -29,12 +30,14 @@ class _FileSourceCoercer:
     @classmethod
     def _coerce(cls, v: Any) -> FileSource:
         """Convert raw input to appropriate FileSource type."""
-        if isinstance(v, (FilePath, FileBytes, FileStream)):
+        if isinstance(v, (FilePath, FileBytes, FileStream, FileUrl)):
             return v
+        if isinstance(v, str):
+            if v.startswith(("http://", "https://")):
+                return FileUrl(url=v)
+            return FilePath(path=Path(v))
         if isinstance(v, Path):
             return FilePath(path=v)
-        if isinstance(v, str):
-            return FilePath(path=Path(v))
         if isinstance(v, bytes):
             return FileBytes(data=v)
         if isinstance(v, (IOBase, BinaryIO)):

@@ -203,7 +206,7 @@ class BaseFile(ABC, BaseModel):
             TypeError: If the underlying source doesn't support async read.
         """
         source = self._file_source
-        if isinstance(source, (FilePath, FileBytes, AsyncFileStream)):
+        if isinstance(source, (FilePath, FileBytes, AsyncFileStream, FileUrl)):
             return await source.aread()
         raise TypeError(f"{type(source).__name__} does not support async read")

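With the coercer and aread() changes above, a bare URL string becomes a first-class file source, and BaseFile.aread() can now await a FileUrl-backed source. A minimal sketch using FileUrl directly, mirroring the async tests added later in this commit (example.com is a placeholder; the bytes would be fetched over HTTP):

import asyncio

from crewai.files import FileUrl


async def main() -> None:
    source = FileUrl(url="https://example.com/image.png")
    data = await source.aread()  # fetched lazily on first await, then cached
    print(f"fetched {len(data)} bytes as {source.content_type}")


asyncio.run(main())
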
@@ -414,17 +414,84 @@ class AsyncFileStream(BaseModel):
             yield chunk


-FileSource = FilePath | FileBytes | FileStream | AsyncFileStream
+class FileUrl(BaseModel):
+    """File referenced by URL.
+
+    For providers that support URL references, the URL is passed directly.
+    For providers that don't, content is fetched on demand.
+
+    Attributes:
+        url: URL where the file can be accessed.
+        filename: Optional filename (extracted from URL if not provided).
+    """
+
+    url: str = Field(description="URL where the file can be accessed.")
+    filename: str | None = Field(default=None, description="Optional filename.")
+    _content_type: str | None = PrivateAttr(default=None)
+    _content: bytes | None = PrivateAttr(default=None)
+
+    @model_validator(mode="after")
+    def _validate_url(self) -> FileUrl:
+        """Validate URL format."""
+        if not self.url.startswith(("http://", "https://")):
+            raise ValueError(f"Invalid URL scheme: {self.url}")
+        return self
+
+    @property
+    def content_type(self) -> str:
+        """Get the content type, guessing from URL extension if not set."""
+        if self._content_type is None:
+            self._content_type = self._guess_content_type()
+        return self._content_type
+
+    def _guess_content_type(self) -> str:
+        """Guess content type from URL extension."""
+        from urllib.parse import urlparse
+
+        parsed = urlparse(self.url)
+        path = parsed.path
+        guessed, _ = mimetypes.guess_type(path)
+        return guessed or "application/octet-stream"
+
+    def read(self) -> bytes:
+        """Fetch content from URL (for providers that don't support URL references)."""
+        if self._content is None:
+            import httpx
+
+            response = httpx.get(self.url, follow_redirects=True)
+            response.raise_for_status()
+            self._content = response.content
+            if "content-type" in response.headers:
+                self._content_type = response.headers["content-type"].split(";")[0]
+        return self._content
+
+    async def aread(self) -> bytes:
+        """Async fetch content from URL."""
+        if self._content is None:
+            import httpx
+
+            async with httpx.AsyncClient() as client:
+                response = await client.get(self.url, follow_redirects=True)
+                response.raise_for_status()
+                self._content = response.content
+                if "content-type" in response.headers:
+                    self._content_type = response.headers["content-type"].split(";")[0]
+        return self._content
+
+
+FileSource = FilePath | FileBytes | FileStream | AsyncFileStream | FileUrl


 def _normalize_source(value: Any) -> FileSource:
     """Convert raw input to appropriate source type."""
-    if isinstance(value, (FilePath, FileBytes, FileStream, AsyncFileStream)):
+    if isinstance(value, (FilePath, FileBytes, FileStream, AsyncFileStream, FileUrl)):
         return value
+    if isinstance(value, str):
+        if value.startswith(("http://", "https://")):
+            return FileUrl(url=value)
+        return FilePath(path=Path(value))
     if isinstance(value, Path):
         return FilePath(path=value)
-    if isinstance(value, str):
-        return FilePath(path=Path(value))
     if isinstance(value, bytes):
         return FileBytes(data=value)
     if isinstance(value, AsyncReadable):

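A short sketch of the new FileUrl source and string routing, mirroring the unit tests added at the end of this commit (import paths follow those tests; the temporary file stands in for a real local file):

import tempfile

from crewai.files import FileUrl
from crewai.files.file import FilePath, _normalize_source

# The content type is guessed from the URL path; query parameters are ignored.
url = FileUrl(url="https://example.com/image.png?v=123")
assert url.content_type == "image/png"

# Non-http(s) schemes are rejected by the model validator.
try:
    FileUrl(url="ftp://example.com/file.txt")
except ValueError as exc:
    print(exc)  # message mentions "Invalid URL scheme"

# _normalize_source routes strings: URLs become FileUrl, paths become FilePath.
assert isinstance(_normalize_source("https://example.com/doc.pdf"), FileUrl)
with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
    assert isinstance(_normalize_source(tmp.name), FilePath)
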
@@ -148,6 +148,7 @@ class ProviderConstraints:
         general_max_size_bytes: Maximum size for any file type.
         supports_file_upload: Whether the provider supports file upload APIs.
         file_upload_threshold_bytes: Size threshold above which to use file upload.
+        supports_url_references: Whether the provider supports URL-based file references.
     """

     name: ProviderName

@@ -158,21 +159,24 @@ class ProviderConstraints:
     general_max_size_bytes: int | None = None
     supports_file_upload: bool = False
     file_upload_threshold_bytes: int | None = None
+    supports_url_references: bool = False


 ANTHROPIC_CONSTRAINTS = ProviderConstraints(
     name="anthropic",
     image=ImageConstraints(
-        max_size_bytes=5_242_880,
+        max_size_bytes=5_242_880,  # 5 MB per image
         max_width=8000,
         max_height=8000,
         max_images_per_request=100,
     ),
     pdf=PDFConstraints(
-        max_size_bytes=31_457_280,
+        max_size_bytes=33_554_432,  # 32 MB request size limit
         max_pages=100,
     ),
     supports_file_upload=True,
     file_upload_threshold_bytes=5_242_880,
+    supports_url_references=True,
 )

 OPENAI_CONSTRAINTS = ProviderConstraints(

@@ -181,8 +185,13 @@ OPENAI_CONSTRAINTS = ProviderConstraints(
         max_size_bytes=20_971_520,
         max_images_per_request=10,
     ),
+    audio=AudioConstraints(
+        max_size_bytes=26_214_400,  # 25 MB - whisper limit
+        max_duration_seconds=1500,  # 25 minutes, arbitrary-ish, this is from the transcriptions limit
+    ),
     supports_file_upload=True,
     file_upload_threshold_bytes=5_242_880,
+    supports_url_references=True,
 )

 GEMINI_CONSTRAINTS = ProviderConstraints(

@@ -196,14 +205,17 @@ GEMINI_CONSTRAINTS = ProviderConstraints(
     ),
     audio=AudioConstraints(
         max_size_bytes=104_857_600,
+        max_duration_seconds=34200,  # 9.5 hours
         supported_formats=GEMINI_AUDIO_FORMATS,
     ),
     video=VideoConstraints(
         max_size_bytes=2_147_483_648,
+        max_duration_seconds=3600,  # 1 hour at default resolution
         supported_formats=GEMINI_VIDEO_FORMATS,
     ),
     supports_file_upload=True,
     file_upload_threshold_bytes=20_971_520,
+    supports_url_references=True,
 )

 BEDROCK_CONSTRAINTS = ProviderConstraints(

@@ -225,6 +237,11 @@ AZURE_CONSTRAINTS = ProviderConstraints(
         max_size_bytes=20_971_520,
         max_images_per_request=10,
     ),
+    audio=AudioConstraints(
+        max_size_bytes=26_214_400,  # 25 MB - same as openai
+        max_duration_seconds=1500,  # 25 minutes - same as openai
+    ),
+    supports_url_references=True,
 )

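To make the new flag concrete, a small sketch that reads supports_url_references off a constraints object (the import path is taken from the tests in this commit; the actual decision logic lives in the resolver changes further down):

from crewai.files.processing.constraints import ANTHROPIC_CONSTRAINTS

if ANTHROPIC_CONSTRAINTS.supports_url_references:
    # FileUrl sources can be passed straight through as URL references
    # instead of being downloaded and inlined or uploaded.
    print("URL passthrough enabled")
print(ANTHROPIC_CONSTRAINTS.file_upload_threshold_bytes)  # 5_242_880 (5 MB)
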
@@ -43,7 +43,7 @@ def _get_image_dimensions(content: bytes) -> tuple[int, int] | None:

         with Image.open(io.BytesIO(content)) as img:
             width, height = img.size
-            return (int(width), int(height))
+            return int(width), int(height)
     except ImportError:
         logger.warning(
             "Pillow not installed - cannot validate image dimensions. "

@@ -74,6 +74,81 @@ def _get_pdf_page_count(content: bytes) -> int | None:
     return None


+def _get_audio_duration(content: bytes, filename: str | None = None) -> float | None:
+    """Get audio duration in seconds using tinytag if available.
+
+    Args:
+        content: Raw audio bytes.
+        filename: Optional filename for format detection hint.
+
+    Returns:
+        Duration in seconds or None if tinytag unavailable.
+    """
+    try:
+        from tinytag import TinyTag  # type: ignore[import-untyped]
+    except ImportError:
+        logger.warning(
+            "tinytag not installed - cannot validate audio duration. "
+            "Install with: pip install tinytag"
+        )
+        return None
+
+    try:
+        tag = TinyTag.get(file_obj=io.BytesIO(content), filename=filename)
+        duration: float | None = tag.duration
+        return duration
+    except Exception as e:
+        logger.debug(f"Could not determine audio duration: {e}")
+        return None
+
+
+_VIDEO_FORMAT_MAP: dict[str, str] = {
+    "video/mp4": "mp4",
+    "video/webm": "webm",
+    "video/x-matroska": "matroska",
+    "video/quicktime": "mov",
+    "video/x-msvideo": "avi",
+    "video/x-flv": "flv",
+}
+
+
+def _get_video_duration(
+    content: bytes, content_type: str | None = None
+) -> float | None:
+    """Get video duration in seconds using av if available.
+
+    Args:
+        content: Raw video bytes.
+        content_type: Optional MIME type for format detection hint.
+
+    Returns:
+        Duration in seconds or None if av unavailable.
+    """
+    try:
+        import av
+    except ImportError:
+        logger.warning(
+            "av (PyAV) not installed - cannot validate video duration. "
+            "Install with: pip install av"
+        )
+        return None
+
+    format_hint = _VIDEO_FORMAT_MAP.get(content_type) if content_type else None
+
+    try:
+        container = av.open(io.BytesIO(content), format=format_hint)  # type: ignore[attr-defined]
+        try:
+            duration = getattr(container, "duration", None)
+            if duration is None:
+                return None
+            return float(duration) / 1_000_000
+        finally:
+            container.close()
+    except Exception as e:
+        logger.debug(f"Could not determine video duration: {e}")
+        return None
+
+
 def _format_size(size_bytes: int) -> str:
     """Format byte size to human-readable string."""
     if size_bytes >= 1024 * 1024 * 1024:

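A usage sketch of the two probes and their new format hints, which are the point of this commit. The helpers are private, so calling them directly is for illustration only, and the local file names are hypothetical:

from pathlib import Path

from crewai.files.processing.validators import _get_audio_duration, _get_video_duration

audio_bytes = Path("clip.mp3").read_bytes()   # hypothetical local files
video_bytes = Path("clip.webm").read_bytes()

# The filename helps tinytag pick a parser; the MIME type is mapped through
# _VIDEO_FORMAT_MAP so PyAV can choose the right demuxer.
print(_get_audio_duration(audio_bytes, filename="clip.mp3"))
print(_get_video_duration(video_bytes, content_type="video/webm"))

# Both return None when the optional dependency is missing or parsing fails.
print(_get_audio_duration(b"not audio"))  # None
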
@@ -273,14 +348,17 @@ def validate_audio(

     Raises:
         FileTooLargeError: If the file exceeds size limits.
+        FileValidationError: If the file exceeds duration limits.
         UnsupportedFileTypeError: If the format is not supported.
     """
     errors: list[str] = []
-    file_size = len(file.read())
+    content = file.read()
+    file_size = len(content)
+    filename = file.filename

     _validate_size(
         "Audio",
-        file.filename,
+        filename,
         file_size,
         constraints.max_size_bytes,
         errors,

@@ -288,13 +366,24 @@
     )
     _validate_format(
         "Audio",
-        file.filename,
+        filename,
         file.content_type,
         constraints.supported_formats,
         errors,
         raise_on_error,
     )

+    if constraints.max_duration_seconds is not None:
+        duration = _get_audio_duration(content, filename)
+        if duration is not None and duration > constraints.max_duration_seconds:
+            msg = (
+                f"Audio '{filename}' duration ({duration:.1f}s) exceeds "
+                f"maximum ({constraints.max_duration_seconds}s)"
+            )
+            errors.append(msg)
+            if raise_on_error:
+                raise FileValidationError(msg, file_name=filename)
+
     return errors


@@ -316,14 +405,17 @@ def validate_video(

     Raises:
         FileTooLargeError: If the file exceeds size limits.
+        FileValidationError: If the file exceeds duration limits.
         UnsupportedFileTypeError: If the format is not supported.
     """
     errors: list[str] = []
-    file_size = len(file.read())
+    content = file.read()
+    file_size = len(content)
+    filename = file.filename

     _validate_size(
         "Video",
-        file.filename,
+        filename,
         file_size,
         constraints.max_size_bytes,
         errors,

@@ -331,13 +423,24 @@
     )
     _validate_format(
         "Video",
-        file.filename,
+        filename,
         file.content_type,
         constraints.supported_formats,
         errors,
         raise_on_error,
     )

+    if constraints.max_duration_seconds is not None:
+        duration = _get_video_duration(content, file.content_type)
+        if duration is not None and duration > constraints.max_duration_seconds:
+            msg = (
+                f"Video '{filename}' duration ({duration:.1f}s) exceeds "
+                f"maximum ({constraints.max_duration_seconds}s)"
+            )
+            errors.append(msg)
+            if raise_on_error:
+                raise FileValidationError(msg, file_name=filename)
+
     return errors

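Putting the new duration check together, a sketch modeled on the tests added later in this commit; the constraint values are illustrative rather than any provider's real limits:

from crewai.files import AudioFile, FileBytes
from crewai.files.processing.constraints import AudioConstraints
from crewai.files.processing.validators import validate_audio

constraints = AudioConstraints(
    max_size_bytes=25 * 1024 * 1024,
    max_duration_seconds=1500,
    supported_formats=("audio/mpeg", "application/octet-stream"),
)
file = AudioFile(source=FileBytes(data=b"\x00" * 100, filename="meeting.mp3"))

# With raise_on_error=False the validator collects messages instead of raising.
errors = validate_audio(file, constraints, raise_on_error=False)
print(errors)  # [] when size, format, and duration are all within limits
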
@@ -8,6 +8,7 @@ import logging
|
||||
|
||||
from crewai.files.constants import UPLOAD_MAX_RETRIES, UPLOAD_RETRY_DELAY_BASE
|
||||
from crewai.files.content_types import FileInput
|
||||
from crewai.files.file import FileUrl
|
||||
from crewai.files.metrics import measure_operation
|
||||
from crewai.files.processing.constraints import (
|
||||
AudioConstraints,
|
||||
@@ -22,10 +23,12 @@ from crewai.files.resolved import (
|
||||
InlineBase64,
|
||||
InlineBytes,
|
||||
ResolvedFile,
|
||||
UrlReference,
|
||||
)
|
||||
from crewai.files.upload_cache import CachedUpload, UploadCache
|
||||
from crewai.files.uploaders import UploadResult, get_uploader
|
||||
from crewai.files.uploaders.base import FileUploader
|
||||
from crewai.files.uploaders.factory import ProviderType
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -102,7 +105,49 @@ class FileResolver:
|
||||
content_type=file.content_type,
|
||||
)
|
||||
|
||||
def resolve(self, file: FileInput, provider: str) -> ResolvedFile:
|
||||
@staticmethod
|
||||
def _is_url_source(file: FileInput) -> bool:
|
||||
"""Check if file source is a URL.
|
||||
|
||||
Args:
|
||||
file: The file to check.
|
||||
|
||||
Returns:
|
||||
True if the file source is a FileUrl, False otherwise.
|
||||
"""
|
||||
return isinstance(file._file_source, FileUrl)
|
||||
|
||||
@staticmethod
|
||||
def _supports_url(constraints: ProviderConstraints | None) -> bool:
|
||||
"""Check if provider supports URL references.
|
||||
|
||||
Args:
|
||||
constraints: Provider constraints.
|
||||
|
||||
Returns:
|
||||
True if the provider supports URL references, False otherwise.
|
||||
"""
|
||||
return constraints is not None and constraints.supports_url_references
|
||||
|
||||
@staticmethod
|
||||
def _resolve_as_url(file: FileInput) -> UrlReference:
|
||||
"""Resolve a URL source as UrlReference.
|
||||
|
||||
Args:
|
||||
file: The file with URL source.
|
||||
|
||||
Returns:
|
||||
UrlReference with the URL and content type.
|
||||
"""
|
||||
source = file._file_source
|
||||
if not isinstance(source, FileUrl):
|
||||
raise TypeError(f"Expected FileUrl source, got {type(source).__name__}")
|
||||
return UrlReference(
|
||||
content_type=file.content_type,
|
||||
url=source.url,
|
||||
)
|
||||
|
||||
def resolve(self, file: FileInput, provider: ProviderType) -> ResolvedFile:
|
||||
"""Resolve a file to its delivery format for a provider.
|
||||
|
||||
Args:
|
||||
@@ -112,25 +157,26 @@ class FileResolver:
|
||||
Returns:
|
||||
ResolvedFile representing the appropriate delivery format.
|
||||
"""
|
||||
provider_lower = provider.lower()
|
||||
constraints = get_constraints_for_provider(provider)
|
||||
|
||||
if self._is_url_source(file) and self._supports_url(constraints):
|
||||
return self._resolve_as_url(file)
|
||||
|
||||
context = self._build_file_context(file)
|
||||
|
||||
should_upload = self._should_upload(
|
||||
file, provider_lower, constraints, context.size
|
||||
)
|
||||
should_upload = self._should_upload(file, provider, constraints, context.size)
|
||||
|
||||
if should_upload:
|
||||
resolved = self._resolve_via_upload(file, provider_lower, context)
|
||||
resolved = self._resolve_via_upload(file, provider, context)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return self._resolve_inline(file, provider_lower, context)
|
||||
return self._resolve_inline(file, provider, context)
|
||||
|
||||
def resolve_files(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
provider: str,
|
||||
provider: ProviderType,
|
||||
) -> dict[str, ResolvedFile]:
|
||||
"""Resolve multiple files for a provider.
|
||||
|
||||
@@ -220,7 +266,7 @@ class FileResolver:
|
||||
def _resolve_via_upload(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
provider: ProviderType,
|
||||
context: FileContext,
|
||||
) -> ResolvedFile | None:
|
||||
"""Resolve a file by uploading it.
|
||||
@@ -367,7 +413,7 @@ class FileResolver:
|
||||
data=encoded,
|
||||
)
|
||||
|
||||
async def aresolve(self, file: FileInput, provider: str) -> ResolvedFile:
|
||||
async def aresolve(self, file: FileInput, provider: ProviderType) -> ResolvedFile:
|
||||
"""Async resolve a file to its delivery format for a provider.
|
||||
|
||||
Args:
|
||||
@@ -377,25 +423,26 @@ class FileResolver:
|
||||
Returns:
|
||||
ResolvedFile representing the appropriate delivery format.
|
||||
"""
|
||||
provider_lower = provider.lower()
|
||||
constraints = get_constraints_for_provider(provider)
|
||||
|
||||
if self._is_url_source(file) and self._supports_url(constraints):
|
||||
return self._resolve_as_url(file)
|
||||
|
||||
context = self._build_file_context(file)
|
||||
|
||||
should_upload = self._should_upload(
|
||||
file, provider_lower, constraints, context.size
|
||||
)
|
||||
should_upload = self._should_upload(file, provider, constraints, context.size)
|
||||
|
||||
if should_upload:
|
||||
resolved = await self._aresolve_via_upload(file, provider_lower, context)
|
||||
resolved = await self._aresolve_via_upload(file, provider, context)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return self._resolve_inline(file, provider_lower, context)
|
||||
return self._resolve_inline(file, provider, context)
|
||||
|
||||
async def aresolve_files(
|
||||
self,
|
||||
files: dict[str, FileInput],
|
||||
provider: str,
|
||||
provider: ProviderType,
|
||||
max_concurrency: int = 10,
|
||||
) -> dict[str, ResolvedFile]:
|
||||
"""Async resolve multiple files in parallel.
|
||||
@@ -434,7 +481,7 @@ class FileResolver:
|
||||
async def _aresolve_via_upload(
|
||||
self,
|
||||
file: FileInput,
|
||||
provider: str,
|
||||
provider: ProviderType,
|
||||
context: FileContext,
|
||||
) -> ResolvedFile | None:
|
||||
"""Async resolve a file by uploading it.
|
||||
@@ -552,7 +599,7 @@ class FileResolver:
|
||||
)
|
||||
return None
|
||||
|
||||
def _get_uploader(self, provider: str) -> FileUploader | None:
|
||||
def _get_uploader(self, provider: ProviderType) -> FileUploader | None:
|
||||
"""Get or create an uploader for a provider.
|
||||
|
||||
Args:
|
||||
|
||||
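The resolver changes above short-circuit URL sources for providers whose constraints allow it. A sketch mirroring the new resolver tests (import paths follow those tests):

from crewai.files import FileUrl, ImageFile
from crewai.files.resolved import UrlReference
from crewai.files.resolver import FileResolver

resolver = FileResolver()
image = ImageFile(source=FileUrl(url="https://example.com/image.png"))

# Anthropic advertises supports_url_references=True, so the URL passes through.
resolved = resolver.resolve(image, "anthropic")
assert isinstance(resolved, UrlReference)
assert resolved.url == "https://example.com/image.png"

# For a provider without URL support (Bedrock in the tests), the bytes are
# fetched first and the file is inlined or uploaded instead.
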
@@ -15,9 +15,9 @@ if TYPE_CHECKING:

 def is_file_source(v: object) -> TypeIs[FileSource]:
     """Type guard to narrow input to FileSource."""
-    from crewai.files.file import FileBytes, FilePath, FileStream
+    from crewai.files.file import FileBytes, FilePath, FileStream, FileUrl

-    return isinstance(v, (FilePath, FileBytes, FileStream))
+    return isinstance(v, (FilePath, FileBytes, FileStream, FileUrl))


 def wrap_file_source(source: FileSource) -> FileInput:

@@ -62,7 +62,7 @@ def normalize_input_files(
         Dictionary mapping names to FileInput wrappers.
     """
     from crewai.files.content_types import BaseFile
-    from crewai.files.file import FileBytes, FilePath, FileStream
+    from crewai.files.file import FileBytes, FilePath, FileStream, FileUrl

     result: dict[str, FileInput] = {}

@@ -74,13 +74,16 @@
             result[name] = item
             continue

-        file_source: FilePath | FileBytes | FileStream
-        if isinstance(item, (FilePath, FileBytes, FileStream)):
+        file_source: FilePath | FileBytes | FileStream | FileUrl
+        if isinstance(item, (FilePath, FileBytes, FileStream, FileUrl)):
             file_source = item
         elif isinstance(item, Path):
             file_source = FilePath(path=item)
         elif isinstance(item, str):
-            file_source = FilePath(path=Path(item))
+            if item.startswith(("http://", "https://")):
+                file_source = FileUrl(url=item)
+            else:
+                file_source = FilePath(path=Path(item))
         elif isinstance(item, (bytes, memoryview)):
             file_source = FileBytes(data=bytes(item))
         else:

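The type-guard and normalization changes are small but load-bearing: anything that accepts a FileSource now also accepts a FileUrl, and URL strings supplied as input files are wrapped automatically. A quick sketch, grounded in the imports shown above:

from crewai.files import FileUrl
from crewai.files.utils import is_file_source

source = FileUrl(url="https://example.com/report.pdf")

# FileUrl now counts as a first-class file source...
assert is_file_source(source)

# ...and normalize_input_files routes "https://..." strings to FileUrl the same
# way, so URL inputs no longer need to be downloaded to disk first.
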
@@ -1,13 +1,17 @@
|
||||
"""Tests for file validators."""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.files import FileBytes, ImageFile, PDFFile, TextFile
|
||||
from crewai.files import AudioFile, FileBytes, ImageFile, PDFFile, TextFile, VideoFile
|
||||
from crewai.files.processing.constraints import (
|
||||
ANTHROPIC_CONSTRAINTS,
|
||||
AudioConstraints,
|
||||
ImageConstraints,
|
||||
PDFConstraints,
|
||||
ProviderConstraints,
|
||||
VideoConstraints,
|
||||
)
|
||||
from crewai.files.processing.exceptions import (
|
||||
FileTooLargeError,
|
||||
@@ -15,10 +19,14 @@ from crewai.files.processing.exceptions import (
|
||||
UnsupportedFileTypeError,
|
||||
)
|
||||
from crewai.files.processing.validators import (
|
||||
_get_audio_duration,
|
||||
_get_video_duration,
|
||||
validate_audio,
|
||||
validate_file,
|
||||
validate_image,
|
||||
validate_pdf,
|
||||
validate_text,
|
||||
validate_video,
|
||||
)
|
||||
|
||||
|
||||
@@ -206,3 +214,281 @@ class TestValidateFile:
|
||||
validate_file(file, constraints)
|
||||
|
||||
assert "does not support PDFs" in str(exc_info.value)
|
||||
|
||||
|
||||
# Minimal audio bytes for testing (not a valid audio file, used for mocked tests)
|
||||
MINIMAL_AUDIO = b"\x00" * 100
|
||||
|
||||
# Minimal video bytes for testing (not a valid video file, used for mocked tests)
|
||||
MINIMAL_VIDEO = b"\x00" * 100
|
||||
|
||||
# Fallback content type when python-magic cannot detect
|
||||
FALLBACK_CONTENT_TYPE = "application/octet-stream"
|
||||
|
||||
|
||||
class TestValidateAudio:
|
||||
"""Tests for validate_audio function and audio duration validation."""
|
||||
|
||||
def test_validate_valid_audio(self):
|
||||
"""Test validating a valid audio file within constraints."""
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
|
||||
def test_validate_audio_too_large(self):
|
||||
"""Test validating an audio file that exceeds size limit."""
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10, # Very small limit
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
with pytest.raises(FileTooLargeError) as exc_info:
|
||||
validate_audio(file, constraints)
|
||||
|
||||
assert "exceeds" in str(exc_info.value)
|
||||
assert exc_info.value.file_name == "test.mp3"
|
||||
|
||||
def test_validate_audio_unsupported_format(self):
|
||||
"""Test validating an audio file with unsupported format."""
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
supported_formats=("audio/wav",), # Only WAV
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
with pytest.raises(UnsupportedFileTypeError) as exc_info:
|
||||
validate_audio(file, constraints)
|
||||
|
||||
assert "not supported" in str(exc_info.value)
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_passes(self, mock_get_duration):
|
||||
"""Test validating audio when duration is under limit."""
|
||||
mock_get_duration.return_value = 30.0
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
mock_get_duration.assert_called_once()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_fails(self, mock_get_duration):
|
||||
"""Test validating audio when duration exceeds limit."""
|
||||
mock_get_duration.return_value = 120.5
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
with pytest.raises(FileValidationError) as exc_info:
|
||||
validate_audio(file, constraints)
|
||||
|
||||
assert "duration" in str(exc_info.value).lower()
|
||||
assert "120.5s" in str(exc_info.value)
|
||||
assert "60s" in str(exc_info.value)
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_no_raise(self, mock_get_duration):
|
||||
"""Test audio duration validation with raise_on_error=False."""
|
||||
mock_get_duration.return_value = 120.5
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 1
|
||||
assert "duration" in errors[0].lower()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_none_skips(self, mock_get_duration):
|
||||
"""Test that duration validation is skipped when max_duration_seconds is None."""
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=None,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
mock_get_duration.assert_not_called()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_audio_duration")
|
||||
def test_validate_audio_duration_detection_returns_none(self, mock_get_duration):
|
||||
"""Test that validation passes when duration detection returns None."""
|
||||
mock_get_duration.return_value = None
|
||||
constraints = AudioConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3"))
|
||||
|
||||
errors = validate_audio(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
|
||||
|
||||
class TestValidateVideo:
|
||||
"""Tests for validate_video function and video duration validation."""
|
||||
|
||||
def test_validate_valid_video(self):
|
||||
"""Test validating a valid video file within constraints."""
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
errors = validate_video(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
|
||||
def test_validate_video_too_large(self):
|
||||
"""Test validating a video file that exceeds size limit."""
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10, # Very small limit
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
with pytest.raises(FileTooLargeError) as exc_info:
|
||||
validate_video(file, constraints)
|
||||
|
||||
assert "exceeds" in str(exc_info.value)
|
||||
assert exc_info.value.file_name == "test.mp4"
|
||||
|
||||
def test_validate_video_unsupported_format(self):
|
||||
"""Test validating a video file with unsupported format."""
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
supported_formats=("video/webm",), # Only WebM
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
with pytest.raises(UnsupportedFileTypeError) as exc_info:
|
||||
validate_video(file, constraints)
|
||||
|
||||
assert "not supported" in str(exc_info.value)
|
||||
|
||||
@patch("crewai.files.processing.validators._get_video_duration")
|
||||
def test_validate_video_duration_passes(self, mock_get_duration):
|
||||
"""Test validating video when duration is under limit."""
|
||||
mock_get_duration.return_value = 30.0
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
errors = validate_video(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
mock_get_duration.assert_called_once()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_video_duration")
|
||||
def test_validate_video_duration_fails(self, mock_get_duration):
|
||||
"""Test validating video when duration exceeds limit."""
|
||||
mock_get_duration.return_value = 180.0
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
with pytest.raises(FileValidationError) as exc_info:
|
||||
validate_video(file, constraints)
|
||||
|
||||
assert "duration" in str(exc_info.value).lower()
|
||||
assert "180.0s" in str(exc_info.value)
|
||||
assert "60s" in str(exc_info.value)
|
||||
|
||||
@patch("crewai.files.processing.validators._get_video_duration")
|
||||
def test_validate_video_duration_no_raise(self, mock_get_duration):
|
||||
"""Test video duration validation with raise_on_error=False."""
|
||||
mock_get_duration.return_value = 180.0
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
errors = validate_video(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 1
|
||||
assert "duration" in errors[0].lower()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_video_duration")
|
||||
def test_validate_video_duration_none_skips(self, mock_get_duration):
|
||||
"""Test that duration validation is skipped when max_duration_seconds is None."""
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=None,
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
errors = validate_video(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
mock_get_duration.assert_not_called()
|
||||
|
||||
@patch("crewai.files.processing.validators._get_video_duration")
|
||||
def test_validate_video_duration_detection_returns_none(self, mock_get_duration):
|
||||
"""Test that validation passes when duration detection returns None."""
|
||||
mock_get_duration.return_value = None
|
||||
constraints = VideoConstraints(
|
||||
max_size_bytes=10 * 1024 * 1024,
|
||||
max_duration_seconds=60,
|
||||
supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE),
|
||||
)
|
||||
file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4"))
|
||||
|
||||
errors = validate_video(file, constraints, raise_on_error=False)
|
||||
|
||||
assert len(errors) == 0
|
||||
|
||||
|
||||
class TestGetAudioDuration:
|
||||
"""Tests for _get_audio_duration helper function."""
|
||||
|
||||
def test_get_audio_duration_corrupt_file(self):
|
||||
"""Test handling of corrupt audio data."""
|
||||
corrupt_data = b"not valid audio data at all"
|
||||
result = _get_audio_duration(corrupt_data)
|
||||
|
||||
assert result is None
|
||||
|
||||
|
||||
class TestGetVideoDuration:
|
||||
"""Tests for _get_video_duration helper function."""
|
||||
|
||||
def test_get_video_duration_corrupt_file(self):
|
||||
"""Test handling of corrupt video data."""
|
||||
corrupt_data = b"not valid video data at all"
|
||||
result = _get_video_duration(corrupt_data)
|
||||
|
||||
assert result is None
|
||||
|
||||
lib/crewai/tests/files/test_file_url.py (new file, 312 lines)

@@ -0,0 +1,312 @@
|
||||
"""Tests for FileUrl source type and URL resolution."""
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.files import FileBytes, FileUrl, ImageFile
|
||||
from crewai.files.file import _normalize_source, FilePath
|
||||
from crewai.files.resolved import InlineBase64, UrlReference
|
||||
from crewai.files.resolver import FileResolver
|
||||
|
||||
|
||||
class TestFileUrl:
|
||||
"""Tests for FileUrl source type."""
|
||||
|
||||
def test_create_file_url(self):
|
||||
"""Test creating FileUrl with valid URL."""
|
||||
url = FileUrl(url="https://example.com/image.png")
|
||||
|
||||
assert url.url == "https://example.com/image.png"
|
||||
assert url.filename is None
|
||||
|
||||
def test_create_file_url_with_filename(self):
|
||||
"""Test creating FileUrl with custom filename."""
|
||||
url = FileUrl(url="https://example.com/image.png", filename="custom.png")
|
||||
|
||||
assert url.url == "https://example.com/image.png"
|
||||
assert url.filename == "custom.png"
|
||||
|
||||
def test_invalid_url_scheme_raises(self):
|
||||
"""Test that non-http(s) URLs raise ValueError."""
|
||||
with pytest.raises(ValueError, match="Invalid URL scheme"):
|
||||
FileUrl(url="ftp://example.com/file.txt")
|
||||
|
||||
def test_invalid_url_scheme_file_raises(self):
|
||||
"""Test that file:// URLs raise ValueError."""
|
||||
with pytest.raises(ValueError, match="Invalid URL scheme"):
|
||||
FileUrl(url="file:///path/to/file.txt")
|
||||
|
||||
def test_http_url_valid(self):
|
||||
"""Test that HTTP URLs are valid."""
|
||||
url = FileUrl(url="http://example.com/image.jpg")
|
||||
|
||||
assert url.url == "http://example.com/image.jpg"
|
||||
|
||||
def test_https_url_valid(self):
|
||||
"""Test that HTTPS URLs are valid."""
|
||||
url = FileUrl(url="https://example.com/image.jpg")
|
||||
|
||||
assert url.url == "https://example.com/image.jpg"
|
||||
|
||||
def test_content_type_guessing_png(self):
|
||||
"""Test content type guessing for PNG files."""
|
||||
url = FileUrl(url="https://example.com/image.png")
|
||||
|
||||
assert url.content_type == "image/png"
|
||||
|
||||
def test_content_type_guessing_jpeg(self):
|
||||
"""Test content type guessing for JPEG files."""
|
||||
url = FileUrl(url="https://example.com/photo.jpg")
|
||||
|
||||
assert url.content_type == "image/jpeg"
|
||||
|
||||
def test_content_type_guessing_pdf(self):
|
||||
"""Test content type guessing for PDF files."""
|
||||
url = FileUrl(url="https://example.com/document.pdf")
|
||||
|
||||
assert url.content_type == "application/pdf"
|
||||
|
||||
def test_content_type_guessing_with_query_params(self):
|
||||
"""Test content type guessing with URL query parameters."""
|
||||
url = FileUrl(url="https://example.com/image.png?v=123&token=abc")
|
||||
|
||||
assert url.content_type == "image/png"
|
||||
|
||||
def test_content_type_fallback_unknown(self):
|
||||
"""Test content type falls back to octet-stream for unknown extensions."""
|
||||
url = FileUrl(url="https://example.com/file.unknownext123")
|
||||
|
||||
assert url.content_type == "application/octet-stream"
|
||||
|
||||
def test_content_type_no_extension(self):
|
||||
"""Test content type for URL without extension."""
|
||||
url = FileUrl(url="https://example.com/file")
|
||||
|
||||
assert url.content_type == "application/octet-stream"
|
||||
|
||||
def test_read_fetches_content(self):
|
||||
"""Test that read() fetches content from URL."""
|
||||
url = FileUrl(url="https://example.com/image.png")
|
||||
mock_response = MagicMock()
|
||||
mock_response.content = b"fake image content"
|
||||
mock_response.headers = {"content-type": "image/png"}
|
||||
|
||||
with patch("httpx.get", return_value=mock_response) as mock_get:
|
||||
content = url.read()
|
||||
|
||||
mock_get.assert_called_once_with(
|
||||
"https://example.com/image.png", follow_redirects=True
|
||||
)
|
||||
assert content == b"fake image content"
|
||||
|
||||
def test_read_caches_content(self):
|
||||
"""Test that read() caches content."""
|
||||
url = FileUrl(url="https://example.com/image.png")
|
||||
mock_response = MagicMock()
|
||||
mock_response.content = b"fake content"
|
||||
mock_response.headers = {}
|
||||
|
||||
with patch("httpx.get", return_value=mock_response) as mock_get:
|
||||
content1 = url.read()
|
||||
content2 = url.read()
|
||||
|
||||
mock_get.assert_called_once()
|
||||
assert content1 == content2
|
||||
|
||||
def test_read_updates_content_type_from_response(self):
|
||||
"""Test that read() updates content type from response headers."""
|
||||
url = FileUrl(url="https://example.com/file")
|
||||
mock_response = MagicMock()
|
||||
mock_response.content = b"fake content"
|
||||
mock_response.headers = {"content-type": "image/webp; charset=utf-8"}
|
||||
|
||||
with patch("httpx.get", return_value=mock_response):
|
||||
url.read()
|
||||
|
||||
assert url.content_type == "image/webp"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_aread_fetches_content(self):
|
||||
"""Test that aread() fetches content from URL asynchronously."""
|
||||
url = FileUrl(url="https://example.com/image.png")
|
||||
mock_response = MagicMock()
|
||||
mock_response.content = b"async fake content"
|
||||
mock_response.headers = {"content-type": "image/png"}
|
||||
mock_response.raise_for_status = MagicMock()
|
||||
|
||||
mock_client = MagicMock()
|
||||
mock_client.get = AsyncMock(return_value=mock_response)
|
||||
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||
mock_client.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
with patch("httpx.AsyncClient", return_value=mock_client):
|
||||
content = await url.aread()
|
||||
|
||||
assert content == b"async fake content"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_aread_caches_content(self):
|
||||
"""Test that aread() caches content."""
|
||||
url = FileUrl(url="https://example.com/image.png")
|
||||
mock_response = MagicMock()
|
||||
mock_response.content = b"cached content"
|
||||
mock_response.headers = {}
|
||||
mock_response.raise_for_status = MagicMock()
|
||||
|
||||
mock_client = MagicMock()
|
||||
mock_client.get = AsyncMock(return_value=mock_response)
|
||||
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||
mock_client.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
with patch("httpx.AsyncClient", return_value=mock_client):
|
||||
content1 = await url.aread()
|
||||
content2 = await url.aread()
|
||||
|
||||
mock_client.get.assert_called_once()
|
||||
assert content1 == content2
|
||||
|
||||
|
||||
class TestNormalizeSource:
|
||||
"""Tests for _normalize_source with URL detection."""
|
||||
|
||||
def test_normalize_url_string(self):
|
||||
"""Test that URL strings are converted to FileUrl."""
|
||||
result = _normalize_source("https://example.com/image.png")
|
||||
|
||||
assert isinstance(result, FileUrl)
|
||||
assert result.url == "https://example.com/image.png"
|
||||
|
||||
def test_normalize_http_url_string(self):
|
||||
"""Test that HTTP URL strings are converted to FileUrl."""
|
||||
result = _normalize_source("http://example.com/file.pdf")
|
||||
|
||||
assert isinstance(result, FileUrl)
|
||||
assert result.url == "http://example.com/file.pdf"
|
||||
|
||||
def test_normalize_file_path_string(self, tmp_path):
|
||||
"""Test that file path strings are converted to FilePath."""
|
||||
test_file = tmp_path / "test.png"
|
||||
test_file.write_bytes(b"test content")
|
||||
|
||||
result = _normalize_source(str(test_file))
|
||||
|
||||
assert isinstance(result, FilePath)
|
||||
|
||||
def test_normalize_relative_path_is_not_url(self):
|
||||
"""Test that relative path strings are not treated as URLs."""
|
||||
result = _normalize_source("https://example.com/file.png")
|
||||
|
||||
assert isinstance(result, FileUrl)
|
||||
assert not isinstance(result, FilePath)
|
||||
|
||||
def test_normalize_file_url_passthrough(self):
|
||||
"""Test that FileUrl instances pass through unchanged."""
|
||||
original = FileUrl(url="https://example.com/image.png")
|
||||
result = _normalize_source(original)
|
||||
|
||||
assert result is original
|
||||
|
||||
|
||||
class TestResolverUrlHandling:
|
||||
"""Tests for FileResolver URL handling."""
|
||||
|
||||
def test_resolve_url_source_for_supported_provider(self):
|
||||
"""Test URL source resolves to UrlReference for supported providers."""
|
||||
resolver = FileResolver()
|
||||
file = ImageFile(source=FileUrl(url="https://example.com/image.png"))
|
||||
|
||||
resolved = resolver.resolve(file, "anthropic")
|
||||
|
||||
assert isinstance(resolved, UrlReference)
|
||||
assert resolved.url == "https://example.com/image.png"
|
||||
assert resolved.content_type == "image/png"
|
||||
|
||||
def test_resolve_url_source_openai(self):
|
||||
"""Test URL source resolves to UrlReference for OpenAI."""
|
||||
resolver = FileResolver()
|
||||
file = ImageFile(source=FileUrl(url="https://example.com/photo.jpg"))
|
||||
|
||||
resolved = resolver.resolve(file, "openai")
|
||||
|
||||
assert isinstance(resolved, UrlReference)
|
||||
assert resolved.url == "https://example.com/photo.jpg"
|
||||
|
||||
def test_resolve_url_source_gemini(self):
|
||||
"""Test URL source resolves to UrlReference for Gemini."""
|
||||
resolver = FileResolver()
|
||||
file = ImageFile(source=FileUrl(url="https://example.com/image.webp"))
|
||||
|
||||
resolved = resolver.resolve(file, "gemini")
|
||||
|
||||
assert isinstance(resolved, UrlReference)
|
||||
assert resolved.url == "https://example.com/image.webp"
|
||||
|
||||
def test_resolve_url_source_azure(self):
|
||||
"""Test URL source resolves to UrlReference for Azure."""
|
||||
resolver = FileResolver()
|
||||
file = ImageFile(source=FileUrl(url="https://example.com/image.gif"))
|
||||
|
||||
resolved = resolver.resolve(file, "azure")
|
||||
|
||||
assert isinstance(resolved, UrlReference)
|
||||
assert resolved.url == "https://example.com/image.gif"
|
||||
|
||||
def test_resolve_url_source_bedrock_fetches_content(self):
|
||||
"""Test URL source fetches content for Bedrock (unsupported URLs)."""
|
||||
resolver = FileResolver()
|
||||
file_url = FileUrl(url="https://example.com/image.png")
|
||||
file = ImageFile(source=file_url)
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 50
|
||||
mock_response.headers = {"content-type": "image/png"}
|
||||
|
||||
with patch("httpx.get", return_value=mock_response):
|
||||
resolved = resolver.resolve(file, "bedrock")
|
||||
|
||||
assert not isinstance(resolved, UrlReference)
|
||||
|
||||
def test_resolve_bytes_source_still_works(self):
|
||||
"""Test that bytes source still resolves normally."""
|
||||
resolver = FileResolver()
|
||||
minimal_png = (
|
||||
b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x08\x00\x00\x00\x08"
|
||||
b"\x01\x00\x00\x00\x00\xf9Y\xab\xcd\x00\x00\x00\nIDATx\x9cc`\x00\x00"
|
||||
b"\x00\x02\x00\x01\xe2!\xbc3\x00\x00\x00\x00IEND\xaeB`\x82"
|
||||
)
|
||||
file = ImageFile(source=FileBytes(data=minimal_png, filename="test.png"))
|
||||
|
||||
resolved = resolver.resolve(file, "anthropic")
|
||||
|
||||
assert isinstance(resolved, InlineBase64)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_aresolve_url_source(self):
|
||||
"""Test async URL resolution for supported provider."""
|
||||
resolver = FileResolver()
|
||||
file = ImageFile(source=FileUrl(url="https://example.com/image.png"))
|
||||
|
||||
resolved = await resolver.aresolve(file, "anthropic")
|
||||
|
||||
assert isinstance(resolved, UrlReference)
|
||||
assert resolved.url == "https://example.com/image.png"
|
||||
|
||||
|
||||
class TestImageFileWithUrl:
|
||||
"""Tests for creating ImageFile with URL source."""
|
||||
|
||||
def test_image_file_from_url_string(self):
|
||||
"""Test creating ImageFile from URL string."""
|
||||
file = ImageFile(source="https://example.com/image.png")
|
||||
|
||||
assert isinstance(file.source, FileUrl)
|
||||
assert file.source.url == "https://example.com/image.png"
|
||||
|
||||
def test_image_file_from_file_url(self):
|
||||
"""Test creating ImageFile from FileUrl instance."""
|
||||
url = FileUrl(url="https://example.com/photo.jpg")
|
||||
file = ImageFile(source=url)
|
||||
|
||||
assert file.source is url
|
||||
assert file.content_type == "image/jpeg"
|
||||
uv.lock (generated, 51 lines changed)
@@ -418,6 +418,44 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f8/aa/5082412d1ee302e9e7d80b6949bc4d2a8fa1149aaab610c5fc24709605d6/authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a", size = 243608, upload-time = "2025-10-02T13:36:07.637Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "av"
|
||||
version = "13.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e1/df/4f77aa98b998e1a19622b7a45da07884a053826e9038138d8023208e31e5/av-13.0.0.tar.gz", hash = "sha256:7fb1a5588cd8ce4d0564ddf82221f886541ea2d5152f15e63ab890430dcd3c31", size = 3884902, upload-time = "2024-09-04T08:30:48.971Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/07/ac/fdacc4e49b946ac9274c9363eeedceed824a71fa09df5c799cb4a137d80d/av-13.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a0f3563eb232c46811388d19eb8da3435ebd98e3b26c567da76acb878c772a4f", size = 24229400, upload-time = "2024-09-04T08:28:26.627Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/55/8d/bc8670f8a2084aaf4b738017e490a5c762023b88517fd579cbaff6ab18f3/av-13.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52713a673ccf743cb0692c7aa9b02429d7efee3fa19281dda1167685f8c21864", size = 19446165, upload-time = "2024-09-04T08:28:30.132Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/13/23/8280bc3a0df950f6fd8e57621f037d708c2065534311c7b6d88ec22e080a/av-13.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf667841f54cc82d5a09b9c31921dfafc22a6293aa17b9bd11f33c6c08e372d0", size = 31141668, upload-time = "2024-09-04T08:28:33.811Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/d3/16dfe2bc810be142f06ef93b9eadfddc51309bcdb0ca80c566aa889f0dde/av-13.0.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6a3a4a572d3c70fd3d8709b9ae5d8a7cd6ef813b46d571a95477a87d0f3e282", size = 30565447, upload-time = "2024-09-04T08:28:37.579Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/64/56/41f067fa8344027c03abbaeaf5826838c97404a47472c521a658f0656472/av-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ed3b70ca98c3f3ba130f23ec1393316eb714f35d41b4c1d9d1ef4951f862cc0", size = 32975707, upload-time = "2024-09-04T08:28:41.418Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/23/53/182589a2501f44cde451a18c8db372fec714bd3dfdd8906277fce3b10c18/av-13.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:43db19eb2704a5a8b6060c070bcf05e0ce1132edb3140f8a19271ac8eac63706", size = 25747720, upload-time = "2024-09-04T08:28:44.816Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/b3/37460a6b94ee2a284b8d585a19cc63b32a9318b4c1eee0e25b6f24df415a/av-13.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b3ec126e5c30a0d44c6ce6cd0be72b2af83529e5b19c41e6569a7c4d00261d04", size = 24224476, upload-time = "2024-09-04T08:28:48.276Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b0/a7/1cc83b2e0aeead07c3e9c59cbddf15f2b555578c6b725cc65bdbbec4c4d6/av-13.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0014c16d9123f50f366e32baed5c358429ed64c701ed5cea135fba333a5c9b13", size = 19438756, upload-time = "2024-09-04T08:28:51.511Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/b6/d6a85b89b14d60b360fb8eab65a9e7d8119d2807dcb025bc93baeff565a6/av-13.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3fa360cfc3e55ef1b22199741c74b584a57d2af75d5e5d9b54dd8cc999ae50bb", size = 32084112, upload-time = "2024-09-04T08:28:54.434Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cf/1d/3b5d4ce10de1b383a1f68dcf4f7679a34f5f6cf8aad1a0dfcfbf05c5fd7e/av-13.0.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3519e3effea342295de5f52dbcd263800db2ab1ab5e43ec6485ba1ed07c2e503", size = 31396374, upload-time = "2024-09-04T08:28:58.027Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7a/8e/c5bea32963acacbc0db7b1c6e6d5a181afee2951981b88533c771beabc53/av-13.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f76e0060f4aa4be0911db624039e31c973dce9f9f2d410dc817b2b88e199a74", size = 33913273, upload-time = "2024-09-04T08:29:01.251Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/30/1912588c0bce8baf6e490103e5c4ef1963f8bc0f0c00d82cde2b6b3793fc/av-13.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b21254571904b214fc586568ba1da62d38f00cc4f76c7eebbe14af9f8dd8a40f", size = 25750490, upload-time = "2024-09-04T08:29:04.985Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/df/90/f8120cebf0b86ff70691603a6fb1ef473d1fd9c99db058d0413e9a630538/av-13.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9eaf76c3a8a40dc3424ee9360b457143699d96f6e3faffb00867fd747b821ab9", size = 24238853, upload-time = "2024-09-04T08:29:08.611Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/62/7d/090813d188eebbe183acad6e0cfbd9cdeca0e7f7318a0a3bd6f44ac7d16f/av-13.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:623809f0684bf4379328ced38a25c295969997ba574ed17b99fa4ee3aa564d66", size = 19446605, upload-time = "2024-09-04T08:29:11.922Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/71/ec/bdc954939463127ca38ee023061be0ac89bdf2f2de6ab23f6a1d8112d070/av-13.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dc441b3899f1eb259af17acb2e5218762dcc99a4fbd6fe4d1f4155e253728b2", size = 32317356, upload-time = "2024-09-04T08:29:15.475Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/00/78/8d808f4868862b1b539ffd9af1775792f128a903f134c2dbfdb39a7799e3/av-13.0.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8b9654f9261ba123377b95fd5a9214e05ba43d7545cb41a5ae2dd5ea5fe6fbc9", size = 31666294, upload-time = "2024-09-04T08:29:18.805Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f7/fd/ee64d545a60c73795285cbe70f27e49b46c40e1ca3c8c35411b75ea310e6/av-13.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8157821b9da3814720d9b7ea45d961275dc73be8161eae7258afe2f737da5779", size = 34243366, upload-time = "2024-09-04T08:29:22.423Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c1/49/08552c5c2b838016cbba90547a0c082e9e8b700eaaf90c8eb0c11fec595e/av-13.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:736c4a9cb6ef6e5f3aa1cb12609a615f6c93bf16f36439010dc1ba160beed827", size = 25751891, upload-time = "2024-09-04T08:29:26.781Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/fd/08eeec9bd07129242989cb69cb45be5ff4c394af27b661d7c4428c460669/av-13.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4074615d89852dc8d7aa852b9162fe855bc2c6850e0cab74a875d4e72eefe343", size = 24197575, upload-time = "2024-09-04T08:29:30.194Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f3/0a/70d1848f325fd595f009f419e11134020aca1e0bf99c0041c0f5a767a01d/av-13.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a2df1f311610dcedbd0b08a5a419ae17076aa9cd808a6d4f0b5cb8c69d604e9f", size = 19406017, upload-time = "2024-09-04T08:29:32.951Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3f/10/2c1007829950cc1b7b17593d0d304adf008331729083af3d9b7c34e10b52/av-13.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1990d1398c25d90045c771450a64bf9aff33d8e6c89568fbbc5cc85ec6ceaa1", size = 31966860, upload-time = "2024-09-04T08:29:36.272Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1d/d7/f64af0713a669560ef33eea30c08add46916cab4ff0b26b473c14a9ff32c/av-13.0.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3303584abfcc2787a3dcf303fddcab0968329a309360c22348cc2c31e060f8d9", size = 31333914, upload-time = "2024-09-04T08:29:40.417Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c5/6c/647368ea1b60059a0a0dec3eae7c76b3aaec3e222c3cbcb54af0c2716d37/av-13.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05de5e2e6dde42d804dc41aa36102f64849fc72d0c7f9afc28406a7b240dba7a", size = 33908881, upload-time = "2024-09-04T08:29:44.161Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2a/bc/e2305f5e18eb47b5eac80e29de2fc1110898bb48131bb2a6d0d893080969/av-13.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:f9cea8906abf010f6d4894c7cad52e257667d0a498d4eec7e5beb4eff519d3ff", size = 25724252, upload-time = "2024-09-04T08:29:48.344Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/35/6e/1cba0d4506a3855f718615a826958b5b9f08d3b263216b8ba2fc578e54da/av-13.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d066d441efbd329947ff36604422b3a22ee65a98a78caa0869d2400cebc46381", size = 23837589, upload-time = "2024-09-04T08:30:13.345Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2a/23/8553944c6d782c4fe0883f969866f2ab1ad8546a4361c942aa80873583d5/av-13.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9836b348f648ef5a364075626e623cef39383fe439159f5875e588429c7c90ea", size = 19091589, upload-time = "2024-09-04T08:30:16.075Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0d/d4/5286b9bea8d6a87853f93116f4eef6f3d5ab64a9382371d851eb705d9299/av-13.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52aeefdaa9fd5182aee1d4ae53325756273e293173810c77960e012a9a4efda0", size = 22823448, upload-time = "2024-09-04T08:30:19.446Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/27/3f/37253b9746459f570a871170d70c7c43eed58a4e755a9e1f2c67c27d6dbe/av-13.0.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aae4116c3cc94f514501f856df4a351eb3386fbc5623d3dcb17476237ffae221", size = 22673845, upload-time = "2024-09-04T08:30:22.129Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/de/fa/e6995a721ce5ca9aa7e5a58dfeeb3df7c6f846f10e54ac32cbaf2948682a/av-13.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2425c8b0c8a022f10a20f3075bec05fc8efe4c5848e038d7d168cbbca089f08a", size = 24628585, upload-time = "2024-09-04T08:30:25.345Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/33/b9/1023b925f6505cba49fe22a08020dd0dfb9185c42d4f26fc6217b9e1c2e2/av-13.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:894dc43623b959d00ab9a62c0357929ba7a8dd8667b37afb046caee756f9e90a", size = 25536060, upload-time = "2024-09-04T08:30:28.418Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "azure-ai-inference"
|
||||
version = "1.0.0b9"
|
||||
@@ -1203,9 +1241,11 @@ embeddings = [
|
||||
file-processing = [
|
||||
{ name = "aiocache" },
|
||||
{ name = "aiofiles" },
|
||||
{ name = "av" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pypdf" },
|
||||
{ name = "python-magic" },
|
||||
{ name = "tinytag" },
|
||||
]
|
||||
google-genai = [
|
||||
{ name = "google-genai" },
|
||||
@@ -1245,6 +1285,7 @@ requires-dist = [
|
||||
{ name = "aiosqlite", specifier = "~=0.21.0" },
|
||||
{ name = "anthropic", marker = "extra == 'anthropic'", specifier = "~=0.71.0" },
|
||||
{ name = "appdirs", specifier = "~=1.4.4" },
|
||||
{ name = "av", marker = "extra == 'file-processing'", specifier = "~=13.0.0" },
|
||||
{ name = "azure-ai-inference", marker = "extra == 'azure-ai-inference'", specifier = "~=1.0.0b9" },
|
||||
{ name = "boto3", marker = "extra == 'aws'", specifier = "~=1.40.38" },
|
||||
{ name = "boto3", marker = "extra == 'bedrock'", specifier = "~=1.40.45" },
|
||||
@@ -1282,6 +1323,7 @@ requires-dist = [
|
||||
{ name = "qdrant-client", extras = ["fastembed"], marker = "extra == 'qdrant'", specifier = "~=1.14.3" },
|
||||
{ name = "regex", specifier = "~=2024.9.11" },
|
||||
{ name = "tiktoken", marker = "extra == 'embeddings'", specifier = "~=0.8.0" },
|
||||
{ name = "tinytag", marker = "extra == 'file-processing'", specifier = "~=1.10.0" },
|
||||
{ name = "tokenizers", specifier = "~=0.20.3" },
|
||||
{ name = "tomli", specifier = "~=2.0.2" },
|
||||
{ name = "tomli-w", specifier = "~=1.1.0" },
|
||||
@@ -7806,6 +7848,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d6/14/fc04d491527b774ec7479897f5861959209de1480e4c4cd32ed098ff8bea/timm-1.0.22-py3-none-any.whl", hash = "sha256:888981753e65cbaacfc07494370138b1700a27b1f0af587f4f9b47bc024161d0", size = 2530238, upload-time = "2025-11-05T04:06:06.823Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytag"
|
||||
version = "1.10.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/59/b5/ff5e5f9ca9677be7272260f67c87f7e8e885babc7ce94604e837dcfd8d76/tinytag-1.10.1.tar.gz", hash = "sha256:122a63b836f85094aacca43fc807aaee3290be3de17d134f5f4a08b509ae268f", size = 40906, upload-time = "2023-10-26T19:30:38.791Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/04/ef783cbc4aa3a5ed75969e300b3e3929daf3d1b52fe80e950c63e0d66d95/tinytag-1.10.1-py3-none-any.whl", hash = "sha256:e437654d04c966fbbbdbf807af61eb9759f1d80e4173a7d26202506b37cfdaf0", size = 37900, upload-time = "2023-10-26T19:30:36.724Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokenizers"
|
||||
version = "0.20.3"
|