From 9be88e05ee72c3a1941389f322c754a0e49e8b96 Mon Sep 17 00:00:00 2001 From: Greyson LaLonde Date: Thu, 22 Jan 2026 14:02:55 -0500 Subject: [PATCH] feat: add format hints to audio/video duration detection --- lib/crewai/pyproject.toml | 2 + lib/crewai/src/crewai/files/__init__.py | 2 + lib/crewai/src/crewai/files/content_types.py | 11 +- lib/crewai/src/crewai/files/file.py | 75 ++++- .../crewai/files/processing/constraints.py | 21 +- .../src/crewai/files/processing/validators.py | 117 ++++++- lib/crewai/src/crewai/files/resolver.py | 85 +++-- lib/crewai/src/crewai/files/utils.py | 15 +- .../tests/files/processing/test_validators.py | 288 +++++++++++++++- lib/crewai/tests/files/test_file_url.py | 312 ++++++++++++++++++ uv.lock | 51 +++ 11 files changed, 936 insertions(+), 43 deletions(-) create mode 100644 lib/crewai/tests/files/test_file_url.py diff --git a/lib/crewai/pyproject.toml b/lib/crewai/pyproject.toml index baa61ccd3..f92be5ac7 100644 --- a/lib/crewai/pyproject.toml +++ b/lib/crewai/pyproject.toml @@ -104,6 +104,8 @@ file-processing = [ "python-magic>=0.4.27", "aiocache~=0.12.3", "aiofiles~=24.1.0", + "tinytag~=1.10.0", + "av~=13.0.0", ] diff --git a/lib/crewai/src/crewai/files/__init__.py b/lib/crewai/src/crewai/files/__init__.py index d33c10894..1c7360227 100644 --- a/lib/crewai/src/crewai/files/__init__.py +++ b/lib/crewai/src/crewai/files/__init__.py @@ -32,6 +32,7 @@ from crewai.files.file import ( FileSource, FileSourceInput, FileStream, + FileUrl, RawFileInput, ) from crewai.files.processing import ( @@ -103,6 +104,7 @@ __all__ = [ "FileStream", "FileTooLargeError", "FileUploader", + "FileUrl", "FileValidationError", "ImageConstraints", "ImageExtension", diff --git a/lib/crewai/src/crewai/files/content_types.py b/lib/crewai/src/crewai/files/content_types.py index f110f58bb..7995a0557 100644 --- a/lib/crewai/src/crewai/files/content_types.py +++ b/lib/crewai/src/crewai/files/content_types.py @@ -16,6 +16,7 @@ from crewai.files.file import ( FilePath, 
class FileUrl(BaseModel):
    """File referenced by an HTTP(S) URL.

    For providers that support URL references, the URL is passed directly.
    For providers that don't, content is fetched on demand with ``read`` or
    ``aread`` and cached on the instance, so a successful download is not
    repeated.

    Attributes:
        url: URL where the file can be accessed; must start with
            ``http://`` or ``https://``.
        filename: Optional caller-supplied filename. This class does not
            derive one from the URL.
    """

    url: str = Field(description="URL where the file can be accessed.")
    filename: str | None = Field(default=None, description="Optional filename.")
    # Media type: guessed lazily from the URL path, then replaced by the
    # server's Content-Type header once the content has been fetched.
    _content_type: str | None = PrivateAttr(default=None)
    # Downloaded body, cached after the first successful fetch.
    _content: bytes | None = PrivateAttr(default=None)

    @model_validator(mode="after")
    def _validate_url(self) -> FileUrl:
        """Validate URL format.

        Raises:
            ValueError: If the URL does not start with ``http://`` or
                ``https://``.
        """
        if not self.url.startswith(("http://", "https://")):
            raise ValueError(f"Invalid URL scheme: {self.url}")
        return self

    @property
    def content_type(self) -> str:
        """Get the content type, guessing from URL extension if not set."""
        if self._content_type is None:
            self._content_type = self._guess_content_type()
        return self._content_type

    def _guess_content_type(self) -> str:
        """Guess content type from the extension of the URL path.

        Returns:
            The guessed MIME type, or ``application/octet-stream`` when the
            extension is unknown.
        """
        from urllib.parse import urlparse

        # Only the path is inspected so query strings and fragments cannot
        # confuse the extension lookup.
        path = urlparse(self.url).path
        guessed, _ = mimetypes.guess_type(path)
        return guessed or "application/octet-stream"

    def _store_response(self, response: Any) -> bytes:
        """Cache the body and media type of a successful HTTP response.

        Args:
            response: The ``httpx`` response obtained for ``self.url``.

        Returns:
            The response body.
        """
        body: bytes = response.content
        self._content = body
        # Keep only the media type: drop parameters such as "; charset=utf-8",
        # surrounding whitespace and case differences. An absent or empty
        # header must not overwrite the extension-based guess with "".
        header = response.headers.get("content-type", "")
        media_type = header.split(";", 1)[0].strip().lower()
        if media_type:
            self._content_type = media_type
        return body

    def read(self) -> bytes:
        """Fetch content from URL (for providers that don't support URL references).

        Returns:
            The downloaded bytes. The result is cached, so later calls do
            not issue another request.

        Raises:
            httpx.HTTPStatusError: If the server responds with an error status.
        """
        if self._content is None:
            import httpx

            response = httpx.get(self.url, follow_redirects=True)
            response.raise_for_status()
            return self._store_response(response)
        return self._content

    async def aread(self) -> bytes:
        """Async fetch content from URL.

        Returns:
            The downloaded bytes. The result is cached, so later calls do
            not issue another request.

        Raises:
            httpx.HTTPStatusError: If the server responds with an error status.
        """
        if self._content is None:
            import httpx

            async with httpx.AsyncClient() as client:
                response = await client.get(self.url, follow_redirects=True)
                response.raise_for_status()
                return self._store_response(response)
        return self._content
type.""" - if isinstance(value, (FilePath, FileBytes, FileStream, AsyncFileStream)): + if isinstance(value, (FilePath, FileBytes, FileStream, AsyncFileStream, FileUrl)): return value + if isinstance(value, str): + if value.startswith(("http://", "https://")): + return FileUrl(url=value) + return FilePath(path=Path(value)) if isinstance(value, Path): return FilePath(path=value) - if isinstance(value, str): - return FilePath(path=Path(value)) if isinstance(value, bytes): return FileBytes(data=value) if isinstance(value, AsyncReadable): diff --git a/lib/crewai/src/crewai/files/processing/constraints.py b/lib/crewai/src/crewai/files/processing/constraints.py index 02019e8c1..8d8fba783 100644 --- a/lib/crewai/src/crewai/files/processing/constraints.py +++ b/lib/crewai/src/crewai/files/processing/constraints.py @@ -148,6 +148,7 @@ class ProviderConstraints: general_max_size_bytes: Maximum size for any file type. supports_file_upload: Whether the provider supports file upload APIs. file_upload_threshold_bytes: Size threshold above which to use file upload. + supports_url_references: Whether the provider supports URL-based file references. 
""" name: ProviderName @@ -158,21 +159,24 @@ class ProviderConstraints: general_max_size_bytes: int | None = None supports_file_upload: bool = False file_upload_threshold_bytes: int | None = None + supports_url_references: bool = False ANTHROPIC_CONSTRAINTS = ProviderConstraints( name="anthropic", image=ImageConstraints( - max_size_bytes=5_242_880, + max_size_bytes=5_242_880, # 5 MB per image max_width=8000, max_height=8000, + max_images_per_request=100, ), pdf=PDFConstraints( - max_size_bytes=31_457_280, + max_size_bytes=33_554_432, # 32 MB request size limit max_pages=100, ), supports_file_upload=True, file_upload_threshold_bytes=5_242_880, + supports_url_references=True, ) OPENAI_CONSTRAINTS = ProviderConstraints( @@ -181,8 +185,13 @@ OPENAI_CONSTRAINTS = ProviderConstraints( max_size_bytes=20_971_520, max_images_per_request=10, ), + audio=AudioConstraints( + max_size_bytes=26_214_400, # 25 MB - whisper limit + max_duration_seconds=1500, # 25 minutes, arbitrary-ish, this is from the transcriptions limit + ), supports_file_upload=True, file_upload_threshold_bytes=5_242_880, + supports_url_references=True, ) GEMINI_CONSTRAINTS = ProviderConstraints( @@ -196,14 +205,17 @@ GEMINI_CONSTRAINTS = ProviderConstraints( ), audio=AudioConstraints( max_size_bytes=104_857_600, + max_duration_seconds=34200, # 9.5 hours supported_formats=GEMINI_AUDIO_FORMATS, ), video=VideoConstraints( max_size_bytes=2_147_483_648, + max_duration_seconds=3600, # 1 hour at default resolution supported_formats=GEMINI_VIDEO_FORMATS, ), supports_file_upload=True, file_upload_threshold_bytes=20_971_520, + supports_url_references=True, ) BEDROCK_CONSTRAINTS = ProviderConstraints( @@ -225,6 +237,11 @@ AZURE_CONSTRAINTS = ProviderConstraints( max_size_bytes=20_971_520, max_images_per_request=10, ), + audio=AudioConstraints( + max_size_bytes=26_214_400, # 25 MB - same as openai + max_duration_seconds=1500, # 25 minutes - same as openai + ), + supports_url_references=True, ) diff --git 
a/lib/crewai/src/crewai/files/processing/validators.py b/lib/crewai/src/crewai/files/processing/validators.py index b76aeced6..99a3d31ec 100644 --- a/lib/crewai/src/crewai/files/processing/validators.py +++ b/lib/crewai/src/crewai/files/processing/validators.py @@ -43,7 +43,7 @@ def _get_image_dimensions(content: bytes) -> tuple[int, int] | None: with Image.open(io.BytesIO(content)) as img: width, height = img.size - return (int(width), int(height)) + return int(width), int(height) except ImportError: logger.warning( "Pillow not installed - cannot validate image dimensions. " @@ -74,6 +74,81 @@ def _get_pdf_page_count(content: bytes) -> int | None: return None +def _get_audio_duration(content: bytes, filename: str | None = None) -> float | None: + """Get audio duration in seconds using tinytag if available. + + Args: + content: Raw audio bytes. + filename: Optional filename for format detection hint. + + Returns: + Duration in seconds or None if tinytag unavailable. + """ + try: + from tinytag import TinyTag # type: ignore[import-untyped] + except ImportError: + logger.warning( + "tinytag not installed - cannot validate audio duration. " + "Install with: pip install tinytag" + ) + return None + + try: + tag = TinyTag.get(file_obj=io.BytesIO(content), filename=filename) + duration: float | None = tag.duration + return duration + except Exception as e: + logger.debug(f"Could not determine audio duration: {e}") + return None + + +_VIDEO_FORMAT_MAP: dict[str, str] = { + "video/mp4": "mp4", + "video/webm": "webm", + "video/x-matroska": "matroska", + "video/quicktime": "mov", + "video/x-msvideo": "avi", + "video/x-flv": "flv", +} + + +def _get_video_duration( + content: bytes, content_type: str | None = None +) -> float | None: + """Get video duration in seconds using av if available. + + Args: + content: Raw video bytes. + content_type: Optional MIME type for format detection hint. + + Returns: + Duration in seconds or None if av unavailable. 
+ """ + try: + import av + except ImportError: + logger.warning( + "av (PyAV) not installed - cannot validate video duration. " + "Install with: pip install av" + ) + return None + + format_hint = _VIDEO_FORMAT_MAP.get(content_type) if content_type else None + + try: + container = av.open(io.BytesIO(content), format=format_hint) # type: ignore[attr-defined] + try: + duration = getattr(container, "duration", None) + if duration is None: + return None + return float(duration) / 1_000_000 + finally: + container.close() + except Exception as e: + logger.debug(f"Could not determine video duration: {e}") + return None + + def _format_size(size_bytes: int) -> str: """Format byte size to human-readable string.""" if size_bytes >= 1024 * 1024 * 1024: @@ -273,14 +348,17 @@ def validate_audio( Raises: FileTooLargeError: If the file exceeds size limits. + FileValidationError: If the file exceeds duration limits. UnsupportedFileTypeError: If the format is not supported. """ errors: list[str] = [] - file_size = len(file.read()) + content = file.read() + file_size = len(content) + filename = file.filename _validate_size( "Audio", - file.filename, + filename, file_size, constraints.max_size_bytes, errors, @@ -288,13 +366,24 @@ def validate_audio( ) _validate_format( "Audio", - file.filename, + filename, file.content_type, constraints.supported_formats, errors, raise_on_error, ) + if constraints.max_duration_seconds is not None: + duration = _get_audio_duration(content, filename) + if duration is not None and duration > constraints.max_duration_seconds: + msg = ( + f"Audio '{filename}' duration ({duration:.1f}s) exceeds " + f"maximum ({constraints.max_duration_seconds}s)" + ) + errors.append(msg) + if raise_on_error: + raise FileValidationError(msg, file_name=filename) + return errors @@ -316,14 +405,17 @@ def validate_video( Raises: FileTooLargeError: If the file exceeds size limits. + FileValidationError: If the file exceeds duration limits. 
UnsupportedFileTypeError: If the format is not supported. """ errors: list[str] = [] - file_size = len(file.read()) + content = file.read() + file_size = len(content) + filename = file.filename _validate_size( "Video", - file.filename, + filename, file_size, constraints.max_size_bytes, errors, @@ -331,13 +423,24 @@ def validate_video( ) _validate_format( "Video", - file.filename, + filename, file.content_type, constraints.supported_formats, errors, raise_on_error, ) + if constraints.max_duration_seconds is not None: + duration = _get_video_duration(content, file.content_type) + if duration is not None and duration > constraints.max_duration_seconds: + msg = ( + f"Video '{filename}' duration ({duration:.1f}s) exceeds " + f"maximum ({constraints.max_duration_seconds}s)" + ) + errors.append(msg) + if raise_on_error: + raise FileValidationError(msg, file_name=filename) + return errors diff --git a/lib/crewai/src/crewai/files/resolver.py b/lib/crewai/src/crewai/files/resolver.py index 138740334..4aa2e4b65 100644 --- a/lib/crewai/src/crewai/files/resolver.py +++ b/lib/crewai/src/crewai/files/resolver.py @@ -8,6 +8,7 @@ import logging from crewai.files.constants import UPLOAD_MAX_RETRIES, UPLOAD_RETRY_DELAY_BASE from crewai.files.content_types import FileInput +from crewai.files.file import FileUrl from crewai.files.metrics import measure_operation from crewai.files.processing.constraints import ( AudioConstraints, @@ -22,10 +23,12 @@ from crewai.files.resolved import ( InlineBase64, InlineBytes, ResolvedFile, + UrlReference, ) from crewai.files.upload_cache import CachedUpload, UploadCache from crewai.files.uploaders import UploadResult, get_uploader from crewai.files.uploaders.base import FileUploader +from crewai.files.uploaders.factory import ProviderType logger = logging.getLogger(__name__) @@ -102,7 +105,49 @@ class FileResolver: content_type=file.content_type, ) - def resolve(self, file: FileInput, provider: str) -> ResolvedFile: + @staticmethod + def 
_is_url_source(file: FileInput) -> bool: + """Check if file source is a URL. + + Args: + file: The file to check. + + Returns: + True if the file source is a FileUrl, False otherwise. + """ + return isinstance(file._file_source, FileUrl) + + @staticmethod + def _supports_url(constraints: ProviderConstraints | None) -> bool: + """Check if provider supports URL references. + + Args: + constraints: Provider constraints. + + Returns: + True if the provider supports URL references, False otherwise. + """ + return constraints is not None and constraints.supports_url_references + + @staticmethod + def _resolve_as_url(file: FileInput) -> UrlReference: + """Resolve a URL source as UrlReference. + + Args: + file: The file with URL source. + + Returns: + UrlReference with the URL and content type. + """ + source = file._file_source + if not isinstance(source, FileUrl): + raise TypeError(f"Expected FileUrl source, got {type(source).__name__}") + return UrlReference( + content_type=file.content_type, + url=source.url, + ) + + def resolve(self, file: FileInput, provider: ProviderType) -> ResolvedFile: """Resolve a file to its delivery format for a provider. Args: @@ -112,25 +157,26 @@ class FileResolver: Returns: ResolvedFile representing the appropriate delivery format. 
""" - provider_lower = provider.lower() constraints = get_constraints_for_provider(provider) + + if self._is_url_source(file) and self._supports_url(constraints): + return self._resolve_as_url(file) + context = self._build_file_context(file) - should_upload = self._should_upload( - file, provider_lower, constraints, context.size - ) + should_upload = self._should_upload(file, provider, constraints, context.size) if should_upload: - resolved = self._resolve_via_upload(file, provider_lower, context) + resolved = self._resolve_via_upload(file, provider, context) if resolved is not None: return resolved - return self._resolve_inline(file, provider_lower, context) + return self._resolve_inline(file, provider, context) def resolve_files( self, files: dict[str, FileInput], - provider: str, + provider: ProviderType, ) -> dict[str, ResolvedFile]: """Resolve multiple files for a provider. @@ -220,7 +266,7 @@ class FileResolver: def _resolve_via_upload( self, file: FileInput, - provider: str, + provider: ProviderType, context: FileContext, ) -> ResolvedFile | None: """Resolve a file by uploading it. @@ -367,7 +413,7 @@ class FileResolver: data=encoded, ) - async def aresolve(self, file: FileInput, provider: str) -> ResolvedFile: + async def aresolve(self, file: FileInput, provider: ProviderType) -> ResolvedFile: """Async resolve a file to its delivery format for a provider. Args: @@ -377,25 +423,26 @@ class FileResolver: Returns: ResolvedFile representing the appropriate delivery format. 
""" - provider_lower = provider.lower() constraints = get_constraints_for_provider(provider) + + if self._is_url_source(file) and self._supports_url(constraints): + return self._resolve_as_url(file) + context = self._build_file_context(file) - should_upload = self._should_upload( - file, provider_lower, constraints, context.size - ) + should_upload = self._should_upload(file, provider, constraints, context.size) if should_upload: - resolved = await self._aresolve_via_upload(file, provider_lower, context) + resolved = await self._aresolve_via_upload(file, provider, context) if resolved is not None: return resolved - return self._resolve_inline(file, provider_lower, context) + return self._resolve_inline(file, provider, context) async def aresolve_files( self, files: dict[str, FileInput], - provider: str, + provider: ProviderType, max_concurrency: int = 10, ) -> dict[str, ResolvedFile]: """Async resolve multiple files in parallel. @@ -434,7 +481,7 @@ class FileResolver: async def _aresolve_via_upload( self, file: FileInput, - provider: str, + provider: ProviderType, context: FileContext, ) -> ResolvedFile | None: """Async resolve a file by uploading it. @@ -552,7 +599,7 @@ class FileResolver: ) return None - def _get_uploader(self, provider: str) -> FileUploader | None: + def _get_uploader(self, provider: ProviderType) -> FileUploader | None: """Get or create an uploader for a provider. 
Args: diff --git a/lib/crewai/src/crewai/files/utils.py b/lib/crewai/src/crewai/files/utils.py index e8069bae4..2f424bf64 100644 --- a/lib/crewai/src/crewai/files/utils.py +++ b/lib/crewai/src/crewai/files/utils.py @@ -15,9 +15,9 @@ if TYPE_CHECKING: def is_file_source(v: object) -> TypeIs[FileSource]: """Type guard to narrow input to FileSource.""" - from crewai.files.file import FileBytes, FilePath, FileStream + from crewai.files.file import FileBytes, FilePath, FileStream, FileUrl - return isinstance(v, (FilePath, FileBytes, FileStream)) + return isinstance(v, (FilePath, FileBytes, FileStream, FileUrl)) def wrap_file_source(source: FileSource) -> FileInput: @@ -62,7 +62,7 @@ def normalize_input_files( Dictionary mapping names to FileInput wrappers. """ from crewai.files.content_types import BaseFile - from crewai.files.file import FileBytes, FilePath, FileStream + from crewai.files.file import FileBytes, FilePath, FileStream, FileUrl result: dict[str, FileInput] = {} @@ -74,13 +74,16 @@ def normalize_input_files( result[name] = item continue - file_source: FilePath | FileBytes | FileStream - if isinstance(item, (FilePath, FileBytes, FileStream)): + file_source: FilePath | FileBytes | FileStream | FileUrl + if isinstance(item, (FilePath, FileBytes, FileStream, FileUrl)): file_source = item elif isinstance(item, Path): file_source = FilePath(path=item) elif isinstance(item, str): - file_source = FilePath(path=Path(item)) + if item.startswith(("http://", "https://")): + file_source = FileUrl(url=item) + else: + file_source = FilePath(path=Path(item)) elif isinstance(item, (bytes, memoryview)): file_source = FileBytes(data=bytes(item)) else: diff --git a/lib/crewai/tests/files/processing/test_validators.py b/lib/crewai/tests/files/processing/test_validators.py index 4b9d32294..64f803192 100644 --- a/lib/crewai/tests/files/processing/test_validators.py +++ b/lib/crewai/tests/files/processing/test_validators.py @@ -1,13 +1,17 @@ """Tests for file validators.""" +from 
unittest.mock import patch + import pytest -from crewai.files import FileBytes, ImageFile, PDFFile, TextFile +from crewai.files import AudioFile, FileBytes, ImageFile, PDFFile, TextFile, VideoFile from crewai.files.processing.constraints import ( ANTHROPIC_CONSTRAINTS, + AudioConstraints, ImageConstraints, PDFConstraints, ProviderConstraints, + VideoConstraints, ) from crewai.files.processing.exceptions import ( FileTooLargeError, @@ -15,10 +19,14 @@ from crewai.files.processing.exceptions import ( UnsupportedFileTypeError, ) from crewai.files.processing.validators import ( + _get_audio_duration, + _get_video_duration, + validate_audio, validate_file, validate_image, validate_pdf, validate_text, + validate_video, ) @@ -206,3 +214,281 @@ class TestValidateFile: validate_file(file, constraints) assert "does not support PDFs" in str(exc_info.value) + + +# Minimal audio bytes for testing (not a valid audio file, used for mocked tests) +MINIMAL_AUDIO = b"\x00" * 100 + +# Minimal video bytes for testing (not a valid video file, used for mocked tests) +MINIMAL_VIDEO = b"\x00" * 100 + +# Fallback content type when python-magic cannot detect +FALLBACK_CONTENT_TYPE = "application/octet-stream" + + +class TestValidateAudio: + """Tests for validate_audio function and audio duration validation.""" + + def test_validate_valid_audio(self): + """Test validating a valid audio file within constraints.""" + constraints = AudioConstraints( + max_size_bytes=10 * 1024 * 1024, + supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE), + ) + file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3")) + + errors = validate_audio(file, constraints, raise_on_error=False) + + assert len(errors) == 0 + + def test_validate_audio_too_large(self): + """Test validating an audio file that exceeds size limit.""" + constraints = AudioConstraints( + max_size_bytes=10, # Very small limit + supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE), + ) + file = 
AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3")) + + with pytest.raises(FileTooLargeError) as exc_info: + validate_audio(file, constraints) + + assert "exceeds" in str(exc_info.value) + assert exc_info.value.file_name == "test.mp3" + + def test_validate_audio_unsupported_format(self): + """Test validating an audio file with unsupported format.""" + constraints = AudioConstraints( + max_size_bytes=10 * 1024 * 1024, + supported_formats=("audio/wav",), # Only WAV + ) + file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3")) + + with pytest.raises(UnsupportedFileTypeError) as exc_info: + validate_audio(file, constraints) + + assert "not supported" in str(exc_info.value) + + @patch("crewai.files.processing.validators._get_audio_duration") + def test_validate_audio_duration_passes(self, mock_get_duration): + """Test validating audio when duration is under limit.""" + mock_get_duration.return_value = 30.0 + constraints = AudioConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=60, + supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE), + ) + file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3")) + + errors = validate_audio(file, constraints, raise_on_error=False) + + assert len(errors) == 0 + mock_get_duration.assert_called_once() + + @patch("crewai.files.processing.validators._get_audio_duration") + def test_validate_audio_duration_fails(self, mock_get_duration): + """Test validating audio when duration exceeds limit.""" + mock_get_duration.return_value = 120.5 + constraints = AudioConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=60, + supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE), + ) + file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3")) + + with pytest.raises(FileValidationError) as exc_info: + validate_audio(file, constraints) + + assert "duration" in str(exc_info.value).lower() + assert "120.5s" in 
str(exc_info.value) + assert "60s" in str(exc_info.value) + + @patch("crewai.files.processing.validators._get_audio_duration") + def test_validate_audio_duration_no_raise(self, mock_get_duration): + """Test audio duration validation with raise_on_error=False.""" + mock_get_duration.return_value = 120.5 + constraints = AudioConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=60, + supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE), + ) + file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3")) + + errors = validate_audio(file, constraints, raise_on_error=False) + + assert len(errors) == 1 + assert "duration" in errors[0].lower() + + @patch("crewai.files.processing.validators._get_audio_duration") + def test_validate_audio_duration_none_skips(self, mock_get_duration): + """Test that duration validation is skipped when max_duration_seconds is None.""" + constraints = AudioConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=None, + supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE), + ) + file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3")) + + errors = validate_audio(file, constraints, raise_on_error=False) + + assert len(errors) == 0 + mock_get_duration.assert_not_called() + + @patch("crewai.files.processing.validators._get_audio_duration") + def test_validate_audio_duration_detection_returns_none(self, mock_get_duration): + """Test that validation passes when duration detection returns None.""" + mock_get_duration.return_value = None + constraints = AudioConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=60, + supported_formats=("audio/mp3", "audio/mpeg", FALLBACK_CONTENT_TYPE), + ) + file = AudioFile(source=FileBytes(data=MINIMAL_AUDIO, filename="test.mp3")) + + errors = validate_audio(file, constraints, raise_on_error=False) + + assert len(errors) == 0 + + +class TestValidateVideo: + """Tests for validate_video function and video 
duration validation.""" + + def test_validate_valid_video(self): + """Test validating a valid video file within constraints.""" + constraints = VideoConstraints( + max_size_bytes=10 * 1024 * 1024, + supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE), + ) + file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4")) + + errors = validate_video(file, constraints, raise_on_error=False) + + assert len(errors) == 0 + + def test_validate_video_too_large(self): + """Test validating a video file that exceeds size limit.""" + constraints = VideoConstraints( + max_size_bytes=10, # Very small limit + supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE), + ) + file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4")) + + with pytest.raises(FileTooLargeError) as exc_info: + validate_video(file, constraints) + + assert "exceeds" in str(exc_info.value) + assert exc_info.value.file_name == "test.mp4" + + def test_validate_video_unsupported_format(self): + """Test validating a video file with unsupported format.""" + constraints = VideoConstraints( + max_size_bytes=10 * 1024 * 1024, + supported_formats=("video/webm",), # Only WebM + ) + file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4")) + + with pytest.raises(UnsupportedFileTypeError) as exc_info: + validate_video(file, constraints) + + assert "not supported" in str(exc_info.value) + + @patch("crewai.files.processing.validators._get_video_duration") + def test_validate_video_duration_passes(self, mock_get_duration): + """Test validating video when duration is under limit.""" + mock_get_duration.return_value = 30.0 + constraints = VideoConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=60, + supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE), + ) + file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4")) + + errors = validate_video(file, constraints, raise_on_error=False) + + assert len(errors) == 0 + 
mock_get_duration.assert_called_once() + + @patch("crewai.files.processing.validators._get_video_duration") + def test_validate_video_duration_fails(self, mock_get_duration): + """Test validating video when duration exceeds limit.""" + mock_get_duration.return_value = 180.0 + constraints = VideoConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=60, + supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE), + ) + file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4")) + + with pytest.raises(FileValidationError) as exc_info: + validate_video(file, constraints) + + assert "duration" in str(exc_info.value).lower() + assert "180.0s" in str(exc_info.value) + assert "60s" in str(exc_info.value) + + @patch("crewai.files.processing.validators._get_video_duration") + def test_validate_video_duration_no_raise(self, mock_get_duration): + """Test video duration validation with raise_on_error=False.""" + mock_get_duration.return_value = 180.0 + constraints = VideoConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=60, + supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE), + ) + file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4")) + + errors = validate_video(file, constraints, raise_on_error=False) + + assert len(errors) == 1 + assert "duration" in errors[0].lower() + + @patch("crewai.files.processing.validators._get_video_duration") + def test_validate_video_duration_none_skips(self, mock_get_duration): + """Test that duration validation is skipped when max_duration_seconds is None.""" + constraints = VideoConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=None, + supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE), + ) + file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4")) + + errors = validate_video(file, constraints, raise_on_error=False) + + assert len(errors) == 0 + mock_get_duration.assert_not_called() + + 
@patch("crewai.files.processing.validators._get_video_duration") + def test_validate_video_duration_detection_returns_none(self, mock_get_duration): + """Test that validation passes when duration detection returns None.""" + mock_get_duration.return_value = None + constraints = VideoConstraints( + max_size_bytes=10 * 1024 * 1024, + max_duration_seconds=60, + supported_formats=("video/mp4", FALLBACK_CONTENT_TYPE), + ) + file = VideoFile(source=FileBytes(data=MINIMAL_VIDEO, filename="test.mp4")) + + errors = validate_video(file, constraints, raise_on_error=False) + + assert len(errors) == 0 + + +class TestGetAudioDuration: + """Tests for _get_audio_duration helper function.""" + + def test_get_audio_duration_corrupt_file(self): + """Test handling of corrupt audio data.""" + corrupt_data = b"not valid audio data at all" + result = _get_audio_duration(corrupt_data) + + assert result is None + + +class TestGetVideoDuration: + """Tests for _get_video_duration helper function.""" + + def test_get_video_duration_corrupt_file(self): + """Test handling of corrupt video data.""" + corrupt_data = b"not valid video data at all" + result = _get_video_duration(corrupt_data) + + assert result is None diff --git a/lib/crewai/tests/files/test_file_url.py b/lib/crewai/tests/files/test_file_url.py new file mode 100644 index 000000000..1f862bbee --- /dev/null +++ b/lib/crewai/tests/files/test_file_url.py @@ -0,0 +1,312 @@ +"""Tests for FileUrl source type and URL resolution.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from crewai.files import FileBytes, FileUrl, ImageFile +from crewai.files.file import _normalize_source, FilePath +from crewai.files.resolved import InlineBase64, UrlReference +from crewai.files.resolver import FileResolver + + +class TestFileUrl: + """Tests for FileUrl source type.""" + + def test_create_file_url(self): + """Test creating FileUrl with valid URL.""" + url = FileUrl(url="https://example.com/image.png") + + assert url.url 
== "https://example.com/image.png" + assert url.filename is None + + def test_create_file_url_with_filename(self): + """Test creating FileUrl with custom filename.""" + url = FileUrl(url="https://example.com/image.png", filename="custom.png") + + assert url.url == "https://example.com/image.png" + assert url.filename == "custom.png" + + def test_invalid_url_scheme_raises(self): + """Test that non-http(s) URLs raise ValueError.""" + with pytest.raises(ValueError, match="Invalid URL scheme"): + FileUrl(url="ftp://example.com/file.txt") + + def test_invalid_url_scheme_file_raises(self): + """Test that file:// URLs raise ValueError.""" + with pytest.raises(ValueError, match="Invalid URL scheme"): + FileUrl(url="file:///path/to/file.txt") + + def test_http_url_valid(self): + """Test that HTTP URLs are valid.""" + url = FileUrl(url="http://example.com/image.jpg") + + assert url.url == "http://example.com/image.jpg" + + def test_https_url_valid(self): + """Test that HTTPS URLs are valid.""" + url = FileUrl(url="https://example.com/image.jpg") + + assert url.url == "https://example.com/image.jpg" + + def test_content_type_guessing_png(self): + """Test content type guessing for PNG files.""" + url = FileUrl(url="https://example.com/image.png") + + assert url.content_type == "image/png" + + def test_content_type_guessing_jpeg(self): + """Test content type guessing for JPEG files.""" + url = FileUrl(url="https://example.com/photo.jpg") + + assert url.content_type == "image/jpeg" + + def test_content_type_guessing_pdf(self): + """Test content type guessing for PDF files.""" + url = FileUrl(url="https://example.com/document.pdf") + + assert url.content_type == "application/pdf" + + def test_content_type_guessing_with_query_params(self): + """Test content type guessing with URL query parameters.""" + url = FileUrl(url="https://example.com/image.png?v=123&token=abc") + + assert url.content_type == "image/png" + + def test_content_type_fallback_unknown(self): + """Test content 
type falls back to octet-stream for unknown extensions.""" + url = FileUrl(url="https://example.com/file.unknownext123") + + assert url.content_type == "application/octet-stream" + + def test_content_type_no_extension(self): + """Test content type for URL without extension.""" + url = FileUrl(url="https://example.com/file") + + assert url.content_type == "application/octet-stream" + + def test_read_fetches_content(self): + """Test that read() fetches content from URL.""" + url = FileUrl(url="https://example.com/image.png") + mock_response = MagicMock() + mock_response.content = b"fake image content" + mock_response.headers = {"content-type": "image/png"} + + with patch("httpx.get", return_value=mock_response) as mock_get: + content = url.read() + + mock_get.assert_called_once_with( + "https://example.com/image.png", follow_redirects=True + ) + assert content == b"fake image content" + + def test_read_caches_content(self): + """Test that read() caches content.""" + url = FileUrl(url="https://example.com/image.png") + mock_response = MagicMock() + mock_response.content = b"fake content" + mock_response.headers = {} + + with patch("httpx.get", return_value=mock_response) as mock_get: + content1 = url.read() + content2 = url.read() + + mock_get.assert_called_once() + assert content1 == content2 + + def test_read_updates_content_type_from_response(self): + """Test that read() updates content type from response headers.""" + url = FileUrl(url="https://example.com/file") + mock_response = MagicMock() + mock_response.content = b"fake content" + mock_response.headers = {"content-type": "image/webp; charset=utf-8"} + + with patch("httpx.get", return_value=mock_response): + url.read() + + assert url.content_type == "image/webp" + + @pytest.mark.asyncio + async def test_aread_fetches_content(self): + """Test that aread() fetches content from URL asynchronously.""" + url = FileUrl(url="https://example.com/image.png") + mock_response = MagicMock() + mock_response.content = 
b"async fake content" + mock_response.headers = {"content-type": "image/png"} + mock_response.raise_for_status = MagicMock() + + mock_client = MagicMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + + with patch("httpx.AsyncClient", return_value=mock_client): + content = await url.aread() + + assert content == b"async fake content" + + @pytest.mark.asyncio + async def test_aread_caches_content(self): + """Test that aread() caches content.""" + url = FileUrl(url="https://example.com/image.png") + mock_response = MagicMock() + mock_response.content = b"cached content" + mock_response.headers = {} + mock_response.raise_for_status = MagicMock() + + mock_client = MagicMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + + with patch("httpx.AsyncClient", return_value=mock_client): + content1 = await url.aread() + content2 = await url.aread() + + mock_client.get.assert_called_once() + assert content1 == content2 + + +class TestNormalizeSource: + """Tests for _normalize_source with URL detection.""" + + def test_normalize_url_string(self): + """Test that URL strings are converted to FileUrl.""" + result = _normalize_source("https://example.com/image.png") + + assert isinstance(result, FileUrl) + assert result.url == "https://example.com/image.png" + + def test_normalize_http_url_string(self): + """Test that HTTP URL strings are converted to FileUrl.""" + result = _normalize_source("http://example.com/file.pdf") + + assert isinstance(result, FileUrl) + assert result.url == "http://example.com/file.pdf" + + def test_normalize_file_path_string(self, tmp_path): + """Test that file path strings are converted to FilePath.""" + test_file = tmp_path / "test.png" + test_file.write_bytes(b"test content") + + result = 
_normalize_source(str(test_file)) + + assert isinstance(result, FilePath) + + def test_normalize_url_is_not_file_path(self): + """Test that URL strings resolve to FileUrl and are never treated as FilePath.""" + result = _normalize_source("https://example.com/file.png") + + assert isinstance(result, FileUrl) + assert not isinstance(result, FilePath) + + def test_normalize_file_url_passthrough(self): + """Test that FileUrl instances pass through unchanged.""" + original = FileUrl(url="https://example.com/image.png") + result = _normalize_source(original) + + assert result is original + + +class TestResolverUrlHandling: + """Tests for FileResolver URL handling.""" + + def test_resolve_url_source_for_supported_provider(self): + """Test URL source resolves to UrlReference for supported providers.""" + resolver = FileResolver() + file = ImageFile(source=FileUrl(url="https://example.com/image.png")) + + resolved = resolver.resolve(file, "anthropic") + + assert isinstance(resolved, UrlReference) + assert resolved.url == "https://example.com/image.png" + assert resolved.content_type == "image/png" + + def test_resolve_url_source_openai(self): + """Test URL source resolves to UrlReference for OpenAI.""" + resolver = FileResolver() + file = ImageFile(source=FileUrl(url="https://example.com/photo.jpg")) + + resolved = resolver.resolve(file, "openai") + + assert isinstance(resolved, UrlReference) + assert resolved.url == "https://example.com/photo.jpg" + + def test_resolve_url_source_gemini(self): + """Test URL source resolves to UrlReference for Gemini.""" + resolver = FileResolver() + file = ImageFile(source=FileUrl(url="https://example.com/image.webp")) + + resolved = resolver.resolve(file, "gemini") + + assert isinstance(resolved, UrlReference) + assert resolved.url == "https://example.com/image.webp" + + def test_resolve_url_source_azure(self): + """Test URL source resolves to UrlReference for Azure.""" + resolver = FileResolver() + file =
ImageFile(source=FileUrl(url="https://example.com/image.gif")) + + resolved = resolver.resolve(file, "azure") + + assert isinstance(resolved, UrlReference) + assert resolved.url == "https://example.com/image.gif" + + def test_resolve_url_source_bedrock_fetches_content(self): + """Test URL source fetches content for Bedrock (unsupported URLs).""" + resolver = FileResolver() + file_url = FileUrl(url="https://example.com/image.png") + file = ImageFile(source=file_url) + + mock_response = MagicMock() + mock_response.content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 50 + mock_response.headers = {"content-type": "image/png"} + + with patch("httpx.get", return_value=mock_response): + resolved = resolver.resolve(file, "bedrock") + + assert not isinstance(resolved, UrlReference) + + def test_resolve_bytes_source_still_works(self): + """Test that bytes source still resolves normally.""" + resolver = FileResolver() + minimal_png = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x08\x00\x00\x00\x08" + b"\x01\x00\x00\x00\x00\xf9Y\xab\xcd\x00\x00\x00\nIDATx\x9cc`\x00\x00" + b"\x00\x02\x00\x01\xe2!\xbc3\x00\x00\x00\x00IEND\xaeB`\x82" + ) + file = ImageFile(source=FileBytes(data=minimal_png, filename="test.png")) + + resolved = resolver.resolve(file, "anthropic") + + assert isinstance(resolved, InlineBase64) + + @pytest.mark.asyncio + async def test_aresolve_url_source(self): + """Test async URL resolution for supported provider.""" + resolver = FileResolver() + file = ImageFile(source=FileUrl(url="https://example.com/image.png")) + + resolved = await resolver.aresolve(file, "anthropic") + + assert isinstance(resolved, UrlReference) + assert resolved.url == "https://example.com/image.png" + + +class TestImageFileWithUrl: + """Tests for creating ImageFile with URL source.""" + + def test_image_file_from_url_string(self): + """Test creating ImageFile from URL string.""" + file = ImageFile(source="https://example.com/image.png") + + assert isinstance(file.source, FileUrl) + assert 
file.source.url == "https://example.com/image.png" + + def test_image_file_from_file_url(self): + """Test creating ImageFile from FileUrl instance.""" + url = FileUrl(url="https://example.com/photo.jpg") + file = ImageFile(source=url) + + assert file.source is url + assert file.content_type == "image/jpeg" diff --git a/uv.lock b/uv.lock index 7bf50ac53..d94c4bda5 100644 --- a/uv.lock +++ b/uv.lock @@ -418,6 +418,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/aa/5082412d1ee302e9e7d80b6949bc4d2a8fa1149aaab610c5fc24709605d6/authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a", size = 243608, upload-time = "2025-10-02T13:36:07.637Z" }, ] +[[package]] +name = "av" +version = "13.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e1/df/4f77aa98b998e1a19622b7a45da07884a053826e9038138d8023208e31e5/av-13.0.0.tar.gz", hash = "sha256:7fb1a5588cd8ce4d0564ddf82221f886541ea2d5152f15e63ab890430dcd3c31", size = 3884902, upload-time = "2024-09-04T08:30:48.971Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/ac/fdacc4e49b946ac9274c9363eeedceed824a71fa09df5c799cb4a137d80d/av-13.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a0f3563eb232c46811388d19eb8da3435ebd98e3b26c567da76acb878c772a4f", size = 24229400, upload-time = "2024-09-04T08:28:26.627Z" }, + { url = "https://files.pythonhosted.org/packages/55/8d/bc8670f8a2084aaf4b738017e490a5c762023b88517fd579cbaff6ab18f3/av-13.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52713a673ccf743cb0692c7aa9b02429d7efee3fa19281dda1167685f8c21864", size = 19446165, upload-time = "2024-09-04T08:28:30.132Z" }, + { url = "https://files.pythonhosted.org/packages/13/23/8280bc3a0df950f6fd8e57621f037d708c2065534311c7b6d88ec22e080a/av-13.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:bf667841f54cc82d5a09b9c31921dfafc22a6293aa17b9bd11f33c6c08e372d0", size = 31141668, upload-time = "2024-09-04T08:28:33.811Z" }, + { url = "https://files.pythonhosted.org/packages/72/d3/16dfe2bc810be142f06ef93b9eadfddc51309bcdb0ca80c566aa889f0dde/av-13.0.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6a3a4a572d3c70fd3d8709b9ae5d8a7cd6ef813b46d571a95477a87d0f3e282", size = 30565447, upload-time = "2024-09-04T08:28:37.579Z" }, + { url = "https://files.pythonhosted.org/packages/64/56/41f067fa8344027c03abbaeaf5826838c97404a47472c521a658f0656472/av-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ed3b70ca98c3f3ba130f23ec1393316eb714f35d41b4c1d9d1ef4951f862cc0", size = 32975707, upload-time = "2024-09-04T08:28:41.418Z" }, + { url = "https://files.pythonhosted.org/packages/23/53/182589a2501f44cde451a18c8db372fec714bd3dfdd8906277fce3b10c18/av-13.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:43db19eb2704a5a8b6060c070bcf05e0ce1132edb3140f8a19271ac8eac63706", size = 25747720, upload-time = "2024-09-04T08:28:44.816Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b3/37460a6b94ee2a284b8d585a19cc63b32a9318b4c1eee0e25b6f24df415a/av-13.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b3ec126e5c30a0d44c6ce6cd0be72b2af83529e5b19c41e6569a7c4d00261d04", size = 24224476, upload-time = "2024-09-04T08:28:48.276Z" }, + { url = "https://files.pythonhosted.org/packages/b0/a7/1cc83b2e0aeead07c3e9c59cbddf15f2b555578c6b725cc65bdbbec4c4d6/av-13.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0014c16d9123f50f366e32baed5c358429ed64c701ed5cea135fba333a5c9b13", size = 19438756, upload-time = "2024-09-04T08:28:51.511Z" }, + { url = "https://files.pythonhosted.org/packages/b3/b6/d6a85b89b14d60b360fb8eab65a9e7d8119d2807dcb025bc93baeff565a6/av-13.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3fa360cfc3e55ef1b22199741c74b584a57d2af75d5e5d9b54dd8cc999ae50bb", size = 
32084112, upload-time = "2024-09-04T08:28:54.434Z" }, + { url = "https://files.pythonhosted.org/packages/cf/1d/3b5d4ce10de1b383a1f68dcf4f7679a34f5f6cf8aad1a0dfcfbf05c5fd7e/av-13.0.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3519e3effea342295de5f52dbcd263800db2ab1ab5e43ec6485ba1ed07c2e503", size = 31396374, upload-time = "2024-09-04T08:28:58.027Z" }, + { url = "https://files.pythonhosted.org/packages/7a/8e/c5bea32963acacbc0db7b1c6e6d5a181afee2951981b88533c771beabc53/av-13.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f76e0060f4aa4be0911db624039e31c973dce9f9f2d410dc817b2b88e199a74", size = 33913273, upload-time = "2024-09-04T08:29:01.251Z" }, + { url = "https://files.pythonhosted.org/packages/ce/30/1912588c0bce8baf6e490103e5c4ef1963f8bc0f0c00d82cde2b6b3793fc/av-13.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b21254571904b214fc586568ba1da62d38f00cc4f76c7eebbe14af9f8dd8a40f", size = 25750490, upload-time = "2024-09-04T08:29:04.985Z" }, + { url = "https://files.pythonhosted.org/packages/df/90/f8120cebf0b86ff70691603a6fb1ef473d1fd9c99db058d0413e9a630538/av-13.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9eaf76c3a8a40dc3424ee9360b457143699d96f6e3faffb00867fd747b821ab9", size = 24238853, upload-time = "2024-09-04T08:29:08.611Z" }, + { url = "https://files.pythonhosted.org/packages/62/7d/090813d188eebbe183acad6e0cfbd9cdeca0e7f7318a0a3bd6f44ac7d16f/av-13.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:623809f0684bf4379328ced38a25c295969997ba574ed17b99fa4ee3aa564d66", size = 19446605, upload-time = "2024-09-04T08:29:11.922Z" }, + { url = "https://files.pythonhosted.org/packages/71/ec/bdc954939463127ca38ee023061be0ac89bdf2f2de6ab23f6a1d8112d070/av-13.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dc441b3899f1eb259af17acb2e5218762dcc99a4fbd6fe4d1f4155e253728b2", size = 32317356, upload-time = "2024-09-04T08:29:15.475Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/78/8d808f4868862b1b539ffd9af1775792f128a903f134c2dbfdb39a7799e3/av-13.0.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8b9654f9261ba123377b95fd5a9214e05ba43d7545cb41a5ae2dd5ea5fe6fbc9", size = 31666294, upload-time = "2024-09-04T08:29:18.805Z" }, + { url = "https://files.pythonhosted.org/packages/f7/fd/ee64d545a60c73795285cbe70f27e49b46c40e1ca3c8c35411b75ea310e6/av-13.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8157821b9da3814720d9b7ea45d961275dc73be8161eae7258afe2f737da5779", size = 34243366, upload-time = "2024-09-04T08:29:22.423Z" }, + { url = "https://files.pythonhosted.org/packages/c1/49/08552c5c2b838016cbba90547a0c082e9e8b700eaaf90c8eb0c11fec595e/av-13.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:736c4a9cb6ef6e5f3aa1cb12609a615f6c93bf16f36439010dc1ba160beed827", size = 25751891, upload-time = "2024-09-04T08:29:26.781Z" }, + { url = "https://files.pythonhosted.org/packages/4e/fd/08eeec9bd07129242989cb69cb45be5ff4c394af27b661d7c4428c460669/av-13.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4074615d89852dc8d7aa852b9162fe855bc2c6850e0cab74a875d4e72eefe343", size = 24197575, upload-time = "2024-09-04T08:29:30.194Z" }, + { url = "https://files.pythonhosted.org/packages/f3/0a/70d1848f325fd595f009f419e11134020aca1e0bf99c0041c0f5a767a01d/av-13.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a2df1f311610dcedbd0b08a5a419ae17076aa9cd808a6d4f0b5cb8c69d604e9f", size = 19406017, upload-time = "2024-09-04T08:29:32.951Z" }, + { url = "https://files.pythonhosted.org/packages/3f/10/2c1007829950cc1b7b17593d0d304adf008331729083af3d9b7c34e10b52/av-13.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1990d1398c25d90045c771450a64bf9aff33d8e6c89568fbbc5cc85ec6ceaa1", size = 31966860, upload-time = "2024-09-04T08:29:36.272Z" }, + { url = 
"https://files.pythonhosted.org/packages/1d/d7/f64af0713a669560ef33eea30c08add46916cab4ff0b26b473c14a9ff32c/av-13.0.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3303584abfcc2787a3dcf303fddcab0968329a309360c22348cc2c31e060f8d9", size = 31333914, upload-time = "2024-09-04T08:29:40.417Z" }, + { url = "https://files.pythonhosted.org/packages/c5/6c/647368ea1b60059a0a0dec3eae7c76b3aaec3e222c3cbcb54af0c2716d37/av-13.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05de5e2e6dde42d804dc41aa36102f64849fc72d0c7f9afc28406a7b240dba7a", size = 33908881, upload-time = "2024-09-04T08:29:44.161Z" }, + { url = "https://files.pythonhosted.org/packages/2a/bc/e2305f5e18eb47b5eac80e29de2fc1110898bb48131bb2a6d0d893080969/av-13.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:f9cea8906abf010f6d4894c7cad52e257667d0a498d4eec7e5beb4eff519d3ff", size = 25724252, upload-time = "2024-09-04T08:29:48.344Z" }, + { url = "https://files.pythonhosted.org/packages/35/6e/1cba0d4506a3855f718615a826958b5b9f08d3b263216b8ba2fc578e54da/av-13.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d066d441efbd329947ff36604422b3a22ee65a98a78caa0869d2400cebc46381", size = 23837589, upload-time = "2024-09-04T08:30:13.345Z" }, + { url = "https://files.pythonhosted.org/packages/2a/23/8553944c6d782c4fe0883f969866f2ab1ad8546a4361c942aa80873583d5/av-13.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9836b348f648ef5a364075626e623cef39383fe439159f5875e588429c7c90ea", size = 19091589, upload-time = "2024-09-04T08:30:16.075Z" }, + { url = "https://files.pythonhosted.org/packages/0d/d4/5286b9bea8d6a87853f93116f4eef6f3d5ab64a9382371d851eb705d9299/av-13.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52aeefdaa9fd5182aee1d4ae53325756273e293173810c77960e012a9a4efda0", size = 22823448, upload-time = "2024-09-04T08:30:19.446Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/3f/37253b9746459f570a871170d70c7c43eed58a4e755a9e1f2c67c27d6dbe/av-13.0.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aae4116c3cc94f514501f856df4a351eb3386fbc5623d3dcb17476237ffae221", size = 22673845, upload-time = "2024-09-04T08:30:22.129Z" }, + { url = "https://files.pythonhosted.org/packages/de/fa/e6995a721ce5ca9aa7e5a58dfeeb3df7c6f846f10e54ac32cbaf2948682a/av-13.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2425c8b0c8a022f10a20f3075bec05fc8efe4c5848e038d7d168cbbca089f08a", size = 24628585, upload-time = "2024-09-04T08:30:25.345Z" }, + { url = "https://files.pythonhosted.org/packages/33/b9/1023b925f6505cba49fe22a08020dd0dfb9185c42d4f26fc6217b9e1c2e2/av-13.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:894dc43623b959d00ab9a62c0357929ba7a8dd8667b37afb046caee756f9e90a", size = 25536060, upload-time = "2024-09-04T08:30:28.418Z" }, +] + [[package]] name = "azure-ai-inference" version = "1.0.0b9" @@ -1203,9 +1241,11 @@ embeddings = [ file-processing = [ { name = "aiocache" }, { name = "aiofiles" }, + { name = "av" }, { name = "pillow" }, { name = "pypdf" }, { name = "python-magic" }, + { name = "tinytag" }, ] google-genai = [ { name = "google-genai" }, @@ -1245,6 +1285,7 @@ requires-dist = [ { name = "aiosqlite", specifier = "~=0.21.0" }, { name = "anthropic", marker = "extra == 'anthropic'", specifier = "~=0.71.0" }, { name = "appdirs", specifier = "~=1.4.4" }, + { name = "av", marker = "extra == 'file-processing'", specifier = "~=13.0.0" }, { name = "azure-ai-inference", marker = "extra == 'azure-ai-inference'", specifier = "~=1.0.0b9" }, { name = "boto3", marker = "extra == 'aws'", specifier = "~=1.40.38" }, { name = "boto3", marker = "extra == 'bedrock'", specifier = "~=1.40.45" }, @@ -1282,6 +1323,7 @@ requires-dist = [ { name = "qdrant-client", extras = ["fastembed"], marker = "extra == 'qdrant'", specifier = "~=1.14.3" }, { name = 
"regex", specifier = "~=2024.9.11" }, { name = "tiktoken", marker = "extra == 'embeddings'", specifier = "~=0.8.0" }, + { name = "tinytag", marker = "extra == 'file-processing'", specifier = "~=1.10.0" }, { name = "tokenizers", specifier = "~=0.20.3" }, { name = "tomli", specifier = "~=2.0.2" }, { name = "tomli-w", specifier = "~=1.1.0" }, @@ -7806,6 +7848,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d6/14/fc04d491527b774ec7479897f5861959209de1480e4c4cd32ed098ff8bea/timm-1.0.22-py3-none-any.whl", hash = "sha256:888981753e65cbaacfc07494370138b1700a27b1f0af587f4f9b47bc024161d0", size = 2530238, upload-time = "2025-11-05T04:06:06.823Z" }, ] +[[package]] +name = "tinytag" +version = "1.10.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/59/b5/ff5e5f9ca9677be7272260f67c87f7e8e885babc7ce94604e837dcfd8d76/tinytag-1.10.1.tar.gz", hash = "sha256:122a63b836f85094aacca43fc807aaee3290be3de17d134f5f4a08b509ae268f", size = 40906, upload-time = "2023-10-26T19:30:38.791Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/04/ef783cbc4aa3a5ed75969e300b3e3929daf3d1b52fe80e950c63e0d66d95/tinytag-1.10.1-py3-none-any.whl", hash = "sha256:e437654d04c966fbbbdbf807af61eb9759f1d80e4173a7d26202506b37cfdaf0", size = 37900, upload-time = "2023-10-26T19:30:36.724Z" }, +] + [[package]] name = "tokenizers" version = "0.20.3"