mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-23 07:08:14 +00:00
565 lines
16 KiB
Python
565 lines
16 KiB
Python
"""File validation functions for checking against provider constraints."""
|
|
|
|
from collections.abc import Sequence
|
|
import io
|
|
import logging
|
|
|
|
from crewai_files.core.types import (
|
|
AudioFile,
|
|
FileInput,
|
|
ImageFile,
|
|
PDFFile,
|
|
TextFile,
|
|
VideoFile,
|
|
)
|
|
from crewai_files.processing.constraints import (
|
|
AudioConstraints,
|
|
ImageConstraints,
|
|
PDFConstraints,
|
|
ProviderConstraints,
|
|
VideoConstraints,
|
|
)
|
|
from crewai_files.processing.exceptions import (
|
|
FileTooLargeError,
|
|
FileValidationError,
|
|
UnsupportedFileTypeError,
|
|
)
|
|
|
|
|
|
# Module-level logger; the helpers below use it to report missing optional
# dependencies (warning) and metadata-probe failures (debug).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _get_image_dimensions(content: bytes) -> tuple[int, int] | None:
    """Get image dimensions using Pillow if available.

    Probing is best-effort: a ``None`` result tells the caller that the
    dimensions are unknown, not that the image is invalid.

    Args:
        content: Raw image bytes.

    Returns:
        Tuple of (width, height), or None if Pillow is unavailable or the
        bytes cannot be read as an image.
    """
    try:
        from PIL import Image
    except ImportError:
        logger.warning(
            "Pillow not installed - cannot validate image dimensions. "
            "Install with: pip install Pillow"
        )
        return None

    try:
        with Image.open(io.BytesIO(content)) as img:
            width, height = img.size
            return int(width), int(height)
    except Exception as e:
        # Corrupt or unrecognised image data must not escape as a raw Pillow
        # error; behave like the audio/video probes and report "unknown".
        logger.debug(f"Could not determine image dimensions: {e}")
        return None
|
|
|
|
|
|
def _get_pdf_page_count(content: bytes) -> int | None:
    """Get PDF page count using pypdf if available.

    Probing is best-effort: a ``None`` result tells the caller that the page
    count is unknown, not that the document is invalid.

    Args:
        content: Raw PDF bytes.

    Returns:
        Page count, or None if pypdf is unavailable or the bytes cannot be
        read as a PDF.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        logger.warning(
            "pypdf not installed - cannot validate PDF page count. "
            "Install with: pip install pypdf"
        )
        return None

    try:
        reader = PdfReader(io.BytesIO(content))
        return len(reader.pages)
    except Exception as e:
        # Malformed or unreadable PDFs must not escape as a raw pypdf error;
        # behave like the audio/video probes and report "unknown".
        logger.debug(f"Could not determine PDF page count: {e}")
        return None
|
|
|
|
|
|
def _get_audio_duration(content: bytes, filename: str | None = None) -> float | None:
    """Read the length of an audio clip, in seconds, via tinytag.

    Args:
        content: Raw audio bytes.
        filename: Optional filename forwarded to tinytag as a format hint.

    Returns:
        Duration in seconds, or None when tinytag is not installed or the
        duration could not be read.
    """
    try:
        from tinytag import TinyTag  # type: ignore[import-untyped]
    except ImportError:
        logger.warning(
            "tinytag not installed - cannot validate audio duration. "
            "Install with: pip install tinytag"
        )
        return None

    try:
        parsed = TinyTag.get(file_obj=io.BytesIO(content), filename=filename)
        seconds: float | None = parsed.duration
    except Exception as exc:
        # Any parsing failure is treated as "duration unknown".
        logger.debug(f"Could not determine audio duration: {exc}")
        return None
    else:
        return seconds
|
|
|
|
|
|
# MIME type -> container format name. _get_video_duration looks up the file's
# content type here and passes the result as the `format` hint to av.open().
_VIDEO_FORMAT_MAP: dict[str, str] = {
    "video/mp4": "mp4",
    "video/webm": "webm",
    "video/x-matroska": "matroska",
    "video/quicktime": "mov",
    "video/x-msvideo": "avi",
    "video/x-flv": "flv",
}
|
|
|
|
|
|
def _get_video_duration(
    content: bytes, content_type: str | None = None
) -> float | None:
    """Read the length of a video, in seconds, via PyAV.

    Args:
        content: Raw video bytes.
        content_type: Optional MIME type; when it appears in
            ``_VIDEO_FORMAT_MAP`` the mapped name is passed to ``av.open`` as
            the container format hint.

    Returns:
        Duration in seconds, or None when av is not installed, the container
        reports no duration, or the bytes could not be opened.
    """
    try:
        import av
    except ImportError:
        logger.warning(
            "av (PyAV) not installed - cannot validate video duration. "
            "Install with: pip install av"
        )
        return None

    hint: str | None = None
    if content_type:
        hint = _VIDEO_FORMAT_MAP.get(content_type)

    try:
        with av.open(io.BytesIO(content), format=hint) as container:  # type: ignore[attr-defined]
            raw: int | None = container.duration  # type: ignore[union-attr]
            # The raw value is divided by 1_000_000 to obtain seconds.
            return None if raw is None else float(raw) / 1_000_000
    except Exception as exc:
        # Any failure to open or inspect the container means "unknown".
        logger.debug(f"Could not determine video duration: {exc}")
        return None
|
|
|
|
|
|
def _format_size(size_bytes: int) -> str:
|
|
"""Format byte size to human-readable string."""
|
|
if size_bytes >= 1024 * 1024 * 1024:
|
|
return f"{size_bytes / (1024 * 1024 * 1024):.1f}GB"
|
|
if size_bytes >= 1024 * 1024:
|
|
return f"{size_bytes / (1024 * 1024):.1f}MB"
|
|
if size_bytes >= 1024:
|
|
return f"{size_bytes / 1024:.1f}KB"
|
|
return f"{size_bytes}B"
|
|
|
|
|
|
def _validate_size(
|
|
file_type: str,
|
|
filename: str | None,
|
|
file_size: int,
|
|
max_size: int,
|
|
errors: list[str],
|
|
raise_on_error: bool,
|
|
) -> None:
|
|
"""Validate file size against maximum.
|
|
|
|
Args:
|
|
file_type: Type label for error messages (e.g., "Image", "PDF").
|
|
filename: Name of the file being validated.
|
|
file_size: Actual file size in bytes.
|
|
max_size: Maximum allowed size in bytes.
|
|
errors: List to append error messages to.
|
|
raise_on_error: If True, raise FileTooLargeError on failure.
|
|
"""
|
|
if file_size > max_size:
|
|
msg = (
|
|
f"{file_type} '{filename}' size ({_format_size(file_size)}) exceeds "
|
|
f"maximum ({_format_size(max_size)})"
|
|
)
|
|
errors.append(msg)
|
|
if raise_on_error:
|
|
raise FileTooLargeError(
|
|
msg,
|
|
file_name=filename,
|
|
actual_size=file_size,
|
|
max_size=max_size,
|
|
)
|
|
|
|
|
|
def _validate_format(
|
|
file_type: str,
|
|
filename: str | None,
|
|
content_type: str,
|
|
supported_formats: tuple[str, ...],
|
|
errors: list[str],
|
|
raise_on_error: bool,
|
|
) -> None:
|
|
"""Validate content type against supported formats.
|
|
|
|
Args:
|
|
file_type: Type label for error messages (e.g., "Image", "Audio").
|
|
filename: Name of the file being validated.
|
|
content_type: MIME type of the file.
|
|
supported_formats: Tuple of supported MIME types.
|
|
errors: List to append error messages to.
|
|
raise_on_error: If True, raise UnsupportedFileTypeError on failure.
|
|
"""
|
|
if content_type not in supported_formats:
|
|
msg = (
|
|
f"{file_type} format '{content_type}' is not supported. "
|
|
f"Supported: {', '.join(supported_formats)}"
|
|
)
|
|
errors.append(msg)
|
|
if raise_on_error:
|
|
raise UnsupportedFileTypeError(
|
|
msg, file_name=filename, content_type=content_type
|
|
)
|
|
|
|
|
|
def validate_image(
    file: ImageFile,
    constraints: ImageConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Validate an image file against constraints.

    Checks run in order: byte size, MIME type, then pixel dimensions. The
    dimension checks are skipped when neither limit is configured or when the
    dimensions cannot be determined.

    Args:
        file: The image file to validate.
        constraints: Image constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file exceeds dimension limits.
        UnsupportedFileTypeError: If the format is not supported.
    """
    errors: list[str] = []
    content = file.read()
    file_size = len(content)
    filename = file.filename

    _validate_size(
        "Image", filename, file_size, constraints.max_size_bytes, errors, raise_on_error
    )
    _validate_format(
        "Image",
        filename,
        file.content_type,
        constraints.supported_formats,
        errors,
        raise_on_error,
    )

    if constraints.max_width is None and constraints.max_height is None:
        return errors

    dimensions = _get_image_dimensions(content)
    if dimensions is None:
        return errors

    width, height = dimensions
    # Width is checked before height so the first error raised is unchanged.
    for axis, actual, limit in (
        ("width", width, constraints.max_width),
        ("height", height, constraints.max_height),
    ):
        # Test against None explicitly (not truthiness) so this agrees with
        # the "is a limit configured" gate above; a limit of 0 is enforced.
        if limit is not None and actual > limit:
            msg = f"Image '{filename}' {axis} ({actual}px) exceeds maximum ({limit}px)"
            errors.append(msg)
            if raise_on_error:
                raise FileValidationError(msg, file_name=filename)

    return errors
|
|
|
|
|
|
def validate_pdf(
    file: PDFFile,
    constraints: PDFConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Check a PDF against its size and page-count limits.

    The page-count check is skipped when no page limit is configured or when
    the page count cannot be determined.

    Args:
        file: The PDF file to validate.
        constraints: PDF constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file exceeds page limits.
    """
    problems: list[str] = []
    payload = file.read()
    size = len(payload)
    name = file.filename

    _validate_size(
        "PDF", name, size, constraints.max_size_bytes, problems, raise_on_error
    )

    if constraints.max_pages is None:
        return problems

    pages = _get_pdf_page_count(payload)
    if pages is not None and pages > constraints.max_pages:
        message = (
            f"PDF '{name}' page count ({pages}) exceeds "
            f"maximum ({constraints.max_pages})"
        )
        problems.append(message)
        if raise_on_error:
            raise FileValidationError(message, file_name=name)

    return problems
|
|
|
|
|
|
def validate_audio(
    file: AudioFile,
    constraints: AudioConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Check an audio file against its size, format and duration limits.

    The duration check is skipped when no duration limit is configured or
    when the duration cannot be determined.

    Args:
        file: The audio file to validate.
        constraints: Audio constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file exceeds duration limits.
        UnsupportedFileTypeError: If the format is not supported.
    """
    problems: list[str] = []
    payload = file.read()
    size = len(payload)
    name = file.filename

    _validate_size(
        "Audio", name, size, constraints.max_size_bytes, problems, raise_on_error
    )
    _validate_format(
        "Audio",
        name,
        file.content_type,
        constraints.supported_formats,
        problems,
        raise_on_error,
    )

    if constraints.max_duration_seconds is None:
        return problems

    # The filename is forwarded as a format-detection hint.
    seconds = _get_audio_duration(payload, name)
    if seconds is not None and seconds > constraints.max_duration_seconds:
        message = (
            f"Audio '{name}' duration ({seconds:.1f}s) exceeds "
            f"maximum ({constraints.max_duration_seconds}s)"
        )
        problems.append(message)
        if raise_on_error:
            raise FileValidationError(message, file_name=name)

    return problems
|
|
|
|
|
|
def validate_video(
    file: VideoFile,
    constraints: VideoConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Validate a video file against constraints.

    Checks run in order: byte size, MIME type, then duration. The duration
    check is skipped when no duration limit is configured or when the
    duration cannot be determined.

    Args:
        file: The video file to validate.
        constraints: Video constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file exceeds duration limits.
        UnsupportedFileTypeError: If the format is not supported.
    """
    errors: list[str] = []
    content = file.read()
    file_size = len(content)
    filename = file.filename

    _validate_size(
        "Video",
        filename,
        file_size,
        constraints.max_size_bytes,
        errors,
        raise_on_error,
    )
    _validate_format(
        "Video",
        filename,
        file.content_type,
        constraints.supported_formats,
        errors,
        raise_on_error,
    )

    if constraints.max_duration_seconds is not None:
        # Pass the MIME type so the probe can use its container-format hint;
        # without it the helper's content_type parameter is never exercised.
        duration = _get_video_duration(content, file.content_type)
        if duration is not None and duration > constraints.max_duration_seconds:
            msg = (
                f"Video '{filename}' duration ({duration:.1f}s) exceeds "
                f"maximum ({constraints.max_duration_seconds}s)"
            )
            errors.append(msg)
            if raise_on_error:
                raise FileValidationError(msg, file_name=filename)

    return errors
|
|
|
|
|
|
def validate_text(
    file: TextFile,
    constraints: ProviderConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Check a text file against the provider's general size limit.

    When the provider defines no general size limit the file is not read and
    an empty list is returned.

    Args:
        file: The text file to validate.
        constraints: Provider constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
    """
    problems: list[str] = []

    if constraints.general_max_size_bytes is not None:
        size = len(file.read())
        _validate_size(
            "Text file",
            file.filename,
            size,
            constraints.general_max_size_bytes,
            problems,
            raise_on_error,
        )

    return problems
|
|
|
|
|
|
def _check_unsupported_type(
|
|
file: FileInput,
|
|
provider_name: str,
|
|
type_name: str,
|
|
raise_on_error: bool,
|
|
) -> Sequence[str]:
|
|
"""Check if file type is unsupported and handle error.
|
|
|
|
Args:
|
|
file: The file being validated.
|
|
provider_name: Name of the provider.
|
|
type_name: Name of the file type (e.g., "images", "PDFs").
|
|
raise_on_error: If True, raise exception instead of returning errors.
|
|
|
|
Returns:
|
|
List with error message (only returns when raise_on_error is False).
|
|
|
|
Raises:
|
|
UnsupportedFileTypeError: If raise_on_error is True.
|
|
"""
|
|
msg = f"Provider '{provider_name}' does not support {type_name}"
|
|
if raise_on_error:
|
|
raise UnsupportedFileTypeError(
|
|
msg, file_name=file.filename, content_type=file.content_type
|
|
)
|
|
return [msg]
|
|
|
|
|
|
def validate_file(
    file: FileInput,
    constraints: ProviderConstraints,
    *,
    raise_on_error: bool = True,
) -> Sequence[str]:
    """Route a file to the validator that matches its type.

    Image, PDF, audio and video files are checked against the matching
    sub-constraints; when the provider defines none for that type the file is
    reported as unsupported. Text files are checked against the provider's
    general constraints. Any other file type yields an empty list.

    Args:
        file: The file to validate.
        constraints: Provider constraints to validate against.
        raise_on_error: If True, raise exceptions on validation failure.

    Returns:
        List of validation error messages (empty if valid).

    Raises:
        FileTooLargeError: If the file exceeds size limits.
        FileValidationError: If the file fails other validation checks.
        UnsupportedFileTypeError: If the file type is not supported.
    """
    if isinstance(file, ImageFile):
        if constraints.image is not None:
            return validate_image(
                file, constraints.image, raise_on_error=raise_on_error
            )
        return _check_unsupported_type(
            file, constraints.name, "images", raise_on_error
        )

    if isinstance(file, PDFFile):
        if constraints.pdf is not None:
            return validate_pdf(file, constraints.pdf, raise_on_error=raise_on_error)
        return _check_unsupported_type(
            file, constraints.name, "PDFs", raise_on_error
        )

    if isinstance(file, AudioFile):
        if constraints.audio is not None:
            return validate_audio(
                file, constraints.audio, raise_on_error=raise_on_error
            )
        return _check_unsupported_type(
            file, constraints.name, "audio", raise_on_error
        )

    if isinstance(file, VideoFile):
        if constraints.video is not None:
            return validate_video(
                file, constraints.video, raise_on_error=raise_on_error
            )
        return _check_unsupported_type(
            file, constraints.name, "video", raise_on_error
        )

    if isinstance(file, TextFile):
        return validate_text(file, constraints, raise_on_error=raise_on_error)

    # No branch matched: there is nothing to validate for this file type.
    return []
|