mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-05-01 23:32:39 +00:00
refactor: extract files module to standalone crewai-files package
lib/crewai-files/src/crewai_files/processing/processor.py (new file, 346 lines)
@@ -0,0 +1,346 @@
"""FileProcessor for validating and transforming files based on provider constraints."""

import asyncio
import logging
from collections.abc import Sequence

from crewai_files.core.types import (
    AudioFile,
    File,
    FileInput,
    ImageFile,
    PDFFile,
    TextFile,
    VideoFile,
)
from crewai_files.processing.constraints import (
    ProviderConstraints,
    get_constraints_for_provider,
)
from crewai_files.processing.enums import FileHandling
from crewai_files.processing.exceptions import (
    FileProcessingError,
    FileTooLargeError,
    FileValidationError,
    UnsupportedFileTypeError,
)
from crewai_files.processing.transformers import (
    chunk_pdf,
    chunk_text,
    get_image_dimensions,
    get_pdf_page_count,
    optimize_image,
    resize_image,
)
from crewai_files.processing.validators import validate_file


logger = logging.getLogger(__name__)


class FileProcessor:
    """Processes files according to provider constraints and per-file handling mode.

    Validates files against provider-specific limits and optionally transforms
    them (resize, compress, chunk) to meet those limits. Each file specifies
    its own handling mode via `file.mode`.

    Attributes:
        constraints: Provider constraints for validation.
    """

    def __init__(
        self,
        constraints: ProviderConstraints | str | None = None,
    ) -> None:
        """Initialize the FileProcessor.

        Args:
            constraints: Provider constraints or provider name string.
                If None, validation is skipped.
        """
        if isinstance(constraints, str):
            resolved = get_constraints_for_provider(constraints)
            if resolved is None:
                logger.warning(
                    f"Unknown provider '{constraints}' - validation disabled"
                )
            self.constraints = resolved
        else:
            self.constraints = constraints
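
    # A minimal usage sketch (illustrative, not part of the source): the
    # provider name "openai" is an assumption and only takes effect if
    # get_constraints_for_provider knows it; unknown names disable validation.
    #
    #     processor = FileProcessor(constraints="openai")
    #     errors = processor.validate(some_file)  # [] when the file fits the limits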

    def validate(self, file: FileInput) -> Sequence[str]:
        """Validate a file against provider constraints.

        Args:
            file: The file to validate.

        Returns:
            List of validation error messages (empty if valid).

        Raises:
            FileValidationError: If file.mode is STRICT and validation fails.
        """
        if self.constraints is None:
            return []

        mode = self._get_mode(file)
        raise_on_error = mode == FileHandling.STRICT
        return validate_file(file, self.constraints, raise_on_error=raise_on_error)

    @staticmethod
    def _get_mode(file: FileInput) -> FileHandling:
        """Get the handling mode for a file.

        Args:
            file: The file to get the mode for.

        Returns:
            The file's handling mode, defaulting to AUTO.
        """
        mode = getattr(file, "mode", None)
        if mode is None:
            return FileHandling.AUTO
        if isinstance(mode, str):
            return FileHandling(mode)
        if isinstance(mode, FileHandling):
            return mode
        return FileHandling.AUTO
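
    # Sketch of how _get_mode resolves values (the lowercase value strings are
    # an assumption about the FileHandling enum; adjust if its values differ):
    #
    #     file.mode = "strict"           -> FileHandling.STRICT
    #     file.mode = FileHandling.WARN  -> FileHandling.WARN (returned as-is)
    #     no mode attribute, or None     -> FileHandling.AUTO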

    def process(self, file: FileInput) -> FileInput | Sequence[FileInput]:
        """Process a single file according to constraints and its handling mode.

        Args:
            file: The file to process.

        Returns:
            The processed file (possibly transformed) or a sequence of files
            if the file was chunked.

        Raises:
            FileValidationError: If file.mode is STRICT and validation fails.
            FileProcessingError: If file.mode is STRICT and processing fails.
        """
        if self.constraints is None:
            return file

        mode = self._get_mode(file)

        try:
            errors = self.validate(file)

            if not errors:
                return file

            if mode == FileHandling.STRICT:
                raise FileValidationError("; ".join(errors), file_name=file.filename)

            if mode == FileHandling.WARN:
                for error in errors:
                    logger.warning(error)
                return file

            if mode == FileHandling.AUTO:
                return self._auto_process(file)

            if mode == FileHandling.CHUNK:
                return self._chunk_process(file)

            return file

        except (FileValidationError, FileTooLargeError, UnsupportedFileTypeError):
            raise
        except Exception as e:
            logger.error(f"Error processing file '{file.filename}': {e}")
            if mode == FileHandling.STRICT:
                raise FileProcessingError(str(e), file_name=file.filename) from e
            return file
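
    # Illustrative sketch of the per-file mode driving process(); the PDFFile
    # constructor arguments shown are hypothetical:
    #
    #     pdf = PDFFile(path="report.pdf", mode=FileHandling.CHUNK)
    #     out = processor.process(pdf)
    #     # out is the original file if it fits the limits, otherwise a
    #     # sequence of chunks produced by _chunk_process().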

    def process_files(
        self,
        files: dict[str, FileInput],
    ) -> dict[str, FileInput]:
        """Process multiple files according to constraints.

        Args:
            files: Dictionary mapping names to file inputs.

        Returns:
            Dictionary mapping names to processed files. If a file is chunked,
            multiple entries are created with indexed names.
        """
        result: dict[str, FileInput] = {}

        for name, file in files.items():
            processed = self.process(file)

            if isinstance(processed, Sequence) and not isinstance(
                processed, (str, bytes)
            ):
                for i, chunk in enumerate(processed):
                    chunk_name = f"{name}_chunk_{i}"
                    result[chunk_name] = chunk
            else:
                result[name] = processed

        return result
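
    # Naming sketch (hypothetical numbers): if "report" maps to a 30-page PDF
    # and the provider allows 10 pages, process() returns three chunks and
    # process_files() stores them under "report_chunk_0", "report_chunk_1",
    # and "report_chunk_2".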

    async def aprocess_files(
        self,
        files: dict[str, FileInput],
        max_concurrency: int = 10,
    ) -> dict[str, FileInput]:
        """Async process multiple files in parallel.

        Args:
            files: Dictionary mapping names to file inputs.
            max_concurrency: Maximum number of concurrent processing tasks.

        Returns:
            Dictionary mapping names to processed files. If a file is chunked,
            multiple entries are created with indexed names.
        """
        semaphore = asyncio.Semaphore(max_concurrency)

        async def process_single(
            key: str, input_file: FileInput
        ) -> tuple[str, FileInput | Sequence[FileInput]]:
            """Process a single file with semaphore limiting."""
            async with semaphore:
                loop = asyncio.get_running_loop()
                result = await loop.run_in_executor(None, self.process, input_file)
                return key, result

        tasks = [process_single(n, f) for n, f in files.items()]
        gather_results = await asyncio.gather(*tasks, return_exceptions=True)

        output: dict[str, FileInput] = {}
        for item in gather_results:
            if isinstance(item, BaseException):
                logger.error(f"Processing failed: {item}")
                continue
            entry_name, processed = item
            if isinstance(processed, Sequence) and not isinstance(
                processed, (str, bytes)
            ):
                for i, chunk in enumerate(processed):
                    output[f"{entry_name}_chunk_{i}"] = chunk
            elif isinstance(
                processed, (AudioFile, File, ImageFile, PDFFile, TextFile, VideoFile)
            ):
                output[entry_name] = processed

        return output
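
    # Async usage sketch (assumes no event loop is already running; `files`
    # is a hypothetical dict of FileInput objects):
    #
    #     results = asyncio.run(
    #         FileProcessor("openai").aprocess_files(files, max_concurrency=4)
    #     )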

    def _auto_process(self, file: FileInput) -> FileInput:
        """Automatically resize/compress file to meet constraints.

        Args:
            file: The file to process.

        Returns:
            The processed file.
        """
        if self.constraints is None:
            return file

        if isinstance(file, ImageFile) and self.constraints.image is not None:
            return self._auto_process_image(file)

        if isinstance(file, PDFFile) and self.constraints.pdf is not None:
            logger.warning(
                f"Cannot auto-compress PDF '{file.filename}'. "
                "Consider using CHUNK mode for large PDFs."
            )
            return file

        if isinstance(file, (AudioFile, VideoFile)):
            logger.warning(
                f"Auto-processing not supported for {type(file).__name__}. "
                "File will be used as-is."
            )
            return file

        return file

    def _auto_process_image(self, file: ImageFile) -> ImageFile:
        """Auto-process an image file.

        Args:
            file: The image file to process.

        Returns:
            The processed image file.
        """
        if self.constraints is None or self.constraints.image is None:
            return file

        image_constraints = self.constraints.image
        processed = file
        content = file.read()
        current_size = len(content)

        if image_constraints.max_width or image_constraints.max_height:
            dimensions = get_image_dimensions(file)
            if dimensions:
                width, height = dimensions
                max_w = image_constraints.max_width or width
                max_h = image_constraints.max_height or height

                if width > max_w or height > max_h:
                    try:
                        processed = resize_image(file, max_w, max_h)
                        content = processed.read()
                        current_size = len(content)
                    except Exception as e:
                        logger.warning(f"Failed to resize image: {e}")

        if current_size > image_constraints.max_size_bytes:
            try:
                processed = optimize_image(processed, image_constraints.max_size_bytes)
            except Exception as e:
                logger.warning(f"Failed to optimize image: {e}")

        return processed

    def _chunk_process(self, file: FileInput) -> FileInput | Sequence[FileInput]:
        """Split file into chunks to meet constraints.

        Args:
            file: The file to chunk.

        Returns:
            Original file if chunking is not needed, or a sequence of chunked files.
        """
        if self.constraints is None:
            return file

        if isinstance(file, PDFFile) and self.constraints.pdf is not None:
            max_pages = self.constraints.pdf.max_pages
            if max_pages is not None:
                page_count = get_pdf_page_count(file)
                if page_count is not None and page_count > max_pages:
                    try:
                        return list(chunk_pdf(file, max_pages))
                    except Exception as e:
                        logger.warning(f"Failed to chunk PDF: {e}")
                        return file

        if isinstance(file, TextFile):
            # Use general max size as a character-limit approximation.
            max_size = self.constraints.general_max_size_bytes
            if max_size is not None:
                content = file.read()
                if len(content) > max_size:
                    try:
                        return list(chunk_text(file, max_size))
                    except Exception as e:
                        logger.warning(f"Failed to chunk text file: {e}")
                        return file

        if isinstance(file, (ImageFile, AudioFile, VideoFile)):
            logger.warning(
                f"Chunking not supported for {type(file).__name__}. "
                "Consider using AUTO mode for images."
            )

        return file
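
# Chunking sketch (hypothetical numbers, assuming chunk_text splits by size):
# a 250 KB TextFile against a 100 KB general_max_size_bytes limit exceeds the
# cap, so _chunk_process() returns list(chunk_text(file, 100_000)) - roughly
# three chunks - which process_files() names "<key>_chunk_0" through
# "<key>_chunk_2".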