Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-26 16:48:13 +00:00)
refactor: reorganize files module with centralized constants and utilities
@@ -73,69 +73,7 @@ from crewai.files.upload_cache import (
     reset_upload_cache,
 )
 from crewai.files.uploaders import FileUploader, UploadResult, get_uploader
+from crewai.files.utils import normalize_input_files, wrap_file_source
 
-
-def wrap_file_source(source: FileSource) -> FileInput:
-    """Wrap a FileSource in the appropriate typed FileInput wrapper.
-
-    Args:
-        source: The file source to wrap.
-
-    Returns:
-        Typed FileInput wrapper based on content type.
-    """
-    content_type = source.content_type
-
-    if content_type.startswith("image/"):
-        return ImageFile(source=source)
-    if content_type.startswith("audio/"):
-        return AudioFile(source=source)
-    if content_type.startswith("video/"):
-        return VideoFile(source=source)
-    if content_type == "application/pdf":
-        return PDFFile(source=source)
-    return TextFile(source=source)
-
-
-def normalize_input_files(
-    input_files: list[FileSourceInput | FileInput],
-) -> dict[str, FileInput]:
-    """Convert a list of file sources to a named dictionary of FileInputs.
-
-    Args:
-        input_files: List of file source inputs or File objects.
-
-    Returns:
-        Dictionary mapping names to FileInput wrappers.
-    """
-    from pathlib import Path
-
-    result: dict[str, FileInput] = {}
-
-    for i, item in enumerate(input_files):
-        if isinstance(item, BaseFile):
-            name = item.filename or f"file_{i}"
-            if "." in name:
-                name = name.rsplit(".", 1)[0]
-            result[name] = item
-            continue
-
-        file_source: FilePath | FileBytes | FileStream
-        if isinstance(item, (FilePath, FileBytes, FileStream)):
-            file_source = item
-        elif isinstance(item, Path):
-            file_source = FilePath(path=item)
-        elif isinstance(item, str):
-            file_source = FilePath(path=Path(item))
-        elif isinstance(item, (bytes, memoryview)):
-            file_source = FileBytes(data=bytes(item))
-        else:
-            continue
-
-        name = file_source.filename or f"file_{i}"
-        result[name] = wrap_file_source(file_source)
-
-    return result
-
-
 __all__ = [
lib/crewai/src/crewai/files/constants.py (new file, 26 lines added)
@@ -0,0 +1,26 @@
+"""Constants for file handling utilities."""
+
+from datetime import timedelta
+from typing import Final, Literal
+
+
+DEFAULT_MAX_FILE_SIZE_BYTES: Final[Literal[524_288_000]] = 524_288_000
+MAGIC_BUFFER_SIZE: Final[Literal[2048]] = 2048
+
+UPLOAD_MAX_RETRIES: Final[Literal[3]] = 3
+UPLOAD_RETRY_DELAY_BASE: Final[Literal[2]] = 2
+
+DEFAULT_TTL_SECONDS: Final[Literal[86_400]] = 86_400
+DEFAULT_MAX_CACHE_ENTRIES: Final[Literal[1000]] = 1000
+
+GEMINI_FILE_TTL: Final[timedelta] = timedelta(hours=48)
+BACKOFF_BASE_DELAY: Final[float] = 1.0
+BACKOFF_MAX_DELAY: Final[float] = 30.0
+BACKOFF_JITTER_FACTOR: Final[float] = 0.1
+
+FILES_API_MAX_SIZE: Final[Literal[536_870_912]] = 536_870_912
+DEFAULT_UPLOAD_CHUNK_SIZE: Final[Literal[67_108_864]] = 67_108_864
+
+MULTIPART_THRESHOLD: Final[Literal[8_388_608]] = 8_388_608
+MULTIPART_CHUNKSIZE: Final[Literal[8_388_608]] = 8_388_608
+MAX_CONCURRENCY: Final[Literal[10]] = 10
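The bare literals mirror the power-of-two expressions they replace elsewhere in this commit: 524_288_000 is 500 * 1024 * 1024, 536_870_912 is 512 * 1024 * 1024, 67_108_864 is 64 * 1024 * 1024, and 86_400 is 24 * 60 * 60. A minimal sketch of the usage pattern this enables (check_upload_size is a hypothetical helper, not part of the commit):

    from crewai.files.constants import DEFAULT_MAX_FILE_SIZE_BYTES

    def check_upload_size(n_bytes: int) -> bool:
        # 524_288_000 bytes == 500 * 1024 * 1024 == 500 MiB
        return n_bytes <= DEFAULT_MAX_FILE_SIZE_BYTES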
@@ -9,7 +9,6 @@ from typing import Annotated, Any, BinaryIO, Literal, Self
 
 from pydantic import BaseModel, Field, GetCoreSchemaHandler
 from pydantic_core import CoreSchema, core_schema
-from typing_extensions import TypeIs
 
 from crewai.files.file import (
     AsyncFileStream,
@@ -18,6 +17,7 @@ from crewai.files.file import (
     FileSource,
     FileStream,
 )
+from crewai.files.utils import is_file_source
 
 
 FileSourceInput = str | Path | bytes | IOBase | FileSource
@@ -60,12 +60,6 @@ class _FileSourceCoercer:
 
 CoercedFileSource = Annotated[FileSourceInput, _FileSourceCoercer]
 
 
-def _is_file_source(v: FileSourceInput) -> TypeIs[FileSource]:
-    """Type guard to narrow FileSourceInput to FileSource."""
-    return isinstance(v, (FilePath, FileBytes, FileStream))
-
-
 FileMode = Literal["strict", "auto", "warn", "chunk"]
 
 
@@ -184,7 +178,7 @@ class BaseFile(ABC, BaseModel):
     @property
     def _file_source(self) -> FileSource:
         """Get source with narrowed type (always FileSource after validation)."""
-        if _is_file_source(self.source):
+        if is_file_source(self.source):
             return self.source
         raise TypeError("source must be a FileSource after validation")
 
@@ -18,6 +18,8 @@ from pydantic import (
 )
 from pydantic_core import CoreSchema, core_schema
 
+from crewai.files.constants import DEFAULT_MAX_FILE_SIZE_BYTES, MAGIC_BUFFER_SIZE
+
 
 @runtime_checkable
 class AsyncReadable(Protocol):
@@ -51,7 +53,14 @@ class _AsyncReadableValidator:
 
 ValidatedAsyncReadable = Annotated[AsyncReadable, _AsyncReadableValidator()]
 
-DEFAULT_MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024  # 500MB
+
+def _fallback_content_type(filename: str | None) -> str:
+    """Get content type from filename extension or return default."""
+    if filename:
+        mime_type, _ = mimetypes.guess_type(filename)
+        if mime_type:
+            return mime_type
+    return "application/octet-stream"
 
 
 def detect_content_type(data: bytes, filename: str | None = None) -> str:
@@ -61,7 +70,7 @@ def detect_content_type(data: bytes, filename: str | None = None) -> str:
     falls back to mimetypes module using filename extension.
 
     Args:
-        data: Raw bytes to analyze.
+        data: Raw bytes to analyze (only first 2048 bytes are used).
         filename: Optional filename for extension-based fallback.
 
     Returns:
@@ -70,14 +79,32 @@ def detect_content_type(data: bytes, filename: str | None = None) -> str:
     try:
         import magic
 
-        result: str = magic.from_buffer(data, mime=True)
+        result: str = magic.from_buffer(data[:MAGIC_BUFFER_SIZE], mime=True)
         return result
     except ImportError:
-        if filename:
-            mime_type, _ = mimetypes.guess_type(filename)
-            if mime_type:
-                return mime_type
-        return "application/octet-stream"
+        return _fallback_content_type(filename)
+
+
+def detect_content_type_from_path(path: Path, filename: str | None = None) -> str:
+    """Detect MIME type from file path.
+
+    Uses python-magic's from_file() for accurate detection without reading
+    the entire file into memory.
+
+    Args:
+        path: Path to the file.
+        filename: Optional filename for extension-based fallback.
+
+    Returns:
+        The detected MIME type.
+    """
+    try:
+        import magic
+
+        result: str = magic.from_file(str(path), mime=True)
+        return result
+    except ImportError:
+        return _fallback_content_type(filename or path.name)
 
 
 class _BinaryIOValidator:
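A short sketch of the two detection paths, assuming these helpers live in crewai.files.file as the surrounding imports suggest; "sample.bin" is a hypothetical file, and if python-magic is absent both paths fall back to _fallback_content_type:

    from pathlib import Path

    from crewai.files.file import detect_content_type, detect_content_type_from_path

    # Buffer-based: sniffs at most MAGIC_BUFFER_SIZE (2048) bytes of the payload.
    print(detect_content_type(b"%PDF-1.7 ...", filename="doc.pdf"))  # application/pdf

    # Path-based: libmagic reads the file header itself, so a multi-gigabyte file
    # is never loaded into memory just to learn its MIME type.
    print(detect_content_type_from_path(Path("sample.bin")))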
@@ -114,6 +141,7 @@ class FilePath(BaseModel):
         description="Maximum file size in bytes.",
     )
     _content: bytes | None = PrivateAttr(default=None)
+    _content_type: str = PrivateAttr()
 
     @model_validator(mode="after")
     def _validate_file_exists(self) -> FilePath:
@@ -144,6 +172,7 @@
             max_size=self.max_size_bytes,
         )
 
+        self._content_type = detect_content_type_from_path(self.path, self.path.name)
         return self
 
     @property
@@ -153,8 +182,8 @@
 
     @property
     def content_type(self) -> str:
-        """Get the content type by reading file content."""
-        return detect_content_type(self.read(), self.filename)
+        """Get the content type."""
+        return self._content_type
 
     def read(self) -> bytes:
         """Read the file content from disk."""
@@ -201,11 +230,18 @@ class FileBytes(BaseModel):
 
     data: bytes = Field(description="Raw bytes content of the file.")
     filename: str | None = Field(default=None, description="Optional filename.")
+    _content_type: str = PrivateAttr()
+
+    @model_validator(mode="after")
+    def _detect_content_type(self) -> FileBytes:
+        """Detect and cache content type from data."""
+        self._content_type = detect_content_type(self.data, self.filename)
+        return self
 
     @property
     def content_type(self) -> str:
-        """Get the content type from the data."""
-        return detect_content_type(self.data, self.filename)
+        """Get the content type."""
+        return self._content_type
 
     def read(self) -> bytes:
         """Return the bytes content."""
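The caching idiom FileBytes now uses (compute once in an after-validator, store on a PrivateAttr, serve from a property) in isolation, as a generic pydantic v2 sketch; Blob is illustrative, not crewAI code:

    import hashlib

    from pydantic import BaseModel, PrivateAttr, model_validator

    class Blob(BaseModel):
        data: bytes
        _digest: str = PrivateAttr()

        @model_validator(mode="after")
        def _compute_digest(self) -> "Blob":
            # runs once at construction; repeated property reads are cheap lookups
            self._digest = hashlib.sha256(self.data).hexdigest()
            return self

        @property
        def digest(self) -> str:
            return self._digest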
@@ -246,18 +282,27 @@ class FileStream(BaseModel):
     stream: ValidatedBinaryIO = Field(description="Binary file stream.")
     filename: str | None = Field(default=None, description="Optional filename.")
     _content: bytes | None = PrivateAttr(default=None)
+    _content_type: str = PrivateAttr()
 
-    def model_post_init(self, __context: object) -> None:
-        """Extract filename from stream if not provided."""
+    @model_validator(mode="after")
+    def _initialize(self) -> FileStream:
+        """Extract filename and detect content type."""
         if self.filename is None:
             name = getattr(self.stream, "name", None)
             if name is not None:
-                object.__setattr__(self, "filename", Path(name).name)
+                self.filename = Path(name).name
+
+        position = self.stream.tell()
+        self.stream.seek(0)
+        header = self.stream.read(MAGIC_BUFFER_SIZE)
+        self.stream.seek(position)
+        self._content_type = detect_content_type(header, self.filename)
+        return self
 
     @property
     def content_type(self) -> str:
-        """Get the content type from stream content."""
-        return detect_content_type(self.read(), self.filename)
+        """Get the content type."""
+        return self._content_type
 
     def read(self) -> bytes:
         """Read the stream content. Content is cached after first read."""
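The seek dance in _initialize sniffs only the header while leaving the caller's stream position untouched; the same idiom in isolation:

    import io

    stream = io.BytesIO(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)
    stream.read(10)              # a consumer may already have advanced the stream
    position = stream.tell()
    stream.seek(0)
    header = stream.read(2048)   # read just MAGIC_BUFFER_SIZE bytes for detection
    stream.seek(position)        # restore: the caller observes no side effect
    assert stream.tell() == position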
@@ -319,13 +364,16 @@ class AsyncFileStream(BaseModel):
     )
     filename: str | None = Field(default=None, description="Optional filename.")
     _content: bytes | None = PrivateAttr(default=None)
+    _content_type: str | None = PrivateAttr(default=None)
 
     @property
     def content_type(self) -> str:
-        """Get the content type from stream content. Requires aread() first."""
+        """Get the content type from stream content (cached). Requires aread() first."""
         if self._content is None:
             raise RuntimeError("Call aread() first to load content")
-        return detect_content_type(self._content, self.filename)
+        if self._content_type is None:
+            self._content_type = detect_content_type(self._content, self.filename)
+        return self._content_type
 
     async def aread(self) -> bytes:
         """Async read the stream content. Content is cached after first read."""
@@ -1,6 +1,7 @@
 """Provider-specific file constraints for multimodal content."""
 
 from dataclasses import dataclass
+from functools import lru_cache
 from typing import Literal
 
 from crewai.files.content_types import (
@@ -162,47 +163,47 @@ class ProviderConstraints:
 ANTHROPIC_CONSTRAINTS = ProviderConstraints(
     name="anthropic",
     image=ImageConstraints(
-        max_size_bytes=5 * 1024 * 1024,
+        max_size_bytes=5_242_880,
         max_width=8000,
         max_height=8000,
     ),
     pdf=PDFConstraints(
-        max_size_bytes=30 * 1024 * 1024,
+        max_size_bytes=31_457_280,
         max_pages=100,
     ),
     supports_file_upload=True,
-    file_upload_threshold_bytes=5 * 1024 * 1024,
+    file_upload_threshold_bytes=5_242_880,
 )
 
 OPENAI_CONSTRAINTS = ProviderConstraints(
     name="openai",
     image=ImageConstraints(
-        max_size_bytes=20 * 1024 * 1024,
+        max_size_bytes=20_971_520,
         max_images_per_request=10,
     ),
     supports_file_upload=True,
-    file_upload_threshold_bytes=5 * 1024 * 1024,
+    file_upload_threshold_bytes=5_242_880,
 )
 
 GEMINI_CONSTRAINTS = ProviderConstraints(
     name="gemini",
     image=ImageConstraints(
-        max_size_bytes=100 * 1024 * 1024,
+        max_size_bytes=104_857_600,
         supported_formats=GEMINI_IMAGE_FORMATS,
     ),
     pdf=PDFConstraints(
-        max_size_bytes=50 * 1024 * 1024,
+        max_size_bytes=52_428_800,
     ),
     audio=AudioConstraints(
-        max_size_bytes=100 * 1024 * 1024,
+        max_size_bytes=104_857_600,
         supported_formats=GEMINI_AUDIO_FORMATS,
     ),
     video=VideoConstraints(
-        max_size_bytes=2 * 1024 * 1024 * 1024,
+        max_size_bytes=2_147_483_648,
         supported_formats=GEMINI_VIDEO_FORMATS,
     ),
     supports_file_upload=True,
-    file_upload_threshold_bytes=20 * 1024 * 1024,
+    file_upload_threshold_bytes=20_971_520,
 )
 
 BEDROCK_CONSTRAINTS = ProviderConstraints(
@@ -221,7 +222,7 @@ BEDROCK_CONSTRAINTS = ProviderConstraints(
 AZURE_CONSTRAINTS = ProviderConstraints(
     name="azure",
     image=ImageConstraints(
-        max_size_bytes=20 * 1024 * 1024,
+        max_size_bytes=20_971_520,
         max_images_per_request=10,
     ),
 )
@@ -240,6 +241,7 @@ _PROVIDER_CONSTRAINTS_MAP: dict[str, ProviderConstraints] = {
 }
 
 
+@lru_cache(maxsize=32)
 def get_constraints_for_provider(
     provider: str | ProviderConstraints,
 ) -> ProviderConstraints | None:
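One thing worth verifying here: lru_cache keys on argument hashes, so the str | ProviderConstraints parameter only accepts ProviderConstraints arguments if that dataclass is hashable (frozen=True or eq=False); a plain @dataclass with the default eq=True is unhashable and would raise TypeError when passed directly. String lookups, the common case, are unaffected; a toy illustration of the memoization:

    from functools import lru_cache

    @lru_cache(maxsize=32)
    def lookup(provider: str) -> str | None:
        # a stand-in for the constraints map lookup
        return {"gemini": "GEMINI_CONSTRAINTS"}.get(provider)

    lookup("gemini")
    lookup("gemini")
    print(lookup.cache_info().hits)  # 1 -- the second call was served from cache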
@@ -6,6 +6,7 @@ from dataclasses import dataclass, field
 import hashlib
 import logging
 
+from crewai.files.constants import UPLOAD_MAX_RETRIES, UPLOAD_RETRY_DELAY_BASE
 from crewai.files.content_types import FileInput
 from crewai.files.metrics import measure_operation
 from crewai.files.processing.constraints import (
@@ -29,9 +30,6 @@ from crewai.files.uploaders.base import FileUploader
 
 logger = logging.getLogger(__name__)
 
-UPLOAD_MAX_RETRIES = 3
-UPLOAD_RETRY_DELAY_BASE = 2
-
 
 @dataclass
 class FileContext:
@@ -15,15 +15,14 @@ from typing import TYPE_CHECKING, Any
 from aiocache import Cache  # type: ignore[import-untyped]
 from aiocache.serializers import PickleSerializer  # type: ignore[import-untyped]
 
+from crewai.files.constants import DEFAULT_MAX_CACHE_ENTRIES, DEFAULT_TTL_SECONDS
+
 
 if TYPE_CHECKING:
     from crewai.files.content_types import FileInput
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_TTL_SECONDS = 24 * 60 * 60  # 24 hours
-DEFAULT_MAX_CACHE_ENTRIES = 1000
-
 
 @dataclass
 class CachedUpload:
@@ -1,84 +1,11 @@
 """File uploader implementations for provider File APIs."""
 
-from __future__ import annotations
-
-import logging
-from typing import Any
-
 from crewai.files.uploaders.base import FileUploader, UploadResult
+from crewai.files.uploaders.factory import get_uploader
 
-logger = logging.getLogger(__name__)
-
 __all__ = [
     "FileUploader",
     "UploadResult",
     "get_uploader",
 ]
-
-
-def get_uploader(provider: str, **kwargs: Any) -> FileUploader | None:
-    """Get a file uploader for a specific provider.
-
-    Args:
-        provider: Provider name (e.g., "gemini", "anthropic").
-        **kwargs: Additional arguments passed to the uploader constructor.
-
-    Returns:
-        FileUploader instance for the provider, or None if not supported.
-    """
-    provider_lower = provider.lower()
-
-    if "gemini" in provider_lower or "google" in provider_lower:
-        try:
-            from crewai.files.uploaders.gemini import GeminiFileUploader
-
-            return GeminiFileUploader(**kwargs)
-        except ImportError:
-            logger.warning(
-                "google-genai not installed. Install with: pip install google-genai"
-            )
-            return None
-
-    if "anthropic" in provider_lower or "claude" in provider_lower:
-        try:
-            from crewai.files.uploaders.anthropic import AnthropicFileUploader
-
-            return AnthropicFileUploader(**kwargs)
-        except ImportError:
-            logger.warning(
-                "anthropic not installed. Install with: pip install anthropic"
-            )
-            return None
-
-    if "openai" in provider_lower or "gpt" in provider_lower:
-        try:
-            from crewai.files.uploaders.openai import OpenAIFileUploader
-
-            return OpenAIFileUploader(**kwargs)
-        except ImportError:
-            logger.warning("openai not installed. Install with: pip install openai")
-            return None
-
-    if "bedrock" in provider_lower or "aws" in provider_lower:
-        import os
-
-        if (
-            not os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
-            and "bucket_name" not in kwargs
-        ):
-            logger.debug(
-                "Bedrock S3 uploader not configured. "
-                "Set CREWAI_BEDROCK_S3_BUCKET environment variable to enable."
-            )
-            return None
-        try:
-            from crewai.files.uploaders.bedrock import BedrockFileUploader
-
-            return BedrockFileUploader(**kwargs)
-        except ImportError:
-            logger.warning("boto3 not installed. Install with: pip install boto3")
-            return None
-
-    logger.debug(f"No file uploader available for provider: {provider}")
-    return None
@@ -8,6 +8,11 @@ import os
 from pathlib import Path
 from typing import Any
 
+from crewai.files.constants import (
+    MAX_CONCURRENCY,
+    MULTIPART_CHUNKSIZE,
+    MULTIPART_THRESHOLD,
+)
 from crewai.files.content_types import FileInput
 from crewai.files.file import FileBytes, FilePath
 from crewai.files.processing.exceptions import (
@@ -19,10 +24,6 @@ from crewai.files.uploaders.base import FileUploader, UploadResult
 
 logger = logging.getLogger(__name__)
 
-MULTIPART_THRESHOLD = 8 * 1024 * 1024
-MULTIPART_CHUNKSIZE = 8 * 1024 * 1024
-MAX_CONCURRENCY = 10
-
 
 def _classify_s3_error(e: Exception, filename: str | None) -> Exception:
     """Classify an S3 exception as transient or permanent upload error.
lib/crewai/src/crewai/files/uploaders/factory.py (new file, 161 lines added)
@@ -0,0 +1,161 @@
+"""Factory for creating file uploaders."""
+
+from __future__ import annotations
+
+import logging
+from typing import Literal, TypedDict, overload
+
+from typing_extensions import Unpack
+
+from crewai.files.uploaders.base import FileUploader
+
+
+logger = logging.getLogger(__name__)
+
+
+ProviderType = Literal[
+    "gemini", "google", "anthropic", "claude", "openai", "gpt", "bedrock", "aws"
+]
+UnknownProvider = str
+
+
+class AllOptions(TypedDict):
+    """Kwargs for uploader factory."""
+
+    api_key: str | None
+    chunk_size: int
+    bucket_name: str
+    bucket_owner: str
+    prefix: str
+    region: str
+
+
+@overload
+def get_uploader(provider: UnknownProvider, /) -> None:
+    """Get file uploader for unknown provider."""
+
+
+@overload
+def get_uploader(
+    provider: Literal["gemini", "google"],
+    *,
+    api_key: str | None = ...,
+) -> FileUploader:
+    """Get Gemini file uploader."""
+
+
+@overload
+def get_uploader(
+    provider: Literal["anthropic", "claude"],
+    *,
+    api_key: str | None = ...,
+) -> FileUploader:
+    """Get Anthropic file uploader."""
+
+
+@overload
+def get_uploader(
+    provider: Literal["openai", "gpt"],
+    *,
+    api_key: str | None = ...,
+    chunk_size: int = ...,
+) -> FileUploader | None:
+    """Get OpenAI file uploader."""
+
+
+@overload
+def get_uploader(
+    provider: Literal["bedrock", "aws"],
+    *,
+    bucket_name: str | None = ...,
+    bucket_owner: str | None = ...,
+    prefix: str = ...,
+    region: str | None = ...,
+) -> FileUploader | None:
+    """Get Bedrock file uploader."""
+
+
+@overload
+def get_uploader(
+    provider: ProviderType | UnknownProvider, **kwargs: Unpack[AllOptions]
+) -> FileUploader | None:
+    """Get any file uploader."""
+
+
+def get_uploader(provider, **kwargs):  # type: ignore[no-untyped-def]
+    """Get a file uploader for a specific provider.
+
+    Args:
+        provider: Provider name (e.g., "gemini", "anthropic").
+        **kwargs: Additional arguments passed to the uploader constructor.
+
+    Returns:
+        FileUploader instance for the provider, or None if not supported.
+    """
+    provider_lower = provider.lower()
+
+    if "gemini" in provider_lower or "google" in provider_lower:
+        try:
+            from crewai.files.uploaders.gemini import GeminiFileUploader
+
+            return GeminiFileUploader(api_key=kwargs.get("api_key"))
+        except ImportError:
+            logger.warning(
+                "google-genai not installed. Install with: pip install google-genai"
+            )
+            return None
+
+    if "anthropic" in provider_lower or "claude" in provider_lower:
+        try:
+            from crewai.files.uploaders.anthropic import AnthropicFileUploader
+
+            return AnthropicFileUploader(api_key=kwargs.get("api_key"))
+        except ImportError:
+            logger.warning(
+                "anthropic not installed. Install with: pip install anthropic"
+            )
+            return None
+
+    if "openai" in provider_lower or "gpt" in provider_lower:
+        try:
+            from crewai.files.uploaders.openai import OpenAIFileUploader
+
+            return OpenAIFileUploader(
+                api_key=kwargs.get("api_key"),
+                chunk_size=kwargs.get("chunk_size", 67_108_864),
+            )
+        except ImportError:
+            logger.warning("openai not installed. Install with: pip install openai")
+            return None
+
+    if "bedrock" in provider_lower or "aws" in provider_lower:
+        import os
+
+        if (
+            not os.environ.get("CREWAI_BEDROCK_S3_BUCKET")
+            and "bucket_name" not in kwargs
+        ):
+            logger.debug(
+                "Bedrock S3 uploader not configured. "
+                "Set CREWAI_BEDROCK_S3_BUCKET environment variable to enable."
+            )
+            return None
+        try:
+            from crewai.files.uploaders.bedrock import BedrockFileUploader
+
+            return BedrockFileUploader(
+                bucket_name=kwargs.get("bucket_name"),
+                bucket_owner=kwargs.get("bucket_owner"),
+                prefix=kwargs.get("prefix", "crewai-files"),
+                region=kwargs.get("region"),
+            )
+        except ImportError:
+            logger.warning("boto3 not installed. Install with: pip install boto3")
+            return None
+
+    logger.debug(f"No file uploader available for provider: {provider}")
+    return None
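A quick sketch of how the typed overloads above read at call sites; the inferred types in the comments are illustrative, and note that the bare-string UnknownProvider overload is declared first, so keyword-free calls resolve to it:

    from crewai.files.uploaders.factory import get_uploader

    gemini = get_uploader("gemini", api_key=None)        # Gemini overload: FileUploader
    s3 = get_uploader("bedrock", prefix="crewai-files")  # Bedrock overload: FileUploader | None
    unknown = get_uploader("mistral")                    # bare unknown string: None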
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import asyncio
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timezone
 import io
 import logging
 import os
@@ -12,6 +12,12 @@ import random
 import time
 from typing import Any
 
+from crewai.files.constants import (
+    BACKOFF_BASE_DELAY,
+    BACKOFF_JITTER_FACTOR,
+    BACKOFF_MAX_DELAY,
+    GEMINI_FILE_TTL,
+)
 from crewai.files.content_types import FileInput
 from crewai.files.file import FilePath
 from crewai.files.processing.exceptions import (
@@ -24,12 +30,6 @@ from crewai.files.uploaders.base import FileUploader, UploadResult
 
 logger = logging.getLogger(__name__)
 
-GEMINI_FILE_TTL = timedelta(hours=48)
-
-BACKOFF_BASE_DELAY = 1.0
-BACKOFF_MAX_DELAY = 30.0
-BACKOFF_JITTER_FACTOR = 0.1
-
 
 def _compute_backoff_delay(attempt: int) -> float:
     """Compute exponential backoff delay with jitter.
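The constants moved here feed _compute_backoff_delay; its body is not shown in this diff, so the following is only a plausible reconstruction of capped exponential backoff with proportional jitter, using the same names:

    import random

    from crewai.files.constants import (
        BACKOFF_BASE_DELAY,
        BACKOFF_JITTER_FACTOR,
        BACKOFF_MAX_DELAY,
    )

    def compute_backoff_delay(attempt: int) -> float:
        # assumed shape: 1.0s, 2.0s, 4.0s, ... capped at 30.0s, plus up to 10% jitter
        delay = min(BACKOFF_BASE_DELAY * (2 ** attempt), BACKOFF_MAX_DELAY)
        return delay + random.uniform(0.0, delay * BACKOFF_JITTER_FACTOR)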
@@ -8,6 +8,7 @@ import logging
 import os
 from typing import Any
 
+from crewai.files.constants import DEFAULT_UPLOAD_CHUNK_SIZE, FILES_API_MAX_SIZE
 from crewai.files.content_types import FileInput
 from crewai.files.file import FileBytes, FilePath, FileStream
 from crewai.files.processing.exceptions import (
@@ -20,9 +21,6 @@ from crewai.files.uploaders.base import FileUploader, UploadResult
 
 logger = logging.getLogger(__name__)
 
-FILES_API_MAX_SIZE = 512 * 1024 * 1024
-DEFAULT_UPLOAD_CHUNK_SIZE = 64 * 1024 * 1024
-
 
 def _get_file_size(file: FileInput) -> int | None:
     """Get file size without reading content if possible.
lib/crewai/src/crewai/files/utils.py (new file, 92 lines added)
@@ -0,0 +1,92 @@
+"""Utility functions for file handling."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from typing_extensions import TypeIs
+
+
+if TYPE_CHECKING:
+    from crewai.files.content_types import FileInput
+    from crewai.files.file import FileSource, FileSourceInput
+
+
+def is_file_source(v: object) -> TypeIs[FileSource]:
+    """Type guard to narrow input to FileSource."""
+    from crewai.files.file import FileBytes, FilePath, FileStream
+
+    return isinstance(v, (FilePath, FileBytes, FileStream))
+
+
+def wrap_file_source(source: FileSource) -> FileInput:
+    """Wrap a FileSource in the appropriate typed FileInput wrapper.
+
+    Args:
+        source: The file source to wrap.
+
+    Returns:
+        Typed FileInput wrapper based on content type.
+    """
+    from crewai.files.content_types import (
+        AudioFile,
+        ImageFile,
+        PDFFile,
+        TextFile,
+        VideoFile,
+    )
+
+    content_type = source.content_type
+
+    if content_type.startswith("image/"):
+        return ImageFile(source=source)
+    if content_type.startswith("audio/"):
+        return AudioFile(source=source)
+    if content_type.startswith("video/"):
+        return VideoFile(source=source)
+    if content_type == "application/pdf":
+        return PDFFile(source=source)
+    return TextFile(source=source)
+
+
+def normalize_input_files(
+    input_files: list[FileSourceInput | FileInput],
+) -> dict[str, FileInput]:
+    """Convert a list of file sources to a named dictionary of FileInputs.
+
+    Args:
+        input_files: List of file source inputs or File objects.
+
+    Returns:
+        Dictionary mapping names to FileInput wrappers.
+    """
+    from crewai.files.content_types import BaseFile
+    from crewai.files.file import FileBytes, FilePath, FileStream
+
+    result: dict[str, FileInput] = {}
+
+    for i, item in enumerate(input_files):
+        if isinstance(item, BaseFile):
+            name = item.filename or f"file_{i}"
+            if "." in name:
+                name = name.rsplit(".", 1)[0]
+            result[name] = item
+            continue
+
+        file_source: FilePath | FileBytes | FileStream
+        if isinstance(item, (FilePath, FileBytes, FileStream)):
+            file_source = item
+        elif isinstance(item, Path):
+            file_source = FilePath(path=item)
+        elif isinstance(item, str):
+            file_source = FilePath(path=Path(item))
+        elif isinstance(item, (bytes, memoryview)):
+            file_source = FileBytes(data=bytes(item))
+        else:
+            continue
+
+        name = file_source.filename or f"file_{i}"
+        result[name] = wrap_file_source(file_source)
+
+    return result
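A minimal usage sketch of the relocated helpers; "report.pdf" is a hypothetical path (FilePath validates that the file exists), and the raw bytes are sniffed as text:

    from crewai.files.utils import normalize_input_files

    files = normalize_input_files(["report.pdf", b"plain ASCII payload"])
    # Keys come from filenames with the extension stripped, or positional
    # fallbacks when no filename is known:
    #   {"report": PDFFile(...), "file_1": TextFile(...)}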