mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-03 00:02:36 +00:00
refactor: extract files module to standalone crewai-files package
This commit is contained in:
336
lib/crewai-files/src/crewai_files/processing/transformers.py
Normal file
336
lib/crewai-files/src/crewai_files/processing/transformers.py
Normal file
@@ -0,0 +1,336 @@
|
||||
"""File transformation functions for resizing, optimizing, and chunking."""
|
||||
|
||||
from collections.abc import Iterator
|
||||
import io
|
||||
import logging
|
||||
|
||||
from crewai_files.core.sources import FileBytes
|
||||
from crewai_files.core.types import ImageFile, PDFFile, TextFile
|
||||
from crewai_files.processing.exceptions import ProcessingDependencyError
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def resize_image(
|
||||
file: ImageFile,
|
||||
max_width: int,
|
||||
max_height: int,
|
||||
*,
|
||||
preserve_aspect_ratio: bool = True,
|
||||
) -> ImageFile:
|
||||
"""Resize an image to fit within the specified dimensions.
|
||||
|
||||
Args:
|
||||
file: The image file to resize.
|
||||
max_width: Maximum width in pixels.
|
||||
max_height: Maximum height in pixels.
|
||||
preserve_aspect_ratio: If True, maintain aspect ratio while fitting within bounds.
|
||||
|
||||
Returns:
|
||||
A new ImageFile with the resized image data.
|
||||
|
||||
Raises:
|
||||
ProcessingDependencyError: If Pillow is not installed.
|
||||
"""
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError as e:
|
||||
raise ProcessingDependencyError(
|
||||
"Pillow is required for image resizing",
|
||||
dependency="Pillow",
|
||||
install_command="pip install Pillow",
|
||||
) from e
|
||||
|
||||
content = file.read()
|
||||
|
||||
with Image.open(io.BytesIO(content)) as img:
|
||||
original_width, original_height = img.size
|
||||
|
||||
if original_width <= max_width and original_height <= max_height:
|
||||
return file
|
||||
|
||||
if preserve_aspect_ratio:
|
||||
width_ratio = max_width / original_width
|
||||
height_ratio = max_height / original_height
|
||||
scale_factor = min(width_ratio, height_ratio)
|
||||
|
||||
new_width = int(original_width * scale_factor)
|
||||
new_height = int(original_height * scale_factor)
|
||||
else:
|
||||
new_width = min(original_width, max_width)
|
||||
new_height = min(original_height, max_height)
|
||||
|
||||
resized_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
||||
|
||||
output_format = img.format or "PNG"
|
||||
if output_format.upper() == "JPEG":
|
||||
if resized_img.mode in ("RGBA", "LA", "P"):
|
||||
resized_img = resized_img.convert("RGB")
|
||||
|
||||
output_buffer = io.BytesIO()
|
||||
resized_img.save(output_buffer, format=output_format)
|
||||
output_bytes = output_buffer.getvalue()
|
||||
|
||||
logger.info(
|
||||
f"Resized image '{file.filename}' from {original_width}x{original_height} "
|
||||
f"to {new_width}x{new_height}"
|
||||
)
|
||||
|
||||
return ImageFile(source=FileBytes(data=output_bytes, filename=file.filename))
|
||||
|
||||
|
||||
def optimize_image(
|
||||
file: ImageFile,
|
||||
target_size_bytes: int,
|
||||
*,
|
||||
min_quality: int = 20,
|
||||
initial_quality: int = 85,
|
||||
) -> ImageFile:
|
||||
"""Optimize an image to fit within a target file size.
|
||||
|
||||
Uses iterative quality reduction to achieve target size.
|
||||
|
||||
Args:
|
||||
file: The image file to optimize.
|
||||
target_size_bytes: Target maximum file size in bytes.
|
||||
min_quality: Minimum quality to use (prevents excessive degradation).
|
||||
initial_quality: Starting quality for optimization.
|
||||
|
||||
Returns:
|
||||
A new ImageFile with the optimized image data.
|
||||
|
||||
Raises:
|
||||
ProcessingDependencyError: If Pillow is not installed.
|
||||
"""
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError as e:
|
||||
raise ProcessingDependencyError(
|
||||
"Pillow is required for image optimization",
|
||||
dependency="Pillow",
|
||||
install_command="pip install Pillow",
|
||||
) from e
|
||||
|
||||
content = file.read()
|
||||
current_size = len(content)
|
||||
|
||||
if current_size <= target_size_bytes:
|
||||
return file
|
||||
|
||||
with Image.open(io.BytesIO(content)) as img:
|
||||
if img.mode in ("RGBA", "LA", "P"):
|
||||
img = img.convert("RGB")
|
||||
output_format = "JPEG"
|
||||
else:
|
||||
output_format = img.format or "JPEG"
|
||||
if output_format.upper() not in ("JPEG", "JPG"):
|
||||
output_format = "JPEG"
|
||||
|
||||
quality = initial_quality
|
||||
output_bytes = content
|
||||
|
||||
while len(output_bytes) > target_size_bytes and quality >= min_quality:
|
||||
output_buffer = io.BytesIO()
|
||||
img.save(
|
||||
output_buffer, format=output_format, quality=quality, optimize=True
|
||||
)
|
||||
output_bytes = output_buffer.getvalue()
|
||||
|
||||
if len(output_bytes) > target_size_bytes:
|
||||
quality -= 5
|
||||
|
||||
logger.info(
|
||||
f"Optimized image '{file.filename}' from {current_size} bytes to "
|
||||
f"{len(output_bytes)} bytes (quality={quality})"
|
||||
)
|
||||
|
||||
filename = file.filename
|
||||
if (
|
||||
filename
|
||||
and output_format.upper() == "JPEG"
|
||||
and not filename.lower().endswith((".jpg", ".jpeg"))
|
||||
):
|
||||
filename = filename.rsplit(".", 1)[0] + ".jpg"
|
||||
|
||||
return ImageFile(source=FileBytes(data=output_bytes, filename=filename))
|
||||
|
||||
|
||||
def chunk_pdf(
|
||||
file: PDFFile,
|
||||
max_pages: int,
|
||||
*,
|
||||
overlap_pages: int = 0,
|
||||
) -> Iterator[PDFFile]:
|
||||
"""Split a PDF into chunks of maximum page count.
|
||||
|
||||
Yields chunks one at a time to minimize memory usage.
|
||||
|
||||
Args:
|
||||
file: The PDF file to chunk.
|
||||
max_pages: Maximum pages per chunk.
|
||||
overlap_pages: Number of overlapping pages between chunks (for context).
|
||||
|
||||
Yields:
|
||||
PDFFile objects, one per chunk.
|
||||
|
||||
Raises:
|
||||
ProcessingDependencyError: If pypdf is not installed.
|
||||
"""
|
||||
try:
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
except ImportError as e:
|
||||
raise ProcessingDependencyError(
|
||||
"pypdf is required for PDF chunking",
|
||||
dependency="pypdf",
|
||||
install_command="pip install pypdf",
|
||||
) from e
|
||||
|
||||
content = file.read()
|
||||
reader = PdfReader(io.BytesIO(content))
|
||||
total_pages = len(reader.pages)
|
||||
|
||||
if total_pages <= max_pages:
|
||||
yield file
|
||||
return
|
||||
|
||||
filename = file.filename or "document.pdf"
|
||||
base_filename = filename.rsplit(".", 1)[0]
|
||||
step = max_pages - overlap_pages
|
||||
|
||||
chunk_num = 0
|
||||
start_page = 0
|
||||
|
||||
while start_page < total_pages:
|
||||
end_page = min(start_page + max_pages, total_pages)
|
||||
|
||||
writer = PdfWriter()
|
||||
for page_num in range(start_page, end_page):
|
||||
writer.add_page(reader.pages[page_num])
|
||||
|
||||
output_buffer = io.BytesIO()
|
||||
writer.write(output_buffer)
|
||||
output_bytes = output_buffer.getvalue()
|
||||
|
||||
chunk_filename = f"{base_filename}_chunk_{chunk_num}.pdf"
|
||||
|
||||
logger.info(
|
||||
f"Created PDF chunk '{chunk_filename}' with pages {start_page + 1}-{end_page}"
|
||||
)
|
||||
|
||||
yield PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename))
|
||||
|
||||
start_page += step
|
||||
chunk_num += 1
|
||||
|
||||
|
||||
def chunk_text(
|
||||
file: TextFile,
|
||||
max_chars: int,
|
||||
*,
|
||||
overlap_chars: int = 200,
|
||||
split_on_newlines: bool = True,
|
||||
) -> Iterator[TextFile]:
|
||||
"""Split a text file into chunks of maximum character count.
|
||||
|
||||
Yields chunks one at a time to minimize memory usage.
|
||||
|
||||
Args:
|
||||
file: The text file to chunk.
|
||||
max_chars: Maximum characters per chunk.
|
||||
overlap_chars: Number of overlapping characters between chunks.
|
||||
split_on_newlines: If True, prefer splitting at newline boundaries.
|
||||
|
||||
Yields:
|
||||
TextFile objects, one per chunk.
|
||||
"""
|
||||
content = file.read()
|
||||
text = content.decode(errors="replace")
|
||||
total_chars = len(text)
|
||||
|
||||
if total_chars <= max_chars:
|
||||
yield file
|
||||
return
|
||||
|
||||
filename = file.filename or "text.txt"
|
||||
base_filename = filename.rsplit(".", 1)[0]
|
||||
extension = filename.rsplit(".", 1)[-1] if "." in filename else "txt"
|
||||
|
||||
chunk_num = 0
|
||||
start_pos = 0
|
||||
|
||||
while start_pos < total_chars:
|
||||
end_pos = min(start_pos + max_chars, total_chars)
|
||||
|
||||
if end_pos < total_chars and split_on_newlines:
|
||||
last_newline = text.rfind("\n", start_pos, end_pos)
|
||||
if last_newline > start_pos + max_chars // 2:
|
||||
end_pos = last_newline + 1
|
||||
|
||||
chunk_content = text[start_pos:end_pos]
|
||||
chunk_bytes = chunk_content.encode()
|
||||
|
||||
chunk_filename = f"{base_filename}_chunk_{chunk_num}.{extension}"
|
||||
|
||||
logger.info(
|
||||
f"Created text chunk '{chunk_filename}' with {len(chunk_content)} characters"
|
||||
)
|
||||
|
||||
yield TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename))
|
||||
|
||||
if end_pos < total_chars:
|
||||
start_pos = max(start_pos + 1, end_pos - overlap_chars)
|
||||
else:
|
||||
start_pos = total_chars
|
||||
chunk_num += 1
|
||||
|
||||
|
||||
def get_image_dimensions(file: ImageFile) -> tuple[int, int] | None:
|
||||
"""Get the dimensions of an image file.
|
||||
|
||||
Args:
|
||||
file: The image file to measure.
|
||||
|
||||
Returns:
|
||||
Tuple of (width, height) in pixels, or None if dimensions cannot be determined.
|
||||
"""
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
logger.warning("Pillow not installed - cannot get image dimensions")
|
||||
return None
|
||||
|
||||
content = file.read()
|
||||
|
||||
try:
|
||||
with Image.open(io.BytesIO(content)) as img:
|
||||
width, height = img.size
|
||||
return width, height
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to get image dimensions: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_pdf_page_count(file: PDFFile) -> int | None:
|
||||
"""Get the page count of a PDF file.
|
||||
|
||||
Args:
|
||||
file: The PDF file to measure.
|
||||
|
||||
Returns:
|
||||
Number of pages, or None if page count cannot be determined.
|
||||
"""
|
||||
try:
|
||||
from pypdf import PdfReader
|
||||
except ImportError:
|
||||
logger.warning("pypdf not installed - cannot get PDF page count")
|
||||
return None
|
||||
|
||||
content = file.read()
|
||||
|
||||
try:
|
||||
reader = PdfReader(io.BytesIO(content))
|
||||
return len(reader.pages)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to get PDF page count: {e}")
|
||||
return None
|
||||
Reference in New Issue
Block a user