Files
crewAI/lib/crewai-files/src/crewai_files/processing/transformers.py

337 lines
9.7 KiB
Python

"""File transformation functions for resizing, optimizing, and chunking."""
from collections.abc import Iterator
import io
import logging
from crewai_files.core.sources import FileBytes
from crewai_files.core.types import ImageFile, PDFFile, TextFile
from crewai_files.processing.exceptions import ProcessingDependencyError
logger = logging.getLogger(__name__)
def resize_image(
    file: ImageFile,
    max_width: int,
    max_height: int,
    *,
    preserve_aspect_ratio: bool = True,
) -> ImageFile:
    """Resize an image to fit within the specified dimensions.

    Images that already fit inside the bounds are returned unchanged.
    Otherwise the image is downscaled with Lanczos resampling and re-encoded
    in the format Pillow detected (PNG when no format was detected).

    Args:
        file: The image file to resize.
        max_width: Maximum width in pixels. Must be at least 1.
        max_height: Maximum height in pixels. Must be at least 1.
        preserve_aspect_ratio: If True, scale both sides by the same factor so
            the image fits within the bounds. If False, each side is clamped
            to its bound independently.

    Returns:
        The original ``file`` when it already fits, otherwise a new ImageFile
        holding the resized image data under the same filename.

    Raises:
        ProcessingDependencyError: If Pillow is not installed.
        ValueError: If ``max_width`` or ``max_height`` is less than 1.
    """
    try:
        from PIL import Image
    except ImportError as e:
        raise ProcessingDependencyError(
            "Pillow is required for image resizing",
            dependency="Pillow",
            install_command="pip install Pillow",
        ) from e

    # Non-positive bounds can never be satisfied; fail with a clear message
    # instead of letting the resize call reject a zero/negative target size.
    if max_width < 1 or max_height < 1:
        raise ValueError(
            "max_width and max_height must be at least 1, "
            f"got {max_width}x{max_height}"
        )

    content = file.read()

    with Image.open(io.BytesIO(content)) as img:
        original_width, original_height = img.size

        if original_width <= max_width and original_height <= max_height:
            return file

        if preserve_aspect_ratio:
            scale_factor = min(
                max_width / original_width,
                max_height / original_height,
            )
            # int() truncates toward zero, so a very elongated image could end
            # up with a 0-pixel side; keep every side at least 1 pixel.
            new_width = max(1, int(original_width * scale_factor))
            new_height = max(1, int(original_height * scale_factor))
        else:
            new_width = min(original_width, max_width)
            new_height = min(original_height, max_height)

        resized_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

        output_format = img.format or "PNG"
        # These modes are converted to RGB before a JPEG save.
        if output_format.upper() == "JPEG" and resized_img.mode in ("RGBA", "LA", "P"):
            resized_img = resized_img.convert("RGB")

        output_buffer = io.BytesIO()
        resized_img.save(output_buffer, format=output_format)
        output_bytes = output_buffer.getvalue()

    logger.info(
        f"Resized image '{file.filename}' from {original_width}x{original_height} "
        f"to {new_width}x{new_height}"
    )

    return ImageFile(source=FileBytes(data=output_bytes, filename=file.filename))
def optimize_image(
    file: ImageFile,
    target_size_bytes: int,
    *,
    min_quality: int = 20,
    initial_quality: int = 85,
) -> ImageFile:
    """Optimize an image to fit within a target file size.

    Re-encodes the image as JPEG, starting at ``initial_quality`` and lowering
    the quality in steps of 5 until the encoded size fits the target or the
    next step would drop below ``min_quality``.

    Args:
        file: The image file to optimize.
        target_size_bytes: Target maximum file size in bytes.
        min_quality: Minimum quality to use (prevents excessive degradation).
        initial_quality: Starting quality for optimization.

    Returns:
        The original ``file`` when it already fits the target or when no
        quality level is permitted (``initial_quality < min_quality``).
        Otherwise a new ImageFile with the re-encoded JPEG data; if the target
        could not be reached, this is the result of the last attempt and may
        still exceed ``target_size_bytes``.

    Raises:
        ProcessingDependencyError: If Pillow is not installed.
    """
    try:
        from PIL import Image
    except ImportError as e:
        raise ProcessingDependencyError(
            "Pillow is required for image optimization",
            dependency="Pillow",
            install_command="pip install Pillow",
        ) from e

    content = file.read()
    current_size = len(content)

    if current_size <= target_size_bytes:
        return file

    if initial_quality < min_quality:
        # No encode attempt is allowed, so nothing would be re-encoded. Return
        # the file untouched rather than relabelling the original bytes as
        # a .jpg.
        logger.warning(
            f"Cannot optimize image '{file.filename}': initial_quality="
            f"{initial_quality} is below min_quality={min_quality}"
        )
        return file

    # The output is always JPEG, the only format this routine tunes by quality.
    output_format = "JPEG"

    with Image.open(io.BytesIO(content)) as img:
        # These modes are converted to RGB before the JPEG save.
        encodable = img.convert("RGB") if img.mode in ("RGBA", "LA", "P") else img

        output_bytes = content
        # After the loop, ``quality`` holds the level of the last encode that
        # was actually performed, so the log below reports the real value.
        for quality in range(initial_quality, min_quality - 1, -5):
            output_buffer = io.BytesIO()
            encodable.save(
                output_buffer, format=output_format, quality=quality, optimize=True
            )
            output_bytes = output_buffer.getvalue()
            if len(output_bytes) <= target_size_bytes:
                break
        else:
            logger.warning(
                f"Could not reduce image '{file.filename}' to {target_size_bytes} "
                f"bytes; last attempt produced {len(output_bytes)} bytes "
                f"at quality={quality}"
            )

    logger.info(
        f"Optimized image '{file.filename}' from {current_size} bytes to "
        f"{len(output_bytes)} bytes (quality={quality})"
    )

    # Keep the filename extension consistent with the JPEG payload.
    filename = file.filename
    if filename and not filename.lower().endswith((".jpg", ".jpeg")):
        filename = filename.rsplit(".", 1)[0] + ".jpg"

    return ImageFile(source=FileBytes(data=output_bytes, filename=filename))
def chunk_pdf(
    file: PDFFile,
    max_pages: int,
    *,
    overlap_pages: int = 0,
) -> Iterator[PDFFile]:
    """Split a PDF into chunks of maximum page count.

    Yields chunks one at a time. A PDF that already has at most ``max_pages``
    pages is yielded unchanged as the single result.

    Args:
        file: The PDF file to chunk.
        max_pages: Maximum pages per chunk. Must be at least 1.
        overlap_pages: Number of overlapping pages between chunks (for
            context). Values outside ``0..max_pages - 1`` are clamped into
            that range so every chunk advances by at least one page.

    Yields:
        PDFFile objects, one per chunk.

    Raises:
        ProcessingDependencyError: If pypdf is not installed.
        ValueError: If ``max_pages`` is less than 1.
    """
    if max_pages < 1:
        raise ValueError(f"max_pages must be at least 1, got {max_pages}")

    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError as e:
        raise ProcessingDependencyError(
            "pypdf is required for PDF chunking",
            dependency="pypdf",
            install_command="pip install pypdf",
        ) from e

    content = file.read()
    reader = PdfReader(io.BytesIO(content))
    total_pages = len(reader.pages)

    if total_pages <= max_pages:
        yield file
        return

    # An overlap >= max_pages would make the stride zero or negative (the loop
    # would never finish); a negative overlap would silently skip pages.
    overlap = min(max(overlap_pages, 0), max_pages - 1)
    if overlap != overlap_pages:
        logger.warning(
            f"overlap_pages={overlap_pages} is outside 0..{max_pages - 1}; "
            f"using {overlap}"
        )
    step = max_pages - overlap

    filename = file.filename or "document.pdf"
    base_filename = filename.rsplit(".", 1)[0]

    for chunk_num, start_page in enumerate(range(0, total_pages, step)):
        end_page = min(start_page + max_pages, total_pages)

        writer = PdfWriter()
        for page_num in range(start_page, end_page):
            writer.add_page(reader.pages[page_num])

        output_buffer = io.BytesIO()
        writer.write(output_buffer)
        output_bytes = output_buffer.getvalue()

        chunk_filename = f"{base_filename}_chunk_{chunk_num}.pdf"

        logger.info(
            f"Created PDF chunk '{chunk_filename}' with pages {start_page + 1}-{end_page}"
        )

        yield PDFFile(source=FileBytes(data=output_bytes, filename=chunk_filename))

        # Once a chunk reaches the last page, any further chunk would only
        # repeat pages already emitted as overlap.
        if end_page >= total_pages:
            break
def chunk_text(
    file: TextFile,
    max_chars: int,
    *,
    overlap_chars: int = 200,
    split_on_newlines: bool = True,
) -> Iterator[TextFile]:
    """Split a text file into chunks of maximum character count.

    Yields chunks one at a time. The content is decoded with undecodable bytes
    replaced; text that already fits in ``max_chars`` is yielded unchanged as
    the single result.

    Args:
        file: The text file to chunk.
        max_chars: Maximum characters per chunk. Must be at least 1.
        overlap_chars: Number of overlapping characters between chunks.
            Negative values are treated as 0 so no text is skipped.
        split_on_newlines: If True, prefer splitting at newline boundaries.
            A newline is only used when it lies in the second half of the
            chunk window, so chunks do not become too short.

    Yields:
        TextFile objects, one per chunk.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    # A non-positive limit would emit one empty chunk per character.
    if max_chars < 1:
        raise ValueError(f"max_chars must be at least 1, got {max_chars}")

    # A negative overlap would move the next start past the previous end and
    # drop the characters in between.
    overlap = max(overlap_chars, 0)

    content = file.read()
    text = content.decode(errors="replace")
    total_chars = len(text)

    if total_chars <= max_chars:
        yield file
        return

    filename = file.filename or "text.txt"
    base_filename = filename.rsplit(".", 1)[0]
    extension = filename.rsplit(".", 1)[-1] if "." in filename else "txt"

    chunk_num = 0
    start_pos = 0

    while start_pos < total_chars:
        end_pos = min(start_pos + max_chars, total_chars)

        if end_pos < total_chars and split_on_newlines:
            last_newline = text.rfind("\n", start_pos, end_pos)
            if last_newline > start_pos + max_chars // 2:
                # Cut just after the newline so it stays with this chunk.
                end_pos = last_newline + 1

        chunk_content = text[start_pos:end_pos]
        chunk_bytes = chunk_content.encode()

        chunk_filename = f"{base_filename}_chunk_{chunk_num}.{extension}"

        logger.info(
            f"Created text chunk '{chunk_filename}' with {len(chunk_content)} characters"
        )

        yield TextFile(source=FileBytes(data=chunk_bytes, filename=chunk_filename))

        if end_pos < total_chars:
            # Step back by the overlap but always advance at least one
            # character, so the loop terminates even for large overlaps.
            start_pos = max(start_pos + 1, end_pos - overlap)
        else:
            start_pos = total_chars
        chunk_num += 1
def get_image_dimensions(file: ImageFile) -> tuple[int, int] | None:
    """Read the pixel dimensions of an image file.

    Best-effort helper: a missing Pillow install or an image that cannot be
    opened is logged as a warning and reported as ``None`` instead of raising.
    Errors raised while reading the file itself are not caught here.

    Args:
        file: The image file to measure.

    Returns:
        Tuple of (width, height) in pixels, or None if dimensions cannot be
        determined.
    """
    try:
        from PIL import Image
    except ImportError:
        logger.warning("Pillow not installed - cannot get image dimensions")
        return None

    raw_image = file.read()

    dimensions: tuple[int, int] | None = None
    try:
        with Image.open(io.BytesIO(raw_image)) as picture:
            width, height = picture.size
        dimensions = (width, height)
    except Exception as e:
        logger.warning(f"Failed to get image dimensions: {e}")
    return dimensions
def get_pdf_page_count(file: PDFFile) -> int | None:
    """Count the pages of a PDF file.

    Best-effort helper: a missing pypdf install or a document that cannot be
    parsed is logged as a warning and reported as ``None`` instead of raising.
    Errors raised while reading the file itself are not caught here.

    Args:
        file: The PDF file to measure.

    Returns:
        Number of pages, or None if page count cannot be determined.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        logger.warning("pypdf not installed - cannot get PDF page count")
        return None

    raw_pdf = file.read()

    page_total: int | None = None
    try:
        page_total = len(PdfReader(io.BytesIO(raw_pdf)).pages)
    except Exception as e:
        logger.warning(f"Failed to get PDF page count: {e}")
    return page_total