Files
crewAI/packages/tools/src/crewai_tools/rag/chunkers/structured_chunker.py
Greyson LaLonde c5c07331bb feat: merge latest changes from crewAI-tools main into packages/tools
- Merged upstream changes from crewAI-tools main branch
- Resolved conflicts due to monorepo structure (crewai_tools -> src/crewai_tools)
- Removed deprecated embedchain adapters
- Added new RAG loaders and crewai_rag_adapter
- Consolidated dependencies in pyproject.toml

Fixed critical linting issues:
- Added ClassVar annotations for mutable class attributes
- Added timeouts to requests calls (30s default)
- Fixed exception handling with proper 'from' clauses
- Added noqa comments for public API functions (backward compatibility)
- Updated ruff config to ignore expected patterns:
  - F401 in __init__ files (intentional re-exports)
  - S101 in test files (assertions are expected)
  - S607 for subprocess calls (uv/pip commands are safe)

Remaining issues are from upstream code and will be addressed in separate PRs.
2025-09-19 00:08:27 -04:00

67 lines
2.1 KiB
Python

from crewai_tools.rag.chunkers.base_chunker import BaseChunker
class CsvChunker(BaseChunker):
    """BaseChunker preconfigured with separators for CSV-derived text.

    The defaults are a 1200-character chunk size and a 100-character
    overlap. All four arguments are forwarded positionally to
    ``BaseChunker.__init__``; how they are interpreted is defined there.
    """

    def __init__(
        self,
        chunk_size: int = 1200,
        chunk_overlap: int = 100,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Initialise the chunker, filling in CSV separators when none are given.

        Args:
            chunk_size: Chunk size handed to the base class.
            chunk_overlap: Chunk overlap handed to the base class.
            separators: Separator strings to use. ``None`` selects the
                built-in CSV list below; any other value (including an
                empty list) is passed through untouched.
            keep_separator: Flag handed to the base class unchanged.
        """
        # The default list is ordered from the coarsest boundary to the
        # finest; presumably BaseChunker tries them in that order
        # (confirm against base_chunker.py).
        effective_separators = (
            separators
            if separators is not None
            else [
                "\nRow ",  # start of a new row (format attributed to CSVLoader)
                "\n",  # newline
                " | ",  # pipe between columns
                ", ",  # comma followed by a space
                " ",  # single space between words
                "",  # empty string: character-level fallback
            ]
        )
        super().__init__(
            chunk_size, chunk_overlap, effective_separators, keep_separator
        )
class JsonChunker(BaseChunker):
    """BaseChunker preconfigured with separators for JSON text.

    The defaults are a 2000-character chunk size and a 200-character
    overlap. All four arguments are forwarded positionally to
    ``BaseChunker.__init__``; how they are interpreted is defined there.
    """

    def __init__(
        self,
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Initialise the chunker, filling in JSON separators when none are given.

        Args:
            chunk_size: Chunk size handed to the base class.
            chunk_overlap: Chunk overlap handed to the base class.
            separators: Separator strings to use. ``None`` selects the
                built-in JSON list below; any other value (including an
                empty list) is passed through untouched.
            keep_separator: Flag handed to the base class unchanged.
        """
        # The default list is ordered from the coarsest boundary to the
        # finest; presumably BaseChunker tries them in that order
        # (confirm against base_chunker.py).
        effective_separators = (
            separators
            if separators is not None
            else [
                "\n\n",  # blank line between objects/arrays
                "\n",  # newline
                "},",  # end of an object followed by a sibling
                "],",  # end of an array followed by a sibling
                ", ",  # between properties or elements
                ": ",  # between a key and its value
                " ",  # single space between words
                "",  # empty string: character-level fallback
            ]
        )
        super().__init__(
            chunk_size, chunk_overlap, effective_separators, keep_separator
        )
class XmlChunker(BaseChunker):
    """BaseChunker preconfigured with separators for XML text.

    The defaults are a 2500-character chunk size and a 250-character
    overlap. All four arguments are forwarded positionally to
    ``BaseChunker.__init__``; how they are interpreted is defined there.
    """

    def __init__(
        self,
        chunk_size: int = 2500,
        chunk_overlap: int = 250,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Initialise the chunker, filling in XML separators when none are given.

        Args:
            chunk_size: Chunk size handed to the base class.
            chunk_overlap: Chunk overlap handed to the base class.
            separators: Separator strings to use. ``None`` selects the
                built-in XML list below; any other value (including an
                empty list) is passed through untouched.
            keep_separator: Flag handed to the base class unchanged.
        """
        # The default list is ordered from the coarsest boundary to the
        # finest; presumably BaseChunker tries them in that order
        # (confirm against base_chunker.py).
        effective_separators = (
            separators
            if separators is not None
            else [
                "\n\n",  # blank line between elements
                "\n",  # newline
                ">",  # closing bracket of a tag
                ". ",  # end of a sentence in text content
                "! ",  # end of an exclamation
                "? ",  # end of a question
                ", ",  # comma followed by a space
                " ",  # single space between words
                "",  # empty string: character-level fallback
            ]
        )
        super().__init__(
            chunk_size, chunk_overlap, effective_separators, keep_separator
        )