Files
crewAI/lib/crewai-tools/src/crewai_tools/rag/chunkers/text_chunker.py
2025-10-02 11:06:38 -04:00

77 lines
2.6 KiB
Python

from crewai_tools.rag.chunkers.base_chunker import BaseChunker
class TextChunker(BaseChunker):
def __init__(
self,
chunk_size: int = 1500,
chunk_overlap: int = 150,
separators: list[str] | None = None,
keep_separator: bool = True,
):
if separators is None:
separators = [
"\n\n\n", # Multiple line breaks (sections)
"\n\n", # Paragraph breaks
"\n", # Line breaks
". ", # Sentence endings
"! ", # Exclamation endings
"? ", # Question endings
"; ", # Semicolon breaks
", ", # Comma breaks
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
class DocxChunker(BaseChunker):
def __init__(
self,
chunk_size: int = 2500,
chunk_overlap: int = 250,
separators: list[str] | None = None,
keep_separator: bool = True,
):
if separators is None:
separators = [
"\n\n\n", # Multiple line breaks (major sections)
"\n\n", # Paragraph breaks
"\n", # Line breaks
". ", # Sentence endings
"! ", # Exclamation endings
"? ", # Question endings
"; ", # Semicolon breaks
", ", # Comma breaks
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
class MdxChunker(BaseChunker):
def __init__(
self,
chunk_size: int = 3000,
chunk_overlap: int = 300,
separators: list[str] | None = None,
keep_separator: bool = True,
):
if separators is None:
separators = [
"\n## ", # H2 headers (major sections)
"\n### ", # H3 headers (subsections)
"\n#### ", # H4 headers (sub-subsections)
"\n\n", # Paragraph breaks
"\n```", # Code block boundaries
"\n", # Line breaks
". ", # Sentence endings
"! ", # Exclamation endings
"? ", # Question endings
"; ", # Semicolon breaks
", ", # Comma breaks
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)