Adds RAG feature (#406)

* feat: initialize rag

* refactor: using cosine distance metric for chromadb

* feat: use RecursiveCharacterTextSplitter as chunker strategy

* feat: support chucker and loader per data_type

* feat: adding JSON loader

* feat: adding CSVLoader

* feat: adding loader for DOCX files

* feat: add loader for MDX files

* feat: add loader for XML files

* feat: add loader for parser Webpage

* feat: support to load files from an entire directory

* feat: support to auto-load the loaders for additional DataType

* feat: add chuckers for some specific data type

- Each chunker uses separators specific to its content type

* feat: prevent document duplication and centralize content management

- Implement document deduplication logic in RAG
  * Check for existing documents by source reference
  * Compare doc IDs to detect content changes
  * Automatically replace outdated content while preventing duplicates

- Centralize common functionality for better maintainability
  * Create SourceContent class to handle URLs, files, and text uniformly
  * Extract shared utilities (compute_sha256) to misc.py
  * Standardize doc ID generation across all loaders

- Improve RAG system architecture
  * All loaders now inherit consistent patterns from centralized BaseLoader
  * Better separation of concerns with dedicated content management classes
  * Standardized LoaderResult structure across all loader implementations

* chore: split text loaders file

* test: adding missing tests about RAG loaders

* refactor: QOL

* fix: add missing uv syntax on DOCXLoader
This commit is contained in:
Lucas Gomide
2025-08-19 19:30:35 -03:00
committed by GitHub
parent 1ce016df8b
commit dc039cfac8
31 changed files with 2595 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
from crewai_tools.rag.chunkers.default_chunker import DefaultChunker
from crewai_tools.rag.chunkers.text_chunker import TextChunker, DocxChunker, MdxChunker
from crewai_tools.rag.chunkers.structured_chunker import CsvChunker, JsonChunker, XmlChunker
__all__ = [
"BaseChunker",
"DefaultChunker",
"TextChunker",
"DocxChunker",
"MdxChunker",
"CsvChunker",
"JsonChunker",
"XmlChunker",
]

View File

@@ -0,0 +1,167 @@
from typing import List, Optional
import re
class RecursiveCharacterTextSplitter:
"""
A text splitter that recursively splits text based on a hierarchy of separators.
"""
def __init__(
self,
chunk_size: int = 4000,
chunk_overlap: int = 200,
separators: Optional[List[str]] = None,
keep_separator: bool = True,
):
"""
Initialize the RecursiveCharacterTextSplitter.
Args:
chunk_size: Maximum size of each chunk
chunk_overlap: Number of characters to overlap between chunks
separators: List of separators to use for splitting (in order of preference)
keep_separator: Whether to keep the separator in the split text
"""
if chunk_overlap >= chunk_size:
raise ValueError(f"Chunk overlap ({chunk_overlap}) cannot be >= chunk size ({chunk_size})")
self._chunk_size = chunk_size
self._chunk_overlap = chunk_overlap
self._keep_separator = keep_separator
self._separators = separators or [
"\n\n",
"\n",
" ",
"",
]
def split_text(self, text: str) -> List[str]:
return self._split_text(text, self._separators)
def _split_text(self, text: str, separators: List[str]) -> List[str]:
separator = separators[-1]
new_separators = []
for i, sep in enumerate(separators):
if sep == "":
separator = sep
break
if re.search(re.escape(sep), text):
separator = sep
new_separators = separators[i + 1:]
break
splits = self._split_text_with_separator(text, separator)
good_splits = []
for split in splits:
if len(split) < self._chunk_size:
good_splits.append(split)
else:
if new_separators:
other_info = self._split_text(split, new_separators)
good_splits.extend(other_info)
else:
good_splits.extend(self._split_by_characters(split))
return self._merge_splits(good_splits, separator)
def _split_text_with_separator(self, text: str, separator: str) -> List[str]:
if separator == "":
return list(text)
if self._keep_separator and separator in text:
parts = text.split(separator)
splits = []
for i, part in enumerate(parts):
if i == 0:
splits.append(part)
elif i == len(parts) - 1:
if part:
splits.append(separator + part)
else:
if part:
splits.append(separator + part)
else:
if splits:
splits[-1] += separator
return [s for s in splits if s]
else:
return text.split(separator)
def _split_by_characters(self, text: str) -> List[str]:
chunks = []
for i in range(0, len(text), self._chunk_size):
chunks.append(text[i:i + self._chunk_size])
return chunks
def _merge_splits(self, splits: List[str], separator: str) -> List[str]:
"""Merge splits into chunks with proper overlap."""
docs = []
current_doc = []
total = 0
for split in splits:
split_len = len(split)
if total + split_len > self._chunk_size and current_doc:
if separator == "":
doc = "".join(current_doc)
else:
doc = separator.join(current_doc)
if doc:
docs.append(doc)
# Handle overlap by keeping some of the previous content
while total > self._chunk_overlap and len(current_doc) > 1:
removed = current_doc.pop(0)
total -= len(removed)
if separator != "":
total -= len(separator)
current_doc.append(split)
total += split_len
if separator != "" and len(current_doc) > 1:
total += len(separator)
if current_doc:
if separator == "":
doc = "".join(current_doc)
else:
doc = separator.join(current_doc)
if doc:
docs.append(doc)
return docs
class BaseChunker:
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200, separators: Optional[List[str]] = None, keep_separator: bool = True):
"""
Initialize the Chunker
Args:
chunk_size: Maximum size of each chunk
chunk_overlap: Number of characters to overlap between chunks
separators: List of separators to use for splitting
keep_separator: Whether to keep separators in the chunks
"""
self._splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
keep_separator=keep_separator,
)
def chunk(self, text: str) -> List[str]:
if not text or not text.strip():
return []
return self._splitter.split_text(text)

View File

@@ -0,0 +1,6 @@
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
from typing import List, Optional
class DefaultChunker(BaseChunker):
def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 20, separators: Optional[List[str]] = None, keep_separator: bool = True):
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)

View File

@@ -0,0 +1,49 @@
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
from typing import List, Optional
class CsvChunker(BaseChunker):
def __init__(self, chunk_size: int = 1200, chunk_overlap: int = 100, separators: Optional[List[str]] = None, keep_separator: bool = True):
if separators is None:
separators = [
"\nRow ", # Row boundaries (from CSVLoader format)
"\n", # Line breaks
" | ", # Column separators
", ", # Comma separators
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
class JsonChunker(BaseChunker):
def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 200, separators: Optional[List[str]] = None, keep_separator: bool = True):
if separators is None:
separators = [
"\n\n", # Object/array boundaries
"\n", # Line breaks
"},", # Object endings
"],", # Array endings
", ", # Property separators
": ", # Key-value separators
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
class XmlChunker(BaseChunker):
def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
if separators is None:
separators = [
"\n\n", # Element boundaries
"\n", # Line breaks
">", # Tag endings
". ", # Sentence endings (for text content)
"! ", # Exclamation endings
"? ", # Question endings
", ", # Comma separators
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)

View File

@@ -0,0 +1,59 @@
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
from typing import List, Optional
class TextChunker(BaseChunker):
def __init__(self, chunk_size: int = 1500, chunk_overlap: int = 150, separators: Optional[List[str]] = None, keep_separator: bool = True):
if separators is None:
separators = [
"\n\n\n", # Multiple line breaks (sections)
"\n\n", # Paragraph breaks
"\n", # Line breaks
". ", # Sentence endings
"! ", # Exclamation endings
"? ", # Question endings
"; ", # Semicolon breaks
", ", # Comma breaks
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
class DocxChunker(BaseChunker):
def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
if separators is None:
separators = [
"\n\n\n", # Multiple line breaks (major sections)
"\n\n", # Paragraph breaks
"\n", # Line breaks
". ", # Sentence endings
"! ", # Exclamation endings
"? ", # Question endings
"; ", # Semicolon breaks
", ", # Comma breaks
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
class MdxChunker(BaseChunker):
def __init__(self, chunk_size: int = 3000, chunk_overlap: int = 300, separators: Optional[List[str]] = None, keep_separator: bool = True):
if separators is None:
separators = [
"\n## ", # H2 headers (major sections)
"\n### ", # H3 headers (subsections)
"\n#### ", # H4 headers (sub-subsections)
"\n\n", # Paragraph breaks
"\n```", # Code block boundaries
"\n", # Line breaks
". ", # Sentence endings
"! ", # Exclamation endings
"? ", # Question endings
"; ", # Semicolon breaks
", ", # Comma breaks
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)

View File

@@ -0,0 +1,20 @@
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
from typing import List, Optional
class WebsiteChunker(BaseChunker):
def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
if separators is None:
separators = [
"\n\n\n", # Major section breaks
"\n\n", # Paragraph breaks
"\n", # Line breaks
". ", # Sentence endings
"! ", # Exclamation endings
"? ", # Question endings
"; ", # Semicolon breaks
", ", # Comma breaks
" ", # Word breaks
"", # Character level
]
super().__init__(chunk_size, chunk_overlap, separators, keep_separator)