Adds RAG feature (#406)

* feat: initialize rag * refactor: using cosine distance metric for chromadb * feat: use RecursiveCharacterTextSplitter as chunker strategy * feat: support chunker and loader per data_type * feat: adding JSON loader * feat: adding CSVLoader * feat: adding loader for DOCX files * feat: add loader for MDX files * feat: add loader for XML files * feat: add loader for parsing webpages * feat: support loading files from an entire directory * feat: support auto-loading the loaders for additional DataType * feat: add chunkers for some specific data types - Each chunker uses separators specific to its content type * feat: prevent document duplication and centralize content management - Implement document deduplication logic in RAG * Check for existing documents by source reference * Compare doc IDs to detect content changes * Automatically replace outdated content while preventing duplicates - Centralize common functionality for better maintainability * Create SourceContent class to handle URLs, files, and text uniformly * Extract shared utilities (compute_sha256) to misc.py * Standardize doc ID generation across all loaders - Improve RAG system architecture * All loaders now inherit consistent patterns from centralized BaseLoader * Better separation of concerns with dedicated content management classes * Standardized LoaderResult structure across all loader implementations * chore: split text loaders file * test: adding missing tests for RAG loaders * refactor: QOL * fix: add missing uv syntax on DOCXLoader
2026-05-03 00:02:36 +00:00 · 2025-08-19 19:30:35 -03:00
parent 1ce016df8b
commit dc039cfac8
31 changed files with 2595 additions and 0 deletions
--- a/src/crewai_tools/rag/chunkers/__init__.py
+++ b/src/crewai_tools/rag/chunkers/__init__.py
@@ -0,0 +1,15 @@
+from crewai_tools.rag.chunkers.base_chunker import BaseChunker
+from crewai_tools.rag.chunkers.default_chunker import DefaultChunker
+from crewai_tools.rag.chunkers.text_chunker import TextChunker, DocxChunker, MdxChunker
+from crewai_tools.rag.chunkers.structured_chunker import CsvChunker, JsonChunker, XmlChunker
+
+__all__ = [  # Names re-exported as the public API of the chunkers package.
+    "BaseChunker",
+    "DefaultChunker",
+    "TextChunker",
+    "DocxChunker",
+    "MdxChunker",
+    "CsvChunker",
+    "JsonChunker",
+    "XmlChunker",  # NOTE(review): WebsiteChunker (web_chunker.py) is not re-exported here; confirm this is intentional.
+]
--- a/src/crewai_tools/rag/chunkers/base_chunker.py
+++ b/src/crewai_tools/rag/chunkers/base_chunker.py
@@ -0,0 +1,167 @@
+from typing import List, Optional
+import re
+
+class RecursiveCharacterTextSplitter:
+    """
+    A text splitter that recursively splits text based on a hierarchy of separators.
+
+    The text is cut at the first separator it contains; pieces that are still too
+    large are split again with the remaining separators, and the resulting pieces
+    are merged back into chunks of at most ``chunk_size`` characters.
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = 4000,
+        chunk_overlap: int = 200,
+        separators: Optional[List[str]] = None,
+        keep_separator: bool = True,
+    ):
+        """
+        Initialize the RecursiveCharacterTextSplitter.
+
+        Args:
+            chunk_size: Maximum size of each chunk
+            chunk_overlap: Number of characters to overlap between chunks
+            separators: List of separators to use for splitting (in order of preference)
+            keep_separator: Whether to keep the separator in the split text
+
+        Raises:
+            ValueError: If chunk_overlap is greater than or equal to chunk_size
+        """
+        if chunk_overlap >= chunk_size:
+            raise ValueError(f"Chunk overlap ({chunk_overlap}) cannot be >= chunk size ({chunk_size})")
+
+        self._chunk_size = chunk_size
+        self._chunk_overlap = chunk_overlap
+        self._keep_separator = keep_separator
+        # A missing or empty list falls back to paragraph, line, word, then character.
+        self._separators = separators or ["\n\n", "\n", " ", ""]
+
+    def split_text(self, text: str) -> List[str]:
+        """Split ``text`` into chunks using the configured separators."""
+        return self._split_text(text, self._separators)
+
+    def _split_text(self, text: str, separators: List[str]) -> List[str]:
+        """Split on the first separator found in ``text``, recursing on oversized pieces."""
+        # Fall back to the last separator when none of them occurs in the text.
+        separator = separators[-1]
+        new_separators = []
+
+        for i, sep in enumerate(separators):
+            if sep == "":
+                separator = sep
+                break
+            if sep in text:
+                separator = sep
+                new_separators = separators[i + 1:]
+                break
+
+        good_splits = []
+        for split in self._split_text_with_separator(text, separator):
+            if len(split) < self._chunk_size:
+                good_splits.append(split)
+            elif new_separators:
+                good_splits.extend(self._split_text(split, new_separators))
+            else:
+                good_splits.extend(self._split_by_characters(split))
+
+        # Kept separators are already part of the pieces, so joining with the
+        # separator again would duplicate it (e.g. "a\n\nb" -> "a\n\n\n\nb").
+        join_separator = "" if self._keep_separator else separator
+        return self._merge_splits(good_splits, join_separator)
+
+    def _split_text_with_separator(self, text: str, separator: str) -> List[str]:
+        """Cut ``text`` at ``separator``; a kept separator is prefixed to the piece after it."""
+        if separator == "":
+            return list(text)
+
+        if not (self._keep_separator and separator in text):
+            return text.split(separator)
+
+        parts = text.split(separator)
+        last_index = len(parts) - 1
+        splits = [parts[0]]
+        for i, part in enumerate(parts[1:], start=1):
+            if part:
+                splits.append(separator + part)
+            elif i < last_index:
+                # Consecutive separators: attach the extra one to the previous piece.
+                splits[-1] += separator
+
+        return [s for s in splits if s]
+
+    def _split_by_characters(self, text: str) -> List[str]:
+        """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters."""
+        return [text[i:i + self._chunk_size] for i in range(0, len(text), self._chunk_size)]
+
+    def _merge_splits(self, splits: List[str], separator: str) -> List[str]:
+        """
+        Merge splits into chunks with proper overlap.
+
+        Args:
+            splits: Pieces to merge, in document order
+            separator: String inserted between pieces when they are joined
+
+        Returns:
+            The merged chunks; consecutive chunks may share trailing pieces of up to
+            ``chunk_overlap`` characters.
+        """
+        separator_len = len(separator)
+        docs = []
+        current_doc = []
+        total = 0  # Length of separator.join(current_doc)
+
+        for split in splits:
+            split_len = len(split)
+
+            if current_doc and total + split_len + separator_len > self._chunk_size:
+                doc = separator.join(current_doc)
+                if doc:
+                    docs.append(doc)
+
+                # Keep trailing pieces as overlap, but drop as many as needed for the
+                # next piece to fit; otherwise a chunk could exceed chunk_size and
+                # repeat the whole previous chunk.
+                while current_doc and (
+                    total > self._chunk_overlap
+                    or total + split_len + separator_len > self._chunk_size
+                ):
+                    removed = current_doc.pop(0)
+                    total -= len(removed)
+                    if current_doc:
+                        total -= separator_len
+
+            if current_doc:
+                total += separator_len
+            current_doc.append(split)
+            total += split_len
+
+        doc = separator.join(current_doc)
+        return docs + [doc] if doc else docs
+
+class BaseChunker:
+    """Common base for chunkers: wraps a configured RecursiveCharacterTextSplitter."""
+
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        separators: Optional[List[str]] = None,
+        keep_separator: bool = True,
+    ):
+        """
+        Create the splitter used by ``chunk``.
+
+        Args:
+            chunk_size: Maximum size of each chunk
+            chunk_overlap: Number of characters to overlap between chunks
+            separators: List of separators to use for splitting
+            keep_separator: Whether to keep separators in the chunks
+        """
+        self._splitter = RecursiveCharacterTextSplitter(chunk_size, chunk_overlap, separators, keep_separator)
+
+    def chunk(self, text: str) -> List[str]:
+        """Return the chunks of ``text``, or an empty list when it is empty or only whitespace."""
+        has_content = bool(text) and bool(text.strip())
+        return self._splitter.split_text(text) if has_content else []
--- a/src/crewai_tools/rag/chunkers/default_chunker.py
+++ b/src/crewai_tools/rag/chunkers/default_chunker.py
@@ -0,0 +1,6 @@
+from crewai_tools.rag.chunkers.base_chunker import BaseChunker
+from typing import List, Optional
+
+class DefaultChunker(BaseChunker):  # BaseChunker with its own defaults: 2000-character chunks, 20-character overlap.
+    def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 20, separators: Optional[List[str]] = None, keep_separator: bool = True):
+        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=separators, keep_separator=keep_separator)
--- a/src/crewai_tools/rag/chunkers/structured_chunker.py
+++ b/src/crewai_tools/rag/chunkers/structured_chunker.py
@@ -0,0 +1,49 @@
+from crewai_tools.rag.chunkers.base_chunker import BaseChunker
+from typing import List, Optional
+
+
+class CsvChunker(BaseChunker):
+    """Chunker for CSV-derived text; by default it splits on rows first, then on finer delimiters."""
+    def __init__(self, chunk_size: int = 1200, chunk_overlap: int = 100, separators: Optional[List[str]] = None, keep_separator: bool = True):
+        default_separators = [
+            "\nRow ",   # Start of a row (format produced by CSVLoader)
+            "\n",       # End of line
+            " | ",      # Between columns
+            ", ",       # After a comma
+            " ",        # Between words
+            "",         # Single characters
+        ]
+        super().__init__(chunk_size, chunk_overlap, default_separators if separators is None else separators, keep_separator)
+
+
+class JsonChunker(BaseChunker):
+    """Chunker for JSON text; by default it splits on blank lines, lines, then object/array/property punctuation."""
+    def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 200, separators: Optional[List[str]] = None, keep_separator: bool = True):
+        default_separators = [
+            "\n\n",     # Blank line between objects/arrays
+            "\n",       # End of line
+            "},",       # End of an object
+            "],",       # End of an array
+            ", ",       # Between properties
+            ": ",       # Between key and value
+            " ",        # Between words
+            "",         # Single characters
+        ]
+        super().__init__(chunk_size, chunk_overlap, default_separators if separators is None else separators, keep_separator)
+
+
+class XmlChunker(BaseChunker):
+    """Chunker for XML text; by default it splits on blank lines, lines, tag ends, then sentence punctuation."""
+    def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
+        default_separators = [
+            "\n\n",     # Blank line between elements
+            "\n",       # End of line
+            ">",        # End of a tag
+            ". ",       # End of a sentence (text content)
+            "! ",       # End of an exclamation
+            "? ",       # End of a question
+            ", ",       # After a comma
+            " ",        # Between words
+            "",         # Single characters
+        ]
+        super().__init__(chunk_size, chunk_overlap, default_separators if separators is None else separators, keep_separator)
--- a/src/crewai_tools/rag/chunkers/text_chunker.py
+++ b/src/crewai_tools/rag/chunkers/text_chunker.py
@@ -0,0 +1,59 @@
+from crewai_tools.rag.chunkers.base_chunker import BaseChunker
+from typing import List, Optional
+
+
+class TextChunker(BaseChunker):
+    """Chunker for plain text; by default it splits on sections, paragraphs, lines, then sentence punctuation."""
+    def __init__(self, chunk_size: int = 1500, chunk_overlap: int = 150, separators: Optional[List[str]] = None, keep_separator: bool = True):
+        default_separators = [
+            "\n\n\n",  # Several blank lines (section break)
+            "\n\n",    # Blank line (paragraph break)
+            "\n",      # End of line
+            ". ",      # End of a sentence
+            "! ",      # End of an exclamation
+            "? ",      # End of a question
+            "; ",      # After a semicolon
+            ", ",      # After a comma
+            " ",       # Between words
+            "",        # Single characters
+        ]
+        super().__init__(chunk_size, chunk_overlap, default_separators if separators is None else separators, keep_separator)
+
+
+class DocxChunker(BaseChunker):
+    """Chunker for DOCX-derived text; by default it splits on sections, paragraphs, lines, then sentence punctuation."""
+    def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
+        default_separators = [
+            "\n\n\n",  # Several blank lines (major section break)
+            "\n\n",    # Blank line (paragraph break)
+            "\n",      # End of line
+            ". ",      # End of a sentence
+            "! ",      # End of an exclamation
+            "? ",      # End of a question
+            "; ",      # After a semicolon
+            ", ",      # After a comma
+            " ",       # Between words
+            "",        # Single characters
+        ]
+        super().__init__(chunk_size, chunk_overlap, default_separators if separators is None else separators, keep_separator)
+
+
+class MdxChunker(BaseChunker):
+    """Chunker for MDX/Markdown text; by default it splits on H2-H4 headers, paragraphs and code fences first."""
+    def __init__(self, chunk_size: int = 3000, chunk_overlap: int = 300, separators: Optional[List[str]] = None, keep_separator: bool = True):
+        default_separators = [
+            "\n## ",   # H2 header (major section)
+            "\n### ",  # H3 header (subsection)
+            "\n#### ", # H4 header (sub-subsection)
+            "\n\n",    # Blank line (paragraph break)
+            "\n```",   # Code fence
+            "\n",      # End of line
+            ". ",      # End of a sentence
+            "! ",      # End of an exclamation
+            "? ",      # End of a question
+            "; ",      # After a semicolon
+            ", ",      # After a comma
+            " ",       # Between words
+            "",        # Single characters
+        ]
+        super().__init__(chunk_size, chunk_overlap, default_separators if separators is None else separators, keep_separator)
--- a/src/crewai_tools/rag/chunkers/web_chunker.py
+++ b/src/crewai_tools/rag/chunkers/web_chunker.py
@@ -0,0 +1,20 @@
+from crewai_tools.rag.chunkers.base_chunker import BaseChunker
+from typing import List, Optional
+
+
+class WebsiteChunker(BaseChunker):
+    """Chunker for website text; by default it splits on sections, paragraphs, lines, then sentence punctuation."""
+    def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
+        default_separators = [
+            "\n\n\n",  # Several blank lines (major section break)
+            "\n\n",    # Blank line (paragraph break)
+            "\n",      # End of line
+            ". ",      # End of a sentence
+            "! ",      # End of an exclamation
+            "? ",      # End of a question
+            "; ",      # After a semicolon
+            ", ",      # After a comma
+            " ",       # Between words
+            "",        # Single characters
+        ]
+        super().__init__(chunk_size, chunk_overlap, default_separators if separators is None else separators, keep_separator)