feat: merge latest changes from crewAI-tools main into packages/tools

- Merged upstream changes from crewAI-tools main branch
- Resolved conflicts due to monorepo structure (crewai_tools -> src/crewai_tools)
- Removed deprecated embedchain adapters
- Added new RAG loaders and crewai_rag_adapter
- Consolidated dependencies in pyproject.toml

Fixed critical linting issues:
- Added ClassVar annotations for mutable class attributes
- Added timeouts to requests calls (30s default)
- Fixed exception handling with proper 'from' clauses
- Added noqa comments for public API functions (backward compatibility)
- Updated ruff config to ignore expected patterns:
  - F401 in __init__ files (intentional re-exports)
  - S101 in test files (assertions are expected)
  - S607 for subprocess calls (uv/pip commands are safe)

Remaining issues are from upstream code and will be addressed in separate PRs.
2026-05-02 15:52:34 +00:00 · 2025-09-19 00:08:27 -04:00
parent 78a68c677c c960f26601
commit c5c07331bb
156 changed files with 4530 additions and 2718 deletions
--- a/packages/tools/src/crewai_tools/rag/chunkers/__init__.py
+++ b/packages/tools/src/crewai_tools/rag/chunkers/__init__.py
@@ -1,15 +1,19 @@
 from crewai_tools.rag.chunkers.base_chunker import BaseChunker
 from crewai_tools.rag.chunkers.default_chunker import DefaultChunker
-from crewai_tools.rag.chunkers.text_chunker import TextChunker, DocxChunker, MdxChunker
-from crewai_tools.rag.chunkers.structured_chunker import CsvChunker, JsonChunker, XmlChunker
+from crewai_tools.rag.chunkers.structured_chunker import (
+    CsvChunker,
+    JsonChunker,
+    XmlChunker,
+)
+from crewai_tools.rag.chunkers.text_chunker import DocxChunker, MdxChunker, TextChunker

 __all__ = [
    "BaseChunker",
-    "DefaultChunker",
-    "TextChunker",
-    "DocxChunker",
-    "MdxChunker",
    "CsvChunker",
+    "DefaultChunker",
+    "DocxChunker",
    "JsonChunker",
+    "MdxChunker",
+    "TextChunker",
    "XmlChunker",
 ]
--- a/packages/tools/src/crewai_tools/rag/chunkers/base_chunker.py
+++ b/packages/tools/src/crewai_tools/rag/chunkers/base_chunker.py
@@ -1,6 +1,6 @@
-from typing import List, Optional
 import re

+
 class RecursiveCharacterTextSplitter:
    """
    A text splitter that recursively splits text based on a hierarchy of separators.
@@ -10,7 +10,7 @@ class RecursiveCharacterTextSplitter:
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
-        separators: Optional[List[str]] = None,
+        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """
@@ -23,7 +23,9 @@ class RecursiveCharacterTextSplitter:
            keep_separator: Whether to keep the separator in the split text
        """
        if chunk_overlap >= chunk_size:
-            raise ValueError(f"Chunk overlap ({chunk_overlap}) cannot be >= chunk size ({chunk_size})")
+            raise ValueError(
+                f"Chunk overlap ({chunk_overlap}) cannot be >= chunk size ({chunk_size})"
+            )

        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
@@ -36,10 +38,10 @@ class RecursiveCharacterTextSplitter:
            "",
        ]

-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
        return self._split_text(text, self._separators)

-    def _split_text(self, text: str, separators: List[str]) -> List[str]:
+    def _split_text(self, text: str, separators: list[str]) -> list[str]:
        separator = separators[-1]
        new_separators = []

@@ -49,7 +51,7 @@ class RecursiveCharacterTextSplitter:
                break
            if re.search(re.escape(sep), text):
                separator = sep
-                new_separators = separators[i + 1:]
+                new_separators = separators[i + 1 :]
                break

        splits = self._split_text_with_separator(text, separator)
@@ -68,7 +70,7 @@ class RecursiveCharacterTextSplitter:

        return self._merge_splits(good_splits, separator)

-    def _split_text_with_separator(self, text: str, separator: str) -> List[str]:
+    def _split_text_with_separator(self, text: str, separator: str) -> list[str]:
        if separator == "":
            return list(text)

@@ -90,16 +92,15 @@ class RecursiveCharacterTextSplitter:
                            splits[-1] += separator

            return [s for s in splits if s]
-        else:
-            return text.split(separator)
+        return text.split(separator)

-    def _split_by_characters(self, text: str) -> List[str]:
+    def _split_by_characters(self, text: str) -> list[str]:
        chunks = []
        for i in range(0, len(text), self._chunk_size):
-            chunks.append(text[i:i + self._chunk_size])
+            chunks.append(text[i : i + self._chunk_size])
        return chunks

-    def _merge_splits(self, splits: List[str], separator: str) -> List[str]:
+    def _merge_splits(self, splits: list[str], separator: str) -> list[str]:
        """Merge splits into chunks with proper overlap."""
        docs = []
        current_doc = []
@@ -112,7 +113,10 @@ class RecursiveCharacterTextSplitter:
                if separator == "":
                    doc = "".join(current_doc)
                else:
-                    doc = separator.join(current_doc)
+                    if self._keep_separator and separator == " ":
+                        doc = "".join(current_doc)
+                    else:
+                        doc = separator.join(current_doc)

                if doc:
                    docs.append(doc)
@@ -133,15 +137,25 @@ class RecursiveCharacterTextSplitter:
            if separator == "":
                doc = "".join(current_doc)
            else:
-                doc = separator.join(current_doc)
+                if self._keep_separator and separator == " ":
+                    doc = "".join(current_doc)
+                else:
+                    doc = separator.join(current_doc)

            if doc:
                docs.append(doc)

        return docs

+
 class BaseChunker:
-    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        """
        Initialize the Chunker

@@ -159,8 +173,7 @@ class BaseChunker:
            keep_separator=keep_separator,
        )

-
-    def chunk(self, text: str) -> List[str]:
+    def chunk(self, text: str) -> list[str]:
        if not text or not text.strip():
            return []

--- a/packages/tools/src/crewai_tools/rag/chunkers/default_chunker.py
+++ b/packages/tools/src/crewai_tools/rag/chunkers/default_chunker.py
@@ -1,6 +1,12 @@
 from crewai_tools.rag.chunkers.base_chunker import BaseChunker
-from typing import List, Optional
+

 class DefaultChunker(BaseChunker):
-    def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 20, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 2000,
+        chunk_overlap: int = 20,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
--- a/packages/tools/src/crewai_tools/rag/chunkers/structured_chunker.py
+++ b/packages/tools/src/crewai_tools/rag/chunkers/structured_chunker.py
@@ -1,49 +1,66 @@
 from crewai_tools.rag.chunkers.base_chunker import BaseChunker
-from typing import List, Optional


 class CsvChunker(BaseChunker):
-    def __init__(self, chunk_size: int = 1200, chunk_overlap: int = 100, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 1200,
+        chunk_overlap: int = 100,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        if separators is None:
            separators = [
-                "\nRow ",   # Row boundaries (from CSVLoader format)
-                "\n",       # Line breaks
-                " | ",      # Column separators
-                ", ",       # Comma separators
-                " ",        # Word breaks
-                "",         # Character level
+                "\nRow ",  # Row boundaries (from CSVLoader format)
+                "\n",  # Line breaks
+                " | ",  # Column separators
+                ", ",  # Comma separators
+                " ",  # Word breaks
+                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)


 class JsonChunker(BaseChunker):
-    def __init__(self, chunk_size: int = 2000, chunk_overlap: int = 200, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 2000,
+        chunk_overlap: int = 200,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        if separators is None:
            separators = [
-                "\n\n",     # Object/array boundaries
-                "\n",       # Line breaks
-                "},",       # Object endings
-                "],",       # Array endings
-                ", ",       # Property separators
-                ": ",       # Key-value separators
-                " ",        # Word breaks
-                "",         # Character level
+                "\n\n",  # Object/array boundaries
+                "\n",  # Line breaks
+                "},",  # Object endings
+                "],",  # Array endings
+                ", ",  # Property separators
+                ": ",  # Key-value separators
+                " ",  # Word breaks
+                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)


 class XmlChunker(BaseChunker):
-    def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 2500,
+        chunk_overlap: int = 250,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        if separators is None:
            separators = [
-                "\n\n",     # Element boundaries
-                "\n",       # Line breaks
-                ">",        # Tag endings
-                ". ",       # Sentence endings (for text content)
-                "! ",       # Exclamation endings
-                "? ",       # Question endings
-                ", ",       # Comma separators
-                " ",        # Word breaks
-                "",         # Character level
+                "\n\n",  # Element boundaries
+                "\n",  # Line breaks
+                ">",  # Tag endings
+                ". ",  # Sentence endings (for text content)
+                "! ",  # Exclamation endings
+                "? ",  # Question endings
+                ", ",  # Comma separators
+                " ",  # Word breaks
+                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
--- a/packages/tools/src/crewai_tools/rag/chunkers/text_chunker.py
+++ b/packages/tools/src/crewai_tools/rag/chunkers/text_chunker.py
@@ -1,59 +1,76 @@
 from crewai_tools.rag.chunkers.base_chunker import BaseChunker
-from typing import List, Optional


 class TextChunker(BaseChunker):
-    def __init__(self, chunk_size: int = 1500, chunk_overlap: int = 150, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 1500,
+        chunk_overlap: int = 150,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        if separators is None:
            separators = [
                "\n\n\n",  # Multiple line breaks (sections)
-                "\n\n",    # Paragraph breaks
-                "\n",      # Line breaks
-                ". ",      # Sentence endings
-                "! ",      # Exclamation endings
-                "? ",      # Question endings
-                "; ",      # Semicolon breaks
-                ", ",      # Comma breaks
-                " ",       # Word breaks
-                "",        # Character level
+                "\n\n",  # Paragraph breaks
+                "\n",  # Line breaks
+                ". ",  # Sentence endings
+                "! ",  # Exclamation endings
+                "? ",  # Question endings
+                "; ",  # Semicolon breaks
+                ", ",  # Comma breaks
+                " ",  # Word breaks
+                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)


 class DocxChunker(BaseChunker):
-    def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 2500,
+        chunk_overlap: int = 250,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        if separators is None:
            separators = [
                "\n\n\n",  # Multiple line breaks (major sections)
-                "\n\n",    # Paragraph breaks
-                "\n",      # Line breaks
-                ". ",      # Sentence endings
-                "! ",      # Exclamation endings
-                "? ",      # Question endings
-                "; ",      # Semicolon breaks
-                ", ",      # Comma breaks
-                " ",       # Word breaks
-                "",        # Character level
+                "\n\n",  # Paragraph breaks
+                "\n",  # Line breaks
+                ". ",  # Sentence endings
+                "! ",  # Exclamation endings
+                "? ",  # Question endings
+                "; ",  # Semicolon breaks
+                ", ",  # Comma breaks
+                " ",  # Word breaks
+                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)


 class MdxChunker(BaseChunker):
-    def __init__(self, chunk_size: int = 3000, chunk_overlap: int = 300, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 3000,
+        chunk_overlap: int = 300,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        if separators is None:
            separators = [
-                "\n## ",   # H2 headers (major sections)
+                "\n## ",  # H2 headers (major sections)
                "\n### ",  # H3 headers (subsections)
-                "\n#### ", # H4 headers (sub-subsections)
-                "\n\n",    # Paragraph breaks
-                "\n```",   # Code block boundaries
-                "\n",      # Line breaks
-                ". ",      # Sentence endings
-                "! ",      # Exclamation endings
-                "? ",      # Question endings
-                "; ",      # Semicolon breaks
-                ", ",      # Comma breaks
-                " ",       # Word breaks
-                "",        # Character level
+                "\n#### ",  # H4 headers (sub-subsections)
+                "\n\n",  # Paragraph breaks
+                "\n```",  # Code block boundaries
+                "\n",  # Line breaks
+                ". ",  # Sentence endings
+                "! ",  # Exclamation endings
+                "? ",  # Question endings
+                "; ",  # Semicolon breaks
+                ", ",  # Comma breaks
+                " ",  # Word breaks
+                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
--- a/packages/tools/src/crewai_tools/rag/chunkers/web_chunker.py
+++ b/packages/tools/src/crewai_tools/rag/chunkers/web_chunker.py
@@ -1,20 +1,25 @@
 from crewai_tools.rag.chunkers.base_chunker import BaseChunker
-from typing import List, Optional


 class WebsiteChunker(BaseChunker):
-    def __init__(self, chunk_size: int = 2500, chunk_overlap: int = 250, separators: Optional[List[str]] = None, keep_separator: bool = True):
+    def __init__(
+        self,
+        chunk_size: int = 2500,
+        chunk_overlap: int = 250,
+        separators: list[str] | None = None,
+        keep_separator: bool = True,
+    ):
        if separators is None:
            separators = [
                "\n\n\n",  # Major section breaks
-                "\n\n",    # Paragraph breaks
-                "\n",      # Line breaks
-                ". ",      # Sentence endings
-                "! ",      # Exclamation endings
-                "? ",      # Question endings
-                "; ",      # Semicolon breaks
-                ", ",      # Comma breaks
-                " ",       # Word breaks
-                "",        # Character level
+                "\n\n",  # Paragraph breaks
+                "\n",  # Line breaks
+                ". ",  # Sentence endings
+                "! ",  # Exclamation endings
+                "? ",  # Question endings
+                "; ",  # Semicolon breaks
+                ", ",  # Comma breaks
+                " ",  # Word breaks
+                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)