mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-02 15:52:34 +00:00
feat: merge latest changes from crewAI-tools main into packages/tools
- Merged upstream changes from crewAI-tools main branch
- Resolved conflicts due to monorepo structure (crewai_tools -> src/crewai_tools)
- Removed deprecated embedchain adapters
- Added new RAG loaders and crewai_rag_adapter
- Consolidated dependencies in pyproject.toml

Fixed critical linting issues:
- Added ClassVar annotations for mutable class attributes
- Added timeouts to requests calls (30s default)
- Fixed exception handling with proper 'from' clauses
- Added noqa comments for public API functions (backward compatibility)
- Updated ruff config to ignore expected patterns:
  - F401 in __init__ files (intentional re-exports)
  - S101 in test files (assertions are expected)
  - S607 for subprocess calls (uv/pip commands are safe)

Remaining issues are from upstream code and will be addressed in separate PRs.
This commit is contained in:
@@ -1,15 +1,19 @@
|
||||
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
|
||||
from crewai_tools.rag.chunkers.default_chunker import DefaultChunker
|
||||
from crewai_tools.rag.chunkers.text_chunker import TextChunker, DocxChunker, MdxChunker
|
||||
from crewai_tools.rag.chunkers.structured_chunker import CsvChunker, JsonChunker, XmlChunker
|
||||
from crewai_tools.rag.chunkers.structured_chunker import (
|
||||
CsvChunker,
|
||||
JsonChunker,
|
||||
XmlChunker,
|
||||
)
|
||||
from crewai_tools.rag.chunkers.text_chunker import DocxChunker, MdxChunker, TextChunker
|
||||
|
||||
# Public names re-exported by the chunkers package, kept in alphabetical
# order with each name listed exactly once.
__all__ = [
    "BaseChunker",
    "CsvChunker",
    "DefaultChunker",
    "DocxChunker",
    "JsonChunker",
    "MdxChunker",
    "TextChunker",
    "XmlChunker",
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import List, Optional
|
||||
import re
|
||||
|
||||
|
||||
class RecursiveCharacterTextSplitter:
|
||||
"""
|
||||
A text splitter that recursively splits text based on a hierarchy of separators.
|
||||
@@ -10,7 +10,7 @@ class RecursiveCharacterTextSplitter:
|
||||
self,
|
||||
chunk_size: int = 4000,
|
||||
chunk_overlap: int = 200,
|
||||
separators: Optional[List[str]] = None,
|
||||
separators: list[str] | None = None,
|
||||
keep_separator: bool = True,
|
||||
):
|
||||
"""
|
||||
@@ -23,7 +23,9 @@ class RecursiveCharacterTextSplitter:
|
||||
keep_separator: Whether to keep the separator in the split text
|
||||
"""
|
||||
if chunk_overlap >= chunk_size:
|
||||
raise ValueError(f"Chunk overlap ({chunk_overlap}) cannot be >= chunk size ({chunk_size})")
|
||||
raise ValueError(
|
||||
f"Chunk overlap ({chunk_overlap}) cannot be >= chunk size ({chunk_size})"
|
||||
)
|
||||
|
||||
self._chunk_size = chunk_size
|
||||
self._chunk_overlap = chunk_overlap
|
||||
@@ -36,10 +38,10 @@ class RecursiveCharacterTextSplitter:
|
||||
"",
|
||||
]
|
||||
|
||||
def split_text(self, text: str) -> list[str]:
    """Split ``text`` into chunks using this splitter's separators.

    This is the public entry point; it hands ``text`` and the full
    ``self._separators`` list to ``self._split_text`` and returns whatever
    that call produces.

    Args:
        text: The text to split.

    Returns:
        The list of chunks returned by ``self._split_text``.
    """
    return self._split_text(text, self._separators)
|
||||
|
||||
def _split_text(self, text: str, separators: List[str]) -> List[str]:
|
||||
def _split_text(self, text: str, separators: list[str]) -> list[str]:
|
||||
separator = separators[-1]
|
||||
new_separators = []
|
||||
|
||||
@@ -49,7 +51,7 @@ class RecursiveCharacterTextSplitter:
|
||||
break
|
||||
if re.search(re.escape(sep), text):
|
||||
separator = sep
|
||||
new_separators = separators[i + 1:]
|
||||
new_separators = separators[i + 1 :]
|
||||
break
|
||||
|
||||
splits = self._split_text_with_separator(text, separator)
|
||||
@@ -68,7 +70,7 @@ class RecursiveCharacterTextSplitter:
|
||||
|
||||
return self._merge_splits(good_splits, separator)
|
||||
|
||||
def _split_text_with_separator(self, text: str, separator: str) -> List[str]:
|
||||
def _split_text_with_separator(self, text: str, separator: str) -> list[str]:
|
||||
if separator == "":
|
||||
return list(text)
|
||||
|
||||
@@ -90,16 +92,15 @@ class RecursiveCharacterTextSplitter:
|
||||
splits[-1] += separator
|
||||
|
||||
return [s for s in splits if s]
|
||||
else:
|
||||
return text.split(separator)
|
||||
return text.split(separator)
|
||||
|
||||
def _split_by_characters(self, text: str) -> list[str]:
    """Cut ``text`` into consecutive fixed-width slices.

    Each slice holds at most ``self._chunk_size`` characters; only the last
    slice may be shorter. No overlap is applied between slices.

    Args:
        text: The text to slice.

    Returns:
        The slices in order; an empty list when ``text`` is empty.
    """
    size = self._chunk_size
    return [text[start : start + size] for start in range(0, len(text), size)]
|
||||
|
||||
def _merge_splits(self, splits: List[str], separator: str) -> List[str]:
|
||||
def _merge_splits(self, splits: list[str], separator: str) -> list[str]:
|
||||
"""Merge splits into chunks with proper overlap."""
|
||||
docs = []
|
||||
current_doc = []
|
||||
@@ -112,7 +113,10 @@ class RecursiveCharacterTextSplitter:
|
||||
if separator == "":
|
||||
doc = "".join(current_doc)
|
||||
else:
|
||||
doc = separator.join(current_doc)
|
||||
if self._keep_separator and separator == " ":
|
||||
doc = "".join(current_doc)
|
||||
else:
|
||||
doc = separator.join(current_doc)
|
||||
|
||||
if doc:
|
||||
docs.append(doc)
|
||||
@@ -133,15 +137,25 @@ class RecursiveCharacterTextSplitter:
|
||||
if separator == "":
|
||||
doc = "".join(current_doc)
|
||||
else:
|
||||
doc = separator.join(current_doc)
|
||||
if self._keep_separator and separator == " ":
|
||||
doc = "".join(current_doc)
|
||||
else:
|
||||
doc = separator.join(current_doc)
|
||||
|
||||
if doc:
|
||||
docs.append(doc)
|
||||
|
||||
return docs
|
||||
|
||||
|
||||
class BaseChunker:
|
||||
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200, separators: Optional[List[str]] = None, keep_separator: bool = True):
|
||||
def __init__(
|
||||
self,
|
||||
chunk_size: int = 1000,
|
||||
chunk_overlap: int = 200,
|
||||
separators: list[str] | None = None,
|
||||
keep_separator: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize the Chunker
|
||||
|
||||
@@ -159,8 +173,7 @@ class BaseChunker:
|
||||
keep_separator=keep_separator,
|
||||
)
|
||||
|
||||
|
||||
def chunk(self, text: str) -> List[str]:
|
||||
def chunk(self, text: str) -> list[str]:
|
||||
if not text or not text.strip():
|
||||
return []
|
||||
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class DefaultChunker(BaseChunker):
    """Chunker that only changes the default sizes of ``BaseChunker``.

    It defines no separator list of its own: ``separators`` is forwarded to
    ``BaseChunker`` unchanged, including when it is ``None``.
    """

    def __init__(
        self,
        chunk_size: int = 2000,
        chunk_overlap: int = 20,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Forward all settings to ``BaseChunker``.

        Args:
            chunk_size: Maximum chunk size (presumably in characters; see
                ``BaseChunker`` to confirm).
            chunk_overlap: Overlap between neighbouring chunks, in the same
                unit as ``chunk_size``.
            separators: Optional custom separator list; passed through as-is.
            keep_separator: Whether separators are kept in the split text.
        """
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
|
||||
|
||||
@@ -1,49 +1,66 @@
|
||||
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class CsvChunker(BaseChunker):
    """Chunker with default separators aimed at CSV-derived text.

    The defaults are ordered from coarsest to finest: row markers first,
    then line breaks, column and comma delimiters, spaces, and finally the
    empty string (character level).
    """

    def __init__(
        self,
        chunk_size: int = 1200,
        chunk_overlap: int = 100,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Configure the chunker, filling in CSV separators when none are given.

        Args:
            chunk_size: Maximum chunk size forwarded to ``BaseChunker``.
            chunk_overlap: Overlap between neighbouring chunks.
            separators: Custom separator list; ``None`` selects the CSV
                defaults below. An explicitly passed list (even an empty one)
                is forwarded untouched.
            keep_separator: Whether separators are kept in the split text.
        """
        if separators is None:
            separators = [
                "\nRow ",  # Row boundaries (from CSVLoader format)
                "\n",  # Line breaks
                " | ",  # Column separators
                ", ",  # Comma separators
                " ",  # Word breaks
                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
|
||||
|
||||
|
||||
class JsonChunker(BaseChunker):
    """Chunker with default separators aimed at JSON text.

    The defaults are ordered from coarsest to finest: blank lines, line
    breaks, object and array endings, property and key-value delimiters,
    spaces, and finally the empty string (character level).
    """

    def __init__(
        self,
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Configure the chunker, filling in JSON separators when none are given.

        Args:
            chunk_size: Maximum chunk size forwarded to ``BaseChunker``.
            chunk_overlap: Overlap between neighbouring chunks.
            separators: Custom separator list; ``None`` selects the JSON
                defaults below. An explicitly passed list is forwarded
                untouched.
            keep_separator: Whether separators are kept in the split text.
        """
        if separators is None:
            separators = [
                "\n\n",  # Object/array boundaries
                "\n",  # Line breaks
                "},",  # Object endings
                "],",  # Array endings
                ", ",  # Property separators
                ": ",  # Key-value separators
                " ",  # Word breaks
                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
|
||||
|
||||
|
||||
class XmlChunker(BaseChunker):
    """Chunker with default separators aimed at XML text.

    The defaults are ordered from coarsest to finest: blank lines, line
    breaks, tag endings, sentence punctuation for text content, commas,
    spaces, and finally the empty string (character level).
    """

    def __init__(
        self,
        chunk_size: int = 2500,
        chunk_overlap: int = 250,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Configure the chunker, filling in XML separators when none are given.

        Args:
            chunk_size: Maximum chunk size forwarded to ``BaseChunker``.
            chunk_overlap: Overlap between neighbouring chunks.
            separators: Custom separator list; ``None`` selects the XML
                defaults below. An explicitly passed list is forwarded
                untouched.
            keep_separator: Whether separators are kept in the split text.
        """
        if separators is None:
            separators = [
                "\n\n",  # Element boundaries
                "\n",  # Line breaks
                ">",  # Tag endings
                ". ",  # Sentence endings (for text content)
                "! ",  # Exclamation endings
                "? ",  # Question endings
                ", ",  # Comma separators
                " ",  # Word breaks
                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
|
||||
|
||||
@@ -1,59 +1,76 @@
|
||||
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class TextChunker(BaseChunker):
    """Chunker with default separators aimed at plain prose.

    The defaults are ordered from coarsest to finest: section breaks,
    paragraphs, lines, sentence punctuation, semicolons, commas, spaces, and
    finally the empty string (character level).
    """

    def __init__(
        self,
        chunk_size: int = 1500,
        chunk_overlap: int = 150,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Configure the chunker, filling in prose separators when none are given.

        Args:
            chunk_size: Maximum chunk size forwarded to ``BaseChunker``.
            chunk_overlap: Overlap between neighbouring chunks.
            separators: Custom separator list; ``None`` selects the prose
                defaults below. An explicitly passed list is forwarded
                untouched.
            keep_separator: Whether separators are kept in the split text.
        """
        if separators is None:
            separators = [
                "\n\n\n",  # Multiple line breaks (sections)
                "\n\n",  # Paragraph breaks
                "\n",  # Line breaks
                ". ",  # Sentence endings
                "! ",  # Exclamation endings
                "? ",  # Question endings
                "; ",  # Semicolon breaks
                ", ",  # Comma breaks
                " ",  # Word breaks
                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
|
||||
|
||||
|
||||
class DocxChunker(BaseChunker):
    """Chunker with default separators aimed at text extracted from DOCX files.

    The default separator list is the same as the plain-prose one (section
    breaks down to character level); only the default sizes differ.
    """

    def __init__(
        self,
        chunk_size: int = 2500,
        chunk_overlap: int = 250,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Configure the chunker, filling in DOCX separators when none are given.

        Args:
            chunk_size: Maximum chunk size forwarded to ``BaseChunker``.
            chunk_overlap: Overlap between neighbouring chunks.
            separators: Custom separator list; ``None`` selects the defaults
                below. An explicitly passed list is forwarded untouched.
            keep_separator: Whether separators are kept in the split text.
        """
        if separators is None:
            separators = [
                "\n\n\n",  # Multiple line breaks (major sections)
                "\n\n",  # Paragraph breaks
                "\n",  # Line breaks
                ". ",  # Sentence endings
                "! ",  # Exclamation endings
                "? ",  # Question endings
                "; ",  # Semicolon breaks
                ", ",  # Comma breaks
                " ",  # Word breaks
                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
|
||||
|
||||
|
||||
class MdxChunker(BaseChunker):
    """Chunker with default separators aimed at Markdown/MDX text.

    The defaults put H2, H3 and H4 header markers first, followed by
    paragraph breaks, code-fence boundaries, line breaks, sentence
    punctuation, semicolons, commas, spaces, and finally the empty string
    (character level).
    """

    def __init__(
        self,
        chunk_size: int = 3000,
        chunk_overlap: int = 300,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Configure the chunker, filling in MDX separators when none are given.

        Args:
            chunk_size: Maximum chunk size forwarded to ``BaseChunker``.
            chunk_overlap: Overlap between neighbouring chunks.
            separators: Custom separator list; ``None`` selects the defaults
                below. An explicitly passed list is forwarded untouched.
            keep_separator: Whether separators are kept in the split text.
        """
        if separators is None:
            separators = [
                "\n## ",  # H2 headers (major sections)
                "\n### ",  # H3 headers (subsections)
                "\n#### ",  # H4 headers (sub-subsections)
                "\n\n",  # Paragraph breaks
                "\n```",  # Code block boundaries
                "\n",  # Line breaks
                ". ",  # Sentence endings
                "! ",  # Exclamation endings
                "? ",  # Question endings
                "; ",  # Semicolon breaks
                ", ",  # Comma breaks
                " ",  # Word breaks
                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
|
||||
|
||||
@@ -1,20 +1,25 @@
|
||||
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class WebsiteChunker(BaseChunker):
    """Chunker with default separators aimed at text scraped from web pages.

    The defaults are ordered from coarsest to finest: major section breaks,
    paragraphs, lines, sentence punctuation, semicolons, commas, spaces, and
    finally the empty string (character level).
    """

    def __init__(
        self,
        chunk_size: int = 2500,
        chunk_overlap: int = 250,
        separators: list[str] | None = None,
        keep_separator: bool = True,
    ):
        """Configure the chunker, filling in website separators when none are given.

        Args:
            chunk_size: Maximum chunk size forwarded to ``BaseChunker``.
            chunk_overlap: Overlap between neighbouring chunks.
            separators: Custom separator list; ``None`` selects the defaults
                below. An explicitly passed list is forwarded untouched.
            keep_separator: Whether separators are kept in the split text.
        """
        if separators is None:
            separators = [
                "\n\n\n",  # Major section breaks
                "\n\n",  # Paragraph breaks
                "\n",  # Line breaks
                ". ",  # Sentence endings
                "! ",  # Exclamation endings
                "? ",  # Question endings
                "; ",  # Semicolon breaks
                ", ",  # Comma breaks
                " ",  # Word breaks
                "",  # Character level
            ]
        super().__init__(chunk_size, chunk_overlap, separators, keep_separator)
|
||||
|
||||
Reference in New Issue
Block a user