Files
crewAI/tests/rag/test_text_loaders.py
Lucas Gomide dc039cfac8 Adds RAG feature (#406)
* feat: initialize rag

* refactor: using cosine distance metric for chromadb
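As a sketch, selecting cosine distance for a Chroma collection typically goes through the collection's "hnsw:space" metadata key; the client setup and collection name below are illustrative, not the project's actual code:

import chromadb

client = chromadb.Client()
# "hnsw:space" selects the distance metric; "cosine" replaces the default "l2".
collection = client.create_collection(
    name="rag_documents",  # hypothetical collection name
    metadata={"hnsw:space": "cosine"},
)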

* feat: use RecursiveCharacterTextSplitter as chunker strategy

* feat: support chunker and loader per data_type
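One plausible shape for such a per-data-type dispatch, shown as a sketch only; the registry name and mapping are assumptions, with TextFileLoader borrowed from the tests further down:

from crewai_tools.rag.loaders.text_loader import TextFileLoader

# Hypothetical registry: each data type resolves to its loader (and, by the
# same pattern, its chunker).
LOADER_REGISTRY = {
    ".txt": TextFileLoader,
    ".md": TextFileLoader,
}

def loader_for(suffix: str):
    try:
        return LOADER_REGISTRY[suffix]()
    except KeyError:
        raise ValueError(f"No loader registered for {suffix!r}")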

* feat: adding JSON loader

* feat: adding CSVLoader

* feat: adding loader for DOCX files

* feat: add loader for MDX files

* feat: add loader for XML files

* feat: add loader for parsing webpages

* feat: support loading files from an entire directory

* feat: support auto-loading the loaders for additional DataTypes

* feat: add chunkers for some specific data types

- Each chunker uses separators specific to its content type, for example:
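A minimal sketch using langchain's RecursiveCharacterTextSplitter (named earlier in this log); the separator lists and chunk sizes are illustrative assumptions:

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Markdown-ish content: prefer breaking at headings, then paragraphs, then words.
markdown_splitter = RecursiveCharacterTextSplitter(
    separators=["\n## ", "\n### ", "\n\n", "\n", " "],
    chunk_size=1000,
    chunk_overlap=100,
)

# Plain text: fall back from paragraphs to sentences to words.
plain_text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " "],
    chunk_size=1000,
    chunk_overlap=100,
)

chunks = markdown_splitter.split_text("## Title\n\nSome body text.")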

* feat: prevent document duplication and centralize content management

- Implement document deduplication logic in RAG (see the sketch after this list)
  * Check for existing documents by source reference
  * Compare doc IDs to detect content changes
  * Automatically replace outdated content while preventing duplicates
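
A condensed sketch of that deduplication flow, written against a Chroma-style collection API; upsert_document and its arguments are illustrative names, not the actual implementation:

def upsert_document(collection, source_ref: str, doc_id: str, content: str) -> None:
    # Look up any existing documents recorded for this source reference.
    existing = collection.get(where={"source": source_ref})
    if existing["ids"]:
        if doc_id in existing["ids"]:
            # Same content hash: the document is unchanged, skip the duplicate.
            return
        # Content changed: drop the stale entries before re-adding.
        collection.delete(ids=existing["ids"])
    collection.add(ids=[doc_id], documents=[content], metadatas=[{"source": source_ref}])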

- Centralize common functionality for better maintainability (see the sketch after this list)
  * Create SourceContent class to handle URLs, files, and text uniformly
  * Extract shared utilities (compute_sha256) to misc.py
  * Standardize doc ID generation across all loaders
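
The tests below pin down the observable contract: raw text uses its content hash as both source and doc ID, while file loaders keep the path as the source and hash path plus content for the doc ID. A sketch of the shared helper, assuming this signature for misc.py's compute_sha256:

import hashlib

def compute_sha256(text: str) -> str:
    # Assumed signature for the helper the commit says was extracted to misc.py.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

# Doc ID generation as exercised by the tests below:
#   file sources: compute_sha256(path + content)
#   raw text:     compute_sha256(content), also reused as the source reference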

- Improve RAG system architecture
  * All loaders now inherit consistent patterns from centralized BaseLoader
  * Better separation of concerns with dedicated content management classes
  * Standardized LoaderResult structure across all loader implementations
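
From the assertions in the tests, a LoaderResult carries at least content, source, doc_id, and optional metadata. A minimal sketch of those shapes; the field defaults and the BaseLoader hook are assumptions, not the real class bodies:

from dataclasses import dataclass
from typing import Optional

@dataclass
class LoaderResult:
    content: str
    source: str
    doc_id: str
    metadata: Optional[dict] = None  # the tests accept either None or {}

class BaseLoader:
    # Concrete loaders (TextLoader, TextFileLoader, ...) implement load().
    def load(self, source_content) -> "LoaderResult":
        raise NotImplementedError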

* chore: split text loaders file

* test: adding missing tests for RAG loaders

* refactor: QOL

* fix: add missing uv syntax on DOCXLoader
2025-08-19 18:30:35 -04:00

161 lines
5.5 KiB
Python

import hashlib
import os
import tempfile

import pytest

from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.loaders.text_loader import TextFileLoader, TextLoader
from crewai_tools.rag.source_content import SourceContent

def write_temp_file(content, suffix=".txt", encoding="utf-8"):
    # delete=False so the file survives the context manager; callers clean up.
    with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False, encoding=encoding) as f:
        f.write(content)
    return f.name


def cleanup_temp_file(path):
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass

class TestTextFileLoader:
    def test_basic_text_file(self):
        content = "This is test content\nWith multiple lines\nAnd more text"
        path = write_temp_file(content)
        try:
            result = TextFileLoader().load(SourceContent(path))
            assert isinstance(result, LoaderResult)
            assert result.content == content
            assert result.source == path
            assert result.doc_id
            assert result.metadata in (None, {})
        finally:
            cleanup_temp_file(path)

    def test_empty_file(self):
        path = write_temp_file("")
        try:
            result = TextFileLoader().load(SourceContent(path))
            assert result.content == ""
        finally:
            cleanup_temp_file(path)

    def test_unicode_content(self):
        content = "Hello 世界 🌍 émojis 🎉 åäö"
        path = write_temp_file(content)
        try:
            result = TextFileLoader().load(SourceContent(path))
            assert content in result.content
        finally:
            cleanup_temp_file(path)

    def test_large_file(self):
        content = "\n".join(f"Line {i}" for i in range(100))
        path = write_temp_file(content)
        try:
            result = TextFileLoader().load(SourceContent(path))
            assert "Line 0" in result.content
            assert "Line 99" in result.content
            assert result.content.count("\n") == 99
        finally:
            cleanup_temp_file(path)

    def test_missing_file(self):
        with pytest.raises(FileNotFoundError):
            TextFileLoader().load(SourceContent("/nonexistent/path.txt"))

    def test_permission_denied(self):
        # chmod-based permission checks assume a non-root POSIX environment.
        path = write_temp_file("Some content")
        os.chmod(path, 0o000)
        try:
            with pytest.raises(PermissionError):
                TextFileLoader().load(SourceContent(path))
        finally:
            os.chmod(path, 0o644)
            cleanup_temp_file(path)

    def test_doc_id_consistency(self):
        content = "Consistent content"
        path = write_temp_file(content)
        try:
            loader = TextFileLoader()
            result1 = loader.load(SourceContent(path))
            result2 = loader.load(SourceContent(path))
            # File-based doc IDs hash the path together with the content.
            expected_id = hashlib.sha256((path + content).encode("utf-8")).hexdigest()
            assert result1.doc_id == result2.doc_id == expected_id
        finally:
            cleanup_temp_file(path)

    def test_various_extensions(self):
        content = "Same content"
        for ext in [".txt", ".md", ".log", ".json"]:
            path = write_temp_file(content, suffix=ext)
            try:
                result = TextFileLoader().load(SourceContent(path))
                assert result.content == content
            finally:
                cleanup_temp_file(path)

class TestTextLoader:
    def test_basic_text(self):
        content = "Raw text"
        result = TextLoader().load(SourceContent(content))
        # Raw text has no path, so its sha256 doubles as both source and doc ID.
        expected_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
        assert result.content == content
        assert result.source == expected_hash
        assert result.doc_id == expected_hash

    def test_multiline_text(self):
        content = "Line 1\nLine 2\nLine 3"
        result = TextLoader().load(SourceContent(content))
        assert "Line 2" in result.content

    def test_empty_text(self):
        result = TextLoader().load(SourceContent(""))
        assert result.content == ""
        assert result.source == hashlib.sha256("".encode("utf-8")).hexdigest()

    def test_unicode_text(self):
        content = "世界 🌍 émojis 🎉 åäö"
        result = TextLoader().load(SourceContent(content))
        assert content in result.content

    def test_special_characters(self):
        content = "!@#$$%^&*()_+-=~`{}[]\\|;:'\",.<>/?"
        result = TextLoader().load(SourceContent(content))
        assert result.content == content

    def test_doc_id_uniqueness(self):
        result1 = TextLoader().load(SourceContent("A"))
        result2 = TextLoader().load(SourceContent("B"))
        assert result1.doc_id != result2.doc_id

    def test_whitespace_text(self):
        content = " \n\t "
        result = TextLoader().load(SourceContent(content))
        assert result.content == content

    def test_long_text(self):
        content = "A" * 10000
        result = TextLoader().load(SourceContent(content))
        assert len(result.content) == 10000

class TestTextLoadersIntegration:
    def test_consistency_between_loaders(self):
        content = "Consistent content"
        text_result = TextLoader().load(SourceContent(content))
        file_path = write_temp_file(content)
        try:
            file_result = TextFileLoader().load(SourceContent(file_path))
            assert text_result.content == file_result.content
            # Same content, different provenance: source and doc_id must differ.
            assert text_result.source != file_result.source
            assert text_result.doc_id != file_result.doc_id
        finally:
            cleanup_temp_file(file_path)