mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-24 07:38:14 +00:00
Adds RAG feature (#406)
* feat: initialize rag * refactor: using cosine distance metric for chromadb * feat: use RecursiveCharacterTextSplitter as chunker strategy * feat: support chunker and loader per data_type * feat: adding JSON loader * feat: adding CSVLoader * feat: adding loader for DOCX files * feat: add loader for MDX files * feat: add loader for XML files * feat: add loader for parsing webpages * feat: support loading files from an entire directory * feat: support auto-loading the loaders for additional DataType * feat: add chunkers for some specific data types - Each chunker uses separators specific to its content type * feat: prevent document duplication and centralize content management - Implement document deduplication logic in RAG * Check for existing documents by source reference * Compare doc IDs to detect content changes * Automatically replace outdated content while preventing duplicates - Centralize common functionality for better maintainability * Create SourceContent class to handle URLs, files, and text uniformly * Extract shared utilities (compute_sha256) to misc.py * Standardize doc ID generation across all loaders - Improve RAG system architecture * All loaders now inherit consistent patterns from centralized BaseLoader * Better separation of concerns with dedicated content management classes * Standardized LoaderResult structure across all loader implementations * chore: split text loaders file * test: adding missing tests about RAG loaders * refactor: QOL * fix: add missing uv syntax on DOCXLoader
This commit is contained in:
176
tests/rag/test_mdx_loader.py
Normal file
176
tests/rag/test_mdx_loader.py
Normal file
@@ -0,0 +1,176 @@
|
||||
import os
|
||||
import tempfile
|
||||
import pytest
|
||||
from unittest.mock import patch, Mock
|
||||
|
||||
from crewai_tools.rag.loaders.mdx_loader import MDXLoader
|
||||
from crewai_tools.rag.base_loader import LoaderResult
|
||||
from crewai_tools.rag.source_content import SourceContent
|
||||
|
||||
|
||||
class TestMDXLoader:
    """Tests for ``MDXLoader`` covering file, URL and inline-text sources."""

    def _write_temp_mdx(self, content):
        """Write ``content`` to a new temporary ``.mdx`` file and return its path.

        The file is created with ``delete=False`` so it still exists after it
        is closed; the caller is responsible for removing it.
        """
        # The context manager guarantees the handle is closed even when
        # ``write`` raises, instead of relying on a manual ``close()`` call.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.mdx', delete=False) as f:
            f.write(content)
        return f.name

    def _load_from_file(self, content):
        """Load ``content`` through ``MDXLoader`` via a temporary file.

        Returns a ``(result, path)`` tuple. The temporary file is removed
        before this method returns, so ``path`` is only useful for comparing
        against the source recorded on the result.
        """
        path = self._write_temp_mdx(content)
        try:
            loader = MDXLoader()
            return loader.load(SourceContent(path)), path
        finally:
            # Always clean up, whether loading succeeded or raised.
            os.unlink(path)

    def test_load_basic_mdx_file(self):
        """Import/export lines and JSX tags are gone; markdown and inner text remain."""
        content = """
import Component from './Component'
export const meta = { title: 'Test' }

# Test MDX File

This is a **markdown** file with JSX.

<Component prop="value" />

Some more content.

<div className="container">
<p>Nested content</p>
</div>
"""
        result, path = self._load_from_file(content)

        assert isinstance(result, LoaderResult)
        assert all(tag not in result.content for tag in ["import", "export", "<Component", "<div", "</div>"])
        assert all(text in result.content for text in ["# Test MDX File", "markdown", "Some more content", "Nested content"])
        assert result.metadata["format"] == "mdx"
        assert result.source == path

    def test_mdx_multiple_imports_exports(self):
        """Several import and export statements are all absent from the result."""
        content = """
import React from 'react'
import { useState } from 'react'
import CustomComponent from './custom'

export default function Layout() { return null }
export const config = { test: true }

# Content

Regular markdown content here.
"""
        result, _ = self._load_from_file(content)
        assert "# Content" in result.content
        assert "Regular markdown content here." in result.content
        assert "import" not in result.content and "export" not in result.content

    def test_complex_jsx_cleanup(self):
        """Nested and attribute-bearing JSX tags are removed while their text is kept."""
        content = """
# MDX with Complex JSX

<div className="alert alert-info">
<strong>Info:</strong> This is important information.
<ul><li>Item 1</li><li>Item 2</li></ul>
</div>

Regular paragraph text.

<MyComponent prop1="value1">Nested content inside component</MyComponent>
"""
        result, _ = self._load_from_file(content)
        assert all(tag not in result.content for tag in ["<div", "<strong>", "<ul>", "<MyComponent"])
        assert all(text in result.content for text in ["Info:", "Item 1", "Regular paragraph text.", "Nested content inside component"])

    def test_whitespace_cleanup(self):
        """No triple newlines remain and the result has no leading/trailing padding."""
        content = """


# Title


Some content.


More content after multiple newlines.



Final content.
"""
        result, _ = self._load_from_file(content)
        assert result.content.count('\n\n\n') == 0
        assert result.content.startswith('# Title')
        assert result.content.endswith('Final content.')

    def test_only_jsx_content(self):
        """A document made only of JSX still yields the text inside the tags."""
        content = """
<div>
<h1>Only JSX content</h1>
<p>No markdown here</p>
</div>
"""
        result, _ = self._load_from_file(content)
        assert all(tag not in result.content for tag in ["<div>", "<h1>", "<p>"])
        assert "Only JSX content" in result.content
        assert "No markdown here" in result.content

    @patch('requests.get')
    def test_load_mdx_from_url(self, mock_get):
        """Content returned by a mocked ``requests.get`` is loaded and cleaned."""
        mock_get.return_value = Mock(text="# MDX from URL\n\nContent here.\n\n<Component />", raise_for_status=lambda: None)
        loader = MDXLoader()
        result = loader.load(SourceContent("https://example.com/content.mdx"))
        assert "# MDX from URL" in result.content
        assert "<Component />" not in result.content

    @patch('requests.get')
    def test_load_mdx_with_custom_headers(self, mock_get):
        """Headers passed to ``load`` reach ``requests.get`` as the ``headers`` kwarg."""
        mock_get.return_value = Mock(text="# Custom headers test", raise_for_status=lambda: None)
        loader = MDXLoader()
        loader.load(SourceContent("https://example.com"), headers={"Authorization": "Bearer token"})
        # call_args[1] is the keyword-argument dict of the most recent call.
        assert mock_get.call_args[1]['headers'] == {"Authorization": "Bearer token"}

    @patch('requests.get')
    def test_mdx_url_fetch_error(self, mock_get):
        """An exception from ``requests.get`` surfaces as ``ValueError``."""
        mock_get.side_effect = Exception("Network error")
        with pytest.raises(ValueError, match="Error fetching MDX from URL"):
            MDXLoader().load(SourceContent("https://example.com"))

    def test_load_inline_mdx_text(self):
        """Raw MDX text passed directly as the source is loaded and cleaned."""
        content = """# Inline MDX\n\nimport Something from 'somewhere'\n\nContent with <Component prop=\"value\" />.\n\nexport const meta = { title: 'Test' }"""
        loader = MDXLoader()
        result = loader.load(SourceContent(content))
        assert "# Inline MDX" in result.content
        # The inline component is removed, leaving the surrounding sentence.
        assert "Content with ." in result.content

    def test_empty_result_after_cleaning(self):
        """A document with only imports, exports and an empty tag cleans to nothing."""
        content = """
import Something from 'somewhere'
export const config = {}
<div></div>
"""
        result, _ = self._load_from_file(content)
        assert result.content.strip() == ""

    def test_edge_case_parsing(self):
        """Multi-line JSX bodies and mid-document import/export lines are handled."""
        content = """
# Title

<Component>
Multi-line
JSX content
</Component>

import { a, b } from 'module'

export { x, y }

Final text.
"""
        result, _ = self._load_from_file(content)
        assert "# Title" in result.content
        assert "JSX content" in result.content
        assert "Final text." in result.content
        assert all(phrase not in result.content for phrase in ["import {", "export {", "<Component>"])
|
||||
Reference in New Issue
Block a user