feat: add crewai-tools library to workspace

- Migrate crewai-tools as standalone package in lib/tools
- Configure UV workspace for monorepo structure
- Move assets to repository root
- Clean up duplicate README files
- Focus pre-commit hooks on lib/crewai/src only
This commit is contained in:
Greyson LaLonde
2025-09-26 15:05:41 -04:00
300 changed files with 31874 additions and 1 deletions

View File

View File

@@ -0,0 +1,130 @@
import os
import tempfile
import pytest
from unittest.mock import patch, Mock
from crewai_tools.rag.loaders.csv_loader import CSVLoader
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.source_content import SourceContent
@pytest.fixture
def temp_csv_file():
    """Provide a factory that writes CSV text to a temporary ``.csv`` file.

    The yielded callable takes the file content as a string and returns the
    path of the file it created. Every file created through the factory is
    removed after the requesting test finishes.
    """
    created_files = []

    def _create(content: str):
        # delete=False keeps the file on disk after the handle is closed so
        # the code under test can reopen it by path.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            # Register the path before writing so teardown still removes the
            # file if the write itself fails.
            created_files.append(f.name)
            f.write(content)
        return f.name

    yield _create

    for path in created_files:
        try:
            os.unlink(path)
        except FileNotFoundError:
            # A test may already have removed the file; keep cleaning the rest.
            pass
class TestCSVLoader:
    """Exercise ``CSVLoader`` against file, inline-text and mocked URL sources."""

    def test_load_csv_from_file(self, temp_csv_file):
        """A well-formed file yields header/row lines, metadata and a doc id."""
        csv_path = temp_csv_file("name,age,city\nJohn,25,New York\nJane,30,Chicago")

        loaded = CSVLoader().load(SourceContent(csv_path))

        assert isinstance(loaded, LoaderResult)
        for expected_line in (
            "Headers: name | age | city",
            "Row 1: name: John | age: 25 | city: New York",
            "Row 2: name: Jane | age: 30 | city: Chicago",
        ):
            assert expected_line in loaded.content
        expected_metadata = {
            "format": "csv",
            "columns": ["name", "age", "city"],
            "rows": 2,
        }
        assert loaded.metadata == expected_metadata
        assert loaded.source == csv_path
        assert loaded.doc_id

    def test_load_csv_with_empty_values(self, temp_csv_file):
        """Empty cells are left out of the rendered row lines."""
        csv_path = temp_csv_file("name,age,city\nJohn,,New York\n,30,")

        loaded = CSVLoader().load(SourceContent(csv_path))

        assert "Row 1: name: John | city: New York" in loaded.content
        assert "Row 2: age: 30" in loaded.content
        assert loaded.metadata["rows"] == 2

    def test_load_csv_malformed(self, temp_csv_file):
        """An unbalanced quote is still loaded and rendered as a row."""
        csv_path = temp_csv_file('invalid,csv\nunclosed quote "missing')

        loaded = CSVLoader().load(SourceContent(csv_path))

        assert "Headers: invalid | csv" in loaded.content
        assert 'Row 1: invalid: unclosed quote "missing' in loaded.content
        assert loaded.metadata["columns"] == ["invalid", "csv"]

    def test_load_csv_empty_file(self, temp_csv_file):
        """An empty file produces empty content and a row count of zero."""
        csv_path = temp_csv_file("")

        loaded = CSVLoader().load(SourceContent(csv_path))

        assert loaded.content == ""
        assert loaded.metadata["rows"] == 0

    def test_load_csv_text_input(self):
        """CSV passed directly as a string is loaded like file content."""
        inline_csv = "col1,col2\nvalue1,value2\nvalue3,value4"

        loaded = CSVLoader().load(SourceContent(inline_csv))

        for expected_line in (
            "Headers: col1 | col2",
            "Row 1: col1: value1 | col2: value2",
            "Row 2: col1: value3 | col2: value4",
        ):
            assert expected_line in loaded.content
        assert loaded.metadata["columns"] == ["col1", "col2"]
        assert loaded.metadata["rows"] == 2

    def test_doc_id_is_deterministic(self, temp_csv_file):
        """Loading the same file twice gives the same doc id."""
        csv_path = temp_csv_file("name,value\ntest,123")
        csv_loader = CSVLoader()

        first = csv_loader.load(SourceContent(csv_path))
        second = csv_loader.load(SourceContent(csv_path))

        assert first.doc_id == second.doc_id

    @patch("requests.get")
    def test_load_csv_from_url(self, mock_get):
        """URL sources go through ``requests.get`` with CSV-specific default headers."""
        fake_response = Mock(
            text="name,value\ntest,123",
            raise_for_status=Mock(return_value=None),
        )
        mock_get.return_value = fake_response

        loaded = CSVLoader().load(SourceContent("https://example.com/data.csv"))

        assert "Headers: name | value" in loaded.content
        assert "Row 1: name: test | value: 123" in loaded.content
        # call_args[1] is the keyword-argument dict of the recorded call.
        sent_headers = mock_get.call_args[1]["headers"]
        assert "text/csv" in sent_headers["Accept"]
        assert "crewai-tools CSVLoader" in sent_headers["User-Agent"]

    @patch("requests.get")
    def test_load_csv_with_custom_headers(self, mock_get):
        """Caller-supplied headers are forwarded to ``requests.get`` unchanged."""
        fake_response = Mock(
            text="data,value\ntest,456",
            raise_for_status=Mock(return_value=None),
        )
        mock_get.return_value = fake_response
        custom_headers = {"Authorization": "Bearer token", "Custom-Header": "value"}

        loaded = CSVLoader().load(
            SourceContent("https://example.com/data.csv"), headers=custom_headers
        )

        assert "Headers: data | value" in loaded.content
        assert mock_get.call_args[1]["headers"] == custom_headers

    @patch("requests.get")
    def test_csv_loader_handles_network_errors(self, mock_get):
        """An exception raised by ``requests.get`` surfaces as ``ValueError``."""
        mock_get.side_effect = Exception("Network error")
        csv_loader = CSVLoader()

        with pytest.raises(ValueError, match="Error fetching CSV from URL"):
            csv_loader.load(SourceContent("https://example.com/data.csv"))

    @patch("requests.get")
    def test_csv_loader_handles_http_error(self, mock_get):
        """A failing ``raise_for_status`` surfaces as ``ValueError``."""
        failing_response = Mock()
        failing_response.raise_for_status.side_effect = Exception("404 Not Found")
        mock_get.return_value = failing_response
        csv_loader = CSVLoader()

        with pytest.raises(ValueError, match="Error fetching CSV from URL"):
            csv_loader.load(SourceContent("https://example.com/notfound.csv"))

View File

@@ -0,0 +1,149 @@
import os
import tempfile
import pytest
from crewai_tools.rag.loaders.directory_loader import DirectoryLoader
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.source_content import SourceContent
@pytest.fixture
def temp_directory():
    """Yield the path of a fresh temporary directory, removed after the test."""
    scratch = tempfile.TemporaryDirectory()
    try:
        yield scratch.name
    finally:
        scratch.cleanup()
class TestDirectoryLoader:
    """Exercise ``DirectoryLoader`` against temporary directory trees."""

    def _create_file(self, directory, filename, content="test content"):
        """Write ``content`` to ``directory/filename`` and return the full path."""
        path = os.path.join(directory, filename)
        with open(path, "w") as f:
            f.write(content)
        return path

    def test_load_non_recursive(self, temp_directory):
        """With ``recursive=False`` only top-level files are reported."""
        self._create_file(temp_directory, "file1.txt")
        self._create_file(temp_directory, "file2.txt")
        subdir = os.path.join(temp_directory, "subdir")
        os.makedirs(subdir)
        self._create_file(subdir, "file3.txt")

        loader = DirectoryLoader()
        result = loader.load(SourceContent(temp_directory), recursive=False)

        assert isinstance(result, LoaderResult)
        assert "file1.txt" in result.content
        assert "file2.txt" in result.content
        assert "file3.txt" not in result.content
        assert result.metadata["total_files"] == 2

    def test_load_recursive(self, temp_directory):
        """With ``recursive=True`` files in nested directories are reported too."""
        self._create_file(temp_directory, "file1.txt")
        nested = os.path.join(temp_directory, "subdir", "nested")
        os.makedirs(nested)
        self._create_file(os.path.join(temp_directory, "subdir"), "file2.txt")
        self._create_file(nested, "file3.txt")

        loader = DirectoryLoader()
        result = loader.load(SourceContent(temp_directory), recursive=True)

        assert all(f"file{i}.txt" in result.content for i in range(1, 4))

    def test_include_and_exclude_extensions(self, temp_directory):
        """``include_extensions`` and ``exclude_extensions`` filter by suffix."""
        self._create_file(temp_directory, "a.txt")
        self._create_file(temp_directory, "b.py")
        self._create_file(temp_directory, "c.md")

        loader = DirectoryLoader()
        result = loader.load(SourceContent(temp_directory), include_extensions=[".txt", ".py"])
        assert "a.txt" in result.content
        assert "b.py" in result.content
        assert "c.md" not in result.content

        result2 = loader.load(SourceContent(temp_directory), exclude_extensions=[".py", ".md"])
        assert "a.txt" in result2.content
        assert "b.py" not in result2.content
        assert "c.md" not in result2.content

    def test_max_files_limit(self, temp_directory):
        """``max_files`` caps the number of files reported."""
        for i in range(5):
            self._create_file(temp_directory, f"file{i}.txt")

        loader = DirectoryLoader()
        result = loader.load(SourceContent(temp_directory), max_files=3)

        assert result.metadata["total_files"] == 3
        # NOTE(review): this assumes the loader picks files in name order;
        # raw directory listing order is OS-dependent - confirm the loader sorts.
        assert all(f"file{i}.txt" in result.content for i in range(3))

    def test_hidden_files_and_dirs_excluded(self, temp_directory):
        """Dot-prefixed files and directories are left out of the result."""
        self._create_file(temp_directory, "visible.txt", "visible")
        self._create_file(temp_directory, ".hidden.txt", "hidden")
        hidden_dir = os.path.join(temp_directory, ".hidden")
        os.makedirs(hidden_dir)
        self._create_file(hidden_dir, "inside_hidden.txt")

        loader = DirectoryLoader()
        result = loader.load(SourceContent(temp_directory), recursive=True)

        assert "visible.txt" in result.content
        assert ".hidden.txt" not in result.content
        assert "inside_hidden.txt" not in result.content

    def test_directory_does_not_exist(self):
        """A missing path raises ``FileNotFoundError``."""
        loader = DirectoryLoader()
        with pytest.raises(FileNotFoundError, match="Directory does not exist"):
            loader.load(SourceContent("/path/does/not/exist"))

    def test_path_is_not_a_directory(self):
        """Pointing the loader at a regular file raises ``ValueError``."""
        with tempfile.NamedTemporaryFile() as f:
            loader = DirectoryLoader()
            with pytest.raises(ValueError, match="Path is not a directory"):
                loader.load(SourceContent(f.name))

    def test_url_not_supported(self):
        """URL sources are rejected with ``ValueError``."""
        loader = DirectoryLoader()
        with pytest.raises(ValueError, match="URL directory loading is not supported"):
            loader.load(SourceContent("https://example.com"))

    def test_processing_error_handling(self, temp_directory):
        """A failure on one file is recorded in metadata without aborting the load."""
        self._create_file(temp_directory, "valid.txt")
        self._create_file(temp_directory, "error.txt")

        loader = DirectoryLoader()
        original_method = loader._process_single_file

        def failing_for_error_file(file_path):
            # Match on the file name only: the randomly named temp directory
            # could itself contain "error", which would make every file fail.
            if "error" in os.path.basename(file_path):
                raise ValueError("Mock error")
            return original_method(file_path)

        loader._process_single_file = failing_for_error_file
        result = loader.load(SourceContent(temp_directory))

        assert "valid.txt" in result.content
        assert "error.txt (ERROR)" in result.content
        assert result.metadata["errors"] == 1
        assert len(result.metadata["error_details"]) == 1

    def test_metadata_structure(self, temp_directory):
        """Result metadata exposes the expected top-level and per-file keys."""
        self._create_file(temp_directory, "test.txt", "Sample")

        loader = DirectoryLoader()
        result = loader.load(SourceContent(temp_directory))
        metadata = result.metadata

        expected_keys = {
            "format", "directory_path", "total_files", "processed_files",
            "errors", "file_details", "error_details"
        }
        assert expected_keys.issubset(metadata)
        assert all(k in metadata["file_details"][0] for k in ("path", "metadata", "source"))

    def test_empty_directory(self, temp_directory):
        """An empty directory yields empty content and zero counts."""
        loader = DirectoryLoader()
        result = loader.load(SourceContent(temp_directory))

        assert result.content == ""
        assert result.metadata["total_files"] == 0
        assert result.metadata["processed_files"] == 0

View File

@@ -0,0 +1,135 @@
import tempfile
import pytest
from unittest.mock import patch, Mock
from crewai_tools.rag.loaders.docx_loader import DOCXLoader
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.source_content import SourceContent
class TestDOCXLoader:
    """Exercise ``DOCXLoader`` with ``docx.Document`` and ``requests.get`` mocked."""

    @patch('docx.Document')
    def test_load_docx_from_file(self, mock_docx_class):
        """Paragraph text is joined by newlines; whitespace-only paragraphs are dropped."""
        mock_doc = Mock()
        mock_doc.paragraphs = [
            Mock(text="First paragraph"),
            Mock(text="Second paragraph"),
            Mock(text=" ")  # Blank paragraph
        ]
        mock_doc.tables = []
        mock_docx_class.return_value = mock_doc

        with tempfile.NamedTemporaryFile(suffix='.docx') as f:
            loader = DOCXLoader()
            result = loader.load(SourceContent(f.name))

            assert isinstance(result, LoaderResult)
            assert result.content == "First paragraph\nSecond paragraph"
            assert result.metadata == {"format": "docx", "paragraphs": 3, "tables": 0}
            assert result.source == f.name

    @patch('docx.Document')
    def test_load_docx_with_tables(self, mock_docx_class):
        """The number of tables in the document is reported in metadata."""
        mock_doc = Mock()
        mock_doc.paragraphs = [Mock(text="Document with table")]
        mock_doc.tables = [Mock(), Mock()]
        mock_docx_class.return_value = mock_doc

        with tempfile.NamedTemporaryFile(suffix='.docx') as f:
            loader = DOCXLoader()
            result = loader.load(SourceContent(f.name))

            assert result.metadata["tables"] == 2

    @patch('requests.get')
    @patch('docx.Document')
    @patch('tempfile.NamedTemporaryFile')
    @patch('os.unlink')
    def test_load_docx_from_url(self, mock_unlink, mock_tempfile, mock_docx_class, mock_get):
        """A URL source is downloaded, written to a temp file and then parsed."""
        mock_get.return_value = Mock(content=b"fake docx content", raise_for_status=Mock())

        # ``Mock(name=...)`` only sets the mock's repr name, so the ``name``
        # attribute has to be assigned after construction for the fake temp
        # file to expose a real path string.
        mock_temp = Mock()
        mock_temp.name = "/tmp/temp_docx_file.docx"
        mock_temp.__enter__ = Mock(return_value=mock_temp)
        mock_temp.__exit__ = Mock(return_value=None)
        mock_tempfile.return_value = mock_temp

        mock_doc = Mock()
        mock_doc.paragraphs = [Mock(text="Content from URL")]
        mock_doc.tables = []
        mock_docx_class.return_value = mock_doc

        loader = DOCXLoader()
        result = loader.load(SourceContent("https://example.com/test.docx"))

        assert "Content from URL" in result.content
        assert result.source == "https://example.com/test.docx"

        headers = mock_get.call_args[1]['headers']
        assert "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in headers['Accept']
        assert "crewai-tools DOCXLoader" in headers['User-Agent']
        mock_temp.write.assert_called_once_with(b"fake docx content")

    @patch('requests.get')
    @patch('docx.Document')
    def test_load_docx_from_url_with_custom_headers(self, mock_docx_class, mock_get):
        """Caller-supplied headers are forwarded to ``requests.get`` unchanged."""
        mock_get.return_value = Mock(content=b"fake docx content", raise_for_status=Mock())
        mock_docx_class.return_value = Mock(paragraphs=[], tables=[])

        loader = DOCXLoader()
        custom_headers = {"Authorization": "Bearer token"}

        # Temp-file creation and removal are patched out so nothing touches disk.
        with patch('tempfile.NamedTemporaryFile'), patch('os.unlink'):
            loader.load(SourceContent("https://example.com/test.docx"), headers=custom_headers)

        assert mock_get.call_args[1]['headers'] == custom_headers

    @patch('requests.get')
    def test_load_docx_url_download_error(self, mock_get):
        """An exception raised by ``requests.get`` surfaces as ``ValueError``."""
        mock_get.side_effect = Exception("Network error")

        loader = DOCXLoader()
        with pytest.raises(ValueError, match="Error fetching DOCX from URL"):
            loader.load(SourceContent("https://example.com/test.docx"))

    @patch('requests.get')
    def test_load_docx_url_http_error(self, mock_get):
        """A failing ``raise_for_status`` surfaces as ``ValueError``."""
        mock_get.return_value = Mock(raise_for_status=Mock(side_effect=Exception("404 Not Found")))

        loader = DOCXLoader()
        with pytest.raises(ValueError, match="Error fetching DOCX from URL"):
            loader.load(SourceContent("https://example.com/notfound.docx"))

    def test_load_docx_invalid_source(self):
        """A string that is neither an existing file nor a URL is rejected."""
        loader = DOCXLoader()
        with pytest.raises(ValueError, match="Source must be a valid file path or URL"):
            loader.load(SourceContent("not_a_file_or_url"))

    @patch('docx.Document')
    def test_load_docx_parsing_error(self, mock_docx_class):
        """An exception from ``docx.Document`` surfaces as ``ValueError``."""
        mock_docx_class.side_effect = Exception("Invalid DOCX file")

        with tempfile.NamedTemporaryFile(suffix='.docx') as f:
            loader = DOCXLoader()
            with pytest.raises(ValueError, match="Error loading DOCX file"):
                loader.load(SourceContent(f.name))

    @patch('docx.Document')
    def test_load_docx_empty_document(self, mock_docx_class):
        """A document with no paragraphs or tables gives empty content and zero counts."""
        mock_docx_class.return_value = Mock(paragraphs=[], tables=[])

        with tempfile.NamedTemporaryFile(suffix='.docx') as f:
            loader = DOCXLoader()
            result = loader.load(SourceContent(f.name))

            assert result.content == ""
            assert result.metadata == {"paragraphs": 0, "tables": 0, "format": "docx"}

    @patch('docx.Document')
    def test_docx_doc_id_generation(self, mock_docx_class):
        """Loading the same source twice gives the same doc id."""
        mock_docx_class.return_value = Mock(paragraphs=[Mock(text="Consistent content")], tables=[])

        with tempfile.NamedTemporaryFile(suffix='.docx') as f:
            loader = DOCXLoader()
            source = SourceContent(f.name)

            assert loader.load(source).doc_id == loader.load(source).doc_id

View File

@@ -0,0 +1,180 @@
import json
import os
import tempfile
import pytest
from unittest.mock import patch, Mock
from crewai_tools.rag.loaders.json_loader import JSONLoader
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.source_content import SourceContent
class TestJSONLoader:
    """Exercise ``JSONLoader`` against temp files, inline text and mocked URLs."""

    def _create_temp_json_file(self, data) -> str:
        """Serialize ``data`` as JSON into a temporary ``.json`` file; return its path."""
        return self._create_temp_raw_file(json.dumps(data))

    def _create_temp_raw_file(self, content: str) -> str:
        """Write ``content`` verbatim to a temporary ``.json`` file; return its path."""
        handle = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
        with handle:
            handle.write(content)
        return handle.name

    def _load_from_path(self, path) -> LoaderResult:
        """Run a fresh ``JSONLoader`` over the file at ``path``."""
        return JSONLoader().load(SourceContent(path))

    def test_load_json_dict(self):
        """A JSON object is reported as type ``dict`` with its key count as size."""
        json_path = self._create_temp_json_file(
            {"name": "John", "age": 30, "items": ["a", "b", "c"]}
        )
        try:
            loaded = self._load_from_path(json_path)

            assert isinstance(loaded, LoaderResult)
            for fragment in ["name", "John", "age", "30"]:
                assert fragment in loaded.content
            assert loaded.metadata == {"format": "json", "type": "dict", "size": 3}
            assert loaded.source == json_path
        finally:
            os.unlink(json_path)

    def test_load_json_list(self):
        """A JSON array is reported as type ``list`` with its length as size."""
        items = [
            {"id": 1, "name": "Item 1"},
            {"id": 2, "name": "Item 2"},
        ]
        json_path = self._create_temp_json_file(items)
        try:
            loaded = self._load_from_path(json_path)

            assert loaded.metadata["type"] == "list"
            assert loaded.metadata["size"] == 2
            for label in ["Item 1", "Item 2"]:
                assert label in loaded.content
        finally:
            os.unlink(json_path)

    @pytest.mark.parametrize("value, expected_type", [
        ("simple string value", "str"),
        (42, "int"),
    ])
    def test_load_json_primitives(self, value, expected_type):
        """Top-level primitives are reported with their Python type name and size 1."""
        json_path = self._create_temp_json_file(value)
        try:
            loaded = self._load_from_path(json_path)

            assert loaded.metadata["type"] == expected_type
            assert loaded.metadata["size"] == 1
            assert str(value) in loaded.content
        finally:
            os.unlink(json_path)

    def test_load_malformed_json(self):
        """Unparseable JSON is returned verbatim with a ``parse_error`` in metadata."""
        broken_text = '{"invalid": json,}'
        json_path = self._create_temp_raw_file(broken_text)
        try:
            loaded = self._load_from_path(json_path)

            assert loaded.metadata["format"] == "json"
            assert "parse_error" in loaded.metadata
            assert loaded.content == '{"invalid": json,}'
        finally:
            os.unlink(json_path)

    def test_load_empty_file(self):
        """An empty file yields empty content and a ``parse_error`` in metadata."""
        json_path = self._create_temp_raw_file('')
        try:
            loaded = self._load_from_path(json_path)

            assert "parse_error" in loaded.metadata
            assert loaded.content == ''
        finally:
            os.unlink(json_path)

    def test_load_text_input(self):
        """JSON passed directly as a string is parsed like file content."""
        inline_json = '{"message": "hello", "count": 5}'

        loaded = JSONLoader().load(SourceContent(inline_json))

        for fragment in ["message", "hello", "count", "5"]:
            assert fragment in loaded.content
        assert loaded.metadata["type"] == "dict"
        assert loaded.metadata["size"] == 2

    def test_load_complex_nested_json(self):
        """Nested values appear in the content; size counts top-level keys only."""
        nested_payload = {
            "users": [
                {"id": 1, "profile": {"name": "Alice", "settings": {"theme": "dark"}}},
                {"id": 2, "profile": {"name": "Bob", "settings": {"theme": "light"}}}
            ],
            "meta": {"total": 2, "version": "1.0"}
        }
        json_path = self._create_temp_json_file(nested_payload)
        try:
            loaded = self._load_from_path(json_path)

            assert all(
                leaf in loaded.content for leaf in ["Alice", "Bob", "dark", "light"]
            )
            assert loaded.metadata["size"] == 2  # top-level keys
        finally:
            os.unlink(json_path)

    def test_consistent_doc_id(self):
        """Loading the same file twice gives the same doc id."""
        json_path = self._create_temp_json_file({"test": "data"})
        try:
            first = self._load_from_path(json_path)
            second = self._load_from_path(json_path)

            assert first.doc_id == second.doc_id
        finally:
            os.unlink(json_path)

    # ------------------------------
    # URL-based tests
    # ------------------------------

    @patch('requests.get')
    def test_url_response_valid_json(self, mock_get):
        """URL sources go through ``requests.get`` with JSON-specific default headers."""
        payload = {"key": "value", "number": 123}
        mock_get.return_value = Mock(
            text='{"key": "value", "number": 123}',
            json=Mock(return_value=payload),
            raise_for_status=Mock()
        )

        loaded = JSONLoader().load(SourceContent("https://api.example.com/data.json"))

        for fragment in ["key", "value", "number", "123"]:
            assert fragment in loaded.content
        # call_args[1] is the keyword-argument dict of the recorded call.
        sent_headers = mock_get.call_args[1]['headers']
        assert "application/json" in sent_headers['Accept']
        assert "crewai-tools JSONLoader" in sent_headers['User-Agent']

    @patch('requests.get')
    def test_url_response_not_json(self, mock_get):
        """Content is still produced when ``response.json()`` raises ``ValueError``."""
        mock_get.return_value = Mock(
            text='{"key": "value"}',
            json=Mock(side_effect=ValueError("Not JSON")),
            raise_for_status=Mock()
        )

        loaded = JSONLoader().load(SourceContent("https://example.com/data.json"))

        for fragment in ["key", "value"]:
            assert fragment in loaded.content

    @patch('requests.get')
    def test_url_with_custom_headers(self, mock_get):
        """Caller-supplied headers are forwarded to ``requests.get`` unchanged."""
        mock_get.return_value = Mock(
            text='{"data": "test"}',
            json=Mock(return_value={"data": "test"}),
            raise_for_status=Mock()
        )
        custom_headers = {"Authorization": "Bearer token", "Custom-Header": "value"}

        JSONLoader().load(
            SourceContent("https://api.example.com/data.json"), headers=custom_headers
        )

        assert mock_get.call_args[1]['headers'] == custom_headers

    @patch('requests.get')
    def test_url_network_failure(self, mock_get):
        """An exception raised by ``requests.get`` surfaces as ``ValueError``."""
        mock_get.side_effect = Exception("Network error")
        json_loader = JSONLoader()

        with pytest.raises(ValueError, match="Error fetching JSON from URL"):
            json_loader.load(SourceContent("https://api.example.com/data.json"))

    @patch('requests.get')
    def test_url_http_error(self, mock_get):
        """A failing ``raise_for_status`` surfaces as ``ValueError``."""
        failing_status = Mock(side_effect=Exception("404"))
        mock_get.return_value = Mock(raise_for_status=failing_status)
        json_loader = JSONLoader()

        with pytest.raises(ValueError, match="Error fetching JSON from URL"):
            json_loader.load(SourceContent("https://api.example.com/404.json"))

View File

@@ -0,0 +1,176 @@
import os
import tempfile
import pytest
from unittest.mock import patch, Mock
from crewai_tools.rag.loaders.mdx_loader import MDXLoader
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.source_content import SourceContent
class TestMDXLoader:
    """Exercise ``MDXLoader`` against temp files, inline text and mocked URLs."""

    def _write_temp_mdx(self, content):
        """Write ``content`` to a temporary ``.mdx`` file and return its path."""
        # The context manager closes the handle even if the write fails;
        # delete=False keeps the file so it can be reopened by path.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.mdx', delete=False) as f:
            f.write(content)
        return f.name

    def _load_from_file(self, content):
        """Load ``content`` via a temp file; return ``(result, path)``.

        The temp file is removed before returning, so ``path`` is only useful
        for comparing against ``result.source``.
        """
        path = self._write_temp_mdx(content)
        try:
            loader = MDXLoader()
            return loader.load(SourceContent(path)), path
        finally:
            os.unlink(path)

    def test_load_basic_mdx_file(self):
        """Imports, exports and JSX tags are removed; markdown and inner text stay."""
        content = """
import Component from './Component'
export const meta = { title: 'Test' }
# Test MDX File
This is a **markdown** file with JSX.
<Component prop="value" />
Some more content.
<div className="container">
<p>Nested content</p>
</div>
"""
        result, path = self._load_from_file(content)

        assert isinstance(result, LoaderResult)
        assert all(tag not in result.content for tag in ["import", "export", "<Component", "<div", "</div>"])
        assert all(text in result.content for text in ["# Test MDX File", "markdown", "Some more content", "Nested content"])
        assert result.metadata["format"] == "mdx"
        assert result.source == path

    def test_mdx_multiple_imports_exports(self):
        """Several import and export lines are all removed."""
        content = """
import React from 'react'
import { useState } from 'react'
import CustomComponent from './custom'
export default function Layout() { return null }
export const config = { test: true }
# Content
Regular markdown content here.
"""
        result, _ = self._load_from_file(content)

        assert "# Content" in result.content
        assert "Regular markdown content here." in result.content
        assert "import" not in result.content and "export" not in result.content

    def test_complex_jsx_cleanup(self):
        """Nested and inline JSX tags are removed while their text is kept."""
        content = """
# MDX with Complex JSX
<div className="alert alert-info">
<strong>Info:</strong> This is important information.
<ul><li>Item 1</li><li>Item 2</li></ul>
</div>
Regular paragraph text.
<MyComponent prop1="value1">Nested content inside component</MyComponent>
"""
        result, _ = self._load_from_file(content)

        assert all(tag not in result.content for tag in ["<div", "<strong>", "<ul>", "<MyComponent"])
        assert all(text in result.content for text in ["Info:", "Item 1", "Regular paragraph text.", "Nested content inside component"])

    def test_whitespace_cleanup(self):
        """Runs of blank lines are collapsed and outer whitespace is trimmed."""
        # The input is spelled out with explicit escapes so it really contains
        # runs of three or more newlines; otherwise the '\n\n\n' assertion
        # below would pass without testing anything.
        content = (
            "\n"
            "# Title\n"
            "\n\n\n"
            "Some content.\n"
            "\n\n\n\n"
            "More content after multiple newlines.\n"
            "\n\n"
            "Final content.\n"
        )
        result, _ = self._load_from_file(content)

        assert result.content.count('\n\n\n') == 0
        assert result.content.startswith('# Title')
        assert result.content.endswith('Final content.')

    def test_only_jsx_content(self):
        """A file made only of JSX keeps the inner text and drops the tags."""
        content = """
<div>
<h1>Only JSX content</h1>
<p>No markdown here</p>
</div>
"""
        result, _ = self._load_from_file(content)

        assert all(tag not in result.content for tag in ["<div>", "<h1>", "<p>"])
        assert "Only JSX content" in result.content
        assert "No markdown here" in result.content

    @patch('requests.get')
    def test_load_mdx_from_url(self, mock_get):
        """URL content is fetched through ``requests.get`` and cleaned the same way."""
        mock_get.return_value = Mock(text="# MDX from URL\n\nContent here.\n\n<Component />", raise_for_status=lambda: None)

        loader = MDXLoader()
        result = loader.load(SourceContent("https://example.com/content.mdx"))

        assert "# MDX from URL" in result.content
        assert "<Component />" not in result.content

    @patch('requests.get')
    def test_load_mdx_with_custom_headers(self, mock_get):
        """Caller-supplied headers are forwarded to ``requests.get`` unchanged."""
        mock_get.return_value = Mock(text="# Custom headers test", raise_for_status=lambda: None)

        loader = MDXLoader()
        loader.load(SourceContent("https://example.com"), headers={"Authorization": "Bearer token"})

        assert mock_get.call_args[1]['headers'] == {"Authorization": "Bearer token"}

    @patch('requests.get')
    def test_mdx_url_fetch_error(self, mock_get):
        """An exception raised by ``requests.get`` surfaces as ``ValueError``."""
        mock_get.side_effect = Exception("Network error")

        with pytest.raises(ValueError, match="Error fetching MDX from URL"):
            MDXLoader().load(SourceContent("https://example.com"))

    def test_load_inline_mdx_text(self):
        """MDX passed directly as a string is cleaned like file content."""
        content = """# Inline MDX\n\nimport Something from 'somewhere'\n\nContent with <Component prop=\"value\" />.\n\nexport const meta = { title: 'Test' }"""

        loader = MDXLoader()
        result = loader.load(SourceContent(content))

        assert "# Inline MDX" in result.content
        assert "Content with ." in result.content

    def test_empty_result_after_cleaning(self):
        """Input made only of imports, exports and empty tags cleans to nothing."""
        content = """
import Something from 'somewhere'
export const config = {}
<div></div>
"""
        result, _ = self._load_from_file(content)

        assert result.content.strip() == ""

    def test_edge_case_parsing(self):
        """Multi-line JSX and brace-style imports/exports are handled."""
        content = """
# Title
<Component>
Multi-line
JSX content
</Component>
import { a, b } from 'module'
export { x, y }
Final text.
"""
        result, _ = self._load_from_file(content)

        assert "# Title" in result.content
        assert "JSX content" in result.content
        assert "Final text." in result.content
        assert all(phrase not in result.content for phrase in ["import {", "export {", "<Component>"])

View File

@@ -0,0 +1,160 @@
import hashlib
import os
import tempfile
import pytest
from crewai_tools.rag.loaders.text_loader import TextFileLoader, TextLoader
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.source_content import SourceContent
def write_temp_file(content, suffix=".txt", encoding="utf-8"):
    """Write ``content`` to a new temporary file and return the file's path.

    The file is created with the given ``suffix`` and text ``encoding`` and
    is left on disk; the caller is responsible for removing it.
    """
    handle = tempfile.NamedTemporaryFile(
        mode="w",
        suffix=suffix,
        delete=False,
        encoding=encoding,
    )
    with handle:
        handle.write(content)
    return handle.name
def cleanup_temp_file(path):
    """Delete the file at ``path``; an already-missing file is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        return
class TestTextFileLoader:
    """Exercise ``TextFileLoader`` against temporary files on disk."""

    def test_basic_text_file(self):
        """File content is returned unchanged with the path as the source."""
        content = "This is test content\nWith multiple lines\nAnd more text"
        path = write_temp_file(content)
        try:
            result = TextFileLoader().load(SourceContent(path))

            assert isinstance(result, LoaderResult)
            assert result.content == content
            assert result.source == path
            assert result.doc_id
            assert result.metadata in (None, {})
        finally:
            cleanup_temp_file(path)

    def test_empty_file(self):
        """An empty file yields empty content."""
        path = write_temp_file("")
        try:
            result = TextFileLoader().load(SourceContent(path))
            assert result.content == ""
        finally:
            cleanup_temp_file(path)

    def test_unicode_content(self):
        """Non-ASCII text survives the round trip through the loader."""
        content = "Hello 世界 🌍 émojis 🎉 åäö"
        path = write_temp_file(content)
        try:
            result = TextFileLoader().load(SourceContent(path))
            assert content in result.content
        finally:
            cleanup_temp_file(path)

    def test_large_file(self):
        """A 100-line file is returned with every line and newline intact."""
        content = "\n".join(f"Line {i}" for i in range(100))
        path = write_temp_file(content)
        try:
            result = TextFileLoader().load(SourceContent(path))

            assert "Line 0" in result.content
            assert "Line 99" in result.content
            assert result.content.count("\n") == 99
        finally:
            cleanup_temp_file(path)

    def test_missing_file(self):
        """A path that does not exist raises ``FileNotFoundError``."""
        with pytest.raises(FileNotFoundError):
            TextFileLoader().load(SourceContent("/nonexistent/path.txt"))

    def test_permission_denied(self):
        """An unreadable file raises ``PermissionError``.

        Skipped when the current user or platform can still read a file whose
        mode bits were cleared, since the scenario cannot be set up there.
        """
        path = write_temp_file("Some content")
        try:
            os.chmod(path, 0o000)
            # Mode bits are not enforced for root or on Windows; in that case
            # the read would succeed and the assertion below could never hold.
            if os.access(path, os.R_OK):
                pytest.skip("file permissions are not enforced for this user/platform")
            with pytest.raises(PermissionError):
                TextFileLoader().load(SourceContent(path))
        finally:
            # Restore permissions first so the file can be removed everywhere.
            os.chmod(path, 0o644)
            cleanup_temp_file(path)

    def test_doc_id_consistency(self):
        """The doc id is stable and equals SHA-256 of path + content."""
        content = "Consistent content"
        path = write_temp_file(content)
        try:
            loader = TextFileLoader()
            result1 = loader.load(SourceContent(path))
            result2 = loader.load(SourceContent(path))

            expected_id = hashlib.sha256((path + content).encode("utf-8")).hexdigest()
            assert result1.doc_id == result2.doc_id == expected_id
        finally:
            cleanup_temp_file(path)

    def test_various_extensions(self):
        """Content is returned unchanged for each of the listed file suffixes."""
        content = "Same content"
        for ext in [".txt", ".md", ".log", ".json"]:
            path = write_temp_file(content, suffix=ext)
            try:
                result = TextFileLoader().load(SourceContent(path))
                assert result.content == content
            finally:
                cleanup_temp_file(path)
class TestTextLoader:
    """Exercise ``TextLoader`` with raw strings passed as the source."""

    def test_basic_text(self):
        """Content is returned as-is; source and doc id equal its SHA-256 hex digest."""
        raw = "Raw text"
        digest = hashlib.sha256(raw.encode("utf-8")).hexdigest()

        loaded = TextLoader().load(SourceContent(raw))

        assert loaded.content == raw
        assert loaded.source == digest
        assert loaded.doc_id == digest

    def test_multiline_text(self):
        """A middle line of multi-line input is present in the content."""
        raw = "Line 1\nLine 2\nLine 3"
        loaded = TextLoader().load(SourceContent(raw))
        assert "Line 2" in loaded.content

    def test_empty_text(self):
        """Empty input yields empty content and the digest of the empty string as source."""
        loaded = TextLoader().load(SourceContent(""))
        empty_digest = hashlib.sha256("".encode("utf-8")).hexdigest()

        assert loaded.content == ""
        assert loaded.source == empty_digest

    def test_unicode_text(self):
        """Non-ASCII text is present in the returned content."""
        raw = "世界 🌍 émojis 🎉 åäö"
        loaded = TextLoader().load(SourceContent(raw))
        assert raw in loaded.content

    def test_special_characters(self):
        """Punctuation-heavy input is returned unchanged."""
        raw = "!@#$$%^&*()_+-=~`{}[]\\|;:'\",.<>/?"
        loaded = TextLoader().load(SourceContent(raw))
        assert loaded.content == raw

    def test_doc_id_uniqueness(self):
        """Different inputs produce different doc ids."""
        doc_a = TextLoader().load(SourceContent("A"))
        doc_b = TextLoader().load(SourceContent("B"))
        assert doc_a.doc_id != doc_b.doc_id

    def test_whitespace_text(self):
        """Whitespace-only input is returned unchanged."""
        raw = " \n\t "
        loaded = TextLoader().load(SourceContent(raw))
        assert loaded.content == raw

    def test_long_text(self):
        """A 10,000-character input keeps its full length."""
        raw = "A" * 10000
        loaded = TextLoader().load(SourceContent(raw))
        assert len(loaded.content) == 10000
class TestTextLoadersIntegration:
    """Cross-check ``TextLoader`` and ``TextFileLoader`` on the same text."""

    def test_consistency_between_loaders(self):
        """Both loaders return the same content but differ in source and doc id."""
        shared_text = "Consistent content"
        from_text = TextLoader().load(SourceContent(shared_text))

        tmp_path = write_temp_file(shared_text)
        try:
            from_file = TextFileLoader().load(SourceContent(tmp_path))

            assert from_text.content == from_file.content
            assert from_text.source != from_file.source
            assert from_text.doc_id != from_file.doc_id
        finally:
            cleanup_temp_file(tmp_path)

View File

@@ -0,0 +1,137 @@
import pytest
from unittest.mock import patch, Mock
from crewai_tools.rag.loaders.webpage_loader import WebPageLoader
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.source_content import SourceContent
class TestWebPageLoader:
def setup_mock_response(self, text, status_code=200, content_type="text/html"):
    """Build a ``Mock`` standing in for an HTTP response.

    The mock carries ``text``, a fixed ``apparent_encoding`` of ``"utf-8"``,
    the given ``status_code`` and a ``headers`` dict holding ``content_type``.
    """
    return Mock(
        text=text,
        apparent_encoding="utf-8",
        status_code=status_code,
        headers={"content-type": content_type},
    )
def setup_mock_soup(self, text, title=None, script_style_elements=None):
    """Build a ``Mock`` standing in for a parsed soup object.

    ``get_text()`` returns ``text``; ``title`` is ``None`` when no title is
    given, otherwise a mock whose ``string`` is the title; calling the soup
    itself returns ``script_style_elements`` (or an empty list).
    """
    title_node = None if title is None else Mock(string=title)
    found_elements = script_style_elements if script_style_elements else []

    soup = Mock(title=title_node, return_value=found_elements)
    soup.get_text.return_value = text
    return soup
@patch('requests.get')
@patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
def test_load_basic_webpage(self, mock_bs, mock_get):
    """A simple page yields the soup's text as content and its title in metadata."""
    page_html = "<html><head><title>Test Page</title></head><body><p>Test content</p></body></html>"
    mock_get.return_value = self.setup_mock_response(page_html)
    mock_bs.return_value = self.setup_mock_soup("Test content", title="Test Page")

    loaded = WebPageLoader().load(SourceContent("https://example.com"))

    assert isinstance(loaded, LoaderResult)
    assert loaded.content == "Test content"
    assert loaded.metadata["title"] == "Test Page"
@patch('requests.get')
@patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
def test_load_webpage_with_scripts_and_styles(self, mock_bs, mock_get):
    """Every script/style element returned by the soup is decomposed exactly once."""
    html = """
<html><head><title>Page with Scripts</title><style>body { color: red; }</style></head>
<body><script>console.log('test');</script><p>Visible content</p></body></html>
"""
    mock_get.return_value = self.setup_mock_response(html)

    script_tags = [Mock(), Mock()]
    style_tags = [Mock()]
    removable = script_tags + style_tags
    for tag in removable:
        tag.decompose = Mock()

    mock_bs.return_value = self.setup_mock_soup(
        "Page with Scripts Visible content",
        title="Page with Scripts",
        script_style_elements=removable,
    )

    loaded = WebPageLoader().load(SourceContent("https://example.com/with-scripts"))

    assert "Visible content" in loaded.content
    for tag in removable:
        tag.decompose.assert_called_once()
@patch('requests.get')
@patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
def test_text_cleaning_and_title_handling(self, mock_bs, mock_get):
    """Messy whitespace still yields content; a missing title becomes an empty string."""
    mock_get.return_value = self.setup_mock_response(
        "<html><body><p> Messy text </p></body></html>"
    )
    messy_text = "Text with extra spaces\n\n More\t text \n\n"
    mock_bs.return_value = self.setup_mock_soup(messy_text, title=None)

    loaded = WebPageLoader().load(SourceContent("https://example.com/messy-text"))

    assert loaded.content is not None
    assert loaded.metadata["title"] == ""
@patch('requests.get')
@patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
def test_empty_or_missing_title(self, mock_bs, mock_get):
    """Both a missing title and an empty title are reported as an empty string."""
    page_html = "<html><head><title></title></head><body>Content</body></html>"
    for title_value in (None, ""):
        mock_get.return_value = self.setup_mock_response(page_html)
        mock_bs.return_value = self.setup_mock_soup("Content", title=title_value)

        loaded = WebPageLoader().load(SourceContent("https://example.com"))

        assert loaded.metadata["title"] == ""
@patch('requests.get')
def test_custom_and_default_headers(self, mock_get):
    """Check that headers passed to load() reach requests.get unchanged.

    NOTE: despite the name, only the custom-headers path is exercised here.
    """
    custom_headers = {"User-Agent": "Bot", "Authorization": "Bearer xyz", "Accept": "text/html"}
    mock_get.return_value = self.setup_mock_response("<html><body>Test</body></html>")

    with patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup') as mock_bs:
        mock_bs.return_value = self.setup_mock_soup("Test")
        WebPageLoader().load(SourceContent("https://example.com"), headers=custom_headers)

    # call_args unpacks into (positional args, keyword args) of the last call.
    _positional, keyword = mock_get.call_args
    assert keyword['headers'] == custom_headers
@patch('requests.get')
def test_error_handling(self, mock_get):
    """Check that failures from requests.get surface as ValueError with the loader's message."""
    failures = (Exception("Fail"), ValueError("Bad"), ImportError("Oops"))
    for failure in failures:
        mock_get.side_effect = failure
        source = SourceContent("https://example.com")
        with pytest.raises(ValueError, match="Error loading webpage"):
            WebPageLoader().load(source)
@patch('requests.get')
def test_timeout_and_http_error(self, mock_get):
    """Check that a timeout, and a response whose raise_for_status() fails, both raise ValueError."""
    import requests

    # Case 1: the request itself times out.
    mock_get.side_effect = requests.Timeout("Timeout")
    with pytest.raises(ValueError):
        WebPageLoader().load(SourceContent("https://example.com"))

    # Case 2: the request returns, but the response reports an HTTP error.
    failing_response = Mock()
    failing_response.raise_for_status.side_effect = requests.HTTPError("404")
    # Clear the earlier side effect so return_value is actually used.
    mock_get.side_effect = None
    mock_get.return_value = failing_response
    with pytest.raises(ValueError):
        WebPageLoader().load(SourceContent("https://example.com/404"))
@patch('requests.get')
@patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
def test_doc_id_consistency(self, mock_bs, mock_get):
    """Check that loading the same URL twice with one loader gives equal doc_id values."""
    mock_get.return_value = self.setup_mock_response("<html><body>Doc</body></html>")
    mock_bs.return_value = self.setup_mock_soup("Doc")

    loader = WebPageLoader()
    first_id = loader.load(SourceContent("https://example.com")).doc_id
    second_id = loader.load(SourceContent("https://example.com")).doc_id

    assert first_id == second_id
@patch('requests.get')
@patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
def test_status_code_and_content_type(self, mock_bs, mock_get):
    """Check that the response status code and content type appear in the result metadata."""
    for code in (200, 201, 301):
        mock_get.return_value = self.setup_mock_response(
            f"<html><body>Status {code}</body></html>", status_code=code
        )
        mock_bs.return_value = self.setup_mock_soup(f"Status {code}")
        outcome = WebPageLoader().load(SourceContent(f"https://example.com/{code}"))
        assert outcome.metadata["status_code"] == code

    for mime in ("text/html", "text/plain", "application/xhtml+xml"):
        mock_get.return_value = self.setup_mock_response(
            "<html><body>Content</body></html>", content_type=mime
        )
        mock_bs.return_value = self.setup_mock_soup("Content")
        outcome = WebPageLoader().load(SourceContent("https://example.com"))
        assert outcome.metadata["content_type"] == mime

View File

@@ -0,0 +1,137 @@
import pytest
from unittest.mock import patch, Mock
from crewai_tools.rag.loaders.webpage_loader import WebPageLoader
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.source_content import SourceContent
class TestWebPageLoader:
    """Tests for WebPageLoader.load with requests.get (and usually BeautifulSoup) patched."""

    def setup_mock_response(self, text, status_code=200, content_type="text/html"):
        """Build a Mock standing in for an HTTP response.

        Args:
            text: Value exposed as ``response.text``.
            status_code: Value exposed as ``response.status_code``.
            content_type: Stored under the ``content-type`` key of ``response.headers``.

        Returns:
            The configured Mock; ``apparent_encoding`` is always ``"utf-8"``.
        """
        return Mock(
            text=text,
            apparent_encoding="utf-8",
            status_code=status_code,
            headers={"content-type": content_type},
        )

    def setup_mock_soup(self, text, title=None, script_style_elements=None):
        """Build a Mock standing in for a parsed soup object.

        Args:
            text: Value returned by ``soup.get_text()``.
            title: When not None, ``soup.title`` is a Mock whose ``string`` is this
                value; otherwise ``soup.title`` is None.
            script_style_elements: Value returned when the soup itself is called;
                an empty list is used when this is falsy.

        Returns:
            The configured Mock.
        """
        if title is None:
            title_node = None
        else:
            title_node = Mock(string=title)

        soup = Mock(return_value=script_style_elements if script_style_elements else [])
        soup.title = title_node
        soup.get_text.return_value = text
        return soup

    @patch('requests.get')
    @patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
    def test_load_basic_webpage(self, mock_bs, mock_get):
        """A simple page yields a LoaderResult carrying the mocked text and title."""
        markup = "<html><head><title>Test Page</title></head><body><p>Test content</p></body></html>"
        mock_get.return_value = self.setup_mock_response(markup)
        mock_bs.return_value = self.setup_mock_soup("Test content", title="Test Page")

        loaded = WebPageLoader().load(SourceContent("https://example.com"))

        assert isinstance(loaded, LoaderResult)
        assert loaded.content == "Test content"
        assert loaded.metadata["title"] == "Test Page"

    @patch('requests.get')
    @patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
    def test_load_webpage_with_scripts_and_styles(self, mock_bs, mock_get):
        """Each element returned by calling the soup mock is decomposed exactly once."""
        # Joined from parts so the payload is independent of source indentation.
        markup = "\n".join(
            (
                "",
                "<html><head><title>Page with Scripts</title><style>body { color: red; }</style></head>",
                "<body><script>console.log('test');</script><p>Visible content</p></body></html>",
                "",
            )
        )
        mock_get.return_value = self.setup_mock_response(markup)

        # Two script stand-ins followed by one style stand-in.
        doomed = [Mock(), Mock(), Mock()]
        for element in doomed:
            element.decompose = Mock()
        mock_bs.return_value = self.setup_mock_soup(
            "Page with Scripts Visible content",
            title="Page with Scripts",
            script_style_elements=list(doomed),
        )

        loaded = WebPageLoader().load(SourceContent("https://example.com/with-scripts"))

        assert "Visible content" in loaded.content
        for element in doomed:
            element.decompose.assert_called_once()

    @patch('requests.get')
    @patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
    def test_text_cleaning_and_title_handling(self, mock_bs, mock_get):
        """Whitespace-heavy text still yields content; a missing title becomes ""."""
        noisy = "Text with extra spaces\n\n More\t text \n\n"
        mock_get.return_value = self.setup_mock_response("<html><body><p> Messy text </p></body></html>")
        mock_bs.return_value = self.setup_mock_soup(noisy, title=None)

        loaded = WebPageLoader().load(SourceContent("https://example.com/messy-text"))

        # Only presence of content is checked, not its exact cleaned form.
        assert loaded.content is not None
        assert loaded.metadata["title"] == ""

    @patch('requests.get')
    @patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
    def test_empty_or_missing_title(self, mock_bs, mock_get):
        """Both a None title and an empty title produce an empty metadata title."""
        markup = "<html><head><title></title></head><body>Content</body></html>"
        for variant in (None, ""):
            mock_get.return_value = self.setup_mock_response(markup)
            mock_bs.return_value = self.setup_mock_soup("Content", title=variant)

            loaded = WebPageLoader().load(SourceContent("https://example.com"))

            assert loaded.metadata["title"] == ""

    @patch('requests.get')
    def test_custom_and_default_headers(self, mock_get):
        """Headers passed to load() are forwarded unchanged to requests.get.

        NOTE: despite the name, only the custom-headers path is exercised here.
        """
        custom_headers = {"User-Agent": "Bot", "Authorization": "Bearer xyz", "Accept": "text/html"}
        mock_get.return_value = self.setup_mock_response("<html><body>Test</body></html>")

        with patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup') as mock_bs:
            mock_bs.return_value = self.setup_mock_soup("Test")
            WebPageLoader().load(SourceContent("https://example.com"), headers=custom_headers)

        # call_args unpacks into (positional args, keyword args) of the last call.
        _positional, keyword = mock_get.call_args
        assert keyword['headers'] == custom_headers

    @patch('requests.get')
    def test_error_handling(self, mock_get):
        """Failures from requests.get surface as ValueError with the loader's message."""
        for failure in (Exception("Fail"), ValueError("Bad"), ImportError("Oops")):
            mock_get.side_effect = failure
            target = SourceContent("https://example.com")
            with pytest.raises(ValueError, match="Error loading webpage"):
                WebPageLoader().load(target)

    @patch('requests.get')
    def test_timeout_and_http_error(self, mock_get):
        """A timeout, and a response whose raise_for_status() fails, both raise ValueError."""
        import requests

        # Case 1: the request itself times out.
        mock_get.side_effect = requests.Timeout("Timeout")
        with pytest.raises(ValueError):
            WebPageLoader().load(SourceContent("https://example.com"))

        # Case 2: a response is returned but reports an HTTP error.
        broken = Mock()
        broken.raise_for_status.side_effect = requests.HTTPError("404")
        # Clear the earlier side effect so return_value takes effect.
        mock_get.side_effect = None
        mock_get.return_value = broken
        with pytest.raises(ValueError):
            WebPageLoader().load(SourceContent("https://example.com/404"))

    @patch('requests.get')
    @patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
    def test_doc_id_consistency(self, mock_bs, mock_get):
        """Loading the same URL twice with one loader gives equal doc_id values."""
        mock_get.return_value = self.setup_mock_response("<html><body>Doc</body></html>")
        mock_bs.return_value = self.setup_mock_soup("Doc")

        loader = WebPageLoader()
        first_id = loader.load(SourceContent("https://example.com")).doc_id
        second_id = loader.load(SourceContent("https://example.com")).doc_id

        assert first_id == second_id

    @patch('requests.get')
    @patch('crewai_tools.rag.loaders.webpage_loader.BeautifulSoup')
    def test_status_code_and_content_type(self, mock_bs, mock_get):
        """The response status code and content type are reported in the result metadata."""
        for code in (200, 201, 301):
            mock_get.return_value = self.setup_mock_response(
                f"<html><body>Status {code}</body></html>", status_code=code
            )
            mock_bs.return_value = self.setup_mock_soup(f"Status {code}")
            loaded = WebPageLoader().load(SourceContent(f"https://example.com/{code}"))
            assert loaded.metadata["status_code"] == code

        for mime in ("text/html", "text/plain", "application/xhtml+xml"):
            mock_get.return_value = self.setup_mock_response(
                "<html><body>Content</body></html>", content_type=mime
            )
            mock_bs.return_value = self.setup_mock_soup("Content")
            loaded = WebPageLoader().load(SourceContent("https://example.com"))
            assert loaded.metadata["content_type"] == mime