Files
crewAI/lib/crewai-tools/src/crewai_tools/rag/loaders/xml_loader.py
Devin AI 506155b4f4 fix: replace xml.etree.ElementTree with defusedxml to prevent XXE attacks
Addresses #4865 - The native Python xml library is vulnerable to XML
External Entity (XXE) attacks, which can leak confidential data, and to
XML bombs, which can cause denial of service.

Changes:
- Replace xml.etree.ElementTree with defusedxml.ElementTree in xml_loader.py
- Replace xml.etree.ElementTree with defusedxml.ElementTree in arxiv_paper_tool.py
- Add defusedxml~=0.7.1 as a dependency in crewai-tools pyproject.toml
- Update arxiv_paper_tool_test.py to use defusedxml
- Replace WebPageLoader tests in test_xml_loader.py with proper XMLLoader tests
- Add XXE attack tests (entity expansion, billion laughs, parameter entities)
- Remove noqa: S314 comments since defusedxml is safe

Co-Authored-By: João <joao@crewai.com>
2026-03-14 05:24:39 +00:00

65 lines
2.2 KiB
Python

from typing import Any
from defusedxml.ElementTree import ParseError, fromstring, parse
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.loaders.utils import load_from_url
from crewai_tools.rag.source_content import SourceContent
class XMLLoader(BaseLoader):
    """Loader that turns an XML document into newline-separated plain text.

    The XML may come from a URL, a local file, or the raw string held by the
    ``SourceContent``. Parsing goes through ``defusedxml``'s ``fromstring`` /
    ``parse`` functions.
    """

    def load(self, source_content: SourceContent, **kwargs: Any) -> LoaderResult:  # type: ignore[override]
        """Load and parse XML content from various sources.

        Args:
            source_content: The source to load. If it is a URL the body is
                fetched with ``load_from_url``; if it is an existing path the
                file is read from disk; otherwise ``source_content.source``
                is used as-is.
            **kwargs: Additional keyword arguments, forwarded as a dict to
                ``load_from_url`` when loading from a URL.

        Returns:
            LoaderResult: The result of loading and parsing the XML content.
        """
        source_ref = source_content.source_ref
        content = source_content.source

        if source_content.is_url():
            content = load_from_url(
                source_ref,
                kwargs,
                accept_header="application/xml, text/xml, text/plain",
                loader_name="XMLLoader",
            )
        elif source_content.path_exists():
            content = self._load_from_file(source_ref)

        return self._parse_xml(content, source_ref)

    @staticmethod
    def _load_from_file(path: str) -> str:
        """Return the full contents of ``path`` decoded as UTF-8."""
        with open(path, encoding="utf-8") as file:
            return file.read()

    def _parse_xml(self, content: str, source_ref: str) -> LoaderResult:
        """Parse ``content`` as XML and collect its text nodes.

        Args:
            content: Candidate XML text.
            source_ref: Reference to the original source. It is used as the
                result's ``source``, as input to the document id, and as the
                file name handed to ``parse`` when ``content`` does not look
                like inline XML.

        Returns:
            LoaderResult: On success, the non-blank text nodes (each stripped)
            joined by newlines, with ``root_tag`` in the metadata. On a
            ``ParseError``, the original ``content`` unchanged, with the error
            message under ``parse_error`` in the metadata.

        Only ``ParseError`` is handled here; any other exception raised by the
        parser or by opening ``source_ref`` propagates to the caller.
        """
        # Sniff and parse the SAME normalized string. Surrounding whitespace
        # or a leading byte-order mark in front of an ``<?xml ...?>``
        # declaration would otherwise either be rejected by the parser or
        # defeat the "starts with '<'" check below.
        xml_text = content.strip().lstrip("\ufeff").lstrip()

        try:
            if xml_text.startswith("<"):
                root = fromstring(xml_text)
            else:
                # Not inline XML: fall back to parsing source_ref as a file.
                root = parse(source_ref).getroot()
        except ParseError as e:
            # Best effort: keep the raw content and record why parsing failed.
            text = content
            metadata = {"format": "xml", "parse_error": str(e)}
        else:
            stripped_parts = (part.strip() for part in root.itertext() if part)
            text = "\n".join(part for part in stripped_parts if part)
            metadata = {"format": "xml", "root_tag": root.tag}

        return LoaderResult(
            content=text,
            source=source_ref,
            metadata=metadata,
            doc_id=self.generate_doc_id(source_ref=source_ref, content=text),
        )