mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-04-15 15:32:40 +00:00
* fix: add path and URL validation to RAG tools Add validation utilities to prevent unauthorized file reads and SSRF when RAG tools accept LLM-controlled paths/URLs at runtime. Changes: - New crewai_tools.utilities.safe_path module with validate_file_path(), validate_directory_path(), and validate_url() - File paths validated against base directory (defaults to cwd). Resolves symlinks and ../ traversal. Rejects escape attempts. - URLs validated: file:// blocked entirely. HTTP/HTTPS resolves DNS and blocks private/reserved IPs (10.x, 172.16-31.x, 192.168.x, 127.x, 169.254.x, 0.0.0.0, ::1, fc00::/7). - Validation applied in RagTool.add() — catches all RAG search tools (JSON, CSV, PDF, TXT, DOCX, MDX, Directory, etc.) - Removed file:// scheme support from DataTypes.from_content() - CREWAI_TOOLS_ALLOW_UNSAFE_PATHS=true env var for backward compat - 27 tests covering traversal, symlinks, private IPs, cloud metadata, IPv6, escape hatch, and valid paths/URLs * fix: validate path/URL keyword args in RagTool.add() The original patch validated positional *args but left all keyword arguments (path=, file_path=, directory_path=, url=, website=, github_url=, youtube_url=) unvalidated, providing a trivial bypass for both path-traversal and SSRF checks. Applies validate_file_path() to path/file_path/directory_path kwargs and validate_url() to url/website/github_url/youtube_url kwargs before they reach the adapter. Adds a regression-test file covering all eight kwarg vectors plus the two existing positional-arg checks. 
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: address CodeQL and review comments on RAG path/URL validation - Replace insecure tempfile.mktemp() with inline symlink target in test - Remove unused 'target' variable and unused tempfile import - Narrow broad except Exception: pass to only catch urlparse errors; validate_url ValueError now propagates instead of being silently swallowed - Fix ruff B904 (raise-without-from-inside-except) in safe_path.py - Fix ruff B007 (unused loop variable 'family') in safe_path.py - Use validate_directory_path in DirectorySearchTool.add() so the public utility is exercised in production code Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * style: fix ruff format + remaining lint issues * fix: resolve mypy type errors in RAG path/URL validation - Cast sockaddr[0] to str() to satisfy mypy (socket.getaddrinfo returns sockaddr where [0] is str but typed as str | int) - Remove now-unnecessary `type: ignore[assignment]` and `type: ignore[literal-required]` comments in rag_tool.py Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: unroll dynamic TypedDict key loops to satisfy mypy literal-required Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * test: allow tmp paths in RAG data-type tests via CREWAI_TOOLS_ALLOW_UNSAFE_PATHS TemporaryDirectory creates files under /tmp/ which is outside CWD and is correctly blocked by the new path validation. These tests exercise data-type handling, not security, so add an autouse fixture that sets CREWAI_TOOLS_ALLOW_UNSAFE_PATHS=true for the whole file. Path/URL security is covered by test_rag_tool_path_validation.py. 
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * test: allow tmp paths in search-tool and rag_tool tests via CREWAI_TOOLS_ALLOW_UNSAFE_PATHS test_search_tools.py has tests for TXTSearchTool, CSVSearchTool, MDXSearchTool, JSONSearchTool, and DirectorySearchTool that create files under /tmp/ via tempfile, which is outside CWD and correctly blocked by the new path validation. rag_tool_test.py has one test that calls tool.add() with a TemporaryDirectory path. Add the same autouse allow_tmp_paths fixture used in test_rag_tool_add_data_type.py. Security is covered separately by test_rag_tool_path_validation.py. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * chore: update tool specifications * docs: document CodeInterpreterTool removal and RAG path/URL validation Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: address three review comments on path/URL validation - safe_path._is_private_or_reserved: after unwrapping IPv4-mapped IPv6 to IPv4, only check against IPv4 networks to avoid TypeError when comparing an IPv4Address against IPv6Network objects. - safe_path.validate_file_path: handle filesystem-root base_dir ('/') by not appending os.sep when the base already ends with a separator, preventing the '//'-prefix bug. - rag_tool.add: path-detection heuristic now checks for both '/' and os.sep so forward-slash paths are caught on Windows as well as Unix. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: remove unused _BLOCKED_NETWORKS variable after IPv4/IPv6 split * chore: update tool specifications --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
364 lines
12 KiB
Python
364 lines
12 KiB
Python
import os
|
|
from pathlib import Path
|
|
import tempfile
|
|
from unittest.mock import MagicMock
|
|
|
|
from crewai_tools.rag.data_types import DataType
|
|
from crewai_tools.tools import (
|
|
CSVSearchTool,
|
|
CodeDocsSearchTool,
|
|
DOCXSearchTool,
|
|
DirectorySearchTool,
|
|
GithubSearchTool,
|
|
JSONSearchTool,
|
|
MDXSearchTool,
|
|
PDFSearchTool,
|
|
TXTSearchTool,
|
|
WebsiteSearchTool,
|
|
XMLSearchTool,
|
|
YoutubeChannelSearchTool,
|
|
YoutubeVideoSearchTool,
|
|
)
|
|
from crewai_tools.tools.rag.rag_tool import Adapter
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def allow_tmp_paths(monkeypatch: pytest.MonkeyPatch) -> None:
    """Set ``CREWAI_TOOLS_ALLOW_UNSAFE_PATHS=true`` for every test in this module.

    The search-tool tests below create their fixtures with ``tempfile``, so the
    files may live outside the current working directory (e.g. under /tmp/).
    Path validation itself is covered in test_rag_tool_path_validation.py.
    """
    env_flag = "CREWAI_TOOLS_ALLOW_UNSAFE_PATHS"
    monkeypatch.setenv(env_flag, "true")
|
|
|
|
|
|
@pytest.fixture
def mock_adapter():
    """Provide a ``MagicMock`` constrained to the ``Adapter`` interface."""
    return MagicMock(spec=Adapter)
|
|
|
|
|
|
@pytest.mark.vcr()
def test_directory_search_tool():
    """Search a temporary directory holding a single text file.

    The tool is built and queried while the temporary directory still exists;
    the query text is expected to appear (case-insensitively) in the result.
    """
    with tempfile.TemporaryDirectory() as workdir:
        sample_file = Path(workdir) / "test.txt"
        sample_file.write_text("This is a test file for directory search")

        searcher = DirectorySearchTool(directory=workdir)
        answer = searcher._run(search_query="test file")
        assert "test file" in answer.lower()
|
|
|
|
|
|
def test_pdf_search_tool(mock_adapter):
    """Check PDFSearchTool forwards the PDF and the query to the adapter.

    Two variants are exercised against the same mocked adapter: the PDF given
    to the constructor, and the PDF given to ``_run``.
    """
    mock_adapter.query.return_value = "this is a test"

    def verify(output):
        # Same assertion order for both variants: result, add(), query().
        assert "this is a test" in output.lower()
        mock_adapter.add.assert_called_once_with(
            "test.pdf", data_type=DataType.PDF_FILE
        )
        mock_adapter.query.assert_called_once_with(
            "test content", similarity_threshold=0.6, limit=5
        )

    # Variant 1: PDF supplied at construction time.
    preconfigured = PDFSearchTool(pdf="test.pdf", adapter=mock_adapter)
    verify(preconfigured._run(query="test content"))

    # Clear recorded calls so the "called once" assertions start fresh.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Variant 2: PDF supplied at call time.
    unconfigured = PDFSearchTool(adapter=mock_adapter)
    verify(unconfigured._run(pdf="test.pdf", query="test content"))
|
|
|
|
|
|
@pytest.mark.vcr()
def test_txt_search_tool():
    """Add a temporary .txt file to TXTSearchTool and search it."""
    # delete=False keeps the file on disk after the handle is closed; the
    # ``finally`` clause below removes it.
    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as handle:
        handle.write(b"This is a test file for txt search")
        txt_path = handle.name

    try:
        searcher = TXTSearchTool()
        searcher.add(txt_path)
        answer = searcher._run(search_query="test file")
        assert "test file" in answer.lower()
    finally:
        os.remove(txt_path)
|
|
|
|
|
|
def test_docx_search_tool(mock_adapter):
    """Check DOCXSearchTool forwards the document and the query to the adapter.

    Two variants are exercised against the same mocked adapter: the document
    given to the constructor, and the document given to ``_run``.
    """
    mock_adapter.query.return_value = "this is a test"

    def verify(output):
        # Same assertion order for both variants: result, add(), query().
        assert "this is a test" in output.lower()
        mock_adapter.add.assert_called_once_with("test.docx", data_type=DataType.DOCX)
        mock_adapter.query.assert_called_once_with(
            "test content", similarity_threshold=0.6, limit=5
        )

    # Variant 1: document supplied at construction time.
    preconfigured = DOCXSearchTool(docx="test.docx", adapter=mock_adapter)
    verify(preconfigured._run(search_query="test content"))

    # Clear recorded calls so the "called once" assertions start fresh.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Variant 2: document supplied at call time.
    unconfigured = DOCXSearchTool(adapter=mock_adapter)
    verify(unconfigured._run(docx="test.docx", search_query="test content"))
|
|
|
|
|
|
@pytest.mark.vcr()
def test_json_search_tool():
    """Search a temporary .json file passed to JSONSearchTool at call time."""
    # delete=False keeps the file on disk after the handle is closed; the
    # ``finally`` clause below removes it.
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as handle:
        handle.write(b'{"test": "This is a test JSON file"}')
        json_file = handle.name

    try:
        searcher = JSONSearchTool()
        answer = searcher._run(search_query="test JSON", json_path=json_file)
        assert "test json" in answer.lower()
    finally:
        os.remove(json_file)
|
|
|
|
|
|
def test_xml_search_tool(mock_adapter):
    """Check XMLSearchTool forwards a call-time XML path and query to the adapter."""
    mock_adapter.query.return_value = "this is a test"

    searcher = XMLSearchTool(adapter=mock_adapter)
    answer = searcher._run(search_query="test XML", xml="test.xml")

    assert "this is a test" in answer.lower()
    # Unlike the other tools in this module, add() is expected with the path only.
    mock_adapter.add.assert_called_once_with("test.xml")
    mock_adapter.query.assert_called_once_with(
        "test XML", similarity_threshold=0.6, limit=5
    )
|
|
|
|
|
|
@pytest.mark.vcr()
def test_csv_search_tool():
    """Add a temporary .csv file to CSVSearchTool and search it."""
    # delete=False keeps the file on disk after the handle is closed; the
    # ``finally`` clause below removes it.
    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as handle:
        handle.write(b"name,description\ntest,This is a test CSV file")
        csv_path = handle.name

    try:
        searcher = CSVSearchTool()
        searcher.add(csv_path)
        answer = searcher._run(search_query="test CSV")
        assert "test csv" in answer.lower()
    finally:
        os.remove(csv_path)
|
|
|
|
|
|
@pytest.mark.vcr()
def test_mdx_search_tool():
    """Add a temporary .mdx file to MDXSearchTool and search it."""
    # delete=False keeps the file on disk after the handle is closed; the
    # ``finally`` clause below removes it.
    with tempfile.NamedTemporaryFile(suffix=".mdx", delete=False) as handle:
        handle.write(b"# Test MDX\nThis is a test MDX file")
        mdx_path = handle.name

    try:
        searcher = MDXSearchTool()
        searcher.add(mdx_path)
        answer = searcher._run(search_query="test MDX")
        assert "test mdx" in answer.lower()
    finally:
        os.remove(mdx_path)
|
|
|
|
|
|
def test_website_search_tool(mock_adapter):
    """Check WebsiteSearchTool forwards the site URL and the query to the adapter.

    Two variants are exercised against the same mocked adapter: the website
    given to the constructor, and the website given to ``_run``.
    """
    mock_adapter.query.return_value = "this is a test"

    website = "https://crewai.com"
    search_query = "what is crewai?"

    def verify(output):
        # Same assertion order for both variants: query(), add(), result.
        mock_adapter.query.assert_called_once_with(
            "what is crewai?", similarity_threshold=0.6, limit=5
        )
        mock_adapter.add.assert_called_once_with(website, data_type=DataType.WEBSITE)
        assert "this is a test" in output.lower()

    # Variant 1: website supplied at construction time.
    preconfigured = WebsiteSearchTool(website=website, adapter=mock_adapter)
    verify(preconfigured._run(search_query=search_query))

    # Clear recorded calls so the "called once" assertions start fresh.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Variant 2: website supplied at call time.
    unconfigured = WebsiteSearchTool(adapter=mock_adapter)
    verify(unconfigured._run(website=website, search_query=search_query))
|
|
|
|
|
|
def test_youtube_video_search_tool(mock_adapter):
    """Check YoutubeVideoSearchTool forwards the video URL and query to the adapter.

    Two variants are exercised against the same mocked adapter: the URL given
    to the constructor, and the URL given to ``_run``.
    """
    mock_adapter.query.return_value = "some video description"

    youtube_video_url = "https://www.youtube.com/watch?v=sample-video-id"
    search_query = "what is the video about?"

    def verify(output):
        # Same assertion order for both variants: result, add(), query().
        assert "some video description" in output
        mock_adapter.add.assert_called_once_with(
            youtube_video_url, data_type=DataType.YOUTUBE_VIDEO
        )
        mock_adapter.query.assert_called_once_with(
            search_query, similarity_threshold=0.6, limit=5
        )

    # Variant 1: video URL supplied at construction time.
    preconfigured = YoutubeVideoSearchTool(
        youtube_video_url=youtube_video_url,
        adapter=mock_adapter,
    )
    verify(preconfigured._run(search_query=search_query))

    # Clear recorded calls so the "called once" assertions start fresh.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Variant 2: video URL supplied at call time.
    unconfigured = YoutubeVideoSearchTool(adapter=mock_adapter)
    verify(
        unconfigured._run(
            youtube_video_url=youtube_video_url, search_query=search_query
        )
    )
|
|
|
|
|
|
def test_youtube_channel_search_tool(mock_adapter):
    """Check YoutubeChannelSearchTool forwards the handle and query to the adapter.

    Two variants are exercised against the same mocked adapter: the channel
    handle given to the constructor, and the handle given to ``_run``.
    """
    mock_adapter.query.return_value = "channel description"

    youtube_channel_handle = "@crewai"
    search_query = "what is the channel about?"

    def verify(output):
        # Same assertion order for both variants: result, add(), query().
        assert "channel description" in output
        mock_adapter.add.assert_called_once_with(
            youtube_channel_handle, data_type=DataType.YOUTUBE_CHANNEL
        )
        mock_adapter.query.assert_called_once_with(
            search_query, similarity_threshold=0.6, limit=5
        )

    # Variant 1: channel handle supplied at construction time.
    preconfigured = YoutubeChannelSearchTool(
        youtube_channel_handle=youtube_channel_handle, adapter=mock_adapter
    )
    verify(preconfigured._run(search_query=search_query))

    # Clear recorded calls so the "called once" assertions start fresh.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Variant 2: channel handle supplied at call time.
    unconfigured = YoutubeChannelSearchTool(adapter=mock_adapter)
    verify(
        unconfigured._run(
            youtube_channel_handle=youtube_channel_handle, search_query=search_query
        )
    )
|
|
|
|
|
|
def test_code_docs_search_tool(mock_adapter):
    """Check CodeDocsSearchTool forwards the docs URL and query to the adapter.

    Two variants are exercised against the same mocked adapter: the docs URL
    given to the constructor, and the docs URL given to ``_run``.
    """
    mock_adapter.query.return_value = "test documentation"

    docs_url = "https://crewai.com/any-docs-url"
    search_query = "test documentation"

    def verify(output):
        # Same assertion order for both variants: result, add(), query().
        assert "test documentation" in output
        mock_adapter.add.assert_called_once_with(
            docs_url, data_type=DataType.DOCS_SITE
        )
        mock_adapter.query.assert_called_once_with(
            search_query, similarity_threshold=0.6, limit=5
        )

    # Variant 1: docs URL supplied at construction time.
    preconfigured = CodeDocsSearchTool(docs_url=docs_url, adapter=mock_adapter)
    verify(preconfigured._run(search_query=search_query))

    # Clear recorded calls so the "called once" assertions start fresh.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Variant 2: docs URL supplied at call time.
    unconfigured = CodeDocsSearchTool(adapter=mock_adapter)
    verify(unconfigured._run(docs_url=docs_url, search_query=search_query))
|
|
|
|
|
|
def test_github_search_tool(mock_adapter):
    """Check how GithubSearchTool passes repo, content types and query on.

    Four scenarios run against the same mocked adapter, with recorded calls
    cleared between them:

    1. repo and content types given to the constructor;
    2. repo and content types given to ``_run``;
    3. repo given to ``_run`` without content types (defaults expected);
    4. no repo at all (``add`` must not be called).
    """
    mock_adapter.query.return_value = "repo description"

    def expect_query():
        mock_adapter.query.assert_called_once_with(
            "tell me about crewai repo", similarity_threshold=0.6, limit=5
        )

    def expect_repo_added(content_types):
        mock_adapter.add.assert_called_once_with(
            "https://github.com/crewai/crewai",
            data_type=DataType.GITHUB,
            metadata={"content_types": content_types, "gh_token": "test_token"},
        )

    def clear_recorded_calls():
        mock_adapter.query.reset_mock()
        mock_adapter.add.reset_mock()

    # ensure the provided repo and content types are used after initialization
    tool = GithubSearchTool(
        gh_token="test_token",
        github_repo="crewai/crewai",
        content_types=["code"],
        adapter=mock_adapter,
    )
    result = tool._run(search_query="tell me about crewai repo")
    assert "repo description" in result
    expect_repo_added(["code"])
    expect_query()

    # ensure content types provided by run call is used
    clear_recorded_calls()

    tool = GithubSearchTool(gh_token="test_token", adapter=mock_adapter)
    result = tool._run(
        github_repo="crewai/crewai",
        content_types=["code", "issue"],
        search_query="tell me about crewai repo",
    )
    assert "repo description" in result
    expect_repo_added(["code", "issue"])
    expect_query()

    # ensure default content types are used if not provided
    clear_recorded_calls()

    tool = GithubSearchTool(gh_token="test_token", adapter=mock_adapter)
    result = tool._run(
        github_repo="crewai/crewai",
        search_query="tell me about crewai repo",
    )
    assert "repo description" in result
    expect_repo_added(["code", "repo", "pr", "issue"])
    expect_query()

    # ensure nothing is added if no repo is provided
    clear_recorded_calls()

    tool = GithubSearchTool(gh_token="test_token", adapter=mock_adapter)
    # The return value is not inspected in this scenario, so it is not bound.
    tool._run(search_query="tell me about crewai repo")
    mock_adapter.add.assert_not_called()
    expect_query()
|