mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-04-14 23:12:37 +00:00
* fix: add path and URL validation to RAG tools Add validation utilities to prevent unauthorized file reads and SSRF when RAG tools accept LLM-controlled paths/URLs at runtime. Changes: - New crewai_tools.utilities.safe_path module with validate_file_path(), validate_directory_path(), and validate_url() - File paths validated against base directory (defaults to cwd). Resolves symlinks and ../ traversal. Rejects escape attempts. - URLs validated: file:// blocked entirely. HTTP/HTTPS resolves DNS and blocks private/reserved IPs (10.x, 172.16-31.x, 192.168.x, 127.x, 169.254.x, 0.0.0.0, ::1, fc00::/7). - Validation applied in RagTool.add() — catches all RAG search tools (JSON, CSV, PDF, TXT, DOCX, MDX, Directory, etc.) - Removed file:// scheme support from DataTypes.from_content() - CREWAI_TOOLS_ALLOW_UNSAFE_PATHS=true env var for backward compat - 27 tests covering traversal, symlinks, private IPs, cloud metadata, IPv6, escape hatch, and valid paths/URLs * fix: validate path/URL keyword args in RagTool.add() The original patch validated positional *args but left all keyword arguments (path=, file_path=, directory_path=, url=, website=, github_url=, youtube_url=) unvalidated, providing a trivial bypass for both path-traversal and SSRF checks. Applies validate_file_path() to path/file_path/directory_path kwargs and validate_url() to url/website/github_url/youtube_url kwargs before they reach the adapter. Adds a regression-test file covering all eight kwarg vectors plus the two existing positional-arg checks. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: address CodeQL and review comments on RAG path/URL validation - Replace insecure tempfile.mktemp() with inline symlink target in test - Remove unused 'target' variable and unused tempfile import - Narrow broad except Exception: pass to only catch urlparse errors; validate_url ValueError now propagates instead of being silently swallowed - Fix ruff B904 (raise-without-from-inside-except) in safe_path.py - Fix ruff B007 (unused loop variable 'family') in safe_path.py - Use validate_directory_path in DirectorySearchTool.add() so the public utility is exercised in production code Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * style: fix ruff format + remaining lint issues * fix: resolve mypy type errors in RAG path/URL validation - Cast sockaddr[0] to str() to satisfy mypy (socket.getaddrinfo returns sockaddr where [0] is str but typed as str | int) - Remove now-unnecessary `type: ignore[assignment]` and `type: ignore[literal-required]` comments in rag_tool.py Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: unroll dynamic TypedDict key loops to satisfy mypy literal-required Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * test: allow tmp paths in RAG data-type tests via CREWAI_TOOLS_ALLOW_UNSAFE_PATHS TemporaryDirectory creates files under /tmp/ which is outside CWD and is correctly blocked by the new path validation. These tests exercise data-type handling, not security, so add an autouse fixture that sets CREWAI_TOOLS_ALLOW_UNSAFE_PATHS=true for the whole file. Path/URL security is covered by test_rag_tool_path_validation.py. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * test: allow tmp paths in search-tool and rag_tool tests via CREWAI_TOOLS_ALLOW_UNSAFE_PATHS test_search_tools.py has tests for TXTSearchTool, CSVSearchTool, MDXSearchTool, JSONSearchTool, and DirectorySearchTool that create files under /tmp/ via tempfile, which is outside CWD and correctly blocked by the new path validation. rag_tool_test.py has one test that calls tool.add() with a TemporaryDirectory path. Add the same autouse allow_tmp_paths fixture used in test_rag_tool_add_data_type.py. Security is covered separately by test_rag_tool_path_validation.py. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * chore: update tool specifications * docs: document CodeInterpreterTool removal and RAG path/URL validation Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: address three review comments on path/URL validation - safe_path._is_private_or_reserved: after unwrapping IPv4-mapped IPv6 to IPv4, only check against IPv4 networks to avoid TypeError when comparing an IPv4Address against IPv6Network objects. - safe_path.validate_file_path: handle filesystem-root base_dir ('/') by not appending os.sep when the base already ends with a separator, preventing the '//'-prefix bug. - rag_tool.add: path-detection heuristic now checks for both '/' and os.sep so forward-slash paths are caught on Windows as well as Unix. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: remove unused _BLOCKED_NETWORKS variable after IPv4/IPv6 split * chore: update tool specifications --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
313 lines
10 KiB
Python
313 lines
10 KiB
Python
from pathlib import Path
|
|
from tempfile import TemporaryDirectory
|
|
from typing import cast
|
|
from unittest.mock import MagicMock, Mock, patch
|
|
|
|
import pytest
|
|
|
|
from crewai_tools.adapters.crewai_rag_adapter import CrewAIRagAdapter
|
|
from crewai_tools.tools.rag.rag_tool import RagTool
|
|
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def allow_tmp_paths(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
"""Allow absolute paths outside CWD (e.g. /tmp/) for these RagTool tests.
|
|
|
|
Path validation is tested separately in test_rag_tool_path_validation.py.
|
|
"""
|
|
monkeypatch.setenv("CREWAI_TOOLS_ALLOW_UNSAFE_PATHS", "true")
|
|
|
|
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.get_rag_client")
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.create_client")
|
|
def test_rag_tool_initialization(
|
|
mock_create_client: Mock, mock_get_rag_client: Mock
|
|
) -> None:
|
|
"""Test that RagTool initializes with CrewAI adapter by default."""
|
|
mock_client = MagicMock()
|
|
mock_client.get_or_create_collection = MagicMock(return_value=None)
|
|
mock_get_rag_client.return_value = mock_client
|
|
mock_create_client.return_value = mock_client
|
|
|
|
class MyTool(RagTool):
|
|
pass
|
|
|
|
tool = MyTool()
|
|
assert tool.adapter is not None
|
|
assert isinstance(tool.adapter, CrewAIRagAdapter)
|
|
|
|
adapter = cast(CrewAIRagAdapter, tool.adapter)
|
|
assert adapter.collection_name == "rag_tool_collection"
|
|
assert adapter._client is not None
|
|
|
|
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.get_rag_client")
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.create_client")
|
|
def test_rag_tool_add_and_query(
|
|
mock_create_client: Mock, mock_get_rag_client: Mock
|
|
) -> None:
|
|
"""Test adding content and querying with RagTool."""
|
|
mock_client = MagicMock()
|
|
mock_client.get_or_create_collection = MagicMock(return_value=None)
|
|
mock_client.add_documents = MagicMock(return_value=None)
|
|
mock_client.search = MagicMock(
|
|
return_value=[
|
|
{"content": "The sky is blue on a clear day.", "metadata": {}, "score": 0.9}
|
|
]
|
|
)
|
|
mock_get_rag_client.return_value = mock_client
|
|
mock_create_client.return_value = mock_client
|
|
|
|
class MyTool(RagTool):
|
|
pass
|
|
|
|
tool = MyTool()
|
|
|
|
tool.add("The sky is blue on a clear day.")
|
|
tool.add("Machine learning is a subset of artificial intelligence.")
|
|
|
|
# Verify documents were added
|
|
assert mock_client.add_documents.call_count == 2
|
|
|
|
result = tool._run(query="What color is the sky?")
|
|
assert "Relevant Content:" in result
|
|
assert "The sky is blue" in result
|
|
|
|
mock_client.search.return_value = [
|
|
{
|
|
"content": "Machine learning is a subset of artificial intelligence.",
|
|
"metadata": {},
|
|
"score": 0.85,
|
|
}
|
|
]
|
|
|
|
result = tool._run(query="Tell me about machine learning")
|
|
assert "Relevant Content:" in result
|
|
assert "Machine learning" in result
|
|
|
|
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.get_rag_client")
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.create_client")
|
|
def test_rag_tool_with_file(
|
|
mock_create_client: Mock, mock_get_rag_client: Mock
|
|
) -> None:
|
|
"""Test RagTool with file content."""
|
|
mock_client = MagicMock()
|
|
mock_client.get_or_create_collection = MagicMock(return_value=None)
|
|
mock_client.add_documents = MagicMock(return_value=None)
|
|
mock_client.search = MagicMock(
|
|
return_value=[
|
|
{
|
|
"content": "Python is a programming language known for its simplicity.",
|
|
"metadata": {"file_path": "test.txt"},
|
|
"score": 0.95,
|
|
}
|
|
]
|
|
)
|
|
mock_get_rag_client.return_value = mock_client
|
|
mock_create_client.return_value = mock_client
|
|
|
|
with TemporaryDirectory() as tmpdir:
|
|
test_file = Path(tmpdir) / "test.txt"
|
|
test_file.write_text(
|
|
"Python is a programming language known for its simplicity."
|
|
)
|
|
|
|
class MyTool(RagTool):
|
|
pass
|
|
|
|
tool = MyTool()
|
|
tool.add(str(test_file))
|
|
|
|
assert mock_client.add_documents.called
|
|
|
|
result = tool._run(query="What is Python?")
|
|
assert "Relevant Content:" in result
|
|
assert "Python is a programming language" in result
|
|
|
|
|
|
@patch("crewai_tools.tools.rag.rag_tool.build_embedder")
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.create_client")
|
|
def test_rag_tool_with_custom_embeddings(
|
|
mock_create_client: Mock, mock_build_embedder: Mock
|
|
) -> None:
|
|
"""Test RagTool with custom embeddings configuration to ensure no API calls."""
|
|
mock_embedding_func = MagicMock()
|
|
mock_embedding_func.return_value = [[0.2] * 1536]
|
|
mock_build_embedder.return_value = mock_embedding_func
|
|
|
|
mock_client = MagicMock()
|
|
mock_client.get_or_create_collection = MagicMock(return_value=None)
|
|
mock_client.add_documents = MagicMock(return_value=None)
|
|
mock_client.search = MagicMock(
|
|
return_value=[{"content": "Test content", "metadata": {}, "score": 0.8}]
|
|
)
|
|
mock_create_client.return_value = mock_client
|
|
|
|
class MyTool(RagTool):
|
|
pass
|
|
|
|
config = {
|
|
"vectordb": {"provider": "chromadb", "config": {}},
|
|
"embedding_model": {
|
|
"provider": "openai",
|
|
"config": {"model": "text-embedding-3-small"},
|
|
},
|
|
}
|
|
|
|
tool = MyTool(config=config)
|
|
tool.add("Test content")
|
|
|
|
result = tool._run(query="Test query")
|
|
assert "Relevant Content:" in result
|
|
assert "Test content" in result
|
|
|
|
mock_build_embedder.assert_called()
|
|
|
|
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.get_rag_client")
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.create_client")
|
|
def test_rag_tool_no_results(
|
|
mock_create_client: Mock, mock_get_rag_client: Mock
|
|
) -> None:
|
|
"""Test RagTool when no relevant content is found."""
|
|
mock_client = MagicMock()
|
|
mock_client.get_or_create_collection = MagicMock(return_value=None)
|
|
mock_client.search = MagicMock(return_value=[])
|
|
mock_get_rag_client.return_value = mock_client
|
|
mock_create_client.return_value = mock_client
|
|
|
|
class MyTool(RagTool):
|
|
pass
|
|
|
|
tool = MyTool()
|
|
|
|
result = tool._run(query="Non-existent content")
|
|
assert "Relevant Content:" in result
|
|
assert "No relevant content found" in result
|
|
|
|
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.create_client")
|
|
def test_rag_tool_with_azure_config_without_env_vars(
|
|
mock_create_client: Mock,
|
|
) -> None:
|
|
"""Test that RagTool accepts Azure config without requiring env vars.
|
|
|
|
This test verifies the fix for the issue where RAG tools were ignoring
|
|
the embedding configuration passed via the config parameter and instead
|
|
requiring environment variables like EMBEDDINGS_OPENAI_API_KEY.
|
|
"""
|
|
mock_embedding_func = MagicMock()
|
|
mock_embedding_func.return_value = [[0.1] * 1536]
|
|
|
|
mock_client = MagicMock()
|
|
mock_client.get_or_create_collection = MagicMock(return_value=None)
|
|
mock_client.add_documents = MagicMock(return_value=None)
|
|
mock_create_client.return_value = mock_client
|
|
|
|
# Patch the embedding function builder to avoid actual API calls
|
|
with patch(
|
|
"crewai_tools.tools.rag.rag_tool.build_embedder",
|
|
return_value=mock_embedding_func,
|
|
):
|
|
|
|
class MyTool(RagTool):
|
|
pass
|
|
|
|
# Configuration with explicit Azure credentials - should work without env vars
|
|
config = {
|
|
"embedding_model": {
|
|
"provider": "azure",
|
|
"config": {
|
|
"model": "text-embedding-3-small",
|
|
"api_key": "test-api-key",
|
|
"api_base": "https://test.openai.azure.com/",
|
|
"api_version": "2024-02-01",
|
|
"api_type": "azure",
|
|
"deployment_id": "test-deployment",
|
|
},
|
|
}
|
|
}
|
|
|
|
# This should not raise a validation error about missing env vars
|
|
tool = MyTool(config=config)
|
|
|
|
assert tool.adapter is not None
|
|
assert isinstance(tool.adapter, CrewAIRagAdapter)
|
|
|
|
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.create_client")
|
|
def test_rag_tool_with_openai_config_without_env_vars(
|
|
mock_create_client: Mock,
|
|
) -> None:
|
|
"""Test that RagTool accepts OpenAI config without requiring env vars."""
|
|
mock_embedding_func = MagicMock()
|
|
mock_embedding_func.return_value = [[0.1] * 1536]
|
|
|
|
mock_client = MagicMock()
|
|
mock_client.get_or_create_collection = MagicMock(return_value=None)
|
|
mock_create_client.return_value = mock_client
|
|
|
|
with patch(
|
|
"crewai_tools.tools.rag.rag_tool.build_embedder",
|
|
return_value=mock_embedding_func,
|
|
):
|
|
|
|
class MyTool(RagTool):
|
|
pass
|
|
|
|
config = {
|
|
"embedding_model": {
|
|
"provider": "openai",
|
|
"config": {
|
|
"model": "text-embedding-3-small",
|
|
"api_key": "sk-test123",
|
|
},
|
|
}
|
|
}
|
|
|
|
tool = MyTool(config=config)
|
|
|
|
assert tool.adapter is not None
|
|
assert isinstance(tool.adapter, CrewAIRagAdapter)
|
|
|
|
|
|
@patch("crewai_tools.adapters.crewai_rag_adapter.create_client")
|
|
def test_rag_tool_config_with_qdrant_and_azure_embeddings(
|
|
mock_create_client: Mock,
|
|
) -> None:
|
|
"""Test RagTool with Qdrant vector DB and Azure embeddings config."""
|
|
mock_embedding_func = MagicMock()
|
|
mock_embedding_func.return_value = [[0.1] * 1536]
|
|
|
|
mock_client = MagicMock()
|
|
mock_client.get_or_create_collection = MagicMock(return_value=None)
|
|
mock_create_client.return_value = mock_client
|
|
|
|
with patch(
|
|
"crewai_tools.tools.rag.rag_tool.build_embedder",
|
|
return_value=mock_embedding_func,
|
|
):
|
|
|
|
class MyTool(RagTool):
|
|
pass
|
|
|
|
config = {
|
|
"vectordb": {"provider": "qdrant", "config": {}},
|
|
"embedding_model": {
|
|
"provider": "azure",
|
|
"config": {
|
|
"model": "text-embedding-3-large",
|
|
"api_key": "test-key",
|
|
"api_base": "https://test.openai.azure.com/",
|
|
"api_version": "2024-02-01",
|
|
"deployment_id": "test-deployment",
|
|
},
|
|
},
|
|
}
|
|
|
|
tool = MyTool(config=config)
|
|
|
|
assert tool.adapter is not None
|
|
assert isinstance(tool.adapter, CrewAIRagAdapter)
|