diff --git a/lib/crewai-tools/src/crewai_tools/rag/loaders/docs_site_loader.py b/lib/crewai-tools/src/crewai_tools/rag/loaders/docs_site_loader.py index 87b97266e..3f60bc35f 100644 --- a/lib/crewai-tools/src/crewai_tools/rag/loaders/docs_site_loader.py +++ b/lib/crewai-tools/src/crewai_tools/rag/loaders/docs_site_loader.py @@ -8,6 +8,7 @@ import requests from crewai_tools.rag.base_loader import BaseLoader, LoaderResult from crewai_tools.rag.source_content import SourceContent +from crewai_tools.security.safe_requests import safe_get class DocsSiteLoader(BaseLoader): @@ -26,7 +27,7 @@ class DocsSiteLoader(BaseLoader): docs_url = source.source try: - response = requests.get(docs_url, timeout=30) + response = safe_get(docs_url, timeout=30) response.raise_for_status() except requests.RequestException as e: raise ValueError( diff --git a/lib/crewai-tools/src/crewai_tools/rag/loaders/docx_loader.py b/lib/crewai-tools/src/crewai_tools/rag/loaders/docx_loader.py index 201bc12c2..df736e348 100644 --- a/lib/crewai-tools/src/crewai_tools/rag/loaders/docx_loader.py +++ b/lib/crewai-tools/src/crewai_tools/rag/loaders/docx_loader.py @@ -2,10 +2,9 @@ import os import tempfile from typing import Any -import requests - from crewai_tools.rag.base_loader import BaseLoader, LoaderResult from crewai_tools.rag.source_content import SourceContent +from crewai_tools.security.safe_requests import safe_get class DOCXLoader(BaseLoader): @@ -43,7 +42,7 @@ class DOCXLoader(BaseLoader): ) try: - response = requests.get(url, headers=headers, timeout=30) + response = safe_get(url, headers=headers, timeout=30) response.raise_for_status() # Create temporary file to save the DOCX content diff --git a/lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py b/lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py index 774de92d9..259181481 100644 --- a/lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py +++ b/lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py @@ -6,10 +6,9 @@ import tempfile from typing import Any from urllib.parse import urlparse -import requests - from crewai_tools.rag.base_loader import BaseLoader, LoaderResult from crewai_tools.rag.source_content import SourceContent +from crewai_tools.security.safe_requests import safe_get class PDFLoader(BaseLoader): @@ -47,7 +46,7 @@ class PDFLoader(BaseLoader): ) try: - response = requests.get(url, headers=headers, timeout=30) + response = safe_get(url, headers=headers, timeout=30) response.raise_for_status() with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file: diff --git a/lib/crewai-tools/src/crewai_tools/rag/loaders/utils.py b/lib/crewai-tools/src/crewai_tools/rag/loaders/utils.py index f944070c8..f239a0a05 100644 --- a/lib/crewai-tools/src/crewai_tools/rag/loaders/utils.py +++ b/lib/crewai-tools/src/crewai_tools/rag/loaders/utils.py @@ -23,7 +23,7 @@ def load_from_url( Raises: ValueError: If there's an error fetching the URL """ - import requests + from crewai_tools.security.safe_requests import safe_get headers = kwargs.get( "headers", @@ -34,7 +34,7 @@ def load_from_url( ) try: - response = requests.get(url, headers=headers, timeout=30) + response = safe_get(url, headers=headers, timeout=30) response.raise_for_status() return response.text except Exception as e: diff --git a/lib/crewai-tools/src/crewai_tools/rag/loaders/webpage_loader.py b/lib/crewai-tools/src/crewai_tools/rag/loaders/webpage_loader.py index 5d9a2d180..a22c09e66 100644 --- a/lib/crewai-tools/src/crewai_tools/rag/loaders/webpage_loader.py +++ b/lib/crewai-tools/src/crewai_tools/rag/loaders/webpage_loader.py @@ -2,10 +2,10 @@ import re from typing import Any, Final from bs4 import BeautifulSoup -import requests from crewai_tools.rag.base_loader import BaseLoader, LoaderResult from crewai_tools.rag.source_content import SourceContent +from crewai_tools.security.safe_requests import safe_get _SPACES_PATTERN: Final[re.Pattern[str]] = re.compile(r"[ \t]+") @@ -25,7 +25,7 @@ class WebPageLoader(BaseLoader): ) try: - response = requests.get(url, timeout=15, headers=headers) + response = safe_get(url, timeout=15, headers=headers) response.encoding = response.apparent_encoding soup = BeautifulSoup(response.text, "html.parser") diff --git a/lib/crewai-tools/src/crewai_tools/security/safe_requests.py b/lib/crewai-tools/src/crewai_tools/security/safe_requests.py new file mode 100644 index 000000000..8106867df --- /dev/null +++ b/lib/crewai-tools/src/crewai_tools/security/safe_requests.py @@ -0,0 +1,49 @@ +"""HTTP helpers that preserve crewai-tools URL safety checks.""" + +from __future__ import annotations + +from typing import Any +from urllib.parse import urljoin + +import requests + +from crewai_tools.security.safe_path import validate_url + + +_REDIRECT_STATUS_CODES = {301, 302, 303, 307, 308} + + +def safe_get(url: str, *, max_redirects: int = 10, **kwargs: Any) -> requests.Response: + """GET a URL while validating each redirect target before following it.""" + current_url = validate_url(url) + request_kwargs = {**kwargs, "allow_redirects": False} + timeout = request_kwargs.pop("timeout", 30) + history: list[requests.Response] = [] + redirects_followed = 0 + + while True: + response = requests.get(current_url, timeout=timeout, **request_kwargs) + if ( + response.status_code not in _REDIRECT_STATUS_CODES + or "Location" not in response.headers + ): + response.history = history + return response + + if redirects_followed >= max_redirects: + response.close() + raise ValueError(f"Too many redirects while fetching URL: {url}") + + location = response.headers.get("Location") + if not location: + response.history = history + return response + + try: + redirect_url = validate_url(urljoin(response.url, location)) + except ValueError: + response.close() + raise + history.append(response) + current_url = redirect_url + redirects_followed += 1 diff --git a/lib/crewai-tools/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py b/lib/crewai-tools/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py index 7bba12b72..535bbf9d5 100644 --- a/lib/crewai-tools/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py +++ b/lib/crewai-tools/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py @@ -3,9 +3,8 @@ from typing import Any from crewai.tools import BaseTool from pydantic import BaseModel, Field -import requests -from crewai_tools.security.safe_path import validate_url +from crewai_tools.security.safe_requests import safe_get try: @@ -83,8 +82,7 @@ class ScrapeElementFromWebsiteTool(BaseTool): if website_url is None or css_element is None: raise ValueError("Both website_url and css_element must be provided.") - website_url = validate_url(website_url) - page = requests.get( + page = safe_get( website_url, headers=self.headers, cookies=self.cookies if self.cookies else {}, diff --git a/lib/crewai-tools/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py b/lib/crewai-tools/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py index d297dfe08..62bd7b0be 100644 --- a/lib/crewai-tools/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py +++ b/lib/crewai-tools/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py @@ -3,9 +3,8 @@ import re from typing import Any from pydantic import Field -import requests -from crewai_tools.security.safe_path import validate_url +from crewai_tools.security.safe_requests import safe_get try: @@ -75,8 +74,7 @@ class ScrapeWebsiteTool(BaseTool): if website_url is None: raise ValueError("Website URL must be provided.") - website_url = validate_url(website_url) - page = requests.get( + page = safe_get( website_url, timeout=15, headers=self.headers, diff --git a/lib/crewai-tools/tests/rag/conftest.py b/lib/crewai-tools/tests/rag/conftest.py new file mode 100644 index 000000000..a0301ae87 --- /dev/null +++ b/lib/crewai-tools/tests/rag/conftest.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +import socket +from typing import Any + +import pytest + + +@pytest.fixture(autouse=True) +def public_example_dns(monkeypatch: pytest.MonkeyPatch) -> None: + original_getaddrinfo = socket.getaddrinfo + + def fake_getaddrinfo( + host: str, port: int, *args: Any, **kwargs: Any + ) -> list[tuple[Any, ...]]: + if host in {"example.com", "api.example.com"}: + return [ + ( + socket.AF_INET, + socket.SOCK_STREAM, + 6, + "", + ("93.184.216.34", port), + ) + ] + return original_getaddrinfo(host, port, *args, **kwargs) + + monkeypatch.setattr(socket, "getaddrinfo", fake_getaddrinfo) diff --git a/lib/crewai-tools/tests/utilities/test_safe_requests.py b/lib/crewai-tools/tests/utilities/test_safe_requests.py new file mode 100644 index 000000000..0f3e9ffa3 --- /dev/null +++ b/lib/crewai-tools/tests/utilities/test_safe_requests.py @@ -0,0 +1,114 @@ +"""Tests for redirect-aware safe HTTP helpers.""" + +from __future__ import annotations + +import socket +from io import BytesIO +from typing import Any + +import pytest +import requests + +from crewai_tools.security.safe_requests import safe_get + + +def _response(url: str, status_code: int, *, location: str | None = None) -> requests.Response: + response = requests.Response() + response.status_code = status_code + response.url = url + response._content = b"ok" + response.raw = BytesIO() + if location is not None: + response.headers["Location"] = location + return response + + +@pytest.fixture +def public_dns(monkeypatch: pytest.MonkeyPatch) -> None: + original_getaddrinfo = socket.getaddrinfo + + def fake_getaddrinfo( + host: str, port: int, *args: Any, **kwargs: Any + ) -> list[tuple[Any, ...]]: + if host in {"public.example", "safe.example"}: + return [ + ( + socket.AF_INET, + socket.SOCK_STREAM, + 6, + "", + ("93.184.216.34", port), + ) + ] + return original_getaddrinfo(host, port, *args, **kwargs) + + monkeypatch.setattr(socket, "getaddrinfo", fake_getaddrinfo) + + +def test_safe_get_blocks_direct_internal_url() -> None: + with pytest.raises(ValueError, match="private/reserved IP"): + safe_get("http://127.0.0.1/admin", timeout=15) + + +def test_safe_get_blocks_redirect_to_internal_url( + monkeypatch: pytest.MonkeyPatch, public_dns: None +) -> None: + requested_urls: list[str] = [] + + def fake_get(url: str, **kwargs: Any) -> requests.Response: + requested_urls.append(url) + assert kwargs["allow_redirects"] is False + return _response(url, 302, location="http://127.0.0.1/admin") + + monkeypatch.setattr( + "crewai_tools.security.safe_requests.requests.get", + fake_get, + ) + + with pytest.raises(ValueError, match="private/reserved IP"): + safe_get("http://public.example/start", timeout=15) + + assert requested_urls == ["http://public.example/start"] + + +def test_safe_get_follows_safe_relative_redirect( + monkeypatch: pytest.MonkeyPatch, public_dns: None +) -> None: + requested_urls: list[str] = [] + + def fake_get(url: str, **kwargs: Any) -> requests.Response: + requested_urls.append(url) + assert kwargs["allow_redirects"] is False + if url == "http://public.example/start": + return _response(url, 302, location="/final") + return _response(url, 200) + + monkeypatch.setattr( + "crewai_tools.security.safe_requests.requests.get", + fake_get, + ) + + response = safe_get("http://public.example/start", timeout=15) + + assert response.status_code == 200 + assert response.url == "http://public.example/final" + assert requested_urls == [ + "http://public.example/start", + "http://public.example/final", + ] + assert len(response.history) == 1 + + +def test_safe_get_fails_closed_after_too_many_redirects( + monkeypatch: pytest.MonkeyPatch, public_dns: None +) -> None: + def fake_get(url: str, **kwargs: Any) -> requests.Response: + return _response(url, 302, location="http://safe.example/again") + + monkeypatch.setattr( + "crewai_tools.security.safe_requests.requests.get", + fake_get, + ) + + with pytest.raises(ValueError, match="Too many redirects"): + safe_get("http://public.example/start", max_redirects=1, timeout=15)