Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-08 23:58:34 +00:00)
Add Oxylabs Web Scraping tools (#312)
* Add Oxylabs tools
* Review updates
* Add package_dependencies attribute
Committed by GitHub · parent c13b08de2e · commit 78a062a907
@@ -37,6 +37,10 @@ from .tools import (
    MultiOnTool,
    MySQLSearchTool,
    NL2SQLTool,
    OxylabsUniversalScraperTool,
    OxylabsGoogleSearchScraperTool,
    OxylabsAmazonProductScraperTool,
    OxylabsAmazonSearchScraperTool,
    PatronusEvalTool,
    PatronusLocalEvaluatorTool,
    PatronusPredefinedCriteriaEvalTool,
@@ -32,6 +32,18 @@ from .mdx_search_tool.mdx_search_tool import MDXSearchTool
from .multion_tool.multion_tool import MultiOnTool
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
from .nl2sql.nl2sql_tool import NL2SQLTool
from .oxylabs_universal_scraper_tool.oxylabs_universal_scraper_tool import (
    OxylabsUniversalScraperTool,
)
from .oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
    OxylabsGoogleSearchScraperTool,
)
from .oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
    OxylabsAmazonProductScraperTool,
)
from .oxylabs_amazon_search_scraper_tool.oxylabs_amazon_search_scraper_tool import (
    OxylabsAmazonSearchScraperTool,
)
from .patronus_eval_tool import (
    PatronusEvalTool,
    PatronusLocalEvaluatorTool,
@@ -0,0 +1,55 @@
# OxylabsAmazonProductScraperTool

Scrape Amazon product pages with `OxylabsAmazonProductScraperTool`.

## Installation

```
pip install 'crewai[tools]' oxylabs
```

## Example

```python
from crewai_tools import OxylabsAmazonProductScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonProductScraperTool()

result = tool.run(query="AAAAABBBBCC")

print(result)
```

## Arguments

- `username`: Oxylabs username.
- `password`: Oxylabs password.

Get the credentials by creating an Oxylabs account [here](https://oxylabs.io).

## Advanced example

Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/product) to get the full list of parameters.

```python
from crewai_tools import OxylabsAmazonProductScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonProductScraperTool(
    config={
        "domain": "com",
        "parse": True,
        "context": [
            {
                "key": "autoselect_variant",
                "value": True
            }
        ]
    }
)

result = tool.run(query="AAAAABBBBCC")

print(result)
```
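## Passing credentials explicitly

If you prefer not to rely on environment variables, the same credentials can be passed straight to the constructor. This is a minimal sketch based on the tool's `__init__` signature; the placeholder values are illustrative only.

```python
from crewai_tools import OxylabsAmazonProductScraperTool

# placeholder credentials - replace with your own Oxylabs API credentials
tool = OxylabsAmazonProductScraperTool(
    username="your-oxylabs-username",
    password="your-oxylabs-password",
)

result = tool.run(query="AAAAABBBBCC")
print(result)
```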
@@ -0,0 +1,151 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False


__all__ = ["OxylabsAmazonProductScraperTool", "OxylabsAmazonProductScraperConfig"]


class OxylabsAmazonProductScraperArgs(BaseModel):
    query: str = Field(description="Amazon product ASIN")


class OxylabsAmazonProductScraperConfig(BaseModel):
    """
    Amazon Product Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/product
    """

    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )


class OxylabsAmazonProductScraperTool(BaseTool):
    """
    Scrape Amazon product pages with OxylabsAmazonProductScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsAmazonProductScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Amazon Product Scraper tool"
    description: str = "Scrape Amazon product pages with Oxylabs Amazon Product Scraper"
    args_schema: Type[BaseModel] = OxylabsAmazonProductScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsAmazonProductScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsAmazonProductScraperConfig
        | dict = OxylabsAmazonProductScraperConfig(),
        **kwargs,
    ) -> None:
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str) -> str:
        response = self.oxylabs_api.amazon.scrape_product(
            query,
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
@@ -0,0 +1,54 @@
# OxylabsAmazonSearchScraperTool

Scrape Amazon search results with `OxylabsAmazonSearchScraperTool`.

## Installation

```
pip install 'crewai[tools]' oxylabs
```

## Example

```python
from crewai_tools import OxylabsAmazonSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonSearchScraperTool()

result = tool.run(query="headsets")

print(result)
```

## Arguments

- `username`: Oxylabs username.
- `password`: Oxylabs password.

Get the credentials by creating an Oxylabs account [here](https://oxylabs.io).

## Advanced example

Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/search) to get the full list of parameters.

```python
from crewai_tools import OxylabsAmazonSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonSearchScraperTool(
    config={
        "domain": "nl",
        "start_page": 2,
        "pages": 2,
        "parse": True,
        "context": [
            {"key": "category_id", "value": 16391693031}
        ],
    }
)

result = tool.run(query="nirvana tshirt")

print(result)
```
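## Working with parsed results

When `parse` is enabled, the tool serializes the structured payload to a JSON string before returning it (its `_run` method calls `json.dumps` on dict content), so the result can be decoded with `json.loads`. A minimal sketch:

```python
import json

from crewai_tools import OxylabsAmazonSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonSearchScraperTool(config={"parse": True})

result = tool.run(query="headsets")

# with parse=True the structured results arrive as a JSON string
data = json.loads(result)
print(type(data))  # <class 'dict'>
```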
@@ -0,0 +1,153 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False


__all__ = ["OxylabsAmazonSearchScraperTool", "OxylabsAmazonSearchScraperConfig"]


class OxylabsAmazonSearchScraperArgs(BaseModel):
    query: str = Field(description="Amazon search term")


class OxylabsAmazonSearchScraperConfig(BaseModel):
    """
    Amazon Search Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/search
    """

    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    start_page: int | None = Field(None, description="The starting page number.")
    pages: int | None = Field(None, description="The number of pages to scrape.")
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )


class OxylabsAmazonSearchScraperTool(BaseTool):
    """
    Scrape Amazon search results with OxylabsAmazonSearchScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsAmazonSearchScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Amazon Search Scraper tool"
    description: str = "Scrape Amazon search results with Oxylabs Amazon Search Scraper"
    args_schema: Type[BaseModel] = OxylabsAmazonSearchScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsAmazonSearchScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsAmazonSearchScraperConfig
        | dict = OxylabsAmazonSearchScraperConfig(),
        **kwargs,
    ):
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str) -> str:
        response = self.oxylabs_api.amazon.scrape_search(
            query,
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
@@ -0,0 +1,50 @@
# OxylabsGoogleSearchScraperTool

Scrape Google Search results with `OxylabsGoogleSearchScraperTool`.

## Installation

```
pip install 'crewai[tools]' oxylabs
```

## Example

```python
from crewai_tools import OxylabsGoogleSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsGoogleSearchScraperTool()

result = tool.run(query="iPhone 16")

print(result)
```

## Arguments

- `username`: Oxylabs username.
- `password`: Oxylabs password.

Get the credentials by creating an Oxylabs account [here](https://oxylabs.io).

## Advanced example

Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/google/search/search) to get the full list of parameters.

```python
from crewai_tools import OxylabsGoogleSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsGoogleSearchScraperTool(
    config={
        "parse": True,
        "geo_location": "Paris, France",
        "user_agent_type": "tablet",
    }
)

result = tool.run(query="iPhone 16")

print(result)
```
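## Using the tool with an agent

Like any other crewAI tool, the scraper can be handed to an agent. The sketch below uses the standard `Agent`/`Task`/`Crew` setup from `crewai`; the role, goal, and task wording are illustrative only.

```python
from crewai import Agent, Crew, Task
from crewai_tools import OxylabsGoogleSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
search_tool = OxylabsGoogleSearchScraperTool()

researcher = Agent(
    role="Market Researcher",
    goal="Summarize Google Search results for a given query",
    backstory="An analyst who relies on fresh search data.",
    tools=[search_tool],
)

task = Task(
    description="Search for 'iPhone 16' and summarize the top results.",
    expected_output="A short summary of the top search results.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())
```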
@@ -0,0 +1,156 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False


__all__ = ["OxylabsGoogleSearchScraperTool", "OxylabsGoogleSearchScraperConfig"]


class OxylabsGoogleSearchScraperArgs(BaseModel):
    query: str = Field(description="Search query")


class OxylabsGoogleSearchScraperConfig(BaseModel):
    """
    Google Search Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/google/search/search
    """

    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    start_page: int | None = Field(None, description="The starting page number.")
    pages: int | None = Field(None, description="The number of pages to scrape.")
    limit: int | None = Field(
        None, description="Number of results to retrieve in each page."
    )
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )


class OxylabsGoogleSearchScraperTool(BaseTool):
    """
    Scrape Google Search results with OxylabsGoogleSearchScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsGoogleSearchScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Google Search Scraper tool"
    description: str = "Scrape Google Search results with Oxylabs Google Search Scraper"
    args_schema: Type[BaseModel] = OxylabsGoogleSearchScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsGoogleSearchScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsGoogleSearchScraperConfig
        | dict = OxylabsGoogleSearchScraperConfig(),
        **kwargs,
    ):
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str, **kwargs) -> str:
        response = self.oxylabs_api.google.scrape_search(
            query,
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
@@ -0,0 +1,69 @@
# OxylabsUniversalScraperTool

Scrape any website with `OxylabsUniversalScraperTool`.

## Installation

```
pip install 'crewai[tools]' oxylabs
```

## Example

```python
from crewai_tools import OxylabsUniversalScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsUniversalScraperTool()

result = tool.run(url="https://ip.oxylabs.io")

print(result)
```

## Arguments

- `username`: Oxylabs username.
- `password`: Oxylabs password.

Get the credentials by creating an Oxylabs account [here](https://oxylabs.io).

## Advanced example

Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/other-websites) to get the full list of parameters.

```python
from crewai_tools import OxylabsUniversalScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsUniversalScraperTool(
    config={
        "render": "html",
        "user_agent_type": "mobile",
        "context": [
            {"key": "force_headers", "value": True},
            {"key": "force_cookies", "value": True},
            {
                "key": "headers",
                "value": {
                    "Custom-Header-Name": "custom header content",
                },
            },
            {
                "key": "cookies",
                "value": [
                    {"key": "NID", "value": "1234567890"},
                    {"key": "1P JAR", "value": "0987654321"},
                ],
            },
            {"key": "http_method", "value": "get"},
            {"key": "follow_redirects", "value": True},
            {"key": "successful_status_codes", "value": [808, 909]},
        ],
    }
)

result = tool.run(url="https://ip.oxylabs.io")

print(result)
```
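## Using the typed configuration model

The advanced options can also be supplied as the `OxylabsUniversalScraperConfig` pydantic model that the tool module exports, instead of a plain dict. The import path below is assumed from the package layout introduced in this commit.

```python
from crewai_tools import OxylabsUniversalScraperTool
from crewai_tools.tools.oxylabs_universal_scraper_tool.oxylabs_universal_scraper_tool import (
    OxylabsUniversalScraperConfig,
)

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsUniversalScraperTool(
    config=OxylabsUniversalScraperConfig(render="html", user_agent_type="mobile"),
)

result = tool.run(url="https://ip.oxylabs.io")
print(result)
```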
@@ -0,0 +1,146 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False


__all__ = ["OxylabsUniversalScraperTool", "OxylabsUniversalScraperConfig"]


class OxylabsUniversalScraperArgs(BaseModel):
    url: str = Field(description="Website URL")


class OxylabsUniversalScraperConfig(BaseModel):
    """
    Universal Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/other-websites
    """

    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )


class OxylabsUniversalScraperTool(BaseTool):
    """
    Scrape any website with OxylabsUniversalScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsUniversalScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Universal Scraper tool"
    description: str = "Scrape any url with Oxylabs Universal Scraper"
    args_schema: Type[BaseModel] = OxylabsUniversalScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsUniversalScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsUniversalScraperConfig | dict = OxylabsUniversalScraperConfig(),
        **kwargs,
    ):
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, url: str) -> str:
        response = self.oxylabs_api.universal.scrape_url(
            url,
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
tests/tools/test_oxylabs_tools.py (new file, 163 lines)
@@ -0,0 +1,163 @@
import json
import os
from typing import Type
from unittest.mock import MagicMock

import pytest
from crewai.tools.base_tool import BaseTool
from oxylabs import RealtimeClient
from oxylabs.sources.response import Response as OxylabsResponse
from pydantic import BaseModel

from crewai_tools import (
    OxylabsAmazonProductScraperTool,
    OxylabsAmazonSearchScraperTool,
    OxylabsGoogleSearchScraperTool,
    OxylabsUniversalScraperTool,
)
from crewai_tools.tools.oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
    OxylabsAmazonProductScraperConfig,
)
from crewai_tools.tools.oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
    OxylabsGoogleSearchScraperConfig,
)


@pytest.fixture
def oxylabs_api() -> RealtimeClient:
    oxylabs_api_mock = MagicMock()

    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Scraping Sandbox</title>
    </head>
    <body>
        <div id="main">
            <div id="product-list">
                <div>
                    <p>Amazing product</p>
                    <p>Price $14.99</p>
                </div>
                <div>
                    <p>Good product</p>
                    <p>Price $9.99</p>
                </div>
            </div>
        </div>
    </body>
    </html>
    """

    json_content = {
        "results": {
            "products": [
                {"title": "Amazing product", "price": 14.99, "currency": "USD"},
                {"title": "Good product", "price": 9.99, "currency": "USD"},
            ],
        },
    }

    html_response = OxylabsResponse({"results": [{"content": html_content}]})
    json_response = OxylabsResponse({"results": [{"content": json_content}]})

    oxylabs_api_mock.universal.scrape_url.side_effect = [json_response, html_response]
    oxylabs_api_mock.amazon.scrape_search.side_effect = [json_response, html_response]
    oxylabs_api_mock.amazon.scrape_product.side_effect = [json_response, html_response]
    oxylabs_api_mock.google.scrape_search.side_effect = [json_response, html_response]

    return oxylabs_api_mock


@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization(tool_class: Type[BaseTool]):
    tool = tool_class(username="username", password="password")
    assert isinstance(tool, tool_class)


@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization_with_env_vars(tool_class: Type[BaseTool]):
    os.environ["OXYLABS_USERNAME"] = "username"
    os.environ["OXYLABS_PASSWORD"] = "password"

    tool = tool_class()
    assert isinstance(tool, tool_class)

    del os.environ["OXYLABS_USERNAME"]
    del os.environ["OXYLABS_PASSWORD"]


@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization_failure(tool_class: Type[BaseTool]):
    # making sure env vars are not set
    for key in ["OXYLABS_USERNAME", "OXYLABS_PASSWORD"]:
        if key in os.environ:
            del os.environ[key]

    with pytest.raises(ValueError):
        tool_class()


@pytest.mark.parametrize(
    ("tool_class", "tool_config"),
    [
        (OxylabsUniversalScraperTool, {"geo_location": "Paris, France"}),
        (
            OxylabsAmazonSearchScraperTool,
            {"domain": "co.uk"},
        ),
        (
            OxylabsGoogleSearchScraperTool,
            OxylabsGoogleSearchScraperConfig(render="html"),
        ),
        (
            OxylabsAmazonProductScraperTool,
            OxylabsAmazonProductScraperConfig(parse=True),
        ),
    ],
)
def test_tool_invocation(
    tool_class: Type[BaseTool],
    tool_config: BaseModel,
    oxylabs_api: RealtimeClient,
):
    tool = tool_class(username="username", password="password", config=tool_config)

    # setting via __dict__ to bypass pydantic validation
    tool.__dict__["oxylabs_api"] = oxylabs_api

    # verifying parsed job returns json content
    result = tool.run("Scraping Query 1")
    assert isinstance(result, str)
    assert isinstance(json.loads(result), dict)

    # verifying raw job returns str
    result = tool.run("Scraping Query 2")
    assert isinstance(result, str)
    assert "<!DOCTYPE html>" in result