Add Oxylabs Web Scraping tools (#312)

* Add Oxylabs tools

* Review updates

* Add package_dependencies attribute
This commit is contained in:
Rostyslav Borovyk
2025-06-24 16:56:47 +03:00
committed by GitHub
parent c13b08de2e
commit 78a062a907
11 changed files with 1013 additions and 0 deletions

View File

@@ -37,6 +37,10 @@ from .tools import (
MultiOnTool,
MySQLSearchTool,
NL2SQLTool,
OxylabsUniversalScraperTool,
OxylabsGoogleSearchScraperTool,
OxylabsAmazonProductScraperTool,
OxylabsAmazonSearchScraperTool,
PatronusEvalTool,
PatronusLocalEvaluatorTool,
PatronusPredefinedCriteriaEvalTool,

View File

@@ -32,6 +32,18 @@ from .mdx_search_tool.mdx_search_tool import MDXSearchTool
from .multion_tool.multion_tool import MultiOnTool
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
from .nl2sql.nl2sql_tool import NL2SQLTool
from .oxylabs_universal_scraper_tool.oxylabs_universal_scraper_tool import (
OxylabsUniversalScraperTool,
)
from .oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
OxylabsGoogleSearchScraperTool,
)
from .oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
OxylabsAmazonProductScraperTool,
)
from .oxylabs_amazon_search_scraper_tool.oxylabs_amazon_search_scraper_tool import (
OxylabsAmazonSearchScraperTool,
)
from .patronus_eval_tool import (
PatronusEvalTool,
PatronusLocalEvaluatorTool,

View File

@@ -0,0 +1,55 @@
# OxylabsAmazonProductScraperTool
Scrape Amazon product pages with `OxylabsAmazonProductScraperTool`
## Installation
```
pip install 'crewai[tools]' oxylabs
```
## Example
```python
from crewai_tools import OxylabsAmazonProductScraperTool
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
tool = OxylabsAmazonProductScraperTool()
result = tool.run(query="AAAAABBBBCC")
print(result)
```
## Arguments
- `username`: Oxylabs username.
- `password`: Oxylabs password.
Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io).
## Advanced example
Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/product) to get the full list of parameters.
```python
from crewai_tools import OxylabsAmazonProductScraperTool
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
tool = OxylabsAmazonProductScraperTool(
config={
"domain": "com",
"parse": True,
"context": [
{
"key": "autoselect_variant",
"value": True
}
]
}
)
result = tool.run(query="AAAAABBBBCC")
print(result)
```

View File

@@ -0,0 +1,151 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

# The Oxylabs SDK is an optional dependency. When it is missing, alias the
# imported names to `Any` so the annotations below still resolve, and record
# availability so __init__ can offer to install the package at runtime.
try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False

# Public API of this module.
__all__ = ["OxylabsAmazonProductScraperTool", "OxylabsAmazonProductScraperConfig"]
# Run-time argument schema for the tool: a single Amazon product identifier.
# (No class docstring on purpose — pydantic would surface it in the JSON schema.)
class OxylabsAmazonProductScraperArgs(BaseModel):
    query: str = Field(description="Amazon product ASIN")
class OxylabsAmazonProductScraperConfig(BaseModel):
    """
    Amazon Product Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/product

    Every field defaults to ``None``; unset fields are omitted from the API
    request (the tool dumps this model with ``exclude_none=True``).
    """

    # Amazon marketplace domain to target, e.g. "com".
    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    # List of {"key": ..., "value": ...} entries — see the Oxylabs docs above.
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )
class OxylabsAmazonProductScraperTool(BaseTool):
    """
    Scrape Amazon product pages with OxylabsAmazonProductScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username. Defaults to the OXYLABS_USERNAME
            environment variable when omitted.
        password (str): Oxylabs password. Defaults to the OXYLABS_PASSWORD
            environment variable when omitted.
        config: Configuration options. See ``OxylabsAmazonProductScraperConfig``.
            Defaults to an empty configuration.

    Raises:
        ValueError: if credentials are neither passed nor set in the environment.
        ImportError: if the `oxylabs` package is missing and cannot be installed.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )

    name: str = "Oxylabs Amazon Product Scraper tool"
    description: str = "Scrape Amazon product pages with Oxylabs Amazon Product Scraper"
    args_schema: Type[BaseModel] = OxylabsAmazonProductScraperArgs
    oxylabs_api: RealtimeClient
    config: OxylabsAmazonProductScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsAmazonProductScraperConfig | dict | None = None,
        **kwargs,
    ) -> None:
        # Fix: the default used to be a shared OxylabsAmazonProductScraperConfig()
        # instance created at import time — a mutable default argument reused by
        # every instantiation. Create a fresh config per call instead.
        if config is None:
            config = OxylabsAmazonProductScraperConfig()

        # Identify this integration (crewai version, Python version, arch bits)
        # to the Oxylabs API.
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            # Offer an interactive install instead of failing outright.
            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        """Read credentials from OXYLABS_USERNAME / OXYLABS_PASSWORD.

        Raises ValueError when either variable is missing or empty.
        """
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str) -> str:
        """Scrape the product page for *query* (an ASIN) and return it as a string."""
        response = self.oxylabs_api.amazon.scrape_product(
            query,
            **self.config.model_dump(exclude_none=True),
        )
        content = response.results[0].content
        # Parsed jobs return a dict — JSON-encode it; raw jobs return HTML text.
        if isinstance(content, dict):
            return json.dumps(content)
        return content

View File

@@ -0,0 +1,54 @@
# OxylabsAmazonSearchScraperTool
Scrape Amazon search results with `OxylabsAmazonSearchScraperTool`
## Installation
```
pip install 'crewai[tools]' oxylabs
```
## Example
```python
from crewai_tools import OxylabsAmazonSearchScraperTool
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
tool = OxylabsAmazonSearchScraperTool()
result = tool.run(query="headsets")
print(result)
```
## Arguments
- `username`: Oxylabs username.
- `password`: Oxylabs password.
Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io).
## Advanced example
Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/search) to get the full list of parameters.
```python
from crewai_tools import OxylabsAmazonSearchScraperTool
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
tool = OxylabsAmazonSearchScraperTool(
config={
"domain": 'nl',
"start_page": 2,
"pages": 2,
"parse": True,
"context": [
{'key': 'category_id', 'value': 16391693031}
],
}
)
result = tool.run(query='nirvana tshirt')
print(result)
```

View File

@@ -0,0 +1,153 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

# The Oxylabs SDK is an optional dependency. When it is missing, alias the
# imported names to `Any` so the annotations below still resolve, and record
# availability so __init__ can offer to install the package at runtime.
try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False

# Public API of this module.
__all__ = ["OxylabsAmazonSearchScraperTool", "OxylabsAmazonSearchScraperConfig"]
# Run-time argument schema for the tool: a single Amazon search phrase.
# (No class docstring on purpose — pydantic would surface it in the JSON schema.)
class OxylabsAmazonSearchScraperArgs(BaseModel):
    query: str = Field(description="Amazon search term")
class OxylabsAmazonSearchScraperConfig(BaseModel):
    """
    Amazon Search Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/search

    Every field defaults to ``None``; unset fields are omitted from the API
    request (the tool dumps this model with ``exclude_none=True``).
    """

    # Amazon marketplace domain to target, e.g. "com".
    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    start_page: int | None = Field(None, description="The starting page number.")
    pages: int | None = Field(None, description="The number of pages to scrape.")
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    # List of {"key": ..., "value": ...} entries — see the Oxylabs docs above.
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )
class OxylabsAmazonSearchScraperTool(BaseTool):
    """
    Scrape Amazon search results with OxylabsAmazonSearchScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username. Defaults to the OXYLABS_USERNAME
            environment variable when omitted.
        password (str): Oxylabs password. Defaults to the OXYLABS_PASSWORD
            environment variable when omitted.
        config: Configuration options. See ``OxylabsAmazonSearchScraperConfig``.
            Defaults to an empty configuration.

    Raises:
        ValueError: if credentials are neither passed nor set in the environment.
        ImportError: if the `oxylabs` package is missing and cannot be installed.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )

    name: str = "Oxylabs Amazon Search Scraper tool"
    description: str = "Scrape Amazon search results with Oxylabs Amazon Search Scraper"
    args_schema: Type[BaseModel] = OxylabsAmazonSearchScraperArgs
    oxylabs_api: RealtimeClient
    config: OxylabsAmazonSearchScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsAmazonSearchScraperConfig | dict | None = None,
        **kwargs,
    ) -> None:
        # Fix: the default used to be a shared OxylabsAmazonSearchScraperConfig()
        # instance created at import time — a mutable default argument reused by
        # every instantiation. Create a fresh config per call instead.
        if config is None:
            config = OxylabsAmazonSearchScraperConfig()

        # Identify this integration (crewai version, Python version, arch bits)
        # to the Oxylabs API.
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            # Offer an interactive install instead of failing outright.
            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        """Read credentials from OXYLABS_USERNAME / OXYLABS_PASSWORD.

        Raises ValueError when either variable is missing or empty.
        """
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str) -> str:
        """Scrape Amazon search results for *query* and return them as a string."""
        response = self.oxylabs_api.amazon.scrape_search(
            query,
            **self.config.model_dump(exclude_none=True),
        )
        content = response.results[0].content
        # Parsed jobs return a dict — JSON-encode it; raw jobs return HTML text.
        if isinstance(content, dict):
            return json.dumps(content)
        return content

View File

@@ -0,0 +1,50 @@
# OxylabsGoogleSearchScraperTool
Scrape Google Search results with `OxylabsGoogleSearchScraperTool`
## Installation
```
pip install 'crewai[tools]' oxylabs
```
## Example
```python
from crewai_tools import OxylabsGoogleSearchScraperTool
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
tool = OxylabsGoogleSearchScraperTool()
result = tool.run(query="iPhone 16")
print(result)
```
## Arguments
- `username`: Oxylabs username.
- `password`: Oxylabs password.
Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io).
## Advanced example
Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/google/search/search) to get the full list of parameters.
```python
from crewai_tools import OxylabsGoogleSearchScraperTool
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
tool = OxylabsGoogleSearchScraperTool(
config={
"parse": True,
"geo_location": "Paris, France",
"user_agent_type": "tablet",
}
)
result = tool.run(query="iPhone 16")
print(result)
```

View File

@@ -0,0 +1,156 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

# The Oxylabs SDK is an optional dependency. When it is missing, alias the
# imported names to `Any` so the annotations below still resolve, and record
# availability so __init__ can offer to install the package at runtime.
try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False

# Public API of this module.
__all__ = ["OxylabsGoogleSearchScraperTool", "OxylabsGoogleSearchScraperConfig"]
# Run-time argument schema for the tool: a single Google search phrase.
# (No class docstring on purpose — pydantic would surface it in the JSON schema.)
class OxylabsGoogleSearchScraperArgs(BaseModel):
    query: str = Field(description="Search query")
class OxylabsGoogleSearchScraperConfig(BaseModel):
    """
    Google Search Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/google/search/search

    Every field defaults to ``None``; unset fields are omitted from the API
    request (the tool dumps this model with ``exclude_none=True``).
    """

    # Google domain to target, e.g. "com".
    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    start_page: int | None = Field(None, description="The starting page number.")
    pages: int | None = Field(None, description="The number of pages to scrape.")
    limit: int | None = Field(
        None, description="Number of results to retrieve in each page."
    )
    # NOTE(review): description says "Deliver to" (Amazon wording) — presumably
    # this is the geographic location for localized results; verify upstream.
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    # List of {"key": ..., "value": ...} entries — see the Oxylabs docs above.
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )
class OxylabsGoogleSearchScraperTool(BaseTool):
    """
    Scrape Google Search results with OxylabsGoogleSearchScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username. Defaults to the OXYLABS_USERNAME
            environment variable when omitted.
        password (str): Oxylabs password. Defaults to the OXYLABS_PASSWORD
            environment variable when omitted.
        config: Configuration options. See ``OxylabsGoogleSearchScraperConfig``.
            Defaults to an empty configuration.

    Raises:
        ValueError: if credentials are neither passed nor set in the environment.
        ImportError: if the `oxylabs` package is missing and cannot be installed.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )

    name: str = "Oxylabs Google Search Scraper tool"
    description: str = "Scrape Google Search results with Oxylabs Google Search Scraper"
    args_schema: Type[BaseModel] = OxylabsGoogleSearchScraperArgs
    oxylabs_api: RealtimeClient
    config: OxylabsGoogleSearchScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsGoogleSearchScraperConfig | dict | None = None,
        **kwargs,
    ) -> None:
        # Fix: the default used to be a shared OxylabsGoogleSearchScraperConfig()
        # instance created at import time — a mutable default argument reused by
        # every instantiation. Create a fresh config per call instead.
        if config is None:
            config = OxylabsGoogleSearchScraperConfig()

        # Identify this integration (crewai version, Python version, arch bits)
        # to the Oxylabs API.
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            # Offer an interactive install instead of failing outright.
            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        """Read credentials from OXYLABS_USERNAME / OXYLABS_PASSWORD.

        Raises ValueError when either variable is missing or empty.
        """
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str, **kwargs) -> str:
        """Scrape Google search results for *query* and return them as a string.

        ``**kwargs`` is accepted but ignored (kept from the original signature
        for backward compatibility; the sibling tools do not take it).
        """
        response = self.oxylabs_api.google.scrape_search(
            query,
            **self.config.model_dump(exclude_none=True),
        )
        content = response.results[0].content
        # Parsed jobs return a dict — JSON-encode it; raw jobs return HTML text.
        if isinstance(content, dict):
            return json.dumps(content)
        return content

View File

@@ -0,0 +1,69 @@
# OxylabsUniversalScraperTool
Scrape any website with `OxylabsUniversalScraperTool`
## Installation
```
pip install 'crewai[tools]' oxylabs
```
## Example
```python
from crewai_tools import OxylabsUniversalScraperTool
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
tool = OxylabsUniversalScraperTool()
result = tool.run(url="https://ip.oxylabs.io")
print(result)
```
## Arguments
- `username`: Oxylabs username.
- `password`: Oxylabs password.
Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io).
## Advanced example
Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/other-websites) to get the full list of parameters.
```python
from crewai_tools import OxylabsUniversalScraperTool
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
tool = OxylabsUniversalScraperTool(
config={
"render": "html",
"user_agent_type": "mobile",
"context": [
{"key": "force_headers", "value": True},
{"key": "force_cookies", "value": True},
{
"key": "headers",
"value": {
"Custom-Header-Name": "custom header content",
},
},
{
"key": "cookies",
"value": [
{"key": "NID", "value": "1234567890"},
{"key": "1P JAR", "value": "0987654321"},
],
},
{"key": "http_method", "value": "get"},
{"key": "follow_redirects", "value": True},
{"key": "successful_status_codes", "value": [808, 909]},
],
}
)
result = tool.run(url="https://ip.oxylabs.io")
print(result)
```

View File

@@ -0,0 +1,146 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

# The Oxylabs SDK is an optional dependency. When it is missing, alias the
# imported names to `Any` so the annotations below still resolve, and record
# availability so __init__ can offer to install the package at runtime.
try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False

# Public API of this module.
__all__ = ["OxylabsUniversalScraperTool", "OxylabsUniversalScraperConfig"]
# Run-time argument schema for the tool: the URL to scrape.
# (No class docstring on purpose — pydantic would surface it in the JSON schema.)
class OxylabsUniversalScraperArgs(BaseModel):
    url: str = Field(description="Website URL")
class OxylabsUniversalScraperConfig(BaseModel):
    """
    Universal Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/other-websites

    Every field defaults to ``None``; unset fields are omitted from the API
    request (the tool dumps this model with ``exclude_none=True``).
    """

    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    # e.g. "html" renders JavaScript before the page is returned (see README).
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    # List of {"key": ..., "value": ...} entries — see the Oxylabs docs above.
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )
class OxylabsUniversalScraperTool(BaseTool):
    """
    Scrape any website with OxylabsUniversalScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username. Defaults to the OXYLABS_USERNAME
            environment variable when omitted.
        password (str): Oxylabs password. Defaults to the OXYLABS_PASSWORD
            environment variable when omitted.
        config: Configuration options. See ``OxylabsUniversalScraperConfig``.
            Defaults to an empty configuration.

    Raises:
        ValueError: if credentials are neither passed nor set in the environment.
        ImportError: if the `oxylabs` package is missing and cannot be installed.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )

    name: str = "Oxylabs Universal Scraper tool"
    description: str = "Scrape any url with Oxylabs Universal Scraper"
    args_schema: Type[BaseModel] = OxylabsUniversalScraperArgs
    oxylabs_api: RealtimeClient
    config: OxylabsUniversalScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsUniversalScraperConfig | dict | None = None,
        **kwargs,
    ) -> None:
        # Fix: the default used to be a shared OxylabsUniversalScraperConfig()
        # instance created at import time — a mutable default argument reused by
        # every instantiation. Create a fresh config per call instead.
        if config is None:
            config = OxylabsUniversalScraperConfig()

        # Identify this integration (crewai version, Python version, arch bits)
        # to the Oxylabs API.
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            # Offer an interactive install instead of failing outright.
            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        """Read credentials from OXYLABS_USERNAME / OXYLABS_PASSWORD.

        Raises ValueError when either variable is missing or empty.
        """
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, url: str) -> str:
        """Scrape *url* and return the result as a string."""
        response = self.oxylabs_api.universal.scrape_url(
            url,
            **self.config.model_dump(exclude_none=True),
        )
        content = response.results[0].content
        # Parsed jobs return a dict — JSON-encode it; raw jobs return HTML text.
        if isinstance(content, dict):
            return json.dumps(content)
        return content

View File

@@ -0,0 +1,163 @@
import json
import os
from typing import Type
from unittest.mock import MagicMock
import pytest
from crewai.tools.base_tool import BaseTool
from oxylabs import RealtimeClient
from oxylabs.sources.response import Response as OxylabsResponse
from pydantic import BaseModel
from crewai_tools import (
OxylabsAmazonProductScraperTool,
OxylabsAmazonSearchScraperTool,
OxylabsGoogleSearchScraperTool,
OxylabsUniversalScraperTool,
)
from crewai_tools.tools.oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
OxylabsAmazonProductScraperConfig,
)
from crewai_tools.tools.oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
OxylabsGoogleSearchScraperConfig,
)
@pytest.fixture
def oxylabs_api() -> RealtimeClient:
    """Mocked RealtimeClient: every scrape_* method returns a parsed (dict)
    response on the first call and a raw HTML (str) response on the second.

    The side_effect ordering is load-bearing — tests rely on call #1 being
    JSON and call #2 being HTML.
    """
    oxylabs_api_mock = MagicMock()

    # Raw (unparsed) job payload: an HTML document string.
    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Scraping Sandbox</title>
    </head>
    <body>
        <div id="main">
            <div id="product-list">
                <div>
                    <p>Amazing product</p>
                    <p>Price $14.99</p>
                </div>
                <div>
                    <p>Good product</p>
                    <p>Price $9.99</p>
                </div>
            </div>
        </div>
    </body>
    </html>
    """
    # Parsed job payload: structured product data as a dict.
    json_content = {
        "results": {
            "products": [
                {"title": "Amazing product", "price": 14.99, "currency": "USD"},
                {"title": "Good product", "price": 9.99, "currency": "USD"},
            ],
        },
    }

    html_response = OxylabsResponse({"results": [{"content": html_content}]})
    json_response = OxylabsResponse({"results": [{"content": json_content}]})

    oxylabs_api_mock.universal.scrape_url.side_effect = [json_response, html_response]
    oxylabs_api_mock.amazon.scrape_search.side_effect = [json_response, html_response]
    oxylabs_api_mock.amazon.scrape_product.side_effect = [json_response, html_response]
    oxylabs_api_mock.google.scrape_search.side_effect = [json_response, html_response]

    return oxylabs_api_mock
@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization(tool_class: Type[BaseTool]):
    """Each tool can be constructed with explicit credentials."""
    tool = tool_class(username="username", password="password")
    assert isinstance(tool, tool_class)
@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization_with_env_vars(tool_class: Type[BaseTool]):
    """Tools fall back to OXYLABS_USERNAME/OXYLABS_PASSWORD env vars.

    Fix: the original mutated os.environ and only cleaned up on success,
    leaking credentials into later tests when the assertion failed and
    destroying any pre-existing values. Save and restore in a finally block.
    """
    saved = {
        key: os.environ.get(key)
        for key in ("OXYLABS_USERNAME", "OXYLABS_PASSWORD")
    }
    os.environ["OXYLABS_USERNAME"] = "username"
    os.environ["OXYLABS_PASSWORD"] = "password"
    try:
        tool = tool_class()
        assert isinstance(tool, tool_class)
    finally:
        # Restore the environment exactly as it was before the test.
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization_failure(tool_class: Type[BaseTool]):
    """Constructing a tool with no credentials anywhere raises ValueError.

    Fix: the original deleted OXYLABS_* variables permanently, leaking state
    into the rest of the test session. Pop them for the duration of the test
    and restore any pre-existing values afterwards.
    """
    saved = {
        key: os.environ.pop(key, None)
        for key in ("OXYLABS_USERNAME", "OXYLABS_PASSWORD")
    }
    try:
        with pytest.raises(ValueError):
            tool_class()
    finally:
        # Restore whatever credentials existed before the test.
        for key, value in saved.items():
            if value is not None:
                os.environ[key] = value
@pytest.mark.parametrize(
    ("tool_class", "tool_config"),
    [
        (OxylabsUniversalScraperTool, {"geo_location": "Paris, France"}),
        (
            OxylabsAmazonSearchScraperTool,
            {"domain": "co.uk"},
        ),
        (
            OxylabsGoogleSearchScraperTool,
            OxylabsGoogleSearchScraperConfig(render="html"),
        ),
        (
            OxylabsAmazonProductScraperTool,
            OxylabsAmazonProductScraperConfig(parse=True),
        ),
    ],
)
def test_tool_invocation(
    tool_class: Type[BaseTool],
    tool_config: BaseModel,
    oxylabs_api: RealtimeClient,
):
    """Tools accept dict or model configs and stringify both result kinds.

    Relies on the ``oxylabs_api`` fixture ordering: first call yields a
    parsed dict, second call yields raw HTML.
    """
    tool = tool_class(username="username", password="password", config=tool_config)
    # setting via __dict__ to bypass pydantic validation
    tool.__dict__["oxylabs_api"] = oxylabs_api

    # verifying parsed job returns json content
    result = tool.run("Scraping Query 1")
    assert isinstance(result, str)
    assert isinstance(json.loads(result), dict)

    # verifying raw job returns str
    result = tool.run("Scraping Query 2")
    assert isinstance(result, str)
    assert "<!DOCTYPE html>" in result