mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-01 23:32:39 +00:00
Add Oxylabs Web Scraping tools (#312)
* Add Oxylabs tools * Review updates * Add package_dependencies attribute
This commit is contained in:
committed by
GitHub
parent
c13b08de2e
commit
78a062a907
@@ -37,6 +37,10 @@ from .tools import (
|
|||||||
MultiOnTool,
|
MultiOnTool,
|
||||||
MySQLSearchTool,
|
MySQLSearchTool,
|
||||||
NL2SQLTool,
|
NL2SQLTool,
|
||||||
|
OxylabsUniversalScraperTool,
|
||||||
|
OxylabsGoogleSearchScraperTool,
|
||||||
|
OxylabsAmazonProductScraperTool,
|
||||||
|
OxylabsAmazonSearchScraperTool,
|
||||||
PatronusEvalTool,
|
PatronusEvalTool,
|
||||||
PatronusLocalEvaluatorTool,
|
PatronusLocalEvaluatorTool,
|
||||||
PatronusPredefinedCriteriaEvalTool,
|
PatronusPredefinedCriteriaEvalTool,
|
||||||
|
|||||||
@@ -32,6 +32,18 @@ from .mdx_search_tool.mdx_search_tool import MDXSearchTool
|
|||||||
from .multion_tool.multion_tool import MultiOnTool
|
from .multion_tool.multion_tool import MultiOnTool
|
||||||
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
|
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
|
||||||
from .nl2sql.nl2sql_tool import NL2SQLTool
|
from .nl2sql.nl2sql_tool import NL2SQLTool
|
||||||
|
from .oxylabs_universal_scraper_tool.oxylabs_universal_scraper_tool import (
|
||||||
|
OxylabsUniversalScraperTool,
|
||||||
|
)
|
||||||
|
from .oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
|
||||||
|
OxylabsGoogleSearchScraperTool,
|
||||||
|
)
|
||||||
|
from .oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
|
||||||
|
OxylabsAmazonProductScraperTool,
|
||||||
|
)
|
||||||
|
from .oxylabs_amazon_search_scraper_tool.oxylabs_amazon_search_scraper_tool import (
|
||||||
|
OxylabsAmazonSearchScraperTool,
|
||||||
|
)
|
||||||
from .patronus_eval_tool import (
|
from .patronus_eval_tool import (
|
||||||
PatronusEvalTool,
|
PatronusEvalTool,
|
||||||
PatronusLocalEvaluatorTool,
|
PatronusLocalEvaluatorTool,
|
||||||
|
|||||||
@@ -0,0 +1,55 @@
|
|||||||
|
# OxylabsAmazonProductScraperTool
|
||||||
|
|
||||||
|
Scrape Amazon product pages with `OxylabsAmazonProductScraperTool`
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install 'crewai[tools]' oxylabs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import OxylabsAmazonProductScraperTool
|
||||||
|
|
||||||
|
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
|
||||||
|
tool = OxylabsAmazonProductScraperTool()
|
||||||
|
|
||||||
|
result = tool.run(query="AAAAABBBBCC")
|
||||||
|
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Arguments
|
||||||
|
|
||||||
|
- `username`: Oxylabs username.
|
||||||
|
- `password`: Oxylabs password.
|
||||||
|
|
||||||
|
Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io).
|
||||||
|
|
||||||
|
## Advanced example
|
||||||
|
|
||||||
|
Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/product) to get the full list of parameters.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import OxylabsAmazonProductScraperTool
|
||||||
|
|
||||||
|
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
|
||||||
|
tool = OxylabsAmazonProductScraperTool(
|
||||||
|
config={
|
||||||
|
"domain": "com",
|
||||||
|
"parse": True,
|
||||||
|
"context": [
|
||||||
|
{
|
||||||
|
"key": "autoselect_variant",
|
||||||
|
"value": True
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
result = tool.run(query="AAAAABBBBCC")
|
||||||
|
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
@@ -0,0 +1,151 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from importlib.metadata import version
|
||||||
|
from platform import architecture, python_version
|
||||||
|
from typing import Any, List, Type
|
||||||
|
|
||||||
|
from crewai.tools import BaseTool
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
try:
|
||||||
|
from oxylabs import RealtimeClient
|
||||||
|
from oxylabs.sources.response import Response as OxylabsResponse
|
||||||
|
|
||||||
|
OXYLABS_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
RealtimeClient = Any
|
||||||
|
OxylabsResponse = Any
|
||||||
|
|
||||||
|
OXYLABS_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["OxylabsAmazonProductScraperTool", "OxylabsAmazonProductScraperConfig"]
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsAmazonProductScraperArgs(BaseModel):
    # Input schema for OxylabsAmazonProductScraperTool: a single ASIN string.
    query: str = Field(..., description="Amazon product ASIN")
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsAmazonProductScraperConfig(BaseModel):
    """
    Amazon Product Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/product
    """

    # All options are optional; None values are dropped before the API call
    # via model_dump(exclude_none=True) in the tool's _run.
    domain: str | None = Field(
        default=None, description="The domain to limit the search results to."
    )
    geo_location: str | None = Field(
        default=None, description="The Deliver to location."
    )
    user_agent_type: str | None = Field(
        default=None, description="Device type and browser."
    )
    render: str | None = Field(
        default=None, description="Enables JavaScript rendering."
    )
    callback_url: str | None = Field(
        default=None, description="URL to your callback endpoint."
    )
    context: list | None = Field(
        default=None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(
        default=None, description="True will return structured data."
    )
    parsing_instructions: dict | None = Field(
        default=None, description="Instructions for parsing the results."
    )
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsAmazonProductScraperTool(BaseTool):
    """
    Scrape Amazon product pages with OxylabsAmazonProductScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsAmazonProductScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Amazon Product Scraper tool"
    description: str = "Scrape Amazon product pages with Oxylabs Amazon Product Scraper"
    args_schema: Type[BaseModel] = OxylabsAmazonProductScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsAmazonProductScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsAmazonProductScraperConfig | dict | None = None,
        **kwargs,
    ) -> None:
        # Fix: build a fresh config per instance. A `config=...Config()`
        # default argument would be a shared mutable default — pydantic v2
        # does not copy model instances on validation, so every tool created
        # without an explicit config would share (and co-mutate) one object.
        if config is None:
            config = OxylabsAmazonProductScraperConfig()

        bits, _ = architecture()
        # Identifies this integration to the Oxylabs API (SDK name/version,
        # Python version, architecture). First segment is a plain literal.
        sdk_type = (
            "oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            # Interactive fallback: offer to install the missing dependency.
            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        """Read credentials from OXYLABS_USERNAME/OXYLABS_PASSWORD.

        Raises:
            ValueError: if either variable is missing or empty.
        """
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str) -> str:
        """Scrape the product page for ASIN *query*.

        Returns the first result's content; dicts (parsed results) are
        JSON-encoded so the tool always returns a string.
        """
        response = self.oxylabs_api.amazon.scrape_product(
            query,
            # Drop unset options so the API only sees explicit settings.
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
# OxylabsAmazonSearchScraperTool
|
||||||
|
|
||||||
|
Scrape Amazon search results with `OxylabsAmazonSearchScraperTool`
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install 'crewai[tools]' oxylabs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import OxylabsAmazonSearchScraperTool
|
||||||
|
|
||||||
|
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
|
||||||
|
tool = OxylabsAmazonSearchScraperTool()
|
||||||
|
|
||||||
|
result = tool.run(query="headsets")
|
||||||
|
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Arguments
|
||||||
|
|
||||||
|
- `username`: Oxylabs username.
|
||||||
|
- `password`: Oxylabs password.
|
||||||
|
|
||||||
|
Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io).
|
||||||
|
|
||||||
|
## Advanced example
|
||||||
|
|
||||||
|
Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/search) to get the full list of parameters.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import OxylabsAmazonSearchScraperTool
|
||||||
|
|
||||||
|
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
|
||||||
|
tool = OxylabsAmazonSearchScraperTool(
|
||||||
|
config={
|
||||||
|
"domain": 'nl',
|
||||||
|
"start_page": 2,
|
||||||
|
"pages": 2,
|
||||||
|
"parse": True,
|
||||||
|
"context": [
|
||||||
|
{'key': 'category_id', 'value': 16391693031}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
result = tool.run(query='nirvana tshirt')
|
||||||
|
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
@@ -0,0 +1,153 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from importlib.metadata import version
|
||||||
|
from platform import architecture, python_version
|
||||||
|
from typing import Any, List, Type
|
||||||
|
|
||||||
|
from crewai.tools import BaseTool
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
try:
|
||||||
|
from oxylabs import RealtimeClient
|
||||||
|
from oxylabs.sources.response import Response as OxylabsResponse
|
||||||
|
|
||||||
|
OXYLABS_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
RealtimeClient = Any
|
||||||
|
OxylabsResponse = Any
|
||||||
|
|
||||||
|
OXYLABS_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["OxylabsAmazonSearchScraperTool", "OxylabsAmazonSearchScraperConfig"]
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsAmazonSearchScraperArgs(BaseModel):
    # Input schema for OxylabsAmazonSearchScraperTool: the search phrase.
    query: str = Field(..., description="Amazon search term")
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsAmazonSearchScraperConfig(BaseModel):
    """
    Amazon Search Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/search
    """

    # All options are optional; None values are dropped before the API call
    # via model_dump(exclude_none=True) in the tool's _run.
    domain: str | None = Field(
        default=None, description="The domain to limit the search results to."
    )
    start_page: int | None = Field(
        default=None, description="The starting page number."
    )
    pages: int | None = Field(
        default=None, description="The number of pages to scrape."
    )
    geo_location: str | None = Field(
        default=None, description="The Deliver to location."
    )
    user_agent_type: str | None = Field(
        default=None, description="Device type and browser."
    )
    render: str | None = Field(
        default=None, description="Enables JavaScript rendering."
    )
    callback_url: str | None = Field(
        default=None, description="URL to your callback endpoint."
    )
    context: list | None = Field(
        default=None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(
        default=None, description="True will return structured data."
    )
    parsing_instructions: dict | None = Field(
        default=None, description="Instructions for parsing the results."
    )
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsAmazonSearchScraperTool(BaseTool):
    """
    Scrape Amazon search results with OxylabsAmazonSearchScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsAmazonSearchScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Amazon Search Scraper tool"
    description: str = "Scrape Amazon search results with Oxylabs Amazon Search Scraper"
    args_schema: Type[BaseModel] = OxylabsAmazonSearchScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsAmazonSearchScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsAmazonSearchScraperConfig | dict | None = None,
        **kwargs,
    ):
        # Fix: build a fresh config per instance. A `config=...Config()`
        # default argument would be a shared mutable default — pydantic v2
        # does not copy model instances on validation, so every tool created
        # without an explicit config would share (and co-mutate) one object.
        if config is None:
            config = OxylabsAmazonSearchScraperConfig()

        bits, _ = architecture()
        # Identifies this integration to the Oxylabs API (SDK name/version,
        # Python version, architecture). First segment is a plain literal.
        sdk_type = (
            "oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            # Interactive fallback: offer to install the missing dependency.
            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        """Read credentials from OXYLABS_USERNAME/OXYLABS_PASSWORD.

        Raises:
            ValueError: if either variable is missing or empty.
        """
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str) -> str:
        """Scrape Amazon search results for *query*.

        Returns the first result's content; dicts (parsed results) are
        JSON-encoded so the tool always returns a string.
        """
        response = self.oxylabs_api.amazon.scrape_search(
            query,
            # Drop unset options so the API only sees explicit settings.
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
# OxylabsGoogleSearchScraperTool
|
||||||
|
|
||||||
|
Scrape Google Search results with `OxylabsGoogleSearchScraperTool`
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install 'crewai[tools]' oxylabs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import OxylabsGoogleSearchScraperTool
|
||||||
|
|
||||||
|
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
|
||||||
|
tool = OxylabsGoogleSearchScraperTool()
|
||||||
|
|
||||||
|
result = tool.run(query="iPhone 16")
|
||||||
|
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Arguments
|
||||||
|
|
||||||
|
- `username`: Oxylabs username.
|
||||||
|
- `password`: Oxylabs password.
|
||||||
|
|
||||||
|
Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io).
|
||||||
|
|
||||||
|
## Advanced example
|
||||||
|
|
||||||
|
Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/google/search/search) to get the full list of parameters.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import OxylabsGoogleSearchScraperTool
|
||||||
|
|
||||||
|
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
|
||||||
|
tool = OxylabsGoogleSearchScraperTool(
|
||||||
|
config={
|
||||||
|
"parse": True,
|
||||||
|
"geo_location": "Paris, France",
|
||||||
|
"user_agent_type": "tablet",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
result = tool.run(query="iPhone 16")
|
||||||
|
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
@@ -0,0 +1,156 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from importlib.metadata import version
|
||||||
|
from platform import architecture, python_version
|
||||||
|
from typing import Any, List, Type
|
||||||
|
|
||||||
|
from crewai.tools import BaseTool
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
try:
|
||||||
|
from oxylabs import RealtimeClient
|
||||||
|
from oxylabs.sources.response import Response as OxylabsResponse
|
||||||
|
|
||||||
|
OXYLABS_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
RealtimeClient = Any
|
||||||
|
OxylabsResponse = Any
|
||||||
|
|
||||||
|
OXYLABS_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["OxylabsGoogleSearchScraperTool", "OxylabsGoogleSearchScraperConfig"]
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsGoogleSearchScraperArgs(BaseModel):
    # Input schema for OxylabsGoogleSearchScraperTool: the search phrase.
    query: str = Field(..., description="Search query")
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsGoogleSearchScraperConfig(BaseModel):
    """
    Google Search Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/google/search/search
    """

    # All options are optional; None values are dropped before the API call
    # via model_dump(exclude_none=True) in the tool's _run.
    domain: str | None = Field(
        default=None, description="The domain to limit the search results to."
    )
    start_page: int | None = Field(
        default=None, description="The starting page number."
    )
    pages: int | None = Field(
        default=None, description="The number of pages to scrape."
    )
    limit: int | None = Field(
        default=None, description="Number of results to retrieve in each page."
    )
    geo_location: str | None = Field(
        default=None, description="The Deliver to location."
    )
    user_agent_type: str | None = Field(
        default=None, description="Device type and browser."
    )
    render: str | None = Field(
        default=None, description="Enables JavaScript rendering."
    )
    callback_url: str | None = Field(
        default=None, description="URL to your callback endpoint."
    )
    context: list | None = Field(
        default=None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(
        default=None, description="True will return structured data."
    )
    parsing_instructions: dict | None = Field(
        default=None, description="Instructions for parsing the results."
    )
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsGoogleSearchScraperTool(BaseTool):
    """
    Scrape Google Search results with OxylabsGoogleSearchScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsGoogleSearchScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Google Search Scraper tool"
    description: str = "Scrape Google Search results with Oxylabs Google Search Scraper"
    args_schema: Type[BaseModel] = OxylabsGoogleSearchScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsGoogleSearchScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsGoogleSearchScraperConfig | dict | None = None,
        **kwargs,
    ):
        # Fix: build a fresh config per instance. A `config=...Config()`
        # default argument would be a shared mutable default — pydantic v2
        # does not copy model instances on validation, so every tool created
        # without an explicit config would share (and co-mutate) one object.
        if config is None:
            config = OxylabsGoogleSearchScraperConfig()

        bits, _ = architecture()
        # Identifies this integration to the Oxylabs API (SDK name/version,
        # Python version, architecture). First segment is a plain literal.
        sdk_type = (
            "oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            # Interactive fallback: offer to install the missing dependency.
            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        """Read credentials from OXYLABS_USERNAME/OXYLABS_PASSWORD.

        Raises:
            ValueError: if either variable is missing or empty.
        """
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str, **kwargs) -> str:
        """Scrape Google Search results for *query*.

        Extra **kwargs are accepted for interface compatibility but ignored.
        Returns the first result's content; dicts (parsed results) are
        JSON-encoded so the tool always returns a string.
        """
        response = self.oxylabs_api.google.scrape_search(
            query,
            # Drop unset options so the API only sees explicit settings.
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
# OxylabsUniversalScraperTool
|
||||||
|
|
||||||
|
Scrape any website with `OxylabsUniversalScraperTool`
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install 'crewai[tools]' oxylabs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import OxylabsUniversalScraperTool
|
||||||
|
|
||||||
|
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
|
||||||
|
tool = OxylabsUniversalScraperTool()
|
||||||
|
|
||||||
|
result = tool.run(url="https://ip.oxylabs.io")
|
||||||
|
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Arguments
|
||||||
|
|
||||||
|
- `username`: Oxylabs username.
|
||||||
|
- `password`: Oxylabs password.
|
||||||
|
|
||||||
|
Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io).
|
||||||
|
|
||||||
|
## Advanced example
|
||||||
|
|
||||||
|
Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/other-websites) to get the full list of parameters.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import OxylabsUniversalScraperTool
|
||||||
|
|
||||||
|
# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set
|
||||||
|
tool = OxylabsUniversalScraperTool(
|
||||||
|
config={
|
||||||
|
"render": "html",
|
||||||
|
"user_agent_type": "mobile",
|
||||||
|
"context": [
|
||||||
|
{"key": "force_headers", "value": True},
|
||||||
|
{"key": "force_cookies", "value": True},
|
||||||
|
{
|
||||||
|
"key": "headers",
|
||||||
|
"value": {
|
||||||
|
"Custom-Header-Name": "custom header content",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "cookies",
|
||||||
|
"value": [
|
||||||
|
{"key": "NID", "value": "1234567890"},
|
||||||
|
{"key": "1P JAR", "value": "0987654321"},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{"key": "http_method", "value": "get"},
|
||||||
|
{"key": "follow_redirects", "value": True},
|
||||||
|
{"key": "successful_status_codes", "value": [808, 909]},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
result = tool.run(url="https://ip.oxylabs.io")
|
||||||
|
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
@@ -0,0 +1,146 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from importlib.metadata import version
|
||||||
|
from platform import architecture, python_version
|
||||||
|
from typing import Any, List, Type
|
||||||
|
|
||||||
|
from crewai.tools import BaseTool
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
try:
|
||||||
|
from oxylabs import RealtimeClient
|
||||||
|
from oxylabs.sources.response import Response as OxylabsResponse
|
||||||
|
|
||||||
|
OXYLABS_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
RealtimeClient = Any
|
||||||
|
OxylabsResponse = Any
|
||||||
|
|
||||||
|
OXYLABS_AVAILABLE = False
|
||||||
|
|
||||||
|
__all__ = ["OxylabsUniversalScraperTool", "OxylabsUniversalScraperConfig"]
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsUniversalScraperArgs(BaseModel):
    # Input schema for OxylabsUniversalScraperTool: the target URL.
    url: str = Field(..., description="Website URL")
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsUniversalScraperConfig(BaseModel):
    """
    Universal Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/other-websites
    """

    # All options are optional; None values are dropped before the API call
    # via model_dump(exclude_none=True) in the tool's _run.
    geo_location: str | None = Field(
        default=None, description="The Deliver to location."
    )
    user_agent_type: str | None = Field(
        default=None, description="Device type and browser."
    )
    render: str | None = Field(
        default=None, description="Enables JavaScript rendering."
    )
    callback_url: str | None = Field(
        default=None, description="URL to your callback endpoint."
    )
    context: list | None = Field(
        default=None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(
        default=None, description="True will return structured data."
    )
    parsing_instructions: dict | None = Field(
        default=None, description="Instructions for parsing the results."
    )
|
||||||
|
|
||||||
|
|
||||||
|
class OxylabsUniversalScraperTool(BaseTool):
    """
    Scrape any website with OxylabsUniversalScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsUniversalScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Universal Scraper tool"
    description: str = "Scrape any url with Oxylabs Universal Scraper"
    args_schema: Type[BaseModel] = OxylabsUniversalScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsUniversalScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsUniversalScraperConfig | dict | None = None,
        **kwargs,
    ):
        # Fix: build a fresh config per instance. A `config=...Config()`
        # default argument would be a shared mutable default — pydantic v2
        # does not copy model instances on validation, so every tool created
        # without an explicit config would share (and co-mutate) one object.
        if config is None:
            config = OxylabsUniversalScraperConfig()

        bits, _ = architecture()
        # Identifies this integration to the Oxylabs API (SDK name/version,
        # Python version, architecture). First segment is a plain literal.
        sdk_type = (
            "oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            # Interactive fallback: offer to install the missing dependency.
            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        """Read credentials from OXYLABS_USERNAME/OXYLABS_PASSWORD.

        Raises:
            ValueError: if either variable is missing or empty.
        """
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, url: str) -> str:
        """Scrape *url* with the universal scraper.

        Returns the first result's content; dicts (parsed results) are
        JSON-encoded so the tool always returns a string.
        """
        response = self.oxylabs_api.universal.scrape_url(
            url,
            # Drop unset options so the API only sees explicit settings.
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
|
||||||
163
tests/tools/test_oxylabs_tools.py
Normal file
163
tests/tools/test_oxylabs_tools.py
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from typing import Type
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from crewai.tools.base_tool import BaseTool
|
||||||
|
from oxylabs import RealtimeClient
|
||||||
|
from oxylabs.sources.response import Response as OxylabsResponse
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from crewai_tools import (
|
||||||
|
OxylabsAmazonProductScraperTool,
|
||||||
|
OxylabsAmazonSearchScraperTool,
|
||||||
|
OxylabsGoogleSearchScraperTool,
|
||||||
|
OxylabsUniversalScraperTool,
|
||||||
|
)
|
||||||
|
from crewai_tools.tools.oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
|
||||||
|
OxylabsAmazonProductScraperConfig,
|
||||||
|
)
|
||||||
|
from crewai_tools.tools.oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
|
||||||
|
OxylabsGoogleSearchScraperConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def oxylabs_api() -> RealtimeClient:
    """Build a mocked ``RealtimeClient`` for exercising the Oxylabs tools.

    Each scrape method is given a two-item ``side_effect`` so the first
    call returns a parsed (dict) payload and the second a raw HTML (str)
    payload — letting one test cover both response modes per tool.
    """
    oxylabs_api_mock = MagicMock()

    # Raw payload: what an unparsed scrape job returns (plain HTML string).
    html_content = """
    <!DOCTYPE html>
    <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>Scraping Sandbox</title>
        </head>
        <body>
            <div id="main">
                <div id="product-list">
                    <div>
                        <p>Amazing product</p>
                        <p>Price $14.99</p>
                    </div>
                    <div>
                        <p>Good product</p>
                        <p>Price $9.99</p>
                    </div>
                </div>
            </div>
        </body>
    </html>
    """

    # Parsed payload: what a scrape job with parse=True returns (a dict).
    json_content = {
        "results": {
            "products": [
                {"title": "Amazing product", "price": 14.99, "currency": "USD"},
                {"title": "Good product", "price": 9.99, "currency": "USD"},
            ],
        },
    }

    # Wrap payloads in the SDK's Response envelope (results[0].content).
    html_response = OxylabsResponse({"results": [{"content": html_content}]})
    json_response = OxylabsResponse({"results": [{"content": json_content}]})

    # First invocation of each source yields the parsed response, the
    # second yields the raw HTML response.
    oxylabs_api_mock.universal.scrape_url.side_effect = [json_response, html_response]
    oxylabs_api_mock.amazon.scrape_search.side_effect = [json_response, html_response]
    oxylabs_api_mock.amazon.scrape_product.side_effect = [json_response, html_response]
    oxylabs_api_mock.google.scrape_search.side_effect = [json_response, html_response]

    return oxylabs_api_mock
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization(tool_class: Type[BaseTool]):
    """Every Oxylabs tool can be constructed from explicit credentials."""
    instance = tool_class(username="username", password="password")
    assert isinstance(instance, tool_class)
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization_with_env_vars(tool_class: Type[BaseTool]):
    """Every Oxylabs tool can pick up credentials from the environment.

    Cleanup runs in a ``finally`` block: with an unconditional ``del``
    after the assertion, a failing assertion would leak the credentials
    into subsequent tests (e.g. the no-credentials failure test).
    """
    os.environ["OXYLABS_USERNAME"] = "username"
    os.environ["OXYLABS_PASSWORD"] = "password"
    try:
        tool = tool_class()
        assert isinstance(tool, tool_class)
    finally:
        # pop() with a default is safe even if a variable was never set.
        os.environ.pop("OXYLABS_USERNAME", None)
        os.environ.pop("OXYLABS_PASSWORD", None)
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization_failure(tool_class: Type[BaseTool]):
    """Construction must fail when no credentials are supplied anywhere."""
    # making sure env vars are not set
    for key in ("OXYLABS_USERNAME", "OXYLABS_PASSWORD"):
        os.environ.pop(key, None)

    with pytest.raises(ValueError):
        tool_class()
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("tool_class", "tool_config"),
    [
        (OxylabsUniversalScraperTool, {"geo_location": "Paris, France"}),
        (
            OxylabsAmazonSearchScraperTool,
            {"domain": "co.uk"},
        ),
        (
            OxylabsGoogleSearchScraperTool,
            OxylabsGoogleSearchScraperConfig(render="html"),
        ),
        (
            OxylabsAmazonProductScraperTool,
            OxylabsAmazonProductScraperConfig(parse=True),
        ),
    ],
)
def test_tool_invocation(
    tool_class: Type[BaseTool],
    tool_config: BaseModel,
    oxylabs_api: RealtimeClient,
):
    """Parsed jobs come back as JSON text; raw jobs pass through as HTML."""
    scraper = tool_class(username="username", password="password", config=tool_config)

    # setting via __dict__ to bypass pydantic validation
    scraper.__dict__["oxylabs_api"] = oxylabs_api

    # verifying parsed job returns json content
    parsed = scraper.run("Scraping Query 1")
    assert isinstance(parsed, str)
    assert isinstance(json.loads(parsed), dict)

    # verifying raw job returns str
    raw = scraper.run("Scraping Query 2")
    assert isinstance(raw, str)
    assert "<!DOCTYPE html>" in raw
|
||||||
Reference in New Issue
Block a user