Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-08 23:58:34 +00:00)
Add Oxylabs Web Scraping tools (#312)
* Add Oxylabs tools
* Review updates
* Add package_dependencies attribute
Committed by GitHub · parent c13b08de2e · commit 78a062a907
@@ -37,6 +37,10 @@ from .tools import (
    MultiOnTool,
    MySQLSearchTool,
    NL2SQLTool,
    OxylabsUniversalScraperTool,
    OxylabsGoogleSearchScraperTool,
    OxylabsAmazonProductScraperTool,
    OxylabsAmazonSearchScraperTool,
    PatronusEvalTool,
    PatronusLocalEvaluatorTool,
    PatronusPredefinedCriteriaEvalTool,
@@ -32,6 +32,18 @@ from .mdx_search_tool.mdx_search_tool import MDXSearchTool
from .multion_tool.multion_tool import MultiOnTool
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
from .nl2sql.nl2sql_tool import NL2SQLTool
from .oxylabs_universal_scraper_tool.oxylabs_universal_scraper_tool import (
    OxylabsUniversalScraperTool,
)
from .oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
    OxylabsGoogleSearchScraperTool,
)
from .oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
    OxylabsAmazonProductScraperTool,
)
from .oxylabs_amazon_search_scraper_tool.oxylabs_amazon_search_scraper_tool import (
    OxylabsAmazonSearchScraperTool,
)
from .patronus_eval_tool import (
    PatronusEvalTool,
    PatronusLocalEvaluatorTool,
@@ -0,0 +1,55 @@
# OxylabsAmazonProductScraperTool

Scrape Amazon product pages with `OxylabsAmazonProductScraperTool`.

## Installation

```
pip install 'crewai[tools]' oxylabs
```

## Example

```python
from crewai_tools import OxylabsAmazonProductScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonProductScraperTool()

result = tool.run(query="AAAAABBBBCC")

print(result)
```

## Arguments

- `username`: Oxylabs username.
- `password`: Oxylabs password.

Get the credentials by creating an Oxylabs account [here](https://oxylabs.io).

## Advanced example

Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/product) to get the full list of parameters.

```python
from crewai_tools import OxylabsAmazonProductScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonProductScraperTool(
    config={
        "domain": "com",
        "parse": True,
        "context": [
            {
                "key": "autoselect_variant",
                "value": True
            }
        ]
    }
)

result = tool.run(query="AAAAABBBBCC")

print(result)
```
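## Passing credentials explicitly

If you prefer not to rely on environment variables, the same credentials can be passed straight to the constructor. This is a minimal sketch based on the tool's `__init__` signature; the placeholder values are illustrative only.

```python
from crewai_tools import OxylabsAmazonProductScraperTool

# placeholder credentials - replace with your own Oxylabs API credentials
tool = OxylabsAmazonProductScraperTool(
    username="your-oxylabs-username",
    password="your-oxylabs-password",
)

result = tool.run(query="AAAAABBBBCC")
print(result)
```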
@@ -0,0 +1,151 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False


__all__ = ["OxylabsAmazonProductScraperTool", "OxylabsAmazonProductScraperConfig"]


class OxylabsAmazonProductScraperArgs(BaseModel):
    query: str = Field(description="Amazon product ASIN")


class OxylabsAmazonProductScraperConfig(BaseModel):
    """
    Amazon Product Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/product
    """

    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )


class OxylabsAmazonProductScraperTool(BaseTool):
    """
    Scrape Amazon product pages with OxylabsAmazonProductScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsAmazonProductScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Amazon Product Scraper tool"
    description: str = "Scrape Amazon product pages with Oxylabs Amazon Product Scraper"
    args_schema: Type[BaseModel] = OxylabsAmazonProductScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsAmazonProductScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsAmazonProductScraperConfig
        | dict = OxylabsAmazonProductScraperConfig(),
        **kwargs,
    ) -> None:
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str) -> str:
        response = self.oxylabs_api.amazon.scrape_product(
            query,
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
@@ -0,0 +1,54 @@
# OxylabsAmazonSearchScraperTool

Scrape Amazon search results with `OxylabsAmazonSearchScraperTool`.

## Installation

```
pip install 'crewai[tools]' oxylabs
```

## Example

```python
from crewai_tools import OxylabsAmazonSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonSearchScraperTool()

result = tool.run(query="headsets")

print(result)
```

## Arguments

- `username`: Oxylabs username.
- `password`: Oxylabs password.

Get the credentials by creating an Oxylabs account [here](https://oxylabs.io).

## Advanced example

Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/search) to get the full list of parameters.

```python
from crewai_tools import OxylabsAmazonSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonSearchScraperTool(
    config={
        "domain": "nl",
        "start_page": 2,
        "pages": 2,
        "parse": True,
        "context": [
            {"key": "category_id", "value": 16391693031}
        ],
    }
)

result = tool.run(query="nirvana tshirt")

print(result)
```
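## Working with parsed results

When `parse` is enabled, the tool serializes the structured payload to a JSON string before returning it (its `_run` method calls `json.dumps` on dict content), so the result can be decoded with `json.loads`. A minimal sketch:

```python
import json

from crewai_tools import OxylabsAmazonSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsAmazonSearchScraperTool(config={"parse": True})

result = tool.run(query="headsets")

# with parse=True the structured results arrive as a JSON string
data = json.loads(result)
print(type(data))  # <class 'dict'>
```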
@@ -0,0 +1,153 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False


__all__ = ["OxylabsAmazonSearchScraperTool", "OxylabsAmazonSearchScraperConfig"]


class OxylabsAmazonSearchScraperArgs(BaseModel):
    query: str = Field(description="Amazon search term")


class OxylabsAmazonSearchScraperConfig(BaseModel):
    """
    Amazon Search Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/amazon/search
    """

    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    start_page: int | None = Field(None, description="The starting page number.")
    pages: int | None = Field(None, description="The number of pages to scrape.")
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )


class OxylabsAmazonSearchScraperTool(BaseTool):
    """
    Scrape Amazon search results with OxylabsAmazonSearchScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsAmazonSearchScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Amazon Search Scraper tool"
    description: str = "Scrape Amazon search results with Oxylabs Amazon Search Scraper"
    args_schema: Type[BaseModel] = OxylabsAmazonSearchScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsAmazonSearchScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsAmazonSearchScraperConfig
        | dict = OxylabsAmazonSearchScraperConfig(),
        **kwargs,
    ):
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str) -> str:
        response = self.oxylabs_api.amazon.scrape_search(
            query,
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
@@ -0,0 +1,50 @@
# OxylabsGoogleSearchScraperTool

Scrape Google Search results with `OxylabsGoogleSearchScraperTool`.

## Installation

```
pip install 'crewai[tools]' oxylabs
```

## Example

```python
from crewai_tools import OxylabsGoogleSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsGoogleSearchScraperTool()

result = tool.run(query="iPhone 16")

print(result)
```

## Arguments

- `username`: Oxylabs username.
- `password`: Oxylabs password.

Get the credentials by creating an Oxylabs account [here](https://oxylabs.io).

## Advanced example

Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/google/search/search) to get the full list of parameters.

```python
from crewai_tools import OxylabsGoogleSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsGoogleSearchScraperTool(
    config={
        "parse": True,
        "geo_location": "Paris, France",
        "user_agent_type": "tablet",
    }
)

result = tool.run(query="iPhone 16")

print(result)
```
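## Using the tool with an agent

Like any other crewAI tool, the scraper can be handed to an agent. The sketch below uses the standard `Agent`/`Task`/`Crew` setup from `crewai`; the role, goal, and task wording are illustrative only.

```python
from crewai import Agent, Crew, Task
from crewai_tools import OxylabsGoogleSearchScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
search_tool = OxylabsGoogleSearchScraperTool()

researcher = Agent(
    role="Market Researcher",
    goal="Summarize Google Search results for a given query",
    backstory="An analyst who relies on fresh search data.",
    tools=[search_tool],
)

task = Task(
    description="Search for 'iPhone 16' and summarize the top results.",
    expected_output="A short summary of the top search results.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())
```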
@@ -0,0 +1,156 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False


__all__ = ["OxylabsGoogleSearchScraperTool", "OxylabsGoogleSearchScraperConfig"]


class OxylabsGoogleSearchScraperArgs(BaseModel):
    query: str = Field(description="Search query")


class OxylabsGoogleSearchScraperConfig(BaseModel):
    """
    Google Search Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/targets/google/search/search
    """

    domain: str | None = Field(
        None, description="The domain to limit the search results to."
    )
    start_page: int | None = Field(None, description="The starting page number.")
    pages: int | None = Field(None, description="The number of pages to scrape.")
    limit: int | None = Field(
        None, description="Number of results to retrieve in each page."
    )
    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )


class OxylabsGoogleSearchScraperTool(BaseTool):
    """
    Scrape Google Search results with OxylabsGoogleSearchScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsGoogleSearchScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Google Search Scraper tool"
    description: str = "Scrape Google Search results with Oxylabs Google Search Scraper"
    args_schema: Type[BaseModel] = OxylabsGoogleSearchScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsGoogleSearchScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsGoogleSearchScraperConfig
        | dict = OxylabsGoogleSearchScraperConfig(),
        **kwargs,
    ):
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, query: str, **kwargs) -> str:
        response = self.oxylabs_api.google.scrape_search(
            query,
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
@@ -0,0 +1,69 @@
# OxylabsUniversalScraperTool

Scrape any website with `OxylabsUniversalScraperTool`.

## Installation

```
pip install 'crewai[tools]' oxylabs
```

## Example

```python
from crewai_tools import OxylabsUniversalScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsUniversalScraperTool()

result = tool.run(url="https://ip.oxylabs.io")

print(result)
```

## Arguments

- `username`: Oxylabs username.
- `password`: Oxylabs password.

Get the credentials by creating an Oxylabs account [here](https://oxylabs.io).

## Advanced example

Check out the Oxylabs [documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/other-websites) to get the full list of parameters.

```python
from crewai_tools import OxylabsUniversalScraperTool

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsUniversalScraperTool(
    config={
        "render": "html",
        "user_agent_type": "mobile",
        "context": [
            {"key": "force_headers", "value": True},
            {"key": "force_cookies", "value": True},
            {
                "key": "headers",
                "value": {
                    "Custom-Header-Name": "custom header content",
                },
            },
            {
                "key": "cookies",
                "value": [
                    {"key": "NID", "value": "1234567890"},
                    {"key": "1P JAR", "value": "0987654321"},
                ],
            },
            {"key": "http_method", "value": "get"},
            {"key": "follow_redirects", "value": True},
            {"key": "successful_status_codes", "value": [808, 909]},
        ],
    }
)

result = tool.run(url="https://ip.oxylabs.io")

print(result)
```
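## Using the typed configuration model

The advanced options can also be supplied as the `OxylabsUniversalScraperConfig` pydantic model that the tool module exports, instead of a plain dict. The import path below is assumed from the package layout introduced in this commit.

```python
from crewai_tools import OxylabsUniversalScraperTool
from crewai_tools.tools.oxylabs_universal_scraper_tool.oxylabs_universal_scraper_tool import (
    OxylabsUniversalScraperConfig,
)

# make sure the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables are set
tool = OxylabsUniversalScraperTool(
    config=OxylabsUniversalScraperConfig(render="html", user_agent_type="mobile"),
)

result = tool.run(url="https://ip.oxylabs.io")
print(result)
```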
@@ -0,0 +1,146 @@
import json
import os
from importlib.metadata import version
from platform import architecture, python_version
from typing import Any, List, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field

try:
    from oxylabs import RealtimeClient
    from oxylabs.sources.response import Response as OxylabsResponse

    OXYLABS_AVAILABLE = True
except ImportError:
    RealtimeClient = Any
    OxylabsResponse = Any

    OXYLABS_AVAILABLE = False


__all__ = ["OxylabsUniversalScraperTool", "OxylabsUniversalScraperConfig"]


class OxylabsUniversalScraperArgs(BaseModel):
    url: str = Field(description="Website URL")


class OxylabsUniversalScraperConfig(BaseModel):
    """
    Universal Scraper configuration options:
    https://developers.oxylabs.io/scraper-apis/web-scraper-api/other-websites
    """

    geo_location: str | None = Field(None, description="The Deliver to location.")
    user_agent_type: str | None = Field(None, description="Device type and browser.")
    render: str | None = Field(None, description="Enables JavaScript rendering.")
    callback_url: str | None = Field(None, description="URL to your callback endpoint.")
    context: list | None = Field(
        None,
        description="Additional advanced settings and controls for specialized requirements.",
    )
    parse: bool | None = Field(None, description="True will return structured data.")
    parsing_instructions: dict | None = Field(
        None, description="Instructions for parsing the results."
    )


class OxylabsUniversalScraperTool(BaseTool):
    """
    Scrape any website with OxylabsUniversalScraperTool.

    Get Oxylabs account:
    https://dashboard.oxylabs.io/en

    Args:
        username (str): Oxylabs username.
        password (str): Oxylabs password.
        config: Configuration options. See ``OxylabsUniversalScraperConfig``
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        validate_assignment=True,
    )
    name: str = "Oxylabs Universal Scraper tool"
    description: str = "Scrape any url with Oxylabs Universal Scraper"
    args_schema: Type[BaseModel] = OxylabsUniversalScraperArgs

    oxylabs_api: RealtimeClient
    config: OxylabsUniversalScraperConfig
    package_dependencies: List[str] = ["oxylabs"]

    def __init__(
        self,
        username: str | None = None,
        password: str | None = None,
        config: OxylabsUniversalScraperConfig | dict = OxylabsUniversalScraperConfig(),
        **kwargs,
    ):
        bits, _ = architecture()
        sdk_type = (
            f"oxylabs-crewai-sdk-python/"
            f"{version('crewai')} "
            f"({python_version()}; {bits})"
        )

        if username is None or password is None:
            username, password = self._get_credentials_from_env()

        if OXYLABS_AVAILABLE:
            # import RealtimeClient to make it accessible for the current scope
            from oxylabs import RealtimeClient

            kwargs["oxylabs_api"] = RealtimeClient(
                username=username,
                password=password,
                sdk_type=sdk_type,
            )
        else:
            import click

            if click.confirm(
                "You are missing the 'oxylabs' package. Would you like to install it?"
            ):
                import subprocess

                try:
                    subprocess.run(["uv", "add", "oxylabs"], check=True)
                    from oxylabs import RealtimeClient

                    kwargs["oxylabs_api"] = RealtimeClient(
                        username=username,
                        password=password,
                        sdk_type=sdk_type,
                    )
                except subprocess.CalledProcessError:
                    raise ImportError("Failed to install oxylabs package")
            else:
                raise ImportError(
                    "`oxylabs` package not found, please run `uv add oxylabs`"
                )

        super().__init__(config=config, **kwargs)

    def _get_credentials_from_env(self) -> tuple[str, str]:
        username = os.environ.get("OXYLABS_USERNAME")
        password = os.environ.get("OXYLABS_PASSWORD")
        if not username or not password:
            raise ValueError(
                "You must pass oxylabs username and password when instantiating the tool "
                "or specify OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables"
            )
        return username, password

    def _run(self, url: str) -> str:
        response = self.oxylabs_api.universal.scrape_url(
            url,
            **self.config.model_dump(exclude_none=True),
        )

        content = response.results[0].content

        if isinstance(content, dict):
            return json.dumps(content)

        return content
tests/tools/test_oxylabs_tools.py (new file, 163 lines)
@@ -0,0 +1,163 @@
import json
import os
from typing import Type
from unittest.mock import MagicMock

import pytest
from crewai.tools.base_tool import BaseTool
from oxylabs import RealtimeClient
from oxylabs.sources.response import Response as OxylabsResponse
from pydantic import BaseModel

from crewai_tools import (
    OxylabsAmazonProductScraperTool,
    OxylabsAmazonSearchScraperTool,
    OxylabsGoogleSearchScraperTool,
    OxylabsUniversalScraperTool,
)
from crewai_tools.tools.oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
    OxylabsAmazonProductScraperConfig,
)
from crewai_tools.tools.oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
    OxylabsGoogleSearchScraperConfig,
)


@pytest.fixture
def oxylabs_api() -> RealtimeClient:
    oxylabs_api_mock = MagicMock()

    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Scraping Sandbox</title>
    </head>
    <body>
        <div id="main">
            <div id="product-list">
                <div>
                    <p>Amazing product</p>
                    <p>Price $14.99</p>
                </div>
                <div>
                    <p>Good product</p>
                    <p>Price $9.99</p>
                </div>
            </div>
        </div>
    </body>
    </html>
    """

    json_content = {
        "results": {
            "products": [
                {"title": "Amazing product", "price": 14.99, "currency": "USD"},
                {"title": "Good product", "price": 9.99, "currency": "USD"},
            ],
        },
    }

    html_response = OxylabsResponse({"results": [{"content": html_content}]})
    json_response = OxylabsResponse({"results": [{"content": json_content}]})

    oxylabs_api_mock.universal.scrape_url.side_effect = [json_response, html_response]
    oxylabs_api_mock.amazon.scrape_search.side_effect = [json_response, html_response]
    oxylabs_api_mock.amazon.scrape_product.side_effect = [json_response, html_response]
    oxylabs_api_mock.google.scrape_search.side_effect = [json_response, html_response]

    return oxylabs_api_mock


@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization(tool_class: Type[BaseTool]):
    tool = tool_class(username="username", password="password")
    assert isinstance(tool, tool_class)


@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization_with_env_vars(tool_class: Type[BaseTool]):
    os.environ["OXYLABS_USERNAME"] = "username"
    os.environ["OXYLABS_PASSWORD"] = "password"

    tool = tool_class()
    assert isinstance(tool, tool_class)

    del os.environ["OXYLABS_USERNAME"]
    del os.environ["OXYLABS_PASSWORD"]


@pytest.mark.parametrize(
    ("tool_class",),
    [
        (OxylabsUniversalScraperTool,),
        (OxylabsAmazonSearchScraperTool,),
        (OxylabsGoogleSearchScraperTool,),
        (OxylabsAmazonProductScraperTool,),
    ],
)
def test_tool_initialization_failure(tool_class: Type[BaseTool]):
    # making sure env vars are not set
    for key in ["OXYLABS_USERNAME", "OXYLABS_PASSWORD"]:
        if key in os.environ:
            del os.environ[key]

    with pytest.raises(ValueError):
        tool_class()


@pytest.mark.parametrize(
    ("tool_class", "tool_config"),
    [
        (OxylabsUniversalScraperTool, {"geo_location": "Paris, France"}),
        (
            OxylabsAmazonSearchScraperTool,
            {"domain": "co.uk"},
        ),
        (
            OxylabsGoogleSearchScraperTool,
            OxylabsGoogleSearchScraperConfig(render="html"),
        ),
        (
            OxylabsAmazonProductScraperTool,
            OxylabsAmazonProductScraperConfig(parse=True),
        ),
    ],
)
def test_tool_invocation(
    tool_class: Type[BaseTool],
    tool_config: BaseModel,
    oxylabs_api: RealtimeClient,
):
    tool = tool_class(username="username", password="password", config=tool_config)

    # setting via __dict__ to bypass pydantic validation
    tool.__dict__["oxylabs_api"] = oxylabs_api

    # verifying parsed job returns json content
    result = tool.run("Scraping Query 1")
    assert isinstance(result, str)
    assert isinstance(json.loads(result), dict)

    # verifying raw job returns str
    result = tool.run("Scraping Query 2")
    assert isinstance(result, str)
    assert "<!DOCTYPE html>" in result