From c96d4a6823fb5a3ef196427612b784efe504fdce Mon Sep 17 00:00:00 2001 From: Rostyslav Borovyk Date: Mon, 23 Jun 2025 20:58:16 +0300 Subject: [PATCH] Add Oxylabs Web Scraping tools (#2905) * Add Oxylabs tools * Review updates * Review updates --------- Co-authored-by: Tony Kipkemboi --- docs/docs.json | 2 +- docs/tools/web-scraping/overview.mdx | 6 +- .../web-scraping/oxylabsscraperstool.mdx | 236 ++++++++++++++++++ 3 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 docs/tools/web-scraping/oxylabsscraperstool.mdx diff --git a/docs/docs.json b/docs/docs.json index 87e3bf2a7..1f949e35e 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -134,7 +134,7 @@ "tools/web-scraping/stagehandtool", "tools/web-scraping/firecrawlcrawlwebsitetool", "tools/web-scraping/firecrawlscrapewebsitetool", - "tools/web-scraping/firecrawlsearchtool" + "tools/web-scraping/oxylabsscraperstool" ] }, { diff --git a/docs/tools/web-scraping/overview.mdx b/docs/tools/web-scraping/overview.mdx index 8e920f493..e44aba4ba 100644 --- a/docs/tools/web-scraping/overview.mdx +++ b/docs/tools/web-scraping/overview.mdx @@ -56,6 +56,10 @@ These tools enable your agents to interact with the web, extract data from websi Intelligent browser automation with natural language commands. + + + Access web data at scale with Oxylabs. + ## **Common Use Cases** @@ -100,4 +104,4 @@ agent = Agent( - **JavaScript-Heavy Sites**: Use `SeleniumScrapingTool` for dynamic content - **Scale & Performance**: Use `FirecrawlScrapeWebsiteTool` for high-volume scraping - **Cloud Infrastructure**: Use `BrowserBaseLoadTool` for scalable browser automation -- **Complex Workflows**: Use `StagehandTool` for intelligent browser interactions \ No newline at end of file +- **Complex Workflows**: Use `StagehandTool` for intelligent browser interactions diff --git a/docs/tools/web-scraping/oxylabsscraperstool.mdx b/docs/tools/web-scraping/oxylabsscraperstool.mdx new file mode 100644 index 000000000..6c90f0908 --- /dev/null +++ b/docs/tools/web-scraping/oxylabsscraperstool.mdx @@ -0,0 +1,236 @@ +--- +title: Oxylabs Scrapers +description: > + Oxylabs Scrapers allow to easily access the information from the respective sources. Please see the list of available sources below: + - `Amazon Product` + - `Amazon Search` + - `Google Seach` + - `Universal` +icon: globe +--- + +## Installation + +Get the credentials by creating an Oxylabs Account [here](https://oxylabs.io). +```shell +pip install 'crewai[tools]' oxylabs +``` +Check [Oxylabs Documentation](https://developers.oxylabs.io/scraping-solutions/web-scraper-api/targets) to get more information about API parameters. + +# `OxylabsAmazonProductScraperTool` + +### Example + +```python +from crewai_tools import OxylabsAmazonProductScraperTool + +# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set +tool = OxylabsAmazonProductScraperTool() + +result = tool.run(query="AAAAABBBBCC") + +print(result) +``` + +### Parameters + +- `query` - 10-symbol ASIN code. +- `domain` - domain localization for Amazon. +- `geo_location` - the _Deliver to_ location. +- `user_agent_type` - device type and browser. +- `render` - enables JavaScript rendering when set to `html`. +- `callback_url` - URL to your callback endpoint. +- `context` - Additional advanced settings and controls for specialized requirements. +- `parse` - returns parsed data when set to true. +- `parsing_instructions` - define your own parsing and data transformation logic that will be executed on an HTML scraping result. + +### Advanced example + +```python +from crewai_tools import OxylabsAmazonProductScraperTool + +# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set +tool = OxylabsAmazonProductScraperTool( + config={ + "domain": "com", + "parse": True, + "context": [ + { + "key": "autoselect_variant", + "value": True + } + ] + } +) + +result = tool.run(query="AAAAABBBBCC") + +print(result) +``` + +# `OxylabsAmazonSearchScraperTool` + +### Example + +```python +from crewai_tools import OxylabsAmazonSearchScraperTool + +# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set +tool = OxylabsAmazonSearchScraperTool() + +result = tool.run(query="headsets") + +print(result) +``` + +### Parameters + +- `query` - Amazon search term. +- `domain` - Domain localization for Bestbuy. +- `start_page` - starting page number. +- `pages` - number of pages to retrieve. +- `geo_location` - the _Deliver to_ location. +- `user_agent_type` - device type and browser. +- `render` - enables JavaScript rendering when set to `html`. +- `callback_url` - URL to your callback endpoint. +- `context` - Additional advanced settings and controls for specialized requirements. +- `parse` - returns parsed data when set to true. +- `parsing_instructions` - define your own parsing and data transformation logic that will be executed on an HTML scraping result. + +### Advanced example + +```python +from crewai_tools import OxylabsAmazonSearchScraperTool + +# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set +tool = OxylabsAmazonSearchScraperTool( + config={ + "domain": 'nl', + "start_page": 2, + "pages": 2, + "parse": True, + "context": [ + {'key': 'category_id', 'value': 16391693031} + ], + } +) + +result = tool.run(query='nirvana tshirt') + +print(result) +``` + +# `OxylabsGoogleSearchScraperTool` + +### Example + +```python +from crewai_tools import OxylabsGoogleSearchScraperTool + +# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set +tool = OxylabsGoogleSearchScraperTool() + +result = tool.run(query="iPhone 16") + +print(result) +``` + +### Parameters + +- `query` - search keyword. +- `domain` - domain localization for Google. +- `start_page` - starting page number. +- `pages` - number of pages to retrieve. +- `limit` - number of results to retrieve in each page. +- `locale` - `Accept-Language` header value which changes your Google search page web interface language. +- `geo_location` - the geographical location that the result should be adapted for. Using this parameter correctly is extremely important to get the right data. +- `user_agent_type` - device type and browser. +- `render` - enables JavaScript rendering when set to `html`. +- `callback_url` - URL to your callback endpoint. +- `context` - Additional advanced settings and controls for specialized requirements. +- `parse` - returns parsed data when set to true. +- `parsing_instructions` - define your own parsing and data transformation logic that will be executed on an HTML scraping result. + +### Advanced example + +```python +from crewai_tools import OxylabsGoogleSearchScraperTool + +# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set +tool = OxylabsGoogleSearchScraperTool( + config={ + "parse": True, + "geo_location": "Paris, France", + "user_agent_type": "tablet", + } +) + +result = tool.run(query="iPhone 16") + +print(result) +``` + +# `OxylabsUniversalScraperTool` + +### Example + +```python +from crewai_tools import OxylabsUniversalScraperTool + +# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set +tool = OxylabsUniversalScraperTool() + +result = tool.run(url="https://ip.oxylabs.io") + +print(result) +``` + +### Parameters + +- `url` - website url to scrape. +- `user_agent_type` - device type and browser. +- `geo_location` - sets the proxy's geolocation to retrieve data. +- `render` - enables JavaScript rendering when set to `html`. +- `callback_url` - URL to your callback endpoint. +- `context` - Additional advanced settings and controls for specialized requirements. +- `parse` - returns parsed data when set to `true`, as long as a dedicated parser exists for the submitted URL's page type. +- `parsing_instructions` - define your own parsing and data transformation logic that will be executed on an HTML scraping result. + + +### Advanced example + +```python +from crewai_tools import OxylabsUniversalScraperTool + +# make sure OXYLABS_USERNAME and OXYLABS_PASSWORD variables are set +tool = OxylabsUniversalScraperTool( + config={ + "render": "html", + "user_agent_type": "mobile", + "context": [ + {"key": "force_headers", "value": True}, + {"key": "force_cookies", "value": True}, + { + "key": "headers", + "value": { + "Custom-Header-Name": "custom header content", + }, + }, + { + "key": "cookies", + "value": [ + {"key": "NID", "value": "1234567890"}, + {"key": "1P JAR", "value": "0987654321"}, + ], + }, + {"key": "http_method", "value": "get"}, + {"key": "follow_redirects", "value": True}, + {"key": "successful_status_codes", "value": [808, 909]}, + ], + } +) + +result = tool.run(url="https://ip.oxylabs.io") + +print(result) +```