# crewAI/src/crewai_tools/tools/brightdata_tool/brightdata_dataset.py
import asyncio
import os
from typing import Any, Dict, Optional, Type

import aiohttp
from crewai.tools import BaseTool
from pydantic import BaseModel, Field


class BrightDataConfig(BaseModel):
    """Runtime configuration for the Bright Data Dataset API, overridable via environment variables."""

    API_URL: str = "https://api.brightdata.com"
    DEFAULT_TIMEOUT: int = 600
    DEFAULT_POLLING_INTERVAL: int = 1

    @classmethod
    def from_env(cls):
        return cls(
            API_URL=os.environ.get("BRIGHTDATA_API_URL", "https://api.brightdata.com"),
            DEFAULT_TIMEOUT=int(os.environ.get("BRIGHTDATA_DEFAULT_TIMEOUT", "600")),
            DEFAULT_POLLING_INTERVAL=int(
                os.environ.get("BRIGHTDATA_DEFAULT_POLLING_INTERVAL", "1")
            ),
        )
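
# Illustrative only (not part of the original module): the defaults above can be
# tuned through environment variables before this module is imported, e.g.
#   BRIGHTDATA_API_URL=https://api.brightdata.com
#   BRIGHTDATA_DEFAULT_TIMEOUT=900
#   BRIGHTDATA_DEFAULT_POLLING_INTERVAL=2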


class BrightDataDatasetToolException(Exception):
    """Exception raised for errors encountered while calling the Bright Data Dataset API."""

    def __init__(self, message, error_code):
        self.message = message
        super().__init__(message)
        self.error_code = error_code

    def __str__(self):
        return f"{self.message} (Error Code: {self.error_code})"


class BrightDataDatasetToolSchema(BaseModel):
    """
    Schema for validating input parameters for the BrightDataDatasetTool.

    Attributes:
        dataset_type (str): Required Bright Data dataset type, used to specify which dataset to access.
        format (Optional[str]): Response format ("json" by default). Supported formats: json, ndjson, jsonl, csv.
        url (str): The URL from which structured data should be extracted.
        zipcode (Optional[str]): An optional ZIP code to narrow down the data geographically.
        additional_params (Optional[Dict[str, Any]]): Extra parameters for the Bright Data API call.
    """

    dataset_type: str = Field(..., description="The Bright Data Dataset Type")
    format: Optional[str] = Field(
        default="json", description="Response format (json by default)"
    )
    url: str = Field(..., description="The URL to extract data from")
    zipcode: Optional[str] = Field(default=None, description="Optional zipcode")
    additional_params: Optional[Dict[str, Any]] = Field(
        default=None, description="Additional params if any"
    )
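
# Example payload accepted by the schema above (illustrative values only; the
# product URL is a placeholder, not a real listing):
#   {"dataset_type": "amazon_product", "url": "https://www.amazon.com/dp/B000000000", "format": "json"}
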
config = BrightDataConfig.from_env()
BRIGHTDATA_API_URL = config.API_URL
timeout = config.DEFAULT_TIMEOUT

datasets = [
    {
        "id": "amazon_product",
        "dataset_id": "gd_l7q7dkf244hwjntr0",
        "description": "\n".join([
            "Quickly read structured amazon product data.",
            "Requires a valid product URL with /dp/ in it.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "amazon_product_reviews",
        "dataset_id": "gd_le8e811kzy4ggddlq",
        "description": "\n".join([
            "Quickly read structured amazon product review data.",
            "Requires a valid product URL with /dp/ in it.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "amazon_product_search",
        "dataset_id": "gd_lwdb4vjm1ehb499uxs",
        "description": "\n".join([
            "Quickly read structured amazon product search data.",
            "Requires a valid search keyword and amazon domain URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["keyword", "url", "pages_to_search"],
        "defaults": {"pages_to_search": "1"},
    },
    {
        "id": "walmart_product",
        "dataset_id": "gd_l95fol7l1ru6rlo116",
        "description": "\n".join([
            "Quickly read structured walmart product data.",
            "Requires a valid product URL with /ip/ in it.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "walmart_seller",
        "dataset_id": "gd_m7ke48w81ocyu4hhz0",
        "description": "\n".join([
            "Quickly read structured walmart seller data.",
            "Requires a valid walmart seller URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "ebay_product",
        "dataset_id": "gd_ltr9mjt81n0zzdk1fb",
        "description": "\n".join([
            "Quickly read structured ebay product data.",
            "Requires a valid ebay product URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "homedepot_products",
        "dataset_id": "gd_lmusivh019i7g97q2n",
        "description": "\n".join([
            "Quickly read structured homedepot product data.",
            "Requires a valid homedepot product URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "zara_products",
        "dataset_id": "gd_lct4vafw1tgx27d4o0",
        "description": "\n".join([
            "Quickly read structured zara product data.",
            "Requires a valid zara product URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "etsy_products",
        "dataset_id": "gd_ltppk0jdv1jqz25mz",
        "description": "\n".join([
            "Quickly read structured etsy product data.",
            "Requires a valid etsy product URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "bestbuy_products",
        "dataset_id": "gd_ltre1jqe1jfr7cccf",
        "description": "\n".join([
            "Quickly read structured bestbuy product data.",
            "Requires a valid bestbuy product URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_person_profile",
        "dataset_id": "gd_l1viktl72bvl7bjuj0",
        "description": "\n".join([
            "Quickly read structured linkedin people profile data.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_company_profile",
        "dataset_id": "gd_l1vikfnt1wgvvqz95w",
        "description": "\n".join([
            "Quickly read structured linkedin company profile data",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_job_listings",
        "dataset_id": "gd_lpfll7v5hcqtkxl6l",
        "description": "\n".join([
            "Quickly read structured linkedin job listings data",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_posts",
        "dataset_id": "gd_lyy3tktm25m4avu764",
        "description": "\n".join([
            "Quickly read structured linkedin posts data",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_people_search",
        "dataset_id": "gd_m8d03he47z8nwb5xc",
        "description": "\n".join([
            "Quickly read structured linkedin people search data",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url", "first_name", "last_name"],
    },
    {
        "id": "crunchbase_company",
        "dataset_id": "gd_l1vijqt9jfj7olije",
        "description": "\n".join([
            "Quickly read structured crunchbase company data",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "zoominfo_company_profile",
        "dataset_id": "gd_m0ci4a4ivx3j5l6nx",
        "description": "\n".join([
            "Quickly read structured ZoomInfo company profile data.",
            "Requires a valid ZoomInfo company URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "instagram_profiles",
        "dataset_id": "gd_l1vikfch901nx3by4",
        "description": "\n".join([
            "Quickly read structured Instagram profile data.",
            "Requires a valid Instagram URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "instagram_posts",
        "dataset_id": "gd_lk5ns7kz21pck8jpis",
        "description": "\n".join([
            "Quickly read structured Instagram post data.",
            "Requires a valid Instagram URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "instagram_reels",
        "dataset_id": "gd_lyclm20il4r5helnj",
        "description": "\n".join([
            "Quickly read structured Instagram reel data.",
            "Requires a valid Instagram URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "instagram_comments",
        "dataset_id": "gd_ltppn085pokosxh13",
        "description": "\n".join([
            "Quickly read structured Instagram comments data.",
            "Requires a valid Instagram URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "facebook_posts",
        "dataset_id": "gd_lyclm1571iy3mv57zw",
        "description": "\n".join([
            "Quickly read structured Facebook post data.",
            "Requires a valid Facebook post URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "facebook_marketplace_listings",
        "dataset_id": "gd_lvt9iwuh6fbcwmx1a",
        "description": "\n".join([
            "Quickly read structured Facebook marketplace listing data.",
            "Requires a valid Facebook marketplace listing URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "facebook_company_reviews",
        "dataset_id": "gd_m0dtqpiu1mbcyc2g86",
        "description": "\n".join([
            "Quickly read structured Facebook company reviews data.",
            "Requires a valid Facebook company URL and number of reviews.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url", "num_of_reviews"],
    },
    {
        "id": "facebook_events",
        "dataset_id": "gd_m14sd0to1jz48ppm51",
        "description": "\n".join([
            "Quickly read structured Facebook events data.",
            "Requires a valid Facebook event URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "tiktok_profiles",
        "dataset_id": "gd_l1villgoiiidt09ci",
        "description": "\n".join([
            "Quickly read structured Tiktok profiles data.",
            "Requires a valid Tiktok profile URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "tiktok_posts",
        "dataset_id": "gd_lu702nij2f790tmv9h",
        "description": "\n".join([
            "Quickly read structured Tiktok post data.",
            "Requires a valid Tiktok post URL.",
            "This can be a cache lookup, so it can be more reliable than scraping",
        ]),
        "inputs": ["url"],
    },
    {
        "id": "tiktok_shop",
        "dataset_id": "gd_m45m1u911dsa4274pi",
        "description": "\n".join([
            "Quickly read structured Tiktok shop data.",
            "Requires a valid Tiktok shop product URL.",
            "This can be a cache lookup...",
        ]),
        "inputs": ["url"],
    },
]
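
# Note (illustrative, not from the original source): each entry above maps a
# tool-facing "id" to a Bright Data "dataset_id". Inputs beyond "url"
# (e.g. "pages_to_search" for amazon_product_search, or "first_name"/"last_name"
# for linkedin_people_search) are supplied through the tool's `additional_params`
# argument, for example:
#   additional_params={"keyword": "mechanical keyboard", "pages_to_search": "1"}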


class BrightDataDatasetTool(BaseTool):
    """
    CrewAI-compatible tool for scraping structured data using Bright Data Datasets.

    Attributes:
        name (str): Tool name displayed in the CrewAI environment.
        description (str): Tool description shown to agents or users.
        args_schema (Type[BaseModel]): Pydantic schema for validating input arguments.
    """

    name: str = "Bright Data Dataset Tool"
    description: str = "Scrapes structured data using the Bright Data Dataset API from a URL and optional input parameters"
    args_schema: Type[BaseModel] = BrightDataDatasetToolSchema
    dataset_type: Optional[str] = None
    url: Optional[str] = None
    format: str = "json"
    zipcode: Optional[str] = None
    additional_params: Optional[Dict[str, Any]] = None

    def __init__(
        self,
        dataset_type: Optional[str] = None,
        url: Optional[str] = None,
        format: str = "json",
        zipcode: Optional[str] = None,
        additional_params: Optional[Dict[str, Any]] = None,
    ):
        super().__init__()
        self.dataset_type = dataset_type
        self.url = url
        self.format = format
        self.zipcode = zipcode
        self.additional_params = additional_params

    def filter_dataset_by_id(self, target_id):
        """Return the registry entries whose "id" matches target_id."""
        return [dataset for dataset in datasets if dataset["id"] == target_id]

    async def get_dataset_data_async(
        self,
        dataset_type: str,
        output_format: str,
        url: str,
        zipcode: Optional[str] = None,
        additional_params: Optional[Dict[str, Any]] = None,
        polling_interval: int = 1,
    ) -> str:
        """
        Asynchronously trigger a Bright Data dataset scrape and poll until it completes.

        Args:
            dataset_type (str): Bright Data dataset type.
            output_format (str): Response format (json, ndjson, jsonl, or csv).
            url (str): Target URL to scrape.
            zipcode (Optional[str]): Optional ZIP code for geo-specific data.
            additional_params (Optional[Dict[str, Any]]): Extra API parameters.
            polling_interval (int): Time interval in seconds between polling attempts.

        Returns:
            str: Structured dataset result from Bright Data, serialized in the requested format.

        Raises:
            BrightDataDatasetToolException: If any API step fails or the job fails.
            ValueError: If the dataset type is unknown.
            TimeoutError: If polling times out before job completion.
        """
        request_data = {"url": url}
        if zipcode is not None:
            request_data["zipcode"] = zipcode

        # Set additional parameters dynamically, depending on the dataset being requested
        if additional_params:
            request_data.update(additional_params)

        api_key = os.getenv("BRIGHT_DATA_API_KEY")
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

        dataset = self.filter_dataset_by_id(dataset_type)
        if len(dataset) == 1:
            dataset_id = dataset[0]["dataset_id"]
        else:
            raise ValueError(
                f"Unable to find the dataset for {dataset_type}. Please make sure to pass a valid one"
            )

        async with aiohttp.ClientSession() as session:
            # Step 1: Trigger job
            async with session.post(
                f"{BRIGHTDATA_API_URL}/datasets/v3/trigger",
                params={"dataset_id": dataset_id, "include_errors": "true"},
                json=[request_data],
                headers=headers,
            ) as trigger_response:
                if trigger_response.status != 200:
                    raise BrightDataDatasetToolException(
                        f"Trigger failed: {await trigger_response.text()}",
                        trigger_response.status,
                    )
                trigger_data = await trigger_response.json()
                print(trigger_data)
                snapshot_id = trigger_data.get("snapshot_id")

            # Step 2: Poll for completion
            elapsed = 0
            while elapsed < timeout:
                await asyncio.sleep(polling_interval)
                elapsed += polling_interval
                async with session.get(
                    f"{BRIGHTDATA_API_URL}/datasets/v3/progress/{snapshot_id}",
                    headers=headers,
                ) as status_response:
                    if status_response.status != 200:
                        raise BrightDataDatasetToolException(
                            f"Status check failed: {await status_response.text()}",
                            status_response.status,
                        )
                    status_data = await status_response.json()
                    if status_data.get("status") == "ready":
                        print("Job is ready")
                        break
                    elif status_data.get("status") == "error":
                        raise BrightDataDatasetToolException(
                            f"Job failed: {status_data}", 0
                        )
            else:
                # while/else: reached only when the loop exhausts the timeout without a break
                raise TimeoutError("Polling timed out before job completed.")

            # Step 3: Retrieve result
            async with session.get(
                f"{BRIGHTDATA_API_URL}/datasets/v3/snapshot/{snapshot_id}",
                params={"format": output_format},
                headers=headers,
            ) as snapshot_response:
                if snapshot_response.status != 200:
                    raise BrightDataDatasetToolException(
                        f"Result fetch failed: {await snapshot_response.text()}",
                        snapshot_response.status,
                    )
                return await snapshot_response.text()

    def _run(
        self,
        url: Optional[str] = None,
        dataset_type: Optional[str] = None,
        format: Optional[str] = None,
        zipcode: Optional[str] = None,
        additional_params: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Any:
        # Arguments passed at call time take precedence over constructor defaults
        dataset_type = dataset_type or self.dataset_type
        output_format = format or self.format
        url = url or self.url
        zipcode = zipcode or self.zipcode
        additional_params = additional_params or self.additional_params

        if not dataset_type:
            raise ValueError("dataset_type is required either in the constructor or the method call")
        if not url:
            raise ValueError("url is required either in the constructor or the method call")

        valid_output_formats = {"json", "ndjson", "jsonl", "csv"}
        if output_format not in valid_output_formats:
            raise ValueError(
                f"Unsupported output format: {output_format}. Must be one of {', '.join(valid_output_formats)}."
            )

        api_key = os.getenv("BRIGHT_DATA_API_KEY")
        if not api_key:
            raise ValueError("BRIGHT_DATA_API_KEY environment variable is required.")

        try:
            return asyncio.run(
                self.get_dataset_data_async(
                    dataset_type=dataset_type,
                    output_format=output_format,
                    url=url,
                    zipcode=zipcode,
                    additional_params=additional_params,
                )
            )
        except TimeoutError as e:
            return f"Timeout exception occurred in get_dataset_data_async. Details - {str(e)}"
        except BrightDataDatasetToolException as e:
            return f"Exception occurred in get_dataset_data_async. Details - {str(e)}"
        except Exception as e:
            return f"Bright Data API error: {str(e)}"