import asyncio
import os
from typing import Any, Dict, Optional, Type

import aiohttp
from crewai.tools import BaseTool
from pydantic import BaseModel, Field


class BrightDataConfig(BaseModel):
    API_URL: str = "https://api.brightdata.com"
    DEFAULT_TIMEOUT: int = 600
    DEFAULT_POLLING_INTERVAL: int = 1

    @classmethod
    def from_env(cls):
        return cls(
            API_URL=os.environ.get("BRIGHTDATA_API_URL", "https://api.brightdata.com"),
            DEFAULT_TIMEOUT=int(os.environ.get("BRIGHTDATA_DEFAULT_TIMEOUT", "600")),
            DEFAULT_POLLING_INTERVAL=int(os.environ.get("BRIGHTDATA_DEFAULT_POLLING_INTERVAL", "1")),
        )

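# A minimal configuration sketch. These are the environment variable names read by
# BrightDataConfig.from_env() above; the values shown are illustrative defaults, not
# required settings.
#
#   export BRIGHTDATA_API_URL="https://api.brightdata.com"
#   export BRIGHTDATA_DEFAULT_TIMEOUT="600"
#   export BRIGHTDATA_DEFAULT_POLLING_INTERVAL="1"
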
class BrightDataDatasetToolException(Exception):
    """Exception raised for errors encountered while calling the Bright Data Dataset API."""

    def __init__(self, message, error_code):
        self.message = message
        super().__init__(message)
        self.error_code = error_code

    def __str__(self):
        return f"{self.message} (Error Code: {self.error_code})"


class BrightDataDatasetToolSchema(BaseModel):
    """
    Schema for validating input parameters for the BrightDataDatasetTool.

    Attributes:
        dataset_type (str): Required Bright Data dataset type specifying which dataset to access.
        format (Optional[str]): Response format ("json" by default). Supported formats: json, ndjson, jsonl, csv.
        url (str): The URL from which structured data needs to be extracted.
        zipcode (Optional[str]): An optional ZIP code to narrow down the data geographically.
        additional_params (Optional[Dict]): Extra parameters for the Bright Data API call.
    """

    dataset_type: str = Field(..., description="The Bright Data Dataset Type")
    format: Optional[str] = Field(
        default="json", description="Response format (json by default)"
    )
    url: str = Field(..., description="The URL to extract data from")
    zipcode: Optional[str] = Field(default=None, description="Optional zipcode")
    additional_params: Optional[Dict[str, Any]] = Field(
        default=None, description="Additional params if any"
    )

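# A hypothetical example of the arguments this schema accepts; the URL and the extra
# parameter are illustrative only.
#
#   BrightDataDatasetToolSchema(
#       dataset_type="amazon_product",
#       format="json",
#       url="https://www.amazon.com/dp/B0EXAMPLE",
#       additional_params={"language": "en"},
#   )
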
config = BrightDataConfig.from_env()

BRIGHTDATA_API_URL = config.API_URL
timeout = config.DEFAULT_TIMEOUT

datasets = [
    {
        "id": "amazon_product",
        "dataset_id": "gd_l7q7dkf244hwjntr0",
        "description": "\n".join(
            [
                "Quickly read structured amazon product data.",
                "Requires a valid product URL with /dp/ in it.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "amazon_product_reviews",
        "dataset_id": "gd_le8e811kzy4ggddlq",
        "description": "\n".join(
            [
                "Quickly read structured amazon product review data.",
                "Requires a valid product URL with /dp/ in it.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "amazon_product_search",
        "dataset_id": "gd_lwdb4vjm1ehb499uxs",
        "description": "\n".join(
            [
                "Quickly read structured amazon product search data.",
                "Requires a valid search keyword and amazon domain URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["keyword", "url", "pages_to_search"],
        "defaults": {"pages_to_search": "1"},
    },
    {
        "id": "walmart_product",
        "dataset_id": "gd_l95fol7l1ru6rlo116",
        "description": "\n".join(
            [
                "Quickly read structured walmart product data.",
                "Requires a valid product URL with /ip/ in it.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "walmart_seller",
        "dataset_id": "gd_m7ke48w81ocyu4hhz0",
        "description": "\n".join(
            [
                "Quickly read structured walmart seller data.",
                "Requires a valid walmart seller URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "ebay_product",
        "dataset_id": "gd_ltr9mjt81n0zzdk1fb",
        "description": "\n".join(
            [
                "Quickly read structured ebay product data.",
                "Requires a valid ebay product URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "homedepot_products",
        "dataset_id": "gd_lmusivh019i7g97q2n",
        "description": "\n".join(
            [
                "Quickly read structured homedepot product data.",
                "Requires a valid homedepot product URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "zara_products",
        "dataset_id": "gd_lct4vafw1tgx27d4o0",
        "description": "\n".join(
            [
                "Quickly read structured zara product data.",
                "Requires a valid zara product URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "etsy_products",
        "dataset_id": "gd_ltppk0jdv1jqz25mz",
        "description": "\n".join(
            [
                "Quickly read structured etsy product data.",
                "Requires a valid etsy product URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "bestbuy_products",
        "dataset_id": "gd_ltre1jqe1jfr7cccf",
        "description": "\n".join(
            [
                "Quickly read structured bestbuy product data.",
                "Requires a valid bestbuy product URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_person_profile",
        "dataset_id": "gd_l1viktl72bvl7bjuj0",
        "description": "\n".join(
            [
                "Quickly read structured linkedin people profile data.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_company_profile",
        "dataset_id": "gd_l1vikfnt1wgvvqz95w",
        "description": "\n".join(
            [
                "Quickly read structured linkedin company profile data",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_job_listings",
        "dataset_id": "gd_lpfll7v5hcqtkxl6l",
        "description": "\n".join(
            [
                "Quickly read structured linkedin job listings data",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_posts",
        "dataset_id": "gd_lyy3tktm25m4avu764",
        "description": "\n".join(
            [
                "Quickly read structured linkedin posts data",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "linkedin_people_search",
        "dataset_id": "gd_m8d03he47z8nwb5xc",
        "description": "\n".join(
            [
                "Quickly read structured linkedin people search data",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url", "first_name", "last_name"],
    },
    {
        "id": "crunchbase_company",
        "dataset_id": "gd_l1vijqt9jfj7olije",
        "description": "\n".join(
            [
                "Quickly read structured crunchbase company data",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "zoominfo_company_profile",
        "dataset_id": "gd_m0ci4a4ivx3j5l6nx",
        "description": "\n".join(
            [
                "Quickly read structured ZoomInfo company profile data.",
                "Requires a valid ZoomInfo company URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "instagram_profiles",
        "dataset_id": "gd_l1vikfch901nx3by4",
        "description": "\n".join(
            [
                "Quickly read structured Instagram profile data.",
                "Requires a valid Instagram URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "instagram_posts",
        "dataset_id": "gd_lk5ns7kz21pck8jpis",
        "description": "\n".join(
            [
                "Quickly read structured Instagram post data.",
                "Requires a valid Instagram URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "instagram_reels",
        "dataset_id": "gd_lyclm20il4r5helnj",
        "description": "\n".join(
            [
                "Quickly read structured Instagram reel data.",
                "Requires a valid Instagram URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "instagram_comments",
        "dataset_id": "gd_ltppn085pokosxh13",
        "description": "\n".join(
            [
                "Quickly read structured Instagram comments data.",
                "Requires a valid Instagram URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "facebook_posts",
        "dataset_id": "gd_lyclm1571iy3mv57zw",
        "description": "\n".join(
            [
                "Quickly read structured Facebook post data.",
                "Requires a valid Facebook post URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "facebook_marketplace_listings",
        "dataset_id": "gd_lvt9iwuh6fbcwmx1a",
        "description": "\n".join(
            [
                "Quickly read structured Facebook marketplace listing data.",
                "Requires a valid Facebook marketplace listing URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "facebook_company_reviews",
        "dataset_id": "gd_m0dtqpiu1mbcyc2g86",
        "description": "\n".join(
            [
                "Quickly read structured Facebook company reviews data.",
                "Requires a valid Facebook company URL and number of reviews.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url", "num_of_reviews"],
    },
    {
        "id": "facebook_events",
        "dataset_id": "gd_m14sd0to1jz48ppm51",
        "description": "\n".join(
            [
                "Quickly read structured Facebook events data.",
                "Requires a valid Facebook event URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "tiktok_profiles",
        "dataset_id": "gd_l1villgoiiidt09ci",
        "description": "\n".join(
            [
                "Quickly read structured Tiktok profiles data.",
                "Requires a valid Tiktok profile URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "tiktok_posts",
        "dataset_id": "gd_lu702nij2f790tmv9h",
        "description": "\n".join(
            [
                "Quickly read structured Tiktok post data.",
                "Requires a valid Tiktok post URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
    {
        "id": "tiktok_shop",
        "dataset_id": "gd_m45m1u911dsa4274pi",
        "description": "\n".join(
            [
                "Quickly read structured Tiktok shop data.",
                "Requires a valid Tiktok shop product URL.",
                "This can be a cache lookup, so it can be more reliable than scraping",
            ]
        ),
        "inputs": ["url"],
    },
]

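# A small illustrative lookup, mirroring what BrightDataDatasetTool.filter_dataset_by_id()
# does below: a dataset_type such as "amazon_product" resolves to its Bright Data dataset_id.
#
#   matches = [d for d in datasets if d["id"] == "amazon_product"]
#   dataset_id = matches[0]["dataset_id"]  # "gd_l7q7dkf244hwjntr0"
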
class BrightDataDatasetTool(BaseTool):
    """
    CrewAI-compatible tool for scraping structured data using Bright Data Datasets.

    Attributes:
        name (str): Tool name displayed in the CrewAI environment.
        description (str): Tool description shown to agents or users.
        args_schema (Type[BaseModel]): Pydantic schema for validating input arguments.
    """

    name: str = "Bright Data Dataset Tool"
    description: str = "Scrapes structured data using Bright Data Dataset API from a URL and optional input parameters"
    args_schema: Type[BaseModel] = BrightDataDatasetToolSchema
    dataset_type: Optional[str] = None
    url: Optional[str] = None
    format: str = "json"
    zipcode: Optional[str] = None
    additional_params: Optional[Dict[str, Any]] = None

    def __init__(
        self,
        dataset_type: Optional[str] = None,
        url: Optional[str] = None,
        format: str = "json",
        zipcode: Optional[str] = None,
        additional_params: Optional[Dict[str, Any]] = None,
    ):
        super().__init__()
        self.dataset_type = dataset_type
        self.url = url
        self.format = format
        self.zipcode = zipcode
        self.additional_params = additional_params

    def filter_dataset_by_id(self, target_id):
        return [dataset for dataset in datasets if dataset["id"] == target_id]

    async def get_dataset_data_async(
        self,
        dataset_type: str,
        output_format: str,
        url: str,
        zipcode: Optional[str] = None,
        additional_params: Optional[Dict[str, Any]] = None,
        polling_interval: int = 1,
    ) -> str:
        """
        Asynchronously trigger and poll Bright Data dataset scraping.

        Args:
            dataset_type (str): Bright Data Dataset Type.
            output_format (str): Response format (json, ndjson, jsonl or csv).
            url (str): Target URL to scrape.
            zipcode (Optional[str]): Optional ZIP code for geo-specific data.
            additional_params (Optional[Dict]): Extra API parameters.
            polling_interval (int): Time interval in seconds between polling attempts.

        Returns:
            str: Dataset result from Bright Data in the requested output format.

        Raises:
            BrightDataDatasetToolException: If any API step fails or the job fails.
            ValueError: If the dataset type is unknown.
            TimeoutError: If polling times out before job completion.
        """
        request_data = {"url": url}
        if zipcode is not None:
            request_data["zipcode"] = zipcode

        # Set additional parameters dynamically depending upon the dataset that is being requested
        if additional_params:
            request_data.update(additional_params)

        api_key = os.getenv("BRIGHT_DATA_API_KEY")

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

        dataset_id = ""
        dataset = self.filter_dataset_by_id(dataset_type)

        if len(dataset) == 1:
            dataset_id = dataset[0]["dataset_id"]
        else:
            raise ValueError(
                f"Unable to find the dataset for {dataset_type}. Please make sure to pass a valid one"
            )

        async with aiohttp.ClientSession() as session:
            # Step 1: Trigger job
            async with session.post(
                f"{BRIGHTDATA_API_URL}/datasets/v3/trigger",
                params={"dataset_id": dataset_id, "include_errors": "true"},
                json=[request_data],
                headers=headers,
            ) as trigger_response:
                if trigger_response.status != 200:
                    raise BrightDataDatasetToolException(
                        f"Trigger failed: {await trigger_response.text()}",
                        trigger_response.status,
                    )
                trigger_data = await trigger_response.json()
                print(trigger_data)
                snapshot_id = trigger_data.get("snapshot_id")
                # Fail fast if the trigger response did not include a snapshot_id,
                # rather than polling an invalid snapshot until the timeout expires.
                if not snapshot_id:
                    raise BrightDataDatasetToolException(
                        f"Trigger response did not include a snapshot_id: {trigger_data}", 0
                    )

            # Step 2: Poll for completion
            elapsed = 0
            while elapsed < timeout:
                await asyncio.sleep(polling_interval)
                elapsed += polling_interval

                async with session.get(
                    f"{BRIGHTDATA_API_URL}/datasets/v3/progress/{snapshot_id}",
                    headers=headers,
                ) as status_response:
                    if status_response.status != 200:
                        raise BrightDataDatasetToolException(
                            f"Status check failed: {await status_response.text()}",
                            status_response.status,
                        )
                    status_data = await status_response.json()
                    if status_data.get("status") == "ready":
                        print("Job is ready")
                        break
                    elif status_data.get("status") == "error":
                        raise BrightDataDatasetToolException(
                            f"Job failed: {status_data}", 0
                        )
            else:
                raise TimeoutError("Polling timed out before job completed.")

            # Step 3: Retrieve result
            async with session.get(
                f"{BRIGHTDATA_API_URL}/datasets/v3/snapshot/{snapshot_id}",
                params={"format": output_format},
                headers=headers,
            ) as snapshot_response:
                if snapshot_response.status != 200:
                    raise BrightDataDatasetToolException(
                        f"Result fetch failed: {await snapshot_response.text()}",
                        snapshot_response.status,
                    )

                return await snapshot_response.text()

    def _run(
        self,
        url: Optional[str] = None,
        dataset_type: Optional[str] = None,
        format: Optional[str] = None,
        zipcode: Optional[str] = None,
        additional_params: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Any:
        dataset_type = dataset_type or self.dataset_type
        output_format = format or self.format
        url = url or self.url
        zipcode = zipcode or self.zipcode
        additional_params = additional_params or self.additional_params

        if not dataset_type:
            raise ValueError("dataset_type is required either in constructor or method call")
        if not url:
            raise ValueError("url is required either in constructor or method call")

        valid_output_formats = {"json", "ndjson", "jsonl", "csv"}
        if output_format not in valid_output_formats:
            raise ValueError(
                f"Unsupported output format: {output_format}. Must be one of {', '.join(valid_output_formats)}."
            )

        api_key = os.getenv("BRIGHT_DATA_API_KEY")
        if not api_key:
            raise ValueError("BRIGHT_DATA_API_KEY environment variable is required.")

        try:
            return asyncio.run(
                self.get_dataset_data_async(
                    dataset_type=dataset_type,
                    output_format=output_format,
                    url=url,
                    zipcode=zipcode,
                    additional_params=additional_params,
                )
            )
        except TimeoutError as e:
            return f"Timeout occurred in method get_dataset_data_async. Details - {str(e)}"
        except BrightDataDatasetToolException as e:
            return f"Exception occurred in method get_dataset_data_async. Details - {str(e)}"
        except Exception as e:
            return f"Bright Data API error: {str(e)}"
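

# A minimal usage sketch, not executed on import. It assumes a valid BRIGHT_DATA_API_KEY
# is set in the environment and relies on BaseTool.run() forwarding keyword arguments to
# _run(), per the standard CrewAI tool interface. The product URL is hypothetical; in a
# crew, the tool would normally be passed to an agent rather than called directly.
if __name__ == "__main__":
    tool = BrightDataDatasetTool()
    result = tool.run(
        dataset_type="amazon_product",
        url="https://www.amazon.com/dp/B0EXAMPLE",  # illustrative URL only
    )
    print(result)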