mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-01 15:22:37 +00:00
Squashed 'packages/tools/' content from commit 78317b9c
git-subtree-dir: packages/tools git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
This commit is contained in:
122
crewai_tools/tools/brightdata_tool/brightdata_unlocker.py
Normal file
122
crewai_tools/tools/brightdata_tool/brightdata_unlocker.py
Normal file
@@ -0,0 +1,122 @@
|
||||
import os
|
||||
from typing import Any, Optional, Type
|
||||
|
||||
import requests
|
||||
from crewai.tools import BaseTool
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class BrightDataConfig(BaseModel):
    """Configuration holder for the Bright Data request API endpoint."""

    # Endpoint all Web Unlocker requests are POSTed to.
    API_URL: str = "https://api.brightdata.com/request"

    @classmethod
    def from_env(cls):
        """Build a config, letting BRIGHTDATA_API_URL override the default endpoint."""
        endpoint = os.environ.get(
            "BRIGHTDATA_API_URL", "https://api.brightdata.com/request"
        )
        return cls(API_URL=endpoint)
|
||||
|
||||
class BrightDataUnlockerToolSchema(BaseModel):
    """
    Pydantic schema for input parameters used by the BrightDataWebUnlockerTool.

    This schema defines the structure and validation for parameters passed when
    performing a web scraping request using Bright Data's Web Unlocker.

    Attributes:
        url (str): The target URL to scrape.
        format (Optional[str]): Format of the response returned by Bright Data.
            Defaults to 'raw'.
        data_format (Optional[str]): Response data format. Defaults to
            'markdown'; 'html' is the other supported option.
    """

    url: str = Field(..., description="URL to perform the web scraping")
    format: Optional[str] = Field(
        default="raw", description="Response format (raw is standard)"
    )
    # The default really is "markdown" — the previous description claimed
    # "html by default", contradicting the declared Field default.
    data_format: Optional[str] = Field(
        default="markdown",
        description="Response data format ('markdown' by default; 'html' also supported)",
    )
|
||||
|
||||
|
||||
class BrightDataWebUnlockerTool(BaseTool):
    """
    A tool for performing web scraping using the Bright Data Web Unlocker API.

    This tool allows automated and programmatic access to web pages by routing
    requests through Bright Data's unlocking and proxy infrastructure, which can
    bypass bot protection mechanisms like CAPTCHA, geo-restrictions, and
    anti-bot detection.

    Attributes:
        name (str): Name of the tool.
        description (str): Description of what the tool does.
        args_schema (Type[BaseModel]): Pydantic model schema for expected input arguments.
        base_url (str): Base URL of the Bright Data Web Unlocker API.
        api_key (str): Bright Data API key (must be set in the BRIGHT_DATA_API_KEY environment variable).
        zone (str): Bright Data zone identifier (must be set in the BRIGHT_DATA_ZONE environment variable).

    Methods:
        _run(url, format, data_format, **kwargs) -> Any:
            Sends a scraping request to Bright Data's Web Unlocker API and returns the result.
    """

    name: str = "Bright Data Web Unlocker Scraping"
    description: str = "Tool to perform web scraping using Bright Data Web Unlocker"
    args_schema: Type[BaseModel] = BrightDataUnlockerToolSchema
    _config = BrightDataConfig.from_env()
    base_url: str = ""
    api_key: str = ""
    zone: str = ""
    url: Optional[str] = None
    format: str = "raw"
    data_format: str = "markdown"

    def __init__(
        self,
        url: Optional[str] = None,
        format: str = "raw",
        data_format: str = "markdown",
        **kwargs: Any,
    ):
        """Initialize the tool, reading credentials from the environment.

        Args:
            url: Default URL to scrape when `_run` is called without one.
            format: Default response format sent to the API.
            data_format: Default data format ('markdown' or 'html').
            **kwargs: Forwarded to BaseTool so pydantic/tool configuration
                (e.g. result_as_answer) still works — previously dropped.

        Raises:
            ValueError: If BRIGHT_DATA_API_KEY or BRIGHT_DATA_ZONE is unset.
        """
        super().__init__(**kwargs)
        self.base_url = self._config.API_URL
        self.url = url
        self.format = format
        self.data_format = data_format

        self.api_key = os.getenv("BRIGHT_DATA_API_KEY")
        self.zone = os.getenv("BRIGHT_DATA_ZONE")
        if not self.api_key:
            raise ValueError("BRIGHT_DATA_API_KEY environment variable is required.")
        if not self.zone:
            raise ValueError("BRIGHT_DATA_ZONE environment variable is required.")

    def _run(
        self,
        url: str = None,
        format: str = None,
        data_format: str = None,
        **kwargs: Any,
    ) -> Any:
        """Send a scrape request to the Web Unlocker API and return the body text.

        Falls back to the constructor-supplied defaults for any argument not
        given. On HTTP failure, returns an error string rather than raising,
        so the agent can observe and react to the failure.
        """
        url = url or self.url
        format = format or self.format
        data_format = data_format or self.data_format

        if not url:
            raise ValueError("url is required either in constructor or method call")

        valid_data_formats = {"html", "markdown"}
        if data_format not in valid_data_formats:
            raise ValueError(
                f"Unsupported data format: {data_format}. Must be one of {', '.join(valid_data_formats)}."
            )

        payload = {
            "url": url,
            "zone": self.zone,
            "format": format,
        }
        # "html" is the API's implicit default, so only "markdown" is sent explicitly.
        if data_format == "markdown":
            payload["data_format"] = "markdown"

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            # Timeout prevents the agent from hanging forever on a stalled
            # connection (previously no timeout was set). Also removed the
            # debug print of the status code that polluted stdout.
            response = requests.post(
                self.base_url, json=payload, headers=headers, timeout=60
            )
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            # e.response may be None for connection-level errors; getattr guards that.
            return f"HTTP Error performing BrightData Web Unlocker Scrape: {e}\nResponse: {getattr(e.response, 'text', '')}"
        except Exception as e:
            return f"Error fetching results: {str(e)}"
|
||||
Reference in New Issue
Block a user