mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 16:48:30 +00:00
git-subtree-dir: packages/tools git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
208 lines
8.2 KiB
Python
208 lines
8.2 KiB
Python
import os
|
|
import urllib.parse
|
|
from typing import Any, Optional, Type
|
|
|
|
import requests
|
|
from crewai.tools import BaseTool
|
|
from pydantic import BaseModel, Field
|
|
|
|
class BrightDataConfig(BaseModel):
    """Connection settings for the Bright Data request API.

    Attributes:
        API_URL (str): Endpoint that receives the POST requests.
    """

    API_URL: str = "https://api.brightdata.com/request"

    @classmethod
    def from_env(cls):
        """Build a config, taking the endpoint from the environment when set.

        Reads the ``BRIGHTDATA_API_URL`` environment variable and falls back
        to the public Bright Data endpoint when the variable is absent.

        Returns:
            BrightDataConfig: A config instance carrying the resolved endpoint.
        """
        endpoint = os.getenv("BRIGHTDATA_API_URL", "https://api.brightdata.com/request")
        return cls(API_URL=endpoint)
|
|
|
|
class BrightDataSearchToolSchema(BaseModel):
    """
    Input arguments accepted by the Bright Data SERP search tool.

    Only ``query`` is required; every other argument has a default.

    Attributes:
        query (str): The search query to be executed (e.g., "latest AI news").
        search_engine (Optional[str]): Search engine to use ("google", "bing", "yandex"). Default is "google".
        country (Optional[str]): Two-letter country code for geo-targeting (e.g., "us", "gb"). Default is "us".
        language (Optional[str]): Language code for search results (e.g., "en", "es"). Default is "en".
        search_type (Optional[str]): Type of search, such as "isch" (images) or "nws" (news). Default is None.
        device_type (Optional[str]): Device type to simulate (e.g., "desktop", "mobile", "ios"). Default is "desktop".
        parse_results (Optional[bool]): True to request parsed JSON, False for raw HTML/text. Default is True.
    """

    # The first positional argument of Field() is the default value;
    # `...` marks the field as required.
    query: str = Field(..., description="Search query to perform")
    search_engine: Optional[str] = Field(
        "google", description="Search engine domain (e.g., 'google', 'bing', 'yandex')"
    )
    country: Optional[str] = Field(
        "us", description="Two-letter country code for geo-targeting (e.g., 'us', 'gb')"
    )
    language: Optional[str] = Field(
        "en", description="Language code (e.g., 'en', 'es') used in the query URL"
    )
    search_type: Optional[str] = Field(
        None, description="Type of search (e.g., 'isch' for images, 'nws' for news)"
    )
    device_type: Optional[str] = Field(
        "desktop", description="Device type to simulate (e.g., 'mobile', 'desktop', 'ios')"
    )
    parse_results: Optional[bool] = Field(
        True, description="Whether to parse and return JSON (True) or raw HTML/text (False)"
    )
|
|
|
|
|
|
class BrightDataSearchTool(BaseTool):
    """
    A web search tool that utilizes Bright Data's SERP API to perform queries and
    return the response body (parsed JSON text or raw page content) from search
    engines like Google, Bing or Yandex.

    Search options passed to the constructor are stored as defaults; any option
    passed to `_run` overrides the stored default for that call only.

    Attributes:
        name (str): Tool name used by the agent.
        description (str): A brief explanation of what the tool does.
        args_schema (Type[BaseModel]): Schema class for validating tool arguments.
        base_url (str): The Bright Data API endpoint used for making the POST request.
        api_key (str): Bright Data API key loaded from the environment variable 'BRIGHT_DATA_API_KEY'.
        zone (str): Zone identifier from Bright Data, loaded from the environment variable 'BRIGHT_DATA_ZONE'.
        query (Optional[str]): Default search query used when `_run` receives none.
        search_engine (str): Default search engine ("google", "bing" or "yandex").
        country (str): Default country code for geo-targeting.
        language (str): Default language code.
        search_type (Optional[str]): Default search type (e.g., "nws", "isch", "jobs").
        device_type (str): Default device type to simulate.
        parse_results (bool): Whether parsed JSON is requested by default.

    Raises:
        ValueError: If API key or zone environment variables are not set.
    """

    name: str = "Bright Data SERP Search"
    description: str = "Tool to perform web search using Bright Data SERP API."
    args_schema: Type[BaseModel] = BrightDataSearchToolSchema
    # Endpoint configuration, resolved from the environment once at class creation.
    _config = BrightDataConfig.from_env()
    base_url: str = ""
    api_key: str = ""
    zone: str = ""
    query: Optional[str] = None
    search_engine: str = "google"
    country: str = "us"
    language: str = "en"
    search_type: Optional[str] = None
    device_type: str = "desktop"
    parse_results: bool = True

    def __init__(
        self,
        query: Optional[str] = None,
        search_engine: str = "google",
        country: str = "us",
        language: str = "en",
        search_type: Optional[str] = None,
        device_type: str = "desktop",
        parse_results: bool = True,
    ):
        """
        Store the default search options and load credentials from the environment.

        Args:
            query (Optional[str]): Default query used when `_run` is called without one.
            search_engine (str): Default search engine (default: "google").
            country (str): Default country code (default: "us").
            language (str): Default language code (default: "en").
            search_type (Optional[str]): Default search type (default: None).
            device_type (str): Default device type (default: "desktop").
            parse_results (bool): Request parsed JSON by default (default: True).

        Raises:
            ValueError: If 'BRIGHT_DATA_API_KEY' or 'BRIGHT_DATA_ZONE' is unset or empty.
        """
        super().__init__()
        self.base_url = self._config.API_URL
        self.query = query
        self.search_engine = search_engine
        self.country = country
        self.language = language
        self.search_type = search_type
        self.device_type = device_type
        self.parse_results = parse_results

        # Validate the credentials before storing them so the `str` fields
        # are never assigned None.
        api_key = os.getenv("BRIGHT_DATA_API_KEY")
        zone = os.getenv("BRIGHT_DATA_ZONE")
        if not api_key:
            raise ValueError("BRIGHT_DATA_API_KEY environment variable is required.")
        if not zone:
            raise ValueError("BRIGHT_DATA_ZONE environment variable is required.")
        self.api_key = api_key
        self.zone = zone

    def get_search_url(self, engine: str, query: str) -> str:
        """
        Return the base search URL for the given engine and query.

        Args:
            engine (str): "yandex" or "bing"; any other value selects Google.
            query (str): Query text, inserted verbatim — the caller is
                responsible for URL-encoding it beforehand.

        Returns:
            str: Search URL ending in the query parameter.
        """
        # Python f-strings interpolate with `{...}`; a JavaScript-style `${...}`
        # would leave a literal "$" in front of the query.
        if engine == "yandex":
            return f"https://yandex.com/search/?text={query}"
        if engine == "bing":
            return f"https://www.bing.com/search?q={query}"
        return f"https://www.google.com/search?q={query}"

    def _run(
        self,
        query: Optional[str] = None,
        search_engine: Optional[str] = None,
        country: Optional[str] = None,
        language: Optional[str] = None,
        search_type: Optional[str] = None,
        device_type: Optional[str] = None,
        parse_results: Optional[bool] = None,
        **kwargs,
    ) -> Any:
        """
        Executes a search query using Bright Data SERP API and returns results.

        Every argument left as None falls back to the value stored on the tool
        instance.

        Args:
            query (str): The search query string (URL encoded internally).
            search_engine (str): The search engine to use (default: "google").
            country (str): Country code for geotargeting (default: "us").
            language (str): Language code for the query (default: "en").
            search_type (str): Optional type of search such as "nws", "isch", "jobs".
            device_type (str): Optional device type to simulate (e.g., "mobile", "ios", "desktop").
            parse_results (bool): If True, requests structured data; else raw page (default: True).
            **kwargs: May carry `results_count` (str or int), the number of
                search results to fetch (default: "10").

        Returns:
            str: The response body on success, otherwise a message describing
            the request failure.

        Raises:
            ValueError: If no query was given here or in the constructor.
        """
        query = query or self.query
        search_engine = search_engine or self.search_engine
        country = country or self.country
        language = language or self.language
        search_type = search_type or self.search_type
        device_type = device_type or self.device_type
        # `is not None` rather than `or`, so an explicit False is honoured.
        parse_results = parse_results if parse_results is not None else self.parse_results
        results_count = kwargs.get("results_count", "10")

        # Validate required parameters
        if not query:
            raise ValueError("query is required either in constructor or method call")

        # Build the search URL
        url = self.get_search_url(search_engine, urllib.parse.quote(query))

        # Add parameters to the URL
        # NOTE(review): the parameter names below follow Google's query syntax
        # and are appended for every engine — confirm they are honoured for
        # "bing" and "yandex".
        params = []
        if country:
            params.append(f"gl={country}")
        if language:
            params.append(f"hl={language}")
        if results_count:
            params.append(f"num={results_count}")
        if parse_results:
            params.append("brd_json=1")
        if search_type:
            if search_type == "jobs":
                params.append("ibp=htl;jobs")
            else:
                params.append(f"tbm={search_type}")
        # "desktop" (or any unrecognised device) adds no parameter.
        if device_type == "mobile":
            params.append("brd_mobile=1")
        elif device_type == "ios":
            params.append("brd_mobile=ios")
        elif device_type == "android":
            params.append("brd_mobile=android")

        # Combine parameters with the URL; the base URL already contains "?".
        if params:
            url += "&" + "&".join(params)

        # Set up the API request parameters, dropping any unset entry so it is
        # not sent as a JSON null.
        request_params = {"zone": self.zone, "url": url, "format": "raw"}
        request_params = {k: v for k, v in request_params.items() if v is not None}

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            # requests waits indefinitely unless a timeout is given; a timeout
            # raises a RequestException subclass, handled below.
            response = requests.post(
                self.base_url, json=request_params, headers=headers, timeout=60
            )

            print(f"Status code: {response.status_code}")
            response.raise_for_status()

            return response.text

        except requests.RequestException as e:
            return f"Error performing BrightData search: {str(e)}"
        except Exception as e:
            # Deliberate best-effort: the tool reports failures as text
            # instead of raising.
            return f"Error fetching results: {str(e)}"
|