Files
crewAI/crewai_tools/tools/brightdata_tool/brightdata_serp.py
Greyson Lalonde e16606672a Squashed 'packages/tools/' content from commit 78317b9c
git-subtree-dir: packages/tools
git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
2025-09-12 21:58:02 -04:00

208 lines
8.2 KiB
Python

import os
import urllib.parse
from typing import Any, Optional, Type
import requests
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class BrightDataConfig(BaseModel):
API_URL: str = "https://api.brightdata.com/request"
@classmethod
def from_env(cls):
return cls(
API_URL=os.environ.get("BRIGHTDATA_API_URL", "https://api.brightdata.com/request")
)
class BrightDataSearchToolSchema(BaseModel):
"""
Schema that defines the input arguments for the BrightDataSearchToolSchema.
Attributes:
query (str): The search query to be executed (e.g., "latest AI news").
search_engine (Optional[str]): The search engine to use ("google", "bing", "yandex"). Default is "google".
country (Optional[str]): Two-letter country code for geo-targeting (e.g., "us", "in"). Default is "us".
language (Optional[str]): Language code for search results (e.g., "en", "es"). Default is "en".
search_type (Optional[str]): Type of search, such as "isch" (images), "nws" (news), "jobs", etc.
device_type (Optional[str]): Device type to simulate ("desktop", "mobile", "ios", "android"). Default is "desktop".
parse_results (Optional[bool]): If True, results will be returned in structured JSON. If False, raw HTML. Default is True.
"""
query: str = Field(..., description="Search query to perform")
search_engine: Optional[str] = Field(
default="google",
description="Search engine domain (e.g., 'google', 'bing', 'yandex')",
)
country: Optional[str] = Field(
default="us",
description="Two-letter country code for geo-targeting (e.g., 'us', 'gb')",
)
language: Optional[str] = Field(
default="en",
description="Language code (e.g., 'en', 'es') used in the query URL",
)
search_type: Optional[str] = Field(
default=None,
description="Type of search (e.g., 'isch' for images, 'nws' for news)",
)
device_type: Optional[str] = Field(
default="desktop",
description="Device type to simulate (e.g., 'mobile', 'desktop', 'ios')",
)
parse_results: Optional[bool] = Field(
default=True,
description="Whether to parse and return JSON (True) or raw HTML/text (False)",
)
class BrightDataSearchTool(BaseTool):
"""
A web search tool that utilizes Bright Data's SERP API to perform queries and return either structured results
or raw page content from search engines like Google or Bing.
Attributes:
name (str): Tool name used by the agent.
description (str): A brief explanation of what the tool does.
args_schema (Type[BaseModel]): Schema class for validating tool arguments.
base_url (str): The Bright Data API endpoint used for making the POST request.
api_key (str): Bright Data API key loaded from the environment variable 'BRIGHT_DATA_API_KEY'.
zone (str): Zone identifier from Bright Data, loaded from the environment variable 'BRIGHT_DATA_ZONE'.
Raises:
ValueError: If API key or zone environment variables are not set.
"""
name: str = "Bright Data SERP Search"
description: str = "Tool to perform web search using Bright Data SERP API."
args_schema: Type[BaseModel] = BrightDataSearchToolSchema
_config = BrightDataConfig.from_env()
base_url: str = ""
api_key: str = ""
zone: str = ""
query: Optional[str] = None
search_engine: str = "google"
country: str = "us"
language: str = "en"
search_type: Optional[str] = None
device_type: str = "desktop"
parse_results: bool = True
def __init__(self, query: str = None, search_engine: str = "google", country: str = "us", language: str = "en", search_type: str = None, device_type: str = "desktop", parse_results: bool = True):
super().__init__()
self.base_url = self._config.API_URL
self.query = query
self.search_engine = search_engine
self.country = country
self.language = language
self.search_type = search_type
self.device_type = device_type
self.parse_results = parse_results
self.api_key = os.getenv("BRIGHT_DATA_API_KEY")
self.zone = os.getenv("BRIGHT_DATA_ZONE")
if not self.api_key:
raise ValueError("BRIGHT_DATA_API_KEY environment variable is required.")
if not self.zone:
raise ValueError("BRIGHT_DATA_ZONE environment variable is required.")
def get_search_url(self, engine: str, query: str):
if engine == "yandex":
return f"https://yandex.com/search/?text=${query}"
elif engine == "bing":
return f"https://www.bing.com/search?q=${query}"
return f"https://www.google.com/search?q=${query}"
def _run(self, query: str = None, search_engine: str = None, country: str = None, language: str = None, search_type: str = None, device_type: str = None, parse_results: bool = None, **kwargs) -> Any:
"""
Executes a search query using Bright Data SERP API and returns results.
Args:
query (str): The search query string (URL encoded internally).
search_engine (str): The search engine to use (default: "google").
country (str): Country code for geotargeting (default: "us").
language (str): Language code for the query (default: "en").
search_type (str): Optional type of search such as "nws", "isch", "jobs".
device_type (str): Optional device type to simulate (e.g., "mobile", "ios", "desktop").
parse_results (bool): If True, returns structured data; else raw page (default: True).
results_count (str or int): Number of search results to fetch (default: "10").
Returns:
dict or str: Parsed JSON data from Bright Data if available, otherwise error message.
"""
query = query or self.query
search_engine = search_engine or self.search_engine
country = country or self.country
language = language or self.language
search_type = search_type or self.search_type
device_type = device_type or self.device_type
parse_results = parse_results if parse_results is not None else self.parse_results
results_count = kwargs.get("results_count", "10")
# Validate required parameters
if not query:
raise ValueError("query is required either in constructor or method call")
# Build the search URL
query = urllib.parse.quote(query)
url = self.get_search_url(search_engine, query)
# Add parameters to the URL
params = []
if country:
params.append(f"gl={country}")
if language:
params.append(f"hl={language}")
if results_count:
params.append(f"num={results_count}")
if parse_results:
params.append(f"brd_json=1")
if search_type:
if search_type == "jobs":
params.append("ibp=htl;jobs")
else:
params.append(f"tbm={search_type}")
if device_type:
if device_type == "mobile":
params.append("brd_mobile=1")
elif device_type == "ios":
params.append("brd_mobile=ios")
elif device_type == "android":
params.append("brd_mobile=android")
# Combine parameters with the URL
if params:
url += "&" + "&".join(params)
# Set up the API request parameters
request_params = {"zone": self.zone, "url": url, "format": "raw"}
request_params = {k: v for k, v in request_params.items() if v is not None}
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
try:
response = requests.post(
self.base_url, json=request_params, headers=headers
)
print(f"Status code: {response.status_code}")
response.raise_for_status()
return response.text
except requests.RequestException as e:
return f"Error performing BrightData search: {str(e)}"
except Exception as e:
return f"Error fetching results: {str(e)}"