mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 08:08:32 +00:00
Merge pull request #183 from NikhilShahi/feature/add-hyperbrowser
added HyperbrowserLoadTool
This commit is contained in:
@@ -16,6 +16,7 @@ from .tools import (
|
|||||||
FirecrawlScrapeWebsiteTool,
|
FirecrawlScrapeWebsiteTool,
|
||||||
FirecrawlSearchTool,
|
FirecrawlSearchTool,
|
||||||
GithubSearchTool,
|
GithubSearchTool,
|
||||||
|
HyperbrowserLoadTool,
|
||||||
JSONSearchTool,
|
JSONSearchTool,
|
||||||
LinkupSearchTool,
|
LinkupSearchTool,
|
||||||
LlamaIndexTool,
|
LlamaIndexTool,
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
|
|||||||
)
|
)
|
||||||
from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool
|
from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool
|
||||||
from .github_search_tool.github_search_tool import GithubSearchTool
|
from .github_search_tool.github_search_tool import GithubSearchTool
|
||||||
|
from .hyperbrowser_load_tool.hyperbrowser_load_tool import HyperbrowserLoadTool
|
||||||
from .json_search_tool.json_search_tool import JSONSearchTool
|
from .json_search_tool.json_search_tool import JSONSearchTool
|
||||||
from .linkup.linkup_search_tool import LinkupSearchTool
|
from .linkup.linkup_search_tool import LinkupSearchTool
|
||||||
from .llamaindex_tool.llamaindex_tool import LlamaIndexTool
|
from .llamaindex_tool.llamaindex_tool import LlamaIndexTool
|
||||||
|
|||||||
42
src/crewai_tools/tools/hyperbrowser_load_tool/README.md
Normal file
42
src/crewai_tools/tools/hyperbrowser_load_tool/README.md
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
# HyperbrowserLoadTool
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
[Hyperbrowser](https://hyperbrowser.ai) is a platform for running and scaling headless browsers. It lets you launch and manage browser sessions at scale and provides easy to use solutions for any webscraping needs, such as scraping a single page or crawling an entire site.
|
||||||
|
|
||||||
|
Key Features:
|
||||||
|
- Instant Scalability - Spin up hundreds of browser sessions in seconds without infrastructure headaches
|
||||||
|
- Simple Integration - Works seamlessly with popular tools like Puppeteer and Playwright
|
||||||
|
- Powerful APIs - Easy to use APIs for scraping/crawling any site, and much more
|
||||||
|
- Bypass Anti-Bot Measures - Built-in stealth mode, ad blocking, automatic CAPTCHA solving, and rotating proxies
|
||||||
|
|
||||||
|
For more information about Hyperbrowser, please visit the [Hyperbrowser website](https://hyperbrowser.ai) or if you want to check out the docs, you can visit the [Hyperbrowser docs](https://docs.hyperbrowser.ai).
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
- Head to [Hyperbrowser](https://app.hyperbrowser.ai/) to sign up and generate an API key. Once you've done this, set the `HYPERBROWSER_API_KEY` environment variable or pass it to the `HyperbrowserLoadTool` constructor.
|
||||||
|
- Install the [Hyperbrowser SDK](https://github.com/hyperbrowserai/python-sdk):
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install hyperbrowser 'crewai[tools]'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
Utilize the HyperbrowserLoadTool as follows to allow your agent to load websites:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crewai_tools import HyperbrowserLoadTool
|
||||||
|
|
||||||
|
tool = HyperbrowserLoadTool()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Arguments
|
||||||
|
|
||||||
|
`__init__` arguments:
|
||||||
|
- `api_key`: Optional. Specifies Hyperbrowser API key. Defaults to the `HYPERBROWSER_API_KEY` environment variable.
|
||||||
|
|
||||||
|
`run` arguments:
|
||||||
|
- `url`: The base URL to start scraping or crawling from.
|
||||||
|
- `operation`: Optional. Specifies the operation to perform on the website. Either 'scrape' or 'crawl'. Default is 'scrape'.
|
||||||
|
- `params`: Optional. Specifies the params for the operation. For more information on the supported params, visit https://docs.hyperbrowser.ai/reference/sdks/python/scrape#start-scrape-job-and-wait or https://docs.hyperbrowser.ai/reference/sdks/python/crawl#start-crawl-job-and-wait.
|
||||||
@@ -0,0 +1,103 @@
|
|||||||
|
import os
|
||||||
|
from typing import Any, Optional, Type, Dict, Literal, Union
|
||||||
|
|
||||||
|
from crewai.tools import BaseTool
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class HyperbrowserLoadToolSchema(BaseModel):
    """Input schema for HyperbrowserLoadTool.

    Defaults mirror `HyperbrowserLoadTool._run`: only `url` is required;
    `operation` falls back to 'scrape' and `params` to None, matching the
    documented behavior (previously both fields lacked defaults and were
    therefore required by pydantic despite being documented as optional).
    """

    # The page to scrape or the base URL to start crawling from.
    url: str = Field(description="Website URL")
    # 'scrape' loads a single page; 'crawl' walks the whole site.
    operation: Literal["scrape", "crawl"] = Field(
        default="scrape",
        description="Operation to perform on the website. Either 'scrape' or 'crawl'",
    )
    # Passed through to the Hyperbrowser SDK job params; validated in
    # HyperbrowserLoadTool._prepare_params.
    params: Optional[Dict] = Field(
        default=None,
        description="Optional params for scrape or crawl. For more information on the supported params, visit https://docs.hyperbrowser.ai/reference/sdks/python/scrape#start-scrape-job-and-wait or https://docs.hyperbrowser.ai/reference/sdks/python/crawl#start-crawl-job-and-wait",
    )
|
||||||
|
|
||||||
|
class HyperbrowserLoadTool(BaseTool):
    """Scrape or crawl web pages via Hyperbrowser and return their contents.

    Loads page content as markdown or HTML, with optional parameters for
    configuring session behavior and content extraction.
    Requires the `hyperbrowser` package.
    Get your API key from https://app.hyperbrowser.ai/

    Args:
        api_key: The Hyperbrowser API key. Falls back to the
            `HYPERBROWSER_API_KEY` environment variable when not provided.

    Raises:
        ValueError: If no API key is given and the environment variable is unset.
        ImportError: If the `hyperbrowser` package is not installed.
    """

    name: str = "Hyperbrowser web load tool"
    description: str = "Scrape or crawl a website using Hyperbrowser and return the contents in properly formatted markdown or html"
    args_schema: Type[BaseModel] = HyperbrowserLoadToolSchema
    # Resolved API key (constructor argument or environment variable).
    api_key: Optional[str] = None
    # Lazily-typed Hyperbrowser client; Any because the SDK is an optional dependency.
    hyperbrowser: Optional[Any] = None

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        # BUGFIX: validate the *resolved* key (self.api_key), not the raw
        # constructor argument. The original checked `api_key`, which raised
        # even when HYPERBROWSER_API_KEY was set in the environment.
        self.api_key = api_key or os.getenv("HYPERBROWSER_API_KEY")
        if not self.api_key:
            raise ValueError(
                "`api_key` is required, please set the `HYPERBROWSER_API_KEY` environment variable or pass it directly"
            )

        # Import lazily so the package is only required when the tool is used.
        try:
            from hyperbrowser import Hyperbrowser
        except ImportError:
            raise ImportError(
                "`hyperbrowser` package not found, please run `pip install hyperbrowser`"
            )

        self.hyperbrowser = Hyperbrowser(api_key=self.api_key)

    def _prepare_params(self, params: Dict) -> Dict:
        """Prepare session and scrape options parameters.

        Validates the requested output formats and converts plain dicts into
        the SDK's typed parameter objects. Returns a new dict; the caller's
        dict is not mutated.
        """
        try:
            from hyperbrowser.models.scrape import ScrapeOptions
            from hyperbrowser.models.session import CreateSessionParams
        except ImportError:
            raise ImportError(
                "`hyperbrowser` package not found, please run `pip install hyperbrowser`"
            )

        # Copy so we never mutate the dict the caller handed us.
        params = dict(params)

        if "scrape_options" in params:
            # Only markdown and html extraction are supported by this tool.
            formats = params["scrape_options"].get("formats")
            if formats is not None and not all(
                fmt in ["markdown", "html"] for fmt in formats
            ):
                raise ValueError("formats can only contain 'markdown' or 'html'")

        if "session_options" in params:
            params["session_options"] = CreateSessionParams(**params["session_options"])
        if "scrape_options" in params:
            params["scrape_options"] = ScrapeOptions(**params["scrape_options"])
        return params

    def _extract_content(self, data: Union[Any, None]) -> str:
        """Extract content from response data, preferring markdown over HTML."""
        if not data:
            return ""
        return data.markdown or data.html or ""

    def _run(
        self,
        url: str,
        operation: Literal["scrape", "crawl"] = "scrape",
        params: Optional[Dict] = None,
    ):
        """Scrape a single page or crawl a site and return its content.

        Args:
            url: The base URL to start scraping or crawling from.
            operation: Either 'scrape' (single page) or 'crawl' (whole site).
            params: Optional operation params forwarded to the Hyperbrowser SDK.

        Returns:
            The extracted content as a string; crawl results concatenate each
            page's content separated by a header with the page URL.
        """
        try:
            from hyperbrowser.models.crawl import StartCrawlJobParams
            from hyperbrowser.models.scrape import StartScrapeJobParams
        except ImportError:
            raise ImportError(
                "`hyperbrowser` package not found, please run `pip install hyperbrowser`"
            )

        # BUGFIX: the original used a mutable default (`params={}`) that
        # _prepare_params mutated in place, leaking state across calls.
        params = self._prepare_params(params or {})

        if operation == "scrape":
            scrape_params = StartScrapeJobParams(url=url, **params)
            scrape_resp = self.hyperbrowser.scrape.start_and_wait(scrape_params)
            return self._extract_content(scrape_resp.data)

        crawl_params = StartCrawlJobParams(url=url, **params)
        crawl_resp = self.hyperbrowser.crawl.start_and_wait(crawl_params)
        content = ""
        if crawl_resp.data:
            for page in crawl_resp.data:
                page_content = self._extract_content(page)
                if page_content:
                    content += (
                        f"\n{'-'*50}\nUrl: {page.url}\nContent:\n{page_content}\n"
                    )
        return content
|
||||||
Reference in New Issue
Block a user