From 1a824cf432bbb9feab15ee12b27abeaaa8915e3e Mon Sep 17 00:00:00 2001
From: Nikhil Shahi
Date: Mon, 13 Jan 2025 15:48:45 -0600
Subject: [PATCH 1/2] added HyperbrowserLoadTool

---
 src/crewai_tools/__init__.py                       |  1 +
 src/crewai_tools/tools/__init__.py                 |  1 +
 .../tools/hyperbrowser_load_tool/README.md         | 42 ++++++++
 .../hyperbrowser_load_tool.py                      | 91 +++++++++++++++++
 4 files changed, 135 insertions(+)
 create mode 100644 src/crewai_tools/tools/hyperbrowser_load_tool/README.md
 create mode 100644 src/crewai_tools/tools/hyperbrowser_load_tool/hyperbrowser_load_tool.py

diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py
index 2db0fa05f..ca46c34d2 100644
--- a/src/crewai_tools/__init__.py
+++ b/src/crewai_tools/__init__.py
@@ -16,6 +16,7 @@ from .tools import (
     FirecrawlScrapeWebsiteTool,
     FirecrawlSearchTool,
     GithubSearchTool,
+    HyperbrowserLoadTool,
     JSONSearchTool,
     LinkupSearchTool,
     LlamaIndexTool,
diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py
index e4288a310..ac42857bc 100644
--- a/src/crewai_tools/tools/__init__.py
+++ b/src/crewai_tools/tools/__init__.py
@@ -19,6 +19,7 @@ from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
 )
 from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool
 from .github_search_tool.github_search_tool import GithubSearchTool
+from .hyperbrowser_load_tool.hyperbrowser_load_tool import HyperbrowserLoadTool
 from .json_search_tool.json_search_tool import JSONSearchTool
 from .linkup.linkup_search_tool import LinkupSearchTool
 from .llamaindex_tool.llamaindex_tool import LlamaIndexTool
diff --git a/src/crewai_tools/tools/hyperbrowser_load_tool/README.md b/src/crewai_tools/tools/hyperbrowser_load_tool/README.md
new file mode 100644
index 000000000..e95864f5a
--- /dev/null
+++ b/src/crewai_tools/tools/hyperbrowser_load_tool/README.md
@@ -0,0 +1,42 @@
+# HyperbrowserLoadTool
+
+## Description
+
+[Hyperbrowser](https://hyperbrowser.ai) is a platform for running and scaling headless browsers. It lets you launch and manage browser sessions at scale and provides easy to use solutions for any webscraping needs, such as scraping a single page or crawling an entire site.
+
+Key Features:
+- Instant Scalability - Spin up hundreds of browser sessions in seconds without infrastructure headaches
+- Simple Integration - Works seamlessly with popular tools like Puppeteer and Playwright
+- Powerful APIs - Easy to use APIs for scraping/crawling any site, and much more
+- Bypass Anti-Bot Measures - Built-in stealth mode, ad blocking, automatic CAPTCHA solving, and rotating proxies
+
+For more information about Hyperbrowser, please visit the [Hyperbrowser website](https://hyperbrowser.ai) or if you want to check out the docs, you can visit the [Hyperbrowser docs](https://docs.hyperbrowser.ai).
+
+## Installation
+
+- Head to [Hyperbrowser](https://app.hyperbrowser.ai/) to sign up and generate an API key. Once you've done this set the `HYPERBROWSER_API_KEY` environment variable or you can pass it to the `HyperbrowserLoadTool` constructor.
+- Install the [Hyperbrowser SDK](https://github.com/hyperbrowserai/python-sdk):
+
+```
+pip install hyperbrowser 'crewai[tools]'
+```
+
+## Example
+
+Utilize the HyperbrowserLoadTool as follows to allow your agent to load websites:
+
+```python
+from crewai_tools import HyperbrowserLoadTool
+
+tool = HyperbrowserLoadTool()
+```
+
+## Arguments
+
+`__init__` arguments:
+- `api_key`: Optional. Specifies Hyperbrowser API key. Defaults to the `HYPERBROWSER_API_KEY` environment variable.
+
+`run` arguments:
+- `url`: The base URL to start scraping or crawling from.
+- `operation`: Optional. Specifies the operation to perform on the website. Either 'scrape' or 'crawl'. Default is 'scrape'.
+- `params`: Optional. Specifies the params for the operation. For more information on the supported params, visit https://docs.hyperbrowser.ai/reference/sdks/python/scrape#start-scrape-job-and-wait or https://docs.hyperbrowser.ai/reference/sdks/python/crawl#start-crawl-job-and-wait.
diff --git a/src/crewai_tools/tools/hyperbrowser_load_tool/hyperbrowser_load_tool.py b/src/crewai_tools/tools/hyperbrowser_load_tool/hyperbrowser_load_tool.py
new file mode 100644
index 000000000..eb52b151c
--- /dev/null
+++ b/src/crewai_tools/tools/hyperbrowser_load_tool/hyperbrowser_load_tool.py
@@ -0,0 +1,91 @@
+import os
+from typing import Any, Optional, Type, Dict, Literal, Union
+
+from crewai.tools import BaseTool
+from pydantic import BaseModel, Field
+
+
+class HyperbrowserLoadToolSchema(BaseModel):
+    url: str = Field(description="Website URL")
+    operation: Literal['scrape', 'crawl'] = Field(default='scrape', description="Operation to perform on the website. Either 'scrape' or 'crawl'")
+    params: Optional[Dict] = Field(default=None, description="Optional params for scrape or crawl. For more information on the supported params, visit https://docs.hyperbrowser.ai/reference/sdks/python/scrape#start-scrape-job-and-wait or https://docs.hyperbrowser.ai/reference/sdks/python/crawl#start-crawl-job-and-wait")
+
+class HyperbrowserLoadTool(BaseTool):
+    name: str = "Hyperbrowser web load tool"
+    description: str = "Scrape or crawl a website using Hyperbrowser and return the contents in properly formatted markdown or html"
+    args_schema: Type[BaseModel] = HyperbrowserLoadToolSchema
+    api_key: Optional[str] = None
+    hyperbrowser: Optional[Any] = None
+
+    def __init__(self, api_key: Optional[str] = None, **kwargs):
+        super().__init__(**kwargs)
+        # Resolve the key from the argument first, then the environment; fail fast if neither is set.
+        self.api_key = api_key or os.getenv('HYPERBROWSER_API_KEY')
+        if not self.api_key:
+            raise ValueError("HYPERBROWSER_API_KEY is not set. Please provide it either via the constructor with the `api_key` argument or by setting the HYPERBROWSER_API_KEY environment variable.")
+
+        try:
+            from hyperbrowser import Hyperbrowser
+        except ImportError:
+            raise ImportError("`hyperbrowser` package not found, please run `pip install hyperbrowser`")
+
+        self.hyperbrowser = Hyperbrowser(api_key=self.api_key)
+
+    def _prepare_params(self, params: Dict) -> Dict:
+        """Prepare session and scrape options parameters."""
+        try:
+            from hyperbrowser.models.session import CreateSessionParams
+            from hyperbrowser.models.scrape import ScrapeOptions
+        except ImportError:
+            raise ImportError(
+                "`hyperbrowser` package not found, please run `pip install hyperbrowser`"
+            )
+
+        if "scrape_options" in params:
+            if "formats" in params["scrape_options"]:
+                formats = params["scrape_options"]["formats"]
+                if not all(fmt in ["markdown", "html"] for fmt in formats):
+                    raise ValueError("formats can only contain 'markdown' or 'html'")
+
+        if "session_options" in params:
+            params["session_options"] = CreateSessionParams(**params["session_options"])
+        if "scrape_options" in params:
+            params["scrape_options"] = ScrapeOptions(**params["scrape_options"])
+        return params
+
+    def _extract_content(self, data: Union[Any, None]):
+        """Extract content from response data."""
+        content = ""
+        if data:
+            content = data.markdown or data.html or ""
+        return content
+
+    def _run(self, url: str, operation: Literal['scrape', 'crawl'] = 'scrape', params: Optional[Dict] = None):
+        try:
+            from hyperbrowser.models.scrape import StartScrapeJobParams
+            from hyperbrowser.models.crawl import StartCrawlJobParams
+        except ImportError:
+            raise ImportError(
+                "`hyperbrowser` package not found, please run `pip install hyperbrowser`"
+            )
+
+        # Avoid a shared mutable default; normalize None to a fresh empty dict per call.
+        params = self._prepare_params(params or {})
+
+        if operation == 'scrape':
+            scrape_params = StartScrapeJobParams(url=url, **params)
+            scrape_resp = self.hyperbrowser.scrape.start_and_wait(scrape_params)
+            content = self._extract_content(scrape_resp.data)
+            return content
+        else:
+            crawl_params = StartCrawlJobParams(url=url, **params)
+            crawl_resp = self.hyperbrowser.crawl.start_and_wait(crawl_params)
+            content = ""
+            if crawl_resp.data:
+                for page in crawl_resp.data:
+                    page_content = self._extract_content(page)
+                    if page_content:
+                        content += (
+                            f"\n{'-'*50}\nUrl: {page.url}\nContent:\n{page_content}\n"
+                        )
+            return content

From e343f26c037f5557c0f31654bf053802b1b534f6 Mon Sep 17 00:00:00 2001
From: Nikhil Shahi
Date: Mon, 13 Jan 2025 16:08:11 -0600
Subject: [PATCH 2/2] add docstring

---
 .../hyperbrowser_load_tool/hyperbrowser_load_tool.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/crewai_tools/tools/hyperbrowser_load_tool/hyperbrowser_load_tool.py b/src/crewai_tools/tools/hyperbrowser_load_tool/hyperbrowser_load_tool.py
index eb52b151c..b802d1859 100644
--- a/src/crewai_tools/tools/hyperbrowser_load_tool/hyperbrowser_load_tool.py
+++ b/src/crewai_tools/tools/hyperbrowser_load_tool/hyperbrowser_load_tool.py
@@ -11,6 +11,15 @@ class HyperbrowserLoadToolSchema(BaseModel):
     params: Optional[Dict] = Field(default=None, description="Optional params for scrape or crawl. For more information on the supported params, visit https://docs.hyperbrowser.ai/reference/sdks/python/scrape#start-scrape-job-and-wait or https://docs.hyperbrowser.ai/reference/sdks/python/crawl#start-crawl-job-and-wait")
 
 class HyperbrowserLoadTool(BaseTool):
+    """HyperbrowserLoadTool.
+
+    Scrape or crawl web pages and load the contents with optional parameters for configuring content extraction.
+    Requires the `hyperbrowser` package.
+    Get your API Key from https://app.hyperbrowser.ai/
+
+    Args:
+        api_key: The Hyperbrowser API key, can be set as an environment variable `HYPERBROWSER_API_KEY` or passed directly
+    """
     name: str = "Hyperbrowser web load tool"
     description: str = "Scrape or crawl a website using Hyperbrowser and return the contents in properly formatted markdown or html"
     args_schema: Type[BaseModel] = HyperbrowserLoadToolSchema