diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py
index d9b1fa753..a049cdc5b 100644
--- a/src/crewai_tools/__init__.py
+++ b/src/crewai_tools/__init__.py
@@ -21,6 +21,7 @@ from .tools import (
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
+    ScrapflyScrapeWebsiteTool,
     ScrapeWebsiteTool,
     SeleniumScrapingTool,
     SerperDevTool,
diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py
index 26f0e9d4b..8cf9d4499 100644
--- a/src/crewai_tools/tools/__init__.py
+++ b/src/crewai_tools/tools/__init__.py
@@ -23,6 +23,7 @@ from .scrape_element_from_website.scrape_element_from_website import (
     ScrapeElementFromWebsiteTool,
 )
 from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
+from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
 from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
 from .serper_dev_tool.serper_dev_tool import SerperDevTool
 from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
diff --git a/src/crewai_tools/tools/scrapfly_scrape_website_tool/README.md b/src/crewai_tools/tools/scrapfly_scrape_website_tool/README.md
new file mode 100644
index 000000000..6ab9c9d52
--- /dev/null
+++ b/src/crewai_tools/tools/scrapfly_scrape_website_tool/README.md
@@ -0,0 +1,57 @@
+# ScrapflyScrapeWebsiteTool
+
+## Description
+[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It extracts web page data as LLM-accessible markdown or text.
+
+## Setup and Installation
+1. **Install the ScrapFly Python SDK**: The `scrapfly-sdk` Python package must be installed to use the ScrapFly web loader. Install it via pip with the following command:
+
+   ```bash
+   pip install scrapfly-sdk
+   ```
+
+2. **API Key**: Register for free at [scrapfly.io/register](https://www.scrapfly.io/register/) to obtain your API key.
+
+## Example Usage
+Use the ScrapflyScrapeWebsiteTool as follows to retrieve web page data as text, markdown (LLM accessible), or HTML:
+
+```python
+from crewai_tools import ScrapflyScrapeWebsiteTool
+
+tool = ScrapflyScrapeWebsiteTool(
+    api_key="Your ScrapFly API key"
+)
+
+result = tool._run(
+    url="https://web-scraping.dev/products",
+    scrape_format="markdown",
+    ignore_scrape_failures=True
+)
+```
+
+## Additional Arguments
+The ScrapflyScrapeWebsiteTool also supports passing a ScrapeConfig object to customize the scrape request.
+See the [API params documentation](https://scrapfly.io/docs/scrape-api/getting-started) for the full feature details and their API params:
+```python
+from crewai_tools import ScrapflyScrapeWebsiteTool
+
+tool = ScrapflyScrapeWebsiteTool(
+    api_key="Your ScrapFly API key"
+)
+
+scrapfly_scrape_config = {
+    "asp": True,  # Bypass scraping blocking and antibot solutions, like Cloudflare
+    "render_js": True,  # Enable JavaScript rendering with a cloud headless browser
+    "proxy_pool": "public_residential_pool",  # Select a proxy pool (datacenter or residential)
+    "country": "us",  # Select a proxy location
+    "auto_scroll": True,  # Auto scroll the page
+    "js": ""  # Execute custom JavaScript code in the headless browser
+}
+
+result = tool._run(
+    url="https://web-scraping.dev/products",
+    scrape_format="markdown",
+    ignore_scrape_failures=True,
+    scrape_config=scrapfly_scrape_config
+)
+```
\ No newline at end of file
diff --git a/src/crewai_tools/tools/scrapfly_scrape_website_tool/scrapfly_scrape_website_tool.py b/src/crewai_tools/tools/scrapfly_scrape_website_tool/scrapfly_scrape_website_tool.py
new file mode 100644
index 000000000..b0bfa7ee6
--- /dev/null
+++ b/src/crewai_tools/tools/scrapfly_scrape_website_tool/scrapfly_scrape_website_tool.py
@@ -0,0 +1,46 @@
+import logging
+
+from typing import Optional, Any, Type, Dict, Literal
+from pydantic.v1 import BaseModel, Field
+from crewai_tools.tools.base_tool import BaseTool
+
+logger = logging.getLogger(__name__)
+
+class ScrapflyScrapeWebsiteToolSchema(BaseModel):
+    url: str = Field(description="Webpage URL")
+    scrape_format: Optional[Literal["raw", "markdown", "text"]] = Field(default="markdown", description="Webpage extraction format")
+    scrape_config: Optional[Dict[str, Any]] = Field(default=None, description="Scrapfly request scrape config")
+    ignore_scrape_failures: Optional[bool] = Field(default=None, description="Whether to ignore scrape failures")
+
+class ScrapflyScrapeWebsiteTool(BaseTool):
+    name: str = "Scrapfly web scraping API tool"
+    description: str = "Scrape a webpage URL using Scrapfly and return its content as markdown or text"
+    args_schema: Type[BaseModel] = ScrapflyScrapeWebsiteToolSchema
+    api_key: Optional[str] = None
+    scrapfly: Optional[Any] = None
+
+    def __init__(self, api_key: str):
+        super().__init__()
+        try:
+            from scrapfly import ScrapflyClient
+        except ImportError:
+            raise ImportError(
+                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
+            )
+        self.scrapfly = ScrapflyClient(key=api_key)
+
+    def _run(self, url: str, scrape_format: str = "markdown", scrape_config: Optional[Dict[str, Any]] = None, ignore_scrape_failures: Optional[bool] = None):
+        from scrapfly import ScrapeApiResponse, ScrapeConfig
+
+        scrape_config = scrape_config if scrape_config is not None else {}
+        try:
+            response: ScrapeApiResponse = self.scrapfly.scrape(
+                ScrapeConfig(url, format=scrape_format, **scrape_config)
+            )
+            return response.scrape_result["content"]
+        except Exception as e:
+            if ignore_scrape_failures:
+                logger.error(f"Error fetching data from {url}, exception: {e}")
+                return None
+            else:
+                raise e
\ No newline at end of file
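Beyond the README examples above, here is a minimal sketch of how the new tool could be wired into a CrewAI agent. The `Agent`/`Task`/`Crew` wiring follows the standard `crewai` API; the role, goal, backstory, and task strings are illustrative assumptions, and `YOUR-SCRAPFLY-API-KEY` is a placeholder:

```python
from crewai import Agent, Crew, Task
from crewai_tools import ScrapflyScrapeWebsiteTool

# Placeholder key; obtain a real one at scrapfly.io/register
scraper = ScrapflyScrapeWebsiteTool(api_key="YOUR-SCRAPFLY-API-KEY")

# Illustrative agent definition; only tools=[scraper] is specific to this PR
researcher = Agent(
    role="Web Researcher",
    goal="Collect and summarize product data from web pages",
    backstory="You condense scraped web content into short reports.",
    tools=[scraper],
)

# Illustrative task; the URL matches the README examples
task = Task(
    description="Scrape https://web-scraping.dev/products and summarize the products listed.",
    expected_output="A short bullet-point summary of the products on the page.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())
```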