mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-29 18:18:13 +00:00
Merge pull request #45 from mazen-r/main
Add Scrapfly website scrape tool
@@ -21,6 +21,7 @@ from .tools import (
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
+    ScrapflyScrapeWebsiteTool,
     ScrapeWebsiteTool,
     SeleniumScrapingTool,
     SerperDevTool,
@@ -23,6 +23,7 @@ from .scrape_element_from_website.scrape_element_from_website import (
     ScrapeElementFromWebsiteTool,
 )
 from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
+from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
 from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
 from .serper_dev_tool.serper_dev_tool import SerperDevTool
 from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
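With these two import changes in place, the new tool is exposed at the package level. A minimal sanity check (assuming the package is installed from this branch) is simply:

```python
# The tool should now be importable directly from crewai_tools
from crewai_tools import ScrapflyScrapeWebsiteTool
```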
@@ -0,0 +1,57 @@

# ScrapflyScrapeWebsiteTool

## Description

[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It extracts web page data as LLM-accessible markdown or text.

## Setup and Installation

1. **Install the ScrapFly Python SDK**: The `scrapfly-sdk` Python package is required to use the ScrapFly web loader. Install it via pip with the following command:

```bash
pip install scrapfly-sdk
```

2. **API Key**: Register for free at [scrapfly.io/register](https://www.scrapfly.io/register/) to obtain your API key (see the sketch below for loading it from an environment variable).
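The tool takes the key as a constructor argument. As a minimal sketch, assuming you store the key in an environment variable named `SCRAPFLY_API_KEY` (a name chosen here for illustration, not one the tool reads automatically), you can avoid hard-coding it:

```python
import os

from crewai_tools import ScrapflyScrapeWebsiteTool

# SCRAPFLY_API_KEY is an illustrative variable name; the tool itself only
# accepts the key via the api_key constructor argument.
tool = ScrapflyScrapeWebsiteTool(api_key=os.environ["SCRAPFLY_API_KEY"])
```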
## Example Usage

Use the ScrapflyScrapeWebsiteTool as follows to retrieve web page data as text, markdown (LLM-accessible), or HTML:

```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True
)
```
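The example above calls `_run` directly. In practice the tool is usually handed to an agent; the following is a minimal sketch of that wiring, assuming a standard crewAI `Agent`/`Task`/`Crew` setup (the role, goal, and task text are illustrative and not part of this PR):

```python
from crewai import Agent, Task, Crew
from crewai_tools import ScrapflyScrapeWebsiteTool

scrape_tool = ScrapflyScrapeWebsiteTool(api_key="Your ScrapFly API key")

# Illustrative agent definition; the LLM backing the agent is configured elsewhere.
researcher = Agent(
    role="Web researcher",
    goal="Summarize product listings from a target page",
    backstory="You collect structured information from web pages.",
    tools=[scrape_tool],
)

task = Task(
    description="Scrape https://web-scraping.dev/products and summarize the products listed.",
    expected_output="A short bullet-point summary of the products.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
result = crew.kickoff()
```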
## Additional Arguments

The ScrapflyScrapeWebsiteTool also accepts a `scrape_config` dictionary of ScrapeConfig parameters for customizing the scrape request. See the [API params documentation](https://scrapfly.io/docs/scrape-api/getting-started) for the full list of features and their API params:

```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

scrapfly_scrape_config = {
    "asp": True,  # Bypass scraping blocking and anti-bot solutions, like Cloudflare
    "render_js": True,  # Enable JavaScript rendering with a cloud headless browser
    "proxy_pool": "public_residential_pool",  # Select a proxy pool (datacenter or residential)
    "country": "us",  # Select a proxy location
    "auto_scroll": True,  # Auto scroll the page
    "js": ""  # Execute custom JavaScript code by the headless browser
}

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True,
    scrape_config=scrapfly_scrape_config
)
```
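Note that both examples pass `ignore_scrape_failures=True`; in that mode the tool logs the error and returns `None` instead of raising, so callers should check the result before using it. A minimal sketch:

```python
result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True,
)

if result is None:
    # The scrape failed and was ignored; the error was logged by the tool.
    print("Scrape failed, no content returned")
else:
    print(result[:200])  # Preview the first 200 characters of the markdown
```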
@@ -0,0 +1,47 @@

import logging

from typing import Optional, Any, Type, Dict, Literal
from pydantic.v1 import BaseModel, Field
from crewai_tools.tools.base_tool import BaseTool

logger = logging.getLogger(__file__)


class ScrapflyScrapeWebsiteToolSchema(BaseModel):
    url: str = Field(description="Webpage URL")
    scrape_format: Optional[Literal["raw", "markdown", "text"]] = Field(default="markdown", description="Webpage extraction format")
    scrape_config: Optional[Dict[str, Any]] = Field(default=None, description="Scrapfly request scrape config")
    ignore_scrape_failures: Optional[bool] = Field(default=None, description="Whether to ignore scrape failures")


class ScrapflyScrapeWebsiteTool(BaseTool):
    name: str = "Scrapfly web scraping API tool"
    description: str = "Scrape a webpage url using Scrapfly and return its content as markdown or text"
    args_schema: Type[BaseModel] = ScrapflyScrapeWebsiteToolSchema
    api_key: Optional[str] = None
    scrapfly: Optional[Any] = None

    def __init__(self, api_key: str):
        super().__init__()
        # Import lazily so crewai_tools does not hard-depend on scrapfly-sdk
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        self.scrapfly = ScrapflyClient(key=api_key)

    def _run(
        self,
        url: str,
        scrape_format: str = "markdown",
        scrape_config: Optional[Dict[str, Any]] = None,
        ignore_scrape_failures: Optional[bool] = None,
    ):
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        scrape_config = scrape_config if scrape_config is not None else {}
        try:
            # Forward any extra ScrapeConfig parameters supplied via scrape_config
            response: ScrapeApiResponse = self.scrapfly.scrape(
                ScrapeConfig(url, format=scrape_format, **scrape_config)
            )
            return response.scrape_result["content"]
        except Exception as e:
            if ignore_scrape_failures:
                # Log and swallow the failure so a single bad URL does not abort the run
                logger.error(f"Error fetching data from {url}, exception: {e}")
                return None
            else:
                raise e
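As a quick check of the failure-handling branch above, here is a minimal sketch, assuming `scrapfly-sdk` and `crewai_tools` are installed; the mocked client, the dummy key, and the error message are illustrative:

```python
from unittest.mock import MagicMock

from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(api_key="test-key")  # illustrative key, never sent anywhere

# Replace the real ScrapFly client with a mock that always fails
tool.scrapfly = MagicMock()
tool.scrapfly.scrape.side_effect = RuntimeError("simulated scrape failure")

# With ignore_scrape_failures=True the tool logs the error and returns None...
assert tool._run(url="https://example.com", ignore_scrape_failures=True) is None

# ...and without it, the underlying exception propagates to the caller
try:
    tool._run(url="https://example.com")
except RuntimeError:
    print("exception propagated as expected")
```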