Merge pull request #45 from mazen-r/main

Add Scrapfly website scrape tool
João Moura
2024-07-14 13:46:19 -07:00
committed by GitHub
4 changed files with 106 additions and 0 deletions


@@ -21,6 +21,7 @@ from .tools import (
    PGSearchTool,
    RagTool,
    ScrapeElementFromWebsiteTool,
    ScrapflyScrapeWebsiteTool,
    ScrapeWebsiteTool,
    SeleniumScrapingTool,
    SerperDevTool,


@@ -23,6 +23,7 @@ from .scrape_element_from_website.scrape_element_from_website import (
    ScrapeElementFromWebsiteTool,
)
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
from .serper_dev_tool.serper_dev_tool import SerperDevTool
from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool


@@ -0,0 +1,57 @@
# ScrapflyScrapeWebsiteTool
## Description
[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It extracts web page data as LLM-accessible markdown or text.
## Setup and Installation
1. **Install ScrapFly Python SDK**: The `scrapfly-sdk` Python package is required to use this tool. Install it via pip with the following command:
```bash
pip install scrapfly-sdk
```
2. **API Key**: Register for free from [scrapfly.io/register](https://www.scrapfly.io/register/) to obtain your API key.
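To keep the key out of source code, you can read it from an environment variable. A minimal sketch (the `SCRAPFLY_API_KEY` variable name is a convention chosen here; the tool only accepts the key through its constructor and does not read the environment itself):
```python
import os

from crewai_tools import ScrapflyScrapeWebsiteTool

# SCRAPFLY_API_KEY is an assumed variable name, not one the tool reads itself.
api_key = os.environ["SCRAPFLY_API_KEY"]
tool = ScrapflyScrapeWebsiteTool(api_key=api_key)
```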
## Example Usage
Use the ScrapflyScrapeWebsiteTool as follows to retrieve web page data as text, markdown (LLM accessible), or HTML (the `"raw"` scrape format):
```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True
)
```
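The call returns the page content as a single string in the requested format. With `ignore_scrape_failures=True`, a failed scrape is logged and `None` is returned instead of raising an exception.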
## Additional Arguments
The ScrapflyScrapeWebsiteTool also allows passing a `scrape_config` dictionary of ScrapeConfig parameters to customize the scrape request. See the [API params documentation](https://scrapfly.io/docs/scrape-api/getting-started) for the full feature details and their API params:
```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

scrapfly_scrape_config = {
    "asp": True,  # Bypass scraping blocking and anti-bot solutions, like Cloudflare
    "render_js": True,  # Enable JavaScript rendering with a cloud headless browser
    "proxy_pool": "public_residential_pool",  # Select a proxy pool (datacenter or residential)
    "country": "us",  # Select a proxy location
    "auto_scroll": True,  # Auto scroll the page
    "js": ""  # Execute custom JavaScript code by the headless browser
}

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True,
    scrape_config=scrapfly_scrape_config
)
```
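Since this is a crewAI tool, it can also be handed to an agent like any other tool. A minimal sketch, assuming the standard crewAI `Agent` constructor (the role, goal, and backstory values below are placeholders, not part of this tool):
```python
from crewai import Agent
from crewai_tools import ScrapflyScrapeWebsiteTool

scrape_tool = ScrapflyScrapeWebsiteTool(api_key="Your ScrapFly API key")

# Placeholder agent definition; substitute your own role, goal, and backstory.
researcher = Agent(
    role="Web Researcher",
    goal="Collect product data from target web pages",
    backstory="An analyst who gathers structured data from the web.",
    tools=[scrape_tool],
)
```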


@@ -0,0 +1,47 @@
import logging
from typing import Any, Dict, Literal, Optional, Type

from pydantic.v1 import BaseModel, Field

from crewai_tools.tools.base_tool import BaseTool

logger = logging.getLogger(__name__)


class ScrapflyScrapeWebsiteToolSchema(BaseModel):
    url: str = Field(description="Webpage URL")
    scrape_format: Optional[Literal["raw", "markdown", "text"]] = Field(
        default="markdown", description="Webpage extraction format"
    )
    scrape_config: Optional[Dict[str, Any]] = Field(
        default=None, description="Scrapfly request scrape config"
    )
    ignore_scrape_failures: Optional[bool] = Field(
        default=None, description="Whether to ignore scrape failures"
    )


class ScrapflyScrapeWebsiteTool(BaseTool):
    name: str = "Scrapfly web scraping API tool"
    description: str = (
        "Scrape a webpage URL using Scrapfly and return its content as markdown or text"
    )
    args_schema: Type[BaseModel] = ScrapflyScrapeWebsiteToolSchema
    api_key: Optional[str] = None
    scrapfly: Optional[Any] = None

    def __init__(self, api_key: str):
        super().__init__()
        # Import lazily so `scrapfly-sdk` stays an optional dependency.
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        self.scrapfly = ScrapflyClient(key=api_key)

    def _run(
        self,
        url: str,
        scrape_format: str = "markdown",
        scrape_config: Optional[Dict[str, Any]] = None,
        ignore_scrape_failures: Optional[bool] = None,
    ):
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        # Any extra ScrapeConfig parameters (asp, render_js, proxy_pool, ...)
        # are forwarded to the Scrapfly request as keyword arguments.
        scrape_config = scrape_config if scrape_config is not None else {}
        try:
            response: ScrapeApiResponse = self.scrapfly.scrape(
                ScrapeConfig(url, format=scrape_format, **scrape_config)
            )
            return response.scrape_result["content"]
        except Exception as e:
            if ignore_scrape_failures:
                # Log the failure and return None instead of propagating it.
                logger.error(f"Error fetching data from {url}, exception: {e}")
                return None
            else:
                raise e
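
For quick verification of the failure path, a minimal sketch (the URL is illustrative; the behavior follows the `_run` implementation above):
```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(api_key="Your ScrapFly API key")

# With ignore_scrape_failures=True, _run logs the error and returns None
# instead of raising, so a failed scrape can be handled inline.
content = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="text",
    ignore_scrape_failures=True,
)
if content is None:
    print("Scrape failed; see the logged error for details.")
```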