Merge pull request #45 from mazen-r/main

Add Scrapfly website scrape tool
João Moura
2024-07-14 13:46:19 -07:00
committed by GitHub
4 changed files with 106 additions and 0 deletions


@@ -21,6 +21,7 @@ from .tools import (
    PGSearchTool,
    RagTool,
    ScrapeElementFromWebsiteTool,
    ScrapflyScrapeWebsiteTool,
    ScrapeWebsiteTool,
    SeleniumScrapingTool,
    SerperDevTool,


@@ -23,6 +23,7 @@ from .scrape_element_from_website.scrape_element_from_website import (
    ScrapeElementFromWebsiteTool,
)
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
from .serper_dev_tool.serper_dev_tool import SerperDevTool
from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool


@@ -0,0 +1,57 @@
# ScrapflyScrapeWebsiteTool
## Description
[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It extracts web page data as LLM-accessible markdown or text.
## Setup and Installation
1. **Install ScrapFly Python SDK**: The `scrapfly-sdk` Python package is required to use this tool. Install it via pip with the following command:
```bash
pip install scrapfly-sdk
```
2. **API Key**: Register for free from [scrapfly.io/register](https://www.scrapfly.io/register/) to obtain your API key.
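To keep the key out of source code, you can read it from an environment variable. A minimal sketch (the `SCRAPFLY_API_KEY` variable name is a convention chosen here; the tool only accepts the key through its constructor and does not read the environment itself):
```python
import os

from crewai_tools import ScrapflyScrapeWebsiteTool

# SCRAPFLY_API_KEY is an assumed variable name, not one the tool reads itself.
api_key = os.environ["SCRAPFLY_API_KEY"]
tool = ScrapflyScrapeWebsiteTool(api_key=api_key)
```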
## Example Usage
Use the ScrapflyScrapeWebsiteTool as follows to retrieve web page data as text, markdown (LLM accessible), or HTML (the `"raw"` scrape format):
```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True
)
```
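The call returns the page content as a single string in the requested format. With `ignore_scrape_failures=True`, a failed scrape is logged and `None` is returned instead of raising an exception.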
## Additional Arguments
The ScrapflyScrapeWebsiteTool also allows passing a `scrape_config` dictionary of ScrapeConfig parameters to customize the scrape request. See the [API params documentation](https://scrapfly.io/docs/scrape-api/getting-started) for the full feature details and their API params:
```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

scrapfly_scrape_config = {
    "asp": True,  # Bypass scraping blocking and anti-bot solutions, like Cloudflare
    "render_js": True,  # Enable JavaScript rendering with a cloud headless browser
    "proxy_pool": "public_residential_pool",  # Select a proxy pool (datacenter or residential)
    "country": "us",  # Select a proxy location
    "auto_scroll": True,  # Auto scroll the page
    "js": ""  # Execute custom JavaScript code by the headless browser
}

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True,
    scrape_config=scrapfly_scrape_config
)
```
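Since this is a crewAI tool, it can also be handed to an agent like any other tool. A minimal sketch, assuming the standard crewAI `Agent` constructor (the role, goal, and backstory values below are placeholders, not part of this tool):
```python
from crewai import Agent
from crewai_tools import ScrapflyScrapeWebsiteTool

scrape_tool = ScrapflyScrapeWebsiteTool(api_key="Your ScrapFly API key")

# Placeholder agent definition; substitute your own role, goal, and backstory.
researcher = Agent(
    role="Web Researcher",
    goal="Collect product data from target web pages",
    backstory="An analyst who gathers structured data from the web.",
    tools=[scrape_tool],
)
```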


@@ -0,0 +1,47 @@
import logging
from typing import Any, Dict, Literal, Optional, Type

from pydantic.v1 import BaseModel, Field

from crewai_tools.tools.base_tool import BaseTool

logger = logging.getLogger(__name__)


class ScrapflyScrapeWebsiteToolSchema(BaseModel):
    url: str = Field(description="Webpage URL")
    scrape_format: Optional[Literal["raw", "markdown", "text"]] = Field(
        default="markdown", description="Webpage extraction format"
    )
    scrape_config: Optional[Dict[str, Any]] = Field(
        default=None, description="Scrapfly request scrape config"
    )
    ignore_scrape_failures: Optional[bool] = Field(
        default=None, description="Whether to ignore scrape failures"
    )


class ScrapflyScrapeWebsiteTool(BaseTool):
    name: str = "Scrapfly web scraping API tool"
    description: str = (
        "Scrape a webpage URL using Scrapfly and return its content as markdown or text"
    )
    args_schema: Type[BaseModel] = ScrapflyScrapeWebsiteToolSchema
    api_key: Optional[str] = None
    scrapfly: Optional[Any] = None

    def __init__(self, api_key: str):
        super().__init__()
        # Import lazily so `scrapfly-sdk` stays an optional dependency.
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        self.scrapfly = ScrapflyClient(key=api_key)

    def _run(
        self,
        url: str,
        scrape_format: str = "markdown",
        scrape_config: Optional[Dict[str, Any]] = None,
        ignore_scrape_failures: Optional[bool] = None,
    ):
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        # Any extra ScrapeConfig parameters (asp, render_js, proxy_pool, ...)
        # are forwarded to the Scrapfly request as keyword arguments.
        scrape_config = scrape_config if scrape_config is not None else {}
        try:
            response: ScrapeApiResponse = self.scrapfly.scrape(
                ScrapeConfig(url, format=scrape_format, **scrape_config)
            )
            return response.scrape_result["content"]
        except Exception as e:
            if ignore_scrape_failures:
                # Log the failure and return None instead of propagating it.
                logger.error(f"Error fetching data from {url}, exception: {e}")
                return None
            else:
                raise e
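
For quick verification of the failure path, a minimal sketch (the URL is illustrative; the behavior follows the `_run` implementation above):
```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(api_key="Your ScrapFly API key")

# With ignore_scrape_failures=True, _run logs the error and returns None
# instead of raising, so a failed scrape can be handled inline.
content = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="text",
    ignore_scrape_failures=True,
)
if content is None:
    print("Scrape failed; see the logged error for details.")
```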