mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 00:28:31 +00:00
Merge pull request #45 from mazen-r/main
Add Scrapfly website scrape tool
@@ -21,6 +21,7 @@ from .tools import (
    PGSearchTool,
    RagTool,
    ScrapeElementFromWebsiteTool,
    ScrapflyScrapeWebsiteTool,
    ScrapeWebsiteTool,
    SeleniumScrapingTool,
    SerperDevTool,
@@ -23,6 +23,7 @@ from .scrape_element_from_website.scrape_element_from_website import (
    ScrapeElementFromWebsiteTool,
)
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
from .serper_dev_tool.serper_dev_tool import SerperDevTool
from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
@@ -0,0 +1,57 @@
# ScrapflyScrapeWebsiteTool

## Description

[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It allows extracting web page data as LLM-accessible markdown or text.

## Setup and Installation

1. **Install the ScrapFly Python SDK**: The `scrapfly-sdk` Python package must be installed to use this tool. Install it via pip with the following command:

```bash
pip install scrapfly-sdk
```

2. **API Key**: Register for free at [scrapfly.io/register](https://www.scrapfly.io/register/) to obtain your API key; it can be passed to the tool directly or loaded from an environment variable, as sketched below.
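
A minimal sketch of reading the key from an environment variable. The variable name `SCRAPFLY_API_KEY` is only a convention assumed here; the tool does not read it automatically, so the value is passed in explicitly:

```python
import os

from crewai_tools import ScrapflyScrapeWebsiteTool

# Assumes SCRAPFLY_API_KEY was exported in your shell beforehand.
tool = ScrapflyScrapeWebsiteTool(api_key=os.environ["SCRAPFLY_API_KEY"])
```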

## Example Usage

Use the ScrapflyScrapeWebsiteTool as follows to retrieve web page data as text, markdown (LLM accessible), or HTML:

```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True
)
```

## Additional Arguments

The ScrapflyScrapeWebsiteTool also allows passing ScrapeConfig parameters as a dictionary to customize the scrape request. See the [API params documentation](https://scrapfly.io/docs/scrape-api/getting-started) for the full list of features and their API params:

```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

scrapfly_scrape_config = {
    "asp": True,  # Bypass scraping blocking solutions, like Cloudflare
    "render_js": True,  # Enable JavaScript rendering with a cloud headless browser
    "proxy_pool": "public_residential_pool",  # Select a proxy pool (datacenter or residential)
    "country": "us",  # Select a proxy location
    "auto_scroll": True,  # Auto scroll the page
    "js": ""  # Execute custom JavaScript code with the headless browser
}

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True,
    scrape_config=scrapfly_scrape_config
)
```
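
Beyond calling `_run` directly, the tool can be handed to a CrewAI agent like any other tool. A minimal sketch, assuming the usual `Agent` constructor fields (`role`, `goal`, `backstory`, `tools`); the role, goal, and backstory below are placeholders, not part of this PR:

```python
from crewai import Agent
from crewai_tools import ScrapflyScrapeWebsiteTool

scrape_tool = ScrapflyScrapeWebsiteTool(api_key="Your ScrapFly API key")

# Hypothetical agent definition for illustration only.
researcher = Agent(
    role="Web Researcher",
    goal="Collect and summarize product data from web pages",
    backstory="An analyst who gathers structured information from the web.",
    tools=[scrape_tool],
)
```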
@@ -0,0 +1,47 @@
import logging

from typing import Optional, Any, Type, Dict, Literal
from pydantic.v1 import BaseModel, Field
from crewai_tools.tools.base_tool import BaseTool

logger = logging.getLogger(__file__)


class ScrapflyScrapeWebsiteToolSchema(BaseModel):
    url: str = Field(description="Webpage URL")
    scrape_format: Optional[Literal["raw", "markdown", "text"]] = Field(default="markdown", description="Webpage extraction format")
    scrape_config: Optional[Dict[str, Any]] = Field(default=None, description="Scrapfly request scrape config")
    ignore_scrape_failures: Optional[bool] = Field(default=None, description="Whether to ignore failures")


class ScrapflyScrapeWebsiteTool(BaseTool):
    name: str = "Scrapfly web scraping API tool"
    description: str = "Scrape a webpage url using Scrapfly and return its content as markdown or text"
    args_schema: Type[BaseModel] = ScrapflyScrapeWebsiteToolSchema
    api_key: Optional[str] = None
    scrapfly: Optional[Any] = None

    def __init__(self, api_key: str):
        super().__init__()
        # Import lazily so the package can be used without the optional SDK installed.
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        self.scrapfly = ScrapflyClient(key=api_key)

    def _run(self, url: str, scrape_format: str = "markdown", scrape_config: Optional[Dict[str, Any]] = None, ignore_scrape_failures: Optional[bool] = None):
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        scrape_config = scrape_config if scrape_config is not None else {}
        try:
            # Forward any extra ScrapeConfig parameters supplied by the caller.
            response: ScrapeApiResponse = self.scrapfly.scrape(
                ScrapeConfig(url, format=scrape_format, **scrape_config)
            )
            return response.scrape_result["content"]
        except Exception as e:
            # Optionally swallow scrape errors and signal failure with None.
            if ignore_scrape_failures:
                logger.error(f"Error fetching data from {url}, exception: {e}")
                return None
            else:
                raise e
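
A quick illustration of how `ignore_scrape_failures` plays out for callers (the URL is only an example; whether a given scrape fails depends on the Scrapfly response):

```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(api_key="Your ScrapFly API key")

# With ignore_scrape_failures=True, a failed scrape is logged and None is returned.
content = tool._run(url="https://web-scraping.dev/products", ignore_scrape_failures=True)
if content is None:
    print("Scrape failed; skipping this page.")

# Without the flag (or with it falsy), the underlying exception propagates instead.
```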