mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-29 18:18:13 +00:00
Merge pull request #45 from mazen-r/main
Add Scrapfly website scrape tool
@@ -21,6 +21,7 @@ from .tools import (
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
+    ScrapflyScrapeWebsiteTool,
     ScrapeWebsiteTool,
     SeleniumScrapingTool,
     SerperDevTool,
@@ -23,6 +23,7 @@ from .scrape_element_from_website.scrape_element_from_website import (
     ScrapeElementFromWebsiteTool,
 )
 from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
+from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
 from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
 from .serper_dev_tool.serper_dev_tool import SerperDevTool
 from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
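With these two import changes in place, the new tool is exposed at the package level. A minimal sanity check (assuming the package is installed from this branch) is simply:

```python
# The tool should now be importable directly from crewai_tools
from crewai_tools import ScrapflyScrapeWebsiteTool
```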
@@ -0,0 +1,57 @@

# ScrapflyScrapeWebsiteTool

## Description

[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It extracts web page data as LLM-accessible markdown or text.

## Setup and Installation

1. **Install the ScrapFly Python SDK**: The `scrapfly-sdk` Python package is required to use the ScrapFly web loader. Install it via pip with the following command:

```bash
pip install scrapfly-sdk
```

2. **API Key**: Register for free at [scrapfly.io/register](https://www.scrapfly.io/register/) to obtain your API key (see the sketch below for loading it from an environment variable).
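The tool takes the key as a constructor argument. As a minimal sketch, assuming you store the key in an environment variable named `SCRAPFLY_API_KEY` (a name chosen here for illustration, not one the tool reads automatically), you can avoid hard-coding it:

```python
import os

from crewai_tools import ScrapflyScrapeWebsiteTool

# SCRAPFLY_API_KEY is an illustrative variable name; the tool itself only
# accepts the key via the api_key constructor argument.
tool = ScrapflyScrapeWebsiteTool(api_key=os.environ["SCRAPFLY_API_KEY"])
```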
## Example Usage

Use the ScrapflyScrapeWebsiteTool as follows to retrieve web page data as text, markdown (LLM-accessible), or HTML:

```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True
)
```
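The example above calls `_run` directly. In practice the tool is usually handed to an agent; the following is a minimal sketch of that wiring, assuming a standard crewAI `Agent`/`Task`/`Crew` setup (the role, goal, and task text are illustrative and not part of this PR):

```python
from crewai import Agent, Task, Crew
from crewai_tools import ScrapflyScrapeWebsiteTool

scrape_tool = ScrapflyScrapeWebsiteTool(api_key="Your ScrapFly API key")

# Illustrative agent definition; the LLM backing the agent is configured elsewhere.
researcher = Agent(
    role="Web researcher",
    goal="Summarize product listings from a target page",
    backstory="You collect structured information from web pages.",
    tools=[scrape_tool],
)

task = Task(
    description="Scrape https://web-scraping.dev/products and summarize the products listed.",
    expected_output="A short bullet-point summary of the products.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
result = crew.kickoff()
```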
## Additional Arguments

The ScrapflyScrapeWebsiteTool also accepts a `scrape_config` dictionary of ScrapeConfig parameters for customizing the scrape request. See the [API params documentation](https://scrapfly.io/docs/scrape-api/getting-started) for the full list of features and their API params:

```python
from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(
    api_key="Your ScrapFly API key"
)

scrapfly_scrape_config = {
    "asp": True,  # Bypass scraping blocking and anti-bot solutions, like Cloudflare
    "render_js": True,  # Enable JavaScript rendering with a cloud headless browser
    "proxy_pool": "public_residential_pool",  # Select a proxy pool (datacenter or residential)
    "country": "us",  # Select a proxy location
    "auto_scroll": True,  # Auto scroll the page
    "js": ""  # Execute custom JavaScript code by the headless browser
}

result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True,
    scrape_config=scrapfly_scrape_config
)
```
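Note that both examples pass `ignore_scrape_failures=True`; in that mode the tool logs the error and returns `None` instead of raising, so callers should check the result before using it. A minimal sketch:

```python
result = tool._run(
    url="https://web-scraping.dev/products",
    scrape_format="markdown",
    ignore_scrape_failures=True,
)

if result is None:
    # The scrape failed and was ignored; the error was logged by the tool.
    print("Scrape failed, no content returned")
else:
    print(result[:200])  # Preview the first 200 characters of the markdown
```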
@@ -0,0 +1,47 @@

import logging

from typing import Optional, Any, Type, Dict, Literal
from pydantic.v1 import BaseModel, Field
from crewai_tools.tools.base_tool import BaseTool

logger = logging.getLogger(__file__)


class ScrapflyScrapeWebsiteToolSchema(BaseModel):
    url: str = Field(description="Webpage URL")
    scrape_format: Optional[Literal["raw", "markdown", "text"]] = Field(default="markdown", description="Webpage extraction format")
    scrape_config: Optional[Dict[str, Any]] = Field(default=None, description="Scrapfly request scrape config")
    ignore_scrape_failures: Optional[bool] = Field(default=None, description="Whether to ignore scrape failures")


class ScrapflyScrapeWebsiteTool(BaseTool):
    name: str = "Scrapfly web scraping API tool"
    description: str = "Scrape a webpage url using Scrapfly and return its content as markdown or text"
    args_schema: Type[BaseModel] = ScrapflyScrapeWebsiteToolSchema
    api_key: Optional[str] = None
    scrapfly: Optional[Any] = None

    def __init__(self, api_key: str):
        super().__init__()
        # Import lazily so crewai_tools does not hard-depend on scrapfly-sdk
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        self.scrapfly = ScrapflyClient(key=api_key)

    def _run(
        self,
        url: str,
        scrape_format: str = "markdown",
        scrape_config: Optional[Dict[str, Any]] = None,
        ignore_scrape_failures: Optional[bool] = None,
    ):
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        scrape_config = scrape_config if scrape_config is not None else {}
        try:
            # Forward any extra ScrapeConfig parameters supplied via scrape_config
            response: ScrapeApiResponse = self.scrapfly.scrape(
                ScrapeConfig(url, format=scrape_format, **scrape_config)
            )
            return response.scrape_result["content"]
        except Exception as e:
            if ignore_scrape_failures:
                # Log and swallow the failure so a single bad URL does not abort the run
                logger.error(f"Error fetching data from {url}, exception: {e}")
                return None
            else:
                raise e
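As a quick check of the failure-handling branch above, here is a minimal sketch, assuming `scrapfly-sdk` and `crewai_tools` are installed; the mocked client, the dummy key, and the error message are illustrative:

```python
from unittest.mock import MagicMock

from crewai_tools import ScrapflyScrapeWebsiteTool

tool = ScrapflyScrapeWebsiteTool(api_key="test-key")  # illustrative key, never sent anywhere

# Replace the real ScrapFly client with a mock that always fails
tool.scrapfly = MagicMock()
tool.scrapfly.scrape.side_effect = RuntimeError("simulated scrape failure")

# With ignore_scrape_failures=True the tool logs the error and returns None...
assert tool._run(url="https://example.com", ignore_scrape_failures=True) is None

# ...and without it, the underlying exception propagates to the caller
try:
    tool._run(url="https://example.com")
except RuntimeError:
    print("exception propagated as expected")
```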