spider tool

This commit is contained in:
WilliamEspegren
2024-05-21 11:48:52 +02:00
parent 53c7d815ae
commit 4e1425665c
2 changed files with 71 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
# SpiderTool
## Description
[Spider](https://spider.cloud/?ref=crewai) is the [fastest](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md#benchmark-results) open source scraper and crawler that returns LLM-ready data. It converts any website into pure HTML, markdown, metadata or text while enabling you to crawl with custom actions using AI.
## Installation
To use the Spider API you need to install the [Spider SDK](https://pypi.org/project/spider-client/) and the crewai[tools] SDK too:
```shell
pip install spider-client 'crewai[tools]'
```
## Example
This example shows you how you can use the Spider tool to enable your agent to scrape and crawl websites. The data returned from the Spider API is already LLM-ready, so no need to do any cleaning there.
```python
from crewai_tools import SpiderTool
tool = SpiderTool()
```
## Arguments
- `api_key`: Optional. Specifies Spider API key. If not specified it looks for `SPIDER_API_KEY` in environment variables.

View File

@@ -0,0 +1,44 @@
from typing import Optional, Any, Type, Dict, Literal
from pydantic.v1 import BaseModel, Field
from crewai_tools.tools.base_tool import BaseTool
class SpiderToolSchema(BaseModel):
    """Input schema for SpiderTool: the target URL plus optional request params and mode."""

    url: str = Field(description="Website URL")
    # Params are forwarded verbatim to the Spider API; markdown is the LLM-friendly default.
    params: Optional[Dict[str, Any]] = Field(
        default={"return_format": "markdown"},
        description="Specified Params, see https://spider.cloud/docs/api for all available params",
    )
    # BUG FIX: original wrote `defualt="scrape"` (typo), so the unknown kwarg was ignored
    # and the field's default was silently None instead of "scrape".
    mode: Optional[Literal["scrape", "crawl"]] = Field(
        default="scrape", description="Mode, either `scrape` or `crawl` the url"
    )
class SpiderTool(BaseTool):
    """Tool that scrapes or crawls a URL via the Spider API and returns LLM-ready data.

    Requires the `spider-client` package. The API key may be passed explicitly;
    otherwise the Spider SDK presumably falls back to the `SPIDER_API_KEY`
    environment variable — TODO confirm against the SDK.
    """

    name: str = "Spider scrape & crawl tool"
    description: str = "Scrape & Crawl any url and return LLM-ready data."
    args_schema: Type[BaseModel] = SpiderToolSchema
    api_key: Optional[str] = None  # Spider API key supplied at construction time
    spider: Optional[Any] = None  # lazily-imported Spider client instance

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        # Import lazily so the package is only required when the tool is used.
        try:
            from spider import Spider  # type: ignore
        except ImportError:
            raise ImportError(
                "`spider-client` package not found, please run `pip install spider-client`"
            )
        # BUG FIX: the declared `api_key` field was never assigned in the original.
        self.api_key = api_key
        self.spider = Spider(api_key=api_key)

    def _run(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        mode: Optional[Literal["scrape", "crawl"]] = "scrape",
    ):
        """Scrape (single page) or crawl (follow links) `url` via the Spider API.

        Args:
            url: Website URL to fetch.
            params: Spider API params; defaults to markdown output.
            mode: Either "scrape" or "crawl".

        Returns:
            The Spider API response (LLM-ready documents).

        Raises:
            ValueError: If `mode` is not "scrape" or "crawl".
        """
        if mode not in ("scrape", "crawl"):
            raise ValueError(
                "Unknown mode in `mode` parameter, `scrape` or `crawl` are the allowed modes"
            )
        if params is None:
            params = {"return_format": "markdown"}
        action = self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
        # BUG FIX: removed leftover debug `print(spider_docs)` from the original.
        return action(url=url, params=params)