spider tool
src/crewai_tools/tools/spider_crawl_tool/README.md (new file, 27 lines)
@@ -0,0 +1,27 @@

# SpiderTool
## Description
[Spider](https://spider.cloud/?ref=crewai) is the [fastest](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md#benchmark-results) open source scraper and crawler that returns LLM-ready data. It converts any website into pure HTML, markdown, metadata or text while enabling you to crawl with custom actions using AI.
## Installation
To use the Spider API you need to install the [Spider SDK](https://pypi.org/project/spider-client/) along with the crewai[tools] SDK:
```shell
pip install spider-client 'crewai[tools]'
```
## Example
This example shows how you can use the Spider tool to enable your agent to scrape and crawl websites. The data returned from the Spider API is already LLM-ready, so no cleaning is needed.
```python
from crewai_tools import SpiderTool
tool = SpiderTool()
```
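
Beyond instantiation, the tool is typically handed to an agent. The snippet below is a minimal sketch, assuming the standard `Agent`, `Task`, and `Crew` classes from the `crewai` package; the role, goal, backstory, and task wording are illustrative placeholders:

```python
from crewai import Agent, Task, Crew
from crewai_tools import SpiderTool

# Hypothetical agent that uses SpiderTool to gather page content.
searcher = Agent(
    role="Web Researcher",
    goal="Collect LLM-ready content from websites",
    backstory="An agent that scrapes and crawls sites on demand.",
    tools=[SpiderTool()],
)

research = Task(
    description="Scrape https://spider.cloud and summarize what the service offers.",
    expected_output="A short summary of the page content.",
    agent=searcher,
)

crew = Crew(agents=[searcher], tasks=[research])
result = crew.kickoff()
print(result)
```

The agent decides when to invoke the tool based on the task description, so the tool only needs to be listed in `tools`.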
## Arguments
- `api_key`: Optional. Specifies the Spider API key. If not specified, the tool looks for `SPIDER_API_KEY` in the environment variables.
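
For illustration, both ways of supplying the key are sketched below; this assumes the environment-variable fallback is handled by the underlying `spider` client, and the key values are placeholders:

```python
import os

from crewai_tools import SpiderTool

# Option 1: pass the key explicitly (placeholder value).
tool = SpiderTool(api_key="your-spider-api-key")

# Option 2: rely on the SPIDER_API_KEY environment variable fallback.
os.environ["SPIDER_API_KEY"] = "your-spider-api-key"
tool = SpiderTool()
```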
src/crewai_tools/tools/spider_crawl_tool/spider_tool.py (new file, 44 lines)
@@ -0,0 +1,44 @@

```python
from typing import Optional, Any, Type, Dict, Literal

from pydantic.v1 import BaseModel, Field

from crewai_tools.tools.base_tool import BaseTool


class SpiderToolSchema(BaseModel):
    url: str = Field(description="Website URL")
    params: Optional[Dict[str, Any]] = Field(
        default={"return_format": "markdown"},
        description="Specified params, see https://spider.cloud/docs/api for all available params",
    )
    mode: Optional[Literal["scrape", "crawl"]] = Field(
        default="scrape", description="Mode, either `scrape` or `crawl` the url"
    )


class SpiderTool(BaseTool):
    name: str = "Spider scrape & crawl tool"
    description: str = "Scrape & Crawl any url and return LLM-ready data."
    args_schema: Type[BaseModel] = SpiderToolSchema
    api_key: Optional[str] = None
    spider: Optional[Any] = None

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        try:
            from spider import Spider  # type: ignore
        except ImportError:
            raise ImportError(
                "`spider-client` package not found, please run `pip install spider-client`"
            )

        self.spider = Spider(api_key=api_key)

    def _run(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        mode: Optional[Literal["scrape", "crawl"]] = "scrape",
    ):
        if mode not in ("scrape", "crawl"):
            raise ValueError(
                "Unknown mode in `mode` parameter; allowed modes are `scrape` and `crawl`"
            )

        # Fall back to markdown output when no params are given.
        if params is None:
            params = {"return_format": "markdown"}

        # Dispatch to the matching Spider client call for the requested mode.
        action = (
            self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
        )
        return action(url=url, params=params)
```
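
For a quick smoke test outside a crew, the tool can be invoked directly through the `run` method it inherits from `BaseTool`. A minimal sketch, assuming `SPIDER_API_KEY` is set in the environment; the `limit` crawl parameter is taken from the Spider API docs and the URL is illustrative:

```python
from crewai_tools import SpiderTool

tool = SpiderTool()  # picks up SPIDER_API_KEY from the environment

# Scrape a single page as markdown (the schema's default return_format).
page = tool.run(url="https://spider.cloud", mode="scrape")

# Crawl a site; `limit` caps the number of pages fetched (assumed Spider API param).
pages = tool.run(
    url="https://spider.cloud",
    mode="crawl",
    params={"return_format": "markdown", "limit": 3},
)
print(page, pages)
```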