spider tool
27 src/crewai_tools/tools/spider_crawl_tool/README.md (new file)
@@ -0,0 +1,27 @@
# SpiderTool

## Description

[Spider](https://spider.cloud/?ref=crewai) is the [fastest](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md#benchmark-results) open source scraper and crawler that returns LLM-ready data. It converts any website into pure HTML, markdown, metadata, or text, while letting you crawl with custom actions using AI.

## Installation

To use the Spider API you need to install the [Spider SDK](https://pypi.org/project/spider-client/) together with the `crewai[tools]` SDK:

```shell
pip install spider-client 'crewai[tools]'
```

## Example

This example shows how you can use the Spider tool to let your agent scrape and crawl websites. The data returned from the Spider API is already LLM-ready, so there is no need for any extra cleaning.

```python
from crewai_tools import SpiderTool

tool = SpiderTool()
```
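
You can also call the tool directly rather than through an agent. A minimal sketch could look like the following; `https://example.com` is a placeholder URL, and the `params` keys follow the [Spider API docs](https://spider.cloud/docs/api):

```python
from crewai_tools import SpiderTool

tool = SpiderTool()

# Scrape a single page; the tool defaults to markdown output.
page = tool.run(url="https://example.com", mode="scrape")

# Crawl the whole site instead; `limit` caps how many pages are visited.
site = tool.run(
    url="https://example.com",
    params={"return_format": "markdown", "limit": 5},
    mode="crawl",
)

print(page)
```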

## Arguments

- `api_key`: Optional. Specifies the Spider API key. If not specified, the tool looks for `SPIDER_API_KEY` in the environment variables.
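
As a quick sketch, both ways of supplying the key (the key string itself is a placeholder):

```python
import os

from crewai_tools import SpiderTool

# Option 1: rely on the environment variable picked up by the Spider client.
os.environ["SPIDER_API_KEY"] = "your-spider-api-key"  # placeholder
tool = SpiderTool()

# Option 2: pass the key explicitly.
tool = SpiderTool(api_key="your-spider-api-key")  # placeholder
```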
44 src/crewai_tools/tools/spider_crawl_tool/spider_tool.py (new file)
@@ -0,0 +1,44 @@
from typing import Optional, Any, Type, Dict, Literal
from pydantic.v1 import BaseModel, Field
from crewai_tools.tools.base_tool import BaseTool


class SpiderToolSchema(BaseModel):
    url: str = Field(description="Website URL")
    params: Optional[Dict[str, Any]] = Field(
        default={"return_format": "markdown"},
        description="Set of params, see https://spider.cloud/docs/api for all available params",
    )
    mode: Optional[Literal["scrape", "crawl"]] = Field(
        default="scrape",
        description="Mode, either `scrape` or `crawl` the url",
    )


class SpiderTool(BaseTool):
    name: str = "Spider scrape & crawl tool"
    description: str = "Scrape & Crawl any url and return LLM-ready data."
    args_schema: Type[BaseModel] = SpiderToolSchema
    api_key: Optional[str] = None
    spider: Optional[Any] = None

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        try:
            # Imported lazily so `spider-client` is only required when the tool is used.
            from spider import Spider  # type: ignore
        except ImportError:
            raise ImportError(
                "`spider-client` package not found, please run `pip install spider-client`"
            )

        # The Spider client falls back to the `SPIDER_API_KEY` environment
        # variable when `api_key` is None.
        self.spider = Spider(api_key=api_key)

    def _run(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        mode: Optional[Literal["scrape", "crawl"]] = "scrape",
    ):
        if mode != "scrape" and mode != "crawl":
            raise ValueError(
                "Unknown value in `mode` parameter, allowed modes are `scrape` and `crawl`"
            )

        # Default to markdown, the most LLM-friendly return format.
        if params is None:
            params = {"return_format": "markdown"}

        # Dispatch to a single-page scrape or a full crawl.
        action = (
            self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
        )
        spider_docs = action(url=url, params=params)
        return spider_docs
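
For context, here is a sketch of how this tool could be attached to a crewAI agent; the `Agent` field values below are illustrative and not part of this commit:

```python
from crewai import Agent
from crewai_tools import SpiderTool

researcher = Agent(
    role="Web Researcher",
    goal="Collect LLM-ready content from target websites",
    backstory="Scrapes and crawls sites using the Spider API.",
    tools=[SpiderTool()],
)
```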