From 4e1425665c673badcc0a233cca03733ec35656e4 Mon Sep 17 00:00:00 2001
From: WilliamEspegren
Date: Tue, 21 May 2024 11:48:52 +0200
Subject: [PATCH] spider tool

---
 .../tools/spider_crawl_tool/README.md         | 34 +++++++++++++++
 .../tools/spider_crawl_tool/spider_tool.py    | 42 ++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 src/crewai_tools/tools/spider_crawl_tool/README.md
 create mode 100644 src/crewai_tools/tools/spider_crawl_tool/spider_tool.py

diff --git a/src/crewai_tools/tools/spider_crawl_tool/README.md b/src/crewai_tools/tools/spider_crawl_tool/README.md
new file mode 100644
index 000000000..3207efcca
--- /dev/null
+++ b/src/crewai_tools/tools/spider_crawl_tool/README.md
@@ -0,0 +1,34 @@
+# SpiderTool
+
+## Description
+
+[Spider](https://spider.cloud/?ref=crewai) is the [fastest](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md#benchmark-results) open-source scraper and crawler that returns LLM-ready data. It converts any website into pure HTML, markdown, metadata, or text while enabling you to crawl with custom actions using AI.
+
+## Installation
+
+To use the Spider API you need to install the [Spider SDK](https://pypi.org/project/spider-client/) and the `crewai[tools]` SDK:
+
+```shell
+pip install spider-client 'crewai[tools]'
+```
+
+## Example
+
+This example shows how to use the Spider tool so your agent can scrape and crawl websites. The data returned from the Spider API is already LLM-ready, so no extra cleaning is needed.
+
+```python
+from crewai_tools import SpiderTool
+
+tool = SpiderTool()
+```
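+
+A minimal sketch of a direct call, assuming a valid `SPIDER_API_KEY` is set in the environment (the URL here is just a placeholder):
+
+```python
+# Scrape a single page as LLM-ready markdown; pass mode="crawl" to follow links instead.
+result = tool.run(url="https://spider.cloud", mode="scrape")
+```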
+
+## Arguments
+
+- `api_key`: Optional. Specifies the Spider API key. If not set, the tool looks for `SPIDER_API_KEY` in the environment variables.
diff --git a/src/crewai_tools/tools/spider_crawl_tool/spider_tool.py b/src/crewai_tools/tools/spider_crawl_tool/spider_tool.py
new file mode 100644
index 000000000..c924f6136
--- /dev/null
+++ b/src/crewai_tools/tools/spider_crawl_tool/spider_tool.py
@@ -0,0 +1,42 @@
+from typing import Optional, Any, Type, Dict, Literal
+from pydantic.v1 import BaseModel, Field
+from crewai_tools.tools.base_tool import BaseTool
+
+class SpiderToolSchema(BaseModel):
+    url: str = Field(description="Website URL")
+    params: Optional[Dict[str, Any]] = Field(default={"return_format": "markdown"}, description="Optional request params, see https://spider.cloud/docs/api for all available params")
+    mode: Optional[Literal["scrape", "crawl"]] = Field(default="scrape", description="Mode, either `scrape` or `crawl` the url")
+
+class SpiderTool(BaseTool):
+    name: str = "Spider scrape & crawl tool"
+    description: str = "Scrape & Crawl any url and return LLM-ready data."
+    args_schema: Type[BaseModel] = SpiderToolSchema
+    api_key: Optional[str] = None
+    spider: Optional[Any] = None
+
+    def __init__(self, api_key: Optional[str] = None, **kwargs):
+        super().__init__(**kwargs)
+        try:
+            from spider import Spider  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "`spider-client` package not found, please run `pip install spider-client`"
+            )
+
+        self.spider = Spider(api_key=api_key)
+
+    def _run(self, url: str, params: Optional[Dict[str, Any]] = None, mode: Optional[Literal["scrape", "crawl"]] = "scrape"):
+        if mode not in ("scrape", "crawl"):
+            raise ValueError(
+                "Unknown mode in `mode` parameter, allowed modes are `scrape` and `crawl`"
+            )
+
+        # Default to markdown, the most LLM-friendly return format.
+        if params is None:
+            params = {"return_format": "markdown"}
+
+        # `scrape_url` fetches a single page; `crawl_url` follows links from it.
+        action = (
+            self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
+        )
+        return action(url=url, params=params)