spider tool

This commit is contained in:
WilliamEspegren
2024-05-21 11:48:52 +02:00
parent 53c7d815ae
commit 4e1425665c
2 changed files with 71 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
# SpiderTool
## Description
[Spider](https://spider.cloud/?ref=crewai) is the [fastest](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md#benchmark-results) open source scraper and crawler that returns LLM-ready data. It converts any website into pure HTML, markdown, metadata or text while enabling you to crawl with custom actions using AI.
## Installation
To use the Spider API you need to install the [Spider SDK](https://pypi.org/project/spider-client/) and the crewai[tools] SDK too:
```shell
pip install spider-client 'crewai[tools]'
```
## Example
This example shows you how you can use the Spider tool to enable your agent to scrape and crawl websites. The data returned from the Spider API is already LLM-ready, so no need to do any cleaning there.
```python
from crewai_tools import SpiderTool
tool = SpiderTool()
```
## Arguments
- `api_key`: Optional. Specifies Spider API key. If not specified it looks for `SPIDER_API_KEY` in environment variables.

View File

@@ -0,0 +1,44 @@
from typing import Optional, Any, Type, Dict, Literal
from pydantic.v1 import BaseModel, Field
from crewai_tools.tools.base_tool import BaseTool
class SpiderToolSchema(BaseModel):
    """Input schema for SpiderTool: the target URL plus optional request params and mode."""

    url: str = Field(description="Website URL")
    # Params are forwarded verbatim to the Spider API; markdown is the LLM-friendly default.
    params: Optional[Dict[str, Any]] = Field(
        default={"return_format": "markdown"},
        description="Specified Params, see https://spider.cloud/docs/api for all available params",
    )
    # BUG FIX: original wrote `defualt="scrape"` (typo), so the unknown kwarg was ignored
    # and the field's default was silently None instead of "scrape".
    mode: Optional[Literal["scrape", "crawl"]] = Field(
        default="scrape", description="Mode, either `scrape` or `crawl` the url"
    )
class SpiderTool(BaseTool):
    """Tool that scrapes or crawls a URL via the Spider API and returns LLM-ready data.

    Requires the `spider-client` package. The API key may be passed explicitly;
    otherwise the Spider SDK presumably falls back to the `SPIDER_API_KEY`
    environment variable — TODO confirm against the SDK.
    """

    name: str = "Spider scrape & crawl tool"
    description: str = "Scrape & Crawl any url and return LLM-ready data."
    args_schema: Type[BaseModel] = SpiderToolSchema
    api_key: Optional[str] = None  # Spider API key supplied at construction time
    spider: Optional[Any] = None  # lazily-imported Spider client instance

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        # Import lazily so the package is only required when the tool is used.
        try:
            from spider import Spider  # type: ignore
        except ImportError:
            raise ImportError(
                "`spider-client` package not found, please run `pip install spider-client`"
            )
        # BUG FIX: the declared `api_key` field was never assigned in the original.
        self.api_key = api_key
        self.spider = Spider(api_key=api_key)

    def _run(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        mode: Optional[Literal["scrape", "crawl"]] = "scrape",
    ):
        """Scrape (single page) or crawl (follow links) `url` via the Spider API.

        Args:
            url: Website URL to fetch.
            params: Spider API params; defaults to markdown output.
            mode: Either "scrape" or "crawl".

        Returns:
            The Spider API response (LLM-ready documents).

        Raises:
            ValueError: If `mode` is not "scrape" or "crawl".
        """
        if mode not in ("scrape", "crawl"):
            raise ValueError(
                "Unknown mode in `mode` parameter, `scrape` or `crawl` are the allowed modes"
            )
        if params is None:
            params = {"return_format": "markdown"}
        action = self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
        # BUG FIX: removed leftover debug `print(spider_docs)` from the original.
        return action(url=url, params=params)