Move off v1

This commit is contained in:
Brandon Hancock
2024-09-03 15:57:29 -04:00
parent d19bba72b0
commit 35fe222ca1
39 changed files with 752 additions and 550 deletions

View File

@@ -1,21 +1,25 @@
from typing import Optional, Any, Type, Dict, Literal
from pydantic.v1 import BaseModel, Field
from typing import Any, Dict, Literal, Optional, Type
from pydantic import BaseModel, Field
from crewai_tools.tools.base_tool import BaseTool
class SpiderToolSchema(BaseModel):
    # Input schema for the Spider tool: the target URL, an optional dict of
    # extra Spider parameters, and the operating mode.
    # Documented with comments rather than a class docstring, because pydantic
    # copies a model's docstring into the generated JSON schema.

    url: str = Field(description="Website URL")
    # Explicit ``default=None``: pydantic v2 no longer treats an ``Optional``
    # annotation as "not required", so without a default ``params`` would
    # become a mandatory argument after moving off ``pydantic.v1``.
    params: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Set additional params. Options include:\n"
        "- `limit`: Optional[int] - The maximum number of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages.\n"
        "- `depth`: Optional[int] - The crawl limit for maximum depth. If `0`, no limit will be applied.\n"
        "- `metadata`: Optional[bool] - Boolean to include metadata or not. Defaults to `False` unless set to `True`. If the user wants metadata, include params.metadata = True.\n"
        "- `query_selector`: Optional[str] - The CSS query selector to use when extracting content from the markup.\n",
    )
    # Only two modes are accepted; ``scrape`` is used when none is given.
    mode: Literal["scrape", "crawl"] = Field(
        default="scrape",
        description="Mode, the only two allowed modes are `scrape` or `crawl`. Use `scrape` to scrape a single page and `crawl` to crawl the entire website following subpages. These modes are the only allowed values even when ANY params is set.",
    )
class SpiderTool(BaseTool):
name: str = "Spider scrape & crawl tool"
description: str = "Scrape & Crawl any url and return LLM-ready data."
@@ -26,11 +30,11 @@ class SpiderTool(BaseTool):
def __init__(self, api_key: Optional[str] = None, **kwargs):
    """Initialise the tool and create the Spider client.

    Args:
        api_key: Value passed to ``Spider(api_key=...)``; ``None`` is
            forwarded unchanged.
        **kwargs: Forwarded to the parent initialiser.

    Raises:
        ImportError: If the ``spider`` module cannot be imported.
    """
    super().__init__(**kwargs)
    try:
        # Imported here rather than at module level, so the package is only
        # required when the tool is actually instantiated.
        from spider import Spider  # type: ignore
    except ImportError as err:
        # Chain explicitly so the original import failure stays attached
        # as the cause of the friendlier message.
        raise ImportError(
            "`spider-client` package not found, please run `pip install spider-client`"
        ) from err

    self.spider = Spider(api_key=api_key)
@@ -38,7 +42,7 @@ class SpiderTool(BaseTool):
self,
url: str,
params: Optional[Dict[str, Any]] = None,
mode: Optional[Literal["scrape", "crawl"]] = "scrape"
mode: Optional[Literal["scrape", "crawl"]] = "scrape",
):
if mode not in ["scrape", "crawl"]:
raise ValueError(
@@ -51,9 +55,7 @@ class SpiderTool(BaseTool):
else:
params = {"return_format": "markdown"}
action = (
self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
)
action = self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
spider_docs = action(url=url, params=params)
return spider_docs