lint fixes

Gilbert Bagaoisan
2024-12-16 22:05:28 -08:00
parent a49be2fc52
commit cd37ede869

View File

@@ -1,60 +1,137 @@
import logging
from typing import Any, Dict, Literal, Optional, Type
from urllib.parse import urlparse

from crewai.tools import BaseTool
from pydantic import BaseModel, Field

logger = logging.getLogger(__file__)


class SpiderToolSchema(BaseModel):
    """Input schema for SpiderTool."""

    website_url: str = Field(
        ..., description="Mandatory website URL to scrape or crawl"
    )
    mode: Literal["scrape", "crawl"] = Field(
        default="scrape",
        description="The mode of the SpiderTool. The only two allowed modes are `scrape` or `crawl`. Crawl mode will follow up to 5 links and return their content in markdown format.",
    )


class SpiderTool(BaseTool):
    """Tool for scraping and crawling websites."""

    DEFAULT_CRAWL_LIMIT: int = 5
    DEFAULT_RETURN_FORMAT: str = "markdown"

    name: str = "SpiderTool"
    description: str = (
        "A tool to scrape or crawl a website and return LLM-ready content."
    )
    args_schema: Type[BaseModel] = SpiderToolSchema
    custom_params: Optional[Dict[str, Any]] = None
    website_url: Optional[str] = None
    api_key: Optional[str] = None
    spider: Any = None
    log_failures: bool = True
    def __init__(
        self,
        api_key: Optional[str] = None,
        website_url: Optional[str] = None,
        custom_params: Optional[Dict[str, Any]] = None,
        log_failures: bool = True,
        **kwargs,
    ):
        """Initialize SpiderTool for web scraping and crawling.

        Args:
            api_key (Optional[str]): Spider API key for authentication. Required for production use.
            website_url (Optional[str]): Default website URL to scrape/crawl. Can be overridden during execution.
            custom_params (Optional[Dict[str, Any]]): Additional parameters to pass to the Spider API.
                These override any parameters set by the LLM.
            log_failures (bool): If True, logs errors. Defaults to True.
            **kwargs: Additional arguments passed to BaseTool.

        Raises:
            ImportError: If the spider-client package is not installed.
            RuntimeError: If Spider client initialization fails.
        """
        super().__init__(**kwargs)
        if website_url is not None:
            self.website_url = website_url
        self.log_failures = log_failures
        self.custom_params = custom_params

        try:
            from spider import Spider  # type: ignore

            self.spider = Spider(api_key=api_key)
        except ImportError:
            raise ImportError(
                "`spider-client` package not found, please run `pip install spider-client`"
            )
        except Exception as e:
            raise RuntimeError(f"Failed to initialize Spider client: {str(e)}")

    def _validate_url(self, url: str) -> bool:
        """Validate URL format.

        Args:
            url (str): URL to validate.

        Returns:
            bool: True if the URL has both a scheme and a network location.
        """
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False
    def _run(
        self,
        website_url: str,
        mode: Literal["scrape", "crawl"] = "scrape",
    ) -> Optional[str]:
        params = {}
        url = website_url or self.website_url

        if not url:
            raise ValueError(
                "Website URL must be provided either during initialization or execution"
            )

        if not self._validate_url(url):
            raise ValueError("Invalid URL format")

        if mode not in ["scrape", "crawl"]:
            raise ValueError("Mode must be either 'scrape' or 'crawl'")

        params["request"] = "smart"
        params["filter_output_svg"] = True
        params["return_format"] = self.DEFAULT_RETURN_FORMAT

        if mode == "crawl":
            params["limit"] = self.DEFAULT_CRAWL_LIMIT

        # Update params with custom params if provided.
        # This will override any params passed by the LLM.
        if self.custom_params:
            params.update(self.custom_params)

        try:
            action = (
                self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
            )
            return action(url=url, params=params)
        except Exception as e:
            if self.log_failures:
                logger.error(f"Error fetching data from {url}, exception: {e}")
                return None
            raise e
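
For context, a minimal usage sketch of the tool after this change, wired into a crewAI agent. The environment variable name, the example URL, and the custom_params value shown here are placeholders for illustration and are not part of this commit; custom_params entries are forwarded to the Spider API and override anything the LLM sets.

import os

from crewai import Agent, Crew, Task
from crewai_tools import SpiderTool

# Placeholder configuration: assumes a Spider API key is available in the
# environment and that https://example.com is the site to work on.
spider_tool = SpiderTool(
    api_key=os.environ.get("SPIDER_API_KEY"),
    website_url="https://example.com",
    custom_params={"metadata": True},  # forwarded to the Spider API, overrides LLM-set params
    log_failures=True,
)

researcher = Agent(
    role="Web researcher",
    goal="Summarize the main topics of a website",
    backstory="An analyst who extracts key information from web pages.",
    tools=[spider_tool],
)

task = Task(
    description="Crawl the configured website and summarize its main topics.",
    expected_output="A short summary of the site's content.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())

The tool can also be invoked directly, e.g. spider_tool.run(website_url="https://example.com", mode="crawl"); in crawl mode the request is sent with return_format="markdown" and limit=5, and with log_failures=True a failed request is logged and None is returned instead of raising.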