lint fixes
@@ -1,60 +1,137 @@

Previous version:

from typing import Any, Dict, Literal, Optional, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, Field


class SpiderToolSchema(BaseModel):
    url: str = Field(description="Website URL")
    params: Optional[Dict[str, Any]] = Field(
        description="Set additional params. Options include:\n"
        "- `limit`: Optional[int] - The maximum number of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages.\n"
        "- `depth`: Optional[int] - The crawl limit for maximum depth. If `0`, no limit will be applied.\n"
        "- `metadata`: Optional[bool] - Boolean to include metadata or not. Defaults to `False` unless set to `True`. If the user wants metadata, include params.metadata = True.\n"
        "- `query_selector`: Optional[str] - The CSS query selector to use when extracting content from the markup.\n"
    )
    mode: Literal["scrape", "crawl"] = Field(
        default="scrape",
        description="Mode, the only two allowed modes are `scrape` or `crawl`. Use `scrape` to scrape a single page and `crawl` to crawl the entire website following subpages. These modes are the only allowed values even when ANY params is set.",
    )


class SpiderTool(BaseTool):
    name: str = "Spider scrape & crawl tool"
    description: str = "Scrape & Crawl any url and return LLM-ready data."
    args_schema: Type[BaseModel] = SpiderToolSchema
    api_key: Optional[str] = None
    spider: Optional[Any] = None

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        try:
            from spider import Spider  # type: ignore
        except ImportError:
            raise ImportError(
                "`spider-client` package not found, please run `pip install spider-client`"
            )

        self.spider = Spider(api_key=api_key)

    def _run(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        mode: Optional[Literal["scrape", "crawl"]] = "scrape",
    ):
        if mode not in ["scrape", "crawl"]:
            raise ValueError(
                "Unknown mode in `mode` parameter, `scrape` or `crawl` are the allowed modes"
            )

        # Ensure 'return_format': 'markdown' is always included
        if params:
            params["return_format"] = "markdown"
        else:
            params = {"return_format": "markdown"}

        action = self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
        spider_docs = action(url=url, params=params)

        return spider_docs

Updated version:

import logging
from typing import Any, Dict, Literal, Optional, Type
from urllib.parse import urlparse

from crewai.tools import BaseTool
from pydantic import BaseModel, Field

logger = logging.getLogger(__file__)


class SpiderToolSchema(BaseModel):
    """Input schema for SpiderTool."""

    website_url: str = Field(
        ..., description="Mandatory website URL to scrape or crawl"
    )
    mode: Literal["scrape", "crawl"] = Field(
        default="scrape",
        description="The mode of the SpiderTool. The only two allowed modes are `scrape` or `crawl`. Crawl mode will follow up to 5 links and return their content in markdown format.",
    )


class SpiderTool(BaseTool):
    """Tool for scraping and crawling websites."""

    DEFAULT_CRAWL_LIMIT: int = 5
    DEFAULT_RETURN_FORMAT: str = "markdown"

    name: str = "SpiderTool"
    description: str = (
        "A tool to scrape or crawl a website and return LLM-ready content."
    )
    args_schema: Type[BaseModel] = SpiderToolSchema
    custom_params: Optional[Dict[str, Any]] = None
    website_url: Optional[str] = None
    api_key: Optional[str] = None
    spider: Any = None
    log_failures: bool = True

    def __init__(
        self,
        api_key: Optional[str] = None,
        website_url: Optional[str] = None,
        custom_params: Optional[Dict[str, Any]] = None,
        log_failures: bool = True,
        **kwargs,
    ):
        """Initialize SpiderTool for web scraping and crawling.

        Args:
            api_key (Optional[str]): Spider API key for authentication. Required for production use.
            website_url (Optional[str]): Default website URL to scrape/crawl. Can be overridden during execution.
            custom_params (Optional[Dict[str, Any]]): Additional parameters to pass to Spider API.
                These override any parameters set by the LLM.
            log_failures (bool): If True, logs errors. Defaults to True.
            **kwargs: Additional arguments passed to BaseTool.

        Raises:
            ImportError: If spider-client package is not installed.
            RuntimeError: If Spider client initialization fails.
        """
        super().__init__(**kwargs)
        if website_url is not None:
            self.website_url = website_url

        self.log_failures = log_failures
        self.custom_params = custom_params

        try:
            from spider import Spider  # type: ignore

            self.spider = Spider(api_key=api_key)
        except ImportError:
            raise ImportError(
                "`spider-client` package not found, please run `pip install spider-client`"
            )
        except Exception as e:
            raise RuntimeError(f"Failed to initialize Spider client: {str(e)}")

    def _validate_url(self, url: str) -> bool:
        """Validate URL format.

        Args:
            url (str): URL to validate.

        Returns:
            bool: True if valid URL.
        """
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    def _run(
        self,
        website_url: str,
        mode: Literal["scrape", "crawl"] = "scrape",
    ) -> str:
        params = {}
        url = website_url or self.website_url

        if not self._validate_url(url):
            raise ValueError("Invalid URL format")

        if not url:
            raise ValueError(
                "Website URL must be provided either during initialization or execution"
            )

        if mode not in ["scrape", "crawl"]:
            raise ValueError("Mode must be either 'scrape' or 'crawl'")

        params["request"] = "smart"
        params["filter_output_svg"] = True
        params["return_format"] = self.DEFAULT_RETURN_FORMAT

        if mode == "crawl":
            params["limit"] = self.DEFAULT_CRAWL_LIMIT

        # Update params with custom params if provided.
        # This will override any params passed by LLM.
        if self.custom_params:
            params.update(self.custom_params)

        try:
            action = (
                self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
            )
            return action(url=url, params=params)

        except Exception as e:
            if self.log_failures:
                logger.error(f"Error fetching data from {url}, exception: {e}")
                return None
            else:
                raise e
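The new SpiderToolSchema is what an agent's arguments are checked against before `_run` executes. Below is a minimal sketch using a trimmed copy of that schema (renamed to make clear it is a copy, not the shipped class), assuming only that pydantic is installed; it shows a mode outside `scrape`/`crawl` being rejected by validation.

from typing import Literal

from pydantic import BaseModel, Field, ValidationError


class SpiderToolInput(BaseModel):
    """Trimmed copy of SpiderToolSchema from the diff, for illustration only."""

    website_url: str = Field(..., description="Mandatory website URL to scrape or crawl")
    mode: Literal["scrape", "crawl"] = Field(default="scrape")


# Arguments an agent supplies are validated against the schema before _run is called.
print(SpiderToolInput(website_url="https://example.com", mode="crawl"))

try:
    SpiderToolInput(website_url="https://example.com", mode="spider")
except ValidationError as exc:
    print(f"rejected with {len(exc.errors())} validation error(s)")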
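The added `_validate_url` helper accepts a URL only when `urlparse` yields both a scheme and a network location. A standalone reproduction of that check, runnable with the standard library alone:

from urllib.parse import urlparse


def validate_url(url: str) -> bool:
    """Same check as SpiderTool._validate_url: require both a scheme and a netloc."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False


print(validate_url("https://example.com"))  # True: scheme and netloc present
print(validate_url("example.com"))          # False: no scheme, netloc is empty
print(validate_url("https://"))             # False: scheme but no netloc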
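Per the `__init__` docstring, `custom_params` override any parameters set by the LLM; `_run` achieves this by calling `params.update(self.custom_params)` after filling in its defaults (`request`, `filter_output_svg`, `return_format`, and `limit` in crawl mode). A dependency-free sketch of that merge, using hypothetical override values:

from typing import Any, Dict, Optional

DEFAULT_CRAWL_LIMIT = 5
DEFAULT_RETURN_FORMAT = "markdown"


def build_params(mode: str, custom_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Mirror the parameter assembly in SpiderTool._run."""
    params: Dict[str, Any] = {
        "request": "smart",
        "filter_output_svg": True,
        "return_format": DEFAULT_RETURN_FORMAT,
    }
    if mode == "crawl":
        params["limit"] = DEFAULT_CRAWL_LIMIT
    # custom_params are applied last, so they win over the defaults above.
    if custom_params:
        params.update(custom_params)
    return params


# Hypothetical override: raise the crawl limit and switch the return format.
print(build_params("crawl", {"limit": 20, "return_format": "raw"}))
# {'request': 'smart', 'filter_output_svg': True, 'return_format': 'raw', 'limit': 20}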
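Finally, a minimal usage sketch of the rewritten tool. The import path, the SPIDER_API_KEY environment variable name, the example URL, and the custom_params values are assumptions for illustration only (the diff shows the class body, not its module), and it presumes the crewai and spider-client packages are installed and that crewai's BaseTool exposes its usual run() wrapper.

import os

# Import path assumed for illustration; not shown in this diff.
from crewai_tools import SpiderTool

tool = SpiderTool(
    api_key=os.environ.get("SPIDER_API_KEY"),  # assumed environment variable name
    website_url="https://example.com",         # hypothetical default URL
    custom_params={"metadata": True},          # overrides whatever the LLM sets
    log_failures=True,
)

# BaseTool's run() validates the arguments against SpiderToolSchema
# and then dispatches to _run().
single_page = tool.run(website_url="https://example.com", mode="scrape")
crawled_pages = tool.run(website_url="https://example.com", mode="crawl")  # follows up to 5 links
print(single_page)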