From 73b803ddc3604efc5975de6863c737d80a8723aa Mon Sep 17 00:00:00 2001
From: Gilbert Bagaoisan
Date: Tue, 17 Dec 2024 20:53:17 -0800
Subject: [PATCH] various improvements for PR based on recommendations

---
 .../tools/spider_tool/spider_tool.py          | 133 +++++++++++++-----
 1 file changed, 99 insertions(+), 34 deletions(-)

diff --git a/src/crewai_tools/tools/spider_tool/spider_tool.py b/src/crewai_tools/tools/spider_tool/spider_tool.py
index 970ac8d64..40959612f 100644
--- a/src/crewai_tools/tools/spider_tool/spider_tool.py
+++ b/src/crewai_tools/tools/spider_tool/spider_tool.py
@@ -1,6 +1,6 @@
 import logging
 from typing import Any, Dict, Literal, Optional, Type
-from urllib.parse import urlparse
+from urllib.parse import unquote, urlparse
 
 from crewai.tools import BaseTool
 from pydantic import BaseModel, Field
@@ -20,12 +20,28 @@ class SpiderToolSchema(BaseModel):
     )
 
 
-class SpiderTool(BaseTool):
-    """Tool for scraping and crawling websites."""
+class SpiderToolConfig(BaseModel):
+    """Configuration settings for SpiderTool.
+
+    Contains all default values and constants used by SpiderTool.
+    Centralizes configuration management for easier maintenance.
+    """
 
+    # Crawling settings
     DEFAULT_CRAWL_LIMIT: int = 5
     DEFAULT_RETURN_FORMAT: str = "markdown"
 
+    # Request parameters
+    DEFAULT_REQUEST_MODE: str = "smart"
+    FILTER_SVG: bool = True
+
+
+class SpiderTool(BaseTool):
+    """Tool for scraping and crawling websites.
+
+    This tool provides functionality to either scrape a single webpage or crawl multiple
+    pages, returning content in a format suitable for LLM processing.
+    """
+
     name: str = "SpiderTool"
     description: str = (
         "A tool to scrape or crawl a website and return LLM-ready content."
@@ -36,6 +52,7 @@ class SpiderTool(BaseTool):
     api_key: Optional[str] = None
     spider: Any = None
     log_failures: bool = True
+    config: SpiderToolConfig = SpiderToolConfig()
 
     def __init__(
         self,
@@ -79,16 +96,26 @@ class SpiderTool(BaseTool):
             raise RuntimeError(f"Failed to initialize Spider client: {str(e)}")
 
     def _validate_url(self, url: str) -> bool:
-        """Validate URL format.
+        """Validate URL format and security constraints.
 
         Args:
-            url (str): URL to validate.
+            url (str): URL to validate. Must be a properly formatted HTTP(S) URL
+
         Returns:
-            bool: True if valid URL.
+            bool: True if URL is valid and meets security requirements, False otherwise.
         """
         try:
-            result = urlparse(url)
-            return all([result.scheme, result.netloc])
+            url = url.strip()
+            decoded_url = unquote(url)
+
+            result = urlparse(decoded_url)
+            if not all([result.scheme, result.netloc]):
+                return False
+
+            if result.scheme not in ["http", "https"]:
+                return False
+
+            return True
         except Exception:
             return False
@@ -96,42 +123,80 @@ class SpiderTool(BaseTool):
     def _run(
         self,
         website_url: str,
         mode: Literal["scrape", "crawl"] = "scrape",
-    ) -> str:
-        params = {}
-        url = website_url or self.website_url
+    ) -> Optional[str]:
+        """Execute the spider tool to scrape or crawl the specified website.
 
-        if not url:
-            raise ValueError(
-                "Website URL must be provided either during initialization or execution"
-            )
+        Args:
+            website_url (str): The URL to process. Must be a valid HTTP(S) URL.
+            mode (Literal["scrape", "crawl"]): Operation mode.
+                - "scrape": Extract content from single page
+                - "crawl": Follow links and extract content from multiple pages
 
-        if not self._validate_url(url):
-            raise ValueError("Invalid URL format")
+        Returns:
+            Optional[str]: Extracted content in markdown format, or None if extraction fails
+                and log_failures is True.
 
-        if mode not in ["scrape", "crawl"]:
-            raise ValueError("Mode must be either 'scrape' or 'crawl'")
-
-        params["request"] = "smart"
-        params["filter_output_svg"] = True
-        params["return_format"] = self.DEFAULT_RETURN_FORMAT
-
-        if mode == "crawl":
-            params["limit"] = self.DEFAULT_CRAWL_LIMIT
-
-        # Update params with custom params if provided.
-        # This will override any params passed by LLM.
-        if self.custom_params:
-            params.update(self.custom_params)
-
+        Raises:
+            ValueError: If URL is invalid or missing, or if mode is invalid.
+            ImportError: If spider-client package is not properly installed.
+            ConnectionError: If network connection fails while accessing the URL.
+            Exception: For other runtime errors.
+        """
         try:
+            params = {}
+            url = website_url or self.website_url
+
+            if not url:
+                raise ValueError(
+                    "Website URL must be provided either during initialization or execution"
+                )
+
+            if not self._validate_url(url):
+                raise ValueError(f"Invalid URL format: {url}")
+
+            if mode not in ["scrape", "crawl"]:
+                raise ValueError(
+                    f"Invalid mode: {mode}. Must be either 'scrape' or 'crawl'"
+                )
+
+            params = {
+                "request": self.config.DEFAULT_REQUEST_MODE,
+                "filter_output_svg": self.config.FILTER_SVG,
+                "return_format": self.config.DEFAULT_RETURN_FORMAT,
+            }
+
+            if mode == "crawl":
+                params["limit"] = self.config.DEFAULT_CRAWL_LIMIT
+
+            if self.custom_params:
+                params.update(self.custom_params)
+
             action = (
                 self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
             )
             return action(url=url, params=params)
 
+        except ValueError as ve:
+            if self.log_failures:
+                logger.error(f"Validation error for URL {url}: {str(ve)}")
+                return None
+            raise ve
+
+        except ImportError as ie:
+            logger.error(f"Spider client import error: {str(ie)}")
+            raise ie
+
+        except ConnectionError as ce:
+            if self.log_failures:
+                logger.error(f"Connection error while accessing {url}: {str(ce)}")
+                return None
+            raise ce
+
         except Exception as e:
             if self.log_failures:
-                logger.error(f"Error fetching data from {url}, exception: {e}")
+                logger.error(
+                    f"Unexpected error during {mode} operation on {url}: {str(e)}"
+                )
                 return None
-            else:
-                raise e
+            raise e
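
Reviewer note: a minimal usage sketch of the patched tool, not part of the
patch itself. It assumes the spider-client package is installed, that the
constructor accepts the api_key, custom_params, and log_failures keywords
implied by the fields in the diff, and that BaseTool.run() dispatches to
_run(); the API key and URLs below are placeholders.

    from crewai_tools import SpiderTool

    # With log_failures=True (the default), validation and connection
    # errors are logged and None is returned, matching the new except
    # blocks in _run().
    tool = SpiderTool(api_key="your-spider-api-key")

    # "scrape" extracts a single page as markdown.
    page = tool.run(website_url="https://example.com", mode="scrape")

    # "crawl" follows links from the start URL, visiting up to
    # SpiderToolConfig.DEFAULT_CRAWL_LIMIT (5) pages unless overridden.
    pages = tool.run(website_url="https://example.com", mode="crawl")

    # custom_params are merged last in _run(), so they override the
    # defaults taken from SpiderToolConfig.
    custom_tool = SpiderTool(
        api_key="your-spider-api-key",
        custom_params={"limit": 10, "return_format": "text"},
    )

Constructing the tool with log_failures=False restores strict behavior:
validation and connection errors are re-raised instead of returning None.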