From cd37ede869b3032ba191644a9eda613d2624ae30 Mon Sep 17 00:00:00 2001 From: Gilbert Bagaoisan Date: Mon, 16 Dec 2024 22:05:28 -0800 Subject: [PATCH 1/5] lint fixes --- .../tools/spider_tool/spider_tool.py | 135 ++++++++++++++---- 1 file changed, 106 insertions(+), 29 deletions(-) diff --git a/src/crewai_tools/tools/spider_tool/spider_tool.py b/src/crewai_tools/tools/spider_tool/spider_tool.py index 94da9f6fe..74fee809d 100644 --- a/src/crewai_tools/tools/spider_tool/spider_tool.py +++ b/src/crewai_tools/tools/spider_tool/spider_tool.py @@ -1,60 +1,137 @@ +import logging from typing import Any, Dict, Literal, Optional, Type +from urllib.parse import urlparse from crewai.tools import BaseTool from pydantic import BaseModel, Field +logger = logging.getLogger(__file__) + class SpiderToolSchema(BaseModel): - url: str = Field(description="Website URL") - params: Optional[Dict[str, Any]] = Field( - description="Set additional params. Options include:\n" - "- `limit`: Optional[int] - The maximum number of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages.\n" - "- `depth`: Optional[int] - The crawl limit for maximum depth. If `0`, no limit will be applied.\n" - "- `metadata`: Optional[bool] - Boolean to include metadata or not. Defaults to `False` unless set to `True`. If the user wants metadata, include params.metadata = True.\n" - "- `query_selector`: Optional[str] - The CSS query selector to use when extracting content from the markup.\n" + """Input schema for SpiderTool.""" + + website_url: str = Field( + ..., description="Mandatory website URL to scrape or crawl" ) mode: Literal["scrape", "crawl"] = Field( default="scrape", - description="Mode, the only two allowed modes are `scrape` or `crawl`. Use `scrape` to scrape a single page and `crawl` to crawl the entire website following subpages. These modes are the only allowed values even when ANY params is set.", + description="The mode of the SpiderTool. The only two allowed modes are `scrape` or `crawl`. Crawl mode will follow up to 5 links and return their content in markdown format.", ) class SpiderTool(BaseTool): - name: str = "Spider scrape & crawl tool" - description: str = "Scrape & Crawl any url and return LLM-ready data." - args_schema: Type[BaseModel] = SpiderToolSchema - api_key: Optional[str] = None - spider: Optional[Any] = None + """Tool for scraping and crawling websites.""" + + DEFAULT_CRAWL_LIMIT: int = 5 + DEFAULT_RETURN_FORMAT: str = "markdown" + + name: str = "SpiderTool" + description: str = ( + "A tool to scrape or crawl a website and return LLM-ready content." + ) + args_schema: Type[BaseModel] = SpiderToolSchema + custom_params: Optional[Dict[str, Any]] = None + website_url: Optional[str] = None + api_key: Optional[str] = None + spider: Any = None + log_failures: bool = True + + def __init__( + self, + api_key: Optional[str] = None, + website_url: Optional[str] = None, + custom_params: Optional[Dict[str, Any]] = None, + log_failures: bool = True, + **kwargs, + ): + """Initialize SpiderTool for web scraping and crawling. + + Args: + api_key (Optional[str]): Spider API key for authentication. Required for production use. + website_url (Optional[str]): Default website URL to scrape/crawl. Can be overridden during execution. + custom_params (Optional[Dict[str, Any]]): Additional parameters to pass to Spider API. + These override any parameters set by the LLM. + log_failures (bool): If True, logs errors. Defaults to True. + **kwargs: Additional arguments passed to BaseTool. 
+ + Raises: + ImportError: If spider-client package is not installed. + RuntimeError: If Spider client initialization fails. + """ - def __init__(self, api_key: Optional[str] = None, **kwargs): super().__init__(**kwargs) + if website_url is not None: + self.website_url = website_url + + self.log_failures = log_failures + self.custom_params = custom_params + try: from spider import Spider # type: ignore + + self.spider = Spider(api_key=api_key) except ImportError: raise ImportError( "`spider-client` package not found, please run `pip install spider-client`" ) + except Exception as e: + raise RuntimeError(f"Failed to initialize Spider client: {str(e)}") - self.spider = Spider(api_key=api_key) + def _validate_url(self, url: str) -> bool: + """Validate URL format. + + Args: + url (str): URL to validate. + Returns: + bool: True if valid URL. + """ + try: + result = urlparse(url) + return all([result.scheme, result.netloc]) + except Exception: + return False def _run( self, - url: str, - params: Optional[Dict[str, Any]] = None, - mode: Optional[Literal["scrape", "crawl"]] = "scrape", - ): - if mode not in ["scrape", "crawl"]: + website_url: str, + mode: Literal["scrape", "crawl"] = "scrape", + ) -> str: + params = {} + url = website_url or self.website_url + + if not self._validate_url(url): + raise ValueError("Invalid URL format") + + if not url: raise ValueError( - "Unknown mode in `mode` parameter, `scrape` or `crawl` are the allowed modes" + "Website URL must be provided either during initialization or execution" ) - # Ensure 'return_format': 'markdown' is always included - if params: - params["return_format"] = "markdown" - else: - params = {"return_format": "markdown"} + if mode not in ["scrape", "crawl"]: + raise ValueError("Mode must be either 'scrape' or 'crawl'") - action = self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url - spider_docs = action(url=url, params=params) + params["request"] = "smart" + params["filter_output_svg"] = True + params["return_format"] = self.DEFAULT_RETURN_FORMAT - return spider_docs + if mode == "crawl": + params["limit"] = self.DEFAULT_CRAWL_LIMIT + + # Update params with custom params if provided. + # This will override any params passed by LLM. + if self.custom_params: + params.update(self.custom_params) + + try: + action = ( + self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url + ) + return action(url=url, params=params) + + except Exception as e: + if self.log_failures: + logger.error(f"Error fetching data from {url}, exception: {e}") + return None + else: + raise e From 4551b8c6251754e6c67832af63d705ef9eb43cb1 Mon Sep 17 00:00:00 2001 From: Gilbert Bagaoisan Date: Mon, 16 Dec 2024 22:05:46 -0800 Subject: [PATCH 2/5] Updated readme --- src/crewai_tools/tools/spider_tool/README.md | 72 +++++++------------- 1 file changed, 24 insertions(+), 48 deletions(-) diff --git a/src/crewai_tools/tools/spider_tool/README.md b/src/crewai_tools/tools/spider_tool/README.md index 563c07a04..c2dc8826a 100644 --- a/src/crewai_tools/tools/spider_tool/README.md +++ b/src/crewai_tools/tools/spider_tool/README.md @@ -1,81 +1,57 @@ # SpiderTool ## Description - -[Spider](https://spider.cloud/?ref=crewai) is the [fastest](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md#benchmark-results) open source scraper and crawler that returns LLM-ready data. It converts any website into pure HTML, markdown, metadata or text while enabling you to crawl with custom actions using AI. 
+[Spider](https://spider.cloud/?ref=crewai) is a high-performance web scraping and crawling tool that delivers optimized markdown for LLMs and AI agents. It intelligently switches between HTTP requests and JavaScript rendering based on page requirements. Perfect for both single-page scraping and website crawling—making it ideal for content extraction and data collection. ## Installation - -To use the Spider API you need to download the [Spider SDK](https://pypi.org/project/spider-client/) and the crewai[tools] SDK too: +To use the Spider API you need to download the [Spider SDK](https://pypi.org/project/spider-client/) and the crewai[tools] SDK, too: ```python pip install spider-client 'crewai[tools]' ``` ## Example - -This example shows you how you can use the Spider tool to enable your agent to scrape and crawl websites. The data returned from the Spider API is already LLM-ready, so no need to do any cleaning there. +This example shows you how you can use the Spider tool to enable your agent to scrape and crawl websites. The data returned from the Spider API is LLM-ready. ```python from crewai_tools import SpiderTool -def main(): - spider_tool = SpiderTool() - - searcher = Agent( - role="Web Research Expert", - goal="Find related information from specific URL's", - backstory="An expert web researcher that uses the web extremely well", - tools=[spider_tool], - verbose=True, - ) +# To enable scraping any website it finds during its execution +spider_tool = SpiderTool(api_key='YOUR_API_KEY') - return_metadata = Task( - description="Scrape https://spider.cloud with a limit of 1 and enable metadata", - expected_output="Metadata and 10 word summary of spider.cloud", - agent=searcher - ) +# Initialize the tool with the website URL, so the agent can only scrape the content of the specified website +spider_tool = SpiderTool(website_url='https://www.example.com') - crew = Crew( - agents=[searcher], - tasks=[ - return_metadata, - ], - verbose=2 - ) - - crew.kickoff() - -if __name__ == "__main__": - main() +# Pass in custom parameters, see below for more details +spider_tool = SpiderTool( + website_url='https://www.example.com', + custom_params={"depth": 2, "anti_bot": True, "proxy_enabled": True} +) ``` ## Arguments - `api_key` (string, optional): Specifies Spider API key. If not specified, it looks for `SPIDER_API_KEY` in environment variables. -- `params` (object, optional): Optional parameters for the request. Defaults to `{"return_format": "markdown"}` to return the website's content in a format that fits LLMs better. +- `website_url` (string): The website URL. Will be used as a fallback if passed when the tool is initialized. +- `log_failures` (bool): Log scrape failures or fail silently. Defaults to `true`. +- `custom_params` (object, optional): Optional parameters for the request. + - `return_format` (string): The return format of the website's content. Defaults to `markdown`. - `request` (string): The request type to perform. Possible values are `http`, `chrome`, and `smart`. Use `smart` to perform an HTTP request by default until JavaScript rendering is needed for the HTML. - `limit` (int): The maximum number of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. - `depth` (int): The crawl limit for maximum depth. If `0`, no limit will be applied. - - `cache` (bool): Use HTTP caching for the crawl to speed up repeated runs. Default is `true`. 
-  - `budget` (object): Object that has paths with a counter for limiting the amount of pages example `{"*":1}` for only crawling the root page.
   - `locale` (string): The locale to use for the request, for example `en-US`.
   - `cookies` (string): Add HTTP cookies to use for the request.
   - `stealth` (bool): Use stealth mode for headless Chrome requests to help prevent being blocked. The default is `true` on Chrome.
   - `headers` (object): Forward HTTP headers to use for all requests. The object is expected to be a map of key-value pairs.
-  - `metadata` (bool): Boolean to store metadata about the pages and content found. This could help improve AI interopt. Defaults to `false` unless you have the website already stored with the configuration enabled.
-  - `viewport` (object): Configure the viewport for chrome. Defaults to `800x600`.
-  - `encoding` (string): The type of encoding to use like `UTF-8`, `SHIFT_JIS`, or etc.
+  - `metadata` (bool): Boolean to store metadata about the pages and content found. Defaults to `false`.
   - `subdomains` (bool): Allow subdomains to be included. Default is `false`.
   - `user_agent` (string): Add a custom HTTP user agent to the request. By default this is set to a random agent.
-  - `store_data` (bool): Boolean to determine if storage should be used. If set this takes precedence over `storageless`. Defaults to `false`.
-  - `gpt_config` (object): Use AI to generate actions to perform during the crawl. You can pass an array for the `"prompt"` to chain steps.
-  - `fingerprint` (bool): Use advanced fingerprint for chrome.
-  - `storageless` (bool): Boolean to prevent storing any type of data for the request including storage and AI vectors embedding. Defaults to `false` unless you have the website already stored.
-  - `readability` (bool): Use [readability](https://github.com/mozilla/readability) to pre-process the content for reading. This may drastically improve the content for LLM usage.
   - `return_format` (string): The format to return the data in. Possible values are `markdown`, `raw`, `text`, and `html2text`. Use `raw` to return the default format of the page, such as raw HTML.
   - `proxy_enabled` (bool): Enable high-performance premium proxies for the request to prevent being blocked at the network level.
-  - `query_selector` (string): The CSS query selector to use when extracting content from the markup.
-  - `full_resources` (bool): Crawl and download all the resources for a website.
+  - `css_extraction_map` (object): Use CSS or XPath selectors to scrape content from the web page. Set the paths and the extraction object map to perform extractions per path or page.
   - `request_timeout` (int): The timeout to use for the request. Timeouts can range from `5` to `60` seconds. The default is `30` seconds.
-  - `run_in_background` (bool): Run the request in the background. Useful if storing data and wanting to trigger crawls to the dashboard. This has no effect if storageless is set.
+  - `return_headers` (bool): Return the HTTP response headers with the results. Defaults to `false`.
+  - `filter_output_main_only` (bool): Filter the nav, aside, and footer elements out of the output.
+  - `headers` (object): Forward HTTP headers to use for all requests. The object is expected to be a map of key-value pairs.
+ +Learn other parameters that can be used: [https://spider.cloud/docs/api](https://spider.cloud/docs/api) + From 3795d7dd8eca55d8311bc776ff00dcea916500fb Mon Sep 17 00:00:00 2001 From: Gilbert Bagaoisan Date: Mon, 16 Dec 2024 22:19:46 -0800 Subject: [PATCH 3/5] Reversed order of url validation --- src/crewai_tools/tools/spider_tool/spider_tool.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/crewai_tools/tools/spider_tool/spider_tool.py b/src/crewai_tools/tools/spider_tool/spider_tool.py index 74fee809d..970ac8d64 100644 --- a/src/crewai_tools/tools/spider_tool/spider_tool.py +++ b/src/crewai_tools/tools/spider_tool/spider_tool.py @@ -100,14 +100,14 @@ class SpiderTool(BaseTool): params = {} url = website_url or self.website_url - if not self._validate_url(url): - raise ValueError("Invalid URL format") - if not url: raise ValueError( "Website URL must be provided either during initialization or execution" ) + if not self._validate_url(url): + raise ValueError("Invalid URL format") + if mode not in ["scrape", "crawl"]: raise ValueError("Mode must be either 'scrape' or 'crawl'") From 73b803ddc3604efc5975de6863c737d80a8723aa Mon Sep 17 00:00:00 2001 From: Gilbert Bagaoisan Date: Tue, 17 Dec 2024 20:53:17 -0800 Subject: [PATCH 4/5] various improvements for PR based on recommendations --- .../tools/spider_tool/spider_tool.py | 133 +++++++++++++----- 1 file changed, 99 insertions(+), 34 deletions(-) diff --git a/src/crewai_tools/tools/spider_tool/spider_tool.py b/src/crewai_tools/tools/spider_tool/spider_tool.py index 970ac8d64..40959612f 100644 --- a/src/crewai_tools/tools/spider_tool/spider_tool.py +++ b/src/crewai_tools/tools/spider_tool/spider_tool.py @@ -1,6 +1,6 @@ import logging from typing import Any, Dict, Literal, Optional, Type -from urllib.parse import urlparse +from urllib.parse import unquote, urlparse from crewai.tools import BaseTool from pydantic import BaseModel, Field @@ -20,12 +20,28 @@ class SpiderToolSchema(BaseModel): ) -class SpiderTool(BaseTool): - """Tool for scraping and crawling websites.""" +class SpiderToolConfig(BaseModel): + """Configuration settings for SpiderTool. + Contains all default values and constants used by SpiderTool. + Centralizes configuration management for easier maintenance. + """ + + # Crawling settings DEFAULT_CRAWL_LIMIT: int = 5 DEFAULT_RETURN_FORMAT: str = "markdown" + # Request parameters + DEFAULT_REQUEST_MODE: str = "smart" + FILTER_SVG: bool = True + + +class SpiderTool(BaseTool): + """Tool for scraping and crawling websites. + This tool provides functionality to either scrape a single webpage or crawl multiple + pages, returning content in a format suitable for LLM processing. + """ + name: str = "SpiderTool" description: str = ( "A tool to scrape or crawl a website and return LLM-ready content." @@ -36,6 +52,7 @@ class SpiderTool(BaseTool): api_key: Optional[str] = None spider: Any = None log_failures: bool = True + config: SpiderToolConfig = SpiderToolConfig() def __init__( self, @@ -79,16 +96,26 @@ class SpiderTool(BaseTool): raise RuntimeError(f"Failed to initialize Spider client: {str(e)}") def _validate_url(self, url: str) -> bool: - """Validate URL format. + """Validate URL format and security constraints. Args: - url (str): URL to validate. + url (str): URL to validate. Must be a properly formatted HTTP(S) URL + Returns: - bool: True if valid URL. + bool: True if URL is valid and meets security requirements, False otherwise. 
""" try: - result = urlparse(url) - return all([result.scheme, result.netloc]) + url = url.strip() + decoded_url = unquote(url) + + result = urlparse(decoded_url) + if not all([result.scheme, result.netloc]): + return False + + if result.scheme not in ["http", "https"]: + return False + + return True except Exception: return False @@ -96,42 +123,80 @@ class SpiderTool(BaseTool): self, website_url: str, mode: Literal["scrape", "crawl"] = "scrape", - ) -> str: - params = {} - url = website_url or self.website_url + ) -> Optional[str]: + """Execute the spider tool to scrape or crawl the specified website. - if not url: - raise ValueError( - "Website URL must be provided either during initialization or execution" - ) + Args: + website_url (str): The URL to process. Must be a valid HTTP(S) URL. + mode (Literal["scrape", "crawl"]): Operation mode. + - "scrape": Extract content from single page + - "crawl": Follow links and extract content from multiple pages - if not self._validate_url(url): - raise ValueError("Invalid URL format") + Returns: + Optional[str]: Extracted content in markdown format, or None if extraction fails + and log_failures is True. - if mode not in ["scrape", "crawl"]: - raise ValueError("Mode must be either 'scrape' or 'crawl'") - - params["request"] = "smart" - params["filter_output_svg"] = True - params["return_format"] = self.DEFAULT_RETURN_FORMAT - - if mode == "crawl": - params["limit"] = self.DEFAULT_CRAWL_LIMIT - - # Update params with custom params if provided. - # This will override any params passed by LLM. - if self.custom_params: - params.update(self.custom_params) + Raises: + ValueError: If URL is invalid or missing, or if mode is invalid. + ImportError: If spider-client package is not properly installed. + ConnectionError: If network connection fails while accessing the URL. + Exception: For other runtime errors. + """ try: + params = {} + url = website_url or self.website_url + + if not url: + raise ValueError( + "Website URL must be provided either during initialization or execution" + ) + + if not self._validate_url(url): + raise ValueError(f"Invalid URL format: {url}") + + if mode not in ["scrape", "crawl"]: + raise ValueError( + f"Invalid mode: {mode}. 
Must be either 'scrape' or 'crawl'"
+                )
+
+            params = {
+                "request": self.config.DEFAULT_REQUEST_MODE,
+                "filter_output_svg": self.config.FILTER_SVG,
+                "return_format": self.config.DEFAULT_RETURN_FORMAT,
+            }
+
+            if mode == "crawl":
+                params["limit"] = self.config.DEFAULT_CRAWL_LIMIT
+
+            if self.custom_params:
+                params.update(self.custom_params)
+
             action = (
                 self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
             )
             return action(url=url, params=params)
 
+        except ValueError as ve:
+            if self.log_failures:
+                logger.error(f"Validation error for URL {url}: {str(ve)}")
+                return None
+            raise ve
+
+        except ImportError as ie:
+            logger.error(f"Spider client import error: {str(ie)}")
+            raise ie
+
+        except ConnectionError as ce:
+            if self.log_failures:
+                logger.error(f"Connection error while accessing {url}: {str(ce)}")
+                return None
+            raise ce
+
         except Exception as e:
             if self.log_failures:
-                logger.error(f"Error fetching data from {url}, exception: {e}")
+                logger.error(
+                    f"Unexpected error during {mode} operation on {url}: {str(e)}"
+                )
                 return None
-            else:
-                raise e
+            raise e

From 1bbac87e70cfe2fb71a3d5a5a5ec2af13bebbdaf Mon Sep 17 00:00:00 2001
From: Gilbert Bagaoisan
Date: Tue, 17 Dec 2024 20:54:07 -0800
Subject: [PATCH 5/5] =?UTF-8?q?Improved=20readme=20based=20on=20recommenda?=
 =?UTF-8?q?tions=E2=80=94added=20more=20advanced=20usage=20examples?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 src/crewai_tools/tools/spider_tool/README.md | 34 ++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/crewai_tools/tools/spider_tool/README.md b/src/crewai_tools/tools/spider_tool/README.md
index c2dc8826a..482c7c830 100644
--- a/src/crewai_tools/tools/spider_tool/README.md
+++ b/src/crewai_tools/tools/spider_tool/README.md
@@ -20,13 +20,43 @@ from crewai_tools import SpiderTool
 spider_tool = SpiderTool(api_key='YOUR_API_KEY')
 
 # Initialize the tool with the website URL, so the agent can only scrape the content of the specified website
-spider_tool = SpiderTool(website_url='https://www.example.com')
+spider_tool = SpiderTool(website_url='https://spider.cloud')
 
 # Pass in custom parameters, see below for more details
 spider_tool = SpiderTool(
-    website_url='https://www.example.com',
+    website_url='https://spider.cloud',
     custom_params={"depth": 2, "anti_bot": True, "proxy_enabled": True}
 )
+
+# Advanced usage: use a CSS query selector to extract specific content
+css_extraction_map = {
+    "/": [ # pass in the path (the main index in this case)
+        {
+            "name": "headers", # give this element a name
+            "selectors": [
+                "h1"
+            ]
+        }
+    ]
+}
+
+spider_tool = SpiderTool(
+    website_url='https://spider.cloud',
+    custom_params={"anti_bot": True, "proxy_enabled": True, "metadata": True, "css_extraction_map": css_extraction_map}
+)
+
+### Response (the extracted text will be in the metadata)
+"css_extracted": {
+    "headers": [
+        "The Web Crawler for AI Agents and LLMs!"
+    ]
+}
+```
+## Agent setup
+```yaml
+researcher:
+  role: >
+    You're a researcher tasked with researching a website and its content (use crawl mode). The website to crawl is: {website_url}.
+```
 ## Arguments
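
Below is a minimal end-to-end sketch of wiring the tool into a crew, following the `Agent`/`Task`/`Crew` pattern used in the earlier version of this README; the agent, task, goal, and URL values here are illustrative assumptions rather than anything defined by this patch.

```python
from crewai import Agent, Crew, Task
from crewai_tools import SpiderTool

# Assumes SPIDER_API_KEY is set in the environment (or pass api_key=... explicitly).
spider_tool = SpiderTool(website_url="https://spider.cloud")

researcher = Agent(
    role="Web Research Expert",
    goal="Summarize the content of a given website",
    backstory="An expert web researcher that uses the web extremely well",
    tools=[spider_tool],
    verbose=True,
)

research_task = Task(
    description="Crawl {website_url} (use crawl mode) and summarize its content",
    expected_output="A short summary of the website's content",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[research_task])
result = crew.kickoff(inputs={"website_url": "https://spider.cloud"})
print(result)
```

Because `website_url` is set at construction time, it also serves as the fallback URL when the agent calls the tool without one, matching the `website_url or self.website_url` fallback added in this patch.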