Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-29 18:18:13 +00:00

Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools
git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
This commit is contained in:
87 crewai_tools/tools/spider_tool/README.md (new file)
@@ -0,0 +1,87 @@
# SpiderTool

## Description

[Spider](https://spider.cloud/?ref=crewai) is a high-performance web scraping and crawling tool that delivers optimized markdown for LLMs and AI agents. It intelligently switches between plain HTTP requests and JavaScript rendering based on page requirements, which makes it well suited for both single-page scraping and full website crawls for content extraction and data collection.

## Installation

To use the Spider API you need to install the [Spider SDK](https://pypi.org/project/spider-client/) along with `crewai[tools]`:

```shell
pip install spider-client 'crewai[tools]'
```

## Example

This example shows how to use the SpiderTool to enable your agent to scrape and crawl websites. The data returned from the Spider API is LLM-ready.

```python
from crewai_tools import SpiderTool

# To enable scraping any website it finds during its execution
spider_tool = SpiderTool(api_key='YOUR_API_KEY')

# Initialize the tool with a website URL, so the agent can only scrape
# the content of the specified website
spider_tool = SpiderTool(website_url='https://spider.cloud')

# Pass in custom parameters, see below for more details
spider_tool = SpiderTool(
    website_url='https://spider.cloud',
    custom_params={"depth": 2, "anti_bot": True, "proxy_enabled": True}
)

# Advanced usage: use CSS query selectors to extract content
css_extraction_map = {
    "/": [  # pass in a path (the main index in this case)
        {
            "name": "headers",  # give this element a name
            "selectors": [
                "h1"
            ]
        }
    ]
}

spider_tool = SpiderTool(
    website_url='https://spider.cloud',
    custom_params={
        "anti_bot": True,
        "proxy_enabled": True,
        "metadata": True,
        "css_extraction_map": css_extraction_map
    }
)
```

### Response (the extracted text will be in the metadata)

```
"css_extracted": {
    "headers": [
        "The Web Crawler for AI Agents and LLMs!"
    ]
}
```
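To illustrate how the extracted values might be read back, here is a minimal sketch. It assumes the `spider_tool` instance from the example above, that crewAI's `BaseTool.run()` dispatches keyword arguments through to the tool, and that each returned page record is a dict whose `metadata` carries the `css_extracted` map shown in the response; adjust to the actual response shape you receive.

```python
# Minimal sketch (assumptions noted above): read the CSS-extracted headers
# back out of the scrape result.
result = spider_tool.run(website_url="https://spider.cloud", mode="scrape")

# The API may return a single record or a list of page records.
pages = result if isinstance(result, list) else [result]
for page in pages:
    if not isinstance(page, dict):
        continue
    css_extracted = (page.get("metadata") or {}).get("css_extracted") or {}
    for header in css_extracted.get("headers", []):
        print(header)
```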

## Agent setup

```yaml
researcher:
  role: >
    You're a researcher tasked with researching a website and its content (use crawl mode).
    The website to crawl is: {website_url}.
```
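For reference, the same tool can be wired into an agent directly in Python. This is a minimal sketch, not part of the original tool: the goal, backstory, and task text below are illustrative placeholders, while `Agent`, `Task`, and `Crew` are the standard crewAI classes.

```python
from crewai import Agent, Crew, Task
from crewai_tools import SpiderTool

spider_tool = SpiderTool(website_url="https://spider.cloud")

researcher = Agent(
    role="Website Researcher",
    goal="Crawl {website_url} and report on its content",  # illustrative
    backstory="You research websites and summarize what they contain.",  # illustrative
    tools=[spider_tool],
)

task = Task(
    description="Crawl {website_url} in crawl mode and summarize the key pages.",
    expected_output="A short summary of the site's main content.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
result = crew.kickoff(inputs={"website_url": "https://spider.cloud"})
print(result)
```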

## Arguments

- `api_key` (string, optional): Specifies the Spider API key. If not specified, the tool looks for `SPIDER_API_KEY` in the environment variables.
- `website_url` (string): The website URL. Used as a fallback if no URL is provided at execution time.
- `log_failures` (bool): Log scrape failures instead of raising. Defaults to `true`.
- `custom_params` (object, optional): Optional parameters forwarded to the Spider API request:
  - `return_format` (string): The return format of the website's content. Defaults to `markdown`.
  - `request` (string): The request type to perform. Possible values are `http`, `chrome`, and `smart`. Use `smart` to perform an HTTP request by default and switch to JavaScript rendering only when the HTML requires it.
  - `limit` (int): The maximum number of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages.
  - `depth` (int): The maximum crawl depth. If `0`, no limit is applied.
  - `locale` (string): The locale to use for the request, for example `en-US`.
  - `cookies` (string): HTTP cookies to send with the request.
  - `stealth` (bool): Use stealth mode for headless Chrome requests to help prevent being blocked. Defaults to `true` on Chrome.
  - `headers` (object): HTTP headers to send with every request, as a map of key-value pairs.
  - `metadata` (bool): Store metadata about the pages and content found. Defaults to `false`.
  - `subdomains` (bool): Include subdomains in the crawl. Defaults to `false`.
  - `user_agent` (string): A custom HTTP user agent for the request. Defaults to a random agent.
  - `proxy_enabled` (bool): Enable high-performance premium proxies to prevent being blocked at the network level.
  - `css_extraction_map` (object): CSS or XPath selectors for scraping content from the page. Set the paths and the extraction object map to perform extractions per path or page.
  - `request_timeout` (int): The request timeout in seconds, between `5` and `60`. Defaults to `30`.
  - `return_headers` (bool): Return the HTTP response headers with the results. Defaults to `false`.
  - `filter_output_main_only` (bool): Filter the nav, aside, and footer elements from the output.

Other available parameters are documented at [https://spider.cloud/docs/api](https://spider.cloud/docs/api). A combined example follows below.
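As an illustration of how these arguments fit together, the sketch below combines several of them; the values are examples only, chosen from the options documented above.

```python
from crewai_tools import SpiderTool

# Illustrative combination of the documented arguments; the custom_params keys
# map directly onto the Spider API options listed above.
spider_tool = SpiderTool(
    website_url="https://spider.cloud",
    log_failures=True,
    custom_params={
        "return_format": "markdown",  # LLM-friendly output
        "request": "smart",           # HTTP first, JavaScript rendering when needed
        "limit": 10,                  # crawl at most 10 pages
        "depth": 2,                   # follow links up to two levels deep
        "metadata": True,             # include page metadata in the results
        "request_timeout": 30,        # seconds, allowed range 5-60
    },
)
```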

214 crewai_tools/tools/spider_tool/spider_tool.py (new file)
@@ -0,0 +1,214 @@
import logging
from typing import Any, Dict, Literal, Optional, Type, List
from urllib.parse import unquote, urlparse

from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, Field

logger = logging.getLogger(__file__)


class SpiderToolSchema(BaseModel):
    """Input schema for SpiderTool."""

    website_url: str = Field(
        ..., description="Mandatory website URL to scrape or crawl"
    )
    mode: Literal["scrape", "crawl"] = Field(
        default="scrape",
        description="The mode of the SpiderTool. The only two allowed modes are `scrape` or `crawl`. Crawl mode will follow up to 5 links and return their content in markdown format.",
    )


class SpiderToolConfig(BaseModel):
    """Configuration settings for SpiderTool.

    Contains all default values and constants used by SpiderTool.
    Centralizes configuration management for easier maintenance.
    """

    # Crawling settings
    DEFAULT_CRAWL_LIMIT: int = 5
    DEFAULT_RETURN_FORMAT: str = "markdown"

    # Request parameters
    DEFAULT_REQUEST_MODE: str = "smart"
    FILTER_SVG: bool = True


class SpiderTool(BaseTool):
    """Tool for scraping and crawling websites.

    This tool provides functionality to either scrape a single webpage or crawl multiple
    pages, returning content in a format suitable for LLM processing.
    """

    name: str = "SpiderTool"
    description: str = (
        "A tool to scrape or crawl a website and return LLM-ready content."
    )
    args_schema: Type[BaseModel] = SpiderToolSchema
    custom_params: Optional[Dict[str, Any]] = None
    website_url: Optional[str] = None
    api_key: Optional[str] = None
    spider: Any = None
    log_failures: bool = True
    config: SpiderToolConfig = SpiderToolConfig()
    package_dependencies: List[str] = ["spider-client"]
    env_vars: List[EnvVar] = [
        EnvVar(name="SPIDER_API_KEY", description="API key for Spider.cloud", required=True),
    ]

    def __init__(
        self,
        api_key: Optional[str] = None,
        website_url: Optional[str] = None,
        custom_params: Optional[Dict[str, Any]] = None,
        log_failures: bool = True,
        **kwargs,
    ):
        """Initialize SpiderTool for web scraping and crawling.

        Args:
            api_key (Optional[str]): Spider API key for authentication. Required for production use.
            website_url (Optional[str]): Default website URL to scrape/crawl. Can be overridden during execution.
            custom_params (Optional[Dict[str, Any]]): Additional parameters to pass to the Spider API.
                These override any parameters set by the LLM.
            log_failures (bool): If True, logs errors. Defaults to True.
            **kwargs: Additional arguments passed to BaseTool.

        Raises:
            ImportError: If the spider-client package is not installed.
            RuntimeError: If Spider client initialization fails.
        """
        super().__init__(**kwargs)
        if website_url is not None:
            self.website_url = website_url

        self.log_failures = log_failures
        self.custom_params = custom_params

        try:
            from spider import Spider  # type: ignore
        except ImportError:
            import click

            if click.confirm(
                "You are missing the 'spider-client' package. Would you like to install it?"
            ):
                import subprocess

                subprocess.run(["uv", "pip", "install", "spider-client"], check=True)
                from spider import Spider
            else:
                raise ImportError(
                    "`spider-client` package not found, please run `uv add spider-client`"
                )
        self.spider = Spider(api_key=api_key)

    def _validate_url(self, url: str) -> bool:
        """Validate URL format and security constraints.

        Args:
            url (str): URL to validate. Must be a properly formatted HTTP(S) URL.

        Returns:
            bool: True if the URL is valid and meets security requirements, False otherwise.
        """
        try:
            url = url.strip()
            decoded_url = unquote(url)

            result = urlparse(decoded_url)
            if not all([result.scheme, result.netloc]):
                return False

            if result.scheme not in ["http", "https"]:
                return False

            return True
        except Exception:
            return False

    def _run(
        self,
        website_url: str,
        mode: Literal["scrape", "crawl"] = "scrape",
    ) -> Optional[str]:
        """Execute the spider tool to scrape or crawl the specified website.

        Args:
            website_url (str): The URL to process. Must be a valid HTTP(S) URL.
            mode (Literal["scrape", "crawl"]): Operation mode.
                - "scrape": Extract content from a single page.
                - "crawl": Follow links and extract content from multiple pages.

        Returns:
            Optional[str]: Extracted content in markdown format, or None if extraction fails
                and log_failures is True.

        Raises:
            ValueError: If the URL is invalid or missing, or if the mode is invalid.
            ImportError: If the spider-client package is not properly installed.
            ConnectionError: If the network connection fails while accessing the URL.
            Exception: For other runtime errors.
        """
        try:
            params = {}
            url = website_url or self.website_url

            if not url:
                raise ValueError(
                    "Website URL must be provided either during initialization or execution"
                )

            if not self._validate_url(url):
                raise ValueError(f"Invalid URL format: {url}")

            if mode not in ["scrape", "crawl"]:
                raise ValueError(
                    f"Invalid mode: {mode}. Must be either 'scrape' or 'crawl'"
                )

            params = {
                "request": self.config.DEFAULT_REQUEST_MODE,
                "filter_output_svg": self.config.FILTER_SVG,
                "return_format": self.config.DEFAULT_RETURN_FORMAT,
            }

            if mode == "crawl":
                params["limit"] = self.config.DEFAULT_CRAWL_LIMIT

            if self.custom_params:
                params.update(self.custom_params)

            action = (
                self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
            )
            return action(url=url, params=params)

        except ValueError as ve:
            if self.log_failures:
                logger.error(f"Validation error for URL {url}: {str(ve)}")
                return None
            raise ve

        except ImportError as ie:
            logger.error(f"Spider client import error: {str(ie)}")
            raise ie

        except ConnectionError as ce:
            if self.log_failures:
                logger.error(f"Connection error while accessing {url}: {str(ce)}")
                return None
            raise ce

        except Exception as e:
            if self.log_failures:
                logger.error(
                    f"Unexpected error during {mode} operation on {url}: {str(e)}"
                )
                return None
            raise e
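
Finally, a minimal sketch of exercising the tool directly, outside of an agent. It assumes `SPIDER_API_KEY` is set in the environment and that crewAI's `BaseTool.run()` dispatches keyword arguments through to `_run`, as the schema above expects.

```python
from crewai_tools import SpiderTool

tool = SpiderTool()  # assumes SPIDER_API_KEY is set in the environment

# Single-page scrape: returns markdown, or None on failure when log_failures=True.
page = tool.run(website_url="https://spider.cloud", mode="scrape")

# Crawl mode: follows links, capped at DEFAULT_CRAWL_LIMIT (5) pages unless
# custom_params overrides "limit".
site = tool.run(website_url="https://spider.cloud", mode="crawl")

print(type(page), type(site))
```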