@@ -34,7 +34,6 @@ from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
 from .txt_search_tool.txt_search_tool import TXTSearchTool
 from .website_search.website_search_tool import WebsiteSearchTool
 from .xml_search_tool.xml_search_tool import XMLSearchTool
-from .youtube_channel_search_tool.youtube_channel_search_tool import (
-    YoutubeChannelSearchTool,
-)
+from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool
 from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
+from .spider_tool.spider_tool import SpiderTool
src/crewai_tools/tools/spider_tool/README.md (new file, 81 lines)
@@ -0,0 +1,81 @@
# SpiderTool

## Description

[Spider](https://spider.cloud/?ref=crewai) is the [fastest](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md#benchmark-results) open source scraper and crawler that returns LLM-ready data. It converts any website into pure HTML, markdown, metadata, or text while enabling you to crawl with custom actions using AI.
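For reference, here is a minimal sketch of the raw `spider-client` call that `SpiderTool` wraps (see `spider_tool.py` below). It assumes the client falls back to the `SPIDER_API_KEY` environment variable when no key is passed:

```python
from spider import Spider  # provided by the spider-client package

# Assumes SPIDER_API_KEY is set in the environment (api_key=None triggers that fallback).
client = Spider(api_key=None)

# Fetch a single page as LLM-ready markdown; this is the same call SpiderTool makes internally.
data = client.scrape_url(url="https://spider.cloud", params={"return_format": "markdown"})
print(data)
```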
## Installation

To use the Spider API you need to install the [Spider SDK](https://pypi.org/project/spider-client/) together with the `crewai[tools]` SDK:

```shell
pip install spider-client 'crewai[tools]'
```
## Example

This example shows how to use the Spider tool to let your agent scrape and crawl websites. The data returned from the Spider API is already LLM-ready, so no extra cleaning is needed.

```python
from crewai import Agent, Crew, Task
from crewai_tools import SpiderTool


def main():
    spider_tool = SpiderTool()

    searcher = Agent(
        role="Web Research Expert",
        goal="Find related information from specific URLs",
        backstory="An expert web researcher that uses the web extremely well",
        tools=[spider_tool],
        verbose=True,
    )

    return_metadata = Task(
        description="Scrape https://spider.cloud with a limit of 1 and enable metadata",
        expected_output="Metadata and 10 word summary of spider.cloud",
        agent=searcher,
    )

    crew = Crew(
        agents=[searcher],
        tasks=[return_metadata],
        verbose=2,
    )

    crew.kickoff()


if __name__ == "__main__":
    main()
```
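The example constructs `SpiderTool()` without an explicit key, so it relies on the `SPIDER_API_KEY` environment variable (see the `api_key` argument below). A minimal sketch of providing the key, using a placeholder value:

```python
import os

from crewai_tools import SpiderTool

# Option 1: set the environment variable before constructing the tool
# (placeholder value; normally you would export this in your shell instead).
os.environ["SPIDER_API_KEY"] = "your-spider-api-key"
spider_tool = SpiderTool()

# Option 2: pass the key explicitly to the tool.
spider_tool = SpiderTool(api_key="your-spider-api-key")
```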

## Arguments

`api_key` is a constructor argument and `params` is the container for the remaining options, which are forwarded to the Spider API; see the sketch after this list.

- `api_key` (string, optional): The Spider API key. If not specified, the tool looks for `SPIDER_API_KEY` in the environment variables.
- `params` (object, optional): Optional parameters for the request. Defaults to `{"return_format": "markdown"}` so the website content is returned in a format better suited to LLMs.
- `request` (string): The request type to perform. Possible values are `http`, `chrome`, and `smart`. Use `smart` to perform an HTTP request by default and switch to JavaScript rendering only when the page requires it.
- `limit` (int): The maximum number of pages to crawl per website. Remove the value or set it to `0` to crawl all pages.
- `depth` (int): The maximum crawl depth. If `0`, no limit is applied.
- `cache` (bool): Use HTTP caching for the crawl to speed up repeated runs. Defaults to `true`.
- `budget` (object): A map of paths to page counters that limits how many pages are crawled, e.g. `{"*": 1}` to crawl only the root page.
- `locale` (string): The locale to use for the request, e.g. `en-US`.
- `cookies` (string): HTTP cookies to send with the request.
- `stealth` (bool): Use stealth mode for headless Chrome requests to help avoid being blocked. Defaults to `true` on Chrome.
- `headers` (object): HTTP headers to forward with all requests, as a map of key-value pairs.
- `metadata` (bool): Store metadata about the pages and content found. This can help improve AI interoperability. Defaults to `false` unless the website is already stored with this configuration enabled.
- `viewport` (object): Configure the viewport for Chrome. Defaults to `800x600`.
- `encoding` (string): The encoding to use, such as `UTF-8` or `SHIFT_JIS`.
- `subdomains` (bool): Include subdomains in the crawl. Defaults to `false`.
- `user_agent` (string): A custom HTTP user agent for the request. Defaults to a random agent.
- `store_data` (bool): Whether storage should be used. If set, this takes precedence over `storageless`. Defaults to `false`.
- `gpt_config` (object): Use AI to generate actions to perform during the crawl. You can pass an array for the `"prompt"` to chain steps.
- `fingerprint` (bool): Use advanced fingerprinting for Chrome.
- `storageless` (bool): Prevent storing any data for the request, including storage and AI vector embeddings. Defaults to `false` unless the website is already stored.
- `readability` (bool): Use [readability](https://github.com/mozilla/readability) to pre-process the content for reading. This may drastically improve the content for LLM usage.
- `return_format` (string): The format to return the data in. Possible values are `markdown`, `raw`, `text`, and `html2text`. Use `raw` to return the page's default format, e.g. HTML.
- `proxy_enabled` (bool): Enable high-performance premium proxies for the request to avoid being blocked at the network level.
- `query_selector` (string): The CSS query selector to use when extracting content from the markup.
- `full_resources` (bool): Crawl and download all resources for a website.
- `request_timeout` (int): The timeout for the request, in seconds, between `5` and `60`. Defaults to `30`.
- `run_in_background` (bool): Run the request in the background. Useful when storing data and triggering crawls to the dashboard. Has no effect if `storageless` is set.
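As a rough illustration, the sketch below builds a `params` dict from a few of the options above, constructs the tool with a placeholder key, and calls its `_run` method directly instead of going through an agent:

```python
from crewai_tools import SpiderTool

# Placeholder key for illustration; omit it to fall back to SPIDER_API_KEY.
spider_tool = SpiderTool(api_key="your-spider-api-key")

# A few of the options listed above, forwarded to the Spider API.
params = {
    "limit": 5,             # crawl at most 5 pages
    "metadata": True,       # include page metadata in the result
    "proxy_enabled": True,  # use premium proxies to avoid network-level blocks
    # "return_format" is always forced to "markdown" by the tool itself.
}

# Direct call to the method the agent would otherwise invoke through the tool interface.
result = spider_tool._run(url="https://spider.cloud", params=params, mode="crawl")
print(result)
```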
src/crewai_tools/tools/spider_tool/spider_tool.py (new file, 59 lines)
@@ -0,0 +1,59 @@
from typing import Optional, Any, Type, Dict, Literal
from pydantic.v1 import BaseModel, Field
from crewai_tools.tools.base_tool import BaseTool


class SpiderToolSchema(BaseModel):
    url: str = Field(description="Website URL")
    params: Optional[Dict[str, Any]] = Field(
        description="Set additional params. Options include:\n"
        "- `limit`: Optional[int] - The maximum number of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages.\n"
        "- `depth`: Optional[int] - The crawl limit for maximum depth. If `0`, no limit will be applied.\n"
        "- `metadata`: Optional[bool] - Boolean to include metadata or not. Defaults to `False` unless set to `True`. If the user wants metadata, include params.metadata = True.\n"
        "- `query_selector`: Optional[str] - The CSS query selector to use when extracting content from the markup.\n"
    )
    mode: Literal["scrape", "crawl"] = Field(
        default="scrape",
        description="Mode, the only two allowed modes are `scrape` or `crawl`. Use `scrape` to scrape a single page and `crawl` to crawl the entire website following subpages. These modes are the only allowed values even when ANY params is set.",
    )


class SpiderTool(BaseTool):
    name: str = "Spider scrape & crawl tool"
    description: str = "Scrape & Crawl any url and return LLM-ready data."
    args_schema: Type[BaseModel] = SpiderToolSchema
    api_key: Optional[str] = None
    spider: Optional[Any] = None

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        # Import lazily so the dependency is only required when the tool is used.
        try:
            from spider import Spider  # type: ignore
        except ImportError:
            raise ImportError(
                "`spider-client` package not found, please run `pip install spider-client`"
            )

        self.spider = Spider(api_key=api_key)

    def _run(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        mode: Optional[Literal["scrape", "crawl"]] = "scrape",
    ):
        if mode not in ["scrape", "crawl"]:
            raise ValueError(
                "Unknown mode in `mode` parameter, `scrape` or `crawl` are the allowed modes"
            )

        # Ensure 'return_format': 'markdown' is always included
        if params:
            params["return_format"] = "markdown"
        else:
            params = {"return_format": "markdown"}

        # Dispatch to the matching Spider client call and return its LLM-ready output.
        action = (
            self.spider.scrape_url if mode == "scrape" else self.spider.crawl_url
        )
        spider_docs = action(url=url, params=params)

        return spider_docs
tests/spider_tool_test.py (new file, 47 lines)
@@ -0,0 +1,47 @@
from crewai_tools.tools.spider_tool.spider_tool import SpiderTool
from crewai import Agent, Task, Crew


def test_spider_tool():
    spider_tool = SpiderTool()

    searcher = Agent(
        role="Web Research Expert",
        goal="Find related information from specific URLs",
        backstory="An expert web researcher that uses the web extremely well",
        tools=[spider_tool],
        verbose=True,
        cache=False,
    )

    choose_between_scrape_crawl = Task(
        description="Scrape the page of spider.cloud and return a summary of how fast it is",
        expected_output="spider.cloud is a fast scraping and crawling tool",
        agent=searcher,
    )

    return_metadata = Task(
        description="Scrape https://spider.cloud with a limit of 1 and enable metadata",
        expected_output="Metadata and 10 word summary of spider.cloud",
        agent=searcher,
    )

    css_selector = Task(
        description="Scrape one page of spider.cloud with the `body > div > main > section.grid.md\:grid-cols-2.gap-10.place-items-center.md\:max-w-screen-xl.mx-auto.pb-8.pt-20 > div:nth-child(1) > h1` CSS selector",
        expected_output="The content of the element with the css selector body > div > main > section.grid.md\:grid-cols-2.gap-10.place-items-center.md\:max-w-screen-xl.mx-auto.pb-8.pt-20 > div:nth-child(1) > h1",
        agent=searcher,
    )

    crew = Crew(
        agents=[searcher],
        tasks=[
            choose_between_scrape_crawl,
            return_metadata,
            css_selector,
        ],
        verbose=2,
    )

    crew.kickoff()


if __name__ == "__main__":
    test_spider_tool()