From 15970734e3690c81198ea238543d6a08000dd242 Mon Sep 17 00:00:00 2001 From: Mike Plachta Date: Wed, 27 Nov 2024 17:52:56 -0800 Subject: [PATCH] Jina Website Scraper v1 --- .../tools/jina_scrape_website_tool/README.md | 38 ++++++++++++++ .../jina_scrape_website_tool.py | 52 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 src/crewai_tools/tools/jina_scrape_website_tool/README.md create mode 100644 src/crewai_tools/tools/jina_scrape_website_tool/jina_scrape_website_tool.py diff --git a/src/crewai_tools/tools/jina_scrape_website_tool/README.md b/src/crewai_tools/tools/jina_scrape_website_tool/README.md new file mode 100644 index 000000000..0278e5aa0 --- /dev/null +++ b/src/crewai_tools/tools/jina_scrape_website_tool/README.md @@ -0,0 +1,38 @@ +# JinaScrapeWebsiteTool + +## Description +A tool designed to extract and read the content of a specified website by using the Jina.ai reader. It handles a web page by sending an HTTP request for it to the Jina.ai reader service and returning the text of the response (markdown content) rather than parsing HTML itself. This tool can be particularly useful for web scraping tasks, data collection, or extracting specific information from websites. + +## Installation +Install the crewai_tools package +```shell +pip install 'crewai[tools]' +``` + +## Example +```python +from crewai_tools import JinaScrapeWebsiteTool + +# To enable scraping any website it finds during its execution +tool = JinaScrapeWebsiteTool(api_key='YOUR_API_KEY') + +# Initialize the tool with the website URL, so the agent can only scrape the content of the specified website +tool = JinaScrapeWebsiteTool(website_url='https://www.example.com') + +# With custom headers +tool = JinaScrapeWebsiteTool( + website_url='https://www.example.com', + custom_headers={'X-Target-Selector': 'body, .class, #id'} +) +``` + +## Authentication +The tool uses Jina.ai's reader service. While it can work without an API key, Jina.ai may apply rate limiting or blocking to unauthenticated requests. 
For production use, it's recommended to provide an API key. + +## Arguments +- `website_url`: URL of the website to read; it must be supplied either when the tool is created or when it is run. This is the primary input for the tool, specifying which website's content should be scraped and read. +- `api_key`: Optional Jina.ai API key for authenticated access to the reader service. +- `custom_headers`: Optional dictionary of HTTP headers to use when making requests. + +## Note +This tool is an alternative to the standard `ScrapeWebsiteTool` that specifically uses Jina.ai's reader service for enhanced content extraction. Choose this tool when you need more sophisticated content parsing capabilities. \ No newline at end of file diff --git a/src/crewai_tools/tools/jina_scrape_website_tool/jina_scrape_website_tool.py b/src/crewai_tools/tools/jina_scrape_website_tool/jina_scrape_website_tool.py new file mode 100644 index 000000000..7fec77938 --- /dev/null +++ b/src/crewai_tools/tools/jina_scrape_website_tool/jina_scrape_website_tool.py @@ -0,0 +1,52 @@ +import requests +from typing import Type, Optional +from crewai_tools import BaseTool +from pydantic import BaseModel, Field + + +class JinaScrapeWebsiteToolInput(BaseModel): + """Input schema for JinaScrapeWebsiteTool.""" + website_url: str = Field(..., description="Mandatory website url to read the file") + + +class JinaScrapeWebsiteTool(BaseTool): + name: str = "JinaScrapeWebsiteTool" + description: str = "A tool that can be used to read a website content using Jina.ai reader and return markdown content." 
+    args_schema: Type[BaseModel] = JinaScrapeWebsiteToolInput
+    # Default URL used by _run when the caller does not pass one.
+    website_url: Optional[str] = None
+    # Jina.ai API key supplied to the constructor, if any.
+    api_key: Optional[str] = None
+    # HTTP headers sent with every request made by _run.
+    headers: dict = {}
+
+    def __init__(
+        self,
+        website_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        custom_headers: Optional[dict] = None,
+        **kwargs
+    ):
+        """Create the tool, optionally bound to one URL and with request headers.
+
+        Args:
+            website_url: Default URL to read. When given, the tool
+                description is rewritten to mention it.
+            api_key: Jina.ai API key. When given, it is stored on the
+                instance and sent as an ``Authorization: Bearer`` header.
+            custom_headers: Extra HTTP headers for every request. The
+                mapping is copied; the caller's dict is never modified.
+            **kwargs: Forwarded unchanged to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        if website_url is not None:
+            self.website_url = website_url
+            self.description = (
+                f"A tool that can be used to read {website_url}'s content and return markdown content."
+            )
+            # NOTE(review): _generate_description is defined outside this
+            # file (presumably on BaseTool) -- confirm it rebuilds the
+            # agent-facing description from the updated fields.
+            self._generate_description()
+
+        if custom_headers is not None:
+            # Copy so the Authorization header added below never leaks into
+            # the dict object the caller passed in.
+            self.headers = dict(custom_headers)
+
+        if api_key is not None:
+            # Keep the declared field in sync with the constructor argument.
+            self.api_key = api_key
+            # Build a new dict instead of mutating in place so a headers
+            # object shared with other code is left untouched. An existing
+            # "Authorization" entry is overridden, as before.
+            self.headers = {**self.headers, "Authorization": f"Bearer {api_key}"}
+
+    def _run(self, website_url: Optional[str] = None) -> str:
+        """Fetch a page through the Jina.ai reader endpoint and return its text.
+
+        Args:
+            website_url: URL to read. Falls back to the URL stored on the
+                instance when omitted or empty.
+
+        Returns:
+            The text of the reader's HTTP response.
+
+        Raises:
+            ValueError: If no URL is available from either source.
+            requests.HTTPError: Raised by ``raise_for_status`` when the
+                reader answers with an error status.
+        """
+        url = website_url or self.website_url
+        if not url:
+            raise ValueError("Website URL must be provided either during initialization or execution")
+
+        response = requests.get(
+            f"https://r.jina.ai/{url}",
+            headers=self.headers,
+            # Bound the wait so a stalled request cannot hang the caller.
+            timeout=15,
+        )
+        response.raise_for_status()
+        return response.text