Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools
git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
Author: Greyson Lalonde
Date: 2025-09-12 21:58:02 -04:00
Commit: e16606672a
303 changed files with 49010 additions and 0 deletions

@@ -0,0 +1,24 @@
# ScrapeWebsiteTool
## Description
A tool designed to extract and read the content of a specified website. It is capable of handling various types of web pages by making HTTP requests and parsing the received HTML content. This tool can be particularly useful for web scraping tasks, data collection, or extracting specific information from websites.
## Installation
Install the crewai_tools package
```shell
pip install 'crewai[tools]'
```
## Example
```python
from crewai_tools import ScrapeWebsiteTool
# To enable scraping any website the agent finds during its execution
tool = ScrapeWebsiteTool()
# Initialize the tool with the website URL, so the agent can only scrape the content of the specified website
tool = ScrapeWebsiteTool(website_url='https://www.example.com')
```
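
The tool can also be invoked directly outside an agent; a minimal sketch, assuming the standard `BaseTool.run` entry point exposed by crewai tools:
```python
from crewai_tools import ScrapeWebsiteTool

tool = ScrapeWebsiteTool(website_url='https://www.example.com')

# run() dispatches to the tool's scraping logic and returns the page text
text = tool.run()
print(text)
```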
## Arguments
- `website_url` : Mandatory website URL to scrape. This is the primary input for the tool, specifying which website's content should be scraped and read.

@@ -0,0 +1,79 @@
import os
import re
from typing import Any, Optional, Type

import requests

try:
    from bs4 import BeautifulSoup

    BEAUTIFULSOUP_AVAILABLE = True
except ImportError:
    BEAUTIFULSOUP_AVAILABLE = False

from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class FixedScrapeWebsiteToolSchema(BaseModel):
    """Input for ScrapeWebsiteTool with a fixed, pre-configured website URL."""


class ScrapeWebsiteToolSchema(FixedScrapeWebsiteToolSchema):
    """Input for ScrapeWebsiteTool."""

    website_url: str = Field(..., description="Mandatory website URL to scrape and read")


class ScrapeWebsiteTool(BaseTool):
    name: str = "Read website content"
    description: str = "A tool that can be used to read a website's content."
    args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema
    website_url: Optional[str] = None
    cookies: Optional[dict] = None
    # Browser-like default headers; some sites block requests that lack them.
    headers: Optional[dict] = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    def __init__(
        self,
        website_url: Optional[str] = None,
        cookies: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if not BEAUTIFULSOUP_AVAILABLE:
            raise ImportError(
                "beautifulsoup4 is not installed. Please install it with `pip install crewai-tools[beautifulsoup4]`"
            )
        if website_url is not None:
            # Pin the tool to one URL and switch to the schema without a url
            # field, so the agent cannot scrape arbitrary sites.
            self.website_url = website_url
            self.description = (
                f"A tool that can be used to read {website_url}'s content."
            )
            self.args_schema = FixedScrapeWebsiteToolSchema
            self._generate_description()
        if cookies is not None:
            # The "value" entry names an environment variable; the secret itself
            # is read from the environment rather than stored in code.
            self.cookies = {cookies["name"]: os.getenv(cookies["value"])}

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        # Prefer a URL passed at call time; fall back to the one fixed at construction.
        website_url = kwargs.get("website_url", self.website_url)
        page = requests.get(
            website_url,
            timeout=15,
            headers=self.headers,
            cookies=self.cookies if self.cookies else {},
        )
        # Use the detected encoding so non-UTF-8 pages decode correctly.
        page.encoding = page.apparent_encoding
        parsed = BeautifulSoup(page.text, "html.parser")

        text = "The following text is scraped website content:\n\n"
        text += parsed.get_text(" ")
        # Collapse runs of spaces/tabs, then trim whitespace around newlines.
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\s+\n\s+", "\n", text)
        return text
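
For reference, a minimal sketch of how the `cookies` argument above is meant to be used: the dict maps a cookie name to the *name* of an environment variable holding its value. `SESSION_ID` here is a hypothetical variable name, not part of the source:
```python
import os

from crewai_tools import ScrapeWebsiteTool

# Hypothetical env var holding the session cookie's secret value.
os.environ.setdefault("SESSION_ID", "abc123")

tool = ScrapeWebsiteTool(
    website_url="https://www.example.com",
    cookies={"name": "session_id", "value": "SESSION_ID"},  # "value" is an env var name
)
```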