Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools
git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
crewai_tools/tools/scrape_website_tool/README.md (new file, 24 lines)
@@ -0,0 +1,24 @@
# ScrapeWebsiteTool

## Description

A tool designed to extract and read the content of a specified website. It handles a wide range of web pages by making HTTP requests and parsing the returned HTML, which makes it useful for web scraping tasks, data collection, and extracting specific information from websites.

## Installation

Install the crewai_tools package:

```shell
pip install 'crewai[tools]'
```

## Example

```python
from crewai_tools import ScrapeWebsiteTool

# To let the agent scrape any website it encounters during its execution
tool = ScrapeWebsiteTool()

# Initialize the tool with a website URL, so the agent can only scrape
# the content of the specified website
tool = ScrapeWebsiteTool(website_url='https://www.example.com')
```

## Arguments

- `website_url`: Mandatory website URL to read. This is the primary input for the tool, specifying which website's content should be scraped. When no URL is fixed at construction time, it can be supplied per call, as sketched below.
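
The tool can also be invoked directly, outside of an agent. A minimal sketch, assuming the `run()` entry point that crewai's `BaseTool` exposes and using `https://www.example.com` as a placeholder URL:

```python
from crewai_tools import ScrapeWebsiteTool

# Fixed-URL tool: every call scrapes the URL given at construction time
tool = ScrapeWebsiteTool(website_url='https://www.example.com')
content = tool.run()

# Open-ended tool: the URL is supplied per call instead
tool = ScrapeWebsiteTool()
content = tool.run(website_url='https://www.example.com')
```

In both cases the result is the page's visible text, prefixed with a short header and with whitespace collapsed.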

crewai_tools/tools/scrape_website_tool/scrape_website_tool.py (new file, 79 lines)

@@ -0,0 +1,79 @@
import os
import re
from typing import Any, Optional, Type

import requests

try:
    from bs4 import BeautifulSoup

    BEAUTIFULSOUP_AVAILABLE = True
except ImportError:
    BEAUTIFULSOUP_AVAILABLE = False

from crewai.tools import BaseTool
from pydantic import BaseModel, Field


class FixedScrapeWebsiteToolSchema(BaseModel):
    """Input for ScrapeWebsiteTool."""


class ScrapeWebsiteToolSchema(FixedScrapeWebsiteToolSchema):
    """Input for ScrapeWebsiteTool."""

    website_url: str = Field(..., description="Mandatory website URL to read")


class ScrapeWebsiteTool(BaseTool):
    name: str = "Read website content"
    description: str = "A tool that can be used to read a website's content."
    args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema
    website_url: Optional[str] = None
    cookies: Optional[dict] = None
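    # Browser-like default headers: many sites reject requests that arrive
    # with no User-Agent or an obviously non-browser one, so the tool
    # impersonates a desktop Chrome session by default.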
    headers: Optional[dict] = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    def __init__(
        self,
        website_url: Optional[str] = None,
        cookies: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if not BEAUTIFULSOUP_AVAILABLE:
            raise ImportError(
                "beautifulsoup4 is not installed. Please install it with "
                "`pip install crewai-tools[beautifulsoup4]`"
            )

        if website_url is not None:
            self.website_url = website_url
            self.description = (
                f"A tool that can be used to read {website_url}'s content."
            )
            # Once the URL is fixed, drop it from the input schema so the
            # agent cannot pass a different one at call time.
            self.args_schema = FixedScrapeWebsiteToolSchema
            self._generate_description()
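        # The cookies mapping is indirect: "name" is the cookie's name and
        # "value" names an environment variable holding the cookie's value,
        # so the secret itself never has to appear in code or config.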
        if cookies is not None:
            self.cookies = {cookies["name"]: os.getenv(cookies["value"])}

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        page = requests.get(
            website_url,
            timeout=15,
            headers=self.headers,
            cookies=self.cookies if self.cookies else {},
        )

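        # Trust the encoding detected from the response body over the one
        # declared in the headers, which is often missing or wrong.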
        page.encoding = page.apparent_encoding
        parsed = BeautifulSoup(page.text, "html.parser")

        text = "The following text is scraped website content:\n\n"
        text += parsed.get_text(" ")
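        # Normalize whitespace: collapse runs of spaces/tabs, then strip
        # the blank padding around newlines left over from the HTML layout.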
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\s+\n\s+", "\n", text)
        return text
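
For reference, a minimal sketch of how the `cookies` argument above is meant to be supplied. Note the indirection: `"value"` names an environment variable, not the cookie value itself. The names `session_id` and `SESSION_COOKIE` are hypothetical.

```python
import os

from crewai_tools import ScrapeWebsiteTool

# Hypothetical env var; in practice it would be set in the shell or
# deployment environment, never hard-coded like this.
os.environ["SESSION_COOKIE"] = "abc123"

# Resolves to cookies={"session_id": "abc123"} inside the tool
tool = ScrapeWebsiteTool(
    website_url="https://www.example.com",
    cookies={"name": "session_id", "value": "SESSION_COOKIE"},
)
```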