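"""Scrape the visible text content of a website.

Defines ScrapeWebsiteTool, a crewAI tool that fetches a page with
browser-like headers, parses it with BeautifulSoup, and returns
whitespace-normalized text.
"""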
import os
import re
from typing import Any, Optional, Type

import requests
try:
    from bs4 import BeautifulSoup

    BEAUTIFULSOUP_AVAILABLE = True
except ImportError:
    BEAUTIFULSOUP_AVAILABLE = False

from crewai.tools import BaseTool
from pydantic import BaseModel, Field

class FixedScrapeWebsiteToolSchema(BaseModel):
    """Input for ScrapeWebsiteTool when the URL is fixed at construction time."""

class ScrapeWebsiteToolSchema(FixedScrapeWebsiteToolSchema):
    """Input for ScrapeWebsiteTool."""

    website_url: str = Field(..., description="Mandatory website URL to read")

class ScrapeWebsiteTool(BaseTool):
    name: str = "Read website content"
    description: str = "A tool that can be used to read a website's content."
    args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema
    website_url: Optional[str] = None
    cookies: Optional[dict] = None
    # Browser-like default headers reduce the chance of being blocked by
    # sites that reject requests with no User-Agent.
    headers: Optional[dict] = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    def __init__(
        self,
        website_url: Optional[str] = None,
        cookies: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if not BEAUTFULSOUP_AVAILABLE if False else not BEAUTIFULSOUP_AVAILABLE:
            raise ImportError(
                "beautifulsoup4 is not installed. Please install it with "
                "`pip install crewai-tools[beautifulsoup4]`"
            )

        if website_url is not None:
            self.website_url = website_url
            self.description = (
                f"A tool that can be used to read {website_url}'s content."
            )
            # With a fixed URL, the agent-facing schema no longer requires one.
            self.args_schema = FixedScrapeWebsiteToolSchema
            self._generate_description()
        if cookies is not None:
            # The cookie value names an environment variable, so the secret
            # itself stays out of source code and agent configuration.
            self.cookies = {cookies["name"]: os.getenv(cookies["value"])}
    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        page = requests.get(
            website_url,
            timeout=15,
            headers=self.headers,
            cookies=self.cookies if self.cookies else {},
        )

        # Use the detected encoding so non-UTF-8 pages decode correctly.
        page.encoding = page.apparent_encoding
        parsed = BeautifulSoup(page.text, "html.parser")

        text = "The following text is scraped website content:\n\n"
        text += parsed.get_text(" ")
        # Collapse runs of spaces/tabs, then strip whitespace around newlines.
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\s+\n\s+", "\n", text)
        return text
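

# --- Usage sketch (illustrative, not part of the upstream module) ---
# Assumes crewai-tools and beautifulsoup4 are installed; example.com is a
# placeholder URL. With a fixed URL the tool needs no input from the agent,
# so run() can be called with no arguments.
if __name__ == "__main__":
    tool = ScrapeWebsiteTool(website_url="https://example.com")
    print(tool.run())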