Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
2026-01-26 16:48:13 +00:00 · 2025-09-12 21:58:02 -04:00
commit e16606672a
303 changed files with 49010 additions and 0 deletions
--- a/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py
+++ b/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py
@@ -0,0 +1,79 @@
+import os
+import re
+from typing import Any, Optional, Type
+
+import requests
+try:
+    from bs4 import BeautifulSoup
+    BEAUTIFULSOUP_AVAILABLE = True
+except ImportError:
+    BEAUTIFULSOUP_AVAILABLE = False
+from crewai.tools import BaseTool
+from pydantic import BaseModel, Field
+
+
+class FixedScrapeWebsiteToolSchema(BaseModel):
+    """Input for ScrapeWebsiteTool."""
+
+
+class ScrapeWebsiteToolSchema(FixedScrapeWebsiteToolSchema):
+    """Input for ScrapeWebsiteTool."""
+
+    website_url: str = Field(..., description="Mandatory website url to read the file")
+
+
+class ScrapeWebsiteTool(BaseTool):
+    name: str = "Read website content"
+    description: str = "A tool that can be used to read a website content."
+    args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema
+    website_url: Optional[str] = None
+    cookies: Optional[dict] = None
+    headers: Optional[dict] = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+        "Accept-Language": "en-US,en;q=0.9",
+        "Referer": "https://www.google.com/",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }
+
+    def __init__(
+        self,
+        website_url: Optional[str] = None,
+        cookies: Optional[dict] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if not BEAUTIFULSOUP_AVAILABLE:
+            raise ImportError("beautifulsoup4 is not installed. Please install it with `pip install crewai-tools[beautifulsoup4]`")
+
+        if website_url is not None:
+            self.website_url = website_url
+            self.description = (
+                f"A tool that can be used to read {website_url}'s content."
+            )
+            self.args_schema = FixedScrapeWebsiteToolSchema
+            self._generate_description()
+            if cookies is not None:
+                self.cookies = {cookies["name"]: os.getenv(cookies["value"])}
+
+    def _run(
+        self,
+        **kwargs: Any,
+    ) -> Any:
+        website_url = kwargs.get("website_url", self.website_url)
+        page = requests.get(
+            website_url,
+            timeout=15,
+            headers=self.headers,
+            cookies=self.cookies if self.cookies else {},
+        )
+
+        page.encoding = page.apparent_encoding
+        parsed = BeautifulSoup(page.text, "html.parser")
+
+        text = "The following text is scraped website content:\n\n"
+        text += parsed.get_text(" ")
+        text = re.sub("[ \t]+", " ", text)
+        text = re.sub("\\s+\n\\s+", "\n", text)
+        return text