From 9e560ff9517f3f62c80c59965a7c3c8e8abfabe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Moura?= Date: Mon, 26 Feb 2024 06:15:15 -0300 Subject: [PATCH] adding new scrapping tools --- src/crewai_tools/__init__.py | 2 + src/crewai_tools/tools/__init__.py | 2 + .../scrape_element_from_website.py | 43 +++++++++++++++++++ .../scrape_website_tool.py | 36 ++++++++++++++++ 4 files changed, 83 insertions(+) create mode 100644 src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py create mode 100644 src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py index 08e5df185..e417720d7 100644 --- a/src/crewai_tools/__init__.py +++ b/src/crewai_tools/__init__.py @@ -14,6 +14,8 @@ from .tools import ( PDFSearchTool, PGSearchTool, RagTool, + ScrapeElementFromWebsiteTool, + ScrapeWebsiteTool, WebsiteSearchTool, XMLSearchTool, YoutubeChannelSearchTool, diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py index e2382eb9b..ecea1cb3f 100644 --- a/src/crewai_tools/tools/__init__.py +++ b/src/crewai_tools/tools/__init__.py @@ -12,6 +12,8 @@ from .mdx_seach_tool.mdx_search_tool import MDXSearchTool from .pdf_search_tool.pdf_search_tool import PDFSearchTool from .pg_seach_tool.pg_search_tool import PGSearchTool from .rag.rag_tool import RagTool +from .scrape_element_from_website.scrape_element_from_website import ScrapeElementFromWebsiteTool +from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool from .website_search.website_search_tool import WebsiteSearchTool from .xml_search_tool.xml_search_tool import XMLSearchTool from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool diff --git a/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py b/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py new file mode 100644 index 
# --- src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py ---
import requests
from bs4 import BeautifulSoup
from typing import Optional, Type, Any
from pydantic.v1 import BaseModel, Field
from ..base_tool import BaseTool


class FixedScrapeElementFromWebsiteToolSchema(BaseModel):
    """Input for ScrapeElementFromWebsiteTool when url/element were fixed at construction."""
    pass


class ScrapeElementFromWebsiteToolSchema(FixedScrapeElementFromWebsiteToolSchema):
    """Input for ScrapeElementFromWebsiteTool."""
    website_url: str = Field(..., description="Mandatory website url to read the file")
    css_element: str = Field(..., description="Mandatory css reference for element to scrape from the website")


class ScrapeElementFromWebsiteTool(BaseTool):
    """Tool that fetches a web page and returns the text of elements matching a CSS selector.

    If ``website_url``/``css_element`` are supplied at construction, the tool is
    "fixed" to that page/selector and the args schema is narrowed accordingly.
    """
    name: str = "Read a website content"
    description: str = "A tool that can be used to read a website content."
    args_schema: Type[BaseModel] = ScrapeElementFromWebsiteToolSchema
    # Optional pre-set target; when set in __init__, _run falls back to these.
    website_url: Optional[str] = None
    css_element: Optional[str] = None

    def __init__(self, website_url: Optional[str] = None, css_element: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        if website_url is not None:
            self.website_url = website_url
            self.css_element = css_element
            self.description = f"A tool that can be used to read {website_url}'s content."
            # Target is fixed, so the run-time schema no longer requires url/element.
            self.args_schema = FixedScrapeElementFromWebsiteToolSchema

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        """Scrape matching elements' text, one element per line.

        Reads ``website_url``/``css_element`` from kwargs, falling back to the
        values fixed at construction time.
        """
        website_url = kwargs.get('website_url', self.website_url)
        css_element = kwargs.get('css_element', self.css_element)
        # Explicit timeout: requests.get has NO default timeout and would
        # otherwise block indefinitely on an unresponsive host.
        page = requests.get(website_url, timeout=15)
        parsed = BeautifulSoup(page.content, "html.parser")
        elements = parsed.select(css_element)
        return "\n".join(element.get_text() for element in elements)


# --- src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py ---


class FixedScrapeWebsiteToolSchema(BaseModel):
    """Input for ScrapeWebsiteTool when the url was fixed at construction."""
    pass


class ScrapeWebsiteToolSchema(FixedScrapeWebsiteToolSchema):
    """Input for ScrapeWebsiteTool."""
    website_url: str = Field(..., description="Mandatory website url to read the file")


class ScrapeWebsiteTool(BaseTool):
    """Tool that fetches a web page and returns its full visible text content.

    If ``website_url`` is supplied at construction, the tool is fixed to that
    page and the args schema is narrowed accordingly.
    """
    name: str = "Read a website content"
    description: str = "A tool that can be used to read a website content."
    args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema
    # Optional pre-set target; when set in __init__, _run falls back to it.
    website_url: Optional[str] = None

    def __init__(self, website_url: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that can be used to read {website_url}'s content."
            # Target is fixed, so the run-time schema no longer requires the url.
            self.args_schema = FixedScrapeWebsiteToolSchema

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        """Fetch the page and return all of its text, stripped of markup."""
        website_url = kwargs.get('website_url', self.website_url)
        # Explicit timeout: requests.get has NO default timeout and would
        # otherwise block indefinitely on an unresponsive host.
        page = requests.get(website_url, timeout=15)
        parsed = BeautifulSoup(page.content, "html.parser")
        return parsed.get_text()