From 640b5a9461e974b78d36cc5f7b4b4dbc74c85275 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Moura?=
Date: Thu, 29 Feb 2024 03:09:14 -0300
Subject: [PATCH] adding initial selenium scraping tool

---
 src/crewai_tools/__init__.py       |  1 +
 src/crewai_tools/tools/__init__.py |  1 +
 .../selenium_scraping_tool.py      | 85 ++++++++++++++++++++++
 3 files changed, 87 insertions(+)
 create mode 100644 src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py

diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py
index e417720d7..63dfedc15 100644
--- a/src/crewai_tools/__init__.py
+++ b/src/crewai_tools/__init__.py
@@ -16,6 +16,7 @@ from .tools import (
     RagTool,
     ScrapeElementFromWebsiteTool,
     ScrapeWebsiteTool,
+    SeleniumScrapingTool,
     WebsiteSearchTool,
     XMLSearchTool,
     YoutubeChannelSearchTool,
diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py
index ecea1cb3f..261437d5f 100644
--- a/src/crewai_tools/tools/__init__.py
+++ b/src/crewai_tools/tools/__init__.py
@@ -14,6 +14,7 @@ from .pg_seach_tool.pg_search_tool import PGSearchTool
 from .rag.rag_tool import RagTool
 from .scrape_element_from_website.scrape_element_from_website import ScrapeElementFromWebsiteTool
 from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
+from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
 from .website_search.website_search_tool import WebsiteSearchTool
 from .xml_search_tool.xml_search_tool import XMLSearchTool
 from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool
diff --git a/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py b/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py
new file mode 100644
index 000000000..6af3e18cb
--- /dev/null
+++ b/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py
@@ -0,0 +1,85 @@
+from typing import Optional, Type, Any
+import time
+from pydantic.v1 import BaseModel, Field, PrivateAttr
+
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+
+from ..base_tool import BaseTool
+
+class FixedSeleniumScrapingToolSchema(BaseModel):
+    """Input for SeleniumScrapingTool."""
+    pass
+
+class SeleniumScrapingToolSchema(FixedSeleniumScrapingToolSchema):
+    """Input for SeleniumScrapingTool."""
+    website_url: str = Field(..., description="Mandatory website url to scrape")
+    css_element: str = Field(..., description="Mandatory css reference for the element to scrape from the website")
+
+class SeleniumScrapingTool(BaseTool):
+    name: str = "Read a website content"
+    description: str = "A tool that can be used to read a website's content."
+    args_schema: Type[BaseModel] = SeleniumScrapingToolSchema
+    website_url: Optional[str] = None
+    driver: Optional[Any] = webdriver.Chrome
+    cookie: Optional[dict] = None
+    wait_time: Optional[int] = 3
+    css_element: Optional[str] = None
+    _driver_instance: Optional[Any] = PrivateAttr(default=None)
+
+    def __init__(self, website_url: Optional[str] = None, cookie: Optional[dict] = None, css_element: Optional[str] = None, **kwargs):
+        super().__init__(**kwargs)
+        if cookie is not None:
+            self.cookie = cookie
+
+        if css_element is not None:
+            self.css_element = css_element
+
+        if website_url is not None:
+            self.website_url = website_url
+            self.description = f"A tool that can be used to read {website_url}'s content."
+            # URL is fixed at construction time, so the runtime schema takes no arguments
+            self.args_schema = FixedSeleniumScrapingToolSchema
+
+        self._generate_description()
+
+    def _run(
+        self,
+        **kwargs: Any,
+    ) -> Any:
+        website_url = kwargs.get('website_url', self.website_url)
+        css_element = kwargs.get('css_element', self.css_element)
+        driver = self._create_driver(website_url, self.cookie, self.wait_time)
+
+        content = []
+        if css_element is None or css_element.strip() == "":
+            # No selector given, fall back to the full page body text
+            body_text = driver.find_element(By.TAG_NAME, "body").text
+            content.append(body_text)
+        else:
+            for element in driver.find_elements(By.CSS_SELECTOR, css_element):
+                content.append(element.text)
+        self.close()
+        return "\n".join(content)
+
+    def _create_driver(self, url, cookie, wait_time):
+        options = Options()
+        options.add_argument("--headless")
+        driver = self.driver(options=options)
+        driver.get(url)
+        time.sleep(wait_time)
+        if cookie:
+            # Cookies can only be added after an initial page load, then reload
+            driver.add_cookie(cookie)
+            time.sleep(wait_time)
+            driver.get(url)
+            time.sleep(wait_time)
+        self._driver_instance = driver
+        return driver
+
+    def close(self):
+        # Quit the driver created by the most recent run, if any
+        if self._driver_instance is not None:
+            self._driver_instance.quit()
+            self._driver_instance = None
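
A minimal usage sketch for the new tool, assuming crewai-tools is installed with selenium available, a Chrome/chromedriver binary is on PATH, and BaseTool exposes the usual run() wrapper around _run(); the URL and CSS selector below are placeholder values:

    from crewai_tools import SeleniumScrapingTool

    # Fixed-target mode: the URL (and optional selector) are baked in at
    # construction time, so the agent-facing schema takes no arguments.
    tool = SeleniumScrapingTool(website_url="https://example.com", css_element="h1")
    print(tool.run())

    # Flexible mode: the URL and selector are supplied per call instead.
    tool = SeleniumScrapingTool()
    print(tool.run(website_url="https://example.com", css_element="h1"))

Each call spins up a fresh headless browser, waits wait_time seconds for the page to settle, and quits the session via close() once the text has been collected.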