adding initial selenium scraping tool

This commit is contained in:
João Moura
2024-02-29 03:09:14 -03:00
parent f2dfa07221
commit 640b5a9461
3 changed files with 80 additions and 0 deletions

View File

@@ -16,6 +16,7 @@ from .tools import (
RagTool,
ScrapeElementFromWebsiteTool,
ScrapeWebsiteTool,
SeleniumScrapingTool,
WebsiteSearchTool,
XMLSearchTool,
YoutubeChannelSearchTool,

View File

@@ -14,6 +14,7 @@ from .pg_seach_tool.pg_search_tool import PGSearchTool
from .rag.rag_tool import RagTool
from .scrape_element_from_website.scrape_element_from_website import ScrapeElementFromWebsiteTool
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
from .website_search.website_search_tool import WebsiteSearchTool
from .xml_search_tool.xml_search_tool import XMLSearchTool
from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool

View File

@@ -0,0 +1,78 @@
from typing import Optional, Type, Any
import time
from pydantic.v1 import BaseModel, Field
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from ..base_tool import BaseTool
class FixedSeleniumScrapingToolSchema(BaseModel):
    """Input for SeleniumScrapingTool.

    Declares no fields of its own: it is the argument schema installed by
    ``SeleniumScrapingTool.__init__`` when a ``website_url`` is supplied at
    construction time.
    """
class SeleniumScrapingToolSchema(FixedSeleniumScrapingToolSchema):
    """Input for SeleniumScrapingTool.

    Adds the two per-call arguments on top of the (empty) fixed schema: the
    page to load and the CSS selector of the elements to extract.
    """

    # Both fields are required (``...``) when this schema is in use.
    website_url: str = Field(
        ...,
        description="Mandatory website url to read the file",
    )
    css_element: str = Field(
        ...,
        description="Mandatory css reference for element to scrape from the website",
    )
class SeleniumScrapingTool(BaseTool):
    """Read the text of a web page through a headless Selenium browser.

    Every call to ``_run`` starts a fresh browser via the ``driver`` factory,
    loads the page (optionally injecting ``cookie`` and reloading), collects
    the text of either the whole ``<body>`` or of every element matching a
    CSS selector, and then shuts the browser down again.
    """

    name: str = "Read a website content"
    description: str = "A tool that can be used to read a website content."
    args_schema: Type[BaseModel] = SeleniumScrapingToolSchema
    # URL fixed at construction time; used when the call supplies none.
    website_url: Optional[str] = None
    # Factory (normally a WebDriver class) invoked as ``driver(options=...)``.
    driver: Optional[Any] = webdriver.Chrome
    # Cookie dict passed to ``add_cookie`` after the first page load.
    cookie: Optional[dict] = None
    # Seconds to sleep after each page load and after cookie injection.
    wait_time: Optional[int] = 3
    # CSS selector to scrape; the whole <body> is read when unset or blank.
    css_element: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        cookie: Optional[dict] = None,
        css_element: Optional[str] = None,
        **kwargs,
    ):
        """Configure the tool, optionally pinning it to a single website.

        Args:
            website_url: When given, the tool is bound to this URL: the
                description is rewritten to mention it and the argument
                schema is switched to ``FixedSeleniumScrapingToolSchema``.
            cookie: Cookie dict to add to the browser session before the
                page is reloaded and scraped.
            css_element: Default CSS selector of the elements to scrape.
            **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
        """
        super().__init__(**kwargs)
        if cookie is not None:
            self.cookie = cookie

        if css_element is not None:
            self.css_element = css_element

        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that can be used to read {website_url}'s content."
            self.args_schema = FixedSeleniumScrapingToolSchema
            self._generate_description()

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        """Scrape the page and return its text.

        Args:
            **kwargs: May contain ``website_url`` and ``css_element``; each
                falls back to the value configured on the tool when absent.

        Returns:
            The text of the ``<body>`` element when no selector is given (or
            it is blank); otherwise the text of every element matching the
            selector, joined with newlines.

        Raises:
            ValueError: If no website URL is available from either the call
                arguments or the tool configuration.
        """
        website_url = kwargs.get('website_url', self.website_url)
        css_element = kwargs.get('css_element', self.css_element)

        # Fail before launching a browser that could never load anything.
        if not website_url:
            raise ValueError(
                "A website_url must be provided either to the tool or to the call."
            )

        driver = self._create_driver(website_url, self.cookie, self.wait_time)
        try:
            if css_element is None or css_element.strip() == "":
                content = [driver.find_element(By.TAG_NAME, "body").text]
            else:
                content = [
                    element.text
                    for element in driver.find_elements(By.CSS_SELECTOR, css_element)
                ]
        finally:
            # quit() ends the whole browser session; close() would only close
            # the current window. Running it in ``finally`` keeps a failed
            # lookup from leaking the browser.
            driver.quit()

        return "\n".join(content)

    def _create_driver(self, url, cookie, wait_time):
        """Start a headless browser and load ``url``.

        Args:
            url: Page to open.
            cookie: Optional cookie dict; when truthy it is added after the
                first load and the page is loaded a second time.
            wait_time: Seconds to sleep after each load / cookie injection;
                ``None`` is treated as no wait.

        Returns:
            The live driver instance. The caller is responsible for quitting
            it.
        """
        factory = self.driver or webdriver.Chrome

        # The options object has to match the browser being launched: the
        # default factory is Chrome, so Firefox options are only used when
        # the factory really is a Firefox driver class.
        wants_firefox = isinstance(factory, type) and issubclass(factory, webdriver.Firefox)
        options = Options() if wants_firefox else webdriver.ChromeOptions()
        options.add_argument("--headless")

        delay = wait_time or 0
        driver = factory(options=options)
        try:
            driver.get(url)
            time.sleep(delay)
            if cookie:
                # A cookie can only be added once a page is loaded, hence the
                # first load above; reload so the page is served with the
                # cookie present.
                driver.add_cookie(cookie)
                time.sleep(delay)
                driver.get(url)
                time.sleep(delay)
        except Exception:
            # Do not leave an orphaned browser behind when loading fails.
            driver.quit()
            raise
        return driver

    def close(self):
        """Close the configured driver object, if there is one to close.

        ``driver`` normally holds a driver class used as a factory, and each
        ``_run`` quits the browser it started, so in that case there is
        nothing to release and this is a no-op. The call is only forwarded
        when ``driver`` holds an object exposing a callable ``close``.
        """
        target = self.driver
        # Calling ``close`` on a class (unbound method) would raise TypeError.
        if target is None or isinstance(target, type):
            return
        closer = getattr(target, "close", None)
        if callable(closer):
            closer()