adding new scraping tools

This commit is contained in:
João Moura
2024-02-26 06:15:15 -03:00
parent 80942bf38c
commit 9e560ff951
4 changed files with 83 additions and 0 deletions

View File

@@ -14,6 +14,8 @@ from .tools import (
PDFSearchTool,
PGSearchTool,
RagTool,
ScrapeElementFromWebsiteTool,
ScrapeWebsiteTool,
WebsiteSearchTool,
XMLSearchTool,
YoutubeChannelSearchTool,

View File

@@ -12,6 +12,8 @@ from .mdx_seach_tool.mdx_search_tool import MDXSearchTool
from .pdf_search_tool.pdf_search_tool import PDFSearchTool
from .pg_seach_tool.pg_search_tool import PGSearchTool
from .rag.rag_tool import RagTool
from .scrape_element_from_website.scrape_element_from_website import ScrapeElementFromWebsiteTool
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .website_search.website_search_tool import WebsiteSearchTool
from .xml_search_tool.xml_search_tool import XMLSearchTool
from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool

View File

@@ -0,0 +1,43 @@
import requests
from bs4 import BeautifulSoup
from typing import Optional, Type, Any
from pydantic.v1 import BaseModel, Field
from ..base_tool import BaseTool
class FixedScrapeElementFromWebsiteToolSchema(BaseModel):
    """Input for ScrapeElementFromWebsiteTool."""

    # Declares no fields of its own. ScrapeElementFromWebsiteToolSchema
    # extends it, and ScrapeElementFromWebsiteTool assigns it as its
    # args_schema when a website_url is given to the constructor.
class ScrapeElementFromWebsiteToolSchema(FixedScrapeElementFromWebsiteToolSchema):
    """Input for ScrapeElementFromWebsiteTool."""

    # Both fields are required: `...` marks a pydantic field without a default.
    website_url: str = Field(
        ...,
        description="Mandatory website url to read the file",
    )
    css_element: str = Field(
        ...,
        description="Mandatory css reference for element to scrape from the website",
    )
class ScrapeElementFromWebsiteTool(BaseTool):
    """Fetch a web page and return the text of the elements matching a CSS selector.

    The URL and the selector can each be fixed at construction time or
    supplied per call through ``_run`` keyword arguments; per-call values
    take precedence over the fixed ones.
    """

    # NOTE(review): `name` and the default `description` are identical to
    # ScrapeWebsiteTool's. If tools are looked up by name, registering both
    # on one agent may be ambiguous -- confirm and consider a distinct name.
    name: str = "Read a website content"
    description: str = "A tool that can be used to read a website content."
    args_schema: Type[BaseModel] = ScrapeElementFromWebsiteToolSchema
    website_url: Optional[str] = None
    css_element: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        css_element: Optional[str] = None,
        **kwargs,
    ):
        """Create the tool, optionally bound to a fixed URL and/or selector.

        Args:
            website_url: URL to scrape on every call. When given, the tool
                description is specialised to mention it.
            css_element: CSS selector to extract on every call.
            **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
        """
        super().__init__(**kwargs)
        # Keep the selector even when no URL is fixed; it used to be
        # discarded silently in that case.
        if css_element is not None:
            self.css_element = css_element
        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that can be used to read {website_url}'s content."
            # Only drop the input arguments when nothing is left for the
            # caller to provide. With a fixed URL but no fixed selector the
            # full schema is kept so the selector can still be passed in.
            if css_element is not None:
                self.args_schema = FixedScrapeElementFromWebsiteToolSchema

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        """Download the page and return the matched elements' text.

        Args:
            **kwargs: May contain ``website_url`` and/or ``css_element``;
                either falls back to the value stored on the instance.

        Returns:
            The text of every element matching the selector, joined with
            newlines (an empty string when nothing matches).

        Raises:
            requests.exceptions.MissingSchema: If no URL is available.
            ValueError: If no CSS selector is available.
            requests.exceptions.RequestException: On network failure or timeout.
        """
        website_url = kwargs.get('website_url', self.website_url)
        css_element = kwargs.get('css_element', self.css_element)
        if not website_url:
            # Same exception type requests itself raises for a missing URL,
            # so existing handlers keep working, but with a usable message.
            raise requests.exceptions.MissingSchema(
                "No website_url was provided to the tool or passed to this call."
            )
        if not css_element:
            raise ValueError(
                "No css_element was provided to the tool or passed to this call."
            )
        # requests never times out by default; bound the wait so a stalled
        # server cannot block the caller indefinitely.
        page = requests.get(website_url, timeout=15)
        parsed = BeautifulSoup(page.content, "html.parser")
        return "\n".join(element.get_text() for element in parsed.select(css_element))

View File

@@ -0,0 +1,36 @@
import requests
from bs4 import BeautifulSoup
from typing import Optional, Type, Any
from pydantic.v1 import BaseModel, Field
from ..base_tool import BaseTool
class FixedScrapeWebsiteToolSchema(BaseModel):
    """Input for ScrapeWebsiteTool."""

    # Declares no fields of its own. ScrapeWebsiteToolSchema extends it, and
    # ScrapeWebsiteTool assigns it as its args_schema when a website_url is
    # given to the constructor.
class ScrapeWebsiteToolSchema(FixedScrapeWebsiteToolSchema):
    """Input for ScrapeWebsiteTool."""

    # Required field: `...` marks a pydantic field without a default.
    website_url: str = Field(
        ...,
        description="Mandatory website url to read the file",
    )
class ScrapeWebsiteTool(BaseTool):
    """Fetch a web page and return all of its text content.

    The URL can be fixed at construction time or supplied per call through
    ``_run`` keyword arguments; a per-call value takes precedence.
    """

    name: str = "Read a website content"
    description: str = "A tool that can be used to read a website content."
    args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema
    website_url: Optional[str] = None

    def __init__(self, website_url: Optional[str] = None, **kwargs):
        """Create the tool, optionally bound to a fixed URL.

        Args:
            website_url: URL to scrape on every call. When given, the tool
                description is specialised to mention it and the input
                schema is replaced by the argument-less one.
            **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
        """
        super().__init__(**kwargs)
        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that can be used to read {website_url}'s content."
            self.args_schema = FixedScrapeWebsiteToolSchema

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        """Download the page and return its text.

        Args:
            **kwargs: May contain ``website_url``; falls back to the value
                stored on the instance.

        Returns:
            The text extracted from the parsed HTML document.

        Raises:
            requests.exceptions.MissingSchema: If no URL is available.
            requests.exceptions.RequestException: On network failure or timeout.
        """
        website_url = kwargs.get('website_url', self.website_url)
        if not website_url:
            # Same exception type requests itself raises for a missing URL,
            # so existing handlers keep working, but with a usable message.
            raise requests.exceptions.MissingSchema(
                "No website_url was provided to the tool or passed to this call."
            )
        # requests never times out by default; bound the wait so a stalled
        # server cannot block the caller indefinitely.
        page = requests.get(website_url, timeout=15)
        parsed = BeautifulSoup(page.content, "html.parser")
        return parsed.get_text()