mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 08:38:30 +00:00
Adding new scraping tools
This commit is contained in:
@@ -14,6 +14,8 @@ from .tools import (
|
|||||||
PDFSearchTool,
|
PDFSearchTool,
|
||||||
PGSearchTool,
|
PGSearchTool,
|
||||||
RagTool,
|
RagTool,
|
||||||
|
ScrapeElementFromWebsiteTool,
|
||||||
|
ScrapeWebsiteTool,
|
||||||
WebsiteSearchTool,
|
WebsiteSearchTool,
|
||||||
XMLSearchTool,
|
XMLSearchTool,
|
||||||
YoutubeChannelSearchTool,
|
YoutubeChannelSearchTool,
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ from .mdx_seach_tool.mdx_search_tool import MDXSearchTool
|
|||||||
from .pdf_search_tool.pdf_search_tool import PDFSearchTool
|
from .pdf_search_tool.pdf_search_tool import PDFSearchTool
|
||||||
from .pg_seach_tool.pg_search_tool import PGSearchTool
|
from .pg_seach_tool.pg_search_tool import PGSearchTool
|
||||||
from .rag.rag_tool import RagTool
|
from .rag.rag_tool import RagTool
|
||||||
|
from .scrape_element_from_website.scrape_element_from_website import ScrapeElementFromWebsiteTool
|
||||||
|
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
|
||||||
from .website_search.website_search_tool import WebsiteSearchTool
|
from .website_search.website_search_tool import WebsiteSearchTool
|
||||||
from .xml_search_tool.xml_search_tool import XMLSearchTool
|
from .xml_search_tool.xml_search_tool import XMLSearchTool
|
||||||
from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool
|
from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from typing import Optional, Type, Any
|
||||||
|
from pydantic.v1 import BaseModel, Field
|
||||||
|
from ..base_tool import BaseTool
|
||||||
|
|
||||||
|
# Argument schema with no fields. ScrapeElementFromWebsiteTool switches to it
# when the target URL was already supplied to the tool's constructor.
class FixedScrapeElementFromWebsiteToolSchema(BaseModel):
    """Input for ScrapeElementFromWebsiteTool."""
|
||||||
|
|
||||||
|
# Full argument schema: both the page URL and the CSS selector are required
# (``...`` marks each field as mandatory).
class ScrapeElementFromWebsiteToolSchema(FixedScrapeElementFromWebsiteToolSchema):
    """Input for ScrapeElementFromWebsiteTool."""

    # Address of the page to fetch.
    website_url: str = Field(
        ...,
        description="Mandatory website url to read the file",
    )
    # CSS selector identifying the element(s) whose text is returned.
    css_element: str = Field(
        ...,
        description="Mandatory css reference for element to scrape from the website",
    )
|
||||||
|
|
||||||
|
class ScrapeElementFromWebsiteTool(BaseTool):
    """Fetch a web page and return the text of the elements matching a CSS selector.

    If ``website_url`` is given to the constructor, it is stored on the
    instance, the description is rewritten to mention that URL, and the
    argument schema is replaced by the empty
    ``FixedScrapeElementFromWebsiteToolSchema``. Otherwise the URL and the
    selector are expected as keyword arguments to ``_run``.
    """

    # NOTE(review): ScrapeWebsiteTool declares the identical name and
    # description; confirm whether the two tools must be distinguishable.
    name: str = "Read a website content"
    description: str = "A tool that can be used to read a website content."
    args_schema: Type[BaseModel] = ScrapeElementFromWebsiteToolSchema
    # Optional values fixed at construction time; used as fallbacks in _run.
    website_url: Optional[str] = None
    css_element: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        css_element: Optional[str] = None,
        **kwargs,
    ):
        """Create the tool, optionally pinning the URL and/or CSS selector.

        Args:
            website_url: Page to scrape. When given, the tool switches to the
                empty argument schema.
            css_element: CSS selector of the element(s) to extract.
            **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
        """
        super().__init__(**kwargs)
        # Keep the selector even when no URL is pinned, so it can still act as
        # the fallback in _run (it used to be silently dropped in that case).
        if css_element is not None:
            self.css_element = css_element
        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that can be used to read {website_url}'s content."
            self.args_schema = FixedScrapeElementFromWebsiteToolSchema

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        """Download the page and return the matched elements' text.

        Keyword arguments ``website_url`` and ``css_element`` take precedence
        over the values stored on the instance.

        Returns:
            The text of every matching element, joined with newlines (an
            empty string when nothing matches).

        Raises:
            ValueError: If no URL or no CSS selector is available.
        """
        website_url = kwargs.get('website_url', self.website_url)
        css_element = kwargs.get('css_element', self.css_element)

        # Fail early with an explicit message rather than letting the HTTP
        # client or the HTML parser choke on a missing value.
        if not website_url:
            raise ValueError(
                "website_url must be provided either to the constructor or when running the tool"
            )
        if not css_element:
            raise ValueError(
                "css_element must be provided either to the constructor or when running the tool"
            )

        # requests has no default timeout; without one an unresponsive server
        # would block the caller indefinitely.
        page = requests.get(website_url, timeout=15)
        parsed = BeautifulSoup(page.content, "html.parser")
        elements = parsed.select(css_element)
        return "\n".join(element.get_text() for element in elements)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from typing import Optional, Type, Any
|
||||||
|
from pydantic.v1 import BaseModel, Field
|
||||||
|
from ..base_tool import BaseTool
|
||||||
|
|
||||||
|
# Argument schema with no fields. ScrapeWebsiteTool switches to it when the
# target URL was already supplied to the tool's constructor.
class FixedScrapeWebsiteToolSchema(BaseModel):
    """Input for ScrapeWebsiteTool."""
|
||||||
|
|
||||||
|
# Full argument schema: the page URL is required (``...`` marks the field as
# mandatory).
class ScrapeWebsiteToolSchema(FixedScrapeWebsiteToolSchema):
    """Input for ScrapeWebsiteTool."""

    # Address of the page to fetch.
    website_url: str = Field(
        ...,
        description="Mandatory website url to read the file",
    )
|
||||||
|
|
||||||
|
class ScrapeWebsiteTool(BaseTool):
    """Fetch a web page and return its text content.

    If ``website_url`` is given to the constructor, it is stored on the
    instance, the description is rewritten to mention that URL, and the
    argument schema is replaced by the empty ``FixedScrapeWebsiteToolSchema``.
    Otherwise the URL is expected as a keyword argument to ``_run``.
    """

    name: str = "Read a website content"
    description: str = "A tool that can be used to read a website content."
    args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema
    # Optional URL fixed at construction time; used as the fallback in _run.
    website_url: Optional[str] = None

    def __init__(self, website_url: Optional[str] = None, **kwargs):
        """Create the tool, optionally pinning the URL to scrape.

        Args:
            website_url: Page to scrape. When given, the tool switches to the
                empty argument schema.
            **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
        """
        super().__init__(**kwargs)
        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that can be used to read {website_url}'s content."
            self.args_schema = FixedScrapeWebsiteToolSchema

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        """Download the page and return all of its text.

        A ``website_url`` keyword argument takes precedence over the value
        stored on the instance.

        Returns:
            The text extracted from the parsed HTML document.

        Raises:
            ValueError: If no URL is available.
        """
        website_url = kwargs.get('website_url', self.website_url)

        # Fail early with an explicit message rather than handing a missing
        # URL to the HTTP client.
        if not website_url:
            raise ValueError(
                "website_url must be provided either to the constructor or when running the tool"
            )

        # requests has no default timeout; without one an unresponsive server
        # would block the caller indefinitely.
        page = requests.get(website_url, timeout=15)
        parsed = BeautifulSoup(page.content, "html.parser")
        return parsed.get_text()
|
||||||
|
|
||||||
Reference in New Issue
Block a user