From cff6082f1c5843c6a704900710388b7b1cb99a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Moura?= Date: Mon, 26 Feb 2024 06:20:15 -0300 Subject: [PATCH] improving scraping tools --- .../scrape_element_from_website/scrape_element_from_website.py | 3 ++- .../tools/scrape_website_tool/scrape_website_tool.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py b/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py index 1996172b2..54de3cd39 100644 --- a/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py +++ b/src/crewai_tools/tools/scrape_element_from_website/scrape_element_from_website.py @@ -19,6 +19,7 @@ class ScrapeElementFromWebsiteTool(BaseTool): args_schema: Type[BaseModel] = ScrapeElementFromWebsiteToolSchema website_url: Optional[str] = None css_element: Optional[str] = None + headers: Optional[dict] = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} def __init__(self, website_url: Optional[str] = None, css_element: Optional[str] = None, **kwargs): super().__init__(**kwargs) @@ -34,7 +35,7 @@ class ScrapeElementFromWebsiteTool(BaseTool): ) -> Any: website_url = kwargs.get('website_url', self.website_url) css_element = kwargs.get('css_element', self.css_element) - page = requests.get(website_url) + page = requests.get(website_url, headers=self.headers) parsed = BeautifulSoup(page.content, "html.parser") elements = parsed.select(css_element) return "\n".join([element.get_text() for element in elements]) diff --git a/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py b/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py index 8ec16c1ab..e672a9b1d 100644 --- a/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py +++ 
b/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py @@ -17,6 +17,7 @@ class ScrapeWebsiteTool(BaseTool): description: str = "A tool that can be used to read a website content." args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema website_url: Optional[str] = None + headers: Optional[dict] = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} def __init__(self, website_url: Optional[str] = None, **kwargs): super().__init__(**kwargs) @@ -30,7 +31,7 @@ class ScrapeWebsiteTool(BaseTool): **kwargs: Any, ) -> Any: website_url = kwargs.get('website_url', self.website_url) - page = requests.get(website_url) + page = requests.get(website_url, headers=self.headers) parsed = BeautifulSoup(page.content, "html.parser") return parsed.get_text()