From 96e52767ad417738c684e6ff7470ce25a458634e Mon Sep 17 00:00:00 2001 From: Ernest Poletaev Date: Fri, 25 Oct 2024 22:03:59 +0700 Subject: [PATCH 1/2] fix: web scraper concatenate words --- .../tools/scrape_website_tool/scrape_website_tool.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py b/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py index 7173c2156..3cfb67bae 100644 --- a/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py +++ b/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py @@ -1,4 +1,5 @@ import os +import re from typing import Any, Optional, Type import requests @@ -67,7 +68,6 @@ class ScrapeWebsiteTool(BaseTool): page.encoding = page.apparent_encoding parsed = BeautifulSoup(page.text, "html.parser") - text = parsed.get_text() - text = "\n".join([i for i in text.split("\n") if i.strip() != ""]) - text = " ".join([i for i in text.split(" ") if i.strip() != ""]) + text = parsed.get_text(" ") + text = re.sub('\s+', ' ', text) return text From 1f8791953e41194fe0c34761076096824c844bf8 Mon Sep 17 00:00:00 2001 From: Ernest Poletaev Date: Fri, 25 Oct 2024 22:33:24 +0700 Subject: [PATCH 2/2] fix: retain line breaks --- .../tools/scrape_website_tool/scrape_website_tool.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py b/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py index 3cfb67bae..99df1d2dd 100644 --- a/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py +++ b/src/crewai_tools/tools/scrape_website_tool/scrape_website_tool.py @@ -69,5 +69,6 @@ class ScrapeWebsiteTool(BaseTool): parsed = BeautifulSoup(page.text, "html.parser") text = parsed.get_text(" ") - text = re.sub('\s+', ' ', text) + text = re.sub('[ \t]+', ' ', text) + text = re.sub('\\s+\n\\s+', '\n', text) return text