mirror of https://github.com/crewAIInc/crewAI.git
updating scraping tool
@@ -14,7 +14,7 @@ class ScrapeWebsiteToolSchema(FixedScrapeWebsiteToolSchema):
     website_url: str = Field(..., description="Mandatory website url to read the file")
 
 class ScrapeWebsiteTool(BaseTool):
-    name: str = "Read a website content"
+    name: str = "Read website content"
     description: str = "A tool that can be used to read a website content."
     args_schema: Type[BaseModel] = ScrapeWebsiteToolSchema
     website_url: Optional[str] = None
@@ -46,5 +46,8 @@ class ScrapeWebsiteTool(BaseTool):
         website_url = kwargs.get('website_url', self.website_url)
         page = requests.get(website_url, headers=self.headers, cookies=self.cookies if self.cookies else {})
         parsed = BeautifulSoup(page.content, "html.parser")
-        return parsed.get_text()
+        text = parsed.get_text()
+        text = '\n'.join([i for i in text.split('\n') if i.strip() != ''])
+        text = ' '.join([i for i in text.split(' ') if i.strip() != ''])
+        return text
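For reference, the lines added to the second hunk drop blank lines and collapse runs of spaces in the scraped text before returning it. A minimal standalone sketch of that cleanup, assuming only bs4 is installed; the helper name clean_scraped_text and the sample HTML are illustrative, not part of the commit:

from bs4 import BeautifulSoup

def clean_scraped_text(html: str) -> str:
    # Extract the visible text, as the tool does with page.content.
    text = BeautifulSoup(html, "html.parser").get_text()
    # Drop lines that are empty or whitespace-only.
    text = '\n'.join([i for i in text.split('\n') if i.strip() != ''])
    # Collapse runs of spaces into single spaces.
    text = ' '.join([i for i in text.split(' ') if i.strip() != ''])
    return text

# "<p>Hello   world</p>\n\n<p>again</p>" becomes "Hello world\nagain"
print(clean_scraped_text("<p>Hello   world</p>\n\n<p>again</p>"))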