Merge pull request #160 from crewAIInc/feature/selenium-scraping-tool-can-return-html

feat: add URL validation and return_html examples
Authored by João Moura on 2024-12-27 22:09:01 -03:00, committed by GitHub.
2 changed files with 44 additions and 2 deletions.


@@ -24,6 +24,16 @@ tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.main-content')
 # Example 4: Scrape using optional parameters for customized scraping
 tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.main-content', cookie={'name': 'user', 'value': 'John Doe'})
+
+# Example 5: Scrape content in HTML format
+tool = SeleniumScrapingTool(website_url='https://example.com', return_html=True)
+result = tool._run()
+# Returns HTML content like: ['<div class="content">Hello World</div>', '<div class="footer">Copyright 2024</div>']
+
+# Example 6: Scrape content in text format (default)
+tool = SeleniumScrapingTool(website_url='https://example.com', return_html=False)
+result = tool._run()
+# Returns text content like: ['Hello World', 'Copyright 2024']
 ```
 ## Arguments
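The README additions above only show the URL fixed at construction. For completeness, a minimal sketch of the call-time alternative, assuming crewai-tools' standard `BaseTool.run(**kwargs)` interface (this pattern is not part of the diff):

```python
# A sketch, not from this commit: leave website_url unset at construction
# and supply it per call; SeleniumScrapingToolSchema (in the source file
# below) then describes the expected kwargs.
tool = SeleniumScrapingTool()
result = tool.run(website_url='https://example.com', css_element='.main-content')
```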


@@ -1,8 +1,10 @@
+import re
 import time
 from typing import Any, Optional, Type
+from urllib.parse import urlparse

 from crewai.tools import BaseTool
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
@@ -15,12 +17,35 @@ class FixedSeleniumScrapingToolSchema(BaseModel):
 class SeleniumScrapingToolSchema(FixedSeleniumScrapingToolSchema):
     """Input for SeleniumScrapingTool."""

-    website_url: str = Field(..., description="Mandatory website url to read the file")
+    website_url: str = Field(..., description="Mandatory website url to read the file. Must start with http:// or https://")
     css_element: str = Field(
         ...,
         description="Mandatory css reference for element to scrape from the website",
     )

+    @validator('website_url')
+    def validate_website_url(cls, v):
+        if not v:
+            raise ValueError("Website URL cannot be empty")
+        if len(v) > 2048:  # Common maximum URL length
+            raise ValueError("URL is too long (max 2048 characters)")
+        if not re.match(r'^https?://', v):
+            raise ValueError("URL must start with http:// or https://")
+        try:
+            result = urlparse(v)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError("Invalid URL format")
+        except Exception as e:
+            raise ValueError(f"Invalid URL: {str(e)}")
+        if re.search(r'\s', v):
+            raise ValueError("URL cannot contain whitespace")
+        return v


 class SeleniumScrapingTool(BaseTool):
     name: str = "Read a website content"
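The validator above fires whenever `SeleniumScrapingToolSchema` is instantiated, so malformed URLs are rejected before any browser is launched. A minimal sketch of the rejection path (hypothetical values; assumes pydantic's usual wrapping of validator `ValueError`s in a `ValidationError`):

```python
from pydantic import ValidationError

# Hypothetical: a non-http(s) scheme trips the first regex check.
try:
    SeleniumScrapingToolSchema(
        website_url="ftp://example.com",
        css_element=".main-content",
    )
except ValidationError as err:
    print(err)  # message includes "URL must start with http:// or https://"
```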
@@ -103,6 +128,13 @@ class SeleniumScrapingTool(BaseTool):
         return elements_content

     def _create_driver(self, url, cookie, wait_time):
+        if not url:
+            raise ValueError("URL cannot be empty")
+
+        # Validate URL format
+        if not re.match(r'^https?://', url):
+            raise ValueError("URL must start with http:// or https://")
+
         options = Options()
         options.add_argument("--headless")
         driver = self.driver(options=options)
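The guard in `_create_driver` repeats part of the schema validation, which matters for direct calls that bypass pydantic entirely. A minimal sketch of that failure path, using only the signature shown above (the argument values are hypothetical):

```python
# Hypothetical direct call that skips schema validation; _create_driver
# now raises before a headless Chrome session is ever started.
tool = SeleniumScrapingTool(website_url="https://example.com")
try:
    tool._create_driver("not-a-url", None, 3)
except ValueError as err:
    print(err)  # "URL must start with http:// or https://"
```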