feat: add URL validation and return_html examples

- Add comprehensive URL validation in schema and _create_driver
- Add URL format, length, and character validation
- Add meaningful error messages for validation failures
- Add return_html usage examples in README.md

Co-Authored-By: Joe Moura <joao@crewai.com>
Author: Devin AI
Date: 2024-12-28 00:54:49 +00:00
Parent: f11756387d
Commit: c7c8cd0a3c

2 changed files with 44 additions and 2 deletions
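For a sense of how the change behaves end to end, here is a minimal sketch (the import path is an assumption; validation fires when the pydantic schema shown in the diff below is instantiated):

```python
from pydantic import ValidationError

# Assumed import path for the schema this commit modifies
from crewai_tools.tools.selenium_scraping_tool.selenium_scraping_tool import (
    SeleniumScrapingToolSchema,
)

try:
    # No http:// or https:// scheme, so the new validator should reject it
    SeleniumScrapingToolSchema(website_url="example.com", css_element=".main")
except ValidationError as err:
    print(err)  # expected to mention "URL must start with http:// or https://"
```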

README.md

@@ -24,6 +24,16 @@ tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.mai
 # Example 4: Scrape using optional parameters for customized scraping
 tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.main-content', cookie={'name': 'user', 'value': 'John Doe'})
+
+# Example 5: Scrape content in HTML format
+tool = SeleniumScrapingTool(website_url='https://example.com', return_html=True)
+result = tool._run()
+# Returns HTML content like: ['<div class="content">Hello World</div>', '<div class="footer">Copyright 2024</div>']
+
+# Example 6: Scrape content in text format (default)
+tool = SeleniumScrapingTool(website_url='https://example.com', return_html=False)
+result = tool._run()
+# Returns text content like: ['Hello World', 'Copyright 2024']
 ```
 
 ## Arguments
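One stylistic note on Examples 5 and 6: they invoke the private `_run()` method directly. Assuming crewai's `BaseTool` exposes a public `run()` wrapper that forwards keyword arguments to `_run()` (not shown in this diff), the same call could avoid reaching into a private method:

```python
# Hypothetical equivalent via the public entry point,
# assuming BaseTool.run() forwards keyword arguments to _run()
tool = SeleniumScrapingTool(website_url='https://example.com', return_html=True)
result = tool.run()
```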

selenium_scraping_tool.py

@@ -1,8 +1,10 @@
+import re
 import time
 from typing import Any, Optional, Type
+from urllib.parse import urlparse
 
 from crewai.tools import BaseTool
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
@@ -15,12 +17,35 @@ class FixedSeleniumScrapingToolSchema(BaseModel):
 class SeleniumScrapingToolSchema(FixedSeleniumScrapingToolSchema):
     """Input for SeleniumScrapingTool."""
 
-    website_url: str = Field(..., description="Mandatory website url to read the file")
+    website_url: str = Field(..., description="Mandatory website url to read the file. Must start with http:// or https://")
     css_element: str = Field(
         ...,
         description="Mandatory css reference for element to scrape from the website",
     )
 
+    @validator('website_url')
+    def validate_website_url(cls, v):
+        if not v:
+            raise ValueError("Website URL cannot be empty")
+        if len(v) > 2048:  # Common maximum URL length
+            raise ValueError("URL is too long (max 2048 characters)")
+        if not re.match(r'^https?://', v):
+            raise ValueError("URL must start with http:// or https://")
+        try:
+            result = urlparse(v)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError("Invalid URL format")
+        except Exception as e:
+            raise ValueError(f"Invalid URL: {str(e)}")
+        if re.search(r'\s', v):
+            raise ValueError("URL cannot contain whitespace")
+        return v
 
 
 class SeleniumScrapingTool(BaseTool):
     name: str = "Read a website content"
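A quick pytest sketch of the validator's branches (hypothetical test file; the import path is an assumption, and `css_element` is filled with a placeholder since the schema requires it):

```python
import pytest
from pydantic import ValidationError

# Assumed import path for the schema defined above
from crewai_tools.tools.selenium_scraping_tool.selenium_scraping_tool import (
    SeleniumScrapingToolSchema,
)


def make(url):
    return SeleniumScrapingToolSchema(website_url=url, css_element=".main")


def test_accepts_plain_https_url():
    assert make("https://example.com").website_url == "https://example.com"


def test_rejects_missing_scheme():
    with pytest.raises(ValidationError):
        make("example.com")


def test_rejects_overlong_url():
    # 2048 is the cap set in validate_website_url
    with pytest.raises(ValidationError):
        make("https://example.com/" + "a" * 2048)


def test_rejects_embedded_whitespace():
    # Passes the scheme regex and urlparse; caught by the whitespace check
    with pytest.raises(ValidationError):
        make("https://exa mple.com")
```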
@@ -103,6 +128,13 @@ class SeleniumScrapingTool(BaseTool):
         return elements_content
 
     def _create_driver(self, url, cookie, wait_time):
+        if not url:
+            raise ValueError("URL cannot be empty")
+
+        # Validate URL format
+        if not re.match(r'^https?://', url):
+            raise ValueError("URL must start with http:// or https://")
+
         options = Options()
         options.add_argument("--headless")
         driver = self.driver(options=options)
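The guard in `_create_driver` repeats the scheme check from the schema, which gives defense in depth for callers that bypass pydantic entirely. A hypothetical direct call the guard should reject (signature taken from the diff above; constructing the tool without arguments is an assumption):

```python
tool = SeleniumScrapingTool()

try:
    # Skips the pydantic schema, so only the in-method guard applies
    tool._create_driver("ftp://example.com", cookie=None, wait_time=3)
except ValueError as err:
    print(err)  # "URL must start with http:// or https://"
```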