mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-23 23:28:15 +00:00
feat: add URL validation and return_html examples
- Add comprehensive URL validation in schema and _create_driver
- Add URL format, length, and character validation
- Add meaningful error messages for validation failures
- Add return_html usage examples in README.md

Co-Authored-By: Joe Moura <joao@crewai.com>
This commit is contained in:
@@ -24,6 +24,16 @@ tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.mai
|
|||||||
|
|
||||||
# Example 4: Scrape using optional parameters for customized scraping
|
# Example 4: Scrape using optional parameters for customized scraping
|
||||||
tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.main-content', cookie={'name': 'user', 'value': 'John Doe'})
|
tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.main-content', cookie={'name': 'user', 'value': 'John Doe'})
|
||||||
|
|
||||||
|
# Example 5: Scrape content in HTML format
|
||||||
|
tool = SeleniumScrapingTool(website_url='https://example.com', return_html=True)
|
||||||
|
result = tool._run()
|
||||||
|
# Returns HTML content like: ['<div class="content">Hello World</div>', '<div class="footer">Copyright 2024</div>']
|
||||||
|
|
||||||
|
# Example 6: Scrape content in text format (default)
|
||||||
|
tool = SeleniumScrapingTool(website_url='https://example.com', return_html=False)
|
||||||
|
result = tool._run()
|
||||||
|
# Returns text content like: ['Hello World', 'Copyright 2024']
|
||||||
```
|
```
|
||||||
|
|
||||||
## Arguments
|
## Arguments
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from typing import Any, Optional, Type
|
from typing import Any, Optional, Type
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from crewai.tools import BaseTool
|
from crewai.tools import BaseTool
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field, validator
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
@@ -15,12 +17,35 @@ class FixedSeleniumScrapingToolSchema(BaseModel):
|
|||||||
class SeleniumScrapingToolSchema(FixedSeleniumScrapingToolSchema):
|
class SeleniumScrapingToolSchema(FixedSeleniumScrapingToolSchema):
|
||||||
"""Input for SeleniumScrapingTool."""
|
"""Input for SeleniumScrapingTool."""
|
||||||
|
|
||||||
website_url: str = Field(..., description="Mandatory website url to read the file")
|
website_url: str = Field(..., description="Mandatory website url to read the file. Must start with http:// or https://")
|
||||||
css_element: str = Field(
|
css_element: str = Field(
|
||||||
...,
|
...,
|
||||||
description="Mandatory css reference for element to scrape from the website",
|
description="Mandatory css reference for element to scrape from the website",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@validator('website_url')
def validate_website_url(cls, v):
    """Validate ``website_url`` before the schema accepts it.

    Checks run in this order: the value is non-empty, is at most 2048
    characters long, starts with ``http://`` or ``https://``, parses into
    a scheme and a network location, and contains no whitespace.

    Args:
        v: The candidate URL string.

    Returns:
        The URL unchanged when every check passes.

    Raises:
        ValueError: If any check fails; the message names the failed check.
    """
    if not v:
        raise ValueError("Website URL cannot be empty")

    if len(v) > 2048:  # Common maximum URL length
        raise ValueError("URL is too long (max 2048 characters)")

    if not re.match(r'^https?://', v):
        raise ValueError("URL must start with http:// or https://")

    # Only the parse call lives inside the try block. Previously the
    # "Invalid URL format" error was raised inside it as well, so the broad
    # handler caught it and re-wrapped the message as
    # "Invalid URL: Invalid URL format".
    try:
        result = urlparse(v)
    except ValueError as e:
        # urlparse signals malformed input (e.g. unbalanced IPv6 brackets)
        # with ValueError; keep the original cause attached for debugging.
        raise ValueError(f"Invalid URL: {str(e)}") from e

    if not all([result.scheme, result.netloc]):
        raise ValueError("Invalid URL format")

    if re.search(r'\s', v):
        raise ValueError("URL cannot contain whitespace")

    return v
|
||||||
|
|
||||||
|
|
||||||
class SeleniumScrapingTool(BaseTool):
|
class SeleniumScrapingTool(BaseTool):
|
||||||
name: str = "Read a website content"
|
name: str = "Read a website content"
|
||||||
@@ -103,6 +128,13 @@ class SeleniumScrapingTool(BaseTool):
|
|||||||
return elements_content
|
return elements_content
|
||||||
|
|
||||||
def _create_driver(self, url, cookie, wait_time):
|
def _create_driver(self, url, cookie, wait_time):
|
||||||
|
if not url:
|
||||||
|
raise ValueError("URL cannot be empty")
|
||||||
|
|
||||||
|
# Validate URL format
|
||||||
|
if not re.match(r'^https?://', url):
|
||||||
|
raise ValueError("URL must start with http:// or https://")
|
||||||
|
|
||||||
options = Options()
|
options = Options()
|
||||||
options.add_argument("--headless")
|
options.add_argument("--headless")
|
||||||
driver = self.driver(options=options)
|
driver = self.driver(options=options)
|
||||||
|
|||||||
Reference in New Issue
Block a user