From c7c8cd0a3cdb52234ec593f89f760e574fc36c41 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sat, 28 Dec 2024 00:54:49 +0000 Subject: [PATCH] feat: add URL validation and return_html examples - Add comprehensive URL validation in schema and _create_driver - Add URL format, length, and character validation - Add meaningful error messages for validation failures - Add return_html usage examples in README.md Co-Authored-By: Joe Moura --- .../tools/selenium_scraping_tool/README.md | 10 ++++++ .../selenium_scraping_tool.py | 36 +++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/crewai_tools/tools/selenium_scraping_tool/README.md b/src/crewai_tools/tools/selenium_scraping_tool/README.md index e2ddefba1..2d54eb970 100644 --- a/src/crewai_tools/tools/selenium_scraping_tool/README.md +++ b/src/crewai_tools/tools/selenium_scraping_tool/README.md @@ -24,6 +24,16 @@ tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.mai # Example 4: Scrape using optional parameters for customized scraping tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.main-content', cookie={'name': 'user', 'value': 'John Doe'}) + +# Example 5: Scrape content in HTML format +tool = SeleniumScrapingTool(website_url='https://example.com', return_html=True) +result = tool._run() +# Returns HTML content like: ['
Hello World
', ''] + +# Example 6: Scrape content in text format (default) +tool = SeleniumScrapingTool(website_url='https://example.com', return_html=False) +result = tool._run() +# Returns text content like: ['Hello World', 'Copyright 2024'] ``` ## Arguments diff --git a/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py b/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py index 5f7d9391b..d7a55428d 100644 --- a/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py +++ b/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py @@ -1,8 +1,10 @@ +import re import time from typing import Any, Optional, Type +from urllib.parse import urlparse from crewai.tools import BaseTool -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, validator from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By @@ -15,12 +17,35 @@ class FixedSeleniumScrapingToolSchema(BaseModel): class SeleniumScrapingToolSchema(FixedSeleniumScrapingToolSchema): """Input for SeleniumScrapingTool.""" - website_url: str = Field(..., description="Mandatory website url to read the file") + website_url: str = Field(..., description="Mandatory website url to read the file. 
Must start with http:// or https://")
     css_element: str = Field(
         ...,
         description="Mandatory css reference for element to scrape from the website",
     )
 
+    @validator('website_url')
+    def validate_website_url(cls, v):
+        """Validate ``website_url``; return it unchanged or raise ValueError.
+
+        Checks, in order: non-empty, at most 2048 characters, http(s) scheme,
+        parseable with a host part, and free of whitespace.
+        """
+        if not v:
+            raise ValueError("Website URL cannot be empty")
+        if len(v) > 2048:  # Common maximum URL length
+            raise ValueError("URL is too long (max 2048 characters)")
+        if not re.match(r'^https?://', v):
+            raise ValueError("URL must start with http:// or https://")
+        try:  # urlparse itself raises ValueError, e.g. for a bad IPv6 literal
+            result = urlparse(v)
+        except ValueError as e:
+            raise ValueError(f"Invalid URL: {e}") from e
+        if not (result.scheme and result.netloc):
+            raise ValueError("Invalid URL format")
+        if re.search(r'\s', v):
+            raise ValueError("URL cannot contain whitespace")
+        return v
+
 
 class SeleniumScrapingTool(BaseTool):
     name: str = "Read a website content"
@@ -103,6 +128,13 @@ class SeleniumScrapingTool(BaseTool):
         return elements_content
 
     def _create_driver(self, url, cookie, wait_time):
+        if not url:
+            raise ValueError("URL cannot be empty")
+
+        # Validate URL format
+        if not re.match(r'^https?://', url):
+            raise ValueError("URL must start with http:// or https://")
+
         options = Options()
         options.add_argument("--headless")
         driver = self.driver(options=options)