diff --git a/src/crewai_tools/tools/selenium_scraping_tool/README.md b/src/crewai_tools/tools/selenium_scraping_tool/README.md
index e2ddefba1..2d54eb970 100644
--- a/src/crewai_tools/tools/selenium_scraping_tool/README.md
+++ b/src/crewai_tools/tools/selenium_scraping_tool/README.md
@@ -24,6 +24,16 @@ tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.mai
# Example 4: Scrape using optional parameters for customized scraping
tool = SeleniumScrapingTool(website_url='https://example.com', css_element='.main-content', cookie={'name': 'user', 'value': 'John Doe'})
+
+# Example 5: Scrape content in HTML format
+tool = SeleniumScrapingTool(website_url='https://example.com', return_html=True)
+result = tool._run()
+# Returns HTML content like: ['<div>Hello World</div>', '<p>Copyright 2024</p>']
+
+# Example 6: Scrape content in text format (default)
+tool = SeleniumScrapingTool(website_url='https://example.com', return_html=False)
+result = tool._run()
+# Returns text content like: ['Hello World', 'Copyright 2024']
```
## Arguments
diff --git a/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py b/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py
index 5f7d9391b..d7a55428d 100644
--- a/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py
+++ b/src/crewai_tools/tools/selenium_scraping_tool/selenium_scraping_tool.py
@@ -1,8 +1,10 @@
+import re
import time
from typing import Any, Optional, Type
+from urllib.parse import urlparse
from crewai.tools import BaseTool
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
@@ -15,12 +17,35 @@ class FixedSeleniumScrapingToolSchema(BaseModel):
class SeleniumScrapingToolSchema(FixedSeleniumScrapingToolSchema):
"""Input for SeleniumScrapingTool."""
- website_url: str = Field(..., description="Mandatory website url to read the file")
+ website_url: str = Field(..., description="Mandatory website url to read the file. Must start with http:// or https://")
css_element: str = Field(
...,
description="Mandatory css reference for element to scrape from the website",
)
+ @validator('website_url')
+ def validate_website_url(cls, v):
+ if not v:
+ raise ValueError("Website URL cannot be empty")
+
+ if len(v) > 2048: # Common maximum URL length
+ raise ValueError("URL is too long (max 2048 characters)")
+
+ if not re.match(r'^https?://', v):
+ raise ValueError("URL must start with http:// or https://")
+
+ try:
+ result = urlparse(v)
+ if not all([result.scheme, result.netloc]):
+ raise ValueError("Invalid URL format")
+ except Exception as e:
+ raise ValueError(f"Invalid URL: {str(e)}")
+
+ if re.search(r'\s', v):
+ raise ValueError("URL cannot contain whitespace")
+
+ return v
+
class SeleniumScrapingTool(BaseTool):
name: str = "Read a website content"
@@ -103,6 +128,13 @@ class SeleniumScrapingTool(BaseTool):
return elements_content
def _create_driver(self, url, cookie, wait_time):
+ if not url:
+ raise ValueError("URL cannot be empty")
+
+ # Validate URL format
+ if not re.match(r'^https?://', url):
+ raise ValueError("URL must start with http:// or https://")
+
options = Options()
options.add_argument("--headless")
driver = self.driver(options=options)