From c070ba002c0d1f96087a53ed89a6963ba8d4b7ac Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 18 Dec 2024 14:34:40 +0100
Subject: [PATCH 1/5] feat: integration of scrapegraph APIs

---
 .../tools/scrapegraph_scrape_tool/README.md   | 43 ++++++++++
 .../scrapegraph_scrape_tool.py                | 82 +++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
 create mode 100644 src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py

diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
new file mode 100644
index 000000000..76f385831
--- /dev/null
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
@@ -0,0 +1,43 @@
+# ScrapegraphScrapeTool
+
+## Description
+A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. This tool provides advanced web scraping capabilities with AI-powered content extraction, making it ideal for targeted data collection and content analysis tasks.
+
+## Installation
+Install the required packages:
+```shell
+pip install 'crewai[tools]'
+```
+
+## Example
+```python
+from crewai_tools import ScrapegraphScrapeTool
+
+# Basic usage with API key
+tool = ScrapegraphScrapeTool(api_key="your_api_key")
+result = tool.run(
+    website_url="https://www.example.com",
+    user_prompt="Extract the main heading and summary"
+)
+
+# Initialize with a fixed website URL
+tool = ScrapegraphScrapeTool(
+    website_url="https://www.example.com",
+    api_key="your_api_key"
+)
+result = tool.run()
+
+# With custom prompt
+tool = ScrapegraphScrapeTool(
+    api_key="your_api_key",
+    user_prompt="Extract all product prices and descriptions"
+)
+```
+
+## Arguments
+- `website_url`: The URL of the website to scrape (required if not set during initialization)
+- `user_prompt`: Custom instructions for content extraction (optional)
+- `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable)
+
+## Environment Variables
+- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key
diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
new file mode 100644
index 000000000..058af4150
--- /dev/null
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
@@ -0,0 +1,82 @@
+import os
+from typing import Any, Optional, Type
+
+from crewai.tools import BaseTool
+from pydantic import BaseModel, Field
+from scrapegraph_py import Client
+from scrapegraph_py.logger import sgai_logger
+
+
+class FixedScrapegraphScrapeToolSchema(BaseModel):
+    """Input for ScrapegraphScrapeTool when website_url is fixed."""
+
+    pass
+
+
+class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
+    """Input for ScrapegraphScrapeTool."""
+
+    website_url: str = Field(..., description="Mandatory website url to scrape")
+    user_prompt: str = Field(
+        default="Extract the main content of the webpage",
+        description="Prompt to guide the extraction of content",
+    )
+
+
+class ScrapegraphScrapeTool(BaseTool):
+    name: str = "Scrapegraph website scraper"
+    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
+    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
+    website_url: Optional[str] = None
+    user_prompt: Optional[str] = None
+    api_key: Optional[str] = None
+
+    def __init__(
+        self,
+        website_url: Optional[str] = None,
+        user_prompt: Optional[str] = None,
+        api_key: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")
+
+        if not self.api_key:
+            raise ValueError("Scrapegraph API key is required")
+
+        if website_url is not None:
+            self.website_url = website_url
+            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
+            self.args_schema = FixedScrapegraphScrapeToolSchema
+
+        if user_prompt is not None:
+            self.user_prompt = user_prompt
+
+        # Configure logging
+        sgai_logger.set_logging(level="INFO")
+
+    def _run(
+        self,
+        **kwargs: Any,
+    ) -> Any:
+        website_url = kwargs.get("website_url", self.website_url)
+        user_prompt = kwargs.get("user_prompt", self.user_prompt) or "Extract the main content of the webpage"
+
+        if not website_url:
+            raise ValueError("website_url is required")
+
+        # Initialize the client
+        sgai_client = Client(api_key=self.api_key)
+
+        try:
+            # Make the SmartScraper request
+            response = sgai_client.smartscraper(
+                website_url=website_url,
+                user_prompt=user_prompt,
+            )
+
+            # Return the result
+            return response["result"]
+        finally:
+            # Always close the client
+            sgai_client.close()
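For context on how the tool added in this patch is consumed: `ScrapegraphScrapeTool` is a standard CrewAI `BaseTool` subclass, so it can be handed to an agent like any other tool. A minimal sketch of that wiring follows; the agent, task, and URL below are illustrative assumptions, not part of the patch:

```python
from crewai import Agent, Crew, Task
from crewai_tools import ScrapegraphScrapeTool

# api_key is omitted here; the tool falls back to the SCRAPEGRAPH_API_KEY env var
scrape_tool = ScrapegraphScrapeTool(website_url="https://www.example.com")

researcher = Agent(
    role="Web Researcher",
    goal="Summarize the key points of a landing page",
    backstory="An analyst who turns web pages into structured notes",
    tools=[scrape_tool],
)

summary_task = Task(
    description="Scrape the page and summarize its main heading and offer",
    expected_output="A short bullet-point summary",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[summary_task])
print(crew.kickoff())
```

Because the URL is fixed at construction time, the tool switches to `FixedScrapegraphScrapeToolSchema`, so the agent does not have to supply `website_url` at call time.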
From 7608944e7f0e60f597e39fc2f40fc93fe31c4e28 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 18 Dec 2024 14:38:34 +0100
Subject: [PATCH 2/5] Update README.md

---
 src/crewai_tools/tools/scrapegraph_scrape_tool/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
index 76f385831..03467faee 100644
--- a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
@@ -40,4 +40,4 @@ tool = ScrapegraphScrapeTool(
 - `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable)
 
 ## Environment Variables
-- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key
+- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key, you can buy it [here](https://scrapegraphai.com)
From b58d80dcf9373099ecc1bbc2715b6d042e8396ca Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 18 Dec 2024 14:42:37 +0100
Subject: [PATCH 3/5] update documents according to suggestions

---
 .../tools/scrapegraph_scrape_tool/README.md   | 45 ++++++++++-
 .../scrapegraph_scrape_tool.py                | 73 +++++++++++++++++-
 2 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
index 03467faee..e006c0ff9 100644
--- a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
@@ -9,7 +9,9 @@
 pip install 'crewai[tools]'
 ```
 
-## Example
+## Example Usage
+
+### Basic Usage
 ```python
 from crewai_tools import ScrapegraphScrapeTool
 
 # Basic usage with API key
 tool = ScrapegraphScrapeTool(api_key="your_api_key")
 result = tool.run(
     website_url="https://www.example.com",
     user_prompt="Extract the main heading and summary"
 )
+```
 
+### Fixed Website URL
+```python
 # Initialize with a fixed website URL
 tool = ScrapegraphScrapeTool(
     website_url="https://www.example.com",
     api_key="your_api_key"
 )
 result = tool.run()
+```
 
+### Custom Prompt
+```python
 # With custom prompt
 tool = ScrapegraphScrapeTool(
     api_key="your_api_key",
     user_prompt="Extract all product prices and descriptions"
 )
+result = tool.run(website_url="https://www.example.com")
+```
+
+### Error Handling
+```python
+try:
+    tool = ScrapegraphScrapeTool(api_key="your_api_key")
+    result = tool.run(
+        website_url="https://www.example.com",
+        user_prompt="Extract the main heading"
+    )
+except ValueError as e:
+    print(f"Configuration error: {e}")  # Handles invalid URLs or missing API keys
+except RuntimeError as e:
+    print(f"Scraping error: {e}")  # Handles API or network errors
 ```
 
 ## Arguments
 - `website_url`: The URL of the website to scrape (required if not set during initialization)
 - `user_prompt`: Custom instructions for content extraction (optional)
 - `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable)
 
 ## Environment Variables
-- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key, you can buy it [here](https://scrapegraphai.com)
+- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
+
+## Rate Limiting
+The Scrapegraph API has rate limits that vary based on your subscription plan. Consider the following best practices:
+- Implement appropriate delays between requests when processing multiple URLs
+- Handle rate limit errors gracefully in your application
+- Check your API plan limits on the Scrapegraph dashboard
+
+## Error Handling
+The tool may raise the following exceptions:
+- `ValueError`: When API key is missing or URL format is invalid
+- `RuntimeError`: When scraping operation fails (network issues, API errors)
+- `RateLimitError`: When API rate limits are exceeded
+
+## Best Practices
+1. Always validate URLs before making requests
+2. Implement proper error handling as shown in examples
+3. Consider caching results for frequently accessed pages
+4. Monitor your API usage through the Scrapegraph dashboard
diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
index 058af4150..906bf6376 100644
--- a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
@@ -1,15 +1,25 @@
 import os
 from typing import Any, Optional, Type
+from urllib.parse import urlparse
 
 from crewai.tools import BaseTool
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator
 from scrapegraph_py import Client
 from scrapegraph_py.logger import sgai_logger
 
 
+class ScrapegraphError(Exception):
+    """Base exception for Scrapegraph-related errors"""
+    pass
+
+
+class RateLimitError(ScrapegraphError):
+    """Raised when API rate limits are exceeded"""
+    pass
+
+
 class FixedScrapegraphScrapeToolSchema(BaseModel):
     """Input for ScrapegraphScrapeTool when website_url is fixed."""
-
     pass
 
 
@@ -22,8 +32,28 @@ class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
         description="Prompt to guide the extraction of content",
     )
 
+    @validator('website_url')
+    def validate_url(cls, v):
+        """Validate URL format"""
+        try:
+            result = urlparse(v)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError
+            return v
+        except Exception:
+            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
+
 
 class ScrapegraphScrapeTool(BaseTool):
+    """
+    A tool that uses Scrapegraph AI to intelligently scrape website content.
+
+    Raises:
+        ValueError: If API key is missing or URL format is invalid
+        RateLimitError: If API rate limits are exceeded
+        RuntimeError: If scraping operation fails
+    """
+
     name: str = "Scrapegraph website scraper"
     description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
     args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
@@ -45,6 +75,7 @@ class ScrapegraphScrapeTool(BaseTool):
             raise ValueError("Scrapegraph API key is required")
 
         if website_url is not None:
+            self._validate_url(website_url)
             self.website_url = website_url
             self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
             self.args_schema = FixedScrapegraphScrapeToolSchema
@@ -55,6 +86,32 @@ class ScrapegraphScrapeTool(BaseTool):
         # Configure logging
         sgai_logger.set_logging(level="INFO")
 
+    @staticmethod
+    def _validate_url(url: str) -> None:
+        """Validate URL format"""
+        try:
+            result = urlparse(url)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError
+        except Exception:
+            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
+
+    def _handle_api_response(self, response: dict) -> str:
+        """Handle and validate API response"""
+        if not response:
+            raise RuntimeError("Empty response from Scrapegraph API")
+
+        if "error" in response:
+            error_msg = response.get("error", {}).get("message", "Unknown error")
+            if "rate limit" in error_msg.lower():
+                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
+            raise RuntimeError(f"API error: {error_msg}")
+
+        if "result" not in response:
+            raise RuntimeError("Invalid response format from Scrapegraph API")
+
+        return response["result"]
+
     def _run(
         self,
         **kwargs: Any,
@@ -65,6 +122,9 @@ class ScrapegraphScrapeTool(BaseTool):
     ) -> Any:
         website_url = kwargs.get("website_url", self.website_url)
         user_prompt = kwargs.get("user_prompt", self.user_prompt) or "Extract the main content of the webpage"
 
         if not website_url:
             raise ValueError("website_url is required")
 
+        # Validate URL format
+        self._validate_url(website_url)
+
         # Initialize the client
         sgai_client = Client(api_key=self.api_key)
 
@@ -75,8 +135,13 @@ class ScrapegraphScrapeTool(BaseTool):
         try:
             # Make the SmartScraper request
             response = sgai_client.smartscraper(
                 website_url=website_url,
                 user_prompt=user_prompt,
             )
 
-            # Return the result
-            return response["result"]
+            # Handle and validate the response
+            return self._handle_api_response(response)
+
+        except RateLimitError:
+            raise  # Re-raise rate limit errors
+        except Exception as e:
+            raise RuntimeError(f"Scraping failed: {str(e)}")
         finally:
             # Always close the client
             sgai_client.close()
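The README's new Rate Limiting section advises delays between requests and graceful handling of rate-limit errors, but stops short of code. One way to apply that advice when scraping several URLs is sketched below; the helper name, fixed backoff, and retry count are illustrative assumptions, and `RateLimitError` is imported from the module path this patch creates:

```python
import time

from crewai_tools import ScrapegraphScrapeTool
from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
    RateLimitError,
)


def scrape_many(urls, prompt, delay_seconds=2.0, max_retries=3):
    """Scrape each URL with a pause between requests and a simple retry on rate limits."""
    tool = ScrapegraphScrapeTool()  # falls back to SCRAPEGRAPH_API_KEY from the environment
    results = {}
    for url in urls:
        for attempt in range(max_retries):
            try:
                results[url] = tool.run(website_url=url, user_prompt=prompt)
                break
            except RateLimitError:
                # Linear backoff before retrying a rate-limited request
                time.sleep(delay_seconds * (attempt + 1))
        time.sleep(delay_seconds)  # space out consecutive URLs
    return results
```

URLs whose retries are exhausted are simply absent from the returned dict; a production version would log or re-raise instead.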
From 029afd3e145030ed6a0d0141a899beaa75311099 Mon Sep 17 00:00:00 2001
From: João Moura
Date: Sun, 29 Dec 2024 12:23:08 -0300
Subject: [PATCH 5/5] Update __init__.py

---
 src/crewai_tools/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py
index 87aca8531..65a90a01b 100644
--- a/src/crewai_tools/__init__.py
+++ b/src/crewai_tools/__init__.py
@@ -26,6 +26,8 @@ from .tools import (
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
+    ScrapegraphScrapeTool,
+    ScrapegraphScrapeToolSchema,
     ScrapeWebsiteTool,
     ScrapflyScrapeWebsiteTool,
     SeleniumScrapingTool,
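With the last two patches applied, both names are importable from the package root, and the schema can be exercised without an API key, since validation runs before any network call. A quick smoke test of the exports (a sketch, not part of the patch series):

```python
from crewai_tools import ScrapegraphScrapeTool, ScrapegraphScrapeToolSchema

# The pydantic validator added in PATCH 3 runs on construction,
# so a malformed website_url fails fast with ValueError
ScrapegraphScrapeToolSchema(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading",
)
print("exports and URL validation OK")
```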