mirror of https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 08:08:32 +00:00
update documents according to suggestions
@@ -9,7 +9,9 @@ Install the required packages:
pip install 'crewai[tools]'
```

## Example Usage

### Basic Usage

```python
from crewai_tools import ScrapegraphScrapeTool

@@ -19,19 +21,40 @@ result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary"
)
```

### Fixed Website URL

```python
# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key"
)
result = tool.run()
```

### Custom Prompt

```python
# With custom prompt
tool = ScrapegraphScrapeTool(
    api_key="your_api_key",
    user_prompt="Extract all product prices and descriptions"
)
result = tool.run(website_url="https://www.example.com")
```

### Error Handling

```python
try:
    tool = ScrapegraphScrapeTool(api_key="your_api_key")
    result = tool.run(
        website_url="https://www.example.com",
        user_prompt="Extract the main heading"
    )
except ValueError as e:
    print(f"Configuration error: {e}")  # Handles invalid URLs or missing API keys
except RuntimeError as e:
    print(f"Scraping error: {e}")  # Handles API or network errors
```

## Arguments
@@ -40,4 +63,22 @@ tool = ScrapegraphScrapeTool(
- `api_key`: Your Scrapegraph API key (required; can also be set via the SCRAPEGRAPH_API_KEY environment variable)

## Environment Variables
- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
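
For example, a minimal sketch of configuring the key through the environment instead of the constructor (the key value and prompt are illustrative placeholders):

```python
import os

from crewai_tools import ScrapegraphScrapeTool

# Set SCRAPEGRAPH_API_KEY (done in-process here purely for illustration)
os.environ["SCRAPEGRAPH_API_KEY"] = "your_api_key"

# No api_key argument needed; the tool falls back to the environment variable
tool = ScrapegraphScrapeTool()
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading",
)
```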

## Rate Limiting

The Scrapegraph API has rate limits that vary based on your subscription plan. Consider the following best practices:

- Implement appropriate delays between requests when processing multiple URLs (see the sketch after this list)
- Handle rate limit errors gracefully in your application
- Check your API plan limits on the Scrapegraph dashboard
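
A minimal sketch covering both the delay and the graceful-handling points; the retry counts and sleep durations are illustrative, and the import path for `RateLimitError` is an assumption about the installed package layout:

```python
import time

from crewai_tools import ScrapegraphScrapeTool

# Assumed import path for the tool's RateLimitError; adjust to your installation
from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
    RateLimitError,
)

tool = ScrapegraphScrapeTool(api_key="your_api_key")
urls = ["https://www.example.com/a", "https://www.example.com/b"]
results = []

for url in urls:
    for attempt in range(3):  # illustrative: up to 3 attempts per URL
        try:
            results.append(tool.run(website_url=url, user_prompt="Extract the main heading"))
            break
        except RateLimitError:
            if attempt == 2:
                raise  # give up after the final attempt
            time.sleep(2 ** attempt)  # simple backoff: wait 1s, then 2s
    time.sleep(1)  # small delay between URLs to stay under plan limits
```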

## Error Handling

The tool may raise the following exceptions:

- `ValueError`: When the API key is missing or the URL format is invalid
- `RuntimeError`: When the scraping operation fails (network issues, API errors)
- `RateLimitError`: When API rate limits are exceeded

## Best Practices

1. Always validate URLs before making requests
2. Implement proper error handling as shown in the examples
3. Consider caching results for frequently accessed pages (see the sketch below)
4. Monitor your API usage through the Scrapegraph dashboard
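
As a minimal sketch of point 3, a plain in-memory dict keyed by (URL, prompt); the cache layer is illustrative and not part of the tool:

```python
from crewai_tools import ScrapegraphScrapeTool

tool = ScrapegraphScrapeTool(api_key="your_api_key")
_cache: dict[tuple[str, str], str] = {}


def scrape_cached(website_url: str, user_prompt: str) -> str:
    """Scrape once per (URL, prompt) pair; repeat calls hit the in-memory cache."""
    key = (website_url, user_prompt)
    if key not in _cache:
        _cache[key] = tool.run(website_url=website_url, user_prompt=user_prompt)
    return _cache[key]


first = scrape_cached("https://www.example.com", "Extract the main heading")
second = scrape_cached("https://www.example.com", "Extract the main heading")  # cache hit
```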
|
|||||||
@@ -1,15 +1,25 @@
|
|||||||
import os
|
import os
|
||||||
from typing import Any, Optional, Type
|
from typing import Any, Optional, Type
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from crewai.tools import BaseTool
|
from crewai.tools import BaseTool
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field, validator
|
||||||
from scrapegraph_py import Client
|
from scrapegraph_py import Client
|
||||||
from scrapegraph_py.logger import sgai_logger
|
from scrapegraph_py.logger import sgai_logger
|
||||||
|
|
||||||
|
|
||||||
|
class ScrapegraphError(Exception):
|
||||||
|
"""Base exception for Scrapegraph-related errors"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimitError(ScrapegraphError):
|
||||||
|
"""Raised when API rate limits are exceeded"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class FixedScrapegraphScrapeToolSchema(BaseModel):
|
class FixedScrapegraphScrapeToolSchema(BaseModel):
|
||||||
"""Input for ScrapegraphScrapeTool when website_url is fixed."""
|
"""Input for ScrapegraphScrapeTool when website_url is fixed."""
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@@ -22,8 +32,28 @@ class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
        description="Prompt to guide the extraction of content",
    )

    @validator('website_url')
    def validate_url(cls, v):
        """Validate URL format"""
        try:
            result = urlparse(v)
            if not all([result.scheme, result.netloc]):
                raise ValueError
            return v
        except Exception:
            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")


class ScrapegraphScrapeTool(BaseTool):
    """
    A tool that uses Scrapegraph AI to intelligently scrape website content.

    Raises:
        ValueError: If API key is missing or URL format is invalid
        RateLimitError: If API rate limits are exceeded
        RuntimeError: If scraping operation fails
    """

    name: str = "Scrapegraph website scraper"
    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
@@ -45,6 +75,7 @@ class ScrapegraphScrapeTool(BaseTool):
            raise ValueError("Scrapegraph API key is required")

        if website_url is not None:
            self._validate_url(website_url)
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            self.args_schema = FixedScrapegraphScrapeToolSchema
@@ -55,6 +86,32 @@ class ScrapegraphScrapeTool(BaseTool):
        # Configure logging
        sgai_logger.set_logging(level="INFO")

    @staticmethod
    def _validate_url(url: str) -> None:
        """Validate URL format"""
        try:
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError
        except Exception:
            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")

    def _handle_api_response(self, response: dict) -> str:
        """Handle and validate API response"""
        if not response:
            raise RuntimeError("Empty response from Scrapegraph API")

        if "error" in response:
            error_msg = response.get("error", {}).get("message", "Unknown error")
            if "rate limit" in error_msg.lower():
                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
            raise RuntimeError(f"API error: {error_msg}")

        if "result" not in response:
            raise RuntimeError("Invalid response format from Scrapegraph API")

        return response["result"]

    def _run(
        self,
        **kwargs: Any,
@@ -65,6 +122,9 @@ class ScrapegraphScrapeTool(BaseTool):
        if not website_url:
            raise ValueError("website_url is required")

        # Validate URL format
        self._validate_url(website_url)

        # Initialize the client
        sgai_client = Client(api_key=self.api_key)

@@ -75,8 +135,13 @@ class ScrapegraphScrapeTool(BaseTool):
                user_prompt=user_prompt,
            )

            # Handle and validate the response
            return self._handle_api_response(response)
        except RateLimitError:
            raise  # Re-raise rate limit errors
        except Exception as e:
            raise RuntimeError(f"Scraping failed: {str(e)}")
        finally:
            # Always close the client
            sgai_client.close()