From b58d80dcf9373099ecc1bbc2715b6d042e8396ca Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 18 Dec 2024 14:42:37 +0100
Subject: [PATCH] update documents according to suggestions

---
 .../tools/scrapegraph_scrape_tool/README.md   | 45 +++++++++++-
 .../scrapegraph_scrape_tool.py                | 73 ++++++++++++++++++-
 2 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
index 03467faee..e006c0ff9 100644
--- a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
@@ -9,7 +9,9 @@ Install the required packages:
 pip install 'crewai[tools]'
 ```
 
-## Example
+## Example Usage
+
+### Basic Usage
 
 ```python
 from crewai_tools import ScrapegraphScrapeTool
@@ -19,19 +21,40 @@ result = tool.run(
     website_url="https://www.example.com",
     user_prompt="Extract the main heading and summary"
 )
+```
 
+### Fixed Website URL
+```python
 # Initialize with a fixed website URL
 tool = ScrapegraphScrapeTool(
     website_url="https://www.example.com",
     api_key="your_api_key"
 )
 result = tool.run()
+```
 
+### Custom Prompt
+```python
 # With custom prompt
 tool = ScrapegraphScrapeTool(
     api_key="your_api_key",
     user_prompt="Extract all product prices and descriptions"
 )
+result = tool.run(website_url="https://www.example.com")
+```
+
+### Error Handling
+```python
+try:
+    tool = ScrapegraphScrapeTool(api_key="your_api_key")
+    result = tool.run(
+        website_url="https://www.example.com",
+        user_prompt="Extract the main heading"
+    )
+except ValueError as e:
+    print(f"Configuration error: {e}")  # Handles invalid URLs or missing API keys
+except RuntimeError as e:
+    print(f"Scraping error: {e}")  # Handles API or network errors
 ```
 
 ## Arguments
@@ -40,4 +63,22 @@ tool = ScrapegraphScrapeTool(
 - `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable)
 
 ## Environment Variables
-- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key, you can buy it [here](https://scrapegraphai.com)
+- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
+
+## Rate Limiting
+The Scrapegraph API has rate limits that vary based on your subscription plan. Consider the following best practices:
+- Implement appropriate delays between requests when processing multiple URLs
+- Handle rate limit errors gracefully in your application
+- Check your API plan limits on the Scrapegraph dashboard
+
+## Error Handling
+The tool may raise the following exceptions:
+- `ValueError`: When the API key is missing or the URL format is invalid
+- `RuntimeError`: When the scraping operation fails (network issues, API errors)
+- `RateLimitError`: When API rate limits are exceeded
+
+## Best Practices
+1. Always validate URLs before making requests
+2. Implement proper error handling as shown in the examples
+3. Consider caching results for frequently accessed pages
+4. Monitor your API usage through the Scrapegraph dashboard
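The Rate Limiting and Best Practices sections added above recommend delays between requests and graceful handling of rate-limit errors. A minimal retry-with-backoff sketch of that advice follows; the `scrape_with_backoff` helper, its retry parameters, and the `RateLimitError` import path are illustrative assumptions, not part of the patch:

```python
import time

from crewai_tools import ScrapegraphScrapeTool

# RateLimitError is defined in the tool module this patch modifies; the import
# path below is an assumption about how the package exposes it.
from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
    RateLimitError,
)


def scrape_with_backoff(tool, url, prompt, max_retries=3, base_delay=1.0):
    """Retry a scrape with exponential backoff when rate limits are hit."""
    for attempt in range(max_retries):
        try:
            return tool.run(website_url=url, user_prompt=prompt)
        except RateLimitError:
            if attempt == max_retries - 1:
                raise  # Out of retries: surface the rate-limit error
            time.sleep(base_delay * (2 ** attempt))  # Wait 1s, 2s, 4s, ...


tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = scrape_with_backoff(
    tool, "https://www.example.com", "Extract the main heading"
)
```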
diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
index 058af4150..906bf6376 100644
--- a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
@@ -1,15 +1,25 @@
 import os
 from typing import Any, Optional, Type
+from urllib.parse import urlparse
 
 from crewai.tools import BaseTool
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator
 from scrapegraph_py import Client
 from scrapegraph_py.logger import sgai_logger
 
 
+class ScrapegraphError(Exception):
+    """Base exception for Scrapegraph-related errors"""
+    pass
+
+
+class RateLimitError(ScrapegraphError):
+    """Raised when API rate limits are exceeded"""
+    pass
+
+
 class FixedScrapegraphScrapeToolSchema(BaseModel):
     """Input for ScrapegraphScrapeTool when website_url is fixed."""
 
-    pass
 
 
@@ -22,8 +32,28 @@ class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
         description="Prompt to guide the extraction of content",
     )
 
+    @validator('website_url')
+    def validate_url(cls, v):
+        """Validate URL format"""
+        try:
+            result = urlparse(v)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError
+            return v
+        except Exception:
+            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
+
 
 class ScrapegraphScrapeTool(BaseTool):
+    """
+    A tool that uses Scrapegraph AI to intelligently scrape website content.
+
+    Raises:
+        ValueError: If API key is missing or URL format is invalid
+        RateLimitError: If API rate limits are exceeded
+        RuntimeError: If scraping operation fails
+    """
+
     name: str = "Scrapegraph website scraper"
     description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
     args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
@@ -45,6 +75,7 @@ class ScrapegraphScrapeTool(BaseTool):
             raise ValueError("Scrapegraph API key is required")
 
         if website_url is not None:
+            self._validate_url(website_url)
             self.website_url = website_url
             self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
             self.args_schema = FixedScrapegraphScrapeToolSchema
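Both the Pydantic validator above and the `_validate_url` helper defined in the next hunk rest on the same `urlparse` check: a URL passes only if it carries both a scheme and a network location. A standalone sketch of that check, mirroring the patch's logic rather than importing it:

```python
from urllib.parse import urlparse


def is_valid_url(url: str) -> bool:
    """True only when the URL has both a scheme and a domain."""
    result = urlparse(url)
    return all([result.scheme, result.netloc])


assert is_valid_url("https://www.example.com")
assert not is_valid_url("www.example.com")  # no scheme
assert not is_valid_url("https://")         # no domain
```

Note that `urlparse` almost never raises on malformed input, so the `except Exception` in the patch's helpers mostly re-wraps the deliberate `raise ValueError` with the descriptive message.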
@@ -55,6 +86,32 @@ class ScrapegraphScrapeTool(BaseTool):
         # Configure logging
         sgai_logger.set_logging(level="INFO")
 
+    @staticmethod
+    def _validate_url(url: str) -> None:
+        """Validate URL format"""
+        try:
+            result = urlparse(url)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError
+        except Exception:
+            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
+
+    def _handle_api_response(self, response: dict) -> str:
+        """Handle and validate API response"""
+        if not response:
+            raise RuntimeError("Empty response from Scrapegraph API")
+
+        if "error" in response:
+            error_msg = response.get("error", {}).get("message", "Unknown error")
+            if "rate limit" in error_msg.lower():
+                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
+            raise RuntimeError(f"API error: {error_msg}")
+
+        if "result" not in response:
+            raise RuntimeError("Invalid response format from Scrapegraph API")
+
+        return response["result"]
+
     def _run(
         self,
         **kwargs: Any,
@@ -65,6 +122,9 @@
         if not website_url:
             raise ValueError("website_url is required")
 
+        # Validate URL format
+        self._validate_url(website_url)
+
         # Initialize the client
         sgai_client = Client(api_key=self.api_key)
 
@@ -75,8 +135,13 @@
                 user_prompt=user_prompt,
             )
 
-            # Return the result
-            return response["result"]
+            # Handle and validate the response
+            return self._handle_api_response(response)
+
+        except RateLimitError:
+            raise  # Re-raise rate limit errors
+        except Exception as e:
+            raise RuntimeError(f"Scraping failed: {str(e)}")
         finally:
             # Always close the client
             sgai_client.close()
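Best practice 3 in the README above suggests caching results for frequently accessed pages. A minimal sketch using `functools.lru_cache`; the `cached_scrape` wrapper is illustrative, not part of the patch:

```python
from functools import lru_cache

from crewai_tools import ScrapegraphScrapeTool

tool = ScrapegraphScrapeTool(api_key="your_api_key")


@lru_cache(maxsize=128)
def cached_scrape(url: str, prompt: str) -> str:
    """Scrape once per (url, prompt) pair; repeat calls are served from the cache."""
    return tool.run(website_url=url, user_prompt=prompt)


first = cached_scrape("https://www.example.com", "Extract the main heading")
second = cached_scrape("https://www.example.com", "Extract the main heading")  # cache hit
```

Since `lru_cache` keys on the exact `(url, prompt)` pair, even a small change to the prompt bypasses the cache and triggers a fresh API call.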