update documents according to suggestions

This commit is contained in:
Marco Vinciguerra
2024-12-18 14:42:37 +01:00
parent 7608944e7f
commit b58d80dcf9
2 changed files with 112 additions and 6 deletions

View File

@@ -9,7 +9,9 @@ Install the required packages:
pip install 'crewai[tools]'
```
## Example Usage
### Basic Usage
```python
from crewai_tools import ScrapegraphScrapeTool
@@ -19,19 +21,40 @@ result = tool.run(
website_url="https://www.example.com",
user_prompt="Extract the main heading and summary"
)
```
### Fixed Website URL
```python
# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
website_url="https://www.example.com",
api_key="your_api_key"
)
result = tool.run()
```
### Custom Prompt
```python
# With custom prompt
tool = ScrapegraphScrapeTool(
api_key="your_api_key",
user_prompt="Extract all product prices and descriptions"
)
result = tool.run(website_url="https://www.example.com")
```
### Error Handling
```python
try:
tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = tool.run(
website_url="https://www.example.com",
user_prompt="Extract the main heading"
)
except ValueError as e:
print(f"Configuration error: {e}") # Handles invalid URLs or missing API keys
except RuntimeError as e:
print(f"Scraping error: {e}") # Handles API or network errors
```
## Arguments
@@ -40,4 +63,22 @@ tool = ScrapegraphScrapeTool(
- `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable)
## Environment Variables
- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key, you can obtain one [here](https://scrapegraphai.com)
## Rate Limiting
The Scrapegraph API has rate limits that vary based on your subscription plan. Consider the following best practices:
- Implement appropriate delays between requests when processing multiple URLs
- Handle rate limit errors gracefully in your application
- Check your API plan limits on the Scrapegraph dashboard
## Error Handling
The tool may raise the following exceptions:
- `ValueError`: When API key is missing or URL format is invalid
- `RuntimeError`: When scraping operation fails (network issues, API errors)
- `RateLimitError`: When API rate limits are exceeded
## Best Practices
1. Always validate URLs before making requests
2. Implement proper error handling as shown in examples
3. Consider caching results for frequently accessed pages
4. Monitor your API usage through the Scrapegraph dashboard

View File

@@ -1,15 +1,25 @@
import os
from typing import Any, Optional, Type
from urllib.parse import urlparse
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, validator
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
class ScrapegraphError(Exception):
    """Base class for every Scrapegraph-specific exception raised by this module."""
class RateLimitError(ScrapegraphError):
    """Signals that the Scrapegraph API reported an exceeded rate limit."""
class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Empty input schema used when the tool is constructed with a fixed website_url."""
@@ -22,8 +32,28 @@ class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
description="Prompt to guide the extraction of content",
)
@validator('website_url')
def validate_url(cls, v):
    """Validate that the submitted URL has both a scheme and a domain.

    Args:
        v: The candidate URL string.

    Returns:
        The URL unchanged when it parses with a scheme and netloc.

    Raises:
        ValueError: If the URL cannot be parsed or lacks a scheme/domain.
    """
    try:
        # urlparse can itself raise ValueError on badly malformed input
        # (e.g. an invalid IPv6 netloc).
        parsed = urlparse(v)
    except Exception:
        raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
    # Checked outside the try block so our own ValueError is not swallowed
    # and re-raised by the broad except above (the original raised inside
    # the try, which also masked genuine programming errors).
    if not (parsed.scheme and parsed.netloc):
        raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
    return v
class ScrapegraphScrapeTool(BaseTool):
"""
A tool that uses Scrapegraph AI to intelligently scrape website content.
Raises:
ValueError: If API key is missing or URL format is invalid
RateLimitError: If API rate limits are exceeded
RuntimeError: If scraping operation fails
"""
name: str = "Scrapegraph website scraper"
description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
@@ -45,6 +75,7 @@ class ScrapegraphScrapeTool(BaseTool):
raise ValueError("Scrapegraph API key is required")
if website_url is not None:
self._validate_url(website_url)
self.website_url = website_url
self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
self.args_schema = FixedScrapegraphScrapeToolSchema
@@ -55,6 +86,32 @@ class ScrapegraphScrapeTool(BaseTool):
# Configure logging
sgai_logger.set_logging(level="INFO")
@staticmethod
def _validate_url(url: str) -> None:
    """Validate that *url* has both a scheme and a domain.

    Args:
        url: The candidate URL string.

    Raises:
        ValueError: If the URL cannot be parsed or lacks a scheme/domain.
    """
    try:
        # urlparse can itself raise ValueError on badly malformed input
        # (e.g. an invalid IPv6 netloc).
        parsed = urlparse(url)
    except Exception:
        raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
    # Checked outside the try block so our own ValueError is not swallowed
    # and re-raised by the broad except above (the original raised inside
    # the try, which also masked genuine programming errors).
    if not (parsed.scheme and parsed.netloc):
        raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
def _handle_api_response(self, response: dict) -> str:
    """Validate a raw Scrapegraph API response and extract its result.

    Args:
        response: Decoded JSON payload returned by the Scrapegraph client.

    Returns:
        The value of the response's "result" field.

    Raises:
        RuntimeError: On an empty response, an API-reported error, or a
            response missing the "result" field.
        RateLimitError: When the API error message indicates rate limiting.
    """
    if not response:
        raise RuntimeError("Empty response from Scrapegraph API")

    if "error" in response:
        error = response["error"]
        # The API may report the error either as a {"message": ...} dict or
        # as a plain string; the original dict-only .get() chain raised
        # AttributeError on string errors.
        if isinstance(error, dict):
            error_msg = error.get("message", "Unknown error")
        else:
            error_msg = str(error) or "Unknown error"
        if "rate limit" in error_msg.lower():
            raise RateLimitError(f"Rate limit exceeded: {error_msg}")
        raise RuntimeError(f"API error: {error_msg}")

    if "result" not in response:
        raise RuntimeError("Invalid response format from Scrapegraph API")

    return response["result"]
def _run(
self,
**kwargs: Any,
@@ -65,6 +122,9 @@ class ScrapegraphScrapeTool(BaseTool):
if not website_url:
raise ValueError("website_url is required")
# Validate URL format
self._validate_url(website_url)
# Initialize the client
sgai_client = Client(api_key=self.api_key)
@@ -75,8 +135,13 @@ class ScrapegraphScrapeTool(BaseTool):
user_prompt=user_prompt,
)
# Return the result
return response["result"]
# Handle and validate the response
return self._handle_api_response(response)
except RateLimitError:
raise # Re-raise rate limit errors
except Exception as e:
raise RuntimeError(f"Scraping failed: {str(e)}")
finally:
# Always close the client
sgai_client.close()