mirror of https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 08:08:32 +00:00
update documents according to suggestions
@@ -9,7 +9,9 @@ Install the required packages:
pip install 'crewai[tools]'
```

## Example Usage

### Basic Usage

```python
from crewai_tools import ScrapegraphScrapeTool

@@ -19,19 +21,40 @@ result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary"
)
```

### Fixed Website URL

```python
# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key"
)
result = tool.run()
```

### Custom Prompt

```python
# With custom prompt
tool = ScrapegraphScrapeTool(
    api_key="your_api_key",
    user_prompt="Extract all product prices and descriptions"
)
result = tool.run(website_url="https://www.example.com")
```

### Error Handling

```python
try:
    tool = ScrapegraphScrapeTool(api_key="your_api_key")
    result = tool.run(
        website_url="https://www.example.com",
        user_prompt="Extract the main heading"
    )
except ValueError as e:
    print(f"Configuration error: {e}")  # Handles invalid URLs or missing API keys
except RuntimeError as e:
    print(f"Scraping error: {e}")  # Handles API or network errors
```

## Arguments
@@ -40,4 +63,22 @@ tool = ScrapegraphScrapeTool(
- `api_key`: Your Scrapegraph API key (required; can also be set via the SCRAPEGRAPH_API_KEY environment variable)

## Environment Variables
- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
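
For example, a minimal sketch of configuring the key through the environment instead of the constructor (the key value and prompt are illustrative placeholders):

```python
import os

from crewai_tools import ScrapegraphScrapeTool

# Set SCRAPEGRAPH_API_KEY (done in-process here purely for illustration)
os.environ["SCRAPEGRAPH_API_KEY"] = "your_api_key"

# No api_key argument needed; the tool falls back to the environment variable
tool = ScrapegraphScrapeTool()
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading",
)
```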

## Rate Limiting

The Scrapegraph API has rate limits that vary based on your subscription plan. Consider the following best practices:

- Implement appropriate delays between requests when processing multiple URLs (see the sketch after this list)
- Handle rate limit errors gracefully in your application
- Check your API plan limits on the Scrapegraph dashboard
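
A minimal sketch covering both the delay and the graceful-handling points; the retry counts and sleep durations are illustrative, and the import path for `RateLimitError` is an assumption about the installed package layout:

```python
import time

from crewai_tools import ScrapegraphScrapeTool

# Assumed import path for the tool's RateLimitError; adjust to your installation
from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
    RateLimitError,
)

tool = ScrapegraphScrapeTool(api_key="your_api_key")
urls = ["https://www.example.com/a", "https://www.example.com/b"]
results = []

for url in urls:
    for attempt in range(3):  # illustrative: up to 3 attempts per URL
        try:
            results.append(tool.run(website_url=url, user_prompt="Extract the main heading"))
            break
        except RateLimitError:
            if attempt == 2:
                raise  # give up after the final attempt
            time.sleep(2 ** attempt)  # simple backoff: wait 1s, then 2s
    time.sleep(1)  # small delay between URLs to stay under plan limits
```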

## Error Handling

The tool may raise the following exceptions:

- `ValueError`: When the API key is missing or the URL format is invalid
- `RuntimeError`: When the scraping operation fails (network issues, API errors)
- `RateLimitError`: When API rate limits are exceeded

## Best Practices

1. Always validate URLs before making requests
2. Implement proper error handling as shown in the examples
3. Consider caching results for frequently accessed pages (see the sketch below)
4. Monitor your API usage through the Scrapegraph dashboard
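
As a minimal sketch of point 3, a plain in-memory dict keyed by (URL, prompt); the cache layer is illustrative and not part of the tool:

```python
from crewai_tools import ScrapegraphScrapeTool

tool = ScrapegraphScrapeTool(api_key="your_api_key")
_cache: dict[tuple[str, str], str] = {}


def scrape_cached(website_url: str, user_prompt: str) -> str:
    """Scrape once per (URL, prompt) pair; repeat calls hit the in-memory cache."""
    key = (website_url, user_prompt)
    if key not in _cache:
        _cache[key] = tool.run(website_url=website_url, user_prompt=user_prompt)
    return _cache[key]


first = scrape_cached("https://www.example.com", "Extract the main heading")
second = scrape_cached("https://www.example.com", "Extract the main heading")  # cache hit
```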
|
|||||||
@@ -1,15 +1,25 @@
|
|||||||
import os
|
import os
|
||||||
from typing import Any, Optional, Type
|
from typing import Any, Optional, Type
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from crewai.tools import BaseTool
|
from crewai.tools import BaseTool
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field, validator
|
||||||
from scrapegraph_py import Client
|
from scrapegraph_py import Client
|
||||||
from scrapegraph_py.logger import sgai_logger
|
from scrapegraph_py.logger import sgai_logger
|
||||||
|
|
||||||
|
|
||||||
|
class ScrapegraphError(Exception):
|
||||||
|
"""Base exception for Scrapegraph-related errors"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimitError(ScrapegraphError):
|
||||||
|
"""Raised when API rate limits are exceeded"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class FixedScrapegraphScrapeToolSchema(BaseModel):
|
class FixedScrapegraphScrapeToolSchema(BaseModel):
|
||||||
"""Input for ScrapegraphScrapeTool when website_url is fixed."""
|
"""Input for ScrapegraphScrapeTool when website_url is fixed."""
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@@ -22,8 +32,28 @@ class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
        description="Prompt to guide the extraction of content",
    )

    @validator('website_url')
    def validate_url(cls, v):
        """Validate URL format"""
        try:
            result = urlparse(v)
            if not all([result.scheme, result.netloc]):
                raise ValueError
            return v
        except Exception:
            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")


class ScrapegraphScrapeTool(BaseTool):
    """
    A tool that uses Scrapegraph AI to intelligently scrape website content.

    Raises:
        ValueError: If API key is missing or URL format is invalid
        RateLimitError: If API rate limits are exceeded
        RuntimeError: If scraping operation fails
    """

    name: str = "Scrapegraph website scraper"
    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
@@ -45,6 +75,7 @@ class ScrapegraphScrapeTool(BaseTool):
            raise ValueError("Scrapegraph API key is required")

        if website_url is not None:
            self._validate_url(website_url)
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            self.args_schema = FixedScrapegraphScrapeToolSchema
@@ -55,6 +86,32 @@ class ScrapegraphScrapeTool(BaseTool):
        # Configure logging
        sgai_logger.set_logging(level="INFO")

    @staticmethod
    def _validate_url(url: str) -> None:
        """Validate URL format"""
        try:
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError
        except Exception:
            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")

    def _handle_api_response(self, response: dict) -> str:
        """Handle and validate API response"""
        if not response:
            raise RuntimeError("Empty response from Scrapegraph API")

        if "error" in response:
            error_msg = response.get("error", {}).get("message", "Unknown error")
            if "rate limit" in error_msg.lower():
                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
            raise RuntimeError(f"API error: {error_msg}")

        if "result" not in response:
            raise RuntimeError("Invalid response format from Scrapegraph API")

        return response["result"]

    def _run(
        self,
        **kwargs: Any,
@@ -65,6 +122,9 @@ class ScrapegraphScrapeTool(BaseTool):
        if not website_url:
            raise ValueError("website_url is required")

        # Validate URL format
        self._validate_url(website_url)

        # Initialize the client
        sgai_client = Client(api_key=self.api_key)

@@ -75,8 +135,13 @@ class ScrapegraphScrapeTool(BaseTool):
                user_prompt=user_prompt,
            )

            # Handle and validate the response
            return self._handle_api_response(response)
        except RateLimitError:
            raise  # Re-raise rate limit errors
        except Exception as e:
            raise RuntimeError(f"Scraping failed: {str(e)}")
        finally:
            # Always close the client
            sgai_client.close()