Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools
git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
Greyson Lalonde
2025-09-12 21:58:02 -04:00
commit e16606672a
303 changed files with 49010 additions and 0 deletions


@@ -0,0 +1,84 @@
# ScrapegraphScrapeTool
## Description
A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. This tool provides advanced web scraping capabilities with AI-powered content extraction, making it ideal for targeted data collection and content analysis tasks.
## Installation
Install the required packages (the tool depends on `scrapegraph-py` for the Scrapegraph client):
```shell
pip install 'crewai[tools]'
pip install scrapegraph-py
```
## Example Usage
### Basic Usage
```python
from crewai_tools import ScrapegraphScrapeTool
# Basic usage with API key
tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary"
)
```
### Fixed Website URL
```python
# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key"
)
result = tool.run()
```
### Custom Prompt
```python
# With custom prompt
tool = ScrapegraphScrapeTool(
    api_key="your_api_key",
    user_prompt="Extract all product prices and descriptions"
)
result = tool.run(website_url="https://www.example.com")
```
### Error Handling
```python
try:
    tool = ScrapegraphScrapeTool(api_key="your_api_key")
    result = tool.run(
        website_url="https://www.example.com",
        user_prompt="Extract the main heading"
    )
except ValueError as e:
    print(f"Configuration error: {e}")  # Handles invalid URLs or missing API keys
except RuntimeError as e:
    print(f"Scraping error: {e}")  # Handles API or network errors
```
## Arguments
- `website_url`: The URL of the website to scrape (required if not set during initialization)
- `user_prompt`: Custom instructions for content extraction (optional)
- `api_key`: Your Scrapegraph API key (required; may also be supplied via the `SCRAPEGRAPH_API_KEY` environment variable)
## Environment Variables
- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
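When the key is set in the environment, it can be omitted from the constructor. A minimal sketch (the key value below is a placeholder; in practice, export it in your shell instead):
```python
import os

from crewai_tools import ScrapegraphScrapeTool

# Placeholder key; normally you would `export SCRAPEGRAPH_API_KEY=...` in your shell
os.environ["SCRAPEGRAPH_API_KEY"] = "your_api_key"

tool = ScrapegraphScrapeTool()  # falls back to SCRAPEGRAPH_API_KEY
result = tool.run(website_url="https://www.example.com")
```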
## Rate Limiting
The Scrapegraph API has rate limits that vary based on your subscription plan. Consider the following best practices:
- Implement appropriate delays between requests when processing multiple URLs
- Handle rate limit errors gracefully in your application
- Check your API plan limits on the Scrapegraph dashboard
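For the first point, a fixed delay between requests is often enough for small batches. This is a minimal sketch, not an official pattern; the one-second delay is an arbitrary value to tune against your plan's limits:
```python
import time

from crewai_tools import ScrapegraphScrapeTool

tool = ScrapegraphScrapeTool(api_key="your_api_key")
urls = [
    "https://www.example.com/page1",
    "https://www.example.com/page2",
]

results = []
for url in urls:
    results.append(tool.run(website_url=url))
    time.sleep(1.0)  # arbitrary delay; tune to your plan's rate limits
```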
## Error Handling
The tool may raise the following exceptions:
- `ValueError`: When API key is missing or URL format is invalid
- `RuntimeError`: When scraping operation fails (network issues, API errors)
- `RateLimitError`: When API rate limits are exceeded
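A sketch of retrying on rate limits with exponential backoff. `RateLimitError` is declared alongside the tool rather than re-exported, so the import path below is an assumption; adjust it to your installation:
```python
import time

from crewai_tools import ScrapegraphScrapeTool
# Assumed import path for RateLimitError; adjust to where the class lives in your install
from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
    RateLimitError,
)

def scrape_with_retry(tool, url, retries=3):
    delay = 1.0
    for attempt in range(retries):
        try:
            return tool.run(website_url=url)
        except RateLimitError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)  # back off before the next attempt
            delay *= 2  # exponential backoff

tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = scrape_with_retry(tool, "https://www.example.com")
```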
## Best Practices
1. Always validate URLs before making requests
2. Implement proper error handling as shown in examples
3. Consider caching results for frequently accessed pages
4. Monitor your API usage through the Scrapegraph dashboard
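For point 3, a minimal in-memory cache sketch; the dict-based cache and its key scheme are illustrative, not part of the tool:
```python
from crewai_tools import ScrapegraphScrapeTool

tool = ScrapegraphScrapeTool(api_key="your_api_key")
_cache: dict[tuple[str, str], object] = {}  # illustrative in-memory cache

def cached_scrape(url: str, prompt: str = "Extract the main content of the webpage"):
    key = (url, prompt)
    if key not in _cache:
        _cache[key] = tool.run(website_url=url, user_prompt=prompt)
    return _cache[key]

first = cached_scrape("https://www.example.com")   # hits the API
second = cached_scrape("https://www.example.com")  # served from the cache
```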


@@ -0,0 +1,183 @@
import os
from typing import TYPE_CHECKING, Any, List, Optional, Type
from urllib.parse import urlparse

from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, ConfigDict, Field, field_validator

# Type checking import
if TYPE_CHECKING:
    from scrapegraph_py import Client


class ScrapegraphError(Exception):
    """Base exception for Scrapegraph-related errors"""


class RateLimitError(ScrapegraphError):
    """Raised when API rate limits are exceeded"""


class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Input for ScrapegraphScrapeTool when website_url is fixed."""


class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
    """Input for ScrapegraphScrapeTool."""
    website_url: str = Field(..., description="Mandatory website url to scrape")
    user_prompt: str = Field(
        default="Extract the main content of the webpage",
        description="Prompt to guide the extraction of content",
    )

    @field_validator("website_url")
    def validate_url(cls, v):
        """Validate URL format"""
        try:
            result = urlparse(v)
            if not all([result.scheme, result.netloc]):
                raise ValueError
            return v
        except Exception:
            raise ValueError(
                "Invalid URL format. URL must include scheme (http/https) and domain"
            )


class ScrapegraphScrapeTool(BaseTool):
    """
    A tool that uses Scrapegraph AI to intelligently scrape website content.

    Raises:
        ValueError: If API key is missing or URL format is invalid
        RateLimitError: If API rate limits are exceeded
        RuntimeError: If scraping operation fails
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = "Scrapegraph website scraper"
    description: str = (
        "A tool that uses Scrapegraph AI to intelligently scrape website content."
    )
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
    website_url: Optional[str] = None
    user_prompt: Optional[str] = None
    api_key: Optional[str] = None
    enable_logging: bool = False
    _client: Optional["Client"] = None
    package_dependencies: List[str] = ["scrapegraph-py"]
    env_vars: List[EnvVar] = [
        EnvVar(
            name="SCRAPEGRAPH_API_KEY",
            description="API key for Scrapegraph AI services",
            required=False,
        ),
    ]

    def __init__(
        self,
        website_url: Optional[str] = None,
        user_prompt: Optional[str] = None,
        api_key: Optional[str] = None,
        enable_logging: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        try:
            from scrapegraph_py import Client
            from scrapegraph_py.logger import sgai_logger
        except ImportError:
            import click

            if click.confirm(
                "You are missing the 'scrapegraph-py' package. Would you like to install it?"
            ):
                import subprocess

                subprocess.run(["uv", "add", "scrapegraph-py"], check=True)
                from scrapegraph_py import Client
                from scrapegraph_py.logger import sgai_logger
            else:
                raise ImportError(
                    "`scrapegraph-py` package not found, please run `uv add scrapegraph-py`"
                )

        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")
        # Validate the key before constructing the client
        if not self.api_key:
            raise ValueError("Scrapegraph API key is required")
        self._client = Client(api_key=self.api_key)

        if website_url is not None:
            self._validate_url(website_url)
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            self.args_schema = FixedScrapegraphScrapeToolSchema
        if user_prompt is not None:
            self.user_prompt = user_prompt

        # Store the flag and configure logging only if enabled
        self.enable_logging = enable_logging
        if self.enable_logging:
            sgai_logger.set_logging(level="INFO")

    @staticmethod
    def _validate_url(url: str) -> None:
        """Validate URL format"""
        try:
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError
        except Exception:
            raise ValueError(
                "Invalid URL format. URL must include scheme (http/https) and domain"
            )

    def _handle_api_response(self, response: dict) -> str:
        """Handle and validate API response"""
        if not response:
            raise RuntimeError("Empty response from Scrapegraph API")
        if "error" in response:
            error_msg = response.get("error", {}).get("message", "Unknown error")
            if "rate limit" in error_msg.lower():
                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
            raise RuntimeError(f"API error: {error_msg}")
        if "result" not in response:
            raise RuntimeError("Invalid response format from Scrapegraph API")
        return response["result"]

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        user_prompt = (
            kwargs.get("user_prompt", self.user_prompt)
            or "Extract the main content of the webpage"
        )
        if not website_url:
            raise ValueError("website_url is required")

        # Validate URL format
        self._validate_url(website_url)

        try:
            # Make the SmartScraper request
            response = self._client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
            )
            return response
        except RateLimitError:
            raise  # Re-raise rate limit errors
        except Exception as e:
            raise RuntimeError(f"Scraping failed: {str(e)}")
        finally:
            # Always close the client
            self._client.close()