Merge pull request #153 from VinciGit00/main

feat: integration of scrapegraph APIs
João Moura committed (via GitHub) on 2024-12-29 12:26:01 -03:00
4 changed files with 234 additions and 0 deletions

crewai_tools/__init__.py

@@ -27,6 +27,8 @@ from .tools import (
    PGSearchTool,
    RagTool,
    ScrapeElementFromWebsiteTool,
    ScrapegraphScrapeTool,
    ScrapegraphScrapeToolSchema,
    ScrapeWebsiteTool,
    ScrapflyScrapeWebsiteTool,
    SeleniumScrapingTool,

crewai_tools/tools/__init__.py

@@ -32,6 +32,7 @@ from .rag.rag_tool import RagTool
from .scrape_element_from_website.scrape_element_from_website import (
    ScrapeElementFromWebsiteTool,
)
from .scrapegraph_scrape_tool.scrapegraph_scrape_tool import ScrapegraphScrapeTool, ScrapegraphScrapeToolSchema
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import (
ScrapflyScrapeWebsiteTool,

crewai_tools/tools/scrapegraph_scrape_tool/README.md

@@ -0,0 +1,84 @@
# ScrapegraphScrapeTool
## Description
A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. This tool provides advanced web scraping capabilities with AI-powered content extraction, making it ideal for targeted data collection and content analysis tasks.
## Installation
Install the required packages (the tool imports the `scrapegraph_py` client, published on PyPI as `scrapegraph-py`):
```shell
pip install 'crewai[tools]'
pip install scrapegraph-py
```
## Example Usage
### Basic Usage
```python
from crewai_tools import ScrapegraphScrapeTool

# Basic usage with an API key
tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary",
)
```
### Fixed Website URL
```python
# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key",
)
result = tool.run()
```
### Custom Prompt
```python
# Initialize with a custom extraction prompt
tool = ScrapegraphScrapeTool(
    api_key="your_api_key",
    user_prompt="Extract all product prices and descriptions",
)
result = tool.run(website_url="https://www.example.com")
```
### Error Handling
```python
try:
    tool = ScrapegraphScrapeTool(api_key="your_api_key")
    result = tool.run(
        website_url="https://www.example.com",
        user_prompt="Extract the main heading",
    )
except ValueError as e:
    print(f"Configuration error: {e}")  # Invalid URL or missing API key
except RuntimeError as e:
    print(f"Scraping error: {e}")  # API or network errors
```
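### Agent Integration
The tool can also be handed to a crewAI agent. Below is a minimal sketch assuming the standard `Agent`/`Task`/`Crew` API from the `crewai` package; the role, goal, and task text are illustrative, not part of this PR:
```python
from crewai import Agent, Crew, Task
from crewai_tools import ScrapegraphScrapeTool

# Illustrative researcher agent that uses the scraper as one of its tools
scrape_tool = ScrapegraphScrapeTool(api_key="your_api_key")

researcher = Agent(
    role="Web Researcher",
    goal="Collect structured content from target pages",
    backstory="An analyst who turns raw web pages into clean summaries.",
    tools=[scrape_tool],
)

task = Task(
    description="Scrape https://www.example.com and summarize its main heading.",
    expected_output="A short summary of the page's main heading and content.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())
```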
## Arguments
- `website_url`: The URL of the website to scrape (required if not set during initialization)
- `user_prompt`: Custom instructions for content extraction (optional)
- `api_key`: Your Scrapegraph API key (required; may also be set via the `SCRAPEGRAPH_API_KEY` environment variable)
## Environment Variables
- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
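For example, to set the key in your shell before running your script:
```shell
export SCRAPEGRAPH_API_KEY="your_api_key"
```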
## Rate Limiting
The Scrapegraph API enforces rate limits that vary by subscription plan. Consider the following best practices:
- Add an appropriate delay between requests when processing multiple URLs (see the sketch below)
- Handle rate-limit errors gracefully in your application
- Check your plan's limits on the Scrapegraph dashboard
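A minimal sketch of a paced scraping loop; the one-second delay, the single retry, and the 30-second backoff are illustrative assumptions, not documented Scrapegraph limits, and the deep import path for `RateLimitError` is inferred from this PR's package layout:
```python
import time

from crewai_tools import ScrapegraphScrapeTool
# RateLimitError is defined in the tool module added by this PR
from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
    RateLimitError,
)

tool = ScrapegraphScrapeTool(api_key="your_api_key")
urls = ["https://www.example.com/a", "https://www.example.com/b"]

results = {}
for url in urls:
    try:
        results[url] = tool.run(website_url=url)
    except RateLimitError:
        # Illustrative policy: back off once, then retry the same URL
        time.sleep(30)
        results[url] = tool.run(website_url=url)
    time.sleep(1)  # Pace requests; pick a delay that fits your plan
```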
## Error Handling
The tool may raise the following exceptions:
- `ValueError`: When the API key is missing or the URL format is invalid
- `RuntimeError`: When the scraping operation fails (network issues, API errors)
- `RateLimitError`: When API rate limits are exceeded
## Best Practices
1. Always validate URLs before making requests
2. Implement proper error handling as shown in the examples above
3. Consider caching results for frequently accessed pages (see the sketch below)
4. Monitor your API usage through the Scrapegraph dashboard
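For item 3, a minimal caching sketch using `functools.lru_cache`; the `scrape_cached` wrapper and the cache size of 128 are illustrative assumptions:
```python
from functools import lru_cache

from crewai_tools import ScrapegraphScrapeTool

tool = ScrapegraphScrapeTool(api_key="your_api_key")

@lru_cache(maxsize=128)
def scrape_cached(website_url: str, user_prompt: str):
    """Cache results per (URL, prompt) pair to avoid repeat API calls."""
    return tool.run(website_url=website_url, user_prompt=user_prompt)

first = scrape_cached("https://www.example.com", "Extract the main heading")
second = scrape_cached("https://www.example.com", "Extract the main heading")  # served from cache
```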

crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py

@@ -0,0 +1,147 @@
import os
from typing import Any, Optional, Type
from urllib.parse import urlparse

from crewai.tools import BaseTool
from pydantic import BaseModel, Field, validator
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger


class ScrapegraphError(Exception):
    """Base exception for Scrapegraph-related errors"""

    pass


class RateLimitError(ScrapegraphError):
    """Raised when API rate limits are exceeded"""

    pass


class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Input for ScrapegraphScrapeTool when website_url is fixed."""

    pass


class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
    """Input for ScrapegraphScrapeTool."""

    website_url: str = Field(..., description="Mandatory website url to scrape")
    user_prompt: str = Field(
        default="Extract the main content of the webpage",
        description="Prompt to guide the extraction of content",
    )

    @validator("website_url")
    def validate_url(cls, v):
        """Validate URL format"""
        try:
            result = urlparse(v)
            if not all([result.scheme, result.netloc]):
                raise ValueError
            return v
        except Exception:
            raise ValueError(
                "Invalid URL format. URL must include scheme (http/https) and domain"
            )


class ScrapegraphScrapeTool(BaseTool):
    """
    A tool that uses Scrapegraph AI to intelligently scrape website content.

    Raises:
        ValueError: If API key is missing or URL format is invalid
        RateLimitError: If API rate limits are exceeded
        RuntimeError: If scraping operation fails
    """

    name: str = "Scrapegraph website scraper"
    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
    website_url: Optional[str] = None
    user_prompt: Optional[str] = None
    api_key: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        user_prompt: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")
        if not self.api_key:
            raise ValueError("Scrapegraph API key is required")

        if website_url is not None:
            self._validate_url(website_url)
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            self.args_schema = FixedScrapegraphScrapeToolSchema

        if user_prompt is not None:
            self.user_prompt = user_prompt

        # Configure logging
        sgai_logger.set_logging(level="INFO")

    @staticmethod
    def _validate_url(url: str) -> None:
        """Validate URL format"""
        try:
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError
        except Exception:
            raise ValueError(
                "Invalid URL format. URL must include scheme (http/https) and domain"
            )

    def _handle_api_response(self, response: dict) -> str:
        """Handle and validate API response"""
        if not response:
            raise RuntimeError("Empty response from Scrapegraph API")

        if "error" in response:
            error_msg = response.get("error", {}).get("message", "Unknown error")
            if "rate limit" in error_msg.lower():
                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
            raise RuntimeError(f"API error: {error_msg}")

        if "result" not in response:
            raise RuntimeError("Invalid response format from Scrapegraph API")

        return response["result"]

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        user_prompt = (
            kwargs.get("user_prompt", self.user_prompt)
            or "Extract the main content of the webpage"
        )

        if not website_url:
            raise ValueError("website_url is required")

        # Validate URL format
        self._validate_url(website_url)

        # Initialize the client
        sgai_client = Client(api_key=self.api_key)

        try:
            # Make the SmartScraper request
            response = sgai_client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
            )

            # Handle and validate the response
            return self._handle_api_response(response)
        except RateLimitError:
            raise  # Re-raise rate limit errors
        except Exception as e:
            raise RuntimeError(f"Scraping failed: {str(e)}")
        finally:
            # Always close the client
            sgai_client.close()