Merge pull request #153 from VinciGit00/main
feat: integration of scrapegraph APIs
@@ -27,6 +27,8 @@ from .tools import (
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
+    ScrapegraphScrapeTool,
+    ScrapegraphScrapeToolSchema,
     ScrapeWebsiteTool,
     ScrapflyScrapeWebsiteTool,
     SeleniumScrapingTool,
@@ -32,6 +32,7 @@ from .rag.rag_tool import RagTool
 from .scrape_element_from_website.scrape_element_from_website import (
     ScrapeElementFromWebsiteTool,
 )
+from .scrapegraph_scrape_tool.scrapegraph_scrape_tool import ScrapegraphScrapeTool, ScrapegraphScrapeToolSchema
 from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
 from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import (
     ScrapflyScrapeWebsiteTool,
84
src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
Normal file
@@ -0,0 +1,84 @@

# ScrapegraphScrapeTool

## Description

A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. Its AI-powered content extraction makes it well suited to targeted data collection and content analysis tasks.

## Installation

Install the required packages:

```shell
pip install 'crewai[tools]' scrapegraph-py
```

## Example Usage

### Basic Usage

```python
from crewai_tools import ScrapegraphScrapeTool

# Basic usage with an explicit API key
tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary",
)
```

### Fixed Website URL

```python
# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key",
)
result = tool.run()
```

### Custom Prompt

```python
# With a custom extraction prompt
tool = ScrapegraphScrapeTool(
    api_key="your_api_key",
    user_prompt="Extract all product prices and descriptions",
)
result = tool.run(website_url="https://www.example.com")
```

### Error Handling

```python
try:
    tool = ScrapegraphScrapeTool(api_key="your_api_key")
    result = tool.run(
        website_url="https://www.example.com",
        user_prompt="Extract the main heading",
    )
except ValueError as e:
    print(f"Configuration error: {e}")  # Invalid URL or missing API key
except RuntimeError as e:
    print(f"Scraping error: {e}")  # API or network errors
```

## Arguments

- `website_url`: The URL of the website to scrape (required if not set during initialization)
- `user_prompt`: Custom instructions for content extraction (optional)
- `api_key`: Your Scrapegraph API key (required; can also be set via the `SCRAPEGRAPH_API_KEY` environment variable)

## Environment Variables

- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
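
When the environment variable is set, the tool can be constructed without an explicit `api_key`. A minimal sketch (setting the variable from Python here only for illustration; in practice you would export it in your shell or deployment environment):

```python
import os

from crewai_tools import ScrapegraphScrapeTool

# Illustrative only: normally SCRAPEGRAPH_API_KEY is exported outside the script
os.environ["SCRAPEGRAPH_API_KEY"] = "your_api_key"

# api_key falls back to the SCRAPEGRAPH_API_KEY environment variable
tool = ScrapegraphScrapeTool()
result = tool.run(website_url="https://www.example.com")
```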

## Rate Limiting

The Scrapegraph API enforces rate limits that vary by subscription plan. Consider the following best practices:

- Implement appropriate delays between requests when processing multiple URLs (see the sketch below)
- Handle rate limit errors gracefully in your application
- Check your API plan limits on the Scrapegraph dashboard
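
A minimal sketch of pacing requests across multiple URLs with a fixed delay; the one-second pause and the example URLs are illustrative placeholders, not values prescribed by the API:

```python
import time

from crewai_tools import ScrapegraphScrapeTool

tool = ScrapegraphScrapeTool(api_key="your_api_key")

urls = [
    "https://www.example.com/page-1",
    "https://www.example.com/page-2",
]

results = {}
for url in urls:
    results[url] = tool.run(
        website_url=url,
        user_prompt="Extract the main heading",
    )
    time.sleep(1)  # illustrative pause between requests; tune to your plan's limits
```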

## Error Handling

The tool may raise the following exceptions:

- `ValueError`: When the API key is missing or the URL format is invalid
- `RuntimeError`: When the scraping operation fails (network issues, API errors)
- `RateLimitError`: When API rate limits are exceeded (see the sketch below)
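
`RateLimitError` is defined in the tool's own module rather than in Python's builtins, so catching it requires an import. A minimal sketch, assuming the module path introduced by this PR (whether the exception is also re-exported at the package root is not shown here):

```python
from crewai_tools import ScrapegraphScrapeTool
from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
    RateLimitError,
)

tool = ScrapegraphScrapeTool(api_key="your_api_key")
try:
    result = tool.run(website_url="https://www.example.com")
except RateLimitError as e:
    print(f"Rate limited, retry later: {e}")
```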

## Best Practices

1. Always validate URLs before making requests
2. Implement proper error handling as shown in the examples above
3. Consider caching results for frequently accessed pages (see the sketch below)
4. Monitor your API usage through the Scrapegraph dashboard
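
A minimal in-process caching sketch using `functools.lru_cache`; the cache size and the wrapper function are illustrative assumptions, and cached results persist only for the lifetime of the process:

```python
from functools import lru_cache

from crewai_tools import ScrapegraphScrapeTool

tool = ScrapegraphScrapeTool(api_key="your_api_key")


@lru_cache(maxsize=128)  # illustrative size; arguments must be hashable
def scrape_cached(website_url: str, user_prompt: str):
    """Return scraped content, reusing prior results for identical arguments."""
    return tool.run(website_url=website_url, user_prompt=user_prompt)


# The second identical call is served from the cache without an API request
first = scrape_cached("https://www.example.com", "Extract the main heading")
second = scrape_cached("https://www.example.com", "Extract the main heading")
```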
147
src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
Normal file
@@ -0,0 +1,147 @@
import os
from typing import Any, Optional, Type
from urllib.parse import urlparse

from crewai.tools import BaseTool
from pydantic import BaseModel, Field, validator
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger


class ScrapegraphError(Exception):
    """Base exception for Scrapegraph-related errors"""
    pass


class RateLimitError(ScrapegraphError):
    """Raised when API rate limits are exceeded"""
    pass


class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Input for ScrapegraphScrapeTool when website_url is fixed."""
    pass


class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
    """Input for ScrapegraphScrapeTool."""

    website_url: str = Field(..., description="Mandatory website url to scrape")
    user_prompt: str = Field(
        default="Extract the main content of the webpage",
        description="Prompt to guide the extraction of content",
    )

    @validator("website_url")
    def validate_url(cls, v):
        """Validate URL format"""
        try:
            result = urlparse(v)
            if not all([result.scheme, result.netloc]):
                raise ValueError
            return v
        except Exception:
            raise ValueError(
                "Invalid URL format. URL must include scheme (http/https) and domain"
            )


class ScrapegraphScrapeTool(BaseTool):
    """
    A tool that uses Scrapegraph AI to intelligently scrape website content.

    Raises:
        ValueError: If API key is missing or URL format is invalid
        RateLimitError: If API rate limits are exceeded
        RuntimeError: If scraping operation fails
    """

    name: str = "Scrapegraph website scraper"
    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
    website_url: Optional[str] = None
    user_prompt: Optional[str] = None
    api_key: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        user_prompt: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")

        if not self.api_key:
            raise ValueError("Scrapegraph API key is required")

        if website_url is not None:
            self._validate_url(website_url)
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            # With a fixed URL, callers no longer need to pass website_url
            self.args_schema = FixedScrapegraphScrapeToolSchema

        if user_prompt is not None:
            self.user_prompt = user_prompt

        # Configure logging
        sgai_logger.set_logging(level="INFO")

    @staticmethod
    def _validate_url(url: str) -> None:
        """Validate URL format"""
        try:
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError
        except Exception:
            raise ValueError(
                "Invalid URL format. URL must include scheme (http/https) and domain"
            )

    def _handle_api_response(self, response: dict) -> str:
        """Handle and validate API response"""
        if not response:
            raise RuntimeError("Empty response from Scrapegraph API")

        if "error" in response:
            error_msg = response.get("error", {}).get("message", "Unknown error")
            if "rate limit" in error_msg.lower():
                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
            raise RuntimeError(f"API error: {error_msg}")

        if "result" not in response:
            raise RuntimeError("Invalid response format from Scrapegraph API")

        return response["result"]

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        user_prompt = (
            kwargs.get("user_prompt", self.user_prompt)
            or "Extract the main content of the webpage"
        )

        if not website_url:
            raise ValueError("website_url is required")

        # Validate URL format
        self._validate_url(website_url)

        # Initialize the client
        sgai_client = Client(api_key=self.api_key)

        try:
            # Make the SmartScraper request
            response = sgai_client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
            )

            # Handle and validate the response
            return self._handle_api_response(response)

        except RateLimitError:
            raise  # Re-raise rate limit errors untouched
        except Exception as e:
            raise RuntimeError(f"Scraping failed: {str(e)}") from e
        finally:
            # Always close the client
            sgai_client.close()