diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py
index e1c7a70f6..ba779e5ac 100644
--- a/src/crewai_tools/__init__.py
+++ b/src/crewai_tools/__init__.py
@@ -27,6 +27,8 @@ from .tools import (
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
+    ScrapegraphScrapeTool,
+    ScrapegraphScrapeToolSchema,
     ScrapeWebsiteTool,
     ScrapflyScrapeWebsiteTool,
     SeleniumScrapingTool,
diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py
index 157f631dc..d6faccc98 100644
--- a/src/crewai_tools/tools/__init__.py
+++ b/src/crewai_tools/tools/__init__.py
@@ -32,6 +32,7 @@ from .rag.rag_tool import RagTool
 from .scrape_element_from_website.scrape_element_from_website import (
     ScrapeElementFromWebsiteTool,
 )
+from .scrapegraph_scrape_tool.scrapegraph_scrape_tool import ScrapegraphScrapeTool, ScrapegraphScrapeToolSchema
 from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
 from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import (
     ScrapflyScrapeWebsiteTool,
diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
new file mode 100644
index 000000000..e006c0ff9
--- /dev/null
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
@@ -0,0 +1,84 @@
+# ScrapegraphScrapeTool
+
+## Description
+A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. Its AI-powered extraction makes it well suited to targeted data collection and content analysis tasks.
+
+## Installation
+Install the required packages (the tool imports the `scrapegraph_py` SDK, published as `scrapegraph-py`):
+```shell
+pip install 'crewai[tools]' scrapegraph-py
+```
+
+## Example Usage
+
+### Basic Usage
+```python
+from crewai_tools import ScrapegraphScrapeTool
+
+# Basic usage with an API key
+tool = ScrapegraphScrapeTool(api_key="your_api_key")
+result = tool.run(
+    website_url="https://www.example.com",
+    user_prompt="Extract the main heading and summary",
+)
+```
+
+### Fixed Website URL
+```python
+# Initialize with a fixed website URL
+tool = ScrapegraphScrapeTool(
+    website_url="https://www.example.com",
+    api_key="your_api_key",
+)
+result = tool.run()
+```
+
+### Custom Prompt
+```python
+# Initialize with a default extraction prompt
+tool = ScrapegraphScrapeTool(
+    api_key="your_api_key",
+    user_prompt="Extract all product prices and descriptions",
+)
+result = tool.run(website_url="https://www.example.com")
+```
+
+### Error Handling
+```python
+try:
+    tool = ScrapegraphScrapeTool(api_key="your_api_key")
+    result = tool.run(
+        website_url="https://www.example.com",
+        user_prompt="Extract the main heading",
+    )
+except ValueError as e:
+    print(f"Configuration error: {e}")  # Invalid URL or missing API key
+except RuntimeError as e:
+    print(f"Scraping error: {e}")  # API or network errors
+```
+
+## Arguments
+- `website_url`: The URL of the website to scrape (required unless set at initialization)
+- `user_prompt`: Custom instructions for content extraction (optional)
+- `api_key`: Your Scrapegraph API key (required; can also be set via the SCRAPEGRAPH_API_KEY environment variable)
+
+## Environment Variables
+- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
+
+## Rate Limiting
+The Scrapegraph API enforces rate limits that vary with your subscription plan. Consider the following best practices (see the retry sketch below):
+- Implement appropriate delays between requests when processing multiple URLs
+- Handle rate limit errors gracefully in your application
+- Check your API plan limits on the Scrapegraph dashboard
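+
+A minimal retry sketch for when the tool raises its `RateLimitError`; the import path mirrors this module's layout, and the attempt counts and delays are illustrative:
+
+```python
+import time
+
+from crewai_tools import ScrapegraphScrapeTool
+from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
+    RateLimitError,
+)
+
+tool = ScrapegraphScrapeTool(api_key="your_api_key")
+
+result = None
+for attempt in range(3):
+    try:
+        result = tool.run(
+            website_url="https://www.example.com",
+            user_prompt="Extract the main heading",
+        )
+        break
+    except RateLimitError:
+        if attempt == 2:
+            raise  # Give up after three attempts
+        time.sleep(2**attempt)  # Back off: 1s, then 2s
+```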
+
+## Error Handling
+The tool may raise the following exceptions:
+- `ValueError`: When the API key is missing or the URL format is invalid
+- `RuntimeError`: When the scraping operation fails (network issues, API errors)
+- `RateLimitError`: When API rate limits are exceeded (defined in this tool's module)
+
+## Best Practices
+1. Always validate URLs before making requests
+2. Implement proper error handling, as shown in the examples above
+3. Consider caching results for frequently accessed pages (see the sketch below)
+4. Monitor your API usage through the Scrapegraph dashboard
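+
+### Caching Results
+A minimal in-process sketch for best practice 3 using `functools.lru_cache`; the wrapper function is illustrative, not part of the tool:
+
+```python
+from functools import lru_cache
+
+from crewai_tools import ScrapegraphScrapeTool
+
+tool = ScrapegraphScrapeTool(api_key="your_api_key")
+
+@lru_cache(maxsize=128)
+def scrape_cached(website_url: str, user_prompt: str) -> str:
+    """Cache results per (url, prompt) pair for the life of the process."""
+    return tool.run(website_url=website_url, user_prompt=user_prompt)
+
+first = scrape_cached("https://www.example.com", "Extract the main heading")
+second = scrape_cached("https://www.example.com", "Extract the main heading")  # cache hit
+```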
diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
new file mode 100644
index 000000000..906bf6376
--- /dev/null
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
@@ -0,0 +1,147 @@
+import os
+from typing import Any, Optional, Type
+from urllib.parse import urlparse
+
+from crewai.tools import BaseTool
+from pydantic import BaseModel, Field, field_validator
+from scrapegraph_py import Client
+from scrapegraph_py.logger import sgai_logger
+
+
+class ScrapegraphError(Exception):
+    """Base exception for Scrapegraph-related errors"""
+
+
+class RateLimitError(ScrapegraphError):
+    """Raised when API rate limits are exceeded"""
+
+
+class FixedScrapegraphScrapeToolSchema(BaseModel):
+    """Input for ScrapegraphScrapeTool when website_url is fixed."""
+
+
+class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
+    """Input for ScrapegraphScrapeTool."""
+
+    website_url: str = Field(..., description="Mandatory website url to scrape")
+    user_prompt: str = Field(
+        default="Extract the main content of the webpage",
+        description="Prompt to guide the extraction of content",
+    )
+
+    @field_validator("website_url")
+    @classmethod
+    def validate_url(cls, v: str) -> str:
+        """Validate that the URL includes a scheme and a domain."""
+        result = urlparse(v)
+        if not all([result.scheme, result.netloc]):
+            raise ValueError(
+                "Invalid URL format. URL must include scheme (http/https) and domain"
+            )
+        return v
+
+
+class ScrapegraphScrapeTool(BaseTool):
+    """
+    A tool that uses Scrapegraph AI to intelligently scrape website content.
+
+    Raises:
+        ValueError: If the API key is missing or the URL format is invalid
+        RateLimitError: If API rate limits are exceeded
+        RuntimeError: If the scraping operation fails
+    """
+
+    name: str = "Scrapegraph website scraper"
+    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
+    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
+    website_url: Optional[str] = None
+    user_prompt: Optional[str] = None
+    api_key: Optional[str] = None
+
+    def __init__(
+        self,
+        website_url: Optional[str] = None,
+        user_prompt: Optional[str] = None,
+        api_key: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")
+
+        if not self.api_key:
+            raise ValueError("Scrapegraph API key is required")
+
+        if website_url is not None:
+            self._validate_url(website_url)
+            self.website_url = website_url
+            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
+            self.args_schema = FixedScrapegraphScrapeToolSchema
+
+        if user_prompt is not None:
+            self.user_prompt = user_prompt
+
+        # Configure logging
+        sgai_logger.set_logging(level="INFO")
+
+    @staticmethod
+    def _validate_url(url: str) -> None:
+        """Validate that the URL includes a scheme and a domain."""
+        result = urlparse(url)
+        if not all([result.scheme, result.netloc]):
+            raise ValueError(
+                "Invalid URL format. URL must include scheme (http/https) and domain"
+            )
+
+    def _handle_api_response(self, response: dict) -> str:
+        """Handle and validate the API response"""
+        if not response:
+            raise RuntimeError("Empty response from Scrapegraph API")
+
+        if "error" in response:
+            error_msg = response.get("error", {}).get("message", "Unknown error")
+            if "rate limit" in error_msg.lower():
+                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
+            raise RuntimeError(f"API error: {error_msg}")
+
+        if "result" not in response:
+            raise RuntimeError("Invalid response format from Scrapegraph API")
+
+        return response["result"]
+
+    def _run(
+        self,
+        **kwargs: Any,
+    ) -> Any:
+        website_url = kwargs.get("website_url", self.website_url)
+        user_prompt = (
+            kwargs.get("user_prompt", self.user_prompt)
+            or "Extract the main content of the webpage"
+        )
+
+        if not website_url:
+            raise ValueError("website_url is required")
+
+        # Validate URL format
+        self._validate_url(website_url)
+
+        # Initialize the client
+        sgai_client = Client(api_key=self.api_key)
+
+        try:
+            # Make the SmartScraper request
+            response = sgai_client.smartscraper(
+                website_url=website_url,
+                user_prompt=user_prompt,
+            )
+
+            # Handle and validate the response
+            return self._handle_api_response(response)
+
+        except RateLimitError:
+            raise  # Re-raise rate limit errors untouched
+        except Exception as e:
+            raise RuntimeError(f"Scraping failed: {e}") from e
+        finally:
+            # Always close the client
+            sgai_client.close()
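
For reviewers trying the branch locally, a minimal sketch of wiring the new tool into a crew, assuming the standard `crewai` `Agent`/`Task`/`Crew` API; the role, goal, and task strings are illustrative:

```python
from crewai import Agent, Crew, Task
from crewai_tools import ScrapegraphScrapeTool

scrape_tool = ScrapegraphScrapeTool(api_key="your_api_key")

researcher = Agent(
    role="Web Researcher",
    goal="Summarize the main content of a given page",
    backstory="An analyst who extracts structured facts from the web.",
    tools=[scrape_tool],
)

task = Task(
    description="Scrape https://www.example.com and summarize its main heading.",
    expected_output="A one-paragraph summary of the page's main content.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())
```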