From c070ba002c0d1f96087a53ed89a6963ba8d4b7ac Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 18 Dec 2024 14:34:40 +0100
Subject: [PATCH 1/5] feat: integration of scrapegraph APIs

---
 .../tools/scrapegraph_scrape_tool/README.md   | 43 ++++++++++
 .../scrapegraph_scrape_tool.py                | 82 +++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
 create mode 100644 src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py

diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
new file mode 100644
index 000000000..76f385831
--- /dev/null
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
@@ -0,0 +1,43 @@
+# ScrapegraphScrapeTool
+
+## Description
+A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. This tool provides advanced web scraping capabilities with AI-powered content extraction, making it ideal for targeted data collection and content analysis tasks.
+
+## Installation
+Install the required packages:
+```shell
+pip install 'crewai[tools]'
+```
+
+## Example
+```python
+from crewai_tools import ScrapegraphScrapeTool
+
+# Basic usage with API key
+tool = ScrapegraphScrapeTool(api_key="your_api_key")
+result = tool.run(
+    website_url="https://www.example.com",
+    user_prompt="Extract the main heading and summary"
+)
+
+# Initialize with a fixed website URL
+tool = ScrapegraphScrapeTool(
+    website_url="https://www.example.com",
+    api_key="your_api_key"
+)
+result = tool.run()
+
+# With custom prompt
+tool = ScrapegraphScrapeTool(
+    api_key="your_api_key",
+    user_prompt="Extract all product prices and descriptions"
+)
+```
+
+## Arguments
+- `website_url`: The URL of the website to scrape (required if not set during initialization)
+- `user_prompt`: Custom instructions for content extraction (optional)
+- `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable)
+
+## Environment Variables
+- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key
diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
new file mode 100644
index 000000000..058af4150
--- /dev/null
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
@@ -0,0 +1,82 @@
+import os
+from typing import Any, Optional, Type
+
+from crewai.tools import BaseTool
+from pydantic import BaseModel, Field
+from scrapegraph_py import Client
+from scrapegraph_py.logger import sgai_logger
+
+
+class FixedScrapegraphScrapeToolSchema(BaseModel):
+    """Input for ScrapegraphScrapeTool when website_url is fixed."""
+
+    pass
+
+
+class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
+    """Input for ScrapegraphScrapeTool."""
+
+    website_url: str = Field(..., description="Mandatory website url to scrape")
+    user_prompt: str = Field(
+        default="Extract the main content of the webpage",
+        description="Prompt to guide the extraction of content",
+    )
+
+
+class ScrapegraphScrapeTool(BaseTool):
+    name: str = "Scrapegraph website scraper"
+    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
+    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
+    website_url: Optional[str] = None
+    user_prompt: Optional[str] = None
+    api_key: Optional[str] = None
+
+    def __init__(
+        self,
+        website_url: Optional[str] = None,
+        user_prompt: Optional[str] = None,
+        api_key: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")
+
+        if not self.api_key:
+            raise ValueError("Scrapegraph API key is required")
+
+        if website_url is not None:
+            self.website_url = website_url
+            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
+            self.args_schema = FixedScrapegraphScrapeToolSchema
+
+        if user_prompt is not None:
+            self.user_prompt = user_prompt
+
+        # Configure logging
+        sgai_logger.set_logging(level="INFO")
+
+    def _run(
+        self,
+        **kwargs: Any,
+    ) -> Any:
+        website_url = kwargs.get("website_url", self.website_url)
+        user_prompt = kwargs.get("user_prompt", self.user_prompt) or "Extract the main content of the webpage"
+
+        if not website_url:
+            raise ValueError("website_url is required")
+
+        # Initialize the client
+        sgai_client = Client(api_key=self.api_key)
+
+        try:
+            # Make the SmartScraper request
+            response = sgai_client.smartscraper(
+                website_url=website_url,
+                user_prompt=user_prompt,
+            )
+
+            # Return the result
+            return response["result"]
+        finally:
+            # Always close the client
+            sgai_client.close()
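For context on how the tool added in this patch is consumed: `ScrapegraphScrapeTool` is a standard CrewAI `BaseTool` subclass, so it can be handed to an agent like any other tool. A minimal sketch of that wiring follows; the agent, task, and URL below are illustrative assumptions, not part of the patch:

```python
from crewai import Agent, Crew, Task
from crewai_tools import ScrapegraphScrapeTool

# api_key is omitted here; the tool falls back to the SCRAPEGRAPH_API_KEY env var
scrape_tool = ScrapegraphScrapeTool(website_url="https://www.example.com")

researcher = Agent(
    role="Web Researcher",
    goal="Summarize the key points of a landing page",
    backstory="An analyst who turns web pages into structured notes",
    tools=[scrape_tool],
)

summary_task = Task(
    description="Scrape the page and summarize its main heading and offer",
    expected_output="A short bullet-point summary",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[summary_task])
print(crew.kickoff())
```

Because the URL is fixed at construction time, the tool switches to `FixedScrapegraphScrapeToolSchema`, so the agent does not have to supply `website_url` at call time.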
From 7608944e7f0e60f597e39fc2f40fc93fe31c4e28 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 18 Dec 2024 14:38:34 +0100
Subject: [PATCH 2/5] Update README.md

---
 src/crewai_tools/tools/scrapegraph_scrape_tool/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
index 76f385831..03467faee 100644
--- a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
@@ -40,4 +40,4 @@ tool = ScrapegraphScrapeTool(
 - `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable)
 
 ## Environment Variables
-- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key
+- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key, you can buy it [here](https://scrapegraphai.com)
From b58d80dcf9373099ecc1bbc2715b6d042e8396ca Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 18 Dec 2024 14:42:37 +0100
Subject: [PATCH 3/5] update documents according to suggestions

---
 .../tools/scrapegraph_scrape_tool/README.md   | 45 ++++++++++-
 .../scrapegraph_scrape_tool.py                | 73 +++++++++++++++++-
 2 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
index 03467faee..e006c0ff9 100644
--- a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md
@@ -9,7 +9,9 @@
 pip install 'crewai[tools]'
 ```
 
-## Example
+## Example Usage
+
+### Basic Usage
 ```python
 from crewai_tools import ScrapegraphScrapeTool
 
 # Basic usage with API key
 tool = ScrapegraphScrapeTool(api_key="your_api_key")
 result = tool.run(
     website_url="https://www.example.com",
     user_prompt="Extract the main heading and summary"
 )
+```
 
+### Fixed Website URL
+```python
 # Initialize with a fixed website URL
 tool = ScrapegraphScrapeTool(
     website_url="https://www.example.com",
     api_key="your_api_key"
 )
 result = tool.run()
+```
 
+### Custom Prompt
+```python
 # With custom prompt
 tool = ScrapegraphScrapeTool(
     api_key="your_api_key",
     user_prompt="Extract all product prices and descriptions"
 )
+result = tool.run(website_url="https://www.example.com")
+```
+
+### Error Handling
+```python
+try:
+    tool = ScrapegraphScrapeTool(api_key="your_api_key")
+    result = tool.run(
+        website_url="https://www.example.com",
+        user_prompt="Extract the main heading"
+    )
+except ValueError as e:
+    print(f"Configuration error: {e}")  # Handles invalid URLs or missing API keys
+except RuntimeError as e:
+    print(f"Scraping error: {e}")  # Handles API or network errors
 ```
 
 ## Arguments
 - `website_url`: The URL of the website to scrape (required if not set during initialization)
 - `user_prompt`: Custom instructions for content extraction (optional)
 - `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable)
 
 ## Environment Variables
-- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key, you can buy it [here](https://scrapegraphai.com)
+- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key; you can obtain one [here](https://scrapegraphai.com)
+
+## Rate Limiting
+The Scrapegraph API has rate limits that vary based on your subscription plan. Consider the following best practices:
+- Implement appropriate delays between requests when processing multiple URLs
+- Handle rate limit errors gracefully in your application
+- Check your API plan limits on the Scrapegraph dashboard
+
+## Error Handling
+The tool may raise the following exceptions:
+- `ValueError`: When API key is missing or URL format is invalid
+- `RuntimeError`: When scraping operation fails (network issues, API errors)
+- `RateLimitError`: When API rate limits are exceeded
+
+## Best Practices
+1. Always validate URLs before making requests
+2. Implement proper error handling as shown in examples
+3. Consider caching results for frequently accessed pages
+4. Monitor your API usage through the Scrapegraph dashboard
diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
index 058af4150..906bf6376 100644
--- a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
+++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py
@@ -1,15 +1,25 @@
 import os
 from typing import Any, Optional, Type
+from urllib.parse import urlparse
 
 from crewai.tools import BaseTool
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator
 from scrapegraph_py import Client
 from scrapegraph_py.logger import sgai_logger
 
 
+class ScrapegraphError(Exception):
+    """Base exception for Scrapegraph-related errors"""
+    pass
+
+
+class RateLimitError(ScrapegraphError):
+    """Raised when API rate limits are exceeded"""
+    pass
+
+
 class FixedScrapegraphScrapeToolSchema(BaseModel):
     """Input for ScrapegraphScrapeTool when website_url is fixed."""
-
     pass
 
 
@@ -22,8 +32,28 @@ class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
         description="Prompt to guide the extraction of content",
     )
 
+    @validator('website_url')
+    def validate_url(cls, v):
+        """Validate URL format"""
+        try:
+            result = urlparse(v)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError
+            return v
+        except Exception:
+            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
+
 
 class ScrapegraphScrapeTool(BaseTool):
+    """
+    A tool that uses Scrapegraph AI to intelligently scrape website content.
+
+    Raises:
+        ValueError: If API key is missing or URL format is invalid
+        RateLimitError: If API rate limits are exceeded
+        RuntimeError: If scraping operation fails
+    """
+
     name: str = "Scrapegraph website scraper"
     description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
     args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
@@ -45,6 +75,7 @@ class ScrapegraphScrapeTool(BaseTool):
             raise ValueError("Scrapegraph API key is required")
 
         if website_url is not None:
+            self._validate_url(website_url)
             self.website_url = website_url
             self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
             self.args_schema = FixedScrapegraphScrapeToolSchema
@@ -55,6 +86,32 @@ class ScrapegraphScrapeTool(BaseTool):
         # Configure logging
         sgai_logger.set_logging(level="INFO")
 
+    @staticmethod
+    def _validate_url(url: str) -> None:
+        """Validate URL format"""
+        try:
+            result = urlparse(url)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError
+        except Exception:
+            raise ValueError("Invalid URL format. URL must include scheme (http/https) and domain")
+
+    def _handle_api_response(self, response: dict) -> str:
+        """Handle and validate API response"""
+        if not response:
+            raise RuntimeError("Empty response from Scrapegraph API")
+
+        if "error" in response:
+            error_msg = response.get("error", {}).get("message", "Unknown error")
+            if "rate limit" in error_msg.lower():
+                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
+            raise RuntimeError(f"API error: {error_msg}")
+
+        if "result" not in response:
+            raise RuntimeError("Invalid response format from Scrapegraph API")
+
+        return response["result"]
+
     def _run(
         self,
         **kwargs: Any,
@@ -65,6 +122,9 @@ class ScrapegraphScrapeTool(BaseTool):
     ) -> Any:
         website_url = kwargs.get("website_url", self.website_url)
         user_prompt = kwargs.get("user_prompt", self.user_prompt) or "Extract the main content of the webpage"
 
         if not website_url:
             raise ValueError("website_url is required")
 
+        # Validate URL format
+        self._validate_url(website_url)
+
         # Initialize the client
         sgai_client = Client(api_key=self.api_key)
 
@@ -75,8 +135,13 @@ class ScrapegraphScrapeTool(BaseTool):
         try:
             # Make the SmartScraper request
             response = sgai_client.smartscraper(
                 website_url=website_url,
                 user_prompt=user_prompt,
             )
 
-            # Return the result
-            return response["result"]
+            # Handle and validate the response
+            return self._handle_api_response(response)
+
+        except RateLimitError:
+            raise  # Re-raise rate limit errors
+        except Exception as e:
+            raise RuntimeError(f"Scraping failed: {str(e)}")
         finally:
             # Always close the client
             sgai_client.close()
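The README's new Rate Limiting section advises delays between requests and graceful handling of rate-limit errors, but stops short of code. One way to apply that advice when scraping several URLs is sketched below; the helper name, fixed backoff, and retry count are illustrative assumptions, and `RateLimitError` is imported from the module path this patch creates:

```python
import time

from crewai_tools import ScrapegraphScrapeTool
from crewai_tools.tools.scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
    RateLimitError,
)


def scrape_many(urls, prompt, delay_seconds=2.0, max_retries=3):
    """Scrape each URL with a pause between requests and a simple retry on rate limits."""
    tool = ScrapegraphScrapeTool()  # falls back to SCRAPEGRAPH_API_KEY from the environment
    results = {}
    for url in urls:
        for attempt in range(max_retries):
            try:
                results[url] = tool.run(website_url=url, user_prompt=prompt)
                break
            except RateLimitError:
                # Linear backoff before retrying a rate-limited request
                time.sleep(delay_seconds * (attempt + 1))
        time.sleep(delay_seconds)  # space out consecutive URLs
    return results
```

URLs whose retries are exhausted are simply absent from the returned dict; a production version would log or re-raise instead.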
From 029afd3e145030ed6a0d0141a899beaa75311099 Mon Sep 17 00:00:00 2001
From: João Moura
Date: Sun, 29 Dec 2024 12:23:08 -0300
Subject: [PATCH 5/5] Update __init__.py

---
 src/crewai_tools/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py
index 87aca8531..65a90a01b 100644
--- a/src/crewai_tools/__init__.py
+++ b/src/crewai_tools/__init__.py
@@ -26,6 +26,8 @@ from .tools import (
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
+    ScrapegraphScrapeTool,
+    ScrapegraphScrapeToolSchema,
     ScrapeWebsiteTool,
     ScrapflyScrapeWebsiteTool,
     SeleniumScrapingTool,
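With the last two patches applied, both names are importable from the package root, and the schema can be exercised without an API key, since validation runs before any network call. A quick smoke test of the exports (a sketch, not part of the patch series):

```python
from crewai_tools import ScrapegraphScrapeTool, ScrapegraphScrapeToolSchema

# The pydantic validator added in PATCH 3 runs on construction,
# so a malformed website_url fails fast with ValueError
ScrapegraphScrapeToolSchema(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading",
)
print("exports and URL validation OK")
```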