diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md new file mode 100644 index 000000000..76f385831 --- /dev/null +++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/README.md @@ -0,0 +1,43 @@ +# ScrapegraphScrapeTool + +## Description +A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. This tool provides advanced web scraping capabilities with AI-powered content extraction, making it ideal for targeted data collection and content analysis tasks. + +## Installation +Install the required packages: +```shell +pip install 'crewai[tools]' +``` + +## Example +```python +from crewai_tools import ScrapegraphScrapeTool + +# Basic usage with API key +tool = ScrapegraphScrapeTool(api_key="your_api_key") +result = tool.run( + website_url="https://www.example.com", + user_prompt="Extract the main heading and summary" +) + +# Initialize with a fixed website URL +tool = ScrapegraphScrapeTool( + website_url="https://www.example.com", + api_key="your_api_key" +) +result = tool.run() + +# With custom prompt +tool = ScrapegraphScrapeTool( + api_key="your_api_key", + user_prompt="Extract all product prices and descriptions" +) +``` + +## Arguments +- `website_url`: The URL of the website to scrape (required if not set during initialization) +- `user_prompt`: Custom instructions for content extraction (optional) +- `api_key`: Your Scrapegraph API key (required, can be set via SCRAPEGRAPH_API_KEY environment variable) + +## Environment Variables +- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key diff --git a/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py new file mode 100644 index 000000000..058af4150 --- /dev/null +++ b/src/crewai_tools/tools/scrapegraph_scrape_tool/scrapegraph_scrape_tool.py @@ -0,0 +1,82 @@ +import os +from typing import Any, Optional, 
Type

from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger

# Single source of truth for the fallback extraction prompt. Previously this
# string was duplicated in the schema default and in `_run`, which risked the
# two copies drifting apart.
_DEFAULT_USER_PROMPT = "Extract the main content of the webpage"


class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Input for ScrapegraphScrapeTool when website_url is fixed.

    Intentionally empty: when the URL (and optionally the prompt) are bound
    at construction time, the tool needs no per-call arguments.
    """

    pass


class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
    """Input for ScrapegraphScrapeTool when the URL is supplied per call."""

    website_url: str = Field(..., description="Mandatory website url to scrape")
    user_prompt: str = Field(
        default=_DEFAULT_USER_PROMPT,
        description="Prompt to guide the extraction of content",
    )


class ScrapegraphScrapeTool(BaseTool):
    """Scrape website content via Scrapegraph AI's SmartScraper API.

    The API key may be passed explicitly or read from the
    ``SCRAPEGRAPH_API_KEY`` environment variable; a missing key raises
    ``ValueError`` at construction time. If ``website_url`` is given to the
    constructor, the per-call schema collapses to the argument-free
    ``FixedScrapegraphScrapeToolSchema``.
    """

    name: str = "Scrapegraph website scraper"
    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
    # Optional values fixed at construction time; per-call kwargs override them.
    website_url: Optional[str] = None
    user_prompt: Optional[str] = None
    api_key: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        user_prompt: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs,
    ):
        """Initialize the tool.

        Args:
            website_url: Optional fixed target URL; when set, callers need not
                (and cannot, via the schema) pass a URL per call.
            user_prompt: Optional fixed extraction prompt.
            api_key: Scrapegraph API key; falls back to the
                ``SCRAPEGRAPH_API_KEY`` environment variable.

        Raises:
            ValueError: If no API key is available from either source.
        """
        super().__init__(**kwargs)
        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")

        if not self.api_key:
            raise ValueError("Scrapegraph API key is required")

        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            # URL is bound, so swap in the argument-free schema.
            self.args_schema = FixedScrapegraphScrapeToolSchema

        if user_prompt is not None:
            self.user_prompt = user_prompt

        # Configure logging
        sgai_logger.set_logging(level="INFO")

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        """Execute a SmartScraper request and return its result payload.

        Keyword Args:
            website_url: Target URL; falls back to the value fixed at init.
            user_prompt: Extraction instructions; falls back to the init
                value, then to the module-level default prompt.

        Returns:
            The ``"result"`` entry of the SmartScraper response.

        Raises:
            ValueError: If no website URL is available from either source.
        """
        website_url = kwargs.get("website_url", self.website_url)
        # `or` also covers an explicit empty/None prompt passed per call.
        user_prompt = kwargs.get("user_prompt", self.user_prompt) or _DEFAULT_USER_PROMPT

        if not website_url:
            raise ValueError("website_url is required")

        # Initialize the client
        sgai_client = Client(api_key=self.api_key)

        try:
            # Make the SmartScraper request
            response = sgai_client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
            )

            # Return the result
            return response["result"]
        finally:
            # Always close the client so the underlying session is released
            # even when the request raises.
            sgai_client.close()