feat: integration of scrapegraph APIs

Marco Vinciguerra
2024-12-18 14:34:40 +01:00
parent a49be2fc52
commit c070ba002c
2 changed files with 125 additions and 0 deletions


@@ -0,0 +1,43 @@
# ScrapegraphScrapeTool

## Description

A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. This tool provides advanced web scraping capabilities with AI-powered content extraction, making it ideal for targeted data collection and content analysis tasks.

## Installation

Install the required packages:

```shell
pip install 'crewai[tools]'
```
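
The tool's implementation also imports the `scrapegraph-py` SDK (see the source below). If it is not already pulled in as a dependency of `crewai[tools]`, it can be installed separately:

```shell
pip install scrapegraph-py
```
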
## Example
```python
from crewai_tools import ScrapegraphScrapeTool

# Basic usage with an API key
tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary",
)

# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key",
)
result = tool.run()

# With a custom extraction prompt
tool = ScrapegraphScrapeTool(
    api_key="your_api_key",
    user_prompt="Extract all product prices and descriptions",
)
result = tool.run(website_url="https://www.example.com")
```
## Arguments

- `website_url`: The URL of the website to scrape (required if not set during initialization)
- `user_prompt`: Custom instructions for content extraction (optional)
- `api_key`: Your Scrapegraph API key (required; can also be supplied via the `SCRAPEGRAPH_API_KEY` environment variable)

## Environment Variables

- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key
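
If `SCRAPEGRAPH_API_KEY` is exported, the constructor falls back to it and `api_key` can be omitted. A minimal sketch, assuming the variable is already set in your shell:

```python
import os

from crewai_tools import ScrapegraphScrapeTool

# The tool reads os.getenv("SCRAPEGRAPH_API_KEY") when no api_key is passed
# (see the implementation below); it raises ValueError if neither is available.
assert os.getenv("SCRAPEGRAPH_API_KEY"), "export SCRAPEGRAPH_API_KEY first"

tool = ScrapegraphScrapeTool()
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary",
)
print(result)
```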


@@ -0,0 +1,82 @@
import os
from typing import Any, Optional, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger


class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Input for ScrapegraphScrapeTool when website_url is fixed."""

    pass


class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
    """Input for ScrapegraphScrapeTool."""

    website_url: str = Field(..., description="Mandatory website url to scrape")
    user_prompt: str = Field(
        default="Extract the main content of the webpage",
        description="Prompt to guide the extraction of content",
    )


class ScrapegraphScrapeTool(BaseTool):
    name: str = "Scrapegraph website scraper"
    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
    website_url: Optional[str] = None
    user_prompt: Optional[str] = None
    api_key: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        user_prompt: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")
        if not self.api_key:
            raise ValueError("Scrapegraph API key is required")

        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            self.args_schema = FixedScrapegraphScrapeToolSchema

        if user_prompt is not None:
            self.user_prompt = user_prompt

        # Configure logging
        sgai_logger.set_logging(level="INFO")

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        user_prompt = (
            kwargs.get("user_prompt", self.user_prompt)
            or "Extract the main content of the webpage"
        )

        if not website_url:
            raise ValueError("website_url is required")

        # Initialize the client
        sgai_client = Client(api_key=self.api_key)

        try:
            # Make the SmartScraper request
            response = sgai_client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
            )

            # Return the result
            return response["result"]
        finally:
            # Always close the client
            sgai_client.close()
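
For context, a minimal sketch of how the new tool could be wired into a crew. Only `ScrapegraphScrapeTool` is part of this commit; the agent, task, and target URL below are illustrative assumptions using the standard `crewai` `Agent`/`Task`/`Crew` API.

```python
from crewai import Agent, Crew, Task
from crewai_tools import ScrapegraphScrapeTool

# Illustrative only: the agent/task wording and the URL are placeholders,
# not part of this commit.
scrape_tool = ScrapegraphScrapeTool(api_key="your_api_key")

researcher = Agent(
    role="Web Researcher",
    goal="Collect structured information from web pages",
    backstory="An analyst who extracts concise facts from websites.",
    tools=[scrape_tool],
)

task = Task(
    description=(
        "Scrape https://www.example.com and report the main heading "
        "and a one-paragraph summary."
    ),
    expected_output="A short summary of the page.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())
```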