mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-10 00:28:31 +00:00)

feat: integration of scrapegraph APIs

src/crewai_tools/tools/scrapegraph_scrape_tool/README.md (new file, 43 lines)
@@ -0,0 +1,43 @@
# ScrapegraphScrapeTool

## Description

A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. This tool provides advanced web scraping capabilities with AI-powered content extraction, making it ideal for targeted data collection and content analysis tasks.

## Installation

Install the required packages:

```shell
pip install 'crewai[tools]'
```

The tool also requires the `scrapegraph-py` SDK (imported as `scrapegraph_py`); install it with `pip install scrapegraph-py` if your environment does not already provide it.

## Example

```python
from crewai_tools import ScrapegraphScrapeTool

# Basic usage with an API key
tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary",
)

# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key",
)
result = tool.run()

# With a custom extraction prompt
tool = ScrapegraphScrapeTool(
    api_key="your_api_key",
    user_prompt="Extract all product prices and descriptions",
)
```
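
The tool can also be handed to an agent. Below is a minimal sketch using crewAI's standard `Agent`, `Task`, and `Crew` primitives; the role, goal, and task strings are illustrative only:

```python
from crewai import Agent, Crew, Task
from crewai_tools import ScrapegraphScrapeTool

# Fixed-URL mode: the agent-facing tool takes no arguments
scrape_tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key",
)

researcher = Agent(
    role="Web researcher",
    goal="Summarize the target page",
    backstory="An analyst who extracts structured facts from web pages.",
    tools=[scrape_tool],
)

task = Task(
    description="Extract the main heading and summary of the page.",
    expected_output="A short summary of the page content.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
result = crew.kickoff()
```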

## Arguments

- `website_url`: The URL of the website to scrape (required if not set during initialization)
- `user_prompt`: Custom instructions for content extraction (optional)
- `api_key`: Your Scrapegraph API key (required; it can also be supplied via the `SCRAPEGRAPH_API_KEY` environment variable)

## Environment Variables

- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key
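
The key can also come from the environment: when no `api_key` argument is given, the tool falls back to `SCRAPEGRAPH_API_KEY` at construction time. A minimal sketch (the key value is a placeholder):

```python
import os

from crewai_tools import ScrapegraphScrapeTool

# Placeholder; in practice, export SCRAPEGRAPH_API_KEY in your shell
os.environ["SCRAPEGRAPH_API_KEY"] = "your_api_key"

# No api_key argument needed; the constructor reads the environment variable
tool = ScrapegraphScrapeTool()
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary",
)
```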
@@ -0,0 +1,82 @@
import os
from typing import Any, Optional, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger


class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Input for ScrapegraphScrapeTool when website_url is fixed."""

    pass


class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
    """Input for ScrapegraphScrapeTool."""

    website_url: str = Field(..., description="Mandatory website url to scrape")
    user_prompt: str = Field(
        default="Extract the main content of the webpage",
        description="Prompt to guide the extraction of content",
    )
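
# Design note: the empty "fixed" schema above gives the tool a no-argument
# interface when it is constructed with a preset website_url; __init__ below
# swaps args_schema to FixedScrapegraphScrapeToolSchema in that case.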


class ScrapegraphScrapeTool(BaseTool):
    name: str = "Scrapegraph website scraper"
    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
    website_url: Optional[str] = None
    user_prompt: Optional[str] = None
    api_key: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        user_prompt: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")

        if not self.api_key:
            raise ValueError("Scrapegraph API key is required")

        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            # A fixed URL means the agent no longer needs to supply one
            self.args_schema = FixedScrapegraphScrapeToolSchema

        if user_prompt is not None:
            self.user_prompt = user_prompt

        # Configure logging
        sgai_logger.set_logging(level="INFO")

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        user_prompt = (
            kwargs.get("user_prompt", self.user_prompt)
            or "Extract the main content of the webpage"
        )

        if not website_url:
            raise ValueError("website_url is required")

        # Initialize the client
        sgai_client = Client(api_key=self.api_key)

        try:
            # Make the SmartScraper request
            response = sgai_client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
            )

            # Return the result
            return response["result"]
        finally:
            # Always close the client
            sgai_client.close()