feat: integration of scrapegraph APIs

Marco Vinciguerra
2024-12-18 14:34:40 +01:00
parent a49be2fc52
commit c070ba002c
2 changed files with 125 additions and 0 deletions


@@ -0,0 +1,43 @@
# ScrapegraphScrapeTool

## Description

A tool that leverages Scrapegraph AI's SmartScraper API to intelligently extract content from websites. This tool provides advanced web scraping capabilities with AI-powered content extraction, making it ideal for targeted data collection and content analysis tasks.

## Installation

Install the required packages:

```shell
pip install 'crewai[tools]'
```
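
The tool's implementation also imports the `scrapegraph-py` SDK (see the source below). If it is not already pulled in as a dependency of `crewai[tools]`, it can be installed separately:

```shell
pip install scrapegraph-py
```
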
## Example
```python
from crewai_tools import ScrapegraphScrapeTool

# Basic usage with an API key
tool = ScrapegraphScrapeTool(api_key="your_api_key")
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary",
)

# Initialize with a fixed website URL
tool = ScrapegraphScrapeTool(
    website_url="https://www.example.com",
    api_key="your_api_key",
)
result = tool.run()

# With a custom extraction prompt
tool = ScrapegraphScrapeTool(
    api_key="your_api_key",
    user_prompt="Extract all product prices and descriptions",
)
result = tool.run(website_url="https://www.example.com")
```
## Arguments

- `website_url`: The URL of the website to scrape (required if not set during initialization)
- `user_prompt`: Custom instructions for content extraction (optional)
- `api_key`: Your Scrapegraph API key (required; can also be supplied via the `SCRAPEGRAPH_API_KEY` environment variable)

## Environment Variables

- `SCRAPEGRAPH_API_KEY`: Your Scrapegraph API key
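
If `SCRAPEGRAPH_API_KEY` is exported, the constructor falls back to it and `api_key` can be omitted. A minimal sketch, assuming the variable is already set in your shell:

```python
import os

from crewai_tools import ScrapegraphScrapeTool

# The tool reads os.getenv("SCRAPEGRAPH_API_KEY") when no api_key is passed
# (see the implementation below); it raises ValueError if neither is available.
assert os.getenv("SCRAPEGRAPH_API_KEY"), "export SCRAPEGRAPH_API_KEY first"

tool = ScrapegraphScrapeTool()
result = tool.run(
    website_url="https://www.example.com",
    user_prompt="Extract the main heading and summary",
)
print(result)
```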


@@ -0,0 +1,82 @@
import os
from typing import Any, Optional, Type

from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger


class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Input for ScrapegraphScrapeTool when website_url is fixed."""

    pass


class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
    """Input for ScrapegraphScrapeTool."""

    website_url: str = Field(..., description="Mandatory website url to scrape")
    user_prompt: str = Field(
        default="Extract the main content of the webpage",
        description="Prompt to guide the extraction of content",
    )


class ScrapegraphScrapeTool(BaseTool):
    name: str = "Scrapegraph website scraper"
    description: str = "A tool that uses Scrapegraph AI to intelligently scrape website content."
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
    website_url: Optional[str] = None
    user_prompt: Optional[str] = None
    api_key: Optional[str] = None

    def __init__(
        self,
        website_url: Optional[str] = None,
        user_prompt: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")
        if not self.api_key:
            raise ValueError("Scrapegraph API key is required")

        if website_url is not None:
            self.website_url = website_url
            self.description = f"A tool that uses Scrapegraph AI to intelligently scrape {website_url}'s content."
            self.args_schema = FixedScrapegraphScrapeToolSchema

        if user_prompt is not None:
            self.user_prompt = user_prompt

        # Configure logging
        sgai_logger.set_logging(level="INFO")

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        user_prompt = (
            kwargs.get("user_prompt", self.user_prompt)
            or "Extract the main content of the webpage"
        )

        if not website_url:
            raise ValueError("website_url is required")

        # Initialize the client
        sgai_client = Client(api_key=self.api_key)

        try:
            # Make the SmartScraper request
            response = sgai_client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
            )

            # Return the result
            return response["result"]
        finally:
            # Always close the client
            sgai_client.close()
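
For context, a minimal sketch of how the new tool could be wired into a crew. Only `ScrapegraphScrapeTool` is part of this commit; the agent, task, and target URL below are illustrative assumptions using the standard `crewai` `Agent`/`Task`/`Crew` API.

```python
from crewai import Agent, Crew, Task
from crewai_tools import ScrapegraphScrapeTool

# Illustrative only: the agent/task wording and the URL are placeholders,
# not part of this commit.
scrape_tool = ScrapegraphScrapeTool(api_key="your_api_key")

researcher = Agent(
    role="Web Researcher",
    goal="Collect structured information from web pages",
    backstory="An analyst who extracts concise facts from websites.",
    tools=[scrape_tool],
)

task = Task(
    description=(
        "Scrape https://www.example.com and report the main heading "
        "and a one-paragraph summary."
    ),
    expected_output="A short summary of the page.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())
```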