import os
from typing import TYPE_CHECKING, Any, List, Optional, Type
from urllib.parse import urlparse

from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, ConfigDict, Field, field_validator

# Type checking import
if TYPE_CHECKING:
    from scrapegraph_py import Client


class ScrapegraphError(Exception):
    """Base exception for Scrapegraph-related errors"""


class RateLimitError(ScrapegraphError):
    """Raised when API rate limits are exceeded"""


class FixedScrapegraphScrapeToolSchema(BaseModel):
    """Input for ScrapegraphScrapeTool when website_url is fixed."""


class ScrapegraphScrapeToolSchema(FixedScrapegraphScrapeToolSchema):
    """Input for ScrapegraphScrapeTool."""

    website_url: str = Field(..., description="Mandatory website URL to scrape")
    user_prompt: str = Field(
        default="Extract the main content of the webpage",
        description="Prompt to guide the extraction of content",
    )

    @field_validator("website_url")
    def validate_url(cls, v):
        """Validate URL format"""
        try:
            result = urlparse(v)
            if not all([result.scheme, result.netloc]):
                raise ValueError
            return v
        except Exception:
            raise ValueError(
                "Invalid URL format. URL must include scheme (http/https) and domain"
            )
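
# Illustrative validator behavior (a sketch, assuming standard pydantic v2
# semantics; not exercised at import time):
#   ScrapegraphScrapeToolSchema(website_url="https://example.com")  # accepted
#   ScrapegraphScrapeToolSchema(website_url="example.com")  # raises ValidationError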


class ScrapegraphScrapeTool(BaseTool):
    """
    A tool that uses Scrapegraph AI to intelligently scrape website content.

    Raises:
        ValueError: If API key is missing or URL format is invalid
        RateLimitError: If API rate limits are exceeded
        RuntimeError: If scraping operation fails
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = "Scrapegraph website scraper"
    description: str = (
        "A tool that uses Scrapegraph AI to intelligently scrape website content."
    )
    args_schema: Type[BaseModel] = ScrapegraphScrapeToolSchema
    website_url: Optional[str] = None
    user_prompt: Optional[str] = None
    api_key: Optional[str] = None
    enable_logging: bool = False
    _client: Optional["Client"] = None
    package_dependencies: List[str] = ["scrapegraph-py"]
    env_vars: List[EnvVar] = [
        EnvVar(
            name="SCRAPEGRAPH_API_KEY",
            description="API key for Scrapegraph AI services",
            required=False,
        ),
    ]

    def __init__(
        self,
        website_url: Optional[str] = None,
        user_prompt: Optional[str] = None,
        api_key: Optional[str] = None,
        enable_logging: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        try:
            from scrapegraph_py import Client
            from scrapegraph_py.logger import sgai_logger
        except ImportError:
            import click

            if click.confirm(
                "You are missing the 'scrapegraph-py' package. Would you like to install it?"
            ):
                import subprocess

                subprocess.run(["uv", "add", "scrapegraph-py"], check=True)
                from scrapegraph_py import Client
                from scrapegraph_py.logger import sgai_logger
            else:
                raise ImportError(
                    "`scrapegraph-py` package not found, please run `uv add scrapegraph-py`"
                )

        # Validate the API key before constructing the client
        self.api_key = api_key or os.getenv("SCRAPEGRAPH_API_KEY")
        if not self.api_key:
            raise ValueError("Scrapegraph API key is required")
        self._client = Client(api_key=self.api_key)

        if website_url is not None:
            self._validate_url(website_url)
            self.website_url = website_url
            self.description = (
                "A tool that uses Scrapegraph AI to intelligently scrape "
                f"{website_url}'s content."
            )
            self.args_schema = FixedScrapegraphScrapeToolSchema

        if user_prompt is not None:
            self.user_prompt = user_prompt

        # Configure logging only if enabled
        self.enable_logging = enable_logging
        if self.enable_logging:
            sgai_logger.set_logging(level="INFO")

    @staticmethod
    def _validate_url(url: str) -> None:
        """Validate URL format"""
        try:
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError
        except Exception:
            raise ValueError(
                "Invalid URL format. URL must include scheme (http/https) and domain"
            )

    def _handle_api_response(self, response: dict) -> str:
        """Handle and validate API response"""
        if not response:
            raise RuntimeError("Empty response from Scrapegraph API")

        if "error" in response:
            error_msg = response.get("error", {}).get("message", "Unknown error")
            if "rate limit" in error_msg.lower():
                raise RateLimitError(f"Rate limit exceeded: {error_msg}")
            raise RuntimeError(f"API error: {error_msg}")

        if "result" not in response:
            raise RuntimeError("Invalid response format from Scrapegraph API")

        return response["result"]
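
    # Response shapes assumed by _handle_api_response (inferred from its checks
    # above, not from a documented API contract):
    #   success: {"result": ...}
    #   failure: {"error": {"message": "..."}}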

    def _run(
        self,
        **kwargs: Any,
    ) -> Any:
        website_url = kwargs.get("website_url", self.website_url)
        user_prompt = (
            kwargs.get("user_prompt", self.user_prompt)
            or "Extract the main content of the webpage"
        )

        if not website_url:
            raise ValueError("website_url is required")

        # Validate URL format
        self._validate_url(website_url)

        try:
            # Make the SmartScraper request
            response = self._client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
            )

            # Validate the response so rate-limit errors surface as RateLimitError
            return self._handle_api_response(response)

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except Exception as e:
            raise RuntimeError(f"Scraping failed: {e}") from e
        finally:
            # Always close the client
            self._client.close()
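

# Minimal usage sketch (assumes a valid SCRAPEGRAPH_API_KEY in the environment
# and network access; `run` is the public entry point provided by crewai's
# BaseTool, which dispatches to `_run` above):
#
#     tool = ScrapegraphScrapeTool(website_url="https://example.com")
#     result = tool.run(user_prompt="Extract the main heading")
#     print(result)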