feat: add SerperScrapeWebsiteTool for extracting clean content from URLs (#392)

* feat: add SerperScrapeWebsiteTool for extracting clean content from URLs

* feat: add required SERPER_API_KEY env var validation to SerperScrapeWebsiteTool
Author: Mike Plachta
Date: 2025-07-23 10:22:47 -07:00
Committed by: GitHub
Parent: c3e87fc31f
Commit: 104485d18b

3 changed files with 82 additions and 0 deletions

crewai_tools/__init__.py

@@ -61,6 +61,7 @@ from .tools import (
SerpApiGoogleSearchTool,
SerpApiGoogleShoppingTool,
SerperDevTool,
SerperScrapeWebsiteTool,
SerplyJobSearchTool,
SerplyNewsSearchTool,
SerplyScholarSearchTool,

crewai_tools/tools/__init__.py

@@ -74,6 +74,7 @@ from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
from .serpapi_tool.serpapi_google_search_tool import SerpApiGoogleSearchTool
from .serpapi_tool.serpapi_google_shopping_tool import SerpApiGoogleShoppingTool
from .serper_dev_tool.serper_dev_tool import SerperDevTool
from .serper_scrape_website_tool.serper_scrape_website_tool import SerperScrapeWebsiteTool
from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
from .serply_api_tool.serply_news_search_tool import SerplyNewsSearchTool
from .serply_api_tool.serply_scholar_search_tool import SerplyScholarSearchTool

crewai_tools/tools/serper_scrape_website_tool/serper_scrape_website_tool.py

@@ -0,0 +1,80 @@
import json
import os
from typing import List, Type

import requests
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, Field


class SerperScrapeWebsiteInput(BaseModel):
    """Input schema for SerperScrapeWebsiteTool."""

    url: str = Field(..., description="The URL of the website to scrape")
    include_markdown: bool = Field(
        default=True,
        description="Whether to include markdown formatting in the scraped content",
    )


class SerperScrapeWebsiteTool(BaseTool):
    name: str = "serper_scrape_website"
    description: str = (
        "Scrapes website content using Serper's scraping API. "
        "This tool can extract clean, readable content from any website URL, "
        "optionally including markdown formatting for better structure."
    )
    args_schema: Type[BaseModel] = SerperScrapeWebsiteInput
    env_vars: List[EnvVar] = [
        EnvVar(name="SERPER_API_KEY", description="API key for Serper", required=True),
    ]
    def _run(self, url: str, include_markdown: bool = True) -> str:
        """Scrape website content using the Serper API.

        Args:
            url: The URL to scrape.
            include_markdown: Whether to include markdown formatting.

        Returns:
            Scraped website content as a string.
        """
        try:
            # Serper's dedicated scraping endpoint
            api_url = "https://scrape.serper.dev"

            # Read the API key from the environment rather than hard-coding it
            api_key = os.getenv("SERPER_API_KEY")
            if not api_key:
                return "Error: SERPER_API_KEY environment variable is not set."

            # Prepare the request payload
            payload = json.dumps({
                "url": url,
                "includeMarkdown": include_markdown,
            })

            # Set headers
            headers = {
                "X-API-KEY": api_key,
                "Content-Type": "application/json",
            }

            # Make the API request; the timeout keeps a stalled request from hanging
            response = requests.post(api_url, headers=headers, data=payload, timeout=30)

            # Check whether the request was successful
            if response.status_code == 200:
                result = response.json()
                # The scraped page text is returned under the "text" key
                if "text" in result:
                    return result["text"]
                else:
                    return f"Successfully scraped {url}, but no text content found in response: {response.text}"
            else:
                return f"Error scraping {url}: HTTP {response.status_code} - {response.text}"
        except requests.exceptions.RequestException as e:
            return f"Network error while scraping {url}: {str(e)}"
        except json.JSONDecodeError as e:
            return f"Error parsing JSON response while scraping {url}: {str(e)}"
        except Exception as e:
            return f"Unexpected error while scraping {url}: {str(e)}"