mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-08 15:48:29 +00:00
feat: add SerperScrapeWebsiteTool for extracting clean content from URLs (#392)
* feat: add SerperScrapeWebsiteTool for extracting clean content from URLs * feat: add required SERPER_API_KEY env var validation to SerperScrapeWebsiteTool
This commit is contained in:
@@ -61,6 +61,7 @@ from .tools import (
|
||||
SerpApiGoogleSearchTool,
|
||||
SerpApiGoogleShoppingTool,
|
||||
SerperDevTool,
|
||||
SerperScrapeWebsiteTool,
|
||||
SerplyJobSearchTool,
|
||||
SerplyNewsSearchTool,
|
||||
SerplyScholarSearchTool,
|
||||
|
||||
@@ -74,6 +74,7 @@ from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
|
||||
from .serpapi_tool.serpapi_google_search_tool import SerpApiGoogleSearchTool
|
||||
from .serpapi_tool.serpapi_google_shopping_tool import SerpApiGoogleShoppingTool
|
||||
from .serper_dev_tool.serper_dev_tool import SerperDevTool
|
||||
from .serper_scrape_website_tool.serper_scrape_website_tool import SerperScrapeWebsiteTool
|
||||
from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
|
||||
from .serply_api_tool.serply_news_search_tool import SerplyNewsSearchTool
|
||||
from .serply_api_tool.serply_scholar_search_tool import SerplyScholarSearchTool
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
from crewai.tools import BaseTool, EnvVar
|
||||
from typing import Type, List
|
||||
from pydantic import BaseModel, Field
|
||||
import requests
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
class SerperScrapeWebsiteInput(BaseModel):
    """Input schema for SerperScrapeWebsite."""

    # Target page to fetch; passed through verbatim to the Serper scrape API.
    url: str = Field(..., description="The URL of the website to scrape")
    # Forwarded to the API as "includeMarkdown"; defaults to markdown output.
    include_markdown: bool = Field(
        default=True,
        description="Whether to include markdown formatting in the scraped content"
    )
|
||||
|
||||
|
||||
class SerperScrapeWebsiteTool(BaseTool):
    """Extract clean, readable content from a URL via Serper's scraping API.

    Requires the ``SERPER_API_KEY`` environment variable. All failures are
    reported as descriptive strings rather than raised, so the agent loop
    can surface them to the LLM.
    """

    name: str = "serper_scrape_website"
    description: str = (
        "Scrapes website content using Serper's scraping API. "
        "This tool can extract clean, readable content from any website URL, "
        "optionally including markdown formatting for better structure."
    )
    args_schema: Type[BaseModel] = SerperScrapeWebsiteInput
    env_vars: List[EnvVar] = [
        EnvVar(name="SERPER_API_KEY", description="API key for Serper", required=True),
    ]

    def _run(self, url: str, include_markdown: bool = True) -> str:
        """
        Scrape website content using Serper API.

        Args:
            url: The URL to scrape
            include_markdown: Whether to include markdown formatting

        Returns:
            Scraped website content as a string, or a human-readable error
            message on failure (this method never raises).
        """
        try:
            # Serper API endpoint
            api_url = "https://scrape.serper.dev"

            # Get API key from environment variable for security.
            # The EnvVar(required=True) declaration above is metadata only;
            # it is not enforced here, so fail fast with a clear message
            # instead of sending a None header (which requests rejects).
            api_key = os.getenv('SERPER_API_KEY')
            if not api_key:
                return (
                    f"Error scraping {url}: SERPER_API_KEY environment "
                    f"variable is not set"
                )

            # Prepare the payload
            payload = json.dumps({
                "url": url,
                "includeMarkdown": include_markdown
            })

            # Set headers
            headers = {
                'X-API-KEY': api_key,
                'Content-Type': 'application/json'
            }

            # Make the API request. A timeout bounds the call so a stalled
            # server cannot hang the agent indefinitely; the resulting
            # Timeout is a RequestException and is handled below.
            response = requests.post(
                api_url, headers=headers, data=payload, timeout=60
            )

            # Check if request was successful
            if response.status_code == 200:
                result = response.json()

                # Extract the scraped content
                if 'text' in result:
                    return result['text']
                else:
                    return f"Successfully scraped {url}, but no text content found in response: {response.text}"
            else:
                return f"Error scraping {url}: HTTP {response.status_code} - {response.text}"

        except requests.exceptions.RequestException as e:
            return f"Network error while scraping {url}: {str(e)}"
        except json.JSONDecodeError as e:
            return f"Error parsing JSON response while scraping {url}: {str(e)}"
        except Exception as e:
            return f"Unexpected error while scraping {url}: {str(e)}"
|
||||
Reference in New Issue
Block a user