Improve Serper and Firecrawl tools

This commit is contained in:
Brandon Hancock
2025-01-08 14:56:12 -05:00
parent 4388235846
commit e5aabe05e1
4 changed files with 71 additions and 66 deletions

View File

@@ -1,9 +1,8 @@
import os
from typing import TYPE_CHECKING, Any, Dict, Optional, Type
from pydantic import BaseModel, ConfigDict, Field
from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field
# Type checking import
if TYPE_CHECKING:
@@ -12,6 +11,14 @@ if TYPE_CHECKING:
class FirecrawlCrawlWebsiteToolSchema(BaseModel):
    # Argument schema used by FirecrawlCrawlWebsiteTool (its `args_schema`).
    # Documented with comments rather than a class docstring so the model's
    # generated schema description stays exactly as it was.

    # Required: the website URL.
    url: str = Field(description="Website URL")

    # Optional dict of crawl options; None when the caller omits it.
    crawler_options: Optional[Dict[str, Any]] = Field(
        description="Options for crawling",
        default=None,
    )

    # Optional timeout in milliseconds; 30000 when the caller omits it.
    timeout: Optional[int] = Field(
        description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
        default=30000,
    )
class FirecrawlCrawlWebsiteTool(BaseTool):
model_config = ConfigDict(
@@ -20,25 +27,10 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
name: str = "Firecrawl web crawl tool"
description: str = "Crawl webpages using Firecrawl and return the contents"
args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
firecrawl_app: Optional["FirecrawlApp"] = None
api_key: Optional[str] = None
url: Optional[str] = None
params: Optional[Dict[str, Any]] = None
poll_interval: Optional[int] = 2
idempotency_key: Optional[str] = None
firecrawl: Optional["FirecrawlApp"] = None
def __init__(self, api_key: Optional[str] = None, **kwargs):
"""Initialize FirecrawlCrawlWebsiteTool.
Args:
api_key (Optional[str]): Firecrawl API key. If not provided, will check FIRECRAWL_API_KEY env var.
url (Optional[str]): Base URL to crawl. Can be overridden by the _run method.
firecrawl_app (Optional[FirecrawlApp]): Previously created FirecrawlApp instance.
params (Optional[Dict[str, Any]]): Additional parameters to pass to the FirecrawlApp.
poll_interval (Optional[int]): Poll interval for the FirecrawlApp.
idempotency_key (Optional[str]): Idempotency key for the FirecrawlApp.
**kwargs: Additional arguments passed to BaseTool.
"""
super().__init__(**kwargs)
try:
from firecrawl import FirecrawlApp # type: ignore
@@ -47,28 +39,29 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
"`firecrawl` package not found, please run `pip install firecrawl-py`"
)
# Allows passing a previously created FirecrawlApp instance
# or builds a new one with the provided API key
if not self.firecrawl_app:
if not self.firecrawl:
client_api_key = api_key or os.getenv("FIRECRAWL_API_KEY")
if not client_api_key:
raise ValueError(
"FIRECRAWL_API_KEY is not set. Please provide it either via the constructor "
"with the `api_key` argument or by setting the FIRECRAWL_API_KEY environment variable."
)
self.firecrawl_app = FirecrawlApp(api_key=client_api_key)
self.firecrawl = FirecrawlApp(api_key=client_api_key)
def _run(self, url: str):
# Unless url has been previously set via constructor by the user,
# use the url argument provided by the agent at runtime.
base_url = self.url or url
def _run(
self,
url: str,
crawler_options: Optional[Dict[str, Any]] = None,
timeout: Optional[int] = 30000,
):
if crawler_options is None:
crawler_options = {}
return self.firecrawl_app.crawl_url(
base_url,
params=self.params,
poll_interval=self.poll_interval,
idempotency_key=self.idempotency_key
)
options = {
"crawlerOptions": crawler_options,
"timeout": timeout,
}
return self.firecrawl.crawl_url(url, options)
try:

View File

@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Any, Dict, Optional, Type
from typing import TYPE_CHECKING, Optional, Type
from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field
@@ -10,14 +10,8 @@ if TYPE_CHECKING:
class FirecrawlScrapeWebsiteToolSchema(BaseModel):
url: str = Field(description="Website URL")
page_options: Optional[Dict[str, Any]] = Field(
default=None, description="Options for page scraping"
)
extractor_options: Optional[Dict[str, Any]] = Field(
default=None, description="Options for data extraction"
)
timeout: Optional[int] = Field(
default=None,
default=30000,
description="Timeout in milliseconds for the scraping operation. The default value is 30000.",
)
@@ -46,20 +40,15 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
def _run(
self,
url: str,
page_options: Optional[Dict[str, Any]] = None,
extractor_options: Optional[Dict[str, Any]] = None,
timeout: Optional[int] = None,
timeout: Optional[int] = 30000,
):
if page_options is None:
page_options = {}
if extractor_options is None:
extractor_options = {}
if timeout is None:
timeout = 30000
options = {
"pageOptions": page_options,
"extractorOptions": extractor_options,
"formats": ["markdown"],
"onlyMainContent": True,
"includeTags": [],
"excludeTags": [],
"headers": {},
"waitFor": 0,
"timeout": timeout,
}
return self.firecrawl.scrape_url(url, options)

View File

@@ -10,11 +10,22 @@ if TYPE_CHECKING:
class FirecrawlSearchToolSchema(BaseModel):
query: str = Field(description="Search query")
page_options: Optional[Dict[str, Any]] = Field(
default=None, description="Options for result formatting"
limit: Optional[int] = Field(
default=5, description="Maximum number of results to return"
)
search_options: Optional[Dict[str, Any]] = Field(
default=None, description="Options for searching"
tbs: Optional[str] = Field(default=None, description="Time-based search parameter")
lang: Optional[str] = Field(
default="en", description="Language code for search results"
)
country: Optional[str] = Field(
default="us", description="Country code for search results"
)
location: Optional[str] = Field(
default=None, description="Location parameter for search results"
)
timeout: Optional[int] = Field(default=60000, description="Timeout in milliseconds")
scrape_options: Optional[Dict[str, Any]] = Field(
default=None, description="Options for scraping search results"
)
@@ -39,13 +50,25 @@ class FirecrawlSearchTool(BaseTool):
def _run(
self,
query: str,
page_options: Optional[Dict[str, Any]] = None,
result_options: Optional[Dict[str, Any]] = None,
limit: Optional[int] = 5,
tbs: Optional[str] = None,
lang: Optional[str] = "en",
country: Optional[str] = "us",
location: Optional[str] = None,
timeout: Optional[int] = 60000,
scrape_options: Optional[Dict[str, Any]] = None,
):
if page_options is None:
page_options = {}
if result_options is None:
result_options = {}
if scrape_options is None:
scrape_options = {}
options = {"pageOptions": page_options, "resultOptions": result_options}
return self.firecrawl.search(query, **options)
options = {
"query": query,
"limit": limit,
"tbs": tbs,
"lang": lang,
"country": country,
"location": location,
"timeout": timeout,
"scrapeOptions": scrape_options,
}
return self.firecrawl.search(**options)

View File

@@ -35,7 +35,7 @@ class SerperDevToolSchema(BaseModel):
class SerperDevTool(BaseTool):
name: str = "Search the internet"
name: str = "Search the internet with Serper"
description: str = (
"A tool that can be used to search the internet with a search_query. "
"Supports different search types: 'search' (default), 'news'"