From e5aabe05e1b85d2c574a72bde85a6488fbd0d20c Mon Sep 17 00:00:00 2001 From: Brandon Hancock Date: Wed, 8 Jan 2025 14:56:12 -0500 Subject: [PATCH] improve serper and firecrawl --- .../firecrawl_crawl_website_tool.py | 59 ++++++++----------- .../firecrawl_scrape_website_tool.py | 29 +++------ .../firecrawl_search_tool.py | 47 +++++++++++---- .../tools/serper_dev_tool/serper_dev_tool.py | 2 +- 4 files changed, 71 insertions(+), 66 deletions(-) diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py index edada38dd..0eafd6e4a 100644 --- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py +++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py @@ -1,9 +1,8 @@ import os from typing import TYPE_CHECKING, Any, Dict, Optional, Type -from pydantic import BaseModel, ConfigDict, Field - from crewai.tools import BaseTool +from pydantic import BaseModel, ConfigDict, Field # Type checking import if TYPE_CHECKING: @@ -12,6 +11,14 @@ if TYPE_CHECKING: class FirecrawlCrawlWebsiteToolSchema(BaseModel): url: str = Field(description="Website URL") + crawler_options: Optional[Dict[str, Any]] = Field( + default=None, description="Options for crawling" + ) + timeout: Optional[int] = Field( + default=30000, + description="Timeout in milliseconds for the crawling operation. The default value is 30000.", + ) + class FirecrawlCrawlWebsiteTool(BaseTool): model_config = ConfigDict( @@ -20,25 +27,10 @@ class FirecrawlCrawlWebsiteTool(BaseTool): name: str = "Firecrawl web crawl tool" description: str = "Crawl webpages using Firecrawl and return the contents" args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema - firecrawl_app: Optional["FirecrawlApp"] = None api_key: Optional[str] = None - url: Optional[str] = None - params: Optional[Dict[str, Any]] = None - poll_interval: Optional[int] = 2 - idempotency_key: Optional[str] = None + firecrawl: Optional["FirecrawlApp"] = None def __init__(self, api_key: Optional[str] = None, **kwargs): - """Initialize FirecrawlCrawlWebsiteTool. - - Args: - api_key (Optional[str]): Firecrawl API key. If not provided, will check FIRECRAWL_API_KEY env var. - url (Optional[str]): Base URL to crawl. Can be overridden by the _run method. - firecrawl_app (Optional[FirecrawlApp]): Previously created FirecrawlApp instance. - params (Optional[Dict[str, Any]]): Additional parameters to pass to the FirecrawlApp. - poll_interval (Optional[int]): Poll interval for the FirecrawlApp. - idempotency_key (Optional[str]): Idempotency key for the FirecrawlApp. - **kwargs: Additional arguments passed to BaseTool. - """ super().__init__(**kwargs) try: from firecrawl import FirecrawlApp # type: ignore @@ -47,28 +39,29 @@ class FirecrawlCrawlWebsiteTool(BaseTool): "`firecrawl` package not found, please run `pip install firecrawl-py`" ) - # Allows passing a previously created FirecrawlApp instance - # or builds a new one with the provided API key - if not self.firecrawl_app: - client_api_key = api_key or os.getenv("FIRECRAWL_API_KEY") + if not self.firecrawl: + client_api_key = api_key or os.getenv("FIRECRAWL_API_KEY") if not client_api_key: raise ValueError( "FIRECRAWL_API_KEY is not set. Please provide it either via the constructor " "with the `api_key` argument or by setting the FIRECRAWL_API_KEY environment variable." ) - self.firecrawl_app = FirecrawlApp(api_key=client_api_key) + self.firecrawl = FirecrawlApp(api_key=client_api_key) - def _run(self, url: str): - # Unless url has been previously set via constructor by the user, - # use the url argument provided by the agent at runtime. - base_url = self.url or url + def _run( + self, + url: str, + crawler_options: Optional[Dict[str, Any]] = None, + timeout: Optional[int] = 30000, + ): + if crawler_options is None: + crawler_options = {} - return self.firecrawl_app.crawl_url( - base_url, - params=self.params, - poll_interval=self.poll_interval, - idempotency_key=self.idempotency_key - ) + options = { + "crawlerOptions": crawler_options, + "timeout": timeout, + } + return self.firecrawl.crawl_url(url, options) try: diff --git a/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py b/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py index 9ab7d293e..8b2a37185 100644 --- a/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py +++ b/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, Optional, Type +from typing import TYPE_CHECKING, Optional, Type from crewai.tools import BaseTool from pydantic import BaseModel, ConfigDict, Field @@ -10,14 +10,8 @@ if TYPE_CHECKING: class FirecrawlScrapeWebsiteToolSchema(BaseModel): url: str = Field(description="Website URL") - page_options: Optional[Dict[str, Any]] = Field( - default=None, description="Options for page scraping" - ) - extractor_options: Optional[Dict[str, Any]] = Field( - default=None, description="Options for data extraction" - ) timeout: Optional[int] = Field( - default=None, + default=30000, description="Timeout in milliseconds for the scraping operation. The default value is 30000.", ) @@ -46,20 +40,15 @@ class FirecrawlScrapeWebsiteTool(BaseTool): def _run( self, url: str, - page_options: Optional[Dict[str, Any]] = None, - extractor_options: Optional[Dict[str, Any]] = None, - timeout: Optional[int] = None, + timeout: Optional[int] = 30000, ): - if page_options is None: - page_options = {} - if extractor_options is None: - extractor_options = {} - if timeout is None: - timeout = 30000 - options = { - "pageOptions": page_options, - "extractorOptions": extractor_options, + "formats": ["markdown"], + "onlyMainContent": True, + "includeTags": [], + "excludeTags": [], + "headers": {}, + "waitFor": 0, "timeout": timeout, } return self.firecrawl.scrape_url(url, options) diff --git a/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py b/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py index 5efd274de..36ba16391 100644 --- a/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py +++ b/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py @@ -10,11 +10,22 @@ if TYPE_CHECKING: class FirecrawlSearchToolSchema(BaseModel): query: str = Field(description="Search query") - page_options: Optional[Dict[str, Any]] = Field( - default=None, description="Options for result formatting" + limit: Optional[int] = Field( + default=5, description="Maximum number of results to return" ) - search_options: Optional[Dict[str, Any]] = Field( - default=None, description="Options for searching" + tbs: Optional[str] = Field(default=None, description="Time-based search parameter") + lang: Optional[str] = Field( + default="en", description="Language code for search results" + ) + country: Optional[str] = Field( + default="us", description="Country code for search results" + ) + location: Optional[str] = Field( + default=None, description="Location parameter for search results" + ) + timeout: Optional[int] = Field(default=60000, description="Timeout in milliseconds") + scrape_options: Optional[Dict[str, Any]] = Field( + default=None, description="Options for scraping search results" ) @@ -39,13 +50,25 @@ class FirecrawlSearchTool(BaseTool): def _run( self, query: str, - page_options: Optional[Dict[str, Any]] = None, - result_options: Optional[Dict[str, Any]] = None, + limit: Optional[int] = 5, + tbs: Optional[str] = None, + lang: Optional[str] = "en", + country: Optional[str] = "us", + location: Optional[str] = None, + timeout: Optional[int] = 60000, + scrape_options: Optional[Dict[str, Any]] = None, ): - if page_options is None: - page_options = {} - if result_options is None: - result_options = {} + if scrape_options is None: + scrape_options = {} - options = {"pageOptions": page_options, "resultOptions": result_options} - return self.firecrawl.search(query, **options) + options = { + "query": query, + "limit": limit, + "tbs": tbs, + "lang": lang, + "country": country, + "location": location, + "timeout": timeout, + "scrapeOptions": scrape_options, + } + return self.firecrawl.search(**options) diff --git a/src/crewai_tools/tools/serper_dev_tool/serper_dev_tool.py b/src/crewai_tools/tools/serper_dev_tool/serper_dev_tool.py index 5e8986c7e..e9eab56a2 100644 --- a/src/crewai_tools/tools/serper_dev_tool/serper_dev_tool.py +++ b/src/crewai_tools/tools/serper_dev_tool/serper_dev_tool.py @@ -35,7 +35,7 @@ class SerperDevToolSchema(BaseModel): class SerperDevTool(BaseTool): - name: str = "Search the internet" + name: str = "Search the internet with Serper" description: str = ( "A tool that can be used to search the internet with a search_query. " "Supports different search types: 'search' (default), 'news'"