refactor: update Firecrawl tools to improve configuration and error handling (#351)

- Added TYPE_CHECKING imports for FirecrawlApp to enhance type safety.
- Updated configuration keys in FirecrawlCrawlWebsiteTool and FirecrawlScrapeWebsiteTool to camelCase for consistency.
- Introduced error handling in the _run methods of both tools to ensure FirecrawlApp is properly initialized before usage.
- Adjusted the parameters passed to the crawl_url, scrape_url, and search methods to use a single 'params' argument instead of unpacking the config dictionary directly.
commit 180cc38330 (parent 8723e66807)
Author: Lorenze Jay
Date: 2025-06-27 11:27:48 -07:00
Committed by: GitHub
3 changed files with 44 additions and 26 deletions
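
For context before the diffs: a minimal usage sketch of the new configuration style, assuming the crewai_tools package export and crewai's standard BaseTool.run interface (the camelCase keys and the api_key field are taken from the diffs below; the key and URL are placeholders):

# Hypothetical usage sketch; not part of this commit.
from crewai_tools import FirecrawlCrawlWebsiteTool

# Config keys are now camelCase, matching the Firecrawl API payload,
# and are forwarded to crawl_url as a single `params` dict rather than
# being unpacked into keyword arguments.
tool = FirecrawlCrawlWebsiteTool(
    api_key="fc-YOUR-KEY",  # placeholder key
    config={
        "maxDepth": 2,
        "limit": 10,
        "scrapeOptions": {"formats": ["markdown"], "onlyMainContent": True},
    },
)
result = tool.run(url="https://example.com")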


@@ -1,12 +1,17 @@
-from typing import Any, Optional, Type, List
+from typing import Any, Optional, Type, List, TYPE_CHECKING
 
 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
 
+if TYPE_CHECKING:
+    from firecrawl import FirecrawlApp
+
 try:
-    from firecrawl import FirecrawlApp, ScrapeOptions
+    from firecrawl import FirecrawlApp
+
+    FIRECRAWL_AVAILABLE = True
 except ImportError:
-    FirecrawlApp = Any
+    FIRECRAWL_AVAILABLE = False
 
 
 class FirecrawlCrawlWebsiteToolSchema(BaseModel):
@@ -42,16 +47,16 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     api_key: Optional[str] = None
     config: Optional[dict[str, Any]] = Field(
         default_factory=lambda: {
-            "max_depth": 2,
-            "ignore_sitemap": True,
-            "limit": 100,
-            "allow_backward_links": False,
-            "allow_external_links": False,
-            "scrape_options": ScrapeOptions(
-                formats=["markdown", "screenshot", "links"],
-                only_main_content=True,
-                timeout=30000,
-            ),
+            "maxDepth": 2,
+            "ignoreSitemap": True,
+            "limit": 10,
+            "allowBackwardLinks": False,
+            "allowExternalLinks": False,
+            "scrapeOptions": {
+                "formats": ["markdown", "screenshot", "links"],
+                "onlyMainContent": True,
+                "timeout": 10000,
+            },
         }
     )
     _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
@@ -88,7 +93,10 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
             )
 
     def _run(self, url: str):
-        return self._firecrawl.crawl_url(url, **self.config)
+        if not self._firecrawl:
+            raise RuntimeError("FirecrawlApp not properly initialized")
+        return self._firecrawl.crawl_url(url, poll_interval=2, params=self.config)
 
 
 try:
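
The import block above pairs a TYPE_CHECKING import with a runtime availability flag; a condensed, self-contained sketch of that pattern (make_app and its error message are illustrative, not part of the diff):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers, so "FirecrawlApp" annotations resolve
    # even when the package is not installed at runtime.
    from firecrawl import FirecrawlApp

try:
    from firecrawl import FirecrawlApp

    FIRECRAWL_AVAILABLE = True
except ImportError:
    FIRECRAWL_AVAILABLE = False


def make_app(api_key: str) -> "FirecrawlApp":
    # Fail with an actionable message instead of a NameError at call time.
    if not FIRECRAWL_AVAILABLE:
        raise ImportError("firecrawl package is required; install it first")
    return FirecrawlApp(api_key=api_key)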


@@ -1,16 +1,23 @@
-from typing import Any, Optional, Type, Dict, List
+from typing import Any, Optional, Type, Dict, List, TYPE_CHECKING
 
 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
 
+if TYPE_CHECKING:
+    from firecrawl import FirecrawlApp
+
 try:
     from firecrawl import FirecrawlApp
+
+    FIRECRAWL_AVAILABLE = True
 except ImportError:
-    FirecrawlApp = Any
+    FIRECRAWL_AVAILABLE = False
 
 
 class FirecrawlScrapeWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
 
 
 class FirecrawlScrapeWebsiteTool(BaseTool):
     """
     Tool for scraping webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
@@ -21,11 +28,11 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
     Default configuration options:
         formats (list[str]): Content formats to return. Default: ["markdown"]
-        only_main_content (bool): Only return main content. Default: True
-        include_tags (list[str]): Tags to include. Default: []
-        exclude_tags (list[str]): Tags to exclude. Default: []
+        onlyMainContent (bool): Only return main content. Default: True
+        includeTags (list[str]): Tags to include. Default: []
+        excludeTags (list[str]): Tags to exclude. Default: []
         headers (dict): Headers to include. Default: {}
-        wait_for (int): Time to wait for page to load in ms. Default: 0
+        waitFor (int): Time to wait for page to load in ms. Default: 0
         json_options (dict): Options for JSON extraction. Default: None
     """
@@ -39,11 +46,11 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
     config: Dict[str, Any] = Field(
         default_factory=lambda: {
             "formats": ["markdown"],
-            "only_main_content": True,
-            "include_tags": [],
-            "exclude_tags": [],
+            "onlyMainContent": True,
+            "includeTags": [],
+            "excludeTags": [],
             "headers": {},
-            "wait_for": 0,
+            "waitFor": 0,
         }
     )
@@ -74,7 +81,10 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
         self._firecrawl = FirecrawlApp(api_key=api_key)
 
     def _run(self, url: str):
-        return self._firecrawl.scrape_url(url, **self.config)
+        if not self._firecrawl:
+            raise RuntimeError("FirecrawlApp not properly initialized")
+        return self._firecrawl.scrape_url(url, params=self.config)
 
 
 try:
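
The move from **self.config to params=self.config is not cosmetic: the new camelCase keys are not valid Python keyword arguments on the client methods, so unpacking would now fail. A small stand-in illustration (this scrape_url is a mock with the calling convention the diff assumes, not the real client):

def scrape_url(url: str, params: dict = None):
    # Mock of the client call as invoked in the diff above.
    return {"url": url, "params": params}

config = {"onlyMainContent": True, "waitFor": 0}

# Old style: scrape_url("https://example.com", **config) raises
# TypeError: scrape_url() got an unexpected keyword argument 'onlyMainContent'
# because each config key would have to match a parameter name.

# New style: the whole config travels as one payload.
print(scrape_url("https://example.com", params=config))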


@@ -98,7 +98,7 @@ class FirecrawlSearchTool(BaseTool):
         return self._firecrawl.search(
             query=query,
-            **self.config,
+            params=self.config,
         )
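
A hypothetical regression test for the new initialization guard (the test body and the model_construct shortcut for skipping __init__ are illustrative; the error message is the one added in this commit):

import pytest
from crewai_tools import FirecrawlScrapeWebsiteTool

def test_run_requires_initialized_client():
    # Bypass validation/initialization so _firecrawl keeps its None default.
    tool = FirecrawlScrapeWebsiteTool.model_construct(config={})
    # _run must fail loudly instead of raising an opaque AttributeError on None.
    with pytest.raises(RuntimeError, match="not properly initialized"):
        tool._run("https://example.com")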