Merge pull request #237 from SHIXOOM/Fix-FireCrawl-Crawler-Tool

Fix: FirecrawlCrawlWebsiteTool update parameters for FireCrawl API v1 and update run arguments for agents
Lucas Gomide
2025-04-06 14:06:32 -04:00
committed by GitHub
2 changed files with 71 additions and 23 deletions

View File

@@ -4,6 +4,10 @@
 [Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean markdown or structured data.
+
+## Version Compatibility
+
+This implementation is compatible with FireCrawl API v1.
 ## Installation
 - Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
@@ -27,16 +31,27 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
 - `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
 - `url`: The base URL to start crawling from.
-- `page_options`: Optional.
-- `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
-- `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-- `crawler_options`: Optional. Options for controlling the crawling behavior.
-- `includes`: Optional. URL patterns to include in the crawl.
-- `exclude`: Optional. URL patterns to exclude from the crawl.
-- `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
-- `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
-- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
-- `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
-- `limit`: Optional. Maximum number of pages to crawl.
-- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
+- `limit`: Optional. Maximum number of pages to crawl.
+- `allowExternalLinks`: Optional. Allows the crawler to follow links that point to external domains.
+- `formats`: Optional. Formats in which the page's content is returned (e.g. markdown, html, screenshot, links).
+- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+
+## Configuration Example
+
+This is the default configuration:
+
+```python
+DEFAULT_CRAWLING_OPTIONS = {
+    "maxDepth": 2,
+    "ignoreSitemap": True,
+    "limit": 100,
+    "allowBackwardLinks": False,
+    "allowExternalLinks": False,
+    "scrapeOptions": {
+        "formats": ["markdown", "screenshot", "links"],
+        "onlyMainContent": True,
+        "timeout": 30000
+    }
+}
+```
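For context, a minimal usage sketch against the v1 arguments documented above. The constructor matches the README's own example, and the `run` call mirrors the schema arguments; the specific values here are illustrative, and `FIRECRAWL_API_KEY` is assumed to be set in the environment.

```python
from crewai_tools import FirecrawlCrawlWebsiteTool

# The API key is read from the FIRECRAWL_API_KEY environment variable.
tool = FirecrawlCrawlWebsiteTool(url="https://firecrawl.dev")

# Any argument left out falls back to DEFAULT_CRAWLING_OPTIONS above.
result = tool.run(
    url="https://firecrawl.dev",
    maxDepth=1,            # depth 1 = the base URL only
    limit=10,              # cap the crawl at 10 pages
    formats=["markdown"],  # return markdown only
)
print(result)
```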

View File

@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Type
+from typing import Any, Optional, Type
 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -12,8 +12,20 @@ except ImportError:
 class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
-    crawler_options: Optional[Dict[str, Any]] = Field(
-        default=None, description="Options for crawling"
+    maxDepth: Optional[int] = Field(
+        default=2,
+        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
+    )
+    limit: Optional[int] = Field(
+        default=100, description="Maximum number of pages to crawl."
+    )
+    allowExternalLinks: Optional[bool] = Field(
+        default=False,
+        description="Allows the crawler to follow links that point to external domains.",
+    )
+    formats: Optional[list[str]] = Field(
+        default=["markdown", "screenshot", "links"],
+        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).",
     )
     timeout: Optional[int] = Field(
         default=30000,
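Because the v1 parameters now live on the args schema with defaults, an agent that supplies only the URL still yields a complete, validated argument set. A standalone sketch of that behavior, assuming pydantic v2; the `CrawlArgs` name is hypothetical:

```python
from typing import Optional
from pydantic import BaseModel, Field

# Hypothetical standalone reproduction of FirecrawlCrawlWebsiteToolSchema.
class CrawlArgs(BaseModel):
    url: str = Field(description="Website URL")
    maxDepth: Optional[int] = Field(default=2)
    limit: Optional[int] = Field(default=100)
    allowExternalLinks: Optional[bool] = Field(default=False)
    formats: Optional[list[str]] = Field(
        default=["markdown", "screenshot", "links"]
    )
    timeout: Optional[int] = Field(default=30000)

# An agent passing only the URL still gets every default filled in.
print(CrawlArgs(url="https://firecrawl.dev").model_dump())
# {'url': 'https://firecrawl.dev', 'maxDepth': 2, 'limit': 100,
#  'allowExternalLinks': False,
#  'formats': ['markdown', 'screenshot', 'links'], 'timeout': 30000}
```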
@@ -64,17 +76,38 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     def _run(
         self,
         url: str,
-        crawler_options: Optional[Dict[str, Any]] = None,
+        maxDepth: Optional[int] = 2,
+        limit: Optional[int] = 100,
+        allowExternalLinks: Optional[bool] = False,
+        formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
         timeout: Optional[int] = 30000,
     ):
-        if crawler_options is None:
-            crawler_options = {}
-
-        options = {
-            "crawlerOptions": crawler_options,
-            "timeout": timeout,
-        }
-        return self._firecrawl.crawl_url(url, options)
+        # Default options for timeout and crawling
+        DEFAULT_TIMEOUT = 30000
+        DEFAULT_CRAWLING_OPTIONS = {
+            "maxDepth": 2,
+            "ignoreSitemap": True,
+            "limit": 100,
+            "allowBackwardLinks": False,
+            "allowExternalLinks": False,
+            "scrapeOptions": {
+                "formats": ["markdown", "screenshot", "links"],
+                "onlyMainContent": True,
+                "timeout": DEFAULT_TIMEOUT,
+            },
+        }
+
+        # Start from the defaults (rebuilt on each call, so in-place updates are safe)
+        crawling_options = DEFAULT_CRAWLING_OPTIONS
+        # Overwrite the defaults with the caller-supplied arguments
+        crawling_options["maxDepth"] = maxDepth
+        crawling_options["limit"] = limit
+        crawling_options["allowExternalLinks"] = allowExternalLinks
+        crawling_options["scrapeOptions"]["formats"] = formats
+        crawling_options["scrapeOptions"]["timeout"] = timeout
+        return self._firecrawl.crawl_url(url, crawling_options)
 try:
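A note on the merge in `_run`: since `DEFAULT_CRAWLING_OPTIONS` is rebuilt inside the method on every call, binding it to `crawling_options` and mutating it in place is safe. If the defaults were ever hoisted to module level, the dict would need to be copied before the nested `scrapeOptions` keys are overwritten. A sketch of that variant, with a hypothetical `build_options` helper not present in the commit:

```python
import copy

# Module-level defaults, mirroring DEFAULT_CRAWLING_OPTIONS above.
DEFAULT_CRAWLING_OPTIONS = {
    "maxDepth": 2,
    "ignoreSitemap": True,
    "limit": 100,
    "allowBackwardLinks": False,
    "allowExternalLinks": False,
    "scrapeOptions": {
        "formats": ["markdown", "screenshot", "links"],
        "onlyMainContent": True,
        "timeout": 30000,
    },
}

def build_options(maxDepth=2, limit=100, allowExternalLinks=False,
                  formats=None, timeout=30000):
    # Deep-copy so repeated calls never mutate the shared defaults,
    # including the nested scrapeOptions dict.
    options = copy.deepcopy(DEFAULT_CRAWLING_OPTIONS)
    options["maxDepth"] = maxDepth
    options["limit"] = limit
    options["allowExternalLinks"] = allowExternalLinks
    if formats is not None:
        options["scrapeOptions"]["formats"] = formats
    options["scrapeOptions"]["timeout"] = timeout
    return options

# The second call sees pristine defaults despite the first call's override.
print(build_options(formats=["html"])["scrapeOptions"]["formats"])  # ['html']
print(build_options()["scrapeOptions"]["formats"])  # defaults intact
```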