From 5af2108307bcc7c2cca23c154acf02093a31540f Mon Sep 17 00:00:00 2001
From: Shady Ali <121682078+SHIXOOM@users.noreply.github.com>
Date: Sat, 8 Mar 2025 09:35:23 +0200
Subject: [PATCH 1/3] Fix: update FirecrawlCrawlWebsiteTool crawl parameters
 for the Firecrawl v1 API

The Firecrawl API no longer recognizes the parameters this tool sends
(HTTPError: Unexpected error during start crawl job: Status code 400.
Bad Request - [{'code': 'unrecognized_keys', 'keys': ['crawlerOptions',
'timeout'], 'path': [], 'message': 'Unrecognized key in body -- please
review the v1 API documentation for request body changes'}]) because
the API has been updated to v1. This patch updates the sent parameters
to match v1 and updates their descriptions in the README file.
---
 .../firecrawl_crawl_website_tool/README.md       | 11 ++++------
 .../firecrawl_crawl_website_tool.py              | 21 +++++++++++++------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
index 46d011602..f0bf66918 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
@@ -31,12 +31,9 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
 - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
 - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
 - `crawler_options`: Optional. Options for controlling the crawling behavior.
-  - `includes`: Optional. URL patterns to include in the crawl.
-  - `exclude`: Optional. URL patterns to exclude from the crawl.
-  - `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
-  - `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
-  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
-  - `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
+  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
   - `limit`: Optional. Maximum number of pages to crawl.
-  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+  - `scrapeOptions`: Optional. Additional options for controlling the crawler.
+    - `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links).
+  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
 
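For context, here is a minimal sketch of the request-body change this patch reacts to: the wrapped v0 body that now triggers the 400 `unrecognized_keys` error, next to the flat v1 body the tool sends after the fix. The option keys come from the diffs in this patch series, and the `crawl_url(url, params)` call mirrors the tool code below; the placeholder API key and example URL are illustrative assumptions, not part of the change.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

# v0-style body: crawl options wrapped in 'crawlerOptions' plus a top-level
# 'timeout'. The v1 API rejects both keys with a 400 Bad Request.
legacy_options = {
    "crawlerOptions": {"maxDepth": 2, "limit": 10},
    "timeout": 30000,
}

# v1-style body: crawl options sit at the top level, and per-page scrape
# settings such as formats and timeout move under 'scrapeOptions'.
v1_options = {
    "maxDepth": 2,
    "limit": 10,
    "scrapeOptions": {
        "formats": ["markdown", "screenshot", "links"],
        "timeout": 30000,
    },
}

result = app.crawl_url("https://firecrawl.dev", v1_options)  # v1 body succeeds
```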
diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
index b95199c84..878063953 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -68,13 +68,22 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
         timeout: Optional[int] = 30000,
     ):
         if crawler_options is None:
-            crawler_options = {}
+            crawler_options = {
+                "maxDepth": 2,
+                "limit": 10,
+                "scrapeOptions": {
+                    # same options as in /scrape
+                    "formats": ["markdown", "screenshot", "links"],
+                    "timeout": timeout
+                }
+            }
 
-        options = {
-            "crawlerOptions": crawler_options,
-            "timeout": timeout,
-        }
-        return self._firecrawl.crawl_url(url, options)
+
+        else:
+            crawler_options["scrapeOptions"]["timeout"] = timeout
+
+
+        return self._firecrawl.crawl_url(url, crawler_options)
 
 try:

From e0adb4695cdb30997616b4077f77f78f3d4755ac Mon Sep 17 00:00:00 2001
From: Shady Ali <121682078+SHIXOOM@users.noreply.github.com>
Date: Fri, 28 Mar 2025 16:58:47 +0200
Subject: [PATCH 2/3] Addressed review comments and made further improvements
---
 .../firecrawl_crawl_website_tool/README.md       | 36 ++++++++---
 .../firecrawl_crawl_website_tool.py              | 60 +++++++++++++------
 2 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
index f0bf66918..d8e8f1407 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
@@ -4,6 +4,10 @@
 
 [Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
 
+## Version Compatibility
+
+This implementation is compatible with FireCrawl API v1
+
 ## Installation
 
 - Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
@@ -27,13 +31,27 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
 
 - `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
 - `url`: The base URL to start crawling from.
-- `page_options`: Optional.
-  - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
-  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-- `crawler_options`: Optional. Options for controlling the crawling behavior.
-  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
-  - `limit`: Optional. Maximum number of pages to crawl.
-  - `scrapeOptions`: Optional. Additional options for controlling the crawler.
-    - `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links).
-  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
+- `limit`: Optional. Maximum number of pages to crawl.
+- `allowExternalLinks`: Allows the crawler to follow links that point to external domains.
+- `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links).
+- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+
+## Configurations Example
+
+This is the default configuration
+
+```python
+    DEFAULT_CRAWLING_OPTIONS = {
+        "maxDepth": 2,
+        "ignoreSitemap": True,
+        "limit": 100,
+        "allowBackwardLinks": False,
+        "allowExternalLinks": False,
+        "scrapeOptions": {
+            "formats": ["markdown", "screenshot", "links"],
+            "onlyMainContent": True,
+            "timeout": 30000
+        }
+    }
+```

diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
index 878063953..82bd913cd 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -12,9 +12,18 @@ except ImportError:
 
 class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
-    crawler_options: Optional[Dict[str, Any]] = Field(
-        default=None, description="Options for crawling"
-    )
+    maxDepth: Optional[int] = Field(
+        default=2,
+        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.")
+    limit: Optional[int] = Field(
+        default=100,
+        description="Maximum number of pages to crawl.")
+    allowExternalLinks: Optional[bool] = Field(
+        default=False,
+        description="Allows the crawler to follow links that point to external domains.")
+    formats: Optional[list[str]] = Field(
+        default=["markdown", "screenshot", "links"],
+        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).")
     timeout: Optional[int] = Field(
         default=30000,
         description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
@@ -30,6 +39,7 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
     api_key: Optional[str] = None
     _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
+
 
     def __init__(self, api_key: Optional[str] = None, **kwargs):
         super().__init__(**kwargs)
@@ -64,26 +74,38 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     def _run(
         self,
         url: str,
-        crawler_options: Optional[Dict[str, Any]] = None,
+        maxDepth: Optional[int] = 2,
+        limit: Optional[int] = 100,
+        allowExternalLinks: Optional[bool] = False,
+        formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
         timeout: Optional[int] = 30000,
     ):
-        if crawler_options is None:
-            crawler_options = {
-                "maxDepth": 2,
-                "limit": 10,
-                "scrapeOptions": {
-                    # same options as in /scrape
-                    "formats": ["markdown", "screenshot", "links"],
-                    "timeout": timeout
-                }
-            }
-
+        # Default options for timeout and crawling
+        DEFAULT_TIMEOUT = 30000
+        DEFAULT_CRAWLING_OPTIONS = {
+            "maxDepth": 2,
+            "ignoreSitemap": True,
+            "limit": 100,
+            "allowBackwardLinks": False,
+            "allowExternalLinks": False,
+            "scrapeOptions": {
+                "formats": ["markdown", "screenshot", "links"],
+                "onlyMainContent": True,
+                "timeout": DEFAULT_TIMEOUT
+            }
+        }
 
-        else:
-            crawler_options["scrapeOptions"]["timeout"] = timeout
-
+        # Add default options not present as parameters
+        crawling_options = DEFAULT_CRAWLING_OPTIONS
 
-        return self._firecrawl.crawl_url(url, crawler_options)
+        # Update the values of parameters present
+        crawling_options["maxDepth"] = maxDepth
+        crawling_options["limit"] = limit
+        crawling_options["allowExternalLinks"] = allowExternalLinks
+        crawling_options["scrapeOptions"]["formats"] = formats
+        crawling_options["scrapeOptions"]["timeout"] = timeout
+
+        return self._firecrawl.crawl_url(url, crawling_options)
 
 try:
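With patch 2 the crawl options become flat tool arguments instead of a nested `crawler_options` dict. Below is a hedged usage sketch: the parameter names and defaults come from the schema in the diff above, while the `crewai_tools` import path and the assumption that `BaseTool.run()` forwards keyword arguments to `_run()` are mine.

```python
import os

from crewai_tools import FirecrawlCrawlWebsiteTool

os.environ.setdefault("FIRECRAWL_API_KEY", "fc-YOUR-KEY")  # placeholder key

tool = FirecrawlCrawlWebsiteTool()
result = tool.run(
    url="https://firecrawl.dev",
    maxDepth=2,                     # base URL plus its direct children
    limit=10,                       # stop after 10 pages
    allowExternalLinks=False,       # stay on the starting domain
    formats=["markdown", "links"],  # subset of the default formats
    timeout=30000,                  # per-page scrape timeout in milliseconds
)
```

One design note on the implementation above: `crawling_options = DEFAULT_CRAWLING_OPTIONS` binds a second name to the same dict rather than copying it, which is harmless here only because the dict literal is rebuilt on every `_run()` call.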
From 89394ef3e3d60966252b9c3782118594527daa6a Mon Sep 17 00:00:00 2001
From: lorenzejay
Date: Fri, 4 Apr 2025 11:42:32 -0700
Subject: [PATCH 3/3] Refactor: Clean up FirecrawlCrawlWebsiteTool schema
 field descriptions and formatting for improved readability
---
 .../firecrawl_crawl_website_tool.py              | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
index 82bd913cd..f91ad3184 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Type
+from typing import Any, Optional, Type
 
 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -14,16 +14,19 @@ class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
     maxDepth: Optional[int] = Field(
         default=2,
-        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.")
+        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
+    )
     limit: Optional[int] = Field(
-        default=100,
-        description="Maximum number of pages to crawl.")
+        default=100, description="Maximum number of pages to crawl."
+    )
     allowExternalLinks: Optional[bool] = Field(
         default=False,
-        description="Allows the crawler to follow links that point to external domains.")
+        description="Allows the crawler to follow links that point to external domains.",
+    )
     formats: Optional[list[str]] = Field(
         default=["markdown", "screenshot", "links"],
-        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).")
+        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).",
+    )
     timeout: Optional[int] = Field(
         default=30000,
         description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
@@ -39,7 +42,6 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
     api_key: Optional[str] = None
     _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
-
 
     def __init__(self, api_key: Optional[str] = None, **kwargs):
         super().__init__(**kwargs)
@@ -91,20 +93,20 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
             "scrapeOptions": {
                 "formats": ["markdown", "screenshot", "links"],
                 "onlyMainContent": True,
-                "timeout": DEFAULT_TIMEOUT
-            }
+                "timeout": DEFAULT_TIMEOUT,
+            },
         }
-        
+
         # Add default options not present as parameters
         crawling_options = DEFAULT_CRAWLING_OPTIONS
-        
+
         # Update the values of parameters present
         crawling_options["maxDepth"] = maxDepth
         crawling_options["limit"] = limit
        crawling_options["allowExternalLinks"] = allowExternalLinks
         crawling_options["scrapeOptions"]["formats"] = formats
         crawling_options["scrapeOptions"]["timeout"] = timeout
-        
+
         return self._firecrawl.crawl_url(url, crawling_options)
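As a quick sanity check of the refactored schema, here is a hedged sketch that instantiates it with only the required `url` and prints the defaults visible in the diff. The module path is taken from the diff headers; importing the schema standalone like this is an assumption, not something the patch itself does.

```python
from crewai_tools.tools.firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
    FirecrawlCrawlWebsiteToolSchema,
)

args = FirecrawlCrawlWebsiteToolSchema(url="https://firecrawl.dev")
print(args.maxDepth)            # 2
print(args.limit)               # 100
print(args.allowExternalLinks)  # False
print(args.formats)             # ['markdown', 'screenshot', 'links']
print(args.timeout)             # 30000
```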