From 5af2108307bcc7c2cca23c154acf02093a31540f Mon Sep 17 00:00:00 2001
From: Shady Ali <121682078+SHIXOOM@users.noreply.github.com>
Date: Sat, 8 Mar 2025 09:35:23 +0200
Subject: [PATCH 1/3] Fix: update FirecrawlCrawlWebsiteTool crawl parameters
 for the Firecrawl v1 API

The Firecrawl API no longer recognizes the parameters this tool sends
(HTTPError: Unexpected error during start crawl job: Status code 400.
Bad Request - [{'code': 'unrecognized_keys', 'keys': ['crawlerOptions',
'timeout'], 'path': [], 'message': 'Unrecognized key in body -- please
review the v1 API documentation for request body changes'}]) because
the API has been updated to v1. This patch updates the sent parameters
to match v1 and updates their descriptions in the README file.
---
 .../firecrawl_crawl_website_tool/README.md       | 11 ++++------
 .../firecrawl_crawl_website_tool.py              | 21 +++++++++++++------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
index 46d011602..f0bf66918 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
@@ -31,12 +31,9 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
 - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
 - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
 - `crawler_options`: Optional. Options for controlling the crawling behavior.
-  - `includes`: Optional. URL patterns to include in the crawl.
-  - `exclude`: Optional. URL patterns to exclude from the crawl.
-  - `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
-  - `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
-  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
-  - `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
+  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
   - `limit`: Optional. Maximum number of pages to crawl.
-  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+  - `scrapeOptions`: Optional. Additional options for controlling the crawler.
+    - `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links).
+  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
 
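For context, here is a minimal sketch of the request-body change this patch reacts to: the wrapped v0 body that now triggers the 400 `unrecognized_keys` error, next to the flat v1 body the tool sends after the fix. The option keys come from the diffs in this patch series, and the `crawl_url(url, params)` call mirrors the tool code below; the placeholder API key and example URL are illustrative assumptions, not part of the change.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

# v0-style body: crawl options wrapped in 'crawlerOptions' plus a top-level
# 'timeout'. The v1 API rejects both keys with a 400 Bad Request.
legacy_options = {
    "crawlerOptions": {"maxDepth": 2, "limit": 10},
    "timeout": 30000,
}

# v1-style body: crawl options sit at the top level, and per-page scrape
# settings such as formats and timeout move under 'scrapeOptions'.
v1_options = {
    "maxDepth": 2,
    "limit": 10,
    "scrapeOptions": {
        "formats": ["markdown", "screenshot", "links"],
        "timeout": 30000,
    },
}

result = app.crawl_url("https://firecrawl.dev", v1_options)  # v1 body succeeds
```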
diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
index b95199c84..878063953 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -68,13 +68,22 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
         timeout: Optional[int] = 30000,
     ):
         if crawler_options is None:
-            crawler_options = {}
+            crawler_options = {
+                "maxDepth": 2,
+                "limit": 10,
+                "scrapeOptions": {
+                    # same options as in /scrape
+                    "formats": ["markdown", "screenshot", "links"],
+                    "timeout": timeout
+                }
+            }
 
-        options = {
-            "crawlerOptions": crawler_options,
-            "timeout": timeout,
-        }
-        return self._firecrawl.crawl_url(url, options)
+
+        else:
+            crawler_options["scrapeOptions"]["timeout"] = timeout
+
+
+        return self._firecrawl.crawl_url(url, crawler_options)
 
 try:

From e0adb4695cdb30997616b4077f77f78f3d4755ac Mon Sep 17 00:00:00 2001
From: Shady Ali <121682078+SHIXOOM@users.noreply.github.com>
Date: Fri, 28 Mar 2025 16:58:47 +0200
Subject: [PATCH 2/3] Addressed review comments and made further improvements
---
 .../firecrawl_crawl_website_tool/README.md       | 36 ++++++++---
 .../firecrawl_crawl_website_tool.py              | 60 +++++++++++++------
 2 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
index f0bf66918..d8e8f1407 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
@@ -4,6 +4,10 @@
 
 [Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
 
+## Version Compatibility
+
+This implementation is compatible with FireCrawl API v1
+
 ## Installation
 
 - Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
@@ -27,13 +31,27 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
 
 - `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
 - `url`: The base URL to start crawling from.
-- `page_options`: Optional.
-  - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
-  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-- `crawler_options`: Optional. Options for controlling the crawling behavior.
-  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
-  - `limit`: Optional. Maximum number of pages to crawl.
-  - `scrapeOptions`: Optional. Additional options for controlling the crawler.
-    - `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links).
-  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
+- `limit`: Optional. Maximum number of pages to crawl.
+- `allowExternalLinks`: Allows the crawler to follow links that point to external domains.
+- `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links).
+- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+
+## Configurations Example
+
+This is the default configuration
+
+```python
+    DEFAULT_CRAWLING_OPTIONS = {
+        "maxDepth": 2,
+        "ignoreSitemap": True,
+        "limit": 100,
+        "allowBackwardLinks": False,
+        "allowExternalLinks": False,
+        "scrapeOptions": {
+            "formats": ["markdown", "screenshot", "links"],
+            "onlyMainContent": True,
+            "timeout": 30000
+        }
+    }
+```

diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
index 878063953..82bd913cd 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -12,9 +12,18 @@ except ImportError:
 
 class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
-    crawler_options: Optional[Dict[str, Any]] = Field(
-        default=None, description="Options for crawling"
-    )
+    maxDepth: Optional[int] = Field(
+        default=2,
+        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.")
+    limit: Optional[int] = Field(
+        default=100,
+        description="Maximum number of pages to crawl.")
+    allowExternalLinks: Optional[bool] = Field(
+        default=False,
+        description="Allows the crawler to follow links that point to external domains.")
+    formats: Optional[list[str]] = Field(
+        default=["markdown", "screenshot", "links"],
+        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).")
     timeout: Optional[int] = Field(
         default=30000,
         description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
@@ -30,6 +39,7 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
     api_key: Optional[str] = None
     _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
+
 
     def __init__(self, api_key: Optional[str] = None, **kwargs):
         super().__init__(**kwargs)
@@ -64,26 +74,38 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     def _run(
         self,
         url: str,
-        crawler_options: Optional[Dict[str, Any]] = None,
+        maxDepth: Optional[int] = 2,
+        limit: Optional[int] = 100,
+        allowExternalLinks: Optional[bool] = False,
+        formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
         timeout: Optional[int] = 30000,
     ):
-        if crawler_options is None:
-            crawler_options = {
-                "maxDepth": 2,
-                "limit": 10,
-                "scrapeOptions": {
-                    # same options as in /scrape
-                    "formats": ["markdown", "screenshot", "links"],
-                    "timeout": timeout
-                }
-            }
-
+        # Default options for timeout and crawling
+        DEFAULT_TIMEOUT = 30000
+        DEFAULT_CRAWLING_OPTIONS = {
+            "maxDepth": 2,
+            "ignoreSitemap": True,
+            "limit": 100,
+            "allowBackwardLinks": False,
+            "allowExternalLinks": False,
+            "scrapeOptions": {
+                "formats": ["markdown", "screenshot", "links"],
+                "onlyMainContent": True,
+                "timeout": DEFAULT_TIMEOUT
+            }
+        }
 
-        else:
-            crawler_options["scrapeOptions"]["timeout"] = timeout
-
+        # Add default options not present as parameters
+        crawling_options = DEFAULT_CRAWLING_OPTIONS
 
-        return self._firecrawl.crawl_url(url, crawler_options)
+        # Update the values of parameters present
+        crawling_options["maxDepth"] = maxDepth
+        crawling_options["limit"] = limit
+        crawling_options["allowExternalLinks"] = allowExternalLinks
+        crawling_options["scrapeOptions"]["formats"] = formats
+        crawling_options["scrapeOptions"]["timeout"] = timeout
+
+        return self._firecrawl.crawl_url(url, crawling_options)
 
 try:
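With patch 2 the crawl options become flat tool arguments instead of a nested `crawler_options` dict. Below is a hedged usage sketch: the parameter names and defaults come from the schema in the diff above, while the `crewai_tools` import path and the assumption that `BaseTool.run()` forwards keyword arguments to `_run()` are mine.

```python
import os

from crewai_tools import FirecrawlCrawlWebsiteTool

os.environ.setdefault("FIRECRAWL_API_KEY", "fc-YOUR-KEY")  # placeholder key

tool = FirecrawlCrawlWebsiteTool()
result = tool.run(
    url="https://firecrawl.dev",
    maxDepth=2,                     # base URL plus its direct children
    limit=10,                       # stop after 10 pages
    allowExternalLinks=False,       # stay on the starting domain
    formats=["markdown", "links"],  # subset of the default formats
    timeout=30000,                  # per-page scrape timeout in milliseconds
)
```

One design note on the implementation above: `crawling_options = DEFAULT_CRAWLING_OPTIONS` binds a second name to the same dict rather than copying it, which is harmless here only because the dict literal is rebuilt on every `_run()` call.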
From 89394ef3e3d60966252b9c3782118594527daa6a Mon Sep 17 00:00:00 2001
From: lorenzejay
Date: Fri, 4 Apr 2025 11:42:32 -0700
Subject: [PATCH 3/3] Refactor: Clean up FirecrawlCrawlWebsiteTool schema
 field descriptions and formatting for improved readability
---
 .../firecrawl_crawl_website_tool.py              | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
index 82bd913cd..f91ad3184 100644
--- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
+++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Type
+from typing import Any, Optional, Type
 
 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -14,16 +14,19 @@ class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
     maxDepth: Optional[int] = Field(
         default=2,
-        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.")
+        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
+    )
     limit: Optional[int] = Field(
-        default=100,
-        description="Maximum number of pages to crawl.")
+        default=100, description="Maximum number of pages to crawl."
+    )
     allowExternalLinks: Optional[bool] = Field(
         default=False,
-        description="Allows the crawler to follow links that point to external domains.")
+        description="Allows the crawler to follow links that point to external domains.",
+    )
     formats: Optional[list[str]] = Field(
         default=["markdown", "screenshot", "links"],
-        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).")
+        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).",
+    )
     timeout: Optional[int] = Field(
         default=30000,
         description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
@@ -39,7 +42,6 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
     api_key: Optional[str] = None
     _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
-
 
     def __init__(self, api_key: Optional[str] = None, **kwargs):
         super().__init__(**kwargs)
@@ -91,20 +93,20 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
             "scrapeOptions": {
                 "formats": ["markdown", "screenshot", "links"],
                 "onlyMainContent": True,
-                "timeout": DEFAULT_TIMEOUT
-            }
+                "timeout": DEFAULT_TIMEOUT,
+            },
         }
-        
+
         # Add default options not present as parameters
         crawling_options = DEFAULT_CRAWLING_OPTIONS
-        
+
         # Update the values of parameters present
         crawling_options["maxDepth"] = maxDepth
         crawling_options["limit"] = limit
        crawling_options["allowExternalLinks"] = allowExternalLinks
         crawling_options["scrapeOptions"]["formats"] = formats
         crawling_options["scrapeOptions"]["timeout"] = timeout
-        
+
         return self._firecrawl.crawl_url(url, crawling_options)
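As a quick sanity check of the refactored schema, here is a hedged sketch that instantiates it with only the required `url` and prints the defaults visible in the diff. The module path is taken from the diff headers; importing the schema standalone like this is an assumption, not something the patch itself does.

```python
from crewai_tools.tools.firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
    FirecrawlCrawlWebsiteToolSchema,
)

args = FirecrawlCrawlWebsiteToolSchema(url="https://firecrawl.dev")
print(args.maxDepth)            # 2
print(args.limit)               # 100
print(args.allowExternalLinks)  # False
print(args.formats)             # ['markdown', 'screenshot', 'links']
print(args.timeout)             # 30000
```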