Merge pull request #237 from SHIXOOM/Fix-FireCrawl-Crawler-Tool

Fix: update FirecrawlCrawlWebsiteTool parameters for Firecrawl API v1 and its run arguments for agents
This commit is contained in:
Lucas Gomide
2025-04-06 14:06:32 -04:00
committed by GitHub
2 changed files with 71 additions and 23 deletions

View File

@@ -4,6 +4,10 @@
[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean markdown or structured data.
## Version Compatibility
This implementation is compatible with the Firecrawl API v1.
## Installation
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in the `FIRECRAWL_API_KEY` environment variable, as in the sketch below.
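A minimal setup sketch, assuming `crewai-tools` and the `firecrawl-py` SDK are installed (the key and URL below are illustrative placeholders):

```python
import os

from crewai_tools import FirecrawlCrawlWebsiteTool

# Illustrative placeholder key; in practice set this outside the script.
os.environ["FIRECRAWL_API_KEY"] = "fc-your-key"

# The tool can be pinned to a base URL at construction time.
tool = FirecrawlCrawlWebsiteTool(url="https://firecrawl.dev")
```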
@@ -27,16 +31,27 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
- `url`: The base URL to start crawling from (see the run sketch after this list).
- `page_options`: Optional.
  - `onlyMainContent`: Optional. Only return the main content of the page, excluding headers, navs, footers, etc.
  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output an `html` key in the response.
- `crawler_options`: Optional. Options for controlling the crawling behavior.
  - `includes`: Optional. URL patterns to include in the crawl.
  - `excludes`: Optional. URL patterns to exclude from the crawl.
  - `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
  - `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
  - `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap, but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
  - `limit`: Optional. Maximum number of pages to crawl.
  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
- `limit`: Optional. Maximum number of pages to crawl.
- `allowExternalLinks`: Optional. Allows the crawler to follow links that point to external domains.
- `formats`: Optional. Formats for the page's content to be returned (e.g. markdown, html, screenshot, links).
- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
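
A minimal run sketch using these v1 arguments, assuming `run` forwards keyword arguments to `_run` as `crewai.tools.BaseTool` does (all values are illustrative):

```python
from crewai_tools import FirecrawlCrawlWebsiteTool

tool = FirecrawlCrawlWebsiteTool()

# Arguments mirror the v1 schema above; values are illustrative.
result = tool.run(
    url="https://firecrawl.dev",
    maxDepth=2,
    limit=50,
    allowExternalLinks=False,
    formats=["markdown", "links"],
    timeout=30000,
)
print(result)
```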
## Configuration Example
This is the default configuration:
```python
DEFAULT_CRAWLING_OPTIONS = {
    "maxDepth": 2,
    "ignoreSitemap": True,
    "limit": 100,
    "allowBackwardLinks": False,
    "allowExternalLinks": False,
    "scrapeOptions": {
        "formats": ["markdown", "screenshot", "links"],
        "onlyMainContent": True,
        "timeout": 30000
    }
}
```
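
Since this PR moves the crawl parameters into the tool's run arguments, agents can supply them at execution time. A hedged sketch of wiring the tool into a crewAI agent (role, goal, and backstory are illustrative placeholders):

```python
from crewai import Agent
from crewai_tools import FirecrawlCrawlWebsiteTool

# Illustrative agent definition; the agent fills in the tool's
# run arguments (url, maxDepth, limit, ...) when it invokes the tool.
researcher = Agent(
    role="Web Researcher",
    goal="Collect and summarize documentation pages",
    backstory="An analyst who gathers web content for the crew.",
    tools=[FirecrawlCrawlWebsiteTool()],
)
```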

View File

@@ -1,4 +1,4 @@
from typing import Any, Dict, Optional, Type
from typing import Any, Optional, Type
from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -12,8 +12,20 @@ except ImportError:
class FirecrawlCrawlWebsiteToolSchema(BaseModel):
    url: str = Field(description="Website URL")
    crawler_options: Optional[Dict[str, Any]] = Field(
        default=None, description="Options for crawling"
    maxDepth: Optional[int] = Field(
        default=2,
        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
    )
    limit: Optional[int] = Field(
        default=100, description="Maximum number of pages to crawl."
    )
    allowExternalLinks: Optional[bool] = Field(
        default=False,
        description="Allows the crawler to follow links that point to external domains.",
    )
    formats: Optional[list[str]] = Field(
        default=["markdown", "screenshot", "links"],
        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).",
    )
    timeout: Optional[int] = Field(
        default=30000,
@@ -64,17 +76,38 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
    def _run(
        self,
        url: str,
        crawler_options: Optional[Dict[str, Any]] = None,
        maxDepth: Optional[int] = 2,
        limit: Optional[int] = 100,
        allowExternalLinks: Optional[bool] = False,
        formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
        timeout: Optional[int] = 30000,
    ):
        if crawler_options is None:
            crawler_options = {}
        options = {
            "crawlerOptions": crawler_options,
            "timeout": timeout,
        # Default options for timeout and crawling
        DEFAULT_TIMEOUT = 30000
        DEFAULT_CRAWLING_OPTIONS = {
            "maxDepth": 2,
            "ignoreSitemap": True,
            "limit": 100,
            "allowBackwardLinks": False,
            "allowExternalLinks": False,
            "scrapeOptions": {
                "formats": ["markdown", "screenshot", "links"],
                "onlyMainContent": True,
                "timeout": DEFAULT_TIMEOUT,
            },
        }
        return self._firecrawl.crawl_url(url, options)
        # Add default options not present as parameters
        crawling_options = DEFAULT_CRAWLING_OPTIONS
        # Update the values of parameters present
        crawling_options["maxDepth"] = maxDepth
        crawling_options["limit"] = limit
        crawling_options["allowExternalLinks"] = allowExternalLinks
        crawling_options["scrapeOptions"]["formats"] = formats
        crawling_options["scrapeOptions"]["timeout"] = timeout
        return self._firecrawl.crawl_url(url, crawling_options)
try:
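
As a closing sanity check, a small sketch exercising the new schema defaults, assuming `FirecrawlCrawlWebsiteToolSchema` from the diff above is in scope:

```python
# url is the only required field; everything else falls back to
# the defaults introduced in this PR.
args = FirecrawlCrawlWebsiteToolSchema(url="https://firecrawl.dev")

assert args.maxDepth == 2
assert args.limit == 100
assert args.allowExternalLinks is False
assert args.formats == ["markdown", "screenshot", "links"]
assert args.timeout == 30000
```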