Mirror of https://github.com/crewAIInc/crewAI.git
Synced 2026-01-08 07:38:29 +00:00
Merge pull request #237 from SHIXOOM/Fix-FireCrawl-Crawler-Tool
Fix: update FirecrawlCrawlWebsiteTool parameters for FireCrawl API v1 and update run arguments for agents
@@ -4,6 +4,10 @@
 
 [Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean markdown or structured data.
 
+## Version Compatibility
+
+This implementation is compatible with FireCrawl API v1.
+
 ## Installation
 
 - Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
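For orientation, setting the tool up after this change looks roughly like the sketch below. The `crewai_tools` import path and the placeholder key are assumptions, not part of this diff:

```python
import os

# Assumption: crewai-tools and firecrawl-py are installed, and the key
# comes from firecrawl.dev as described above.
os.environ["FIRECRAWL_API_KEY"] = "fc-..."  # placeholder, not a real key

from crewai_tools import FirecrawlCrawlWebsiteTool  # import path assumed

tool = FirecrawlCrawlWebsiteTool(url="https://firecrawl.dev")
```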
@@ -27,16 +31,27 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
 
 - `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
 - `url`: The base URL to start crawling from.
-- `page_options`: Optional.
-  - `onlyMainContent`: Optional. Only return the main content of the page, excluding headers, navs, footers, etc.
-  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output an `html` key in the response.
-- `crawler_options`: Optional. Options for controlling the crawling behavior.
-  - `includes`: Optional. URL patterns to include in the crawl.
-  - `exclude`: Optional. URL patterns to exclude from the crawl.
-  - `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
-  - `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
-  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
-  - `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap, but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
-  - `limit`: Optional. Maximum number of pages to crawl.
-  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
+- `limit`: Optional. Maximum number of pages to crawl.
+- `allowExternalLinks`: Optional. Allows the crawler to follow links that point to external domains.
+- `formats`: Optional. Formats for the page's content to be returned (e.g. markdown, html, screenshot, links).
+- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+
+## Configurations Example
+
+This is the default configuration:
+
+```python
+DEFAULT_CRAWLING_OPTIONS = {
+    "maxDepth": 2,
+    "ignoreSitemap": True,
+    "limit": 100,
+    "allowBackwardLinks": False,
+    "allowExternalLinks": False,
+    "scrapeOptions": {
+        "formats": ["markdown", "screenshot", "links"],
+        "onlyMainContent": True,
+        "timeout": 30000,
+    },
+}
+```
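Given the flat arguments documented above, a call that overrides a couple of defaults might look like the sketch below; it assumes `BaseTool.run` forwards keyword arguments to `_run`, which is how agents invoke the tool per the commit message:

```python
# Sketch: override two documented defaults; every option not passed
# (limit, timeout, and the non-parameter options such as ignoreSitemap,
# allowBackwardLinks, onlyMainContent) keeps its default value.
result = tool.run(
    url="https://firecrawl.dev",
    maxDepth=3,                     # one level deeper than the default of 2
    formats=["markdown", "links"],  # skip the default screenshot format
)
```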
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Type
+from typing import Any, Optional, Type
 
 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -12,8 +12,20 @@ except ImportError:
 
 class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
-    crawler_options: Optional[Dict[str, Any]] = Field(
-        default=None, description="Options for crawling"
+    maxDepth: Optional[int] = Field(
+        default=2,
+        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
+    )
+    limit: Optional[int] = Field(
+        default=100, description="Maximum number of pages to crawl."
+    )
+    allowExternalLinks: Optional[bool] = Field(
+        default=False,
+        description="Allows the crawler to follow links that point to external domains.",
+    )
+    formats: Optional[list[str]] = Field(
+        default=["markdown", "screenshot", "links"],
+        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).",
     )
     timeout: Optional[int] = Field(
         default=30000,
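As a quick sanity check (a sketch, not part of the commit), the new schema can be instantiated directly to confirm the defaults above. Pydantic copies mutable field defaults per instance, so the `formats` list default is not shared between instances:

```python
# Exercise the new flat schema; field names and defaults come from the diff.
schema = FirecrawlCrawlWebsiteToolSchema(url="https://firecrawl.dev")

assert schema.maxDepth == 2
assert schema.limit == 100
assert schema.allowExternalLinks is False
assert schema.formats == ["markdown", "screenshot", "links"]
assert schema.timeout == 30000
```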
@@ -64,17 +76,38 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     def _run(
         self,
         url: str,
-        crawler_options: Optional[Dict[str, Any]] = None,
+        maxDepth: Optional[int] = 2,
+        limit: Optional[int] = 100,
+        allowExternalLinks: Optional[bool] = False,
+        formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
         timeout: Optional[int] = 30000,
     ):
-        if crawler_options is None:
-            crawler_options = {}
-
-        options = {
-            "crawlerOptions": crawler_options,
-            "timeout": timeout,
-        }
-        return self._firecrawl.crawl_url(url, options)
+        # Default options for timeout and crawling
+        DEFAULT_TIMEOUT = 30000
+        DEFAULT_CRAWLING_OPTIONS = {
+            "maxDepth": 2,
+            "ignoreSitemap": True,
+            "limit": 100,
+            "allowBackwardLinks": False,
+            "allowExternalLinks": False,
+            "scrapeOptions": {
+                "formats": ["markdown", "screenshot", "links"],
+                "onlyMainContent": True,
+                "timeout": DEFAULT_TIMEOUT,
+            },
+        }
+
+        # Add default options not present as parameters
+        crawling_options = DEFAULT_CRAWLING_OPTIONS
+
+        # Update the values of parameters present
+        crawling_options["maxDepth"] = maxDepth
+        crawling_options["limit"] = limit
+        crawling_options["allowExternalLinks"] = allowExternalLinks
+        crawling_options["scrapeOptions"]["formats"] = formats
+        crawling_options["scrapeOptions"]["timeout"] = timeout
+
+        return self._firecrawl.crawl_url(url, crawling_options)
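To make the merge logic concrete, this is the dict the new `_run` ends up passing to `crawl_url` for the earlier call that set `maxDepth=3` and `formats=["markdown", "links"]`:

```python
# crawling_options as built by _run for
# tool.run(url=..., maxDepth=3, formats=["markdown", "links"]):
crawling_options = {
    "maxDepth": 3,                         # caller override
    "ignoreSitemap": True,                 # fixed default, not a parameter
    "limit": 100,                          # parameter default kept
    "allowBackwardLinks": False,           # fixed default, not a parameter
    "allowExternalLinks": False,           # parameter default kept
    "scrapeOptions": {
        "formats": ["markdown", "links"],  # caller override
        "onlyMainContent": True,           # fixed default, not a parameter
        "timeout": 30000,                  # parameter default kept
    },
}
```

Note that `DEFAULT_CRAWLING_OPTIONS` is rebuilt inside `_run` on every call, so assigning it to `crawling_options` without a copy is harmless here, and the mutable `formats` default in the signature is safe because the list is reassigned, never mutated in place.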