Fix firecrawl tool (Too many positional arguments) (#275)

* Corrected to adapt to firecrawl package use

The previous call style was leading to a "too many positional arguments" error when calling the crawl_url() function

* Corrected to adapt to firecrawl package use

Corrected to avoid the "too many positional arguments" error when calling the firecrawl scrape_url() function

* Corrected to adapt to firecrawl package use

Corrected to avoid the "too many positional arguments" error when calling the firecrawl search() function

* fix: fix firecrawl integration

* feat: support defining Firecrawl using any config

Previously we pre-defined the parameters available for calling Firecrawl; this commit adds support for receiving any parameter and propagating it to Firecrawl (see the sketch after this list)

* docs: added docstrings to the Firecrawl classes
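
A minimal sketch of the pass-through idea described above. The names here (`PassThroughTool`, the `client` argument) are hypothetical illustrations, not the actual tool code:

```python
# Hypothetical illustration of the config pass-through: the tool stores an
# arbitrary dict and expands it as keyword arguments on the client call,
# so no parameter list has to be pre-defined.
from typing import Any, Optional


class PassThroughTool:
    def __init__(self, config: Optional[dict[str, Any]] = None):
        # any Firecrawl parameter can be supplied; nothing is hard-coded
        self.config = config or {}

    def run(self, url: str, client: Any) -> Any:
        # every configured parameter is propagated as a keyword argument
        return client.crawl_url(url, **self.config)
```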

---------

Co-authored-by: Lucas Gomide <lucaslg200@gmail.com>
Authored by benzakritesteur on 2025-04-28 19:57:03 +02:00; committed by GitHub
parent 6909c587c2
commit 82d0209ce2
6 changed files with 158 additions and 150 deletions

View File

@@ -23,35 +23,38 @@ Utilize the FirecrawlScrapeFromWebsiteTool as follows to allow your agent to loa
 ```python
 from crewai_tools import FirecrawlCrawlWebsiteTool
+from firecrawl import ScrapeOptions

-tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
+tool = FirecrawlCrawlWebsiteTool(
+    config={
+        "limit": 100,
+        "scrape_options": ScrapeOptions(formats=["markdown", "html"]),
+        "poll_interval": 30,
+    }
+)
+tool.run(url="firecrawl.dev")
 ```

 ## Arguments
 - `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
-- `url`: The base URL to start crawling from.
-- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
-- `limit`: Optional. Maximum number of pages to crawl.
-- `allowExternalLinks`: Allows the crawler to follow links that point to external domains.
-- `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links).
-- `timeout`: Optional. Timeout in milliseconds for the crawling operation.

 ## Configurations Example
+- `config`: Optional. It contains Firecrawl API parameters.
+This is the default configuration
 ```python
-DEFAULT_CRAWLING_OPTIONS = {
-    "maxDepth": 2,
-    "ignoreSitemap": True,
-    "limit": 100,
-    "allowBackwardLinks": False,
-    "allowExternalLinks": False,
-    "scrapeOptions": {
-        "formats": ["markdown", "screenshot", "links"],
-        "onlyMainContent": True,
-        "timeout": 30000
-    }
-}
+from firecrawl import ScrapeOptions
+
+{
+    "max_depth": 2,
+    "ignore_sitemap": True,
+    "limit": 100,
+    "allow_backward_links": False,
+    "allow_external_links": False,
+    "scrape_options": ScrapeOptions(
+        formats=["markdown", "screenshot", "links"],
+        only_main_content=True,
+        timeout=30000,
+    ),
+}
 ```
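
A usage sketch (not part of this diff) of overriding those defaults. Since the tool's `config` field is populated by a `default_factory` (see the next file), a user-supplied `config` appears to replace the defaults wholesale rather than merge with them — an assumption worth verifying:

```python
# Sketch of overriding the defaults shown above; assumes a supplied config
# replaces the default dict rather than merging with it.
from crewai_tools import FirecrawlCrawlWebsiteTool

shallow_tool = FirecrawlCrawlWebsiteTool(
    config={
        "max_depth": 1,  # depth 1 is the base URL only, per the docs above
        "limit": 10,     # stop after ten pages
    }
)
result = shallow_tool.run(url="firecrawl.dev")
```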

View File

@@ -3,37 +3,36 @@ from typing import Any, Optional, Type
 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr

 try:
-    from firecrawl import FirecrawlApp
+    from firecrawl import FirecrawlApp, ScrapeOptions
 except ImportError:
     FirecrawlApp = Any

 class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
-    maxDepth: Optional[int] = Field(
-        default=2,
-        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
-    )
-    limit: Optional[int] = Field(
-        default=100, description="Maximum number of pages to crawl."
-    )
-    allowExternalLinks: Optional[bool] = Field(
-        default=False,
-        description="Allows the crawler to follow links that point to external domains.",
-    )
-    formats: Optional[list[str]] = Field(
-        default=["markdown", "screenshot", "links"],
-        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).",
-    )
-    timeout: Optional[int] = Field(
-        default=30000,
-        description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
-    )

 class FirecrawlCrawlWebsiteTool(BaseTool):
+    """
+    Tool for crawling websites using Firecrawl. To run this tool, you need to have a Firecrawl API key.
+
+    Args:
+        api_key (str): Your Firecrawl API key.
+        config (dict): Optional. It contains Firecrawl API parameters.
+
+    Default configuration options:
+        max_depth (int): Maximum depth to crawl. Default: 2
+        ignore_sitemap (bool): Whether to ignore sitemap. Default: True
+        limit (int): Maximum number of pages to crawl. Default: 100
+        allow_backward_links (bool): Allow crawling backward links. Default: False
+        allow_external_links (bool): Allow crawling external links. Default: False
+        scrape_options (ScrapeOptions): Options for scraping content
+            - formats (list[str]): Content formats to return. Default: ["markdown", "screenshot", "links"]
+            - only_main_content (bool): Only return main content. Default: True
+            - timeout (int): Timeout in milliseconds. Default: 30000
+    """
+
     model_config = ConfigDict(
         arbitrary_types_allowed=True, validate_assignment=True, frozen=False
     )
@@ -41,6 +40,20 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
     description: str = "Crawl webpages using Firecrawl and return the contents"
     args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
     api_key: Optional[str] = None
+    config: Optional[dict[str, Any]] = Field(
+        default_factory=lambda: {
+            "max_depth": 2,
+            "ignore_sitemap": True,
+            "limit": 100,
+            "allow_backward_links": False,
+            "allow_external_links": False,
+            "scrape_options": ScrapeOptions(
+                formats=["markdown", "screenshot", "links"],
+                only_main_content=True,
+                timeout=30000,
+            ),
+        }
+    )
     _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)

     def __init__(self, api_key: Optional[str] = None, **kwargs):
@@ -73,41 +86,8 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
"`firecrawl-py` package not found, please run `uv add firecrawl-py`"
)
def _run(
self,
url: str,
maxDepth: Optional[int] = 2,
limit: Optional[int] = 100,
allowExternalLinks: Optional[bool] = False,
formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
timeout: Optional[int] = 30000,
):
# Default options for timeout and crawling
DEFAULT_TIMEOUT = 30000
DEFAULT_CRAWLING_OPTIONS = {
"maxDepth": 2,
"ignoreSitemap": True,
"limit": 100,
"allowBackwardLinks": False,
"allowExternalLinks": False,
"scrapeOptions": {
"formats": ["markdown", "screenshot", "links"],
"onlyMainContent": True,
"timeout": DEFAULT_TIMEOUT,
},
}
# Add default options not present as parameters
crawling_options = DEFAULT_CRAWLING_OPTIONS
# Update the values of parameters present
crawling_options["maxDepth"] = maxDepth
crawling_options["limit"] = limit
crawling_options["allowExternalLinks"] = allowExternalLinks
crawling_options["scrapeOptions"]["formats"] = formats
crawling_options["scrapeOptions"]["timeout"] = timeout
return self._firecrawl.crawl_url(url, crawling_options)
def _run(self, url: str):
return self._firecrawl.crawl_url(url, **self.config)
try:
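
The heart of the fix, sketched with a stand-in for the Firecrawl client (the keyword-only signature below is an assumption about newer firecrawl-py, not its exact API): passing the options dict positionally is what raised the "too many positional arguments" error, while expanding it as keyword arguments succeeds.

```python
# Stand-in for the firecrawl client; assumes crawl parameters became
# keyword-only arguments instead of a single positional options dict.
def crawl_url(url: str, *, limit: int = 100, max_depth: int = 2) -> str:
    return f"crawled {url} (limit={limit}, max_depth={max_depth})"

config = {"limit": 10, "max_depth": 1}

# Old call style: the dict lands in a positional slot that no longer exists.
try:
    crawl_url("firecrawl.dev", config)  # type: ignore[call-arg]
except TypeError as exc:
    print(exc)  # e.g. "crawl_url() takes 1 positional argument but 2 were given"

# New call style, matching the fix: expand the config as keyword arguments.
print(crawl_url("firecrawl.dev", **config))
```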

View File

@@ -20,19 +20,27 @@ Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load we
 ```python
 from crewai_tools import FirecrawlScrapeWebsiteTool

-tool = FirecrawlScrapeWebsiteTool(url='firecrawl.dev')
+tool = FirecrawlScrapeWebsiteTool(config={"formats": ['html']})
+tool.run(url="firecrawl.dev")
 ```

 ## Arguments
 - `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
-- `url`: The URL to scrape.
-- `page_options`: Optional.
-  - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
-  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-- `extractor_options`: Optional. Options for LLM-based extraction of structured information from the page content
-  - `mode`: The extraction mode to use, currently supports 'llm-extraction'
-  - `extractionPrompt`: Optional. A prompt describing what information to extract from the page
-  - `extractionSchema`: Optional. The schema for the data to be extracted
-- `timeout`: Optional. Timeout in milliseconds for the request
+- `config`: Optional. It contains Firecrawl API parameters.
+This is the default configuration
+```python
+{
+    "formats": ["markdown"],
+    "only_main_content": True,
+    "include_tags": [],
+    "exclude_tags": [],
+    "headers": {},
+    "wait_for": 0,
+}
+```
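
A usage sketch (not part of this diff) tuning the scrape config for article extraction; the keys are assumed to be exactly those listed in the default configuration above:

```python
# Sketch: scrape only the main article content, dropping navigation chrome.
from crewai_tools import FirecrawlScrapeWebsiteTool

article_tool = FirecrawlScrapeWebsiteTool(
    config={
        "formats": ["markdown"],
        "only_main_content": True,
        "exclude_tags": ["nav", "footer"],  # assumed to take HTML tag names
    }
)
content = article_tool.run(url="firecrawl.dev")
```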

View File

@@ -18,6 +18,21 @@ class FirecrawlScrapeWebsiteToolSchema(BaseModel):
 class FirecrawlScrapeWebsiteTool(BaseTool):
+    """
+    Tool for scraping webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
+
+    Args:
+        api_key (str): Your Firecrawl API key.
+        config (dict): Optional. It contains Firecrawl API parameters.
+
+    Default configuration options:
+        formats (list[str]): Content formats to return. Default: ["markdown"]
+        only_main_content (bool): Only return main content. Default: True
+        include_tags (list[str]): Tags to include. Default: []
+        exclude_tags (list[str]): Tags to exclude. Default: []
+        headers (dict): Headers to include. Default: {}
+    """
+
     model_config = ConfigDict(
         arbitrary_types_allowed=True, validate_assignment=True, frozen=False
     )
@@ -25,6 +40,17 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
     description: str = "Scrape webpages using Firecrawl and return the contents"
     args_schema: Type[BaseModel] = FirecrawlScrapeWebsiteToolSchema
     api_key: Optional[str] = None
+    config: Optional[dict[str, Any]] = Field(
+        default_factory=lambda: {
+            "formats": ["markdown"],
+            "only_main_content": True,
+            "include_tags": [],
+            "exclude_tags": [],
+            "headers": {},
+            "wait_for": 0,
+        }
+    )
     _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)

     def __init__(self, api_key: Optional[str] = None, **kwargs):
@@ -50,21 +76,8 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
         self._firecrawl = FirecrawlApp(api_key=api_key)

-    def _run(
-        self,
-        url: str,
-        timeout: Optional[int] = 30000,
-    ):
-        options = {
-            "formats": ["markdown"],
-            "onlyMainContent": True,
-            "includeTags": [],
-            "excludeTags": [],
-            "headers": {},
-            "waitFor": 0,
-            "timeout": timeout,
-        }
-        return self._firecrawl.scrape_url(url, options)
+    def _run(self, url: str):
+        return self._firecrawl.scrape_url(url, **self.config)

 try:

View File

@@ -20,16 +20,25 @@ Utilize the FirecrawlSearchTool as follows to allow your agent to load websites:
 ```python
 from crewai_tools import FirecrawlSearchTool

-tool = FirecrawlSearchTool(query='what is firecrawl?')
+tool = FirecrawlSearchTool(config={"limit": 5})
+tool.run(query="firecrawl web scraping")
 ```

 ## Arguments
 - `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
 - `query`: The search query string to be used for searching.
-- `page_options`: Optional. Options for result formatting.
-  - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
-  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-  - `fetchPageContent`: Optional. Fetch the full content of the page.
-- `search_options`: Optional. Options for controlling the crawling behavior.
-  - `limit`: Optional. Maximum number of pages to crawl.
+- `config`: Optional. It contains Firecrawl API parameters.
+This is the default configuration
+```python
+{
+    "limit": 5,
+    "tbs": None,
+    "lang": "en",
+    "country": "us",
+    "location": None,
+    "timeout": 60000,
+}
+```
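
And a usage sketch (not part of this diff) for a localized search, again assuming the keys from the default configuration above:

```python
# Sketch: a French-locale search capped at three results.
from crewai_tools import FirecrawlSearchTool

localized_tool = FirecrawlSearchTool(
    config={
        "limit": 3,
        "lang": "fr",
        "country": "fr",
    }
)
results = localized_tool.run(query="firecrawl web scraping")
```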

View File

@@ -17,26 +17,25 @@ except ImportError:
 class FirecrawlSearchToolSchema(BaseModel):
     query: str = Field(description="Search query")
-    limit: Optional[int] = Field(
-        default=5, description="Maximum number of results to return"
-    )
-    tbs: Optional[str] = Field(default=None, description="Time-based search parameter")
-    lang: Optional[str] = Field(
-        default="en", description="Language code for search results"
-    )
-    country: Optional[str] = Field(
-        default="us", description="Country code for search results"
-    )
-    location: Optional[str] = Field(
-        default=None, description="Location parameter for search results"
-    )
-    timeout: Optional[int] = Field(default=60000, description="Timeout in milliseconds")
-    scrape_options: Optional[Dict[str, Any]] = Field(
-        default=None, description="Options for scraping search results"
-    )

 class FirecrawlSearchTool(BaseTool):
+    """
+    Tool for searching webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
+
+    Args:
+        api_key (str): Your Firecrawl API key.
+        config (dict): Optional. It contains Firecrawl API parameters.
+
+    Default configuration options:
+        limit (int): Maximum number of pages to crawl. Default: 5
+        tbs (str): Time before search. Default: None
+        lang (str): Language. Default: "en"
+        country (str): Country. Default: "us"
+        location (str): Location. Default: None
+        timeout (int): Timeout in milliseconds. Default: 60000
+    """
+
     model_config = ConfigDict(
         arbitrary_types_allowed=True, validate_assignment=True, frozen=False
     )
@@ -47,6 +46,16 @@ class FirecrawlSearchTool(BaseTool):
     description: str = "Search webpages using Firecrawl and return the results"
     args_schema: Type[BaseModel] = FirecrawlSearchToolSchema
     api_key: Optional[str] = None
+    config: Optional[dict[str, Any]] = Field(
+        default_factory=lambda: {
+            "limit": 5,
+            "tbs": None,
+            "lang": "en",
+            "country": "us",
+            "location": None,
+            "timeout": 60000,
+        }
+    )
     _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)

     def __init__(self, api_key: Optional[str] = None, **kwargs):
@@ -56,10 +65,9 @@ class FirecrawlSearchTool(BaseTool):
     def _initialize_firecrawl(self) -> None:
         try:
-            if FIRECRAWL_AVAILABLE:
-                self._firecrawl = FirecrawlApp(api_key=self.api_key)
-            else:
-                raise ImportError
+            from firecrawl import FirecrawlApp  # type: ignore
+
+            self._firecrawl = FirecrawlApp(api_key=self.api_key)
         except ImportError:
             import click
@@ -72,7 +80,7 @@ class FirecrawlSearchTool(BaseTool):
                     subprocess.run(["uv", "add", "firecrawl-py"], check=True)
                     from firecrawl import FirecrawlApp

-                    self.firecrawl = FirecrawlApp(api_key=self.api_key)
+                    self._firecrawl = FirecrawlApp(api_key=self.api_key)
                 except subprocess.CalledProcessError:
                     raise ImportError("Failed to install firecrawl-py package")
             else:
@@ -83,27 +91,14 @@ class FirecrawlSearchTool(BaseTool):
     def _run(
         self,
         query: str,
-        limit: Optional[int] = 5,
-        tbs: Optional[str] = None,
-        lang: Optional[str] = "en",
-        country: Optional[str] = "us",
-        location: Optional[str] = None,
-        timeout: Optional[int] = 60000,
-        scrape_options: Optional[Dict[str, Any]] = None,
     ) -> Any:
-        if not self.firecrawl:
+        if not self._firecrawl:
             raise RuntimeError("FirecrawlApp not properly initialized")
-        options = {
-            "limit": limit,
-            "tbs": tbs,
-            "lang": lang,
-            "country": country,
-            "location": location,
-            "timeout": timeout,
-            "scrapeOptions": scrape_options or {},
-        }
-        return self.firecrawl.search(**options)
+        return self._firecrawl.search(
+            query=query,
+            **self.config,
+        )

 try: