mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-15 02:58:30 +00:00
Fix firecrawl tool (Too many positional arguments) (#275)
* Corrected to adapt to firecrawl package use. Was leading to a "too many arguments" error when calling the crawl_url() function * Corrected to adapt to firecrawl package use. Corrected to avoid a "too many arguments" error when calling the firecrawl scrape_url() function * Corrected to adapt to firecrawl package use. Corrected to avoid a "too many arguments" error when calling the firecrawl search() function * fix: fix firecrawl integration * feat: support defining Firecrawl using any config. Currently we pre-define the available parameters to call Firecrawl; this commit adds support for receiving any parameter and propagating it * docs: added docstrings to Firecrawl classes --------- Co-authored-by: Lucas Gomide <lucaslg200@gmail.com>
This commit is contained in:
@@ -23,35 +23,38 @@ Utilize the FirecrawlScrapeFromWebsiteTool as follows to allow your agent to loa
|
||||
|
||||
```python
|
||||
from crewai_tools import FirecrawlCrawlWebsiteTool
|
||||
from firecrawl import ScrapeOptions
|
||||
|
||||
tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
|
||||
tool = FirecrawlCrawlWebsiteTool(
|
||||
config={
|
||||
"limit": 100,
|
||||
"scrape_options": ScrapeOptions(formats=["markdown", "html"]),
|
||||
"poll_interval": 30,
|
||||
}
|
||||
)
|
||||
tool.run(url="firecrawl.dev")
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
|
||||
- `url`: The base URL to start crawling from.
|
||||
- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.
|
||||
- `limit`: Optional. Maximum number of pages to crawl.
|
||||
- `allowExternalLinks`: Allows the crawler to follow links that point to external domains.
|
||||
- `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links).
|
||||
- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
|
||||
|
||||
## Configurations Example
|
||||
- `config`: Optional. It contains Firecrawl API parameters.
|
||||
|
||||
This is the default configuration
|
||||
|
||||
```python
|
||||
DEFAULT_CRAWLING_OPTIONS = {
|
||||
"maxDepth": 2,
|
||||
"ignoreSitemap": True,
|
||||
"limit": 100,
|
||||
"allowBackwardLinks": False,
|
||||
"allowExternalLinks": False,
|
||||
"scrapeOptions": {
|
||||
"formats": ["markdown", "screenshot", "links"],
|
||||
"onlyMainContent": True,
|
||||
"timeout": 30000
|
||||
}
|
||||
}
|
||||
from firecrawl import ScrapeOptions
|
||||
|
||||
{
|
||||
"max_depth": 2,
|
||||
"ignore_sitemap": True,
|
||||
"limit": 100,
|
||||
"allow_backward_links": False,
|
||||
"allow_external_links": False,
|
||||
"scrape_options": ScrapeOptions(
|
||||
formats=["markdown", "screenshot", "links"],
|
||||
only_main_content=True,
|
||||
timeout=30000,
|
||||
),
|
||||
}
|
||||
```
|
||||
|
||||
@@ -3,37 +3,36 @@ from typing import Any, Optional, Type
|
||||
from crewai.tools import BaseTool
|
||||
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
|
||||
|
||||
|
||||
try:
|
||||
from firecrawl import FirecrawlApp
|
||||
from firecrawl import FirecrawlApp, ScrapeOptions
|
||||
except ImportError:
|
||||
FirecrawlApp = Any
|
||||
|
||||
|
||||
class FirecrawlCrawlWebsiteToolSchema(BaseModel):
|
||||
url: str = Field(description="Website URL")
|
||||
maxDepth: Optional[int] = Field(
|
||||
default=2,
|
||||
description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
|
||||
)
|
||||
limit: Optional[int] = Field(
|
||||
default=100, description="Maximum number of pages to crawl."
|
||||
)
|
||||
allowExternalLinks: Optional[bool] = Field(
|
||||
default=False,
|
||||
description="Allows the crawler to follow links that point to external domains.",
|
||||
)
|
||||
formats: Optional[list[str]] = Field(
|
||||
default=["markdown", "screenshot", "links"],
|
||||
description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).",
|
||||
)
|
||||
timeout: Optional[int] = Field(
|
||||
default=30000,
|
||||
description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
|
||||
)
|
||||
|
||||
|
||||
class FirecrawlCrawlWebsiteTool(BaseTool):
|
||||
"""
|
||||
Tool for crawling websites using Firecrawl. To run this tool, you need to have a Firecrawl API key.
|
||||
|
||||
Args:
|
||||
api_key (str): Your Firecrawl API key.
|
||||
config (dict): Optional. It contains Firecrawl API parameters.
|
||||
|
||||
Default configuration options:
|
||||
max_depth (int): Maximum depth to crawl. Default: 2
|
||||
ignore_sitemap (bool): Whether to ignore sitemap. Default: True
|
||||
limit (int): Maximum number of pages to crawl. Default: 100
|
||||
allow_backward_links (bool): Allow crawling backward links. Default: False
|
||||
allow_external_links (bool): Allow crawling external links. Default: False
|
||||
scrape_options (ScrapeOptions): Options for scraping content
|
||||
- formats (list[str]): Content formats to return. Default: ["markdown", "screenshot", "links"]
|
||||
- only_main_content (bool): Only return main content. Default: True
|
||||
- timeout (int): Timeout in milliseconds. Default: 30000
|
||||
"""
|
||||
|
||||
model_config = ConfigDict(
|
||||
arbitrary_types_allowed=True, validate_assignment=True, frozen=False
|
||||
)
|
||||
@@ -41,6 +40,20 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
|
||||
description: str = "Crawl webpages using Firecrawl and return the contents"
|
||||
args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
|
||||
api_key: Optional[str] = None
|
||||
config: Optional[dict[str, Any]] = Field(
|
||||
default_factory=lambda: {
|
||||
"max_depth": 2,
|
||||
"ignore_sitemap": True,
|
||||
"limit": 100,
|
||||
"allow_backward_links": False,
|
||||
"allow_external_links": False,
|
||||
"scrape_options": ScrapeOptions(
|
||||
formats=["markdown", "screenshot", "links"],
|
||||
only_main_content=True,
|
||||
timeout=30000,
|
||||
),
|
||||
}
|
||||
)
|
||||
_firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None, **kwargs):
|
||||
@@ -73,41 +86,8 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
|
||||
"`firecrawl-py` package not found, please run `uv add firecrawl-py`"
|
||||
)
|
||||
|
||||
def _run(
|
||||
self,
|
||||
url: str,
|
||||
maxDepth: Optional[int] = 2,
|
||||
limit: Optional[int] = 100,
|
||||
allowExternalLinks: Optional[bool] = False,
|
||||
formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
|
||||
timeout: Optional[int] = 30000,
|
||||
):
|
||||
# Default options for timeout and crawling
|
||||
DEFAULT_TIMEOUT = 30000
|
||||
DEFAULT_CRAWLING_OPTIONS = {
|
||||
"maxDepth": 2,
|
||||
"ignoreSitemap": True,
|
||||
"limit": 100,
|
||||
"allowBackwardLinks": False,
|
||||
"allowExternalLinks": False,
|
||||
"scrapeOptions": {
|
||||
"formats": ["markdown", "screenshot", "links"],
|
||||
"onlyMainContent": True,
|
||||
"timeout": DEFAULT_TIMEOUT,
|
||||
},
|
||||
}
|
||||
|
||||
# Add default options not present as parameters
|
||||
crawling_options = DEFAULT_CRAWLING_OPTIONS
|
||||
|
||||
# Update the values of parameters present
|
||||
crawling_options["maxDepth"] = maxDepth
|
||||
crawling_options["limit"] = limit
|
||||
crawling_options["allowExternalLinks"] = allowExternalLinks
|
||||
crawling_options["scrapeOptions"]["formats"] = formats
|
||||
crawling_options["scrapeOptions"]["timeout"] = timeout
|
||||
|
||||
return self._firecrawl.crawl_url(url, crawling_options)
|
||||
def _run(self, url: str):
|
||||
return self._firecrawl.crawl_url(url, **self.config)
|
||||
|
||||
|
||||
try:
|
||||
|
||||
@@ -20,19 +20,27 @@ Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load we
|
||||
```python
|
||||
from crewai_tools import FirecrawlScrapeWebsiteTool
|
||||
|
||||
tool = FirecrawlScrapeWebsiteTool(url='firecrawl.dev')
|
||||
tool = FirecrawlScrapeWebsiteTool(config={"formats": ['html']})
|
||||
tool.run(url="firecrawl.dev")
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
|
||||
- `url`: The URL to scrape.
|
||||
- `page_options`: Optional.
|
||||
- `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
|
||||
- `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
|
||||
- `extractor_options`: Optional. Options for LLM-based extraction of structured information from the page content
|
||||
- `mode`: The extraction mode to use, currently supports 'llm-extraction'
|
||||
- `extractionPrompt`: Optional. A prompt describing what information to extract from the page
|
||||
- `extractionSchema`: Optional. The schema for the data to be extracted
|
||||
- `timeout`: Optional. Timeout in milliseconds for the request
|
||||
- `config`: Optional. It contains Firecrawl API parameters.
|
||||
|
||||
|
||||
This is the default configuration
|
||||
|
||||
```python
|
||||
{
|
||||
"formats": ["markdown"],
|
||||
"only_main_content": True,
|
||||
"include_tags": [],
|
||||
"exclude_tags": [],
|
||||
"headers": {},
|
||||
"wait_for": 0,
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -18,6 +18,21 @@ class FirecrawlScrapeWebsiteToolSchema(BaseModel):
|
||||
|
||||
|
||||
class FirecrawlScrapeWebsiteTool(BaseTool):
|
||||
"""
|
||||
Tool for scraping webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
|
||||
|
||||
Args:
|
||||
api_key (str): Your Firecrawl API key.
|
||||
config (dict): Optional. It contains Firecrawl API parameters.
|
||||
|
||||
Default configuration options:
|
||||
formats (list[str]): Content formats to return. Default: ["markdown"]
|
||||
only_main_content (bool): Only return main content. Default: True
|
||||
include_tags (list[str]): Tags to include. Default: []
|
||||
exclude_tags (list[str]): Tags to exclude. Default: []
|
||||
headers (dict): Headers to include. Default: {}
|
||||
"""
|
||||
|
||||
model_config = ConfigDict(
|
||||
arbitrary_types_allowed=True, validate_assignment=True, frozen=False
|
||||
)
|
||||
@@ -25,6 +40,17 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
|
||||
description: str = "Scrape webpages using Firecrawl and return the contents"
|
||||
args_schema: Type[BaseModel] = FirecrawlScrapeWebsiteToolSchema
|
||||
api_key: Optional[str] = None
|
||||
config: Optional[dict[str, Any]] = Field(
|
||||
default_factory=lambda: {
|
||||
"formats": ["markdown"],
|
||||
"only_main_content": True,
|
||||
"include_tags": [],
|
||||
"exclude_tags": [],
|
||||
"headers": {},
|
||||
"wait_for": 0,
|
||||
}
|
||||
)
|
||||
|
||||
_firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None, **kwargs):
|
||||
@@ -50,21 +76,8 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
|
||||
|
||||
self._firecrawl = FirecrawlApp(api_key=api_key)
|
||||
|
||||
def _run(
|
||||
self,
|
||||
url: str,
|
||||
timeout: Optional[int] = 30000,
|
||||
):
|
||||
options = {
|
||||
"formats": ["markdown"],
|
||||
"onlyMainContent": True,
|
||||
"includeTags": [],
|
||||
"excludeTags": [],
|
||||
"headers": {},
|
||||
"waitFor": 0,
|
||||
"timeout": timeout,
|
||||
}
|
||||
return self._firecrawl.scrape_url(url, options)
|
||||
def _run(self, url: str):
|
||||
return self._firecrawl.scrape_url(url, **self.config)
|
||||
|
||||
|
||||
try:
|
||||
|
||||
@@ -20,16 +20,25 @@ Utilize the FirecrawlSearchTool as follows to allow your agent to load websites:
|
||||
```python
|
||||
from crewai_tools import FirecrawlSearchTool
|
||||
|
||||
tool = FirecrawlSearchTool(query='what is firecrawl?')
|
||||
tool = FirecrawlSearchTool(config={"limit": 5})
|
||||
tool.run(query="firecrawl web scraping")
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
|
||||
- `query`: The search query string to be used for searching.
|
||||
- `page_options`: Optional. Options for result formatting.
|
||||
- `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
|
||||
- `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
|
||||
- `fetchPageContent`: Optional. Fetch the full content of the page.
|
||||
- `search_options`: Optional. Options for controlling the crawling behavior.
|
||||
- `limit`: Optional. Maximum number of pages to crawl.
|
||||
- `config`: Optional. It contains Firecrawl API parameters.
|
||||
|
||||
|
||||
This is the default configuration
|
||||
|
||||
```python
|
||||
{
|
||||
"limit": 5,
|
||||
"tbs": None,
|
||||
"lang": "en",
|
||||
"country": "us",
|
||||
"location": None,
|
||||
"timeout": 60000,
|
||||
}
|
||||
```
|
||||
|
||||
@@ -17,26 +17,25 @@ except ImportError:
|
||||
|
||||
class FirecrawlSearchToolSchema(BaseModel):
|
||||
query: str = Field(description="Search query")
|
||||
limit: Optional[int] = Field(
|
||||
default=5, description="Maximum number of results to return"
|
||||
)
|
||||
tbs: Optional[str] = Field(default=None, description="Time-based search parameter")
|
||||
lang: Optional[str] = Field(
|
||||
default="en", description="Language code for search results"
|
||||
)
|
||||
country: Optional[str] = Field(
|
||||
default="us", description="Country code for search results"
|
||||
)
|
||||
location: Optional[str] = Field(
|
||||
default=None, description="Location parameter for search results"
|
||||
)
|
||||
timeout: Optional[int] = Field(default=60000, description="Timeout in milliseconds")
|
||||
scrape_options: Optional[Dict[str, Any]] = Field(
|
||||
default=None, description="Options for scraping search results"
|
||||
)
|
||||
|
||||
|
||||
class FirecrawlSearchTool(BaseTool):
|
||||
"""
|
||||
Tool for searching webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
|
||||
|
||||
Args:
|
||||
api_key (str): Your Firecrawl API key.
|
||||
config (dict): Optional. It contains Firecrawl API parameters.
|
||||
|
||||
Default configuration options:
|
||||
limit (int): Maximum number of pages to crawl. Default: 5
|
||||
tbs (str): Time before search. Default: None
|
||||
lang (str): Language. Default: "en"
|
||||
country (str): Country. Default: "us"
|
||||
location (str): Location. Default: None
|
||||
timeout (int): Timeout in milliseconds. Default: 60000
|
||||
"""
|
||||
|
||||
model_config = ConfigDict(
|
||||
arbitrary_types_allowed=True, validate_assignment=True, frozen=False
|
||||
)
|
||||
@@ -47,6 +46,16 @@ class FirecrawlSearchTool(BaseTool):
|
||||
description: str = "Search webpages using Firecrawl and return the results"
|
||||
args_schema: Type[BaseModel] = FirecrawlSearchToolSchema
|
||||
api_key: Optional[str] = None
|
||||
config: Optional[dict[str, Any]] = Field(
|
||||
default_factory=lambda: {
|
||||
"limit": 5,
|
||||
"tbs": None,
|
||||
"lang": "en",
|
||||
"country": "us",
|
||||
"location": None,
|
||||
"timeout": 60000,
|
||||
}
|
||||
)
|
||||
_firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None, **kwargs):
|
||||
@@ -56,10 +65,9 @@ class FirecrawlSearchTool(BaseTool):
|
||||
|
||||
def _initialize_firecrawl(self) -> None:
|
||||
try:
|
||||
if FIRECRAWL_AVAILABLE:
|
||||
self._firecrawl = FirecrawlApp(api_key=self.api_key)
|
||||
else:
|
||||
raise ImportError
|
||||
from firecrawl import FirecrawlApp # type: ignore
|
||||
|
||||
self._firecrawl = FirecrawlApp(api_key=self.api_key)
|
||||
except ImportError:
|
||||
import click
|
||||
|
||||
@@ -72,7 +80,7 @@ class FirecrawlSearchTool(BaseTool):
|
||||
subprocess.run(["uv", "add", "firecrawl-py"], check=True)
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
self.firecrawl = FirecrawlApp(api_key=self.api_key)
|
||||
self._firecrawl = FirecrawlApp(api_key=self.api_key)
|
||||
except subprocess.CalledProcessError:
|
||||
raise ImportError("Failed to install firecrawl-py package")
|
||||
else:
|
||||
@@ -83,27 +91,14 @@ class FirecrawlSearchTool(BaseTool):
|
||||
def _run(
|
||||
self,
|
||||
query: str,
|
||||
limit: Optional[int] = 5,
|
||||
tbs: Optional[str] = None,
|
||||
lang: Optional[str] = "en",
|
||||
country: Optional[str] = "us",
|
||||
location: Optional[str] = None,
|
||||
timeout: Optional[int] = 60000,
|
||||
scrape_options: Optional[Dict[str, Any]] = None,
|
||||
) -> Any:
|
||||
if not self.firecrawl:
|
||||
if not self._firecrawl:
|
||||
raise RuntimeError("FirecrawlApp not properly initialized")
|
||||
|
||||
options = {
|
||||
"limit": limit,
|
||||
"tbs": tbs,
|
||||
"lang": lang,
|
||||
"country": country,
|
||||
"location": location,
|
||||
"timeout": timeout,
|
||||
"scrapeOptions": scrape_options or {},
|
||||
}
|
||||
return self.firecrawl.search(**options)
|
||||
return self._firecrawl.search(
|
||||
query=query,
|
||||
**self.config,
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user