Addressed review comments and made further improvements
@@ -4,6 +4,10 @@
[Firecrawl](https://firecrawl.dev) is a platform for crawling any website and converting it into clean markdown or structured data.
## Version Compatibility
This implementation is compatible with the Firecrawl API v1.
## Installation
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in the `FIRECRAWL_API_KEY` environment variable.
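
For example, the key can be set from Python before the tool is used (illustrative only; `"fc-..."` is a placeholder for your own key, and exporting the variable in your shell works just as well):

```python
import os

# Placeholder key obtained from firecrawl.dev
os.environ["FIRECRAWL_API_KEY"] = "fc-..."
```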
@@ -27,13 +31,27 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
- `url`: The base URL to start crawling from.
- `page_options`: Optional. Options controlling how each page's content is returned.
  - `onlyMainContent`: Optional. Only return the main content of the page, excluding headers, navs, footers, etc.
  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output an `html` key in the response.
- `crawler_options`: Optional. Options for controlling the crawling behavior.
  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
  - `limit`: Optional. Maximum number of pages to crawl.
  - `scrapeOptions`: Optional. Additional options for controlling the crawler.
    - `formats`: Optional. Formats for the page's content to be returned (e.g. markdown, html, screenshot, links).
    - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
- `limit`: Optional. Maximum number of pages to crawl.
- `allowExternalLinks`: Optional. Allows the crawler to follow links that point to external domains.
- `formats`: Optional. Formats for the page's content to be returned (e.g. markdown, html, screenshot, links).
- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
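
A minimal usage sketch combining the arguments listed above (illustrative only; the URL and parameter values are placeholders):

```python
from crewai_tools import FirecrawlCrawlWebsiteTool

tool = FirecrawlCrawlWebsiteTool()  # picks up FIRECRAWL_API_KEY from the environment
result = tool.run(
    url="https://firecrawl.dev",
    maxDepth=2,
    limit=50,
    allowExternalLinks=False,
    formats=["markdown", "links"],
    timeout=30000,
)
```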
## Configurations Example
This is the default configuration:
```python
DEFAULT_CRAWLING_OPTIONS = {
    "maxDepth": 2,
    "ignoreSitemap": True,
    "limit": 100,
    "allowBackwardLinks": False,
    "allowExternalLinks": False,
    "scrapeOptions": {
        "formats": ["markdown", "screenshot", "links"],
        "onlyMainContent": True,
        "timeout": 30000
    }
}
```
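
As a rough sketch of how these defaults interact with the parameters described above (mirroring the update logic in the tool's `_run`), explicitly passed values simply overwrite the corresponding default entries:

```python
import copy

# Illustrative only: start from the defaults and overwrite what was passed.
options = copy.deepcopy(DEFAULT_CRAWLING_OPTIONS)
options["limit"] = 25
options["scrapeOptions"]["formats"] = ["markdown"]
# maxDepth, onlyMainContent, timeout, etc. keep their default values.
```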
@@ -12,9 +12,18 @@ except ImportError:
class FirecrawlCrawlWebsiteToolSchema(BaseModel):
    url: str = Field(description="Website URL")
    crawler_options: Optional[Dict[str, Any]] = Field(
        default=None, description="Options for crawling"
    )
    maxDepth: Optional[int] = Field(
        default=2,
        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
    )
    limit: Optional[int] = Field(
        default=100,
        description="Maximum number of pages to crawl.",
    )
    allowExternalLinks: Optional[bool] = Field(
        default=False,
        description="Allows the crawler to follow links that point to external domains.",
    )
    formats: Optional[list[str]] = Field(
        default=["markdown", "screenshot", "links"],
        description="Formats for the page's content to be returned (eg. markdown, html, screenshot, links).",
    )
    timeout: Optional[int] = Field(
        default=30000,
        description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
    )
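
# Illustrative sketch, not part of this diff: instantiating the schema shows
# how pydantic fills in the defaults declared above.
example = FirecrawlCrawlWebsiteToolSchema(url="https://firecrawl.dev", limit=50)
assert example.maxDepth == 2 and example.timeout == 30000 and example.limit == 50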
@@ -30,6 +39,7 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
    args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
    api_key: Optional[str] = None
    _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
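
# Illustrative note, not part of this diff: the key can be passed explicitly
# instead of relying on the FIRECRAWL_API_KEY environment variable, e.g.
#   tool = FirecrawlCrawlWebsiteTool(api_key=os.getenv("FIRECRAWL_API_KEY"))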
@@ -64,26 +74,38 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
    def _run(
        self,
        url: str,
        crawler_options: Optional[Dict[str, Any]] = None,
        maxDepth: Optional[int] = 2,
        limit: Optional[int] = 100,
        allowExternalLinks: Optional[bool] = False,
        formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
        timeout: Optional[int] = 30000,
    ):
        # Default options for timeout and crawling
        DEFAULT_TIMEOUT = 30000
        DEFAULT_CRAWLING_OPTIONS = {
            "maxDepth": 2,
            "ignoreSitemap": True,
            "limit": 100,
            "allowBackwardLinks": False,
            "allowExternalLinks": False,
            "scrapeOptions": {
                "formats": ["markdown", "screenshot", "links"],
                "onlyMainContent": True,
                "timeout": DEFAULT_TIMEOUT,
            },
        }

        # Add default options not present as parameters
        crawling_options = DEFAULT_CRAWLING_OPTIONS

        # Update the values of parameters present
        crawling_options["maxDepth"] = maxDepth
        crawling_options["limit"] = limit
        crawling_options["allowExternalLinks"] = allowExternalLinks
        crawling_options["scrapeOptions"]["formats"] = formats
        crawling_options["scrapeOptions"]["timeout"] = timeout

        return self._firecrawl.crawl_url(url, crawling_options)
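
# Illustrative sketch (assumes the standard crewai Agent API; role, goal and
# backstory are placeholders, not part of this diff): wiring the tool into an agent.
from crewai import Agent

crawler = FirecrawlCrawlWebsiteTool()
researcher = Agent(
    role="Web researcher",
    goal="Summarise the crawled site content",
    backstory="Collects website content for downstream analysis",
    tools=[crawler],
)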