Addressed review comments and made further improvements

Shady Ali
2025-03-28 16:58:47 +02:00
parent 5af2108307
commit e0adb4695c
2 changed files with 68 additions and 28 deletions

View File

@@ -4,6 +4,10 @@
[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean markdown or structured data.
## Version Compatibility
This implementation is compatible with the Firecrawl API v1.
## Installation
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
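
  A minimal setup sketch (the key value is a placeholder; exporting the variable in your shell works just as well as setting it from Python):

  ```python
  import os

  # Replace with your own key from firecrawl.dev
  os.environ["FIRECRAWL_API_KEY"] = "YOUR_API_KEY"
  ```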
@@ -27,13 +31,27 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
- `url`: The base URL to start crawling from.
- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
- `limit`: Optional. Maximum number of pages to crawl.
- `allowExternalLinks`: Optional. Allows the crawler to follow links that point to external domains.
- `formats`: Optional. Formats for the page's content to be returned (e.g. markdown, html, screenshot, links).
- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
## Configuration Example
This is the default configuration:
```python
DEFAULT_CRAWLING_OPTIONS = {
    "maxDepth": 2,
    "ignoreSitemap": True,
    "limit": 100,
    "allowBackwardLinks": False,
    "allowExternalLinks": False,
    "scrapeOptions": {
        "formats": ["markdown", "screenshot", "links"],
        "onlyMainContent": True,
        "timeout": 30000
    }
}
```
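
Below is a minimal usage sketch that overrides a few of these defaults. It assumes the tool is importable from `crewai_tools`, that the `firecrawl-py` package is installed, and that `FIRECRAWL_API_KEY` is set in the environment; any option not passed keeps the default shown above.

```python
from crewai_tools import FirecrawlCrawlWebsiteTool

# The API key is read from the FIRECRAWL_API_KEY environment variable,
# or it can be passed explicitly with api_key="...".
tool = FirecrawlCrawlWebsiteTool()

# Crawl up to 20 pages, three levels deep, returning markdown and links only.
result = tool.run(
    url="https://firecrawl.dev",
    maxDepth=3,
    limit=20,
    formats=["markdown", "links"],
)
print(result)
```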

View File

@@ -12,9 +12,18 @@ except ImportError:
class FirecrawlCrawlWebsiteToolSchema(BaseModel):
    url: str = Field(description="Website URL")
    maxDepth: Optional[int] = Field(
        default=2,
        description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.",
    )
    limit: Optional[int] = Field(
        default=100,
        description="Maximum number of pages to crawl.",
    )
    allowExternalLinks: Optional[bool] = Field(
        default=False,
        description="Allows the crawler to follow links that point to external domains.",
    )
    formats: Optional[list[str]] = Field(
        default=["markdown", "screenshot", "links"],
        description="Formats for the page's content to be returned (e.g. markdown, html, screenshot, links).",
    )
    timeout: Optional[int] = Field(
        default=30000,
        description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
    )
@@ -30,6 +39,7 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
    args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
    api_key: Optional[str] = None
    _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        super().__init__(**kwargs)
@@ -64,26 +74,38 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
    def _run(
        self,
        url: str,
        maxDepth: Optional[int] = 2,
        limit: Optional[int] = 100,
        allowExternalLinks: Optional[bool] = False,
        formats: Optional[list[str]] = ["markdown", "screenshot", "links"],
        timeout: Optional[int] = 30000,
    ):
        # Default options for timeout and crawling
        DEFAULT_TIMEOUT = 30000
        DEFAULT_CRAWLING_OPTIONS = {
            "maxDepth": 2,
            "ignoreSitemap": True,
            "limit": 100,
            "allowBackwardLinks": False,
            "allowExternalLinks": False,
            "scrapeOptions": {
                "formats": ["markdown", "screenshot", "links"],
                "onlyMainContent": True,
                "timeout": DEFAULT_TIMEOUT,
            },
        }

        # Start from the defaults so options not exposed as parameters keep their values
        crawling_options = DEFAULT_CRAWLING_OPTIONS

        # Override the options that are exposed as parameters
        crawling_options["maxDepth"] = maxDepth
        crawling_options["limit"] = limit
        crawling_options["allowExternalLinks"] = allowExternalLinks
        crawling_options["scrapeOptions"]["formats"] = formats
        crawling_options["scrapeOptions"]["timeout"] = timeout

        return self._firecrawl.crawl_url(url, crawling_options)
try:
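
To make the merge logic in `_run` easier to follow, here is a small, self-contained sketch of the same behaviour. The helper `build_crawling_options` and the use of `copy.deepcopy` are assumptions introduced for this example only (the tool itself rebuilds its defaults inside `_run` on every call); it is not part of the diff above.

```python
import copy

# Mirrors DEFAULT_CRAWLING_OPTIONS from the tool above.
DEFAULT_CRAWLING_OPTIONS = {
    "maxDepth": 2,
    "ignoreSitemap": True,
    "limit": 100,
    "allowBackwardLinks": False,
    "allowExternalLinks": False,
    "scrapeOptions": {
        "formats": ["markdown", "screenshot", "links"],
        "onlyMainContent": True,
        "timeout": 30000,
    },
}

def build_crawling_options(maxDepth=2, limit=100, allowExternalLinks=False,
                           formats=None, timeout=30000):
    # Copy the defaults so the module-level dict is never mutated,
    # then overwrite only the options exposed as parameters.
    options = copy.deepcopy(DEFAULT_CRAWLING_OPTIONS)
    options["maxDepth"] = maxDepth
    options["limit"] = limit
    options["allowExternalLinks"] = allowExternalLinks
    options["scrapeOptions"]["formats"] = formats or ["markdown", "screenshot", "links"]
    options["scrapeOptions"]["timeout"] = timeout
    return options

# Example: only maxDepth and limit are overridden; everything else keeps its default.
print(build_crawling_options(maxDepth=3, limit=20))
```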