added Firecrawl tools

2026-05-01 23:32:39 +00:00 · 2024-05-16 11:20:36 -03:00
parent 53c7d815ae
commit a51a7000c5
6 changed files with 216 additions and 0 deletions
--- a/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md
+++ b/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md
@@ -0,0 +1,38 @@
+# FirecrawlScrapeWebsiteTool
+
+## Description
+
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean markdown or structured data.
+
+## Installation
+
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
+
+```
+pip install firecrawl-py 'crewai[tools]'
+```
+
+## Example
+
+Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load websites:
+
+```python
+from crewai_tools import FirecrawlScrapeWebsiteTool
+
+tool = FirecrawlScrapeWebsiteTool(url='firecrawl.dev')
+```
+
+## Arguments
+
+- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
+- `url`: The URL to scrape.
+- `page_options`: Optional. Options for page scraping:
+  - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
+  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
+- `extractor_options`: Optional. Options for LLM-based extraction of structured information from the page content
+  - `mode`: The extraction mode to use, currently supports 'llm-extraction'
+  - `extractionPrompt`: Optional. A prompt describing what information to extract from the page
+  - `extractionSchema`: Optional. The schema for the data to be extracted
+- `timeout`: Optional. Timeout in milliseconds for the request
+
--- a/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py
+++ b/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py
@@ -0,0 +1,35 @@
+from typing import Optional, Any, Type, Dict
+from pydantic.v1 import BaseModel, Field
+from crewai_tools.tools.base_tool import BaseTool
+
+class FirecrawlScrapeWebsiteToolSchema(BaseModel):
+    url: str = Field(description="Website URL")
+    page_options: Optional[Dict[str, Any]] = Field(default=None, description="Options for page scraping")
+    extractor_options: Optional[Dict[str, Any]] = Field(default=None, description="Options for data extraction")
+    timeout: Optional[int] = Field(default=None, description="Timeout for the scraping operation")
+
+class FirecrawlScrapeWebsiteTool(BaseTool):
+    name: str = "Firecrawl web scrape tool"
+    description: str = "Scrape webpages url using Firecrawl and return the contents"
+    args_schema: Type[BaseModel] = FirecrawlScrapeWebsiteToolSchema
+    api_key: Optional[str] = None
+    firecrawl: Optional[Any] = None
+
+    def __init__(self, api_key: Optional[str] = None, **kwargs):
+        super().__init__(**kwargs)
+        try:
+            from firecrawl import FirecrawlApp # type: ignore
+        except ImportError:
+           raise ImportError(
+               "`firecrawl` package not found, please run `pip install firecrawl-py`"
+           )
+
+        self.firecrawl = FirecrawlApp(api_key=api_key)
+
+    def _run(self, url: str, page_options: Optional[Dict[str, Any]] = None, extractor_options: Optional[Dict[str, Any]] = None, timeout: Optional[int] = None):
+        options = {
+            "pageOptions": page_options,
+            "extractorOptions": extractor_options,
+            "timeout": timeout
+        }
+        return self.firecrawl.scrape_url(url, options)