mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 16:18:30 +00:00
Merge pull request #34 from rafaelsideguide/added-firecrawl-tools
added Firecrawl tools
This commit is contained in:
@@ -0,0 +1,42 @@
|
||||
# FirecrawlCrawlWebsiteTool
|
||||
|
||||
## Description
|
||||
|
||||
[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean markdown or structured data.
|
||||
|
||||
## Installation
|
||||
|
||||
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
|
||||
- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
|
||||
|
||||
```
|
||||
pip install firecrawl-py 'crewai[tools]'
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
Utilize the FirecrawlCrawlWebsiteTool as follows to allow your agent to load websites:
|
||||
|
||||
```python
|
||||
from crewai_tools import FirecrawlCrawlWebsiteTool
|
||||
|
||||
tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
|
||||
- `url`: The base URL to start crawling from.
|
||||
- `page_options`: Optional.
|
||||
- `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
|
||||
- `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
|
||||
- `crawler_options`: Optional. Options for controlling the crawling behavior.
|
||||
- `includes`: Optional. URL patterns to include in the crawl.
|
||||
- `exclude`: Optional. URL patterns to exclude from the crawl.
|
||||
- `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
|
||||
- `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
|
||||
- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
|
||||
- `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
|
||||
- `limit`: Optional. Maximum number of pages to crawl.
|
||||
- `timeout`: Optional. Timeout in milliseconds for the crawling operation.
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
from typing import Optional, Any, Type, Dict, List
|
||||
from pydantic.v1 import BaseModel, Field
|
||||
from crewai_tools.tools.base_tool import BaseTool
|
||||
|
||||
class FirecrawlCrawlWebsiteToolSchema(BaseModel):
    # Argument schema for the Firecrawl crawl tool: one required URL plus two
    # optional free-form option dictionaries.

    # Required: the URL handed to the crawler.
    url: str = Field(description="Website URL")

    # Optional: forwarded by the tool under the "crawlerOptions" key.
    crawler_options: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Options for crawling",
    )

    # Optional: forwarded by the tool under the "pageOptions" key.
    page_options: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Options for page",
    )
|
||||
|
||||
class FirecrawlCrawlWebsiteTool(BaseTool):
    """Tool that crawls a website through the Firecrawl SDK.

    The `firecrawl` package is imported lazily in the constructor so that the
    module can be imported without the optional dependency installed.
    """

    name: str = "Firecrawl web crawl tool"
    description: str = "Crawl webpages using Firecrawl and return the contents"
    args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
    # API key given to the constructor (None when the caller supplied none).
    api_key: Optional[str] = None
    # FirecrawlApp client instance, created in __init__.
    firecrawl: Optional[Any] = None

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        """Create the tool and its Firecrawl client.

        Args:
            api_key: Firecrawl API key passed to `FirecrawlApp`. May be None.
            **kwargs: Forwarded unchanged to the base tool constructor.

        Raises:
            ImportError: If the `firecrawl` package is not installed.
        """
        super().__init__(**kwargs)
        try:
            from firecrawl import FirecrawlApp  # type: ignore
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            ) from exc

        # Keep the declared `api_key` field in sync with what the client uses;
        # previously it was always left as None.
        self.api_key = api_key
        self.firecrawl = FirecrawlApp(api_key=api_key)

    def _run(
        self,
        url: str,
        crawler_options: Optional[Dict[str, Any]] = None,
        page_options: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Crawl `url` and return whatever `FirecrawlApp.crawl_url` returns.

        Args:
            url: URL to crawl.
            crawler_options: Optional dict sent as `crawlerOptions`.
            page_options: Optional dict sent as `pageOptions`.
        """
        candidates = {
            "crawlerOptions": crawler_options,
            "pageOptions": page_options,
        }
        # Only forward options the caller actually set, instead of sending
        # explicit nulls to the SDK.
        options = {key: value for key, value in candidates.items() if value is not None}
        return self.firecrawl.crawl_url(url, options)
|
||||
@@ -0,0 +1,38 @@
|
||||
# FirecrawlScrapeWebsiteTool
|
||||
|
||||
## Description
|
||||
|
||||
[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean markdown or structured data.
|
||||
|
||||
## Installation
|
||||
|
||||
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
|
||||
- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
|
||||
|
||||
```
|
||||
pip install firecrawl-py 'crewai[tools]'
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load websites:
|
||||
|
||||
```python
|
||||
from crewai_tools import FirecrawlScrapeWebsiteTool
|
||||
|
||||
tool = FirecrawlScrapeWebsiteTool(url='firecrawl.dev')
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
|
||||
- `url`: The URL to scrape.
|
||||
- `page_options`: Optional.
|
||||
- `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
|
||||
- `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
|
||||
- `extractor_options`: Optional. Options for LLM-based extraction of structured information from the page content
|
||||
- `mode`: The extraction mode to use, currently supports 'llm-extraction'
|
||||
- `extractionPrompt`: Optional. A prompt describing what information to extract from the page
|
||||
- `extractionSchema`: Optional. The schema for the data to be extracted
|
||||
- `timeout`: Optional. Timeout in milliseconds for the request
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
from typing import Optional, Any, Type, Dict
|
||||
from pydantic.v1 import BaseModel, Field
|
||||
from crewai_tools.tools.base_tool import BaseTool
|
||||
|
||||
class FirecrawlScrapeWebsiteToolSchema(BaseModel):
    # Argument schema for the Firecrawl scrape tool: one required URL plus
    # optional page/extractor option dictionaries and a timeout.

    # Required: the URL handed to the scraper.
    url: str = Field(description="Website URL")

    # Optional: forwarded by the tool under the "pageOptions" key.
    page_options: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Options for page scraping",
    )

    # Optional: forwarded by the tool under the "extractorOptions" key.
    extractor_options: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Options for data extraction",
    )

    # Optional: forwarded by the tool under the "timeout" key.
    timeout: Optional[int] = Field(
        default=None,
        description="Timeout for the scraping operation",
    )
|
||||
|
||||
class FirecrawlScrapeWebsiteTool(BaseTool):
    """Tool that scrapes a single URL through the Firecrawl SDK.

    The `firecrawl` package is imported lazily in the constructor so that the
    module can be imported without the optional dependency installed.
    """

    name: str = "Firecrawl web scrape tool"
    description: str = "Scrape webpages url using Firecrawl and return the contents"
    args_schema: Type[BaseModel] = FirecrawlScrapeWebsiteToolSchema
    # API key given to the constructor (None when the caller supplied none).
    api_key: Optional[str] = None
    # FirecrawlApp client instance, created in __init__.
    firecrawl: Optional[Any] = None

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        """Create the tool and its Firecrawl client.

        Args:
            api_key: Firecrawl API key passed to `FirecrawlApp`. May be None.
            **kwargs: Forwarded unchanged to the base tool constructor.

        Raises:
            ImportError: If the `firecrawl` package is not installed.
        """
        super().__init__(**kwargs)
        try:
            from firecrawl import FirecrawlApp  # type: ignore
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            ) from exc

        # Keep the declared `api_key` field in sync with what the client uses;
        # previously it was always left as None.
        self.api_key = api_key
        self.firecrawl = FirecrawlApp(api_key=api_key)

    def _run(
        self,
        url: str,
        page_options: Optional[Dict[str, Any]] = None,
        extractor_options: Optional[Dict[str, Any]] = None,
        timeout: Optional[int] = None,
    ) -> Any:
        """Scrape `url` and return whatever `FirecrawlApp.scrape_url` returns.

        Args:
            url: URL to scrape.
            page_options: Optional dict sent as `pageOptions`.
            extractor_options: Optional dict sent as `extractorOptions`.
            timeout: Optional value sent as `timeout`.
        """
        candidates = {
            "pageOptions": page_options,
            "extractorOptions": extractor_options,
            "timeout": timeout,
        }
        # Only forward options the caller actually set, instead of sending
        # explicit nulls (e.g. "timeout": None) to the SDK.
        options = {key: value for key, value in candidates.items() if value is not None}
        return self.firecrawl.scrape_url(url, options)
|
||||
35
src/crewai_tools/tools/firecrawl_search_tool/README.md
Normal file
35
src/crewai_tools/tools/firecrawl_search_tool/README.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# FirecrawlSearchTool
|
||||
|
||||
## Description
|
||||
|
||||
[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean markdown or structured data.
|
||||
|
||||
## Installation
|
||||
|
||||
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
|
||||
- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
|
||||
|
||||
```
|
||||
pip install firecrawl-py 'crewai[tools]'
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
Utilize the FirecrawlSearchTool as follows to allow your agent to load websites:
|
||||
|
||||
```python
|
||||
from crewai_tools import FirecrawlSearchTool
|
||||
|
||||
tool = FirecrawlSearchTool(query='what is firecrawl?')
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
|
||||
- `query`: The search query string to be used for searching.
|
||||
- `page_options`: Optional. Options for result formatting.
|
||||
- `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
|
||||
- `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
|
||||
- `fetchPageContent`: Optional. Fetch the full content of the page.
|
||||
- `search_options`: Optional. Options for controlling the search behavior.
|
||||
- `limit`: Optional. Maximum number of results to return.
|
||||
@@ -0,0 +1,33 @@
|
||||
from typing import Optional, Any, Type, Dict, List
|
||||
from pydantic.v1 import BaseModel, Field
|
||||
from crewai_tools.tools.base_tool import BaseTool
|
||||
|
||||
class FirecrawlSearchToolSchema(BaseModel):
    # Argument schema for the Firecrawl search tool: one required query plus
    # two optional free-form option dictionaries.

    # Required: the search query text.
    query: str = Field(description="Search query")

    # Optional: options describing how results are formatted.
    page_options: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Options for result formatting",
    )

    # Optional: options describing how the search is performed.
    search_options: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Options for searching",
    )
|
||||
|
||||
class FirecrawlSearchTool(BaseTool):
    """Tool that runs a web search through the Firecrawl SDK.

    The `firecrawl` package is imported lazily in the constructor so that the
    module can be imported without the optional dependency installed.
    """

    name: str = "Firecrawl web search tool"
    description: str = "Search webpages using Firecrawl and return the results"
    args_schema: Type[BaseModel] = FirecrawlSearchToolSchema
    # API key given to the constructor (None when the caller supplied none).
    api_key: Optional[str] = None
    # FirecrawlApp client instance, created in __init__.
    firecrawl: Optional[Any] = None

    def __init__(self, api_key: Optional[str] = None, **kwargs):
        """Create the tool and its Firecrawl client.

        Args:
            api_key: Firecrawl API key passed to `FirecrawlApp`. May be None.
            **kwargs: Forwarded unchanged to the base tool constructor.

        Raises:
            ImportError: If the `firecrawl` package is not installed.
        """
        super().__init__(**kwargs)
        try:
            from firecrawl import FirecrawlApp  # type: ignore
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            ) from exc

        # Keep the declared `api_key` field in sync with what the client uses;
        # previously it was always left as None.
        self.api_key = api_key
        self.firecrawl = FirecrawlApp(api_key=api_key)

    def _run(
        self,
        query: str,
        page_options: Optional[Dict[str, Any]] = None,
        result_options: Optional[Dict[str, Any]] = None,
        search_options: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Search for `query` and return whatever `FirecrawlApp.search` returns.

        Args:
            query: Search query string.
            page_options: Optional dict sent as `pageOptions`.
            result_options: Optional dict sent as `resultOptions`. Kept so
                existing callers of this parameter keep working.
            search_options: Optional dict sent as `searchOptions`. This is the
                argument name declared by `FirecrawlSearchToolSchema`; without
                it, a schema-conformant call raised `TypeError`.
        """
        candidates = {
            "pageOptions": page_options,
            "resultOptions": result_options,
            # NOTE(review): "searchOptions" follows the Firecrawl search API
            # naming — confirm against the installed SDK version.
            "searchOptions": search_options,
        }
        # Only forward options the caller actually set, instead of sending
        # explicit nulls to the SDK.
        options = {key: value for key, value in candidates.items() if value is not None}
        return self.firecrawl.search(query, options)
|
||||
Reference in New Issue
Block a user