diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md new file mode 100644 index 000000000..46d011602 --- /dev/null +++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md @@ -0,0 +1,42 @@ +# FirecrawlCrawlWebsiteTool + +## Description + +[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data. + +## Installation + +- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`). +- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package: + +``` +pip install firecrawl-py 'crewai[tools]' +``` + +## Example + +Utilize the FirecrawlScrapeFromWebsiteTool as follows to allow your agent to load websites: + +```python +from crewai_tools import FirecrawlCrawlWebsiteTool + +tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev') +``` + +## Arguments + +- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable. +- `url`: The base URL to start crawling from. +- `page_options`: Optional. + - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc. + - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response. +- `crawler_options`: Optional. Options for controlling the crawling behavior. + - `includes`: Optional. URL patterns to include in the crawl. + - `exclude`: Optional. URL patterns to exclude from the crawl. + - `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan). + - `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents. + - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on. + - `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites. + - `limit`: Optional. Maximum number of pages to crawl. + - `timeout`: Optional. Timeout in milliseconds for the crawling operation. + diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py new file mode 100644 index 000000000..5c796189a --- /dev/null +++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py @@ -0,0 +1,33 @@ +from typing import Optional, Any, Type, Dict, List +from pydantic.v1 import BaseModel, Field +from crewai_tools.tools.base_tool import BaseTool + +class FirecrawlCrawlWebsiteToolSchema(BaseModel): + url: str = Field(description="Website URL") + crawler_options: Optional[Dict[str, Any]] = Field(default=None, description="Options for crawling") + page_options: Optional[Dict[str, Any]] = Field(default=None, description="Options for page") + +class FirecrawlCrawlWebsiteTool(BaseTool): + name: str = "Firecrawl web crawl tool" + description: str = "Crawl webpages using Firecrawl and return the contents" + args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema + api_key: Optional[str] = None + firecrawl: Optional[Any] = None + + def __init__(self, api_key: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + try: + from firecrawl import FirecrawlApp # type: ignore + except ImportError: + raise ImportError( + "`firecrawl` package not found, please run `pip install firecrawl-py`" + ) + + self.firecrawl = FirecrawlApp(api_key=api_key) + + def _run(self, url: str, crawler_options: Optional[Dict[str, Any]] = None, page_options: Optional[Dict[str, Any]] = None): + options = { + "crawlerOptions": crawler_options, + "pageOptions": page_options + } + return self.firecrawl.crawl_url(url, options) \ No newline at end of file diff --git a/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md b/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md new file mode 100644 index 000000000..93570f06b --- /dev/null +++ b/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md @@ -0,0 +1,38 @@ +# FirecrawlScrapeWebsiteTool + +## Description + +[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data. + +## Installation + +- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`). +- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package: + +``` +pip install firecrawl-py 'crewai[tools]' +``` + +## Example + +Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load websites: + +```python +from crewai_tools import FirecrawlScrapeWebsiteTool + +tool = FirecrawlScrapeWebsiteTool(url='firecrawl.dev') +``` + +## Arguments + +- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable. +- `url`: The URL to scrape. +- `page_options`: Optional. + - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc. + - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response. +- `extractor_options`: Optional. Options for LLM-based extraction of structured information from the page content + - `mode`: The extraction mode to use, currently supports 'llm-extraction' + - `extractionPrompt`: Optional. A prompt describing what information to extract from the page + - `extractionSchema`: Optional. The schema for the data to be extracted +- `timeout`: Optional. Timeout in milliseconds for the request + diff --git a/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py b/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py new file mode 100644 index 000000000..8540b13ff --- /dev/null +++ b/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py @@ -0,0 +1,35 @@ +from typing import Optional, Any, Type, Dict +from pydantic.v1 import BaseModel, Field +from crewai_tools.tools.base_tool import BaseTool + +class FirecrawlScrapeWebsiteToolSchema(BaseModel): + url: str = Field(description="Website URL") + page_options: Optional[Dict[str, Any]] = Field(default=None, description="Options for page scraping") + extractor_options: Optional[Dict[str, Any]] = Field(default=None, description="Options for data extraction") + timeout: Optional[int] = Field(default=None, description="Timeout for the scraping operation") + +class FirecrawlScrapeWebsiteTool(BaseTool): + name: str = "Firecrawl web scrape tool" + description: str = "Scrape webpages url using Firecrawl and return the contents" + args_schema: Type[BaseModel] = FirecrawlScrapeWebsiteToolSchema + api_key: Optional[str] = None + firecrawl: Optional[Any] = None + + def __init__(self, api_key: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + try: + from firecrawl import FirecrawlApp # type: ignore + except ImportError: + raise ImportError( + "`firecrawl` package not found, please run `pip install firecrawl-py`" + ) + + self.firecrawl = FirecrawlApp(api_key=api_key) + + def _run(self, url: str, page_options: Optional[Dict[str, Any]] = None, extractor_options: Optional[Dict[str, Any]] = None, timeout: Optional[int] = None): + options = { + "pageOptions": page_options, + "extractorOptions": extractor_options, + "timeout": timeout + } + return self.firecrawl.scrape_url(url, options) \ No newline at end of file diff --git a/src/crewai_tools/tools/firecrawl_search_tool/README.md b/src/crewai_tools/tools/firecrawl_search_tool/README.md new file mode 100644 index 000000000..effb3f3d4 --- /dev/null +++ b/src/crewai_tools/tools/firecrawl_search_tool/README.md @@ -0,0 +1,35 @@ +# FirecrawlSearchTool + +## Description + +[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data. + +## Installation + +- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`). +- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package: + +``` +pip install firecrawl-py 'crewai[tools]' +``` + +## Example + +Utilize the FirecrawlSearchTool as follows to allow your agent to load websites: + +```python +from crewai_tools import FirecrawlSearchTool + +tool = FirecrawlSearchTool(query='what is firecrawl?') +``` + +## Arguments + +- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable. +- `query`: The search query string to be used for searching. +- `page_options`: Optional. Options for result formatting. + - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc. + - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response. + - `fetchPageContent`: Optional. Fetch the full content of the page. +- `search_options`: Optional. Options for controlling the crawling behavior. + - `limit`: Optional. Maximum number of pages to crawl. \ No newline at end of file diff --git a/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py b/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py new file mode 100644 index 000000000..89843f797 --- /dev/null +++ b/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py @@ -0,0 +1,33 @@ +from typing import Optional, Any, Type, Dict, List +from pydantic.v1 import BaseModel, Field +from crewai_tools.tools.base_tool import BaseTool + +class FirecrawlSearchToolSchema(BaseModel): + query: str = Field(description="Search query") + page_options: Optional[Dict[str, Any]] = Field(default=None, description="Options for result formatting") + search_options: Optional[Dict[str, Any]] = Field(default=None, description="Options for searching") + +class FirecrawlSearchTool(BaseTool): + name: str = "Firecrawl web search tool" + description: str = "Search webpages using Firecrawl and return the results" + args_schema: Type[BaseModel] = FirecrawlSearchToolSchema + api_key: Optional[str] = None + firecrawl: Optional[Any] = None + + def __init__(self, api_key: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + try: + from firecrawl import FirecrawlApp # type: ignore + except ImportError: + raise ImportError( + "`firecrawl` package not found, please run `pip install firecrawl-py`" + ) + + self.firecrawl = FirecrawlApp(api_key=api_key) + + def _run(self, query: str, page_options: Optional[Dict[str, Any]] = None, result_options: Optional[Dict[str, Any]] = None): + options = { + "pageOptions": page_options, + "resultOptions": result_options + } + return self.firecrawl.search(query, options)