diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md index d8e8f1407..3edb73f02 100644 --- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md +++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/README.md @@ -23,35 +23,38 @@ Utilize the FirecrawlScrapeFromWebsiteTool as follows to allow your agent to loa ```python from crewai_tools import FirecrawlCrawlWebsiteTool +from firecrawl import ScrapeOptions -tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev') +tool = FirecrawlCrawlWebsiteTool( + config={ + "limit": 100, + "scrape_options": ScrapeOptions(formats=["markdown", "html"]), + "poll_interval": 30, + } +) +tool.run(url="firecrawl.dev") ``` ## Arguments - `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable. -- `url`: The base URL to start crawling from. -- `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on. -- `limit`: Optional. Maximum number of pages to crawl. -- `allowExternalLinks`: Allows the crawler to follow links that point to external domains. -- `formats`: Optional. Formats for the page's content to be returned (eg. markdown, html, screenshot, links). -- `timeout`: Optional. Timeout in milliseconds for the crawling operation. - -## Configurations Example +- `config`: Optional. It contains Firecrawl API parameters. 
This is the default configuration ```python - DEFAULT_CRAWLING_OPTIONS = { - "maxDepth": 2, - "ignoreSitemap": True, - "limit": 100, - "allowBackwardLinks": False, - "allowExternalLinks": False, - "scrapeOptions": { - "formats": ["markdown", "screenshot", "links"], - "onlyMainContent": True, - "timeout": 30000 - } - } +from firecrawl import ScrapeOptions + +{ + "max_depth": 2, + "ignore_sitemap": True, + "limit": 100, + "allow_backward_links": False, + "allow_external_links": False, + "scrape_options": ScrapeOptions( + formats=["markdown", "screenshot", "links"], + only_main_content=True, + timeout=30000, + ), +} ``` diff --git a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py index f91ad3184..ee7e5e3d9 100644 --- a/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py +++ b/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py @@ -3,37 +3,36 @@ from typing import Any, Optional, Type from crewai.tools import BaseTool from pydantic import BaseModel, ConfigDict, Field, PrivateAttr - try: - from firecrawl import FirecrawlApp + from firecrawl import FirecrawlApp, ScrapeOptions except ImportError: FirecrawlApp = Any class FirecrawlCrawlWebsiteToolSchema(BaseModel): url: str = Field(description="Website URL") - maxDepth: Optional[int] = Field( - default=2, - description="Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children and so on.", - ) - limit: Optional[int] = Field( - default=100, description="Maximum number of pages to crawl." - ) - allowExternalLinks: Optional[bool] = Field( - default=False, - description="Allows the crawler to follow links that point to external domains.", - ) - formats: Optional[list[str]] = Field( - default=["markdown", "screenshot", "links"], - description="Formats for the page's content to be returned (eg. 
markdown, html, screenshot, links).", - ) - timeout: Optional[int] = Field( - default=30000, - description="Timeout in milliseconds for the crawling operation. The default value is 30000.", - ) class FirecrawlCrawlWebsiteTool(BaseTool): + """ + Tool for crawling websites using Firecrawl. To run this tool, you need to have a Firecrawl API key. + + Args: + api_key (str): Your Firecrawl API key. + config (dict): Optional. It contains Firecrawl API parameters. + + Default configuration options: + max_depth (int): Maximum depth to crawl. Default: 2 + ignore_sitemap (bool): Whether to ignore sitemap. Default: True + limit (int): Maximum number of pages to crawl. Default: 100 + allow_backward_links (bool): Allow crawling backward links. Default: False + allow_external_links (bool): Allow crawling external links. Default: False + scrape_options (ScrapeOptions): Options for scraping content + - formats (list[str]): Content formats to return. Default: ["markdown", "screenshot", "links"] + - only_main_content (bool): Only return main content. Default: True + - timeout (int): Timeout in milliseconds. 
Default: 30000 + """ + model_config = ConfigDict( arbitrary_types_allowed=True, validate_assignment=True, frozen=False ) @@ -41,6 +40,20 @@ class FirecrawlCrawlWebsiteTool(BaseTool): description: str = "Crawl webpages using Firecrawl and return the contents" args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema api_key: Optional[str] = None + config: Optional[dict[str, Any]] = Field( + default_factory=lambda: { + "max_depth": 2, + "ignore_sitemap": True, + "limit": 100, + "allow_backward_links": False, + "allow_external_links": False, + "scrape_options": ScrapeOptions( + formats=["markdown", "screenshot", "links"], + only_main_content=True, + timeout=30000, + ), + } + ) _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None) def __init__(self, api_key: Optional[str] = None, **kwargs): @@ -73,41 +86,8 @@ class FirecrawlCrawlWebsiteTool(BaseTool): "`firecrawl-py` package not found, please run `uv add firecrawl-py`" ) - def _run( - self, - url: str, - maxDepth: Optional[int] = 2, - limit: Optional[int] = 100, - allowExternalLinks: Optional[bool] = False, - formats: Optional[list[str]] = ["markdown", "screenshot", "links"], - timeout: Optional[int] = 30000, - ): - # Default options for timeout and crawling - DEFAULT_TIMEOUT = 30000 - DEFAULT_CRAWLING_OPTIONS = { - "maxDepth": 2, - "ignoreSitemap": True, - "limit": 100, - "allowBackwardLinks": False, - "allowExternalLinks": False, - "scrapeOptions": { - "formats": ["markdown", "screenshot", "links"], - "onlyMainContent": True, - "timeout": DEFAULT_TIMEOUT, - }, - } - - # Add default options not present as parameters - crawling_options = DEFAULT_CRAWLING_OPTIONS - - # Update the values of parameters present - crawling_options["maxDepth"] = maxDepth - crawling_options["limit"] = limit - crawling_options["allowExternalLinks"] = allowExternalLinks - crawling_options["scrapeOptions"]["formats"] = formats - crawling_options["scrapeOptions"]["timeout"] = timeout - - return self._firecrawl.crawl_url(url, 
crawling_options) + def _run(self, url: str): + return self._firecrawl.crawl_url(url, **self.config) try: diff --git a/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md b/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md index 93570f06b..ebcea2f53 100644 --- a/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md +++ b/src/crewai_tools/tools/firecrawl_scrape_website_tool/README.md @@ -20,19 +20,27 @@ Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load we ```python from crewai_tools import FirecrawlScrapeWebsiteTool -tool = FirecrawlScrapeWebsiteTool(url='firecrawl.dev') +tool = FirecrawlScrapeWebsiteTool(config={"formats": ['html']}) +tool.run(url="firecrawl.dev") ``` ## Arguments - `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable. -- `url`: The URL to scrape. -- `page_options`: Optional. - - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc. - - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response. -- `extractor_options`: Optional. Options for LLM-based extraction of structured information from the page content - - `mode`: The extraction mode to use, currently supports 'llm-extraction' - - `extractionPrompt`: Optional. A prompt describing what information to extract from the page - - `extractionSchema`: Optional. The schema for the data to be extracted -- `timeout`: Optional. Timeout in milliseconds for the request +- `config`: Optional. It contains Firecrawl API parameters. 
+ + +This is the default configuration + +```python +{ + "formats": ["markdown"], + "only_main_content": True, + "include_tags": [], + "exclude_tags": [], + "headers": {}, + "wait_for": 0, +} +``` + diff --git a/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py b/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py index 8530aa71d..954136341 100644 --- a/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py +++ b/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py @@ -18,6 +18,21 @@ class FirecrawlScrapeWebsiteToolSchema(BaseModel): class FirecrawlScrapeWebsiteTool(BaseTool): + """ + Tool for scraping webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key. + + Args: + api_key (str): Your Firecrawl API key. + config (dict): Optional. It contains Firecrawl API parameters. + + Default configuration options: + formats (list[str]): Content formats to return. Default: ["markdown"] + only_main_content (bool): Only return main content. Default: True + include_tags (list[str]): Tags to include. Default: [] + exclude_tags (list[str]): Tags to exclude. Default: [] + headers (dict): Headers to include. 
Default: {} + """ + model_config = ConfigDict( arbitrary_types_allowed=True, validate_assignment=True, frozen=False ) @@ -25,6 +40,17 @@ class FirecrawlScrapeWebsiteTool(BaseTool): description: str = "Scrape webpages using Firecrawl and return the contents" args_schema: Type[BaseModel] = FirecrawlScrapeWebsiteToolSchema api_key: Optional[str] = None + config: Optional[dict[str, Any]] = Field( + default_factory=lambda: { + "formats": ["markdown"], + "only_main_content": True, + "include_tags": [], + "exclude_tags": [], + "headers": {}, + "wait_for": 0, + } + ) + _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None) def __init__(self, api_key: Optional[str] = None, **kwargs): @@ -50,21 +76,8 @@ class FirecrawlScrapeWebsiteTool(BaseTool): self._firecrawl = FirecrawlApp(api_key=api_key) - def _run( - self, - url: str, - timeout: Optional[int] = 30000, - ): - options = { - "formats": ["markdown"], - "onlyMainContent": True, - "includeTags": [], - "excludeTags": [], - "headers": {}, - "waitFor": 0, - "timeout": timeout, - } - return self._firecrawl.scrape_url(url, options) + def _run(self, url: str): + return self._firecrawl.scrape_url(url, **self.config) try: diff --git a/src/crewai_tools/tools/firecrawl_search_tool/README.md b/src/crewai_tools/tools/firecrawl_search_tool/README.md index effb3f3d4..a2037e951 100644 --- a/src/crewai_tools/tools/firecrawl_search_tool/README.md +++ b/src/crewai_tools/tools/firecrawl_search_tool/README.md @@ -20,16 +20,25 @@ Utilize the FirecrawlSearchTool as follows to allow your agent to load websites: ```python from crewai_tools import FirecrawlSearchTool -tool = FirecrawlSearchTool(query='what is firecrawl?') +tool = FirecrawlSearchTool(config={"limit": 5}) +tool.run(query="firecrawl web scraping") ``` ## Arguments - `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable. -- `query`: The search query string to be used for searching. -- `page_options`: Optional. 
Options for result formatting. - - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc. - - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response. - - `fetchPageContent`: Optional. Fetch the full content of the page. -- `search_options`: Optional. Options for controlling the crawling behavior. - - `limit`: Optional. Maximum number of pages to crawl. \ No newline at end of file +- `config`: Optional. It contains Firecrawl API parameters. + + +This is the default configuration + +```python +{ + "limit": 5, + "tbs": None, + "lang": "en", + "country": "us", + "location": None, + "timeout": 60000, +} +``` diff --git a/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py b/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py index f7f4f3677..8b563778c 100644 --- a/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py +++ b/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py @@ -17,26 +17,25 @@ except ImportError: class FirecrawlSearchToolSchema(BaseModel): query: str = Field(description="Search query") - limit: Optional[int] = Field( - default=5, description="Maximum number of results to return" - ) - tbs: Optional[str] = Field(default=None, description="Time-based search parameter") - lang: Optional[str] = Field( - default="en", description="Language code for search results" - ) - country: Optional[str] = Field( - default="us", description="Country code for search results" - ) - location: Optional[str] = Field( - default=None, description="Location parameter for search results" - ) - timeout: Optional[int] = Field(default=60000, description="Timeout in milliseconds") - scrape_options: Optional[Dict[str, Any]] = Field( - default=None, description="Options for scraping search results" - ) class FirecrawlSearchTool(BaseTool): + """ + Tool for searching webpages using Firecrawl. 
To run this tool, you need to have a Firecrawl API key. + + Args: + api_key (str): Your Firecrawl API key. + config (dict): Optional. It contains Firecrawl API parameters. + + Default configuration options: + limit (int): Maximum number of results to return. Default: 5 + tbs (str): Time-based search parameter. Default: None + lang (str): Language. Default: "en" + country (str): Country. Default: "us" + location (str): Location. Default: None + timeout (int): Timeout in milliseconds. Default: 60000 + """ + model_config = ConfigDict( arbitrary_types_allowed=True, validate_assignment=True, frozen=False ) @@ -47,6 +46,16 @@ class FirecrawlSearchTool(BaseTool): description: str = "Search webpages using Firecrawl and return the results" args_schema: Type[BaseModel] = FirecrawlSearchToolSchema api_key: Optional[str] = None + config: Optional[dict[str, Any]] = Field( + default_factory=lambda: { + "limit": 5, + "tbs": None, + "lang": "en", + "country": "us", + "location": None, + "timeout": 60000, + } + ) _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None) def __init__(self, api_key: Optional[str] = None, **kwargs): @@ -56,10 +65,9 @@ class FirecrawlSearchTool(BaseTool): def _initialize_firecrawl(self) -> None: try: - if FIRECRAWL_AVAILABLE: - self._firecrawl = FirecrawlApp(api_key=self.api_key) - else: - raise ImportError + from firecrawl import FirecrawlApp # type: ignore + + self._firecrawl = FirecrawlApp(api_key=self.api_key) except ImportError: import click @@ -72,7 +80,7 @@ class FirecrawlSearchTool(BaseTool): subprocess.run(["uv", "add", "firecrawl-py"], check=True) from firecrawl import FirecrawlApp - self.firecrawl = FirecrawlApp(api_key=self.api_key) + self._firecrawl = FirecrawlApp(api_key=self.api_key) except subprocess.CalledProcessError: raise ImportError("Failed to install firecrawl-py package") else: @@ -83,27 +91,14 @@ class FirecrawlSearchTool(BaseTool): def _run( self, query: str, - limit: Optional[int] = 5, - tbs: Optional[str] = None, - lang: Optional[str] = 
"en", - country: Optional[str] = "us", - location: Optional[str] = None, - timeout: Optional[int] = 60000, - scrape_options: Optional[Dict[str, Any]] = None, ) -> Any: - if not self.firecrawl: + if not self._firecrawl: raise RuntimeError("FirecrawlApp not properly initialized") - options = { - "limit": limit, - "tbs": tbs, - "lang": lang, - "country": country, - "location": location, - "timeout": timeout, - "scrapeOptions": scrape_options or {}, - } - return self.firecrawl.search(**options) + return self._firecrawl.search( + query=query, + **self.config, + ) try: