mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-28 09:38:17 +00:00
Squashed 'packages/tools/' content from commit 78317b9c
git-subtree-dir: packages/tools git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
This commit is contained in:
46
crewai_tools/tools/firecrawl_scrape_website_tool/README.md
Normal file
46
crewai_tools/tools/firecrawl_scrape_website_tool/README.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# FirecrawlScrapeWebsiteTool
|
||||
|
||||
## Description
|
||||
|
||||
[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
|
||||
|
||||
## Installation
|
||||
|
||||
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
|
||||
- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
|
||||
|
||||
```
|
||||
pip install firecrawl-py 'crewai[tools]'
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load websites:
|
||||
|
||||
```python
|
||||
from crewai_tools import FirecrawlScrapeWebsiteTool
|
||||
|
||||
tool = FirecrawlScrapeWebsiteTool(config={"formats": ['html']})
|
||||
tool.run(url="firecrawl.dev")
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
|
||||
- `config`: Optional. It contains Firecrawl API parameters.
|
||||
|
||||
|
||||
This is the default configuration
|
||||
|
||||
```python
|
||||
{
|
||||
"formats": ["markdown"],
|
||||
"only_main_content": True,
|
||||
"include_tags": [],
|
||||
"exclude_tags": [],
|
||||
"headers": {},
|
||||
"wait_for": 0,
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
from typing import Any, Optional, Type, Dict, List, TYPE_CHECKING
|
||||
|
||||
from crewai.tools import BaseTool, EnvVar
|
||||
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
try:
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
FIRECRAWL_AVAILABLE = True
|
||||
except ImportError:
|
||||
FIRECRAWL_AVAILABLE = False
|
||||
|
||||
|
||||
class FirecrawlScrapeWebsiteToolSchema(BaseModel):
|
||||
url: str = Field(description="Website URL")
|
||||
|
||||
|
||||
class FirecrawlScrapeWebsiteTool(BaseTool):
|
||||
"""
|
||||
Tool for scraping webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
|
||||
|
||||
Args:
|
||||
api_key (str): Your Firecrawl API key.
|
||||
config (dict): Optional. It contains Firecrawl API parameters.
|
||||
|
||||
Default configuration options:
|
||||
formats (list[str]): Content formats to return. Default: ["markdown"]
|
||||
onlyMainContent (bool): Only return main content. Default: True
|
||||
includeTags (list[str]): Tags to include. Default: []
|
||||
excludeTags (list[str]): Tags to exclude. Default: []
|
||||
headers (dict): Headers to include. Default: {}
|
||||
waitFor (int): Time to wait for page to load in ms. Default: 0
|
||||
json_options (dict): Options for JSON extraction. Default: None
|
||||
"""
|
||||
|
||||
model_config = ConfigDict(
|
||||
arbitrary_types_allowed=True, validate_assignment=True, frozen=False
|
||||
)
|
||||
name: str = "Firecrawl web scrape tool"
|
||||
description: str = "Scrape webpages using Firecrawl and return the contents"
|
||||
args_schema: Type[BaseModel] = FirecrawlScrapeWebsiteToolSchema
|
||||
api_key: Optional[str] = None
|
||||
config: Dict[str, Any] = Field(
|
||||
default_factory=lambda: {
|
||||
"formats": ["markdown"],
|
||||
"onlyMainContent": True,
|
||||
"includeTags": [],
|
||||
"excludeTags": [],
|
||||
"headers": {},
|
||||
"waitFor": 0,
|
||||
}
|
||||
)
|
||||
|
||||
_firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
|
||||
package_dependencies: List[str] = ["firecrawl-py"]
|
||||
env_vars: List[EnvVar] = [
|
||||
EnvVar(name="FIRECRAWL_API_KEY", description="API key for Firecrawl services", required=True),
|
||||
]
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
try:
|
||||
from firecrawl import FirecrawlApp # type: ignore
|
||||
except ImportError:
|
||||
import click
|
||||
|
||||
if click.confirm(
|
||||
"You are missing the 'firecrawl-py' package. Would you like to install it?"
|
||||
):
|
||||
import subprocess
|
||||
|
||||
subprocess.run(["uv", "add", "firecrawl-py"], check=True)
|
||||
from firecrawl import (
|
||||
FirecrawlApp,
|
||||
)
|
||||
else:
|
||||
raise ImportError(
|
||||
"`firecrawl-py` package not found, please run `uv add firecrawl-py`"
|
||||
)
|
||||
|
||||
self._firecrawl = FirecrawlApp(api_key=api_key)
|
||||
|
||||
def _run(self, url: str):
|
||||
if not self._firecrawl:
|
||||
raise RuntimeError("FirecrawlApp not properly initialized")
|
||||
|
||||
return self._firecrawl.scrape_url(url, params=self.config)
|
||||
|
||||
|
||||
try:
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
# Must rebuild model after class is defined
|
||||
if not hasattr(FirecrawlScrapeWebsiteTool, "_model_rebuilt"):
|
||||
FirecrawlScrapeWebsiteTool.model_rebuild()
|
||||
FirecrawlScrapeWebsiteTool._model_rebuilt = True
|
||||
except ImportError:
|
||||
"""
|
||||
When this tool is not used, then exception can be ignored.
|
||||
"""
|
||||
Reference in New Issue
Block a user