mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-02 07:42:40 +00:00
Squashed 'packages/tools/' content from commit 78317b9c
git-subtree-dir: packages/tools git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
This commit is contained in:
99
crewai_tools/tools/tavily_extractor_tool/README.md
Normal file
99
crewai_tools/tools/tavily_extractor_tool/README.md
Normal file
@@ -0,0 +1,99 @@
|
||||
# TavilyExtractorTool
|
||||
|
||||
## Description
|
||||
|
||||
The `TavilyExtractorTool` allows CrewAI agents to extract structured content from web pages using the Tavily API. It can process single URLs or lists of URLs and provides options for controlling the extraction depth and including images.
|
||||
|
||||
## Installation
|
||||
|
||||
To use the `TavilyExtractorTool`, you need to install the `tavily-python` library:
|
||||
|
||||
```shell
|
||||
pip install 'crewai[tools]' tavily-python
|
||||
```
|
||||
|
||||
You also need to set your Tavily API key as an environment variable:
|
||||
|
||||
```bash
|
||||
export TAVILY_API_KEY='your-tavily-api-key'
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
Here's how to initialize and use the `TavilyExtractorTool` within a CrewAI agent:
|
||||
|
||||
```python
|
||||
import os
|
||||
from crewai import Agent, Task, Crew
|
||||
from crewai_tools import TavilyExtractorTool
|
||||
|
||||
# Ensure TAVILY_API_KEY is set in your environment
|
||||
# os.environ["TAVILY_API_KEY"] = "YOUR_API_KEY"
|
||||
|
||||
# Initialize the tool
|
||||
tavily_tool = TavilyExtractorTool()
|
||||
|
||||
# Create an agent that uses the tool
|
||||
extractor_agent = Agent(
|
||||
role='Web Content Extractor',
|
||||
goal='Extract key information from specified web pages',
|
||||
backstory='You are an expert at extracting relevant content from websites using the Tavily API.',
|
||||
tools=[tavily_tool],
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Define a task for the agent
|
||||
extract_task = Task(
|
||||
description='Extract the main content from the URL https://example.com using basic extraction depth.',
|
||||
expected_output='A JSON string containing the extracted content from the URL.',
|
||||
agent=extractor_agent,
|
||||
tool_inputs={
|
||||
'urls': 'https://example.com',
|
||||
'extract_depth': 'basic'
|
||||
}
|
||||
)
|
||||
|
||||
# Create and run the crew
|
||||
crew = Crew(
|
||||
agents=[extractor_agent],
|
||||
tasks=[extract_task],
|
||||
verbose=2
|
||||
)
|
||||
|
||||
result = crew.kickoff()
|
||||
print(result)
|
||||
|
||||
# Example with multiple URLs and advanced extraction
|
||||
extract_multiple_task = Task(
|
||||
description='Extract content from https://example.com and https://anotherexample.org using advanced extraction.',
|
||||
expected_output='A JSON string containing the extracted content from both URLs.',
|
||||
agent=extractor_agent,
|
||||
tool_inputs={
|
||||
'urls': ['https://example.com', 'https://anotherexample.org'],
|
||||
'extract_depth': 'advanced',
|
||||
'include_images': True
|
||||
}
|
||||
)
|
||||
|
||||
result_multiple = crew.kickoff(inputs={'urls': ['https://example.com', 'https://anotherexample.org'], 'extract_depth': 'advanced', 'include_images': True}) # If task doesn't specify inputs directly
|
||||
print(result_multiple)
|
||||
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
The `TavilyExtractorTool` accepts the following arguments during initialization:
|
||||
|
||||
- `api_key` (Optional[str]): Your Tavily API key. If not provided during initialization, it defaults to the `TAVILY_API_KEY` environment variable.
|
||||
- `proxies` (Optional[dict[str, str]]): Proxies to use for the API requests. Defaults to `None`.
|
||||
|
||||
When running the tool (`_run` or `_arun` methods, or via agent execution), it uses the `TavilyExtractorToolSchema`, which expects a single input:

- `urls` (Union[List[str], str]): **Required**. A single URL string or a list of URL strings to extract data from.

The following options are not part of the run-time input schema; they are fields on the tool itself and are set when constructing `TavilyExtractorTool` (for example, `TavilyExtractorTool(extract_depth="advanced", include_images=True)`):

- `include_images` (bool): Whether to include images in the extraction results. Defaults to `False`.
- `extract_depth` (Literal["basic", "advanced"]): The depth of extraction. Use `"basic"` for faster, surface-level extraction or `"advanced"` for more comprehensive extraction. Defaults to `"basic"`.
- `timeout` (int): The maximum time in seconds to wait for the extraction request to complete. Defaults to `60`.
|
||||
|
||||
## Response Format
|
||||
|
||||
The tool returns a JSON string representing the structured data extracted from the provided URL(s). The exact structure depends on the content of the pages and the `extract_depth` used. Refer to the [Tavily API documentation](https://docs.tavily.com/docs/tavily-api/python-sdk#extract) for details on the response structure.
|
||||
@@ -0,0 +1,170 @@
|
||||
from crewai.tools import BaseTool, EnvVar
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional, Type, Any, Union, List, Literal
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import json
|
||||
|
||||
load_dotenv()
|
||||
try:
|
||||
from tavily import TavilyClient, AsyncTavilyClient
|
||||
|
||||
TAVILY_AVAILABLE = True
|
||||
except ImportError:
|
||||
TAVILY_AVAILABLE = False
|
||||
TavilyClient = Any
|
||||
AsyncTavilyClient = Any
|
||||
|
||||
|
||||
class TavilyExtractorToolSchema(BaseModel):
    """Run-time input accepted by ``TavilyExtractorTool``.

    The only input is ``urls``: either one URL string or a list of URL
    strings. It is required (declared with ``...`` as its default).
    """

    urls: Union[List[str], str] = Field(
        ...,
        description="The URL(s) to extract data from. Can be a single URL or a list of URLs.",
    )
|
||||
|
||||
|
||||
class TavilyExtractorTool(BaseTool):
    """
    Tool that uses the Tavily API to extract content from web pages.

    Attributes:
        client: Synchronous Tavily client.
        async_client: Asynchronous Tavily client.
        name: The name of the tool.
        description: The description of the tool.
        args_schema: The schema for the tool's arguments.
        api_key: The Tavily API key.
        proxies: Optional proxies for the API requests.
        include_images: Whether to include images in the extraction.
        extract_depth: The depth of extraction.
        timeout: The timeout for the extraction request in seconds.
    """

    package_dependencies: List[str] = ["tavily-python"]
    env_vars: List[EnvVar] = [
        EnvVar(name="TAVILY_API_KEY", description="API key for Tavily extraction service", required=True),
    ]

    # The Tavily client classes are not pydantic models, so arbitrary types
    # must be allowed for the `client` / `async_client` fields.
    model_config = {"arbitrary_types_allowed": True}
    client: Optional[TavilyClient] = None
    async_client: Optional[AsyncTavilyClient] = None
    name: str = "TavilyExtractorTool"
    description: str = "Extracts content from one or more web pages using the Tavily API. Returns structured data."
    args_schema: Type[BaseModel] = TavilyExtractorToolSchema
    api_key: Optional[str] = Field(
        default_factory=lambda: os.getenv("TAVILY_API_KEY"),
        description="The Tavily API key. If not provided, it will be loaded from the environment variable TAVILY_API_KEY.",
    )
    proxies: Optional[dict[str, str]] = Field(
        default=None,
        description="Optional proxies to use for the Tavily API requests.",
    )
    include_images: bool = Field(
        default=False,
        description="Whether to include images in the extraction.",
    )
    extract_depth: Literal["basic", "advanced"] = Field(
        default="basic",
        description="The depth of extraction. 'basic' for basic extraction, 'advanced' for advanced extraction.",
    )
    timeout: int = Field(
        default=60,
        description="The timeout for the extraction request in seconds.",
    )

    def __init__(self, **kwargs: Any):
        """
        Initializes the TavilyExtractorTool.

        When the ``tavily`` package was importable at module load, a
        synchronous and an asynchronous client are created from ``api_key``
        and ``proxies``. Otherwise the user is asked (via ``click.confirm``)
        whether ``tavily-python`` should be installed; every path through
        that fallback ends in ``ImportError``, because the module-level
        import cannot be retried without restarting the application.

        Args:
            **kwargs: Field values forwarded to the base class initializer.

        Raises:
            ImportError: If ``tavily-python`` is not available, including
                right after a successful on-demand installation (a restart
                is required) and when ``click`` is missing.
        """
        super().__init__(**kwargs)

        if TAVILY_AVAILABLE:
            self.client = TavilyClient(api_key=self.api_key, proxies=self.proxies)
            self.async_client = AsyncTavilyClient(
                api_key=self.api_key, proxies=self.proxies
            )
            return

        # Standard-library modules: these imports cannot fail, so they stay
        # outside the guarded import below.
        import subprocess
        import sys

        try:
            import click
        except ImportError as exc:
            raise ImportError(
                "The 'tavily-python' package is required. 'click' and 'subprocess' are also needed to assist with installation if the package is missing. "
                "Please install 'tavily-python' manually (e.g., 'uv add tavily-python') and ensure 'click' and 'subprocess' are available."
            ) from exc

        if not click.confirm(
            "You are missing the 'tavily-python' package, which is required for TavilyExtractorTool. Would you like to install it?"
        ):
            raise ImportError(
                "The 'tavily-python' package is required to use the TavilyExtractorTool. "
                "Please install it with: uv add tavily-python"
            )

        try:
            # Invoke pip through the running interpreter so the package is
            # installed into this environment rather than whichever `pip`
            # happens to be first on PATH.
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "tavily-python"],
                check=True,
            )
        except subprocess.CalledProcessError as e:
            raise ImportError(
                f"Attempted to install 'tavily-python' but failed: {e}. "
                "Please install it manually to use the TavilyExtractorTool."
            ) from e
        else:
            # The failed module-level import is not retried here, so the
            # clients still cannot be built in this process.
            raise ImportError(
                "'tavily-python' has been installed. Please restart your Python application to use the TavilyExtractorTool."
            )

    def _run(
        self,
        urls: Union[List[str], str],
    ) -> str:
        """
        Synchronously extracts content from the given URL(s).

        ``extract_depth``, ``include_images`` and ``timeout`` are taken from
        the tool instance, not from the call arguments.

        Args:
            urls: The URL(s) to extract data from.

        Returns:
            A JSON string containing the extracted data.

        Raises:
            ValueError: If the synchronous client has not been initialized.
        """
        if not self.client:
            raise ValueError(
                "Tavily client is not initialized. Ensure 'tavily-python' is installed and API key is set."
            )

        return json.dumps(
            self.client.extract(
                urls=urls,
                extract_depth=self.extract_depth,
                include_images=self.include_images,
                timeout=self.timeout,
            ),
            indent=2,
        )

    async def _arun(
        self,
        urls: Union[List[str], str],
    ) -> str:
        """
        Asynchronously extracts content from the given URL(s).

        ``extract_depth``, ``include_images`` and ``timeout`` are taken from
        the tool instance, not from the call arguments.

        Args:
            urls: The URL(s) to extract data from.

        Returns:
            A JSON string containing the extracted data.

        Raises:
            ValueError: If the asynchronous client has not been initialized.
        """
        if not self.async_client:
            raise ValueError(
                "Tavily async client is not initialized. Ensure 'tavily-python' is installed and API key is set."
            )

        results = await self.async_client.extract(
            urls=urls,
            extract_depth=self.extract_depth,
            include_images=self.include_images,
            timeout=self.timeout,
        )
        return json.dumps(results, indent=2)
|
||||
Reference in New Issue
Block a user