Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools
git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
Greyson Lalonde
2025-09-12 21:58:02 -04:00
commit e16606672a
303 changed files with 49010 additions and 0 deletions

@@ -0,0 +1,127 @@
from .ai_mind_tool.ai_mind_tool import AIMindTool
from .apify_actors_tool.apify_actors_tool import ApifyActorsTool
from .arxiv_paper_tool.arxiv_paper_tool import ArxivPaperTool
from .brave_search_tool.brave_search_tool import BraveSearchTool
from .brightdata_tool import (
BrightDataDatasetTool,
BrightDataSearchTool,
BrightDataWebUnlockerTool,
)
from .browserbase_load_tool.browserbase_load_tool import BrowserbaseLoadTool
from .code_docs_search_tool.code_docs_search_tool import CodeDocsSearchTool
from .code_interpreter_tool.code_interpreter_tool import CodeInterpreterTool
from .composio_tool.composio_tool import ComposioTool
from .contextualai_create_agent_tool.contextual_create_agent_tool import (
ContextualAICreateAgentTool,
)
from .contextualai_parse_tool.contextual_parse_tool import ContextualAIParseTool
from .contextualai_query_tool.contextual_query_tool import ContextualAIQueryTool
from .contextualai_rerank_tool.contextual_rerank_tool import ContextualAIRerankTool
from .couchbase_tool.couchbase_tool import CouchbaseFTSVectorSearchTool
from .crewai_enterprise_tools.crewai_enterprise_tools import CrewaiEnterpriseTools
from .crewai_platform_tools.crewai_platform_tools import CrewaiPlatformTools
from .csv_search_tool.csv_search_tool import CSVSearchTool
from .dalle_tool.dalle_tool import DallETool
from .databricks_query_tool.databricks_query_tool import DatabricksQueryTool
from .directory_read_tool.directory_read_tool import DirectoryReadTool
from .directory_search_tool.directory_search_tool import DirectorySearchTool
from .docx_search_tool.docx_search_tool import DOCXSearchTool
from .exa_tools.exa_search_tool import EXASearchTool
from .file_read_tool.file_read_tool import FileReadTool
from .file_writer_tool.file_writer_tool import FileWriterTool
from .files_compressor_tool.files_compressor_tool import FileCompressorTool
from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
FirecrawlCrawlWebsiteTool,
)
from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
FirecrawlScrapeWebsiteTool,
)
from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool
from .generate_crewai_automation_tool.generate_crewai_automation_tool import (
GenerateCrewaiAutomationTool,
)
from .github_search_tool.github_search_tool import GithubSearchTool
from .hyperbrowser_load_tool.hyperbrowser_load_tool import HyperbrowserLoadTool
from .invoke_crewai_automation_tool.invoke_crewai_automation_tool import (
InvokeCrewAIAutomationTool,
)
from .json_search_tool.json_search_tool import JSONSearchTool
from .linkup.linkup_search_tool import LinkupSearchTool
from .llamaindex_tool.llamaindex_tool import LlamaIndexTool
from .mdx_search_tool.mdx_search_tool import MDXSearchTool
from .mongodb_vector_search_tool import (
MongoDBToolSchema,
MongoDBVectorSearchConfig,
MongoDBVectorSearchTool,
)
from .multion_tool.multion_tool import MultiOnTool
from .mysql_search_tool.mysql_search_tool import MySQLSearchTool
from .nl2sql.nl2sql_tool import NL2SQLTool
from .ocr_tool.ocr_tool import OCRTool
from .oxylabs_amazon_product_scraper_tool.oxylabs_amazon_product_scraper_tool import (
OxylabsAmazonProductScraperTool,
)
from .oxylabs_amazon_search_scraper_tool.oxylabs_amazon_search_scraper_tool import (
OxylabsAmazonSearchScraperTool,
)
from .oxylabs_google_search_scraper_tool.oxylabs_google_search_scraper_tool import (
OxylabsGoogleSearchScraperTool,
)
from .oxylabs_universal_scraper_tool.oxylabs_universal_scraper_tool import (
OxylabsUniversalScraperTool,
)
from .parallel_tools import ParallelSearchTool
from .patronus_eval_tool import (
PatronusEvalTool,
PatronusLocalEvaluatorTool,
PatronusPredefinedCriteriaEvalTool,
)
from .pdf_search_tool.pdf_search_tool import PDFSearchTool
from .pg_search_tool.pg_search_tool import PGSearchTool
from .qdrant_vector_search_tool.qdrant_search_tool import QdrantVectorSearchTool
from .rag.rag_tool import RagTool
from .scrape_element_from_website.scrape_element_from_website import (
ScrapeElementFromWebsiteTool,
)
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .scrapegraph_scrape_tool.scrapegraph_scrape_tool import (
ScrapegraphScrapeTool,
ScrapegraphScrapeToolSchema,
)
from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import (
ScrapflyScrapeWebsiteTool,
)
from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
from .serpapi_tool.serpapi_google_search_tool import SerpApiGoogleSearchTool
from .serpapi_tool.serpapi_google_shopping_tool import SerpApiGoogleShoppingTool
from .serper_dev_tool.serper_dev_tool import SerperDevTool
from .serper_scrape_website_tool.serper_scrape_website_tool import (
SerperScrapeWebsiteTool,
)
from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
from .serply_api_tool.serply_news_search_tool import SerplyNewsSearchTool
from .serply_api_tool.serply_scholar_search_tool import SerplyScholarSearchTool
from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
from .serply_api_tool.serply_webpage_to_markdown_tool import SerplyWebpageToMarkdownTool
from .singlestore_search_tool import SingleStoreSearchTool
from .snowflake_search_tool import (
SnowflakeConfig,
SnowflakeSearchTool,
SnowflakeSearchToolInput,
)
from .spider_tool.spider_tool import SpiderTool
from .stagehand_tool.stagehand_tool import StagehandTool
from .tavily_extractor_tool.tavily_extractor_tool import TavilyExtractorTool
from .tavily_search_tool.tavily_search_tool import TavilySearchTool
from .txt_search_tool.txt_search_tool import TXTSearchTool
from .vision_tool.vision_tool import VisionTool
from .weaviate_tool.vector_search import WeaviateVectorSearchTool
from .website_search.website_search_tool import WebsiteSearchTool
from .xml_search_tool.xml_search_tool import XMLSearchTool
from .youtube_channel_search_tool.youtube_channel_search_tool import (
YoutubeChannelSearchTool,
)
from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
from .zapier_action_tool.zapier_action_tool import ZapierActionTools

@@ -0,0 +1,79 @@
# AIMind Tool
## Description
[Minds](https://mindsdb.com/minds) are AI systems provided by [MindsDB](https://mindsdb.com/) that work similarly to large language models (LLMs) but go beyond by answering any question from any data.
This is accomplished by selecting the most relevant data for an answer using parametric search, understanding the meaning and providing responses within the correct context through semantic search, and finally, delivering precise answers by analyzing data and using machine learning (ML) models.
The `AIMindTool` can be used to query data sources in natural language by simply configuring their connection parameters.
## Installation
1. Install the `crewai[tools]` package:
```shell
pip install 'crewai[tools]'
```
2. Install the Minds SDK:
```shell
pip install minds-sdk
```
3. Sign up for a Minds account [here](https://mdb.ai/register) and obtain an API key.
4. Set the Minds API key in an environment variable named `MINDS_API_KEY`.
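For example, on macOS or Linux:
```shell
export MINDS_API_KEY='your-api-key-here'
```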
## Usage
```python
from crewai_tools import AIMindTool
# Initialize the AIMindTool.
aimind_tool = AIMindTool(
datasources=[
{
"description": "house sales data",
"engine": "postgres",
"connection_data": {
"user": "demo_user",
"password": "demo_password",
"host": "samples.mindsdb.com",
"port": 5432,
"database": "demo",
"schema": "demo_data"
},
"tables": ["house_sales"]
}
]
)
aimind_tool.run("How many 3 bedroom houses were sold in 2008?")
```
The `datasources` parameter is a list of dictionaries, each containing the following keys:
- `description`: A description of the data contained in the datasource.
- `engine`: The engine (or type) of the datasource. Find a list of supported engines in the link below.
- `connection_data`: A dictionary containing the connection parameters for the datasource. Find a list of connection parameters for each engine in the link below.
- `tables`: A list of tables that the data source will use. This is optional and can be omitted if all tables in the data source are to be used.
A list of supported data sources and their connection parameters can be found [here](https://docs.mdb.ai/docs/data_sources).
```python
from crewai import Agent
from crewai.project import agent
# Define an agent with the AIMindTool (inside a @CrewBase-decorated crew class).
@agent
def researcher(self) -> Agent:
return Agent(
config=self.agents_config["researcher"],
allow_delegation=False,
tools=[aimind_tool]
)
```

@@ -0,0 +1,91 @@
import os
import secrets
from typing import Any, Dict, List, Optional, Type
from crewai.tools import BaseTool, EnvVar
from openai import OpenAI
from pydantic import BaseModel, Field
class AIMindToolConstants:
MINDS_API_BASE_URL = "https://mdb.ai/"
MIND_NAME_PREFIX = "crwai_mind_"
DATASOURCE_NAME_PREFIX = "crwai_ds_"
class AIMindToolInputSchema(BaseModel):
"""Input for AIMind Tool."""
query: str = Field(description="Question in natural language to ask the AI-Mind")
class AIMindTool(BaseTool):
name: str = "AIMind Tool"
description: str = (
"A wrapper around [AI-Minds](https://mindsdb.com/minds). "
"Useful for when you need answers to questions from your data, stored in "
"data sources including PostgreSQL, MySQL, MariaDB, ClickHouse, Snowflake "
"and Google BigQuery. "
"Input should be a question in natural language."
)
args_schema: Type[BaseModel] = AIMindToolInputSchema
api_key: Optional[str] = None
datasources: Optional[List[Dict[str, Any]]] = None
mind_name: Optional[str] = None
package_dependencies: List[str] = ["minds-sdk"]
env_vars: List[EnvVar] = [
EnvVar(name="MINDS_API_KEY", description="API key for AI-Minds", required=True),
]
def __init__(self, api_key: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
self.api_key = api_key or os.getenv("MINDS_API_KEY")
if not self.api_key:
raise ValueError("API key must be provided either through constructor or MINDS_API_KEY environment variable")
try:
from minds.client import Client # type: ignore
from minds.datasources import DatabaseConfig # type: ignore
except ImportError:
            raise ImportError(
                "`minds-sdk` package not found, please run `pip install minds-sdk`"
            )
minds_client = Client(api_key=self.api_key)
        # Convert the datasources to DatabaseConfig objects.
        datasources = []
        for datasource in self.datasources or []:
            config = DatabaseConfig(
                name=f"{AIMindToolConstants.DATASOURCE_NAME_PREFIX}{secrets.token_hex(5)}",
                engine=datasource["engine"],
                description=datasource["description"],
                connection_data=datasource["connection_data"],
                tables=datasource.get("tables"),
            )
            datasources.append(config)
        # Generate a random name for the Mind.
        name = f"{AIMindToolConstants.MIND_NAME_PREFIX}{secrets.token_hex(5)}"
mind = minds_client.minds.create(
name=name, datasources=datasources, replace=True
)
self.mind_name = mind.name
def _run(
self,
query: str
):
# Run the query on the AI-Mind.
# The Minds API is OpenAI compatible and therefore, the OpenAI client can be used.
openai_client = OpenAI(base_url=AIMindToolConstants.MINDS_API_BASE_URL, api_key=self.api_key)
completion = openai_client.chat.completions.create(
model=self.mind_name,
messages=[{"role": "user", "content": query}],
stream=False,
)
return completion.choices[0].message.content

@@ -0,0 +1,96 @@
# ApifyActorsTool
Integrate [Apify Actors](https://apify.com/actors) into your CrewAI workflows.
## Description
The `ApifyActorsTool` connects [Apify Actors](https://apify.com/actors), cloud-based programs for web scraping and automation, to your CrewAI workflows.
Use any of the 4,000+ Actors on [Apify Store](https://apify.com/store) for use cases such as extracting data from social media, search engines, online maps, e-commerce sites, travel portals, or general websites.
For details, see the [Apify CrewAI integration](https://docs.apify.com/platform/integrations/crewai) in Apify documentation.
## Installation
To use `ApifyActorsTool`, install the necessary packages and set up your Apify API token. Follow the [Apify API documentation](https://docs.apify.com/platform/integrations/api) for steps to obtain the token.
### Steps
1. **Install dependencies**
Install `crewai[tools]` and `langchain-apify`:
```bash
pip install 'crewai[tools]' langchain-apify
```
2. **Set your API token**
Export the token as an environment variable:
```bash
export APIFY_API_TOKEN='your-api-token-here'
```
## Usage example
Use the `ApifyActorsTool` manually to run the [RAG Web Browser Actor](https://apify.com/apify/rag-web-browser) to perform a web search:
```python
from crewai_tools import ApifyActorsTool
# Initialize the tool with an Apify Actor
tool = ApifyActorsTool(actor_name="apify/rag-web-browser")
# Run the tool with input parameters
results = tool.run(run_input={"query": "What is CrewAI?", "maxResults": 5})
# Process the results
for result in results:
print(f"URL: {result['metadata']['url']}")
print(f"Content: {result.get('markdown', 'N/A')[:100]}...")
```
### Expected output
Here is the output from running the code above:
```text
URL: https://www.example.com/crewai-intro
Content: CrewAI is a framework for building AI-powered workflows...
URL: https://docs.crewai.com/
Content: Official documentation for CrewAI...
```
The `ApifyActorsTool` automatically fetches the Actor definition and input schema from Apify using the provided `actor_name` and then constructs the tool description and argument schema. This means you need to specify only a valid `actor_name`, and the tool handles the rest when used with agents—no need to specify the `run_input`. Here's how it works:
```python
from crewai import Agent
from crewai_tools import ApifyActorsTool
rag_browser = ApifyActorsTool(actor_name="apify/rag-web-browser")
agent = Agent(
role="Research Analyst",
goal="Find and summarize information about specific topics",
backstory="You are an experienced researcher with attention to detail",
tools=[rag_browser],
)
```
You can run other Actors from [Apify Store](https://apify.com/store) simply by changing the `actor_name` and, when using it manually, adjusting the `run_input` based on the Actor input schema.
For an example of usage with agents, see the [CrewAI Actor template](https://apify.com/templates/python-crewai).
## Configuration
The `ApifyActorsTool` requires these inputs to work:
- **`actor_name`**
The ID of the Apify Actor to run, e.g., `"apify/rag-web-browser"`. Browse all Actors on [Apify Store](https://apify.com/store).
- **`run_input`**
A dictionary of input parameters for the Actor when running the tool manually.
- For example, for the `apify/rag-web-browser` Actor: `{"query": "search term", "maxResults": 5}`
- See the Actor's [input schema](https://apify.com/apify/rag-web-browser/input-schema) for the list of input parameters.
## Resources
- **[Apify](https://apify.com/)**: Explore the Apify platform.
- **[How to build an AI agent on Apify](https://blog.apify.com/how-to-build-an-ai-agent/)** - A complete step-by-step guide to creating, publishing, and monetizing AI agents on the Apify platform.
- **[RAG Web Browser Actor](https://apify.com/apify/rag-web-browser)**: A popular Actor for web search for LLMs.
- **[CrewAI Integration Guide](https://docs.apify.com/platform/integrations/crewai)**: Follow the official guide for integrating Apify and CrewAI.

@@ -0,0 +1,96 @@
from crewai.tools import BaseTool, EnvVar
from pydantic import Field
from typing import TYPE_CHECKING, Any, Dict, List
import os
if TYPE_CHECKING:
from langchain_apify import ApifyActorsTool as _ApifyActorsTool
class ApifyActorsTool(BaseTool):
"""Tool that runs Apify Actors.
To use, you should have the environment variable `APIFY_API_TOKEN` set
with your API key.
For details, see https://docs.apify.com/platform/integrations/crewai
Args:
actor_name (str): The name of the Apify Actor to run.
*args: Variable length argument list passed to BaseTool.
**kwargs: Arbitrary keyword arguments passed to BaseTool.
Returns:
List[Dict[str, Any]]: Results from the Actor execution.
Raises:
ValueError: If `APIFY_API_TOKEN` is not set or if the tool is not initialized.
ImportError: If `langchain_apify` package is not installed.
Example:
.. code-block:: python
from crewai_tools import ApifyActorsTool
tool = ApifyActorsTool(actor_name="apify/rag-web-browser")
results = tool.run(run_input={"query": "What is CrewAI?", "maxResults": 5})
for result in results:
print(f"URL: {result['metadata']['url']}")
print(f"Content: {result.get('markdown', 'N/A')[:100]}...")
"""
actor_tool: '_ApifyActorsTool' = Field(description="Apify Actor Tool")
package_dependencies: List[str] = ["langchain-apify"]
def __init__(
self,
actor_name: str,
*args: Any,
**kwargs: Any
) -> None:
if not os.environ.get("APIFY_API_TOKEN"):
msg = (
"APIFY_API_TOKEN environment variable is not set. "
"Please set it to your API key, to learn how to get it, "
"see https://docs.apify.com/platform/integrations/api"
)
raise ValueError(msg)
try:
from langchain_apify import ApifyActorsTool as _ApifyActorsTool
except ImportError:
raise ImportError(
"Could not import langchain_apify python package. "
"Please install it with `pip install langchain-apify` or `uv add langchain-apify`."
)
actor_tool = _ApifyActorsTool(actor_name)
kwargs.update(
{
"name": actor_tool.name,
"description": actor_tool.description,
"args_schema": actor_tool.args_schema,
"actor_tool": actor_tool,
}
)
super().__init__(*args, **kwargs)
def _run(self, run_input: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Run the Actor tool with the given input.
Returns:
List[Dict[str, Any]]: Results from the Actor execution.
Raises:
ValueError: If 'actor_tool' is not initialized.
"""
try:
return self.actor_tool._run(run_input)
except Exception as e:
            msg = (
                f"Failed to run ApifyActorsTool {self.name}. "
                "Please check your Apify account Actor run logs for more details. "
                f"Error: {e}"
            )
raise RuntimeError(msg) from e

@@ -0,0 +1,80 @@
### Example 1: Fetching Research Papers from arXiv with CrewAI
This example demonstrates how to build a simple CrewAI workflow that automatically searches for and downloads academic papers from [arXiv.org](https://arxiv.org). The setup uses:
* A custom `ArxivPaperTool` to fetch metadata and download PDFs
* A single `Agent` tasked with locating relevant papers based on a given research topic
* A `Task` to define the data retrieval and download process
* A sequential `Crew` to orchestrate execution
The downloaded PDFs are saved to a local directory (`./DOWNLOADS`). Filenames are optionally based on sanitized paper titles, ensuring compatibility with your operating system.
> The saved PDFs can be further used in **downstream tasks**, such as:
>
> * **RAG (Retrieval-Augmented Generation)**
> * **Summarization**
> * **Citation extraction**
> * **Embedding-based search or analysis**
---
```python
from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import ArxivPaperTool
llm = LLM(
model="ollama/llama3.1",
base_url="http://localhost:11434",
temperature=0.1
)
topic = "Crew AI"
max_results = 3
save_dir = "./DOWNLOADS"
use_title_as_filename = True
tool = ArxivPaperTool(
download_pdfs=True,
save_dir=save_dir,
use_title_as_filename=True
)
tool.result_as_answer = True  # Required so the tool output is returned as the task's final answer
arxiv_paper_fetch = Agent(
role="Arxiv Data Fetcher",
goal=f"Retrieve relevant papers from arXiv based on a research topic {topic} and maximum number of papers to be downloaded is{max_results},try to use title as filename {use_title_as_filename} and download PDFs to {save_dir},",
backstory="An expert in scientific data retrieval, skilled in extracting academic content from arXiv.",
# tools=[ArxivPaperTool()],
llm=llm,
verbose=True,
allow_delegation=False
)
fetch_task = Task(
description=(
f"Search arXiv for the topic '{topic}' and fetch up to {max_results} papers. "
f"Download PDFs for analysis and store them at {save_dir}."
),
expected_output="PDFs saved to disk for downstream agents.",
agent=arxiv_paper_fetch,
tools=[tool], # Use the actual tool instance here
)
pdf_qa_crew = Crew(
agents=[arxiv_paper_fetch],
tasks=[fetch_task],
process=Process.sequential,
verbose=True,
)
result = pdf_qa_crew.kickoff()
print(f"\n🤖 Answer:\n\n{result.raw}\n")
```

@@ -0,0 +1,142 @@
# 📚 ArxivPaperTool
The **ArxivPaperTool** is a utility for fetching metadata and optionally downloading PDFs of academic papers from the [arXiv](https://arxiv.org) platform using its public API. It supports configurable queries, batch retrieval, PDF downloading, and clean formatting for summaries and metadata. This tool is particularly useful for researchers, students, academic agents, and AI tools performing automated literature reviews.
---
## Description
This tool:
* Accepts a **search query** and retrieves a list of papers from arXiv.
* Allows configuration of the **maximum number of results** to fetch.
* Optionally downloads the **PDFs** of the matched papers.
* Lets you specify whether to name PDF files using the **arXiv ID** or **paper title**.
* Saves downloaded files into a **custom or default directory**.
* Returns structured summaries of all fetched papers including metadata.
---
## Arguments
| Argument | Type | Required | Description |
| ----------------------- | ------ | -------- | --------------------------------------------------------------------------------- |
| `search_query` | `str` | ✅ | Search query string (e.g., `"transformer neural network"`). |
| `max_results` | `int` | ✅ | Number of results to fetch (between 1 and 100). |
| `download_pdfs` | `bool` | ❌ | Whether to download the corresponding PDFs. Defaults to `False`. |
| `save_dir` | `str` | ❌ | Directory to save PDFs (created if it doesn't exist). Defaults to `./arxiv_pdfs`. |
| `use_title_as_filename` | `bool` | ❌ | Use the paper title as the filename (sanitized). Defaults to `False`. |
---
## 📄 `ArxivPaperTool` Usage Examples
This document shows how to use the `ArxivPaperTool` to fetch research paper metadata from arXiv and optionally download PDFs.
### 🔧 Tool Initialization
```python
from crewai_tools import ArxivPaperTool
```
---
### Example 1: Fetch Metadata Only (No Downloads)
```python
tool = ArxivPaperTool()
result = tool._run(
search_query="deep learning",
max_results=1
)
print(result)
```
---
### Example 2: Fetch and Download PDFs (arXiv ID as Filename)
```python
tool = ArxivPaperTool(download_pdfs=True)
result = tool._run(
search_query="transformer models",
max_results=2
)
print(result)
```
---
### Example 3: Download PDFs into a Custom Directory
```python
tool = ArxivPaperTool(
download_pdfs=True,
save_dir="./my_papers"
)
result = tool._run(
search_query="graph neural networks",
max_results=2
)
print(result)
```
---
### Example 4: Use Paper Titles as Filenames
```python
tool = ArxivPaperTool(
download_pdfs=True,
use_title_as_filename=True
)
result = tool._run(
search_query="vision transformers",
max_results=1
)
print(result)
```
---
### Example 5: All Options Combined
```python
tool = ArxivPaperTool(
download_pdfs=True,
save_dir="./downloads",
use_title_as_filename=True
)
result = tool._run(
search_query="stable diffusion",
max_results=3
)
print(result)
```
---
### Run via `__main__`
Your file can also include:
```python
if __name__ == "__main__":
tool = ArxivPaperTool(
download_pdfs=True,
save_dir="./downloads2",
use_title_as_filename=False
)
result = tool._run(
search_query="deep learning",
max_results=1
)
print(result)
```
---

@@ -0,0 +1,152 @@
import re
import time
import urllib.request
import urllib.parse
import urllib.error
import xml.etree.ElementTree as ET
from typing import Type, List, Optional, ClassVar
from pydantic import BaseModel, Field
from crewai.tools import BaseTool, EnvVar
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
class ArxivToolInput(BaseModel):
search_query: str = Field(..., description="Search query for Arxiv, e.g., 'transformer neural network'")
max_results: int = Field(5, ge=1, le=100, description="Max results to fetch; must be between 1 and 100")
class ArxivPaperTool(BaseTool):
BASE_API_URL: ClassVar[str] = "http://export.arxiv.org/api/query"
SLEEP_DURATION: ClassVar[int] = 1
SUMMARY_TRUNCATE_LENGTH: ClassVar[int] = 300
ATOM_NAMESPACE: ClassVar[str] = "{http://www.w3.org/2005/Atom}"
REQUEST_TIMEOUT: ClassVar[int] = 10
name: str = "Arxiv Paper Fetcher and Downloader"
description: str = "Fetches metadata from Arxiv based on a search query and optionally downloads PDFs."
args_schema: Type[BaseModel] = ArxivToolInput
model_config = {"extra": "allow"}
package_dependencies: List[str] = ["pydantic"]
env_vars: List[EnvVar] = []
def __init__(self, download_pdfs=False, save_dir="./arxiv_pdfs", use_title_as_filename=False):
super().__init__()
self.download_pdfs = download_pdfs
self.save_dir = save_dir
self.use_title_as_filename = use_title_as_filename
def _run(self, search_query: str, max_results: int = 5) -> str:
try:
args = ArxivToolInput(search_query=search_query, max_results=max_results)
logger.info(f"Running Arxiv tool: query='{args.search_query}', max_results={args.max_results}, "
f"download_pdfs={self.download_pdfs}, save_dir='{self.save_dir}', "
f"use_title_as_filename={self.use_title_as_filename}")
papers = self.fetch_arxiv_data(args.search_query, args.max_results)
if self.download_pdfs:
save_dir = self._validate_save_path(self.save_dir)
for paper in papers:
if paper['pdf_url']:
if self.use_title_as_filename:
safe_title = re.sub(r'[\\/*?:"<>|]', "_", paper['title']).strip()
filename_base = safe_title or paper['arxiv_id']
else:
filename_base = paper['arxiv_id']
filename = f"{filename_base[:500]}.pdf"
save_path = Path(save_dir) / filename
self.download_pdf(paper['pdf_url'], save_path)
time.sleep(self.SLEEP_DURATION)
results = [self._format_paper_result(p) for p in papers]
return "\n\n" + "-" * 80 + "\n\n".join(results)
except Exception as e:
logger.error(f"ArxivTool Error: {str(e)}")
return f"Failed to fetch or download Arxiv papers: {str(e)}"
def fetch_arxiv_data(self, search_query: str, max_results: int) -> List[dict]:
api_url = f"{self.BASE_API_URL}?search_query={urllib.parse.quote(search_query)}&start=0&max_results={max_results}"
logger.info(f"Fetching data from Arxiv API: {api_url}")
try:
with urllib.request.urlopen(api_url, timeout=self.REQUEST_TIMEOUT) as response:
if response.status != 200:
raise Exception(f"HTTP {response.status}: {response.reason}")
data = response.read().decode('utf-8')
except urllib.error.URLError as e:
logger.error(f"Error fetching data from Arxiv: {e}")
raise
root = ET.fromstring(data)
papers = []
for entry in root.findall(self.ATOM_NAMESPACE + "entry"):
raw_id = self._get_element_text(entry, "id")
arxiv_id = raw_id.split('/')[-1].replace('.', '_') if raw_id else "unknown"
title = self._get_element_text(entry, "title") or "No Title"
summary = self._get_element_text(entry, "summary") or "No Summary"
published = self._get_element_text(entry, "published") or "No Publish Date"
authors = [
self._get_element_text(author, "name") or "Unknown"
for author in entry.findall(self.ATOM_NAMESPACE + "author")
]
pdf_url = self._extract_pdf_url(entry)
papers.append({
"arxiv_id": arxiv_id,
"title": title,
"summary": summary,
"authors": authors,
"published_date": published,
"pdf_url": pdf_url
})
return papers
@staticmethod
def _get_element_text(entry: ET.Element, element_name: str) -> Optional[str]:
elem = entry.find(f'{ArxivPaperTool.ATOM_NAMESPACE}{element_name}')
return elem.text.strip() if elem is not None and elem.text else None
def _extract_pdf_url(self, entry: ET.Element) -> Optional[str]:
for link in entry.findall(self.ATOM_NAMESPACE + "link"):
if link.attrib.get('title', '').lower() == 'pdf':
return link.attrib.get('href')
for link in entry.findall(self.ATOM_NAMESPACE + "link"):
href = link.attrib.get('href')
if href and 'pdf' in href:
return href
return None
def _format_paper_result(self, paper: dict) -> str:
summary = (paper['summary'][:self.SUMMARY_TRUNCATE_LENGTH] + '...') \
if len(paper['summary']) > self.SUMMARY_TRUNCATE_LENGTH else paper['summary']
authors_str = ', '.join(paper['authors'])
return (f"Title: {paper['title']}\n"
f"Authors: {authors_str}\n"
f"Published: {paper['published_date']}\n"
f"PDF: {paper['pdf_url'] or 'N/A'}\n"
f"Summary: {summary}")
@staticmethod
def _validate_save_path(path: str) -> Path:
save_path = Path(path).resolve()
save_path.mkdir(parents=True, exist_ok=True)
return save_path
    def download_pdf(self, pdf_url: str, save_path: Path) -> None:
try:
logger.info(f"Downloading PDF from {pdf_url} to {save_path}")
urllib.request.urlretrieve(pdf_url, str(save_path))
logger.info(f"PDF saved: {save_path}")
except urllib.error.URLError as e:
logger.error(f"Network error occurred while downloading {pdf_url}: {e}")
raise
except OSError as e:
logger.error(f"File save error for {save_path}: {e}")
raise

@@ -0,0 +1,113 @@
import pytest
import urllib.error
from unittest.mock import patch, MagicMock
from pathlib import Path
import xml.etree.ElementTree as ET
from crewai_tools import ArxivPaperTool
@pytest.fixture
def tool():
return ArxivPaperTool(download_pdfs=False)
def mock_arxiv_response():
return '''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<id>http://arxiv.org/abs/1234.5678</id>
<title>Sample Paper</title>
<summary>This is a summary of the sample paper.</summary>
<published>2022-01-01T00:00:00Z</published>
<author><name>John Doe</name></author>
<link title="pdf" href="http://arxiv.org/pdf/1234.5678.pdf"/>
</entry>
</feed>'''
@patch("urllib.request.urlopen")
def test_fetch_arxiv_data(mock_urlopen, tool):
mock_response = MagicMock()
mock_response.status = 200
mock_response.read.return_value = mock_arxiv_response().encode("utf-8")
mock_urlopen.return_value.__enter__.return_value = mock_response
results = tool.fetch_arxiv_data("transformer", 1)
assert isinstance(results, list)
assert results[0]['title'] == "Sample Paper"
@patch("urllib.request.urlopen", side_effect=urllib.error.URLError("Timeout"))
def test_fetch_arxiv_data_network_error(mock_urlopen, tool):
with pytest.raises(urllib.error.URLError):
tool.fetch_arxiv_data("transformer", 1)
@patch("urllib.request.urlretrieve")
def test_download_pdf_success(mock_urlretrieve):
tool = ArxivPaperTool()
tool.download_pdf("http://arxiv.org/pdf/1234.5678.pdf", Path("test.pdf"))
mock_urlretrieve.assert_called_once()
@patch("urllib.request.urlretrieve", side_effect=OSError("Permission denied"))
def test_download_pdf_oserror(mock_urlretrieve):
tool = ArxivPaperTool()
with pytest.raises(OSError):
tool.download_pdf("http://arxiv.org/pdf/1234.5678.pdf", Path("/restricted/test.pdf"))
@patch("urllib.request.urlopen")
@patch("urllib.request.urlretrieve")
def test_run_with_download(mock_urlretrieve, mock_urlopen):
mock_response = MagicMock()
mock_response.status = 200
mock_response.read.return_value = mock_arxiv_response().encode("utf-8")
mock_urlopen.return_value.__enter__.return_value = mock_response
tool = ArxivPaperTool(download_pdfs=True)
output = tool._run("transformer", 1)
assert "Title: Sample Paper" in output
mock_urlretrieve.assert_called_once()
@patch("urllib.request.urlopen")
def test_run_no_download(mock_urlopen):
mock_response = MagicMock()
mock_response.status = 200
mock_response.read.return_value = mock_arxiv_response().encode("utf-8")
mock_urlopen.return_value.__enter__.return_value = mock_response
tool = ArxivPaperTool(download_pdfs=False)
result = tool._run("transformer", 1)
assert "Title: Sample Paper" in result
@patch("pathlib.Path.mkdir")
def test_validate_save_path_creates_directory(mock_mkdir):
path = ArxivPaperTool._validate_save_path("new_folder")
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
assert isinstance(path, Path)
@patch("urllib.request.urlopen")
def test_run_handles_exception(mock_urlopen):
mock_urlopen.side_effect = Exception("API failure")
tool = ArxivPaperTool()
result = tool._run("transformer", 1)
assert "Failed to fetch or download Arxiv papers" in result
@patch("urllib.request.urlopen")
def test_invalid_xml_response(mock_urlopen, tool):
mock_response = MagicMock()
mock_response.read.return_value = b"<invalid><xml>"
mock_response.status = 200
mock_urlopen.return_value.__enter__.return_value = mock_response
with pytest.raises(ET.ParseError):
tool.fetch_arxiv_data("quantum", 1)
@patch.object(ArxivPaperTool, "fetch_arxiv_data")
def test_run_with_max_results(mock_fetch, tool):
mock_fetch.return_value = [{
"arxiv_id": f"test_{i}",
"title": f"Title {i}",
"summary": "Summary",
"authors": ["Author"],
"published_date": "2023-01-01",
"pdf_url": None
} for i in range(100)]
result = tool._run(search_query="test", max_results=100)
assert result.count("Title:") == 100

@@ -0,0 +1,30 @@
# BraveSearchTool Documentation
## Description
This tool performs a web search for a specified query across the internet. It utilizes the Brave Web Search API, a REST API for querying Brave Search and getting back search results from the web. The following sections describe how to construct requests to the Brave Web Search API, including parameters and headers, and receive a JSON response.
## Installation
To incorporate this tool into your project, follow the installation instructions below:
```shell
pip install 'crewai[tools]'
```
## Example
The following example demonstrates how to initialize the tool and execute a search with a given query:
```python
from crewai_tools import BraveSearchTool
# Initialize the tool for internet searching capabilities
tool = BraveSearchTool()
```
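With the `BRAVE_API_KEY` environment variable set (see the steps below), the tool can then be run directly. A minimal sketch:
```python
# Execute a search; results are returned as formatted title/link/snippet text.
results = tool.run(search_query="latest AI developments")
print(results)
```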
## Steps to Get Started
To effectively use the `BraveSearchTool`, follow these steps:
1. **Package Installation**: Confirm that the `crewai[tools]` package is installed in your Python environment.
2. **API Key Acquisition**: Acquire an API key [here](https://api.search.brave.com/app/keys).
3. **Environment Configuration**: Store your obtained API key in an environment variable named `BRAVE_API_KEY` to facilitate its use by the tool.
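On macOS or Linux, for example:
```shell
export BRAVE_API_KEY='your-api-key-here'
```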
## Conclusion
By integrating the `BraveSearchTool` into Python projects, users gain the ability to conduct real-time, relevant searches across the internet directly from their applications. By adhering to the setup and usage guidelines provided, incorporating this tool into projects is streamlined and straightforward.

@@ -0,0 +1,121 @@
import datetime
import os
import time
from typing import Any, ClassVar, List, Optional, Type
import requests
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, Field
def _save_results_to_file(content: str) -> None:
"""Saves the search results to a file."""
filename = f"search_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"
with open(filename, "w") as file:
file.write(content)
print(f"Results saved to {filename}")
class BraveSearchToolSchema(BaseModel):
"""Input for BraveSearchTool."""
search_query: str = Field(
..., description="Mandatory search query you want to use to search the internet"
)
class BraveSearchTool(BaseTool):
"""
BraveSearchTool - A tool for performing web searches using the Brave Search API.
This module provides functionality to search the internet using Brave's Search API,
supporting customizable result counts and country-specific searches.
Dependencies:
- requests
- pydantic
- python-dotenv (for API key management)
"""
name: str = "Brave Web Search the internet"
description: str = (
"A tool that can be used to search the internet with a search_query."
)
args_schema: Type[BaseModel] = BraveSearchToolSchema
search_url: str = "https://api.search.brave.com/res/v1/web/search"
country: Optional[str] = ""
n_results: int = 10
save_file: bool = False
_last_request_time: ClassVar[float] = 0
_min_request_interval: ClassVar[float] = 1.0 # seconds
env_vars: List[EnvVar] = [
EnvVar(name="BRAVE_API_KEY", description="API key for Brave Search", required=True),
]
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if "BRAVE_API_KEY" not in os.environ:
raise ValueError(
"BRAVE_API_KEY environment variable is required for BraveSearchTool"
)
def _run(
self,
**kwargs: Any,
) -> Any:
current_time = time.time()
if (current_time - self._last_request_time) < self._min_request_interval:
time.sleep(
self._min_request_interval - (current_time - self._last_request_time)
)
BraveSearchTool._last_request_time = time.time()
try:
search_query = kwargs.get("search_query") or kwargs.get("query")
if not search_query:
raise ValueError("Search query is required")
save_file = kwargs.get("save_file", self.save_file)
n_results = kwargs.get("n_results", self.n_results)
payload = {"q": search_query, "count": n_results}
if self.country != "":
payload["country"] = self.country
headers = {
"X-Subscription-Token": os.environ["BRAVE_API_KEY"],
"Accept": "application/json",
}
response = requests.get(self.search_url, headers=headers, params=payload)
response.raise_for_status() # Handle non-200 responses
results = response.json()
if "web" in results:
results = results["web"]["results"]
string = []
for result in results:
try:
string.append(
"\n".join(
[
f"Title: {result['title']}",
f"Link: {result['url']}",
f"Snippet: {result['description']}",
"---",
]
)
)
except KeyError:
continue
content = "\n".join(string)
except requests.RequestException as e:
return f"Error performing search: {str(e)}"
except KeyError as e:
return f"Error parsing search results: {str(e)}"
if save_file:
_save_results_to_file(content)
return f"\nSearch results: {content}\n"
else:
return content

@@ -0,0 +1,79 @@
# BrightData Tools Documentation
## Description
A comprehensive suite of CrewAI tools that leverage Bright Data's powerful infrastructure for web scraping, data extraction, and search operations. These tools provide three distinct capabilities:
- **BrightDataDatasetTool**: Extract structured data from popular data feeds (Amazon, LinkedIn, Instagram, etc.) using pre-built datasets
- **BrightDataSearchTool**: Perform web searches across multiple search engines with geo-targeting and device simulation
- **BrightDataWebUnlockerTool**: Scrape any website content while bypassing bot protection mechanisms
## Installation
To incorporate these tools into your project, follow the installation instructions below:
```shell
pip install 'crewai[tools]' aiohttp requests
```
## Examples
### Dataset Tool - Extract Amazon Product Data
```python
from crewai_tools import BrightDataDatasetTool
# Initialize with specific dataset and URL
tool = BrightDataDatasetTool(
dataset_type="amazon_product",
url="https://www.amazon.com/dp/B08QB1QMJ5/"
)
result = tool.run()
```
### Search Tool - Perform Web Search
```python
from crewai_tools import BrightDataSearchTool
# Initialize with search query
tool = BrightDataSearchTool(
query="latest AI trends 2025",
search_engine="google",
country="us"
)
result = tool.run()
```
### Web Unlocker Tool - Scrape Website Content
```python
from crewai_tools import BrightDataWebUnlockerTool
# Initialize with target URL
tool = BrightDataWebUnlockerTool(
url="https://example.com",
data_format="markdown"
)
result = tool.run()
```
## Steps to Get Started
To effectively use the BrightData Tools, follow these steps:
1. **Package Installation**: Confirm that the `crewai[tools]` package is installed in your Python environment.
2. **API Key Acquisition**: Register for a Bright Data account at `https://brightdata.com/` and obtain your API credentials from your account settings.
3. **Environment Configuration**: Set up the required environment variables:
```bash
export BRIGHT_DATA_API_KEY="your_api_key_here"
export BRIGHT_DATA_ZONE="your_zone_here"
```
4. **Tool Selection**: Choose the appropriate tool based on your needs (see the sketch after this list):
- Use **DatasetTool** for structured data from supported platforms
- Use **SearchTool** for web search operations
- Use **WebUnlockerTool** for general website scraping
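These tools can then be handed to agents like any other CrewAI tool. A minimal sketch, assuming the environment variables above are set:
```python
from crewai import Agent
from crewai_tools import BrightDataWebUnlockerTool

# Scrape a page as markdown through Bright Data's Web Unlocker
scraper = BrightDataWebUnlockerTool(
    url="https://example.com",
    data_format="markdown"
)

researcher = Agent(
    role="Web Researcher",
    goal="Gather page content for downstream analysis",
    backstory="An analyst who collects source material from the web.",
    tools=[scraper],
)
```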
## Conclusion
By integrating BrightData Tools into your CrewAI agents, you gain access to enterprise-grade web scraping and data extraction capabilities. These tools handle complex challenges like bot protection, geo-restrictions, and data parsing, allowing you to focus on building your applications rather than managing scraping infrastructure.

@@ -0,0 +1,9 @@
from .brightdata_dataset import BrightDataDatasetTool
from .brightdata_serp import BrightDataSearchTool
from .brightdata_unlocker import BrightDataWebUnlockerTool
__all__ = [
"BrightDataDatasetTool",
"BrightDataSearchTool",
"BrightDataWebUnlockerTool"
]

@@ -0,0 +1,570 @@
import asyncio
import os
from typing import Any, Dict, Optional, Type
import aiohttp
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class BrightDataConfig(BaseModel):
API_URL: str = "https://api.brightdata.com"
DEFAULT_TIMEOUT: int = 600
DEFAULT_POLLING_INTERVAL: int = 1
@classmethod
def from_env(cls):
return cls(
API_URL=os.environ.get("BRIGHTDATA_API_URL", "https://api.brightdata.com"),
DEFAULT_TIMEOUT=int(os.environ.get("BRIGHTDATA_DEFAULT_TIMEOUT", "600")),
DEFAULT_POLLING_INTERVAL=int(os.environ.get("BRIGHTDATA_DEFAULT_POLLING_INTERVAL", "1"))
)
class BrightDataDatasetToolException(Exception):
"""Exception raised for custom error in the application."""
def __init__(self, message, error_code):
self.message = message
super().__init__(message)
self.error_code = error_code
def __str__(self):
return f"{self.message} (Error Code: {self.error_code})"
class BrightDataDatasetToolSchema(BaseModel):
"""
Schema for validating input parameters for the BrightDataDatasetTool.
Attributes:
dataset_type (str): Required Bright Data Dataset Type used to specify which dataset to access.
format (str): Response format (json by default). Multiple formats exist - json, ndjson, jsonl, csv
url (str): The URL from which structured data needs to be extracted.
zipcode (Optional[str]): An optional ZIP code to narrow down the data geographically.
additional_params (Optional[Dict]): Extra parameters for the Bright Data API call.
"""
dataset_type: str = Field(..., description="The Bright Data Dataset Type")
format: Optional[str] = Field(
default="json", description="Response format (json by default)"
)
url: str = Field(..., description="The URL to extract data from")
zipcode: Optional[str] = Field(default=None, description="Optional zipcode")
additional_params: Optional[Dict[str, Any]] = Field(
default=None, description="Additional params if any"
)
config = BrightDataConfig.from_env()
BRIGHTDATA_API_URL = config.API_URL
timeout = config.DEFAULT_TIMEOUT
datasets = [
{
"id": "amazon_product",
"dataset_id": "gd_l7q7dkf244hwjntr0",
"description": "\n".join(
[
"Quickly read structured amazon product data.",
"Requires a valid product URL with /dp/ in it.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "amazon_product_reviews",
"dataset_id": "gd_le8e811kzy4ggddlq",
"description": "\n".join(
[
"Quickly read structured amazon product review data.",
"Requires a valid product URL with /dp/ in it.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "amazon_product_search",
"dataset_id": "gd_lwdb4vjm1ehb499uxs",
"description": "\n".join(
[
"Quickly read structured amazon product search data.",
"Requires a valid search keyword and amazon domain URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["keyword", "url", "pages_to_search"],
"defaults": {"pages_to_search": "1"},
},
{
"id": "walmart_product",
"dataset_id": "gd_l95fol7l1ru6rlo116",
"description": "\n".join(
[
"Quickly read structured walmart product data.",
"Requires a valid product URL with /ip/ in it.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "walmart_seller",
"dataset_id": "gd_m7ke48w81ocyu4hhz0",
"description": "\n".join(
[
"Quickly read structured walmart seller data.",
"Requires a valid walmart seller URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "ebay_product",
"dataset_id": "gd_ltr9mjt81n0zzdk1fb",
"description": "\n".join(
[
"Quickly read structured ebay product data.",
"Requires a valid ebay product URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "homedepot_products",
"dataset_id": "gd_lmusivh019i7g97q2n",
"description": "\n".join(
[
"Quickly read structured homedepot product data.",
"Requires a valid homedepot product URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "zara_products",
"dataset_id": "gd_lct4vafw1tgx27d4o0",
"description": "\n".join(
[
"Quickly read structured zara product data.",
"Requires a valid zara product URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "etsy_products",
"dataset_id": "gd_ltppk0jdv1jqz25mz",
"description": "\n".join(
[
"Quickly read structured etsy product data.",
"Requires a valid etsy product URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "bestbuy_products",
"dataset_id": "gd_ltre1jqe1jfr7cccf",
"description": "\n".join(
[
"Quickly read structured bestbuy product data.",
"Requires a valid bestbuy product URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "linkedin_person_profile",
"dataset_id": "gd_l1viktl72bvl7bjuj0",
"description": "\n".join(
[
"Quickly read structured linkedin people profile data.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "linkedin_company_profile",
"dataset_id": "gd_l1vikfnt1wgvvqz95w",
"description": "\n".join(
[
"Quickly read structured linkedin company profile data",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "linkedin_job_listings",
"dataset_id": "gd_lpfll7v5hcqtkxl6l",
"description": "\n".join(
[
"Quickly read structured linkedin job listings data",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "linkedin_posts",
"dataset_id": "gd_lyy3tktm25m4avu764",
"description": "\n".join(
[
"Quickly read structured linkedin posts data",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "linkedin_people_search",
"dataset_id": "gd_m8d03he47z8nwb5xc",
"description": "\n".join(
[
"Quickly read structured linkedin people search data",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url", "first_name", "last_name"],
},
{
"id": "crunchbase_company",
"dataset_id": "gd_l1vijqt9jfj7olije",
"description": "\n".join(
[
"Quickly read structured crunchbase company data",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "zoominfo_company_profile",
"dataset_id": "gd_m0ci4a4ivx3j5l6nx",
"description": "\n".join(
[
"Quickly read structured ZoomInfo company profile data.",
"Requires a valid ZoomInfo company URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "instagram_profiles",
"dataset_id": "gd_l1vikfch901nx3by4",
"description": "\n".join(
[
"Quickly read structured Instagram profile data.",
"Requires a valid Instagram URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "instagram_posts",
"dataset_id": "gd_lk5ns7kz21pck8jpis",
"description": "\n".join(
[
"Quickly read structured Instagram post data.",
"Requires a valid Instagram URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "instagram_reels",
"dataset_id": "gd_lyclm20il4r5helnj",
"description": "\n".join(
[
"Quickly read structured Instagram reel data.",
"Requires a valid Instagram URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "instagram_comments",
"dataset_id": "gd_ltppn085pokosxh13",
"description": "\n".join(
[
"Quickly read structured Instagram comments data.",
"Requires a valid Instagram URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "facebook_posts",
"dataset_id": "gd_lyclm1571iy3mv57zw",
"description": "\n".join(
[
"Quickly read structured Facebook post data.",
"Requires a valid Facebook post URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "facebook_marketplace_listings",
"dataset_id": "gd_lvt9iwuh6fbcwmx1a",
"description": "\n".join(
[
"Quickly read structured Facebook marketplace listing data.",
"Requires a valid Facebook marketplace listing URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "facebook_company_reviews",
"dataset_id": "gd_m0dtqpiu1mbcyc2g86",
"description": "\n".join(
[
"Quickly read structured Facebook company reviews data.",
"Requires a valid Facebook company URL and number of reviews.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url", "num_of_reviews"],
},
{
"id": "facebook_events",
"dataset_id": "gd_m14sd0to1jz48ppm51",
"description": "\n".join(
[
"Quickly read structured Facebook events data.",
"Requires a valid Facebook event URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "tiktok_profiles",
"dataset_id": "gd_l1villgoiiidt09ci",
"description": "\n".join(
[
"Quickly read structured Tiktok profiles data.",
"Requires a valid Tiktok profile URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "tiktok_posts",
"dataset_id": "gd_lu702nij2f790tmv9h",
"description": "\n".join(
[
"Quickly read structured Tiktok post data.",
"Requires a valid Tiktok post URL.",
"This can be a cache lookup, so it can be more reliable than scraping",
]
),
"inputs": ["url"],
},
{
"id": "tiktok_shop",
"dataset_id": "gd_m45m1u911dsa4274pi",
"description": "\n".join(
[
"Quickly read structured Tiktok shop data.",
"Requires a valid Tiktok shop product URL.",
"This can be a cache lookup...",
]
),
"inputs": ["url"],
},
]
class BrightDataDatasetTool(BaseTool):
"""
CrewAI-compatible tool for scraping structured data using Bright Data Datasets.
Attributes:
name (str): Tool name displayed in the CrewAI environment.
description (str): Tool description shown to agents or users.
args_schema (Type[BaseModel]): Pydantic schema for validating input arguments.
"""
name: str = "Bright Data Dataset Tool"
description: str = "Scrapes structured data using Bright Data Dataset API from a URL and optional input parameters"
args_schema: Type[BaseModel] = BrightDataDatasetToolSchema
dataset_type: Optional[str] = None
url: Optional[str] = None
format: str = "json"
zipcode: Optional[str] = None
additional_params: Optional[Dict[str, Any]] = None
    def __init__(self, dataset_type: Optional[str] = None, url: Optional[str] = None, format: str = "json", zipcode: Optional[str] = None, additional_params: Optional[Dict[str, Any]] = None):
super().__init__()
self.dataset_type = dataset_type
self.url = url
self.format = format
self.zipcode = zipcode
self.additional_params = additional_params
def filter_dataset_by_id(self, target_id):
return [dataset for dataset in datasets if dataset["id"] == target_id]
async def get_dataset_data_async(
self,
dataset_type: str,
output_format: str,
url: str,
zipcode: Optional[str] = None,
additional_params: Optional[Dict[str, Any]] = None,
polling_interval: int = 1,
) -> Dict:
"""
Asynchronously trigger and poll Bright Data dataset scraping.
Args:
dataset_type (str): Bright Data Dataset Type.
url (str): Target URL to scrape.
zipcode (Optional[str]): Optional ZIP code for geo-specific data.
additional_params (Optional[Dict]): Extra API parameters.
polling_interval (int): Time interval in seconds between polling attempts.
Returns:
Dict: Structured dataset result from Bright Data.
Raises:
Exception: If any API step fails or the job fails.
TimeoutError: If polling times out before job completion.
"""
request_data = {"url": url}
if zipcode is not None:
request_data["zipcode"] = zipcode
# Set additional parameters dynamically depending upon the dataset that is being requested
if additional_params:
request_data.update(additional_params)
api_key = os.getenv("BRIGHT_DATA_API_KEY")
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
dataset_id = ""
dataset = self.filter_dataset_by_id(dataset_type)
if len(dataset) == 1:
dataset_id = dataset[0]["dataset_id"]
else:
raise ValueError(
f"Unable to find the dataset for {dataset_type}. Please make sure to pass a valid one"
)
async with aiohttp.ClientSession() as session:
# Step 1: Trigger job
async with session.post(
f"{BRIGHTDATA_API_URL}/datasets/v3/trigger",
params={"dataset_id": dataset_id, "include_errors": "true"},
json=[request_data],
headers=headers,
) as trigger_response:
if trigger_response.status != 200:
raise BrightDataDatasetToolException(
f"Trigger failed: {await trigger_response.text()}",
trigger_response.status,
)
trigger_data = await trigger_response.json()
print(trigger_data)
snapshot_id = trigger_data.get("snapshot_id")
# Step 2: Poll for completion
elapsed = 0
while elapsed < timeout:
await asyncio.sleep(polling_interval)
elapsed += polling_interval
async with session.get(
f"{BRIGHTDATA_API_URL}/datasets/v3/progress/{snapshot_id}",
headers=headers,
) as status_response:
if status_response.status != 200:
raise BrightDataDatasetToolException(
f"Status check failed: {await status_response.text()}",
status_response.status,
)
status_data = await status_response.json()
if status_data.get("status") == "ready":
print("Job is ready")
break
elif status_data.get("status") == "error":
raise BrightDataDatasetToolException(
f"Job failed: {status_data}", 0
)
else:
raise TimeoutError("Polling timed out before job completed.")
# Step 3: Retrieve result
async with session.get(
f"{BRIGHTDATA_API_URL}/datasets/v3/snapshot/{snapshot_id}",
params={"format": output_format},
headers=headers,
) as snapshot_response:
if snapshot_response.status != 200:
raise BrightDataDatasetToolException(
f"Result fetch failed: {await snapshot_response.text()}",
snapshot_response.status,
)
return await snapshot_response.text()
    def _run(self, url: Optional[str] = None, dataset_type: Optional[str] = None, format: Optional[str] = None, zipcode: Optional[str] = None, additional_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Any:
dataset_type = dataset_type or self.dataset_type
output_format = format or self.format
url = url or self.url
zipcode = zipcode or self.zipcode
additional_params = additional_params or self.additional_params
if not dataset_type:
raise ValueError("dataset_type is required either in constructor or method call")
if not url:
raise ValueError("url is required either in constructor or method call")
valid_output_formats = {"json", "ndjson", "jsonl", "csv"}
if output_format not in valid_output_formats:
raise ValueError(
f"Unsupported output format: {output_format}. Must be one of {', '.join(valid_output_formats)}."
)
api_key = os.getenv("BRIGHT_DATA_API_KEY")
if not api_key:
raise ValueError("BRIGHT_DATA_API_KEY environment variable is required.")
try:
return asyncio.run(
self.get_dataset_data_async(
dataset_type=dataset_type,
output_format=output_format,
url=url,
zipcode=zipcode,
additional_params=additional_params,
)
)
        except TimeoutError as e:
            return f"Timeout exception occurred in method get_dataset_data_async. Details - {str(e)}"
        except BrightDataDatasetToolException as e:
            return f"Exception occurred in method get_dataset_data_async. Details - {str(e)}"
        except Exception as e:
            return f"Bright Data API error: {str(e)}"

@@ -0,0 +1,207 @@
import os
import urllib.parse
from typing import Any, Optional, Type
import requests
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class BrightDataConfig(BaseModel):
API_URL: str = "https://api.brightdata.com/request"
@classmethod
def from_env(cls):
return cls(
API_URL=os.environ.get("BRIGHTDATA_API_URL", "https://api.brightdata.com/request")
)
class BrightDataSearchToolSchema(BaseModel):
"""
Schema that defines the input arguments for the BrightDataSearchToolSchema.
Attributes:
query (str): The search query to be executed (e.g., "latest AI news").
search_engine (Optional[str]): The search engine to use ("google", "bing", "yandex"). Default is "google".
country (Optional[str]): Two-letter country code for geo-targeting (e.g., "us", "in"). Default is "us".
language (Optional[str]): Language code for search results (e.g., "en", "es"). Default is "en".
search_type (Optional[str]): Type of search, such as "isch" (images), "nws" (news), "jobs", etc.
device_type (Optional[str]): Device type to simulate ("desktop", "mobile", "ios", "android"). Default is "desktop".
parse_results (Optional[bool]): If True, results will be returned in structured JSON. If False, raw HTML. Default is True.
"""
query: str = Field(..., description="Search query to perform")
search_engine: Optional[str] = Field(
default="google",
description="Search engine domain (e.g., 'google', 'bing', 'yandex')",
)
country: Optional[str] = Field(
default="us",
description="Two-letter country code for geo-targeting (e.g., 'us', 'gb')",
)
language: Optional[str] = Field(
default="en",
description="Language code (e.g., 'en', 'es') used in the query URL",
)
search_type: Optional[str] = Field(
default=None,
description="Type of search (e.g., 'isch' for images, 'nws' for news)",
)
device_type: Optional[str] = Field(
default="desktop",
description="Device type to simulate (e.g., 'mobile', 'desktop', 'ios')",
)
parse_results: Optional[bool] = Field(
default=True,
description="Whether to parse and return JSON (True) or raw HTML/text (False)",
)
class BrightDataSearchTool(BaseTool):
"""
A web search tool that utilizes Bright Data's SERP API to perform queries and return either structured results
or raw page content from search engines like Google or Bing.
Attributes:
name (str): Tool name used by the agent.
description (str): A brief explanation of what the tool does.
args_schema (Type[BaseModel]): Schema class for validating tool arguments.
base_url (str): The Bright Data API endpoint used for making the POST request.
api_key (str): Bright Data API key loaded from the environment variable 'BRIGHT_DATA_API_KEY'.
zone (str): Zone identifier from Bright Data, loaded from the environment variable 'BRIGHT_DATA_ZONE'.
Raises:
ValueError: If API key or zone environment variables are not set.
"""
name: str = "Bright Data SERP Search"
description: str = "Tool to perform web search using Bright Data SERP API."
args_schema: Type[BaseModel] = BrightDataSearchToolSchema
_config = BrightDataConfig.from_env()
base_url: str = ""
api_key: str = ""
zone: str = ""
query: Optional[str] = None
search_engine: str = "google"
country: str = "us"
language: str = "en"
search_type: Optional[str] = None
device_type: str = "desktop"
parse_results: bool = True
    def __init__(
        self,
        query: str = None,
        search_engine: str = "google",
        country: str = "us",
        language: str = "en",
        search_type: str = None,
        device_type: str = "desktop",
        parse_results: bool = True,
    ):
super().__init__()
self.base_url = self._config.API_URL
self.query = query
self.search_engine = search_engine
self.country = country
self.language = language
self.search_type = search_type
self.device_type = device_type
self.parse_results = parse_results
self.api_key = os.getenv("BRIGHT_DATA_API_KEY")
self.zone = os.getenv("BRIGHT_DATA_ZONE")
if not self.api_key:
raise ValueError("BRIGHT_DATA_API_KEY environment variable is required.")
if not self.zone:
raise ValueError("BRIGHT_DATA_ZONE environment variable is required.")
def get_search_url(self, engine: str, query: str):
        if engine == "yandex":
            return f"https://yandex.com/search/?text={query}"
        elif engine == "bing":
            return f"https://www.bing.com/search?q={query}"
        return f"https://www.google.com/search?q={query}"
    def _run(
        self,
        query: str = None,
        search_engine: str = None,
        country: str = None,
        language: str = None,
        search_type: str = None,
        device_type: str = None,
        parse_results: bool = None,
        **kwargs,
    ) -> Any:
"""
Executes a search query using Bright Data SERP API and returns results.
Args:
query (str): The search query string (URL encoded internally).
search_engine (str): The search engine to use (default: "google").
country (str): Country code for geotargeting (default: "us").
language (str): Language code for the query (default: "en").
search_type (str): Optional type of search such as "nws", "isch", "jobs".
device_type (str): Optional device type to simulate (e.g., "mobile", "ios", "desktop").
parse_results (bool): If True, returns structured data; else raw page (default: True).
results_count (str or int): Number of search results to fetch (default: "10").
Returns:
dict or str: Parsed JSON data from Bright Data if available, otherwise error message.
"""
query = query or self.query
search_engine = search_engine or self.search_engine
country = country or self.country
language = language or self.language
search_type = search_type or self.search_type
device_type = device_type or self.device_type
parse_results = parse_results if parse_results is not None else self.parse_results
results_count = kwargs.get("results_count", "10")
# Validate required parameters
if not query:
raise ValueError("query is required either in constructor or method call")
# Build the search URL
query = urllib.parse.quote(query)
url = self.get_search_url(search_engine, query)
# Add parameters to the URL
params = []
if country:
params.append(f"gl={country}")
if language:
params.append(f"hl={language}")
if results_count:
params.append(f"num={results_count}")
if parse_results:
params.append(f"brd_json=1")
if search_type:
if search_type == "jobs":
params.append("ibp=htl;jobs")
else:
params.append(f"tbm={search_type}")
if device_type:
if device_type == "mobile":
params.append("brd_mobile=1")
elif device_type == "ios":
params.append("brd_mobile=ios")
elif device_type == "android":
params.append("brd_mobile=android")
# Combine parameters with the URL
if params:
url += "&" + "&".join(params)
# Set up the API request parameters
request_params = {"zone": self.zone, "url": url, "format": "raw"}
request_params = {k: v for k, v in request_params.items() if v is not None}
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
try:
response = requests.post(
self.base_url, json=request_params, headers=headers
)
print(f"Status code: {response.status_code}")
response.raise_for_status()
return response.text
except requests.RequestException as e:
return f"Error performing BrightData search: {str(e)}"
except Exception as e:
return f"Error fetching results: {str(e)}"

View File

@@ -0,0 +1,122 @@
import os
from typing import Any, Optional, Type
import requests
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class BrightDataConfig(BaseModel):
API_URL: str = "https://api.brightdata.com/request"
@classmethod
def from_env(cls):
return cls(
API_URL=os.environ.get("BRIGHTDATA_API_URL", "https://api.brightdata.com/request")
)
class BrightDataUnlockerToolSchema(BaseModel):
"""
Pydantic schema for input parameters used by the BrightDataWebUnlockerTool.
This schema defines the structure and validation for parameters passed when performing
a web scraping request using Bright Data's Web Unlocker.
Attributes:
url (str): The target URL to scrape.
        format (Optional[str]): Format of the response returned by Bright Data (default: "raw").
        data_format (Optional[str]): Response data format; "markdown" by default, "html" is also supported.
"""
url: str = Field(..., description="URL to perform the web scraping")
format: Optional[str] = Field(
default="raw", description="Response format (raw is standard)"
)
    data_format: Optional[str] = Field(
        default="markdown",
        description="Response data format ('markdown' by default; 'html' is also supported)",
    )
class BrightDataWebUnlockerTool(BaseTool):
"""
A tool for performing web scraping using the Bright Data Web Unlocker API.
This tool allows automated and programmatic access to web pages by routing requests
through Bright Data's unlocking and proxy infrastructure, which can bypass bot
protection mechanisms like CAPTCHA, geo-restrictions, and anti-bot detection.
Attributes:
name (str): Name of the tool.
description (str): Description of what the tool does.
args_schema (Type[BaseModel]): Pydantic model schema for expected input arguments.
base_url (str): Base URL of the Bright Data Web Unlocker API.
api_key (str): Bright Data API key (must be set in the BRIGHT_DATA_API_KEY environment variable).
zone (str): Bright Data zone identifier (must be set in the BRIGHT_DATA_ZONE environment variable).
Methods:
_run(**kwargs: Any) -> Any:
Sends a scraping request to Bright Data's Web Unlocker API and returns the result.
"""
name: str = "Bright Data Web Unlocker Scraping"
description: str = "Tool to perform web scraping using Bright Data Web Unlocker"
args_schema: Type[BaseModel] = BrightDataUnlockerToolSchema
_config = BrightDataConfig.from_env()
base_url: str = ""
api_key: str = ""
zone: str = ""
url: Optional[str] = None
format: str = "raw"
data_format: str = "markdown"
def __init__(self, url: str = None, format: str = "raw", data_format: str = "markdown"):
super().__init__()
self.base_url = self._config.API_URL
self.url = url
self.format = format
self.data_format = data_format
self.api_key = os.getenv("BRIGHT_DATA_API_KEY")
self.zone = os.getenv("BRIGHT_DATA_ZONE")
if not self.api_key:
raise ValueError("BRIGHT_DATA_API_KEY environment variable is required.")
if not self.zone:
raise ValueError("BRIGHT_DATA_ZONE environment variable is required.")
def _run(self, url: str = None, format: str = None, data_format: str = None, **kwargs: Any) -> Any:
url = url or self.url
format = format or self.format
data_format = data_format or self.data_format
if not url:
raise ValueError("url is required either in constructor or method call")
payload = {
"url": url,
"zone": self.zone,
"format": format,
}
valid_data_formats = {"html", "markdown"}
if data_format not in valid_data_formats:
raise ValueError(
f"Unsupported data format: {data_format}. Must be one of {', '.join(valid_data_formats)}."
)
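        # "html" is the service default, so data_format only needs to be sent for markdown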
if data_format == "markdown":
payload["data_format"] = "markdown"
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
try:
response = requests.post(self.base_url, json=payload, headers=headers)
print(f"Status Code: {response.status_code}")
response.raise_for_status()
return response.text
except requests.RequestException as e:
return f"HTTP Error performing BrightData Web Unlocker Scrape: {e}\nResponse: {getattr(e.response, 'text', '')}"
except Exception as e:
return f"Error fetching results: {str(e)}"

View File

@@ -0,0 +1,38 @@
# BrowserbaseLoadTool
## Description
[Browserbase](https://browserbase.com) is a developer platform to reliably run, manage, and monitor headless browsers.
Power your AI data retrievals with:
- [Serverless Infrastructure](https://docs.browserbase.com/under-the-hood) providing reliable browsers to extract data from complex UIs
- [Stealth Mode](https://docs.browserbase.com/features/stealth-mode) with included fingerprinting tactics and automatic captcha solving
- [Session Debugger](https://docs.browserbase.com/features/sessions) to inspect your Browser Session with networks timeline and logs
- [Live Debug](https://docs.browserbase.com/guides/session-debug-connection/browser-remote-control) to quickly debug your automation
## Installation
- Get an API key and Project ID from [browserbase.com](https://browserbase.com) and set them in environment variables (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`).
- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk) along with the `crewai[tools]` package:
```
pip install browserbase 'crewai[tools]'
```
## Example
Utilize the BrowserbaseLoadTool as follows to allow your agent to load websites:
```python
from crewai_tools import BrowserbaseLoadTool
tool = BrowserbaseLoadTool()
```
## Arguments
- `api_key` Optional. Browserbase API key. Default is `BROWSERBASE_API_KEY` env variable.
- `project_id` Optional. Browserbase Project ID. Default is `BROWSERBASE_PROJECT_ID` env variable.
- `text_content` Retrieve only text content. Default is `False`.
- `session_id` Optional. Provide an existing Session ID.
- `proxy` Optional. Enable/disable proxies.
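As a minimal sketch, these arguments can also be passed explicitly at construction time instead of relying on environment variables (the key and project ID below are placeholders):
```python
from crewai_tools import BrowserbaseLoadTool

tool = BrowserbaseLoadTool(
    api_key="your-browserbase-api-key",  # placeholder; defaults to BROWSERBASE_API_KEY
    project_id="your-project-id",        # placeholder; defaults to BROWSERBASE_PROJECT_ID
    text_content=True,                   # return only the text content of the page
)
```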

View File

@@ -0,0 +1,67 @@
import os
from typing import Any, Optional, Type, List
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, Field
class BrowserbaseLoadToolSchema(BaseModel):
url: str = Field(description="Website URL")
class BrowserbaseLoadTool(BaseTool):
name: str = "Browserbase web load tool"
description: str = "Load webpages url in a headless browser using Browserbase and return the contents"
args_schema: Type[BaseModel] = BrowserbaseLoadToolSchema
api_key: Optional[str] = os.getenv("BROWSERBASE_API_KEY")
project_id: Optional[str] = os.getenv("BROWSERBASE_PROJECT_ID")
text_content: Optional[bool] = False
session_id: Optional[str] = None
proxy: Optional[bool] = None
browserbase: Optional[Any] = None
package_dependencies: List[str] = ["browserbase"]
env_vars: List[EnvVar] = [
EnvVar(name="BROWSERBASE_API_KEY", description="API key for Browserbase services", required=False),
EnvVar(name="BROWSERBASE_PROJECT_ID", description="Project ID for Browserbase services", required=False),
]
def __init__(
self,
api_key: Optional[str] = None,
project_id: Optional[str] = None,
text_content: Optional[bool] = False,
session_id: Optional[str] = None,
proxy: Optional[bool] = None,
**kwargs,
):
        super().__init__(**kwargs)
        # Prefer explicitly passed credentials over the environment defaults
        if api_key is not None:
            self.api_key = api_key
        if project_id is not None:
            self.project_id = project_id
        if not self.api_key:
            raise EnvironmentError(
                "BROWSERBASE_API_KEY environment variable is required for initialization"
            )
try:
from browserbase import Browserbase # type: ignore
except ImportError:
import click
if click.confirm(
"`browserbase` package not found, would you like to install it?"
):
import subprocess
subprocess.run(["uv", "add", "browserbase"], check=True)
from browserbase import Browserbase # type: ignore
else:
raise ImportError(
"`browserbase` package not found, please run `uv add browserbase`"
)
self.browserbase = Browserbase(api_key=self.api_key)
self.text_content = text_content
self.session_id = session_id
self.proxy = proxy
def _run(self, url: str):
return self.browserbase.load_url(
url, self.text_content, self.session_id, self.proxy
)

View File

@@ -0,0 +1,56 @@
# CodeDocsSearchTool
## Description
The CodeDocsSearchTool is a powerful RAG (Retrieval-Augmented Generation) tool designed for semantic searches within code documentation. It enables users to efficiently find specific information or topics within code documentation. By providing a `docs_url` during initialization, the tool narrows down the search to that particular documentation site. Alternatively, without a specific `docs_url`, it searches across a wide array of code documentation known or discovered throughout its execution, making it versatile for various documentation search needs.
## Installation
To start using the CodeDocsSearchTool, first, install the crewai_tools package via pip:
```shell
pip install 'crewai[tools]'
```
## Example
Utilize the CodeDocsSearchTool as follows to conduct searches within code documentation:
```python
from crewai_tools import CodeDocsSearchTool
# To search any code documentation content if the URL is known or discovered during its execution:
tool = CodeDocsSearchTool()
# OR
# To specifically focus your search on a given documentation site by providing its URL:
tool = CodeDocsSearchTool(docs_url='https://docs.example.com/reference')
```
Note: Substitute 'https://docs.example.com/reference' with the URL of your target documentation; the search query itself is supplied by the agent at run time.
## Arguments
- `docs_url`: Optional. Specifies the URL of the code documentation to be searched. Providing this during the tool's initialization focuses the search on the specified documentation content.
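When attached to an agent, the tool receives its `search_query` argument from the agent at run time. A minimal sketch, where the role, goal, and backstory are illustrative:
```python
from crewai import Agent
from crewai_tools import CodeDocsSearchTool

docs_tool = CodeDocsSearchTool(docs_url='https://docs.example.com/reference')

# Illustrative agent that grounds its answers in the docs site above
researcher = Agent(
    role="Documentation Researcher",
    goal="Answer questions using the product documentation",
    backstory="You ground every answer in the official code docs.",
    tools=[docs_tool],
)
```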
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = CodeDocsSearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
# stream=true,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,56 @@
from typing import Any, Optional, Type
try:
from embedchain.models.data_type import DataType
EMBEDCHAIN_AVAILABLE = True
except ImportError:
EMBEDCHAIN_AVAILABLE = False
from pydantic import BaseModel, Field
from ..rag.rag_tool import RagTool
class FixedCodeDocsSearchToolSchema(BaseModel):
"""Input for CodeDocsSearchTool."""
search_query: str = Field(
...,
description="Mandatory search query you want to use to search the Code Docs content",
)
class CodeDocsSearchToolSchema(FixedCodeDocsSearchToolSchema):
"""Input for CodeDocsSearchTool."""
docs_url: str = Field(..., description="Mandatory docs_url path you want to search")
class CodeDocsSearchTool(RagTool):
name: str = "Search a Code Docs content"
description: str = (
"A tool that can be used to semantic search a query from a Code Docs content."
)
args_schema: Type[BaseModel] = CodeDocsSearchToolSchema
def __init__(self, docs_url: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
if docs_url is not None:
self.add(docs_url)
self.description = f"A tool that can be used to semantic search a query the {docs_url} Code Docs content."
self.args_schema = FixedCodeDocsSearchToolSchema
self._generate_description()
def add(self, docs_url: str) -> None:
if not EMBEDCHAIN_AVAILABLE:
raise ImportError("embedchain is not installed. Please install it with `pip install crewai-tools[embedchain]`")
super().add(docs_url, data_type=DataType.DOCS_SITE)
def _run(
self,
search_query: str,
docs_url: Optional[str] = None,
) -> str:
if docs_url is not None:
self.add(docs_url)
return super()._run(query=search_query)

View File

@@ -0,0 +1,6 @@
FROM python:3.12-alpine
RUN pip install requests beautifulsoup4
# Set the working directory
WORKDIR /workspace

View File

@@ -0,0 +1,53 @@
# CodeInterpreterTool
## Description
This tool gives the Agent the ability to run Python 3 code that the Agent itself generates. The code is executed in a sandboxed environment (a Docker container when available), which makes running generated code much safer.
It is incredibly useful since it allows the Agent to generate code, run it in the same environment, get the result, and use it to make decisions.
## Requirements
- Docker
## Installation
Install the crewai_tools package
```shell
pip install 'crewai[tools]'
```
## Example
Remember that the code used with this tool must be generated by the Agent itself and must be valid Python 3. The first run will take some time because the Docker image needs to be built.
```python
from crewai_tools import CodeInterpreterTool
Agent(
...
tools=[CodeInterpreterTool()],
)
```
Or if you need to pass your own Dockerfile just do this
```python
from crewai_tools import CodeInterpreterTool
Agent(
...
tools=[CodeInterpreterTool(user_dockerfile_path="<Dockerfile_path>")],
)
```
If the Docker daemon cannot be reached automatically (especially common on macOS), you can set the Docker host manually:
```python
from crewai_tools import CodeInterpreterTool
Agent(
...
tools=[CodeInterpreterTool(user_docker_base_url="<Docker Host Base Url>",
user_dockerfile_path="<Dockerfile_path>")],
)
```
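The implementation also exposes an `unsafe_mode` flag that skips both Docker and the restricted sandbox and executes code directly on the host. Only enable it in trusted environments with code from trusted sources:
```python
from crewai_tools import CodeInterpreterTool

# WARNING: runs agent-generated code directly on the host machine
tool = CodeInterpreterTool(unsafe_mode=True)
```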

View File

@@ -0,0 +1,373 @@
"""Code Interpreter Tool for executing Python code in isolated environments.
This module provides a tool for executing Python code either in a Docker container for
safe isolation or directly in a restricted sandbox. It includes mechanisms for blocking
potentially unsafe operations and importing restricted modules.
"""
import importlib.util
import os
from types import ModuleType
from typing import Any, Dict, List, Optional, Type
from crewai.tools import BaseTool
from docker import DockerClient
from docker import from_env as docker_from_env
from docker.errors import ImageNotFound, NotFound
from docker.models.containers import Container
from pydantic import BaseModel, Field
from crewai_tools.printer import Printer
class CodeInterpreterSchema(BaseModel):
"""Schema for defining inputs to the CodeInterpreterTool.
This schema defines the required parameters for code execution,
including the code to run and any libraries that need to be installed.
"""
code: str = Field(
...,
description="Python3 code used to be interpreted in the Docker container. ALWAYS PRINT the final result and the output of the code",
)
libraries_used: List[str] = Field(
...,
description="List of libraries used in the code with proper installing names separated by commas. Example: numpy,pandas,beautifulsoup4",
)
class SandboxPython:
"""A restricted Python execution environment for running code safely.
This class provides methods to safely execute Python code by restricting access to
potentially dangerous modules and built-in functions. It creates a sandboxed
environment where harmful operations are blocked.
"""
BLOCKED_MODULES = {
"os",
"sys",
"subprocess",
"shutil",
"importlib",
"inspect",
"tempfile",
"sysconfig",
"builtins",
}
UNSAFE_BUILTINS = {
"exec",
"eval",
"open",
"compile",
"input",
"globals",
"locals",
"vars",
"help",
"dir",
}
@staticmethod
def restricted_import(
name: str,
custom_globals: Optional[Dict[str, Any]] = None,
custom_locals: Optional[Dict[str, Any]] = None,
fromlist: Optional[List[str]] = None,
level: int = 0,
) -> ModuleType:
"""A restricted import function that blocks importing of unsafe modules.
Args:
name: The name of the module to import.
custom_globals: Global namespace to use.
custom_locals: Local namespace to use.
fromlist: List of items to import from the module.
level: The level value passed to __import__.
Returns:
The imported module if allowed.
Raises:
ImportError: If the module is in the blocked modules list.
"""
if name in SandboxPython.BLOCKED_MODULES:
raise ImportError(f"Importing '{name}' is not allowed.")
return __import__(name, custom_globals, custom_locals, fromlist or (), level)
@staticmethod
def safe_builtins() -> Dict[str, Any]:
"""Creates a dictionary of built-in functions with unsafe ones removed.
Returns:
A dictionary of safe built-in functions and objects.
"""
import builtins
safe_builtins = {
k: v
for k, v in builtins.__dict__.items()
if k not in SandboxPython.UNSAFE_BUILTINS
}
safe_builtins["__import__"] = SandboxPython.restricted_import
return safe_builtins
@staticmethod
def exec(code: str, locals: Dict[str, Any]) -> None:
"""Executes Python code in a restricted environment.
Args:
code: The Python code to execute as a string.
locals: A dictionary that will be used for local variable storage.
"""
exec(code, {"__builtins__": SandboxPython.safe_builtins()}, locals)
class CodeInterpreterTool(BaseTool):
"""A tool for executing Python code in isolated environments.
This tool provides functionality to run Python code either in a Docker container
for safe isolation or directly in a restricted sandbox. It can handle installing
Python packages and executing arbitrary Python code.
"""
name: str = "Code Interpreter"
description: str = "Interprets Python3 code strings with a final print statement."
args_schema: Type[BaseModel] = CodeInterpreterSchema
default_image_tag: str = "code-interpreter:latest"
code: Optional[str] = None
user_dockerfile_path: Optional[str] = None
user_docker_base_url: Optional[str] = None
unsafe_mode: bool = False
@staticmethod
def _get_installed_package_path() -> str:
"""Gets the installation path of the crewai_tools package.
Returns:
The directory path where the package is installed.
"""
spec = importlib.util.find_spec("crewai_tools")
return os.path.dirname(spec.origin)
def _verify_docker_image(self) -> None:
"""Verifies if the Docker image is available or builds it if necessary.
Checks if the required Docker image exists. If not, builds it using either a
user-provided Dockerfile or the default one included with the package.
Raises:
FileNotFoundError: If the Dockerfile cannot be found.
"""
client = (
docker_from_env()
if self.user_docker_base_url is None
else DockerClient(base_url=self.user_docker_base_url)
)
try:
client.images.get(self.default_image_tag)
except ImageNotFound:
if self.user_dockerfile_path and os.path.exists(self.user_dockerfile_path):
dockerfile_path = self.user_dockerfile_path
else:
package_path = self._get_installed_package_path()
dockerfile_path = os.path.join(
package_path, "tools/code_interpreter_tool"
)
if not os.path.exists(dockerfile_path):
raise FileNotFoundError(
f"Dockerfile not found in {dockerfile_path}"
)
client.images.build(
path=dockerfile_path,
tag=self.default_image_tag,
rm=True,
)
def _run(self, **kwargs) -> str:
"""Runs the code interpreter tool with the provided arguments.
Args:
**kwargs: Keyword arguments that should include 'code' and 'libraries_used'.
Returns:
The output of the executed code as a string.
"""
code = kwargs.get("code", self.code)
libraries_used = kwargs.get("libraries_used", [])
if self.unsafe_mode:
return self.run_code_unsafe(code, libraries_used)
else:
return self.run_code_safety(code, libraries_used)
def _install_libraries(self, container: Container, libraries: List[str]) -> None:
"""Installs required Python libraries in the Docker container.
Args:
container: The Docker container where libraries will be installed.
libraries: A list of library names to install using pip.
"""
for library in libraries:
container.exec_run(["pip", "install", library])
def _init_docker_container(self) -> Container:
"""Initializes and returns a Docker container for code execution.
Stops and removes any existing container with the same name before creating
a new one. Maps the current working directory to /workspace in the container.
Returns:
A Docker container object ready for code execution.
"""
container_name = "code-interpreter"
client = docker_from_env()
current_path = os.getcwd()
# Check if the container is already running
try:
existing_container = client.containers.get(container_name)
existing_container.stop()
existing_container.remove()
except NotFound:
pass # Container does not exist, no need to remove
return client.containers.run(
self.default_image_tag,
detach=True,
tty=True,
working_dir="/workspace",
name=container_name,
volumes={current_path: {"bind": "/workspace", "mode": "rw"}}, # type: ignore
)
def _check_docker_available(self) -> bool:
"""Checks if Docker is available and running on the system.
Attempts to run the 'docker info' command to verify Docker availability.
Prints appropriate messages if Docker is not installed or not running.
Returns:
True if Docker is available and running, False otherwise.
"""
import subprocess
try:
subprocess.run(
["docker", "info"],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=1,
)
return True
except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
Printer.print(
"Docker is installed but not running or inaccessible.",
color="bold_purple",
)
return False
except FileNotFoundError:
Printer.print("Docker is not installed", color="bold_purple")
return False
def run_code_safety(self, code: str, libraries_used: List[str]) -> str:
"""Runs code in the safest available environment.
Attempts to run code in Docker if available, falls back to a restricted
sandbox if Docker is not available.
Args:
code: The Python code to execute as a string.
libraries_used: A list of Python library names to install before execution.
Returns:
The output of the executed code as a string.
"""
if self._check_docker_available():
return self.run_code_in_docker(code, libraries_used)
else:
return self.run_code_in_restricted_sandbox(code)
def run_code_in_docker(self, code: str, libraries_used: List[str]) -> str:
"""Runs Python code in a Docker container for safe isolation.
Creates a Docker container, installs the required libraries, executes the code,
and then cleans up by stopping and removing the container.
Args:
code: The Python code to execute as a string.
libraries_used: A list of Python library names to install before execution.
Returns:
The output of the executed code as a string, or an error message if execution failed.
"""
Printer.print("Running code in Docker environment", color="bold_blue")
self._verify_docker_image()
container = self._init_docker_container()
self._install_libraries(container, libraries_used)
exec_result = container.exec_run(["python3", "-c", code])
container.stop()
container.remove()
if exec_result.exit_code != 0:
return f"Something went wrong while running the code: \n{exec_result.output.decode('utf-8')}"
return exec_result.output.decode("utf-8")
def run_code_in_restricted_sandbox(self, code: str) -> str:
"""Runs Python code in a restricted sandbox environment.
Executes the code with restricted access to potentially dangerous modules and
built-in functions for basic safety when Docker is not available.
Args:
code: The Python code to execute as a string.
Returns:
The value of the 'result' variable from the executed code,
or an error message if execution failed.
"""
Printer.print("Running code in restricted sandbox", color="yellow")
exec_locals = {}
try:
SandboxPython.exec(code=code, locals=exec_locals)
return exec_locals.get("result", "No result variable found.")
except Exception as e:
return f"An error occurred: {str(e)}"
def run_code_unsafe(self, code: str, libraries_used: List[str]) -> str:
"""Runs code directly on the host machine without any safety restrictions.
WARNING: This mode is unsafe and should only be used in trusted environments
with code from trusted sources.
Args:
code: The Python code to execute as a string.
libraries_used: A list of Python library names to install before execution.
Returns:
The value of the 'result' variable from the executed code,
or an error message if execution failed.
"""
Printer.print("WARNING: Running code in unsafe mode", color="bold_magenta")
# Install libraries on the host machine
for library in libraries_used:
os.system(f"pip install {library}")
# Execute the code
try:
exec_locals = {}
exec(code, {}, exec_locals)
return exec_locals.get("result", "No result variable found.")
except Exception as e:
return f"An error occurred: {str(e)}"

View File

@@ -0,0 +1,72 @@
# ComposioTool Documentation
## Description
This tool is a wrapper around the Composio toolset and gives your agent access to a wide variety of tools from the Composio SDK.
## Installation
To incorporate this tool into your project, follow the installation instructions below:
```shell
pip install composio-core
pip install 'crewai[tools]'
```
After the installation is complete, either run `composio login` or export your Composio API key as `COMPOSIO_API_KEY`.
## Example
The following example demonstrates how to initialize the tool and execute a github action:
1. Initialize toolset
```python
from composio import Action, App
from crewai_tools import ComposioTool
from crewai import Agent, Task
tools = [ComposioTool.from_action(action=Action.GITHUB_ACTIVITY_STAR_REPO_FOR_AUTHENTICATED_USER)]
```
If you don't know what action you want to use, use `from_app` and `tags` filter to get relevant actions
```python
tools = ComposioTool.from_app(App.GITHUB, tags=["important"])
```
or use `use_case` to search relevant actions
```python
tools = ComposioTool.from_app(App.GITHUB, use_case="Star a github repository")
```
2. Define agent
```python
crewai_agent = Agent(
role="Github Agent",
goal="You take action on Github using Github APIs",
backstory=(
"You are AI agent that is responsible for taking actions on Github "
"on users behalf. You need to take action on Github using Github APIs"
),
verbose=True,
tools=tools,
)
```
3. Execute task
```python
task = Task(
description="Star a repo ComposioHQ/composio on GitHub",
agent=crewai_agent,
expected_output="if the star happened",
)
task.execute()
```
* More detailed list of tools can be found [here](https://app.composio.dev)

View File

@@ -0,0 +1,124 @@
"""
Composio tools wrapper.
"""
import typing as t
import typing_extensions as te
from crewai.tools import BaseTool, EnvVar
class ComposioTool(BaseTool):
"""Wrapper for composio tools."""
composio_action: t.Callable
env_vars: t.List[EnvVar] = [
EnvVar(name="COMPOSIO_API_KEY", description="API key for Composio services", required=True),
]
def _run(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
"""Run the composio action with given arguments."""
return self.composio_action(*args, **kwargs)
@staticmethod
def _check_connected_account(tool: t.Any, toolset: t.Any) -> None:
"""Check if connected account is required and if required it exists or not."""
from composio import Action
from composio.client.collections import ConnectedAccountModel
tool = t.cast(Action, tool)
if tool.no_auth:
return
connections = t.cast(
t.List[ConnectedAccountModel],
toolset.client.connected_accounts.get(),
)
if tool.app not in [connection.appUniqueId for connection in connections]:
raise RuntimeError(
f"No connected account found for app `{tool.app}`; "
f"Run `composio add {tool.app}` to fix this"
)
@classmethod
def from_action(
cls,
action: t.Any,
**kwargs: t.Any,
) -> te.Self:
"""Wrap a composio tool as crewAI tool."""
from composio import Action, ComposioToolSet
from composio.constants import DEFAULT_ENTITY_ID
from composio.utils.shared import json_schema_to_model
toolset = ComposioToolSet()
if not isinstance(action, Action):
action = Action(action)
action = t.cast(Action, action)
cls._check_connected_account(
tool=action,
toolset=toolset,
)
(action_schema,) = toolset.get_action_schemas(actions=[action])
schema = action_schema.model_dump(exclude_none=True)
entity_id = kwargs.pop("entity_id", DEFAULT_ENTITY_ID)
def function(**kwargs: t.Any) -> t.Dict:
"""Wrapper function for composio action."""
return toolset.execute_action(
action=Action(schema["name"]),
params=kwargs,
entity_id=entity_id,
)
function.__name__ = schema["name"]
function.__doc__ = schema["description"]
return cls(
name=schema["name"],
description=schema["description"],
args_schema=json_schema_to_model(
action_schema.parameters.model_dump(
exclude_none=True,
)
),
composio_action=function,
**kwargs,
)
@classmethod
def from_app(
cls,
*apps: t.Any,
tags: t.Optional[t.List[str]] = None,
use_case: t.Optional[str] = None,
**kwargs: t.Any,
) -> t.List[te.Self]:
"""Create toolset from an app."""
if len(apps) == 0:
raise ValueError("You need to provide at least one app name")
if use_case is None and tags is None:
raise ValueError("Both `use_case` and `tags` cannot be `None`")
if use_case is not None and tags is not None:
raise ValueError(
"Cannot use both `use_case` and `tags` to filter the actions"
)
from composio import ComposioToolSet
toolset = ComposioToolSet()
if use_case is not None:
return [
cls.from_action(action=action, **kwargs)
for action in toolset.find_actions_by_use_case(*apps, use_case=use_case)
]
return [
cls.from_action(action=action, **kwargs)
for action in toolset.find_actions_by_tags(*apps, tags=tags)
]

View File

@@ -0,0 +1,58 @@
# ContextualAICreateAgentTool
## Description
This tool integrates Contextual AI's enterprise-grade RAG agents with CrewAI, enabling you to create a new Contextual RAG agent. It uploads your documents to create a datastore and returns the Contextual agent ID and datastore ID.
## Installation
To incorporate this tool into your project, follow the installation instructions below:
```
pip install 'crewai[tools]' contextual-client
```
**Note**: You'll need a Contextual AI API key. Sign up at [app.contextual.ai](https://app.contextual.ai) to get your free API key.
## Example
```python
from crewai_tools import ContextualAICreateAgentTool
# Initialize the tool
tool = ContextualAICreateAgentTool(api_key="your_api_key_here")
# Create agent with documents
result = tool._run(
agent_name="Financial Analysis Agent",
agent_description="Agent for analyzing financial documents",
datastore_name="Financial Reports",
document_paths=["/path/to/report1.pdf", "/path/to/report2.pdf"],
)
print(result)
```
## Parameters
- `api_key`: Your Contextual AI API key
- `agent_name`: Name for the new agent
- `agent_description`: Description of the agent's purpose
- `datastore_name`: Name for the document datastore
- `document_paths`: List of file paths to upload
Example result:
```
Successfully created agent 'Research Analyst' with ID: {created_agent_ID} and datastore ID: {created_datastore_ID}. Uploaded 5 documents.
```
You can use `ContextualAIQueryTool` with the returned IDs to query the knowledge base and retrieve relevant information from your documents.
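For example, a minimal sketch of passing the returned IDs on to the query tool (both IDs below are placeholders copied from the create result):
```python
from crewai_tools import ContextualAIQueryTool

query_tool = ContextualAIQueryTool(api_key="your_api_key_here")
answer = query_tool._run(
    query="Summarize the key revenue trends.",
    agent_id="created_agent_ID",          # placeholder from the create result
    datastore_id="created_datastore_ID",  # placeholder from the create result
)
print(answer)
```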
## Key Features
- **Complete Pipeline Setup**: Creates datastore, uploads documents, and configures agent in one operation
- **Document Processing**: Leverages Contextual AI's powerful parser to ingest complex PDFs and documents
- **Vector Storage**: Use Contextual AI's datastore for large document collections
## Use Cases
- Set up new RAG agents from scratch with complete automation
- Upload and organize document collections into structured datastores
- Create specialized domain agents for legal, financial, technical, or research workflows
For more detailed information about Contextual AI's capabilities, visit the [official documentation](https://docs.contextual.ai).

View File

@@ -0,0 +1,71 @@
from typing import Any, Optional, Type, List
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
import os
class ContextualAICreateAgentSchema(BaseModel):
"""Schema for contextual create agent tool."""
agent_name: str = Field(..., description="Name for the new agent")
agent_description: str = Field(..., description="Description for the new agent")
datastore_name: str = Field(..., description="Name for the new datastore")
document_paths: List[str] = Field(..., description="List of file paths to upload")
class ContextualAICreateAgentTool(BaseTool):
"""Tool to create Contextual AI RAG agents with documents."""
name: str = "Contextual AI Create Agent Tool"
description: str = "Create a new Contextual AI RAG agent with documents and datastore"
args_schema: Type[BaseModel] = ContextualAICreateAgentSchema
api_key: str
contextual_client: Any = None
package_dependencies: List[str] = ["contextual-client"]
def __init__(self, **kwargs):
super().__init__(**kwargs)
try:
from contextual import ContextualAI
self.contextual_client = ContextualAI(api_key=self.api_key)
except ImportError:
raise ImportError(
"contextual-client package is required. Install it with: pip install contextual-client"
)
def _run(
self,
agent_name: str,
agent_description: str,
datastore_name: str,
document_paths: List[str]
) -> str:
"""Create a complete RAG pipeline with documents."""
try:
import os
# Create datastore
datastore = self.contextual_client.datastores.create(name=datastore_name)
datastore_id = datastore.id
# Upload documents
document_ids = []
for doc_path in document_paths:
if not os.path.exists(doc_path):
raise FileNotFoundError(f"Document not found: {doc_path}")
with open(doc_path, 'rb') as f:
ingestion_result = self.contextual_client.datastores.documents.ingest(datastore_id, file=f)
document_ids.append(ingestion_result.id)
# Create agent
agent = self.contextual_client.agents.create(
name=agent_name,
description=agent_description,
datastore_ids=[datastore_id]
)
return f"Successfully created agent '{agent_name}' with ID: {agent.id} and datastore ID: {datastore_id}. Uploaded {len(document_ids)} documents."
except Exception as e:
return f"Failed to create agent with documents: {str(e)}"

View File

@@ -0,0 +1,68 @@
# ContextualAIParseTool
## Description
This tool is designed to integrate Contextual AI's enterprise-grade document parsing capabilities with CrewAI, enabling you to leverage advanced AI-powered document understanding for complex layouts, tables, and figures. Use this tool to extract structured content from your documents using Contextual AI's powerful document parser.
## Installation
To incorporate this tool into your project, follow the installation instructions below:
```
pip install 'crewai[tools]' contextual-client
```
**Note**: You'll need a Contextual AI API key. Sign up at [app.contextual.ai](https://app.contextual.ai) to get your free API key.
## Example
```python
from crewai_tools import ContextualAIParseTool
tool = ContextualAIParseTool(api_key="your_api_key_here")
result = tool._run(
file_path="/path/to/document.pdf",
parse_mode="standard",
page_range="0-5",
output_types=["markdown-per-page"]
)
print(result)
```
The result will show the parsed contents of your document. For example:
```
{
"file_name": "attention_is_all_you_need.pdf",
"status": "completed",
"pages": [
{
"index": 0,
"markdown": "Provided proper attribution ...
},
{
"index": 1,
"markdown": "## 1 Introduction ...
},
...
]
}
```
## Parameters
- `api_key`: Your Contextual AI API key
- `file_path`: Path to document to parse
- `parse_mode`: Parsing mode (default: "standard")
- `figure_caption_mode`: Figure caption handling (default: "concise")
- `enable_document_hierarchy`: Enable hierarchy detection (default: True)
- `page_range`: Pages to parse (e.g., "0-5", None for all)
- `output_types`: Output formats (default: ["markdown-per-page"])
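Because the tool returns the parse results as a JSON string, a short post-processing sketch (field names follow the sample output shown above, and `result` is the value returned in the earlier example):
```python
import json

result_json = json.loads(result)  # `result` from the example above
for page in result_json.get("pages", []):
    print(f"--- page {page['index']} ---")
    print(page["markdown"][:200])  # first 200 characters of each page
```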
## Key Features
- **Advanced Document Understanding**: Handles complex PDF layouts, tables, and multi-column documents
- **Figure and Table Extraction**: Intelligent extraction of figures, charts, and tabular data
- **Page Range Selection**: Parse specific pages or entire documents
## Use Cases
- Extract structured content from complex PDFs and research papers
- Parse financial reports, legal documents, and technical manuals
- Convert documents to markdown for further processing in RAG pipelines
For more detailed information about Contextual AI's capabilities, visit the [official documentation](https://docs.contextual.ai).

View File

@@ -0,0 +1,92 @@
from typing import Any, Optional, Type, List
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class ContextualAIParseSchema(BaseModel):
"""Schema for contextual parse tool."""
file_path: str = Field(..., description="Path to the document to parse")
parse_mode: str = Field(default="standard", description="Parsing mode")
figure_caption_mode: str = Field(default="concise", description="Figure caption mode")
enable_document_hierarchy: bool = Field(default=True, description="Enable document hierarchy")
page_range: Optional[str] = Field(default=None, description="Page range to parse (e.g., '0-5')")
output_types: List[str] = Field(default=["markdown-per-page"], description="List of output types")
class ContextualAIParseTool(BaseTool):
"""Tool to parse documents using Contextual AI's parser."""
name: str = "Contextual AI Document Parser"
description: str = "Parse documents using Contextual AI's advanced document parser"
args_schema: Type[BaseModel] = ContextualAIParseSchema
api_key: str
package_dependencies: List[str] = ["contextual-client"]
def _run(
self,
file_path: str,
parse_mode: str = "standard",
figure_caption_mode: str = "concise",
enable_document_hierarchy: bool = True,
page_range: Optional[str] = None,
output_types: List[str] = ["markdown-per-page"]
) -> str:
"""Parse a document using Contextual AI's parser."""
try:
import requests
import json
import os
from time import sleep
if not os.path.exists(file_path):
raise FileNotFoundError(f"Document not found: {file_path}")
base_url = "https://api.contextual.ai/v1"
headers = {
"accept": "application/json",
"authorization": f"Bearer {self.api_key}"
}
# Submit parse job
url = f"{base_url}/parse"
config = {
"parse_mode": parse_mode,
"figure_caption_mode": figure_caption_mode,
"enable_document_hierarchy": enable_document_hierarchy,
}
if page_range:
config["page_range"] = page_range
with open(file_path, "rb") as fp:
file = {"raw_file": fp}
result = requests.post(url, headers=headers, data=config, files=file)
response = json.loads(result.text)
job_id = response['job_id']
# Monitor job status
status_url = f"{base_url}/parse/jobs/{job_id}/status"
while True:
result = requests.get(status_url, headers=headers)
parse_response = json.loads(result.text)['status']
if parse_response == "completed":
break
elif parse_response == "failed":
raise RuntimeError("Document parsing failed")
sleep(5)
# Get parse results
results_url = f"{base_url}/parse/jobs/{job_id}/results"
result = requests.get(
results_url,
headers=headers,
params={"output_types": ",".join(output_types)},
)
return json.dumps(json.loads(result.text), indent=2)
except Exception as e:
return f"Failed to parse document: {str(e)}"

View File

@@ -0,0 +1,54 @@
# ContextualAIQueryTool
## Description
This tool is designed to integrate Contextual AI's enterprise-grade RAG agents with CrewAI. Run this tool to query existing Contextual AI RAG agents that have been pre-configured with documents and knowledge bases.
## Installation
To incorporate this tool into your project, follow the installation instructions below:
```shell
pip install 'crewai[tools]' contextual-client
```
**Note**: You'll need a Contextual AI API key. Sign up at [app.contextual.ai](https://app.contextual.ai) to get your free API key.
## Example
Make sure you have already created a Contextual agent and ingested documents into the datastore before using this tool.
```python
from crewai_tools import ContextualAIQueryTool
# Initialize the tool
tool = ContextualAIQueryTool(api_key="your_api_key_here")
# Query the agent with IDs
result = tool._run(
query="What are the key findings in the financial report?",
agent_id="your_agent_id_here",
datastore_id="your_datastore_id_here" # Optional: for document readiness checking
)
print(result)
```
The result will contain the generated answer to the user's query.
## Parameters
**Initialization:**
- `api_key`: Your Contextual AI API key
**Query (_run method):**
- `query`: The question or query to send to the agent
- `agent_id`: ID of the existing Contextual AI agent to query (required)
- `datastore_id`: Optional datastore ID for document readiness verification (if not provided, document status checking is disabled with a warning)
## Key Features
- **Document Readiness Checking**: Automatically waits for documents to be processed before querying
- **Grounded Responses**: Built-in grounding ensures factual, source-attributed answers
## Use Cases
- Query pre-configured RAG agents with document collections
- Access enterprise knowledge bases through user queries
- Build specialized domain experts with access to curated documents
For more detailed information about Contextual AI's capabilities, visit the [official documentation](https://docs.contextual.ai).

View File

@@ -0,0 +1,99 @@
from typing import Any, Optional, Type, List
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
import asyncio
import requests
import os
class ContextualAIQuerySchema(BaseModel):
"""Schema for contextual query tool."""
query: str = Field(..., description="Query to send to the Contextual AI agent.")
agent_id: str = Field(..., description="ID of the Contextual AI agent to query")
datastore_id: Optional[str] = Field(None, description="Optional datastore ID for document readiness verification")
class ContextualAIQueryTool(BaseTool):
"""Tool to query Contextual AI RAG agents."""
name: str = "Contextual AI Query Tool"
description: str = "Use this tool to query a Contextual AI RAG agent with access to your documents"
args_schema: Type[BaseModel] = ContextualAIQuerySchema
api_key: str
contextual_client: Any = None
package_dependencies: List[str] = ["contextual-client"]
def __init__(self, **kwargs):
super().__init__(**kwargs)
try:
from contextual import ContextualAI
self.contextual_client = ContextualAI(api_key=self.api_key)
except ImportError:
raise ImportError(
"contextual-client package is required. Install it with: pip install contextual-client"
)
def _check_documents_ready(self, datastore_id: str) -> bool:
"""Synchronous check if all documents are ready."""
url = f"https://api.contextual.ai/v1/datastores/{datastore_id}/documents"
headers = {"Authorization": f"Bearer {self.api_key}"}
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
documents = data.get('documents', [])
return not any(doc.get('status') in ('processing', 'pending') for doc in documents)
return True
async def _wait_for_documents_async(self, datastore_id: str, max_attempts: int = 20, interval: float = 30.0) -> bool:
"""Asynchronously poll until documents are ready, exiting early if possible."""
for attempt in range(max_attempts):
ready = await asyncio.to_thread(self._check_documents_ready, datastore_id)
if ready:
return True
await asyncio.sleep(interval)
print("Processing documents ...")
return True # give up but don't fail hard
def _run(self, query: str, agent_id: str, datastore_id: Optional[str] = None) -> str:
if not agent_id:
raise ValueError("Agent ID is required to query the Contextual AI agent")
if datastore_id:
ready = self._check_documents_ready(datastore_id)
if not ready:
                try:
                    loop = asyncio.get_running_loop()
                except RuntimeError:
                    # No running event loop, so asyncio.run can be used below
                    loop = None
if loop and loop.is_running():
# Already inside an event loop
try:
import nest_asyncio
nest_asyncio.apply(loop)
loop.run_until_complete(self._wait_for_documents_async(datastore_id))
except Exception as e:
print(f"Failed to apply nest_asyncio: {str(e)}")
else:
asyncio.run(self._wait_for_documents_async(datastore_id))
else:
print("Warning: No datastore_id provided. Document status checking disabled.")
try:
response = self.contextual_client.agents.query.create(
agent_id=agent_id,
messages=[{"role": "user", "content": query}]
)
if hasattr(response, 'content'):
return response.content
elif hasattr(response, 'message'):
return response.message.content if hasattr(response.message, 'content') else str(response.message)
elif hasattr(response, 'messages') and len(response.messages) > 0:
last_message = response.messages[-1]
return last_message.content if hasattr(last_message, 'content') else str(last_message)
else:
return str(response)
except Exception as e:
return f"Error querying Contextual AI agent: {str(e)}"

View File

@@ -0,0 +1,72 @@
# ContextualAIRerankTool
## Description
This tool is designed to integrate Contextual AI's enterprise-grade instruction-following reranker with CrewAI, enabling you to intelligently reorder documents based on relevance and custom criteria. Use this tool to enhance search result quality and document retrieval for RAG systems using Contextual AI's reranking models that understand context and follow specific instructions for optimal document ordering.
## Installation
To incorporate this tool into your project, follow the installation instructions below:
```shell
pip install 'crewai[tools]' contextual-client
```
**Note**: You'll need a Contextual AI API key. Sign up at [app.contextual.ai](https://app.contextual.ai) to get your free API key.
## Example
```python
from crewai_tools import ContextualAIRerankTool
tool = ContextualAIRerankTool(api_key="your_api_key_here")
result = tool._run(
query="financial performance and revenue metrics",
documents=[
"Q1 report content with revenue data",
"Q2 report content with growth metrics",
"News article about market trends"
],
instruction="Prioritize documents with specific financial metrics and quantitative data"
)
print(result)
```
The result will contain the document ranking. For example:
```
Rerank Result:
{
"results": [
{
"index": 1,
"relevance_score": 0.88227631
},
{
"index": 0,
"relevance_score": 0.61159354
},
{
"index": 2,
"relevance_score": 0.28579462
}
]
}
```
## Parameters
- `api_key`: Your Contextual AI API key
- `query`: Search query for reranking
- `documents`: List of document texts to rerank
- `instruction`: Optional reranking instruction for custom criteria
- `metadata`: Optional metadata for each document
- `model`: Reranker model (default: "ctxl-rerank-en-v1-instruct")
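Note that `metadata`, when provided, must contain one entry per document, in the same order. A minimal sketch combining metadata with a custom instruction (all values are illustrative):
```python
result = tool._run(
    query="financial performance and revenue metrics",
    documents=[
        "Q1 report content with revenue data",
        "Q2 report content with growth metrics",
    ],
    # One metadata string per document, same order as `documents`
    metadata=[
        "source: investor relations, date: 2024-04-01",
        "source: investor relations, date: 2024-07-01",
    ],
    instruction="Prefer the most recent filing",
)
print(result)
```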
## Key Features
- **Instruction-Following Reranking**: Follows custom instructions for domain-specific document ordering
- **Metadata Integration**: Incorporates document metadata for enhanced ranking decisions
## Use Cases
- Improve search result relevance in document collections
- Reorder documents by custom business criteria (recency, authority, relevance)
- Filter and prioritize documents for research and analysis workflows
For more detailed information about Contextual AI's capabilities, visit the [official documentation](https://docs.contextual.ai).

View File

@@ -0,0 +1,68 @@
from typing import Any, Optional, Type, List
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class ContextualAIRerankSchema(BaseModel):
"""Schema for contextual rerank tool."""
query: str = Field(..., description="The search query to rerank documents against")
documents: List[str] = Field(..., description="List of document texts to rerank")
instruction: Optional[str] = Field(default=None, description="Optional instruction for reranking behavior")
metadata: Optional[List[str]] = Field(default=None, description="Optional metadata for each document")
model: str = Field(default="ctxl-rerank-en-v1-instruct", description="Reranker model to use")
class ContextualAIRerankTool(BaseTool):
"""Tool to rerank documents using Contextual AI's instruction-following reranker."""
name: str = "Contextual AI Document Reranker"
description: str = "Rerank documents using Contextual AI's instruction-following reranker"
args_schema: Type[BaseModel] = ContextualAIRerankSchema
api_key: str
package_dependencies: List[str] = ["contextual-client"]
def _run(
self,
query: str,
documents: List[str],
instruction: Optional[str] = None,
metadata: Optional[List[str]] = None,
model: str = "ctxl-rerank-en-v1-instruct"
) -> str:
"""Rerank documents using Contextual AI's instruction-following reranker."""
try:
import requests
import json
base_url = "https://api.contextual.ai/v1"
headers = {
"accept": "application/json",
"content-type": "application/json",
"authorization": f"Bearer {self.api_key}"
}
payload = {
"query": query,
"documents": documents,
"model": model
}
if instruction:
payload["instruction"] = instruction
if metadata:
if len(metadata) != len(documents):
raise ValueError("Metadata list must have the same length as documents list")
payload["metadata"] = metadata
rerank_url = f"{base_url}/rerank"
result = requests.post(rerank_url, json=payload, headers=headers)
if result.status_code != 200:
raise RuntimeError(f"Reranker API returned status {result.status_code}: {result.text}")
return json.dumps(result.json(), indent=2)
except Exception as e:
return f"Failed to rerank documents: {str(e)}"

View File

@@ -0,0 +1,62 @@
# CouchbaseFTSVectorSearchTool
## Description
Couchbase is a NoSQL database with vector search capabilities. Users can store and query vector embeddings. You can learn more about Couchbase vector search here: https://docs.couchbase.com/cloud/vector-search/vector-search.html
This tool is specifically crafted for performing semantic search using Couchbase. Use this tool to find semantically similar docs to a given query.
## Installation
Install the crewai_tools package by executing the following command in your terminal:
```shell
uv pip install 'crewai[tools]'
```
## Setup
Before instantiating the tool, you need a Couchbase cluster.
- Create a cluster on [Couchbase Capella](https://docs.couchbase.com/cloud/get-started/create-account.html), Couchbase's cloud database solution.
- Create a [local Couchbase server](https://docs.couchbase.com/server/current/getting-started/start-here.html).
You will need to create a bucket, scope and collection on the cluster. Then, [follow this guide](https://docs.couchbase.com/python-sdk/current/hello-world/start-using-sdk.html) to create a Couchbase Cluster object and load documents into your collection.
Follow the docs below to create a vector search index on Couchbase.
- [Create a vector search index on Couchbase Capella.](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html)
- [Create a vector search index on your local Couchbase server.](https://docs.couchbase.com/server/current/vector-search/create-vector-search-index-ui.html)
Ensure that the `Dimension` field in the index matches the embedding model. For example, OpenAI's `text-embedding-3-small` model has an embedding dimension of 1536 dimensions, and so the `Dimension` field must be 1536 in the index.
## Example
To utilize the CouchbaseFTSVectorSearchTool for different use cases, follow these examples:
```python
from crewai_tools import CouchbaseFTSVectorSearchTool
# Instantiate a Couchbase Cluster object from the Couchbase SDK
tool = CouchbaseFTSVectorSearchTool(
cluster=cluster,
collection_name="collection",
scope_name="scope",
bucket_name="bucket",
index_name="index",
embedding_function=embed_fn
)
# Adding the tool to an agent
rag_agent = Agent(
name="rag_agent",
role="You are a helpful assistant that can answer questions with the help of the CouchbaseFTSVectorSearchTool.",
llm="gpt-4o-mini",
tools=[tool],
)
```
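The example above assumes an existing `cluster` object and `embed_fn` function. A minimal sketch of constructing both, assuming the Couchbase Python SDK and an OpenAI embedding model (the connection string and credentials are placeholders):
```python
from datetime import timedelta

from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions
from openai import OpenAI

# Placeholder credentials and connection string
auth = PasswordAuthenticator("username", "password")
cluster = Cluster("couchbase://localhost", ClusterOptions(auth))
cluster.wait_until_ready(timedelta(seconds=5))

client = OpenAI()  # uses OPENAI_API_KEY from the environment

def embed_fn(text: str) -> list[float]:
    # text-embedding-3-small produces 1536-dimensional vectors, matching
    # the `Dimension` field mentioned in the setup section above
    response = client.embeddings.create(model="text-embedding-3-small", input=text)
    return response.data[0].embedding
```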
## Arguments
- `cluster`: An initialized Couchbase `Cluster` instance.
- `bucket_name`: The name of the Couchbase bucket.
- `scope_name`: The name of the scope within the bucket.
- `collection_name`: The name of the collection within the scope.
- `index_name`: The name of the search index (vector index).
- `embedding_function`: A function that takes a string and returns its embedding (list of floats).
- `embedding_key`: Name of the field in the search index storing the vector. (Optional, defaults to 'embedding')
- `scoped_index`: Whether the index is scoped (True) or cluster-level (False). (Optional, defaults to True)
- `limit`: The maximum number of search results to return. (Optional, defaults to 3)

View File

@@ -0,0 +1,241 @@
import json
import os
from typing import Any, Optional, Type, List, Dict, Callable
try:
import couchbase.search as search
from couchbase.cluster import Cluster
from couchbase.options import SearchOptions
from couchbase.vector_search import VectorQuery, VectorSearch
COUCHBASE_AVAILABLE = True
except ImportError:
COUCHBASE_AVAILABLE = False
search = Any
Cluster = Any
SearchOptions = Any
VectorQuery = Any
VectorSearch = Any
from crewai.tools import BaseTool
from pydantic import BaseModel, Field, SkipValidation
class CouchbaseToolSchema(BaseModel):
"""Input for CouchbaseTool."""
query: str = Field(
...,
description="The query to search retrieve relevant information from the Couchbase database. Pass only the query, not the question.",
)
class CouchbaseFTSVectorSearchTool(BaseTool):
"""Tool to search the Couchbase database"""
model_config = {"arbitrary_types_allowed": True}
name: str = "CouchbaseFTSVectorSearchTool"
description: str = "A tool to search the Couchbase database for relevant information on internal documents."
args_schema: Type[BaseModel] = CouchbaseToolSchema
cluster: SkipValidation[Optional[Cluster]] = None
collection_name: Optional[str] = None
scope_name: Optional[str] = None
bucket_name: Optional[str] = None
index_name: Optional[str] = None
embedding_key: Optional[str] = Field(
default="embedding",
description="Name of the field in the search index that stores the vector"
)
scoped_index: Optional[bool] = Field(
default=True,
description="Whether the index is scoped. Defaults to True."
)
limit: Optional[int] = Field(default=3)
embedding_function: SkipValidation[Callable[[str], List[float]]] = Field(
default=None,
description="A function that takes a string and returns a list of floats. This is used to embed the query before searching the database."
)
def _check_bucket_exists(self) -> bool:
"""Check if the bucket exists in the linked Couchbase cluster"""
bucket_manager = self.cluster.buckets()
try:
bucket_manager.get_bucket(self.bucket_name)
return True
except Exception:
return False
def _check_scope_and_collection_exists(self) -> bool:
"""Check if the scope and collection exists in the linked Couchbase bucket
Raises a ValueError if either is not found"""
scope_collection_map: Dict[str, Any] = {}
# Get a list of all scopes in the bucket
for scope in self._bucket.collections().get_all_scopes():
scope_collection_map[scope.name] = []
# Get a list of all the collections in the scope
for collection in scope.collections:
scope_collection_map[scope.name].append(collection.name)
# Check if the scope exists
if self.scope_name not in scope_collection_map.keys():
raise ValueError(
f"Scope {self.scope_name} not found in Couchbase "
f"bucket {self.bucket_name}"
)
# Check if the collection exists in the scope
if self.collection_name not in scope_collection_map[self.scope_name]:
raise ValueError(
f"Collection {self.collection_name} not found in scope "
f"{self.scope_name} in Couchbase bucket {self.bucket_name}"
)
return True
def _check_index_exists(self) -> bool:
"""Check if the Search index exists in the linked Couchbase cluster
Raises a ValueError if the index does not exist"""
if self.scoped_index:
all_indexes = [
index.name for index in self._scope.search_indexes().get_all_indexes()
]
if self.index_name not in all_indexes:
raise ValueError(
f"Index {self.index_name} does not exist. "
" Please create the index before searching."
)
else:
all_indexes = [
index.name for index in self.cluster.search_indexes().get_all_indexes()
]
if self.index_name not in all_indexes:
raise ValueError(
f"Index {self.index_name} does not exist. "
" Please create the index before searching."
)
return True
def __init__(self, **kwargs):
"""Initialize the CouchbaseFTSVectorSearchTool.
Args:
**kwargs: Keyword arguments to pass to the BaseTool constructor and
to configure the Couchbase connection and search parameters.
Requires 'cluster', 'bucket_name', 'scope_name',
'collection_name', 'index_name', and 'embedding_function'.
Raises:
ValueError: If required parameters are missing, the Couchbase cluster
cannot be reached, or the specified bucket, scope,
collection, or index does not exist.
"""
super().__init__(**kwargs)
if COUCHBASE_AVAILABLE:
try:
if not self.cluster:
raise ValueError("Cluster instance must be provided")
if not self.bucket_name:
raise ValueError("Bucket name must be provided")
if not self.scope_name:
raise ValueError("Scope name must be provided")
if not self.collection_name:
raise ValueError("Collection name must be provided")
if not self.index_name:
raise ValueError("Index name must be provided")
if not self.embedding_function:
raise ValueError("Embedding function must be provided")
self._bucket = self.cluster.bucket(self.bucket_name)
self._scope = self._bucket.scope(self.scope_name)
self._collection = self._scope.collection(self.collection_name)
except Exception as e:
raise ValueError(
"Error connecting to couchbase. "
"Please check the connection and credentials"
) from e
# check if bucket exists
if not self._check_bucket_exists():
raise ValueError(
f"Bucket {self.bucket_name} does not exist. "
" Please create the bucket before searching."
)
self._check_scope_and_collection_exists()
self._check_index_exists()
else:
import click
if click.confirm(
"The 'couchbase' package is required to use the CouchbaseFTSVectorSearchTool. "
"Would you like to install it?"
):
import subprocess
subprocess.run(["uv", "add", "couchbase"], check=True)
else:
raise ImportError(
"The 'couchbase' package is required to use the CouchbaseFTSVectorSearchTool. "
"Please install it with: uv add couchbase"
)
def _run(self, query: str) -> str:
"""Execute a vector search query against the Couchbase index.
Args:
query: The search query string.
Returns:
A JSON string containing the search results.
Raises:
ValueError: If the search query fails or returns results without fields.
"""
query_embedding = self.embedding_function(query)
fields = ["*"]
search_req = search.SearchRequest.create(
VectorSearch.from_vector_query(
VectorQuery(
self.embedding_key,
query_embedding,
self.limit
)
)
)
try:
if self.scoped_index:
search_iter = self._scope.search(
self.index_name,
search_req,
SearchOptions(
limit=self.limit,
fields=fields,
)
)
else:
search_iter = self.cluster.search(
self.index_name,
search_req,
SearchOptions(
limit=self.limit,
fields=fields
)
)
json_response = []
for row in search_iter.rows():
json_response.append(row.fields)
except Exception as e:
return f"Search failed with error: {e}"
return json.dumps(json_response, indent=2)

View File

@@ -0,0 +1,88 @@
"""
Crewai Enterprise Tools
"""
import os
import typing as t
import logging
import json
from crewai.tools import BaseTool
from crewai_tools.adapters.enterprise_adapter import EnterpriseActionKitToolAdapter
from crewai_tools.adapters.tool_collection import ToolCollection
logger = logging.getLogger(__name__)
def CrewaiEnterpriseTools(
enterprise_token: t.Optional[str] = None,
actions_list: t.Optional[t.List[str]] = None,
enterprise_action_kit_project_id: t.Optional[str] = None,
enterprise_action_kit_project_url: t.Optional[str] = None,
) -> ToolCollection[BaseTool]:
"""Factory function that returns crewai enterprise tools.
Args:
enterprise_token: The token for accessing enterprise actions.
If not provided, will try to use CREWAI_ENTERPRISE_TOOLS_TOKEN env var.
actions_list: Optional list of specific tool names to include.
If provided, only tools with these names will be returned.
enterprise_action_kit_project_id: Optional ID of the Enterprise Action Kit project.
enterprise_action_kit_project_url: Optional URL of the Enterprise Action Kit project.
Returns:
A ToolCollection of BaseTool instances for enterprise actions
"""
import warnings
warnings.warn(
"CrewaiEnterpriseTools will be removed in v1.0.0. Considering use `Agent(apps=[...])` instead.",
DeprecationWarning,
stacklevel=2
)
if enterprise_token is None or enterprise_token == "":
enterprise_token = os.environ.get("CREWAI_ENTERPRISE_TOOLS_TOKEN")
if not enterprise_token:
logger.warning("No enterprise token provided")
adapter_kwargs = {"enterprise_action_token": enterprise_token}
if enterprise_action_kit_project_id is not None:
adapter_kwargs["enterprise_action_kit_project_id"] = (
enterprise_action_kit_project_id
)
if enterprise_action_kit_project_url is not None:
adapter_kwargs["enterprise_action_kit_project_url"] = (
enterprise_action_kit_project_url
)
adapter = EnterpriseActionKitToolAdapter(**adapter_kwargs)
all_tools = adapter.tools()
parsed_actions_list = _parse_actions_list(actions_list)
# Filter tools based on the provided list
return ToolCollection(all_tools).filter_by_names(parsed_actions_list)
# ENTERPRISE INJECTION ONLY
def _parse_actions_list(actions_list: t.Optional[t.List[str]]) -> t.List[str] | None:
"""Parse a string representation of a list of tool names to a list of tool names.
Args:
actions_list: A string representation of a list of tool names.
Returns:
A list of tool names.
"""
if actions_list is not None:
return actions_list
actions_list_from_env = os.environ.get("CREWAI_ENTERPRISE_TOOLS_ACTIONS_LIST")
if actions_list_from_env is None:
return None
try:
return json.loads(actions_list_from_env)
except json.JSONDecodeError:
logger.warning(f"Failed to parse actions_list as JSON: {actions_list_from_env}")
return None

View File

@@ -0,0 +1,16 @@
"""CrewAI Platform Tools
This module provides tools for integrating with various platform applications
through the CrewAI platform API.
"""
from crewai_tools.tools.crewai_platform_tools.crewai_platform_tools import CrewaiPlatformTools
from crewai_tools.tools.crewai_platform_tools.crewai_platform_action_tool import CrewAIPlatformActionTool
from crewai_tools.tools.crewai_platform_tools.crewai_platform_tool_builder import CrewaiPlatformToolBuilder
__all__ = [
"CrewaiPlatformTools",
"CrewAIPlatformActionTool",
"CrewaiPlatformToolBuilder",
]

View File

@@ -0,0 +1,233 @@
"""
Crewai Enterprise Tools
"""
import re
import json
import requests
from typing import Dict, Any, List, Type, Optional, Union, get_origin, cast, Literal
from pydantic import Field, create_model
from crewai.tools import BaseTool
from crewai_tools.tools.crewai_platform_tools.misc import get_platform_api_base_url, get_platform_integration_token
class CrewAIPlatformActionTool(BaseTool):
action_name: str = Field(default="", description="The name of the action")
action_schema: Dict[str, Any] = Field(
default_factory=dict, description="The schema of the action"
)
def __init__(
self,
description: str,
action_name: str,
action_schema: Dict[str, Any],
):
self._model_registry = {}
self._base_name = self._sanitize_name(action_name)
schema_props, required = self._extract_schema_info(action_schema)
field_definitions = {}
for param_name, param_details in schema_props.items():
param_desc = param_details.get("description", "")
is_required = param_name in required
try:
field_type = self._process_schema_type(
param_details, self._sanitize_name(param_name).title()
)
except Exception as e:
field_type = str
field_definitions[param_name] = self._create_field_definition(
field_type, is_required, param_desc
)
if field_definitions:
try:
args_schema = create_model(
f"{self._base_name}Schema", **field_definitions
)
except Exception as e:
print(f"Warning: Could not create main schema model: {e}")
args_schema = create_model(
f"{self._base_name}Schema",
input_text=(str, Field(description="Input for the action")),
)
else:
args_schema = create_model(
f"{self._base_name}Schema",
input_text=(str, Field(description="Input for the action")),
)
super().__init__(name=action_name.lower().replace(" ", "_"), description=description, args_schema=args_schema)
self.action_name = action_name
self.action_schema = action_schema
def _sanitize_name(self, name: str) -> str:
name = name.lower().replace(" ", "_")
sanitized = re.sub(r"[^a-zA-Z0-9_]", "", name)
parts = sanitized.split("_")
return "".join(word.capitalize() for word in parts if word)
def _extract_schema_info(
self, action_schema: Dict[str, Any]
) -> tuple[Dict[str, Any], List[str]]:
schema_props = (
action_schema.get("function", {})
.get("parameters", {})
.get("properties", {})
)
required = (
action_schema.get("function", {}).get("parameters", {}).get("required", [])
)
return schema_props, required
def _process_schema_type(self, schema: Dict[str, Any], type_name: str) -> Type[Any]:
if "anyOf" in schema:
any_of_types = schema["anyOf"]
is_nullable = any(t.get("type") == "null" for t in any_of_types)
non_null_types = [t for t in any_of_types if t.get("type") != "null"]
if non_null_types:
base_type = self._process_schema_type(non_null_types[0], type_name)
return Optional[base_type] if is_nullable else base_type
return cast(Type[Any], Optional[str])
if "oneOf" in schema:
return self._process_schema_type(schema["oneOf"][0], type_name)
if "allOf" in schema:
return self._process_schema_type(schema["allOf"][0], type_name)
json_type = schema.get("type", "string")
if "enum" in schema:
enum_values = schema["enum"]
if not enum_values:
return self._map_json_type_to_python(json_type)
return Literal[tuple(enum_values)]
if json_type == "array":
items_schema = schema.get("items", {"type": "string"})
item_type = self._process_schema_type(items_schema, f"{type_name}Item")
return List[item_type]
if json_type == "object":
return self._create_nested_model(schema, type_name)
return self._map_json_type_to_python(json_type)
def _create_nested_model(self, schema: Dict[str, Any], model_name: str) -> Type[Any]:
full_model_name = f"{self._base_name}{model_name}"
if full_model_name in self._model_registry:
return self._model_registry[full_model_name]
properties = schema.get("properties", {})
required_fields = schema.get("required", [])
if not properties:
return dict
field_definitions = {}
for prop_name, prop_schema in properties.items():
prop_desc = prop_schema.get("description", "")
is_required = prop_name in required_fields
try:
prop_type = self._process_schema_type(
prop_schema, f"{model_name}{self._sanitize_name(prop_name).title()}"
)
except Exception as e:
prop_type = str
field_definitions[prop_name] = self._create_field_definition(
prop_type, is_required, prop_desc
)
try:
nested_model = create_model(full_model_name, **field_definitions)
self._model_registry[full_model_name] = nested_model
return nested_model
except Exception as e:
print(f"Warning: Could not create nested model {full_model_name}: {e}")
return dict
def _create_field_definition(
self, field_type: Type[Any], is_required: bool, description: str
) -> tuple:
if is_required:
return (field_type, Field(description=description))
else:
if get_origin(field_type) is Union:
return (field_type, Field(default=None, description=description))
else:
return (
Optional[field_type],
Field(default=None, description=description),
)
def _map_json_type_to_python(self, json_type: str) -> Type[Any]:
type_mapping = {
"string": str,
"integer": int,
"number": float,
"boolean": bool,
"array": list,
"object": dict,
"null": type(None),
}
return type_mapping.get(json_type, str)
def _get_required_nullable_fields(self) -> List[str]:
schema_props, required = self._extract_schema_info(self.action_schema)
required_nullable_fields = []
for param_name in required:
param_details = schema_props.get(param_name, {})
if self._is_nullable_type(param_details):
required_nullable_fields.append(param_name)
return required_nullable_fields
def _is_nullable_type(self, schema: Dict[str, Any]) -> bool:
if "anyOf" in schema:
return any(t.get("type") == "null" for t in schema["anyOf"])
return schema.get("type") == "null"
def _run(self, **kwargs) -> str:
try:
cleaned_kwargs = {}
for key, value in kwargs.items():
if value is not None:
cleaned_kwargs[key] = value
required_nullable_fields = self._get_required_nullable_fields()
for field_name in required_nullable_fields:
if field_name not in cleaned_kwargs:
cleaned_kwargs[field_name] = None
api_url = f"{get_platform_api_base_url()}/actions/{self.action_name}/execute"
token = get_platform_integration_token()
headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
}
payload = cleaned_kwargs
response = requests.post(
url=api_url, headers=headers, json=payload, timeout=60
)
if not response.ok:
try:
data = response.json()
error_message = data.get("error", {}).get("message", json.dumps(data))
except ValueError:
# Guard against non-JSON error bodies
error_message = response.text
return f"API request failed: {error_message}"
return json.dumps(response.json(), indent=2)
except Exception as e:
return f"Error executing action {self.action_name}: {str(e)}"

View File

@@ -0,0 +1,135 @@
import requests
from typing import List, Any, Dict
from crewai.tools import BaseTool
from crewai_tools.tools.crewai_platform_tools.misc import get_platform_api_base_url, get_platform_integration_token
from crewai_tools.tools.crewai_platform_tools.crewai_platform_action_tool import CrewAIPlatformActionTool
class CrewaiPlatformToolBuilder:
def __init__(
self,
apps: list[str],
):
self._apps = apps
self._actions_schema = {}
self._tools = None
def tools(self) -> list[BaseTool]:
if self._tools is None:
self._fetch_actions()
self._create_tools()
return self._tools if self._tools is not None else []
def _fetch_actions(self):
actions_url = f"{get_platform_api_base_url()}/actions"
headers = {"Authorization": f"Bearer {get_platform_integration_token()}"}
try:
response = requests.get(
actions_url, headers=headers, timeout=30, params={"apps": ",".join(self._apps)}
)
response.raise_for_status()
except Exception:
# Leave the actions schema empty if the platform API is unreachable
return
raw_data = response.json()
self._actions_schema = {}
action_categories = raw_data.get("actions", {})
for app, action_list in action_categories.items():
if isinstance(action_list, list):
for action in action_list:
if action_name := action.get("name"):
action_schema = {
"function": {
"name": action_name,
"description": action.get("description", f"Execute {action_name}"),
"parameters": action.get("parameters", {}),
"app": app,
}
}
self._actions_schema[action_name] = action_schema
def _generate_detailed_description(
self, schema: Dict[str, Any], indent: int = 0
) -> List[str]:
descriptions = []
indent_str = " " * indent
schema_type = schema.get("type", "string")
if schema_type == "object":
properties = schema.get("properties", {})
required_fields = schema.get("required", [])
if properties:
descriptions.append(f"{indent_str}Object with properties:")
for prop_name, prop_schema in properties.items():
prop_desc = prop_schema.get("description", "")
is_required = prop_name in required_fields
req_str = " (required)" if is_required else " (optional)"
descriptions.append(
f"{indent_str} - {prop_name}: {prop_desc}{req_str}"
)
if prop_schema.get("type") == "object":
descriptions.extend(
self._generate_detailed_description(prop_schema, indent + 2)
)
elif prop_schema.get("type") == "array":
items_schema = prop_schema.get("items", {})
if items_schema.get("type") == "object":
descriptions.append(f"{indent_str} Array of objects:")
descriptions.extend(
self._generate_detailed_description(
items_schema, indent + 3
)
)
elif "enum" in items_schema:
descriptions.append(
f"{indent_str} Array of enum values: {items_schema['enum']}"
)
elif "enum" in prop_schema:
descriptions.append(
f"{indent_str} Enum values: {prop_schema['enum']}"
)
return descriptions
def _create_tools(self):
tools = []
for action_name, action_schema in self._actions_schema.items():
function_details = action_schema.get("function", {})
description = function_details.get("description", f"Execute {action_name}")
parameters = function_details.get("parameters", {})
param_descriptions = []
if parameters.get("properties"):
param_descriptions.append("\nDetailed Parameter Structure:")
param_descriptions.extend(
self._generate_detailed_description(parameters)
)
full_description = description + "\n".join(param_descriptions)
tool = CrewAIPlatformActionTool(
description=full_description,
action_name=action_name,
action_schema=action_schema,
)
tools.append(tool)
self._tools = tools
def __enter__(self):
return self.tools()
def __exit__(self, exc_type, exc_val, exc_tb):
pass

View File

@@ -0,0 +1,28 @@
import logging
from crewai.tools import BaseTool
from crewai_tools.tools.crewai_platform_tools.crewai_platform_tool_builder import CrewaiPlatformToolBuilder
from crewai_tools.adapters.tool_collection import ToolCollection
logger = logging.getLogger(__name__)
def CrewaiPlatformTools(
apps: list[str],
) -> ToolCollection[BaseTool]:
"""Factory function that returns crewai platform tools.
Args:
apps: List of platform apps to get tools that are available on the platform.
Returns:
A ToolCollection of BaseTool instances for platform actions
"""
builder = CrewaiPlatformToolBuilder(apps=apps)
return ToolCollection(builder.tools())

View File

@@ -0,0 +1,13 @@
import os
def get_platform_api_base_url() -> str:
"""Get the platform API base URL from environment or use default."""
base_url = os.getenv("CREWAI_PLUS_URL", "https://app.crewai.com")
return f"{base_url}/crewai_plus/api/v1/integrations"
def get_platform_integration_token() -> str:
"""Get the platform API base URL from environment or use default."""
token = os.getenv("CREWAI_PLATFORM_INTEGRATION_TOKEN") or ""
if not token:
raise ValueError("No platform integration token found, please set the CREWAI_PLATFORM_INTEGRATION_TOKEN environment variable")
return token # TODO: Use context manager to get token

View File

@@ -0,0 +1,59 @@
# CSVSearchTool
## Description
This tool is used to perform a RAG (Retrieval-Augmented Generation) search within a CSV file's content. It allows users to semantically search for queries in the content of a specified CSV file. This feature is particularly useful for extracting information from large CSV datasets where traditional search methods might be inefficient. All tools with "Search" in their name, including CSVSearchTool, are RAG tools designed for searching different sources of data.
## Installation
Install the crewai_tools package
```shell
pip install 'crewai[tools]'
```
## Example
```python
from crewai_tools import CSVSearchTool
# Initialize the tool with a specific CSV file. This setup allows the agent to only search the given CSV file.
tool = CSVSearchTool(csv='path/to/your/csvfile.csv')
# OR
# Initialize the tool without a specific CSV file. Agent will need to provide the CSV path at runtime.
tool = CSVSearchTool()
```
## Arguments
- `csv` : The path to the CSV file you want to search. This is a mandatory argument if the tool was initialized without a specific CSV file; otherwise, it is optional.
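If the tool was initialized without a file, the `csv` path is supplied at call time. A minimal direct-call sketch, assuming a placeholder path and query:
```python
result = tool.run(
    search_query="What is the average order value?",
    csv="path/to/your/csvfile.csv",
)
print(result)
```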
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = CSVSearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
# stream=true,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,56 @@
from typing import Optional, Type
try:
from embedchain.models.data_type import DataType
EMBEDCHAIN_AVAILABLE = True
except ImportError:
EMBEDCHAIN_AVAILABLE = False
from pydantic import BaseModel, Field
from ..rag.rag_tool import RagTool
class FixedCSVSearchToolSchema(BaseModel):
"""Input for CSVSearchTool."""
search_query: str = Field(
...,
description="Mandatory search query you want to use to search the CSV's content",
)
class CSVSearchToolSchema(FixedCSVSearchToolSchema):
"""Input for CSVSearchTool."""
csv: str = Field(..., description="File path or URL of a CSV file to be searched")
class CSVSearchTool(RagTool):
name: str = "Search a CSV's content"
description: str = (
"A tool that can be used to semantic search a query from a CSV's content."
)
args_schema: Type[BaseModel] = CSVSearchToolSchema
def __init__(self, csv: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
if csv is not None:
self.add(csv)
self.description = f"A tool that can be used to semantically search a query within the {csv} CSV's content."
self.args_schema = FixedCSVSearchToolSchema
self._generate_description()
def add(self, csv: str) -> None:
if not EMBEDCHAIN_AVAILABLE:
raise ImportError("embedchain is not installed. Please install it with `pip install crewai-tools[embedchain]`")
super().add(csv, data_type=DataType.CSV)
def _run(
self,
search_query: str,
csv: Optional[str] = None,
) -> str:
if csv is not None:
self.add(csv)
return super()._run(query=search_query)

View File

@@ -0,0 +1,41 @@
# DALL-E Tool
## Description
This tool is used to give the Agent the ability to generate images using the DALL-E model. It is a transformer-based model that generates images from textual descriptions. This tool allows the Agent to generate images based on the text input provided by the user.
## Installation
Install the crewai_tools package
```shell
pip install 'crewai[tools]'
```
## Example
Remember that when using this tool, the text prompt is generated by the Agent itself and must be a description of the image you want to generate.
```python
from crewai_tools import DallETool
Agent(
...
tools=[DallETool()],
)
```
If needed, you can also tweak the parameters of the DALL-E model by passing them as arguments to the `DallETool` class. For example:
```python
from crewai_tools import DallETool
dalle_tool = DallETool(model="dall-e-3",
size="1024x1024",
quality="standard",
n=1)
Agent(
...
tools=[dalle_tool]
)
```
The parameters are based on the `client.images.generate` method from the OpenAI API. For more information on the parameters, please refer to the [OpenAI API documentation](https://platform.openai.com/docs/guides/images/introduction?lang=python).
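You can also invoke the tool directly to inspect its output. This sketch assumes `OPENAI_API_KEY` is set; the tool returns a JSON string with the generated `image_url` and the revised `image_description`:
```python
result = dalle_tool.run(image_description="A watercolor painting of a lighthouse at dawn")
print(result)  # {"image_url": "...", "image_description": "..."}
```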

View File

@@ -0,0 +1,52 @@
import json
from typing import List, Type
from crewai.tools import BaseTool, EnvVar
from openai import OpenAI
from pydantic import BaseModel, Field
class ImagePromptSchema(BaseModel):
"""Input for Dall-E Tool."""
image_description: str = Field(description="Description of the image to be generated by Dall-E.")
class DallETool(BaseTool):
name: str = "Dall-E Tool"
description: str = "Generates images using OpenAI's Dall-E model."
args_schema: Type[BaseModel] = ImagePromptSchema
model: str = "dall-e-3"
size: str = "1024x1024"
quality: str = "standard"
n: int = 1
env_vars: List[EnvVar] = [
EnvVar(name="OPENAI_API_KEY", description="API key for OpenAI services", required=True),
]
def _run(self, **kwargs) -> str:
client = OpenAI()
image_description = kwargs.get("image_description")
if not image_description:
return "Image description is required."
response = client.images.generate(
model=self.model,
prompt=image_description,
size=self.size,
quality=self.quality,
n=self.n,
)
image_data = json.dumps(
{
"image_url": response.data[0].url,
"image_description": response.data[0].revised_prompt,
}
)
return image_data

View File

@@ -0,0 +1,66 @@
# Databricks Query Tool
## Description
This tool allows AI agents to execute SQL queries against Databricks workspace tables and retrieve the results. It provides a simple interface for querying data from Databricks tables using SQL, making it easy for agents to access and analyze data stored in Databricks.
## Installation
Install the crewai_tools package with the databricks extra:
```shell
pip install 'crewai[tools]' 'databricks-sdk'
```
## Authentication
The tool requires Databricks authentication credentials. You can provide these in two ways:
1. **Using Databricks CLI profile**:
- Set the `DATABRICKS_CONFIG_PROFILE` environment variable to your profile name.
2. **Using direct credentials**:
- Set both `DATABRICKS_HOST` and `DATABRICKS_TOKEN` environment variables.
Example:
```shell
export DATABRICKS_HOST="https://your-workspace.cloud.databricks.com"
export DATABRICKS_TOKEN="dapi1234567890abcdef"
```
## Usage
```python
from crewai_tools import DatabricksQueryTool
# Basic usage
databricks_tool = DatabricksQueryTool()
# With default parameters for catalog, schema, and warehouse
databricks_tool = DatabricksQueryTool(
default_catalog="my_catalog",
default_schema="my_schema",
default_warehouse_id="warehouse_id"
)
# Example in a CrewAI agent
@agent
def data_analyst(self) -> Agent:
return Agent(
config=self.agents_config["data_analyst"],
allow_delegation=False,
tools=[databricks_tool]
)
```
## Parameters
When executing queries, you can provide the following parameters:
- `query` (required): SQL query to execute against the Databricks workspace
- `catalog` (optional): Databricks catalog name
- `schema` (optional): Databricks schema name
- `warehouse_id` (optional): Databricks SQL warehouse ID
- `row_limit` (optional): Maximum number of rows to return (default: 1000)
If not provided, the tool will use the default values set during initialization.
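As a quick smoke test outside of a crew, you can call the tool directly. The fully qualified table name below is a placeholder:
```python
result = databricks_tool.run(
    query="SELECT * FROM my_catalog.my_schema.my_table",
    row_limit=10,
)
print(result)
```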

View File

@@ -0,0 +1,670 @@
import os
import re
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union
from crewai.tools import BaseTool
from pydantic import BaseModel, Field, model_validator
if TYPE_CHECKING:
from databricks.sdk import WorkspaceClient
class DatabricksQueryToolSchema(BaseModel):
"""Input schema for DatabricksQueryTool."""
query: str = Field(
..., description="SQL query to execute against the Databricks workspace table"
)
catalog: Optional[str] = Field(
None, description="Databricks catalog name (optional, defaults to configured catalog)"
)
db_schema: Optional[str] = Field(
None, description="Databricks schema name (optional, defaults to configured schema)"
)
warehouse_id: Optional[str] = Field(
None, description="Databricks SQL warehouse ID (optional, defaults to configured warehouse)"
)
row_limit: Optional[int] = Field(
1000, description="Maximum number of rows to return (default: 1000)"
)
@model_validator(mode='after')
def validate_input(self) -> 'DatabricksQueryToolSchema':
"""Validate the input parameters."""
# Ensure the query is not empty
if not self.query or not self.query.strip():
raise ValueError("Query cannot be empty")
# Add a LIMIT clause if row_limit is set and the query doesn't already contain one
if self.row_limit and not re.search(r"\blimit\b", self.query, re.IGNORECASE):
self.query = f"{self.query.rstrip(';')} LIMIT {self.row_limit};"
return self
class DatabricksQueryTool(BaseTool):
"""
A tool for querying Databricks workspace tables using SQL.
This tool executes SQL queries against Databricks tables and returns the results.
It requires Databricks authentication credentials to be set as environment variables.
Authentication can be provided via:
- Databricks CLI profile: Set DATABRICKS_CONFIG_PROFILE environment variable
- Direct credentials: Set DATABRICKS_HOST and DATABRICKS_TOKEN environment variables
Example:
>>> tool = DatabricksQueryTool()
>>> results = tool.run(query="SELECT * FROM my_table LIMIT 10")
"""
name: str = "Databricks SQL Query"
description: str = (
"Execute SQL queries against Databricks workspace tables and return the results."
" Provide a 'query' parameter with the SQL query to execute."
)
args_schema: Type[BaseModel] = DatabricksQueryToolSchema
# Optional default parameters
default_catalog: Optional[str] = None
default_schema: Optional[str] = None
default_warehouse_id: Optional[str] = None
_workspace_client: Optional["WorkspaceClient"] = None
package_dependencies: List[str] = ["databricks-sdk"]
def __init__(
self,
default_catalog: Optional[str] = None,
default_schema: Optional[str] = None,
default_warehouse_id: Optional[str] = None,
**kwargs: Any,
) -> None:
"""
Initialize the DatabricksQueryTool.
Args:
default_catalog (Optional[str]): Default catalog to use for queries.
default_schema (Optional[str]): Default schema to use for queries.
default_warehouse_id (Optional[str]): Default SQL warehouse ID to use.
**kwargs: Additional keyword arguments passed to BaseTool.
"""
super().__init__(**kwargs)
self.default_catalog = default_catalog
self.default_schema = default_schema
self.default_warehouse_id = default_warehouse_id
self._validate_credentials()
def _validate_credentials(self) -> None:
"""Validate that Databricks credentials are available."""
has_profile = "DATABRICKS_CONFIG_PROFILE" in os.environ
has_direct_auth = "DATABRICKS_HOST" in os.environ and "DATABRICKS_TOKEN" in os.environ
if not (has_profile or has_direct_auth):
raise ValueError(
"Databricks authentication credentials are required. "
"Set either DATABRICKS_CONFIG_PROFILE or both DATABRICKS_HOST and DATABRICKS_TOKEN environment variables."
)
@property
def workspace_client(self) -> "WorkspaceClient":
"""Get or create a Databricks WorkspaceClient instance."""
if self._workspace_client is None:
try:
from databricks.sdk import WorkspaceClient
self._workspace_client = WorkspaceClient()
except ImportError:
raise ImportError(
"`databricks-sdk` package not found, please run `uv add databricks-sdk`"
)
return self._workspace_client
def _format_results(self, results: List[Dict[str, Any]]) -> str:
"""Format query results as a readable string."""
if not results:
return "Query returned no results."
# Get column names from the first row
if not results[0]:
return "Query returned empty rows with no columns."
columns = list(results[0].keys())
# If we have rows but they're all empty, handle that case
if not columns:
return "Query returned rows but with no column data."
# Calculate column widths based on data
col_widths = {col: len(col) for col in columns}
for row in results:
for col in columns:
# Convert value to string and get its length
# Handle None values gracefully
value_str = str(row[col]) if row[col] is not None else "NULL"
col_widths[col] = max(col_widths[col], len(value_str))
# Create header row
header = " | ".join(f"{col:{col_widths[col]}}" for col in columns)
separator = "-+-".join("-" * col_widths[col] for col in columns)
# Format data rows
data_rows = []
for row in results:
# Handle None values by displaying "NULL"
row_values = {col: str(row[col]) if row[col] is not None else "NULL" for col in columns}
data_row = " | ".join(f"{row_values[col]:{col_widths[col]}}" for col in columns)
data_rows.append(data_row)
# Add row count information
result_info = f"({len(results)} row{'s' if len(results) != 1 else ''} returned)"
# Combine all parts
return f"{header}\n{separator}\n" + "\n".join(data_rows) + f"\n\n{result_info}"
def _run(
self,
**kwargs: Any,
) -> str:
"""
Execute a SQL query against Databricks and return the results.
Args:
query (str): SQL query to execute
catalog (Optional[str]): Databricks catalog name
db_schema (Optional[str]): Databricks schema name
warehouse_id (Optional[str]): SQL warehouse ID
row_limit (Optional[int]): Maximum number of rows to return
Returns:
str: Formatted query results
"""
try:
# Get parameters with fallbacks to default values
query = kwargs.get("query")
catalog = kwargs.get("catalog") or self.default_catalog
db_schema = kwargs.get("db_schema") or self.default_schema
warehouse_id = kwargs.get("warehouse_id") or self.default_warehouse_id
row_limit = kwargs.get("row_limit", 1000)
# Validate schema and query
validated_input = DatabricksQueryToolSchema(
query=query,
catalog=catalog,
db_schema=db_schema,
warehouse_id=warehouse_id,
row_limit=row_limit
)
# Extract validated parameters
query = validated_input.query
catalog = validated_input.catalog
db_schema = validated_input.db_schema
warehouse_id = validated_input.warehouse_id
# Setup SQL context with catalog/schema if provided
context = {}
if catalog:
context["catalog"] = catalog
if db_schema:
context["schema"] = db_schema
# Execute query
statement = self.workspace_client.statement_execution
try:
# Execute the statement
execution = statement.execute_statement(
warehouse_id=warehouse_id,
statement=query,
**context
)
statement_id = execution.statement_id
except Exception as execute_error:
# Handle immediate execution errors
return f"Error starting query execution: {str(execute_error)}"
# Poll for results with better error handling
import time
result = None
timeout = 300 # 5 minutes timeout
start_time = time.time()
poll_count = 0
previous_state = None # Track previous state to detect changes
while time.time() - start_time < timeout:
poll_count += 1
try:
# Get statement status
result = statement.get_statement(statement_id)
# Check if finished - be very explicit about state checking
if hasattr(result, 'status') and hasattr(result.status, 'state'):
state_value = str(result.status.state) # Convert to string to handle both string and enum
# Track state changes for debugging
if previous_state != state_value:
previous_state = state_value
# Check if state indicates completion
if "SUCCEEDED" in state_value:
break
elif "FAILED" in state_value:
# Extract error message with more robust handling
error_info = "No detailed error info"
try:
# First try direct access to error.message
if hasattr(result.status, 'error') and result.status.error:
if hasattr(result.status.error, 'message'):
error_info = result.status.error.message
# Some APIs may have a different structure
elif hasattr(result.status.error, 'error_message'):
error_info = result.status.error.error_message
# Last resort, try to convert the whole error object to string
else:
error_info = str(result.status.error)
except Exception as err_extract_error:
# If all else fails, try to get any info we can
error_info = f"Error details unavailable: {str(err_extract_error)}"
# Return immediately on first FAILED state detection
return f"Query execution failed: {error_info}"
elif "CANCELED" in state_value:
return "Query was canceled"
except Exception as poll_error:
# Don't immediately fail - try again a few times
if poll_count > 3:
return f"Error checking query status: {str(poll_error)}"
# Wait before polling again
time.sleep(2)
# Check if we timed out
if result is None:
return "Query returned no result (likely timed out or failed)"
if not hasattr(result, 'status') or not hasattr(result.status, 'state'):
return "Query completed but returned an invalid result structure"
# Convert state to string for comparison
state_value = str(result.status.state)
if not any(state in state_value for state in ["SUCCEEDED", "FAILED", "CANCELED"]):
return f"Query timed out after 5 minutes (last state: {state_value})"
# Get results - adapt this based on the actual structure of the result object
chunk_results = []
# Check if we have results and a schema in a very defensive way
has_schema = (hasattr(result, 'manifest') and result.manifest is not None and
hasattr(result.manifest, 'schema') and result.manifest.schema is not None)
has_result = (hasattr(result, 'result') and result.result is not None)
if has_schema and has_result:
try:
# Get schema for column names
columns = [col.name for col in result.manifest.schema.columns]
# Debug info for schema
# Keep track of all dynamic columns we create
all_columns = set(columns)
# Dump the raw structure of result data to help troubleshoot
if hasattr(result.result, 'data_array'):
# Add defensive check for None data_array
if result.result.data_array is None:
print("data_array is None - likely an empty result set or DDL query")
# Return empty result handling rather than trying to process null data
return "Query executed successfully (no data returned)"
# IMPROVED DETECTION LOGIC: Check if we're possibly dealing with rows where each item
# contains a single value or character (which could indicate incorrect row structure)
is_likely_incorrect_row_structure = False
# Only try to analyze sample if data_array exists and has content
if hasattr(result.result, 'data_array') and result.result.data_array and len(result.result.data_array) > 0 and len(result.result.data_array[0]) > 0:
sample_size = min(20, len(result.result.data_array[0]))
if sample_size > 0:
single_char_count = 0
single_digit_count = 0
total_items = 0
for i in range(sample_size):
val = result.result.data_array[0][i]
total_items += 1
if isinstance(val, str) and len(val) == 1 and not val.isdigit():
single_char_count += 1
elif isinstance(val, str) and len(val) == 1 and val.isdigit():
single_digit_count += 1
# If a significant portion of the first values are single characters or digits,
# this likely indicates data is being incorrectly structured
if total_items > 0 and (single_char_count + single_digit_count) / total_items > 0.5:
is_likely_incorrect_row_structure = True
# Additional check: if many rows have just 1 item when we expect multiple columns
rows_with_single_item = 0
if hasattr(result.result, 'data_array') and result.result.data_array and len(result.result.data_array) > 0:
sample_size_for_rows = min(sample_size, len(result.result.data_array[0])) if 'sample_size' in locals() else min(20, len(result.result.data_array[0]))
rows_with_single_item = sum(1 for row in result.result.data_array[0][:sample_size_for_rows] if isinstance(row, list) and len(row) == 1)
if rows_with_single_item > sample_size_for_rows * 0.5 and len(columns) > 1:
is_likely_incorrect_row_structure = True
# Check if we're getting primarily single characters or the data structure seems off,
# we should use special handling
if 'is_likely_incorrect_row_structure' in locals() and is_likely_incorrect_row_structure:
print("Data appears to be malformed - will use special row reconstruction")
needs_special_string_handling = True
else:
needs_special_string_handling = False
# Process results differently based on detection
if 'needs_special_string_handling' in locals() and needs_special_string_handling:
# We're dealing with data where the rows may be incorrectly structured
print("Using row reconstruction processing mode")
# Collect all values into a flat list
all_values = []
if hasattr(result.result, 'data_array') and result.result.data_array:
# Flatten all values into a single list
for chunk in result.result.data_array:
for item in chunk:
if isinstance(item, (list, tuple)):
all_values.extend(item)
else:
all_values.append(item)
# Get the expected column count from schema
expected_column_count = len(columns)
# Try to reconstruct rows using pattern recognition
reconstructed_rows = []
# PATTERN RECOGNITION APPROACH
# Look for likely indicators of row boundaries in the data
# For Netflix data, we expect IDs as numbers, titles as text strings, etc.
# Use regex pattern to identify ID columns that likely start a new row
import re
id_pattern = re.compile(r'^\d{5,9}$') # Netflix IDs are often 5-9 digits
id_indices = []
for i, val in enumerate(all_values):
if isinstance(val, str) and id_pattern.match(val):
# This value looks like an ID, might be the start of a row
if i < len(all_values) - 1:
next_few_values = all_values[i+1:i+5]
# If following values look like they could be part of a title
if any(isinstance(v, str) and len(v) > 1 for v in next_few_values):
id_indices.append(i)
if id_indices:
# If we found potential row starts, use them to extract rows
for i in range(len(id_indices)):
start_idx = id_indices[i]
end_idx = id_indices[i+1] if i+1 < len(id_indices) else len(all_values)
# Extract values for this row
row_values = all_values[start_idx:end_idx]
# Special handling for Netflix title data
# Titles might be split into individual characters
if 'Title' in columns and len(row_values) > expected_column_count:
# Try to reconstruct by looking for patterns
# We know ID is first, then Title (which may be split)
# Then other fields like Genre, etc.
# Take first value as ID
row_dict = {columns[0]: row_values[0]}
# Look for Genre or other non-title fields to determine where title ends
title_end_idx = 1
for j in range(2, min(100, len(row_values))):
val = row_values[j]
# Check for common genres or non-title markers
if isinstance(val, str) and val in ['Comedy', 'Drama', 'Action', 'Horror', 'Thriller', 'Documentary']:
# Likely found the Genre field
title_end_idx = j
break
# Reconstruct title from individual characters
if title_end_idx > 1:
title_chars = row_values[1:title_end_idx]
# Check if they're individual characters
if all(isinstance(c, str) and len(c) == 1 for c in title_chars):
title = ''.join(title_chars)
row_dict['Title'] = title
# Assign remaining values to columns
remaining_values = row_values[title_end_idx:]
for j, col_name in enumerate(columns[2:], 2):
if j-2 < len(remaining_values):
row_dict[col_name] = remaining_values[j-2]
else:
row_dict[col_name] = None
else:
# Fallback: simple mapping
for j, col_name in enumerate(columns):
if j < len(row_values):
row_dict[col_name] = row_values[j]
else:
row_dict[col_name] = None
else:
# Standard mapping
row_dict = {}
for j, col_name in enumerate(columns):
if j < len(row_values):
row_dict[col_name] = row_values[j]
else:
row_dict[col_name] = None
reconstructed_rows.append(row_dict)
else:
# More intelligent chunking - try to detect where columns like Title might be split
title_idx = columns.index('Title') if 'Title' in columns else -1
if title_idx >= 0:
print("Attempting title reconstruction method")
# Try to detect if title is split across multiple values
i = 0
while i < len(all_values):
# Check if this could be an ID (start of a row)
if isinstance(all_values[i], str) and id_pattern.match(all_values[i]):
row_dict = {columns[0]: all_values[i]}
i += 1
# Try to reconstruct title if it appears to be split
title_chars = []
while (i < len(all_values) and
isinstance(all_values[i], str) and
len(all_values[i]) <= 1 and
len(title_chars) < 100): # Cap title length
title_chars.append(all_values[i])
i += 1
if title_chars:
row_dict[columns[title_idx]] = ''.join(title_chars)
# Add remaining fields
for j in range(title_idx + 1, len(columns)):
if i < len(all_values):
row_dict[columns[j]] = all_values[i]
i += 1
else:
row_dict[columns[j]] = None
reconstructed_rows.append(row_dict)
else:
i += 1
# If we still don't have rows, use simple chunking as fallback
if not reconstructed_rows:
print("Falling back to basic chunking approach")
chunks = [all_values[i:i+expected_column_count] for i in range(0, len(all_values), expected_column_count)]
for chunk in chunks:
# Skip chunks that seem to be partial/incomplete rows
if len(chunk) < expected_column_count * 0.75: # Allow for some missing values
continue
row_dict = {}
# Map values to column names
for i, col in enumerate(columns):
if i < len(chunk):
row_dict[col] = chunk[i]
else:
row_dict[col] = None
reconstructed_rows.append(row_dict)
# Apply post-processing to fix known issues
if reconstructed_rows and 'Title' in columns:
print("Applying post-processing to improve data quality")
for row in reconstructed_rows:
# Fix titles that might still have issues
if isinstance(row.get('Title'), str) and len(row.get('Title')) <= 1:
# This is likely still a fragmented title - mark as potentially incomplete
row['Title'] = f"[INCOMPLETE] {row.get('Title')}"
# Ensure we respect the row limit
if row_limit and len(reconstructed_rows) > row_limit:
reconstructed_rows = reconstructed_rows[:row_limit]
chunk_results = reconstructed_rows
else:
# Process normal result structure as before
print("Using standard processing mode")
# Check different result structures
if hasattr(result.result, 'data_array') and result.result.data_array:
# Check if data appears to be malformed within chunks
for chunk_idx, chunk in enumerate(result.result.data_array):
# Check if chunk might actually contain individual columns of a single row
# This is another way data might be malformed - check the first few values
if len(chunk) > 0 and len(columns) > 1:
# If there seems to be a mismatch between chunk structure and expected columns
first_few_values = chunk[:min(5, len(chunk))]
if all(isinstance(val, (str, int, float)) and not isinstance(val, (list, dict)) for val in first_few_values):
if len(chunk) > len(columns) * 3: # Heuristic: if chunk has way more items than columns
print("Chunk appears to contain individual values rather than rows - switching to row reconstruction")
# This chunk might actually be values of multiple rows - try to reconstruct
values = chunk # All values in this chunk
reconstructed_rows = []
# Try to create rows based on expected column count
for i in range(0, len(values), len(columns)):
if i + len(columns) <= len(values): # Ensure we have enough values
row_values = values[i:i+len(columns)]
row_dict = {col: val for col, val in zip(columns, row_values)}
reconstructed_rows.append(row_dict)
if reconstructed_rows:
chunk_results.extend(reconstructed_rows)
continue # Skip normal processing for this chunk
# Special case: when chunk contains exactly the right number of values for a single row
# This handles the case where instead of a list of rows, we just got all values in a flat list
if all(isinstance(val, (str, int, float)) and not isinstance(val, (list, dict)) for val in chunk):
if len(chunk) == len(columns) or (len(chunk) > 0 and len(chunk) % len(columns) == 0):
# Process flat list of values as rows
for i in range(0, len(chunk), len(columns)):
row_values = chunk[i:i+len(columns)]
if len(row_values) == len(columns): # Only process complete rows
row_dict = {col: val for col, val in zip(columns, row_values)}
chunk_results.append(row_dict)
# Skip regular row processing for this chunk
continue
# Normal processing for typical row structure
for row_idx, row in enumerate(chunk):
# Ensure row is actually a collection of values
if not isinstance(row, (list, tuple, dict)):
# This might be a single value; skip it or handle specially
continue
# Convert each row to a dictionary with column names as keys
row_dict = {}
# Handle dict rows directly
if isinstance(row, dict):
# Use the existing column mapping
row_dict = dict(row)
elif isinstance(row, (list, tuple)):
# Map list of values to columns
for i, val in enumerate(row):
if i < len(columns): # Only process if we have a matching column
row_dict[columns[i]] = val
else:
# Extra values without column names
dynamic_col = f"Column_{i}"
row_dict[dynamic_col] = val
all_columns.add(dynamic_col)
# If we have fewer values than columns, set missing values to None
for col in columns:
if col not in row_dict:
row_dict[col] = None
chunk_results.append(row_dict)
elif hasattr(result.result, 'data') and result.result.data:
# Alternative data structure
for row_idx, row in enumerate(result.result.data):
# Debug info
# Safely create dictionary matching column names to values
row_dict = {}
for i, val in enumerate(row):
if i < len(columns): # Only process if we have a matching column
row_dict[columns[i]] = val
else:
# Extra values without column names
dynamic_col = f"Column_{i}"
row_dict[dynamic_col] = val
all_columns.add(dynamic_col)
# If we have fewer values than columns, set missing values to None
for i, col in enumerate(columns):
if i >= len(row):
row_dict[col] = None
chunk_results.append(row_dict)
# After processing all rows, ensure all rows have all columns
normalized_results = []
for row in chunk_results:
# Create a new row with all columns, defaulting to None for missing ones
normalized_row = {col: row.get(col, None) for col in all_columns}
normalized_results.append(normalized_row)
# Replace the original results with normalized ones
chunk_results = normalized_results
except Exception as results_error:
# Enhanced error message with more context
import traceback
error_details = traceback.format_exc()
return f"Error processing query results: {str(results_error)}\n\nDetails:\n{error_details}"
# If we have no results but the query succeeded (e.g., for DDL statements)
if not chunk_results and hasattr(result, 'status'):
state_value = str(result.status.state)
if "SUCCEEDED" in state_value:
return "Query executed successfully (no results to display)"
# Format and return results
return self._format_results(chunk_results)
except Exception as e:
# Include more details in the error message to help with debugging
import traceback
error_details = traceback.format_exc()
return f"Error executing Databricks query: {str(e)}\n\nDetails:\n{error_details}"

View File

@@ -0,0 +1,40 @@
# DirectoryReadTool
## Description
The DirectoryReadTool is a highly efficient utility designed for the comprehensive listing of directory contents. It recursively navigates through the specified directory, providing users with a detailed enumeration of all files, including those nested within subdirectories. This tool is indispensable for tasks requiring a thorough inventory of directory structures or for validating the organization of files within directories.
## Installation
Install the `crewai_tools` package to use the DirectoryReadTool in your project. If you haven't added this package to your environment, you can easily install it with pip using the following command:
```shell
pip install 'crewai[tools]'
```
This installs the latest version of the `crewai_tools` package, allowing access to the DirectoryReadTool and other utilities.
## Example
The DirectoryReadTool is simple to use. The code snippet below shows how to set up and use the tool to list the contents of a specified directory:
```python
from crewai_tools import DirectoryReadTool
# Initialize the tool with the directory you want to explore
tool = DirectoryReadTool(directory='/path/to/your/directory')
# Use the tool to list the contents of the specified directory
directory_contents = tool.run()
print(directory_contents)
```
This example demonstrates the essential steps to utilize the DirectoryReadTool effectively, highlighting its simplicity and user-friendly design.
## Arguments
The DirectoryReadTool requires minimal configuration for use. The essential argument for this tool is as follows:
- `directory`: A mandatory argument that specifies the path to the directory whose contents you wish to list. It accepts both absolute and relative paths, guiding the tool to the desired directory for content listing.
The DirectoryReadTool provides a user-friendly and efficient way to list directory contents, making it an invaluable tool for managing and inspecting directory structures.

View File

@@ -0,0 +1,47 @@
import os
from typing import Any, Optional, Type
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class FixedDirectoryReadToolSchema(BaseModel):
"""Input for DirectoryReadTool."""
class DirectoryReadToolSchema(FixedDirectoryReadToolSchema):
"""Input for DirectoryReadTool."""
directory: str = Field(..., description="Mandatory directory to list content")
class DirectoryReadTool(BaseTool):
name: str = "List files in directory"
description: str = (
"A tool that can be used to recursively list a directory's content."
)
args_schema: Type[BaseModel] = DirectoryReadToolSchema
directory: Optional[str] = None
def __init__(self, directory: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
if directory is not None:
self.directory = directory
self.description = f"A tool that can be used to list {directory}'s content."
self.args_schema = FixedDirectoryReadToolSchema
self._generate_description()
def _run(
self,
**kwargs: Any,
) -> Any:
directory = kwargs.get("directory", self.directory)
if not directory:
return "Error: No directory provided to list."
if directory.endswith("/"):
directory = directory[:-1]
files_list = [
f"{directory}/{(os.path.join(root, filename).replace(directory, '').lstrip(os.path.sep))}"
for root, dirs, files in os.walk(directory)
for filename in files
]
files = "\n- ".join(files_list)
return f"File paths: \n-{files}"

View File

@@ -0,0 +1,55 @@
# DirectorySearchTool
## Description
This tool is designed to perform a semantic search for queries within the content of a specified directory. Utilizing the RAG (Retrieval-Augmented Generation) methodology, it offers a powerful means to semantically navigate through the files of a given directory. The tool can be dynamically set to search any directory specified at runtime or can be pre-configured to search within a specific directory upon initialization.
## Installation
To start using the DirectorySearchTool, you need to install the crewai_tools package. Execute the following command in your terminal:
```shell
pip install 'crewai[tools]'
```
## Example
The following examples demonstrate how to initialize the DirectorySearchTool for different use cases and how to perform a search:
```python
from crewai_tools import DirectorySearchTool
# To enable searching within any specified directory at runtime
tool = DirectorySearchTool()
# Alternatively, to restrict searches to a specific directory
tool = DirectorySearchTool(directory='/path/to/directory')
```
## Arguments
- `directory` : This string argument specifies the directory within which to search. It is mandatory if the tool has not been initialized with a directory; otherwise, the tool will only search within the initialized directory.
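When the tool is initialized without a directory, both arguments are supplied at call time. A minimal direct-call sketch with placeholder values:
```python
result = tool.run(
    search_query="quarterly revenue figures",
    directory="/path/to/directory",
)
print(result)
```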
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = DirectorySearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
# stream=true,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,59 @@
from typing import Optional, Type
try:
from embedchain.loaders.directory_loader import DirectoryLoader
EMBEDCHAIN_AVAILABLE = True
except ImportError:
EMBEDCHAIN_AVAILABLE = False
from pydantic import BaseModel, Field
from ..rag.rag_tool import RagTool
class FixedDirectorySearchToolSchema(BaseModel):
"""Input for DirectorySearchTool."""
search_query: str = Field(
...,
description="Mandatory search query you want to use to search the directory's content",
)
class DirectorySearchToolSchema(FixedDirectorySearchToolSchema):
"""Input for DirectorySearchTool."""
directory: str = Field(..., description="Mandatory directory you want to search")
class DirectorySearchTool(RagTool):
name: str = "Search a directory's content"
description: str = (
"A tool that can be used to semantic search a query from a directory's content."
)
args_schema: Type[BaseModel] = DirectorySearchToolSchema
def __init__(self, directory: Optional[str] = None, **kwargs):
if not EMBEDCHAIN_AVAILABLE:
raise ImportError("embedchain is not installed. Please install it with `pip install crewai-tools[embedchain]`")
super().__init__(**kwargs)
if directory is not None:
self.add(directory)
self.description = f"A tool that can be used to semantic search a query the {directory} directory's content."
self.args_schema = FixedDirectorySearchToolSchema
self._generate_description()
def add(self, directory: str) -> None:
super().add(
directory,
loader=DirectoryLoader(config=dict(recursive=True)),
)
def _run(
self,
search_query: str,
directory: Optional[str] = None,
) -> str:
if directory is not None:
self.add(directory)
return super()._run(query=search_query)

View File

@@ -0,0 +1,57 @@
# DOCXSearchTool
## Description
The DOCXSearchTool is a RAG tool designed for semantic searching within DOCX documents. It enables users to effectively search and extract relevant information from DOCX files using query-based searches. This tool is invaluable for data analysis, information management, and research tasks, streamlining the process of finding specific information within large document collections.
## Installation
Install the crewai_tools package by running the following command in your terminal:
```shell
pip install 'crewai[tools]'
```
## Example
The following example demonstrates initializing the DOCXSearchTool to search within any DOCX file's content or with a specific DOCX file path.
```python
from crewai_tools import DOCXSearchTool
# Initialize the tool to search within any DOCX file's content
tool = DOCXSearchTool()
# OR
# Initialize the tool with a specific DOCX file, so the agent can only search the content of the specified DOCX file
tool = DOCXSearchTool(docx='path/to/your/document.docx')
```
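A query can then be executed via `run`; the file path and question below are placeholders:
```python
result = tool.run(
    search_query="What are the termination clauses?",
    docx="path/to/your/document.docx",
)
print(result)
```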
## Arguments
- `docx`: An optional file path to a specific DOCX document you wish to search. If not provided during initialization, the tool allows for later specification of any DOCX file's content path for searching.
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = DOCXSearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
# stream=true,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,62 @@
from typing import Any, Optional, Type
try:
from embedchain.models.data_type import DataType
EMBEDCHAIN_AVAILABLE = True
except ImportError:
EMBEDCHAIN_AVAILABLE = False
from pydantic import BaseModel, Field
from ..rag.rag_tool import RagTool
class FixedDOCXSearchToolSchema(BaseModel):
"""Input for DOCXSearchTool."""
    search_query: str = Field(
        ...,
        description="Mandatory search query you want to use to search the DOCX's content",
    )
class DOCXSearchToolSchema(FixedDOCXSearchToolSchema):
    """Input for DOCXSearchTool."""
    docx: str = Field(
        ..., description="Mandatory file path or URL of the DOCX file to be searched"
    )
class DOCXSearchTool(RagTool):
name: str = "Search a DOCX's content"
description: str = (
"A tool that can be used to semantic search a query from a DOCX's content."
)
args_schema: Type[BaseModel] = DOCXSearchToolSchema
def __init__(self, docx: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
if docx is not None:
self.add(docx)
self.description = f"A tool that can be used to semantic search a query the {docx} DOCX's content."
self.args_schema = FixedDOCXSearchToolSchema
self._generate_description()
def add(self, docx: str) -> None:
if not EMBEDCHAIN_AVAILABLE:
raise ImportError("embedchain is not installed. Please install it with `pip install crewai-tools[embedchain]`")
super().add(docx, data_type=DataType.DOCX)
def _run(
self,
search_query: str,
docx: Optional[str] = None,
) -> Any:
if docx is not None:
self.add(docx)
return super()._run(query=search_query)

View File

@@ -0,0 +1,30 @@
# EXASearchTool Documentation
## Description
This tool performs semantic searches across the internet for a specified query. It uses the [Exa](https://exa.ai/) API to fetch and return the most relevant results for the query provided by the user.
## Installation
To incorporate this tool into your project, follow the installation instructions below:
```shell
uv add 'crewai[tools]' exa_py
```
## Example
The following example demonstrates how to initialize the tool and execute a search with a given query:
```python
from crewai_tools import EXASearchTool
# Initialize the tool for internet searching capabilities
tool = EXASearchTool(api_key="your_api_key")
```
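Once initialized, a search is executed through `run`. A minimal sketch, where the query, date, and domain filter are illustrative:
```python
results = tool.run(
    search_query="latest advancements in AI agents",  # illustrative query
    start_published_date="2024-01-01",  # optional date filter
    include_domains=["arxiv.org"],  # optional domain allowlist
)
print(results)
```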
## Steps to Get Started
To effectively use the `EXASearchTool`, follow these steps:
1. **Package Installation**: Confirm that the `crewai[tools]` and `exa_py` packages are installed in your Python environment.
2. **API Key Acquisition**: Acquire an API key by registering for a free account at [exa.ai](https://exa.ai/).
3. **Environment Configuration**: Store the key in an environment variable named `EXA_API_KEY` so the tool can pick it up automatically.
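For example, in a POSIX shell (the key value is a placeholder):
```shell
export EXA_API_KEY="your_api_key_here"
```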
## Conclusion
By integrating the `EXASearchTool` into Python projects, users gain the ability to conduct real-time, relevant searches across the internet directly from their applications. Following the setup and usage guidelines above makes incorporating the tool straightforward.

View File

@@ -0,0 +1,108 @@
import os
from typing import Any, List, Optional, Type
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, Field
try:
from exa_py import Exa
EXA_INSTALLED = True
except ImportError:
Exa = Any
EXA_INSTALLED = False
class EXABaseToolSchema(BaseModel):
search_query: str = Field(
..., description="Mandatory search query you want to use to search the internet"
)
start_published_date: Optional[str] = Field(
None, description="Start date for the search"
)
end_published_date: Optional[str] = Field(
None, description="End date for the search"
)
include_domains: Optional[list[str]] = Field(
None, description="List of domains to include in the search"
)
class EXASearchTool(BaseTool):
model_config = {"arbitrary_types_allowed": True}
name: str = "EXASearchTool"
description: str = "Search the internet using Exa"
args_schema: Type[BaseModel] = EXABaseToolSchema
client: Optional["Exa"] = None
content: Optional[bool] = False
summary: Optional[bool] = False
type: Optional[str] = "auto"
package_dependencies: List[str] = ["exa_py"]
api_key: Optional[str] = Field(
default_factory=lambda: os.getenv("EXA_API_KEY"),
description="API key for Exa services",
json_schema_extra={"required": False},
)
env_vars: List[EnvVar] = [
EnvVar(
name="EXA_API_KEY", description="API key for Exa services", required=False
),
]
def __init__(
self,
content: Optional[bool] = False,
summary: Optional[bool] = False,
type: Optional[str] = "auto",
**kwargs,
):
super().__init__(
**kwargs,
)
        if not EXA_INSTALLED:
            import click
            if click.confirm(
                "You are missing the 'exa_py' package. Would you like to install it?"
            ):
                import subprocess
                subprocess.run(["uv", "add", "exa_py"], check=True)
                # Re-import so the freshly installed package replaces the `Any` placeholder.
                from exa_py import Exa as _Exa
                self.client = _Exa(api_key=self.api_key)
            else:
                raise ImportError(
                    "The 'exa_py' package is required. Install it with `uv add exa_py`."
                )
        else:
            self.client = Exa(api_key=self.api_key)
self.content = content
self.summary = summary
self.type = type
def _run(
self,
search_query: str,
start_published_date: Optional[str] = None,
end_published_date: Optional[str] = None,
include_domains: Optional[list[str]] = None,
) -> Any:
if self.client is None:
raise ValueError("Client not initialized")
search_params = {
"type": self.type,
}
if start_published_date:
search_params["start_published_date"] = start_published_date
if end_published_date:
search_params["end_published_date"] = end_published_date
if include_domains:
search_params["include_domains"] = include_domains
if self.content:
results = self.client.search_and_contents(
search_query, summary=self.summary, **search_params
)
else:
results = self.client.search(search_query, **search_params)
return results

View File

@@ -0,0 +1,40 @@
# FileReadTool
## Description
The FileReadTool is a versatile component of the crewai_tools package, designed to streamline the process of reading and retrieving content from files. It is particularly useful in scenarios such as batch text file processing, runtime configuration file reading, and data importation for analytics. This tool supports various text-based file formats including `.txt`, `.csv`, `.json`, and adapts its functionality based on the file type, for instance, converting JSON content into a Python dictionary for easy use.
The tool also supports reading specific chunks of a file by specifying a starting line and the number of lines to read, which is helpful when working with large files that don't need to be loaded entirely into memory.
## Installation
Install the crewai_tools package to use the FileReadTool in your projects:
```shell
pip install 'crewai[tools]'
```
## Example
To get started with the FileReadTool:
```python
from crewai_tools import FileReadTool
# Initialize the tool so the agent can read any file whose path it knows or learns during execution
file_read_tool = FileReadTool()
# OR
# Initialize the tool with a specific file path, so the agent can only read the content of the specified file
file_read_tool = FileReadTool(file_path='path/to/your/file.txt')
# Read a specific chunk of the file (lines 100-149)
partial_content = file_read_tool.run(file_path='path/to/your/file.txt', start_line=100, line_count=50)
```
## Arguments
- `file_path`: The path to the file you want to read. It accepts both absolute and relative paths. Ensure the file exists and you have the necessary permissions to access it.
- `start_line`: (Optional) The line number to start reading from (1-indexed). Defaults to 1 (the first line).
- `line_count`: (Optional) The number of lines to read. If not provided, reads from the start_line to the end of the file.

View File

@@ -0,0 +1,97 @@
from typing import Any, Optional, Type
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class FileReadToolSchema(BaseModel):
"""Input for FileReadTool."""
file_path: str = Field(..., description="Mandatory file full path to read the file")
start_line: Optional[int] = Field(1, description="Line number to start reading from (1-indexed)")
line_count: Optional[int] = Field(None, description="Number of lines to read. If None, reads the entire file")
class FileReadTool(BaseTool):
"""A tool for reading file contents.
This tool inherits its schema handling from BaseTool to avoid recursive schema
definition issues. The args_schema is set to FileReadToolSchema which defines
the required file_path parameter. The schema should not be overridden in the
constructor as it would break the inheritance chain and cause infinite loops.
The tool supports two ways of specifying the file path:
1. At construction time via the file_path parameter
2. At runtime via the file_path parameter in the tool's input
Args:
file_path (Optional[str]): Path to the file to be read. If provided,
this becomes the default file path for the tool.
**kwargs: Additional keyword arguments passed to BaseTool.
Example:
>>> tool = FileReadTool(file_path="/path/to/file.txt")
>>> content = tool.run() # Reads /path/to/file.txt
>>> content = tool.run(file_path="/path/to/other.txt") # Reads other.txt
>>> content = tool.run(file_path="/path/to/file.txt", start_line=100, line_count=50) # Reads lines 100-149
"""
name: str = "Read a file's content"
description: str = "A tool that reads the content of a file. To use this tool, provide a 'file_path' parameter with the path to the file you want to read. Optionally, provide 'start_line' to start reading from a specific line and 'line_count' to limit the number of lines read."
args_schema: Type[BaseModel] = FileReadToolSchema
file_path: Optional[str] = None
def __init__(self, file_path: Optional[str] = None, **kwargs: Any) -> None:
"""Initialize the FileReadTool.
Args:
file_path (Optional[str]): Path to the file to be read. If provided,
this becomes the default file path for the tool.
**kwargs: Additional keyword arguments passed to BaseTool.
"""
if file_path is not None:
kwargs["description"] = (
f"A tool that reads file content. The default file is {file_path}, but you can provide a different 'file_path' parameter to read another file. You can also specify 'start_line' and 'line_count' to read specific parts of the file."
)
super().__init__(**kwargs)
self.file_path = file_path
def _run(
self,
file_path: Optional[str] = None,
start_line: Optional[int] = 1,
line_count: Optional[int] = None,
) -> str:
file_path = file_path or self.file_path
        start_line = start_line or 1
if file_path is None:
return (
"Error: No file path provided. Please provide a file path either in the constructor or as an argument."
)
try:
with open(file_path, "r") as file:
if start_line == 1 and line_count is None:
return file.read()
start_idx = max(start_line - 1, 0)
selected_lines = [
line
for i, line in enumerate(file)
if i >= start_idx and (line_count is None or i < start_idx + line_count)
]
if not selected_lines and start_idx > 0:
return f"Error: Start line {start_line} exceeds the number of lines in the file."
return "".join(selected_lines)
except FileNotFoundError:
return f"Error: File not found at path: {file_path}"
except PermissionError:
return f"Error: Permission denied when trying to read file: {file_path}"
except Exception as e:
return f"Error: Failed to read file {file_path}. {str(e)}"

View File

@@ -0,0 +1,35 @@
# FileWriterTool Documentation
## Description
The `FileWriterTool` is a component of the crewai_tools package, designed to simplify the process of writing content to files. It is particularly useful in scenarios such as generating reports, saving logs, creating configuration files, and more. This tool supports creating new directories if they don't exist, making it easier to organize your output.
## Installation
Install the crewai_tools package to use the `FileWriterTool` in your projects:
```shell
pip install 'crewai[tools]'
```
## Example
To get started with the `FileWriterTool`:
```python
from crewai_tools import FileWriterTool
# Initialize the tool
file_writer_tool = FileWriterTool()
# Write content to a file in a specified directory (keyword arguments are required)
result = file_writer_tool._run(
    filename='example.txt',
    content='This is a test content.',
    directory='test_directory',
    overwrite=True,
)
print(result)
```
## Arguments
- `filename`: The name of the file you want to create or overwrite.
- `content`: The content to write into the file.
- `directory` (optional): The path to the directory where the file will be created. Defaults to the current directory (`.`). If the directory does not exist, it will be created.
- `overwrite` (optional): Whether to overwrite the file if it already exists. Accepts a boolean or a truthy/falsy string such as `'true'` or `'false'`. Defaults to `False`; see the sketch below.
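A minimal sketch of the overwrite protection, using the same illustrative paths as above:
```python
# A second write to the same file with overwrite disabled is rejected.
file_writer_tool._run(filename='example.txt', content='first draft', directory='test_directory', overwrite=True)
result = file_writer_tool._run(filename='example.txt', content='second draft', directory='test_directory', overwrite=False)
print(result)  # File test_directory/example.txt already exists and overwrite option was not passed.
```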
## Conclusion
By integrating the `FileWriterTool` into your crews, agents can write content to files and create directories as needed. This is essential for tasks that require saving output data or building structured file systems. Following the setup and usage guidelines above makes incorporating the tool straightforward.

View File

@@ -0,0 +1,62 @@
import os
from typing import Any, Optional, Type
from crewai.tools import BaseTool
from pydantic import BaseModel
def strtobool(val) -> bool:
if isinstance(val, bool):
return val
val = val.lower()
if val in ("y", "yes", "t", "true", "on", "1"):
return True
elif val in ("n", "no", "f", "false", "off", "0"):
return False
else:
raise ValueError(f"invalid value to cast to bool: {val!r}")
class FileWriterToolInput(BaseModel):
filename: str
directory: Optional[str] = "./"
overwrite: str | bool = False
content: str
class FileWriterTool(BaseTool):
name: str = "File Writer Tool"
description: str = (
"A tool to write content to a specified file. Accepts filename, content, and optionally a directory path and overwrite flag as input."
)
args_schema: Type[BaseModel] = FileWriterToolInput
def _run(self, **kwargs: Any) -> str:
try:
# Create the directory if it doesn't exist
if kwargs.get("directory") and not os.path.exists(kwargs["directory"]):
os.makedirs(kwargs["directory"])
# Construct the full path
filepath = os.path.join(kwargs.get("directory") or "", kwargs["filename"])
            # Normalize overwrite to a boolean, defaulting to False when omitted
            kwargs["overwrite"] = strtobool(kwargs.get("overwrite", False))
# Check if file exists and overwrite is not allowed
if os.path.exists(filepath) and not kwargs["overwrite"]:
return f"File {filepath} already exists and overwrite option was not passed."
# Write content to the file
mode = "w" if kwargs["overwrite"] else "x"
with open(filepath, mode) as file:
file.write(kwargs["content"])
return f"Content successfully written to {filepath}"
except FileExistsError:
return (
f"File {filepath} already exists and overwrite option was not passed."
)
except KeyError as e:
return f"An error occurred while accessing key: {str(e)}"
except Exception as e:
return f"An error occurred while writing to the file: {str(e)}"

View File

@@ -0,0 +1,138 @@
import os
import shutil
import tempfile
import pytest
from crewai_tools.tools.file_writer_tool.file_writer_tool import FileWriterTool
@pytest.fixture
def tool():
return FileWriterTool()
@pytest.fixture
def temp_env():
temp_dir = tempfile.mkdtemp()
test_file = "test.txt"
test_content = "Hello, World!"
yield {
"temp_dir": temp_dir,
"test_file": test_file,
"test_content": test_content,
}
shutil.rmtree(temp_dir, ignore_errors=True)
def get_test_path(filename, directory):
return os.path.join(directory, filename)
def read_file(path):
with open(path, "r") as f:
return f.read()
def test_basic_file_write(tool, temp_env):
result = tool._run(
filename=temp_env["test_file"],
directory=temp_env["temp_dir"],
content=temp_env["test_content"],
overwrite=True,
)
path = get_test_path(temp_env["test_file"], temp_env["temp_dir"])
assert os.path.exists(path)
assert read_file(path) == temp_env["test_content"]
assert "successfully written" in result
def test_directory_creation(tool, temp_env):
new_dir = os.path.join(temp_env["temp_dir"], "nested_dir")
result = tool._run(
filename=temp_env["test_file"],
directory=new_dir,
content=temp_env["test_content"],
overwrite=True,
)
path = get_test_path(temp_env["test_file"], new_dir)
assert os.path.exists(new_dir)
assert os.path.exists(path)
assert "successfully written" in result
@pytest.mark.parametrize(
"overwrite",
["y", "yes", "t", "true", "on", "1", True],
)
def test_overwrite_true(tool, temp_env, overwrite):
path = get_test_path(temp_env["test_file"], temp_env["temp_dir"])
with open(path, "w") as f:
f.write("Original content")
result = tool._run(
filename=temp_env["test_file"],
directory=temp_env["temp_dir"],
content="New content",
overwrite=overwrite,
)
assert read_file(path) == "New content"
assert "successfully written" in result
def test_invalid_overwrite_value(tool, temp_env):
result = tool._run(
filename=temp_env["test_file"],
directory=temp_env["temp_dir"],
content=temp_env["test_content"],
overwrite="invalid",
)
assert "invalid value" in result
def test_missing_required_fields(tool, temp_env):
result = tool._run(
directory=temp_env["temp_dir"],
content=temp_env["test_content"],
overwrite=True,
)
assert "An error occurred while accessing key: 'filename'" in result
def test_empty_content(tool, temp_env):
result = tool._run(
filename=temp_env["test_file"],
directory=temp_env["temp_dir"],
content="",
overwrite=True,
)
path = get_test_path(temp_env["test_file"], temp_env["temp_dir"])
assert os.path.exists(path)
assert read_file(path) == ""
assert "successfully written" in result
@pytest.mark.parametrize(
"overwrite",
["n", "no", "f", "false", "off", "0", False],
)
def test_file_exists_error_handling(tool, temp_env, overwrite):
path = get_test_path(temp_env["test_file"], temp_env["temp_dir"])
with open(path, "w") as f:
f.write("Pre-existing content")
result = tool._run(
filename=temp_env["test_file"],
directory=temp_env["temp_dir"],
content="Should not be written",
overwrite=overwrite,
)
assert "already exists and overwrite option was not passed" in result
assert read_file(path) == "Pre-existing content"

View File

@@ -0,0 +1,119 @@
# 📦 FileCompressorTool
The **FileCompressorTool** is a utility for compressing individual files or entire directories (including nested subdirectories) into different archive formats, such as `.zip` or `.tar` (including `.tar.gz`, `.tar.bz2`, and `.tar.xz`). This tool is useful for archiving logs, documents, datasets, or backups in a compact format, and ensures flexibility in how the archives are created.
---
## Description
This tool:
- Accepts a **file or directory** as input.
- Supports **recursive compression** of subdirectories.
- Lets you define a **custom output archive path** or defaults to the current directory.
- Handles **overwrite protection** to avoid unintentional data loss.
- Supports multiple compression formats: `.zip`, `.tar`, `.tar.gz`, `.tar.bz2`, and `.tar.xz`.
---
## Arguments
| Argument | Type | Required | Description |
|---------------|-----------|----------|-----------------------------------------------------------------------------|
| `input_path` | `str` | ✅ | Path to the file or directory you want to compress. |
| `output_path` | `str` | ❌ | Optional path for the resulting archive file. Defaults to `./<name>.<format>`. |
| `overwrite` | `bool` | ❌ | Whether to overwrite an existing archive file. Defaults to `False`. |
| `format` | `str` | ❌ | Compression format to use. Can be one of `zip`, `tar`, `tar.gz`, `tar.bz2`, `tar.xz`. Defaults to `zip`. |
---
## Usage Example
```python
from crewai_tools import FileCompressorTool
# Initialize the tool
tool = FileCompressorTool()
# Compress a directory with subdirectories and files into a zip archive
result = tool._run(
input_path="./data/project_docs", # Folder containing subfolders & files
output_path="./output/project_docs.zip", # Optional output path (defaults to zip format)
overwrite=True # Allow overwriting if file exists
)
print(result)
# Example output: Successfully compressed './data/project_docs' into './output/project_docs.zip'
```
---
## Example Scenarios
### Compress a single file into a zip archive:
```python
# Compress a single file into a zip archive
result = tool._run(input_path="report.pdf")
# Example output: Successfully compressed 'report.pdf' into './report.zip'
```
### Compress a directory with nested folders into a zip archive:
```python
# Compress a directory containing nested subdirectories and files
result = tool._run(input_path="./my_data", overwrite=True)
# Example output: Successfully compressed 'my_data' into './my_data.zip'
```
### Use a custom output path with a zip archive:
```python
# Compress a directory and specify a custom zip output location
result = tool._run(input_path="./my_data", output_path="./backups/my_data_backup.zip", overwrite=True)
# Example output: Successfully compressed 'my_data' into './backups/my_data_backup.zip'
```
### Prevent overwriting an existing zip file:
```python
# Try to compress a directory without overwriting an existing zip file
result = tool._run(input_path="./my_data", output_path="./backups/my_data_backup.zip", overwrite=False)
# Example output: Output zip './backups/my_data_backup.zip' already exists and overwrite is set to False.
```
### Compress into a tar archive:
```python
# Compress a directory into a tar archive
result = tool._run(input_path="./my_data", format="tar", overwrite=True)
# Example output: Successfully compressed 'my_data' into './my_data.tar'
```
### Compress into a tar.gz archive:
```python
# Compress a directory into a tar.gz archive
result = tool._run(input_path="./my_data", format="tar.gz", overwrite=True)
# Example output: Successfully compressed 'my_data' into './my_data.tar.gz'
```
### Compress into a tar.bz2 archive:
```python
# Compress a directory into a tar.bz2 archive
result = tool._run(input_path="./my_data", format="tar.bz2", overwrite=True)
# Example output: Successfully compressed 'my_data' into './my_data.tar.bz2'
```
### Compress into a tar.xz archive:
```python
# Compress a directory into a tar.xz archive
result = tool._run(input_path="./my_data", format="tar.xz", overwrite=True)
# Example output: Successfully compressed 'my_data' into './my_data.tar.xz'
```
---
## Error Handling and Validations
- **File Extension Validation**: The tool ensures that the output file extension matches the selected format (e.g., `.zip` for `zip` format, `.tar` for `tar` format, etc.).
- **File/Directory Existence**: If the input path does not exist, an error message will be returned.
- **Overwrite Protection**: If a file already exists at the output path, the tool checks the `overwrite` flag before proceeding. If `overwrite=False`, it prevents overwriting the existing file.
---
This tool provides a flexible and robust way to handle file and directory compression across multiple formats for efficient storage and backups.

View File

@@ -0,0 +1,117 @@
import os
import zipfile
import tarfile
from typing import Type, Optional
from pydantic import BaseModel, Field
from crewai.tools import BaseTool
class FileCompressorToolInput(BaseModel):
"""Input schema for FileCompressorTool."""
input_path: str = Field(..., description="Path to the file or directory to compress.")
output_path: Optional[str] = Field(default=None, description="Optional output archive filename.")
overwrite: bool = Field(default=False, description="Whether to overwrite the archive if it already exists.")
format: str = Field(default="zip", description="Compression format ('zip', 'tar', 'tar.gz', 'tar.bz2', 'tar.xz').")
class FileCompressorTool(BaseTool):
name: str = "File Compressor Tool"
    description: str = (
        "Compresses a file or directory into an archive "
        "(supported formats: zip, tar, tar.gz, tar.bz2, tar.xz). "
        "Useful for archiving logs, documents, or backups."
    )
args_schema: Type[BaseModel] = FileCompressorToolInput
def _run(self, input_path: str, output_path: Optional[str] = None, overwrite: bool = False, format: str = "zip") -> str:
if not os.path.exists(input_path):
return f"Input path '{input_path}' does not exist."
if not output_path:
output_path = self._generate_output_path(input_path, format)
FORMAT_EXTENSION = {
"zip": ".zip",
"tar": ".tar",
"tar.gz": ".tar.gz",
"tar.bz2": ".tar.bz2",
"tar.xz": ".tar.xz"
}
if format not in FORMAT_EXTENSION:
return f"Compression format '{format}' is not supported. Allowed formats: {', '.join(FORMAT_EXTENSION.keys())}"
elif not output_path.endswith(FORMAT_EXTENSION[format]):
return f"Error: If '{format}' format is chosen, output file must have a '{FORMAT_EXTENSION[format]}' extension."
if not self._prepare_output(output_path, overwrite):
return f"Output '{output_path}' already exists and overwrite is set to False."
try:
format_compression = {
"zip": self._compress_zip,
"tar": self._compress_tar,
"tar.gz": self._compress_tar,
"tar.bz2": self._compress_tar,
"tar.xz": self._compress_tar
}
if format == "zip":
format_compression[format](input_path, output_path)
else:
format_compression[format](input_path, output_path, format)
return f"Successfully compressed '{input_path}' into '{output_path}'"
except FileNotFoundError:
return f"Error: File not found at path: {input_path}"
except PermissionError:
return f"Error: Permission denied when accessing '{input_path}' or writing '{output_path}'"
except Exception as e:
return f"An unexpected error occurred during compression: {str(e)}"
def _generate_output_path(self, input_path: str, format: str) -> str:
"""Generates output path based on input path and format."""
if os.path.isfile(input_path):
base_name = os.path.splitext(os.path.basename(input_path))[0] # Remove extension
else:
base_name = os.path.basename(os.path.normpath(input_path)) # Directory name
return os.path.join(os.getcwd(), f"{base_name}.{format}")
def _prepare_output(self, output_path: str, overwrite: bool) -> bool:
"""Ensures output path is ready for writing."""
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir)
if os.path.exists(output_path) and not overwrite:
return False
return True
def _compress_zip(self, input_path: str, output_path: str):
"""Compresses input into a zip archive."""
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
if os.path.isfile(input_path):
zipf.write(input_path, os.path.basename(input_path))
else:
for root, _, files in os.walk(input_path):
for file in files:
full_path = os.path.join(root, file)
arcname = os.path.relpath(full_path, start=input_path)
zipf.write(full_path, arcname)
def _compress_tar(self, input_path: str, output_path: str, format: str):
"""Compresses input into a tar archive with the given format."""
format_mode = {
"tar": "w",
"tar.gz": "w:gz",
"tar.bz2": "w:bz2",
"tar.xz": "w:xz"
}
if format not in format_mode:
raise ValueError(f"Unsupported tar format: {format}")
mode = format_mode[format]
with tarfile.open(output_path, mode) as tarf:
arcname = os.path.basename(input_path)
tarf.add(input_path, arcname=arcname)

View File

@@ -0,0 +1,93 @@
import os
import pytest
from crewai_tools.tools.files_compressor_tool import FileCompressorTool
from unittest.mock import patch, MagicMock
@pytest.fixture
def tool():
return FileCompressorTool()
@patch("os.path.exists", return_value=False)
def test_input_path_does_not_exist(mock_exists, tool):
result = tool._run("nonexistent_path")
assert "does not exist" in result
@patch("os.path.exists", return_value=True)
@patch("os.getcwd", return_value="/mocked/cwd")
@patch.object(FileCompressorTool, "_compress_zip") # Mock actual compression
@patch.object(FileCompressorTool, "_prepare_output", return_value=True)
def test_generate_output_path_default(mock_prepare, mock_compress, mock_cwd, mock_exists, tool):
result = tool._run(input_path="mydir", format="zip")
assert "Successfully compressed" in result
mock_compress.assert_called_once()
@patch("os.path.exists", return_value=True)
@patch.object(FileCompressorTool, "_compress_zip")
@patch.object(FileCompressorTool, "_prepare_output", return_value=True)
def test_zip_compression(mock_prepare, mock_compress, mock_exists, tool):
result = tool._run(input_path="some/path", output_path="archive.zip", format="zip", overwrite=True)
assert "Successfully compressed" in result
mock_compress.assert_called_once()
@patch("os.path.exists", return_value=True)
@patch.object(FileCompressorTool, "_compress_tar")
@patch.object(FileCompressorTool, "_prepare_output", return_value=True)
def test_tar_gz_compression(mock_prepare, mock_compress, mock_exists, tool):
result = tool._run(input_path="some/path", output_path="archive.tar.gz", format="tar.gz", overwrite=True)
assert "Successfully compressed" in result
mock_compress.assert_called_once()
@pytest.mark.parametrize("format", ["tar", "tar.bz2", "tar.xz"])
@patch("os.path.exists", return_value=True)
@patch.object(FileCompressorTool, "_compress_tar")
@patch.object(FileCompressorTool, "_prepare_output", return_value=True)
def test_other_tar_formats(mock_prepare, mock_compress, mock_exists, format, tool):
result = tool._run(input_path="path/to/input", output_path=f"archive.{format}", format=format, overwrite=True)
assert "Successfully compressed" in result
mock_compress.assert_called_once()
@pytest.mark.parametrize("format", ["rar", "7z"])
@patch("os.path.exists", return_value=True) #Ensure input_path exists
def test_unsupported_format(_, tool, format):
result = tool._run(input_path="some/path", output_path=f"archive.{format}", format=format)
assert "not supported" in result
@patch("os.path.exists", return_value=True)
def test_extension_mismatch(_ , tool):
result = tool._run(input_path="some/path", output_path="archive.zip", format="tar.gz")
assert "must have a '.tar.gz' extension" in result
@patch("os.path.exists", return_value=True)
@patch("os.path.isfile", return_value=True)
@patch("os.path.exists", return_value=True)
def test_existing_output_no_overwrite(_, __, ___, tool):
result = tool._run(input_path="some/path", output_path="archive.zip", format="zip", overwrite=False)
assert "overwrite is set to False" in result
@patch("os.path.exists", return_value=True)
@patch("zipfile.ZipFile", side_effect=PermissionError)
def test_permission_error(mock_zip, _, tool):
result = tool._run(input_path="file.txt", output_path="file.zip", format="zip", overwrite=True)
assert "Permission denied" in result
@patch("os.path.exists", return_value=True)
@patch("zipfile.ZipFile", side_effect=FileNotFoundError)
def test_file_not_found_during_zip(mock_zip, _, tool):
result = tool._run(input_path="file.txt", output_path="file.zip", format="zip", overwrite=True)
assert "File not found" in result
@patch("os.path.exists", return_value=True)
@patch("zipfile.ZipFile", side_effect=Exception("Unexpected"))
def test_general_exception_during_zip(mock_zip, _, tool):
result = tool._run(input_path="file.txt", output_path="file.zip", format="zip", overwrite=True)
assert "unexpected error" in result
# Test: Output directory is created when missing
@patch("os.makedirs")
@patch("os.path.exists", return_value=False)
def test_prepare_output_makes_dir(mock_exists, mock_makedirs):
tool = FileCompressorTool()
result = tool._prepare_output("some/missing/path/file.zip", overwrite=True)
assert result is True
mock_makedirs.assert_called_once()

View File

@@ -0,0 +1,60 @@
# FirecrawlCrawlWebsiteTool
## Description
[Firecrawl](https://firecrawl.dev) is a platform for crawling websites and converting them into clean markdown or structured data.
## Version Compatibility
This implementation is compatible with Firecrawl API v1.
## Installation
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
```
pip install firecrawl-py 'crewai[tools]'
```
## Example
Utilize the FirecrawlCrawlWebsiteTool as follows to allow your agent to crawl websites:
```python
from crewai_tools import FirecrawlCrawlWebsiteTool
from firecrawl import ScrapeOptions
tool = FirecrawlCrawlWebsiteTool(
config={
"limit": 100,
"scrape_options": ScrapeOptions(formats=["markdown", "html"]),
"poll_interval": 30,
}
)
tool.run(url="firecrawl.dev")
```
## Arguments
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
- `config`: Optional. Contains Firecrawl API parameters. The default configuration is:
```python
from firecrawl import ScrapeOptions
{
"max_depth": 2,
"ignore_sitemap": True,
"limit": 100,
"allow_backward_links": False,
"allow_external_links": False,
"scrape_options": ScrapeOptions(
formats=["markdown", "screenshot", "links"],
only_main_content=True,
timeout=30000,
),
}
```

View File

@@ -0,0 +1,115 @@
from typing import Any, Optional, Type, List, TYPE_CHECKING
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
if TYPE_CHECKING:
from firecrawl import FirecrawlApp
try:
from firecrawl import FirecrawlApp
FIRECRAWL_AVAILABLE = True
except ImportError:
FIRECRAWL_AVAILABLE = False
class FirecrawlCrawlWebsiteToolSchema(BaseModel):
url: str = Field(description="Website URL")
class FirecrawlCrawlWebsiteTool(BaseTool):
"""
Tool for crawling websites using Firecrawl. To run this tool, you need to have a Firecrawl API key.
Args:
api_key (str): Your Firecrawl API key.
config (dict): Optional. It contains Firecrawl API parameters.
Default configuration options:
max_depth (int): Maximum depth to crawl. Default: 2
ignore_sitemap (bool): Whether to ignore sitemap. Default: True
        limit (int): Maximum number of pages to crawl. Default: 10
allow_backward_links (bool): Allow crawling backward links. Default: False
allow_external_links (bool): Allow crawling external links. Default: False
scrape_options (ScrapeOptions): Options for scraping content
- formats (list[str]): Content formats to return. Default: ["markdown", "screenshot", "links"]
- only_main_content (bool): Only return main content. Default: True
            - timeout (int): Timeout in milliseconds. Default: 10000
"""
model_config = ConfigDict(
arbitrary_types_allowed=True, validate_assignment=True, frozen=False
)
name: str = "Firecrawl web crawl tool"
description: str = "Crawl webpages using Firecrawl and return the contents"
args_schema: Type[BaseModel] = FirecrawlCrawlWebsiteToolSchema
api_key: Optional[str] = None
config: Optional[dict[str, Any]] = Field(
default_factory=lambda: {
"maxDepth": 2,
"ignoreSitemap": True,
"limit": 10,
"allowBackwardLinks": False,
"allowExternalLinks": False,
"scrapeOptions": {
"formats": ["markdown", "screenshot", "links"],
"onlyMainContent": True,
"timeout": 10000,
},
}
)
_firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
package_dependencies: List[str] = ["firecrawl-py"]
env_vars: List[EnvVar] = [
EnvVar(name="FIRECRAWL_API_KEY", description="API key for Firecrawl services", required=True),
]
def __init__(self, api_key: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
self.api_key = api_key
self._initialize_firecrawl()
def _initialize_firecrawl(self) -> None:
try:
from firecrawl import FirecrawlApp # type: ignore
self._firecrawl = FirecrawlApp(api_key=self.api_key)
except ImportError:
import click
if click.confirm(
"You are missing the 'firecrawl-py' package. Would you like to install it?"
):
import subprocess
try:
subprocess.run(["uv", "add", "firecrawl-py"], check=True)
from firecrawl import FirecrawlApp
self._firecrawl = FirecrawlApp(api_key=self.api_key)
except subprocess.CalledProcessError:
raise ImportError("Failed to install firecrawl-py package")
else:
raise ImportError(
"`firecrawl-py` package not found, please run `uv add firecrawl-py`"
)
def _run(self, url: str):
if not self._firecrawl:
raise RuntimeError("FirecrawlApp not properly initialized")
return self._firecrawl.crawl_url(url, poll_interval=2, params=self.config)
try:
from firecrawl import FirecrawlApp
# Only rebuild if the class hasn't been initialized yet
if not hasattr(FirecrawlCrawlWebsiteTool, "_model_rebuilt"):
FirecrawlCrawlWebsiteTool.model_rebuild()
FirecrawlCrawlWebsiteTool._model_rebuilt = True
except ImportError:
"""
When this tool is not used, then exception can be ignored.
"""

View File

@@ -0,0 +1,46 @@
# FirecrawlScrapeWebsiteTool
## Description
[Firecrawl](https://firecrawl.dev) is a platform for crawling websites and converting them into clean markdown or structured data.
## Installation
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
```
pip install firecrawl-py 'crewai[tools]'
```
## Example
Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to scrape websites:
```python
from crewai_tools import FirecrawlScrapeWebsiteTool
tool = FirecrawlScrapeWebsiteTool(config={"formats": ['html']})
tool.run(url="firecrawl.dev")
```
## Arguments
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
- `config`: Optional. Contains Firecrawl API parameters. The default configuration is:
```python
{
"formats": ["markdown"],
"only_main_content": True,
"include_tags": [],
"exclude_tags": [],
"headers": {},
"wait_for": 0,
}
```

View File

@@ -0,0 +1,103 @@
from typing import Any, Optional, Type, Dict, List, TYPE_CHECKING
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
if TYPE_CHECKING:
from firecrawl import FirecrawlApp
try:
from firecrawl import FirecrawlApp
FIRECRAWL_AVAILABLE = True
except ImportError:
FIRECRAWL_AVAILABLE = False
class FirecrawlScrapeWebsiteToolSchema(BaseModel):
url: str = Field(description="Website URL")
class FirecrawlScrapeWebsiteTool(BaseTool):
"""
Tool for scraping webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
Args:
api_key (str): Your Firecrawl API key.
config (dict): Optional. It contains Firecrawl API parameters.
Default configuration options:
formats (list[str]): Content formats to return. Default: ["markdown"]
onlyMainContent (bool): Only return main content. Default: True
includeTags (list[str]): Tags to include. Default: []
excludeTags (list[str]): Tags to exclude. Default: []
headers (dict): Headers to include. Default: {}
waitFor (int): Time to wait for page to load in ms. Default: 0
json_options (dict): Options for JSON extraction. Default: None
"""
model_config = ConfigDict(
arbitrary_types_allowed=True, validate_assignment=True, frozen=False
)
name: str = "Firecrawl web scrape tool"
description: str = "Scrape webpages using Firecrawl and return the contents"
args_schema: Type[BaseModel] = FirecrawlScrapeWebsiteToolSchema
api_key: Optional[str] = None
config: Dict[str, Any] = Field(
default_factory=lambda: {
"formats": ["markdown"],
"onlyMainContent": True,
"includeTags": [],
"excludeTags": [],
"headers": {},
"waitFor": 0,
}
)
_firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
package_dependencies: List[str] = ["firecrawl-py"]
env_vars: List[EnvVar] = [
EnvVar(name="FIRECRAWL_API_KEY", description="API key for Firecrawl services", required=True),
]
def __init__(self, api_key: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
try:
from firecrawl import FirecrawlApp # type: ignore
except ImportError:
import click
if click.confirm(
"You are missing the 'firecrawl-py' package. Would you like to install it?"
):
import subprocess
subprocess.run(["uv", "add", "firecrawl-py"], check=True)
from firecrawl import (
FirecrawlApp,
)
else:
raise ImportError(
"`firecrawl-py` package not found, please run `uv add firecrawl-py`"
)
self._firecrawl = FirecrawlApp(api_key=api_key)
def _run(self, url: str):
if not self._firecrawl:
raise RuntimeError("FirecrawlApp not properly initialized")
return self._firecrawl.scrape_url(url, params=self.config)
try:
from firecrawl import FirecrawlApp
# Must rebuild model after class is defined
if not hasattr(FirecrawlScrapeWebsiteTool, "_model_rebuilt"):
FirecrawlScrapeWebsiteTool.model_rebuild()
FirecrawlScrapeWebsiteTool._model_rebuilt = True
except ImportError:
"""
When this tool is not used, then exception can be ignored.
"""

View File

@@ -0,0 +1,44 @@
# FirecrawlSearchTool
## Description
[Firecrawl](https://firecrawl.dev) is a platform for crawling websites and converting them into clean markdown or structured data.
## Installation
- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
```
pip install firecrawl-py 'crewai[tools]'
```
## Example
Utilize the FirecrawlSearchTool as follows to allow your agent to search the web:
```python
from crewai_tools import FirecrawlSearchTool
tool = FirecrawlSearchTool(config={"limit": 5})
tool.run(query="firecrawl web scraping")
```
## Arguments
- `api_key`: Optional. Specifies the Firecrawl API key. Defaults to the `FIRECRAWL_API_KEY` environment variable.
- `config`: Optional. Contains Firecrawl API parameters. The default configuration is:
```python
{
"limit": 5,
"tbs": None,
"lang": "en",
"country": "us",
"location": None,
"timeout": 60000,
}
```

View File

@@ -0,0 +1,119 @@
from typing import TYPE_CHECKING, Any, Dict, Optional, Type, List
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
if TYPE_CHECKING:
from firecrawl import FirecrawlApp
try:
from firecrawl import FirecrawlApp
FIRECRAWL_AVAILABLE = True
except ImportError:
FIRECRAWL_AVAILABLE = False
class FirecrawlSearchToolSchema(BaseModel):
query: str = Field(description="Search query")
class FirecrawlSearchTool(BaseTool):
"""
Tool for searching webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
Args:
api_key (str): Your Firecrawl API key.
config (dict): Optional. It contains Firecrawl API parameters.
Default configuration options:
limit (int): Maximum number of pages to crawl. Default: 5
        tbs (str): Time-based search filter. Default: None
lang (str): Language. Default: "en"
country (str): Country. Default: "us"
location (str): Location. Default: None
timeout (int): Timeout in milliseconds. Default: 60000
"""
model_config = ConfigDict(
arbitrary_types_allowed=True, validate_assignment=True, frozen=False
)
name: str = "Firecrawl web search tool"
description: str = "Search webpages using Firecrawl and return the results"
args_schema: Type[BaseModel] = FirecrawlSearchToolSchema
api_key: Optional[str] = None
config: Optional[dict[str, Any]] = Field(
default_factory=lambda: {
"limit": 5,
"tbs": None,
"lang": "en",
"country": "us",
"location": None,
"timeout": 60000,
}
)
_firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
package_dependencies: List[str] = ["firecrawl-py"]
env_vars: List[EnvVar] = [
EnvVar(name="FIRECRAWL_API_KEY", description="API key for Firecrawl services", required=True),
]
def __init__(self, api_key: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
self.api_key = api_key
self._initialize_firecrawl()
def _initialize_firecrawl(self) -> None:
try:
from firecrawl import FirecrawlApp # type: ignore
self._firecrawl = FirecrawlApp(api_key=self.api_key)
except ImportError:
import click
if click.confirm(
"You are missing the 'firecrawl-py' package. Would you like to install it?"
):
import subprocess
try:
subprocess.run(["uv", "add", "firecrawl-py"], check=True)
from firecrawl import FirecrawlApp
self._firecrawl = FirecrawlApp(api_key=self.api_key)
except subprocess.CalledProcessError:
raise ImportError("Failed to install firecrawl-py package")
else:
raise ImportError(
"`firecrawl-py` package not found, please run `uv add firecrawl-py`"
)
def _run(
self,
query: str,
) -> Any:
if not self._firecrawl:
raise RuntimeError("FirecrawlApp not properly initialized")
return self._firecrawl.search(
query=query,
params=self.config,
)
try:
from firecrawl import FirecrawlApp # type: ignore
# Only rebuild if the class hasn't been initialized yet
if not hasattr(FirecrawlSearchTool, "_model_rebuilt"):
FirecrawlSearchTool.model_rebuild()
FirecrawlSearchTool._model_rebuilt = True
except ImportError:
"""
When this tool is not used, then exception can be ignored.
"""
pass

View File

@@ -0,0 +1,50 @@
# GenerateCrewaiAutomationTool
## Description
The GenerateCrewaiAutomationTool integrates with CrewAI Studio API to generate complete CrewAI automations from natural language descriptions. It translates high-level requirements into functional CrewAI implementations and returns direct links to Studio projects.
## Environment Variables
Set your CrewAI Personal Access Token (CrewAI Enterprise > Settings > Account > Personal Access Token):
```bash
export CREWAI_PERSONAL_ACCESS_TOKEN="your_personal_access_token_here"
export CREWAI_PLUS_URL="https://app.crewai.com" # optional
```
## Example
```python
from crewai_tools import GenerateCrewaiAutomationTool
from crewai import Agent, Task, Crew
# Initialize tool
tool = GenerateCrewaiAutomationTool()
# Generate automation
result = tool.run(
prompt="Generate a CrewAI automation that scrapes websites and stores data in a database",
organization_id="org_123" # optional but recommended
)
print(result)
# Output: Generated CrewAI Studio project URL: https://studio.crewai.com/project/abc123
# Use with agent
agent = Agent(
role="Automation Architect",
goal="Generate CrewAI automations",
backstory="Expert at creating automated workflows",
tools=[tool]
)
task = Task(
description="Create a lead qualification automation",
agent=agent,
expected_output="Studio project URL"
)
crew = Crew(agents=[agent], tasks=[task])
result = crew.kickoff()
```

View File

@@ -0,0 +1,70 @@
import os
from typing import List, Optional, Type
import requests
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, Field
class GenerateCrewaiAutomationToolSchema(BaseModel):
prompt: str = Field(
description="The prompt to generate the CrewAI automation, e.g. 'Generate a CrewAI automation that will scrape the website and store the data in a database.'"
)
organization_id: Optional[str] = Field(
default=None,
description="The identifier for the CrewAI Enterprise organization. If not specified, a default organization will be used.",
)
class GenerateCrewaiAutomationTool(BaseTool):
name: str = "Generate CrewAI Automation"
description: str = (
"A tool that leverages CrewAI Studio's capabilities to automatically generate complete CrewAI "
"automations based on natural language descriptions. It translates high-level requirements into "
"functional CrewAI implementations."
)
args_schema: Type[BaseModel] = GenerateCrewaiAutomationToolSchema
crewai_enterprise_url: str = Field(
default_factory=lambda: os.getenv("CREWAI_PLUS_URL", "https://app.crewai.com"),
description="The base URL of CrewAI Enterprise. If not provided, it will be loaded from the environment variable CREWAI_PLUS_URL with default https://app.crewai.com.",
)
personal_access_token: Optional[str] = Field(
default_factory=lambda: os.getenv("CREWAI_PERSONAL_ACCESS_TOKEN"),
description="The user's Personal Access Token to access CrewAI Enterprise API. If not provided, it will be loaded from the environment variable CREWAI_PERSONAL_ACCESS_TOKEN.",
)
env_vars: List[EnvVar] = [
EnvVar(
name="CREWAI_PERSONAL_ACCESS_TOKEN",
description="Personal Access Token for CrewAI Enterprise API",
required=True,
),
EnvVar(
name="CREWAI_PLUS_URL",
description="Base URL for CrewAI Enterprise API",
required=False,
),
]
def _run(self, **kwargs) -> str:
input_data = GenerateCrewaiAutomationToolSchema(**kwargs)
response = requests.post(
f"{self.crewai_enterprise_url}/crewai_plus/api/v1/studio",
headers=self._get_headers(input_data.organization_id),
json={"prompt": input_data.prompt},
)
response.raise_for_status()
studio_project_url = response.json().get("url")
return f"Generated CrewAI Studio project URL: {studio_project_url}"
def _get_headers(self, organization_id: Optional[str] = None) -> dict:
headers = {
"Authorization": f"Bearer {self.personal_access_token}",
"Content-Type": "application/json",
"Accept": "application/json",
}
if organization_id:
headers["X-Crewai-Organization-Id"] = organization_id
return headers

View File

@@ -0,0 +1,67 @@
# GithubSearchTool
## Description
The GithubSearchTool is a Retrieval Augmented Generation (RAG) tool specifically designed for conducting semantic searches within GitHub repositories. Utilizing advanced semantic search capabilities, it sifts through code, pull requests, issues, and repositories, making it an essential tool for developers, researchers, or anyone in need of precise information from GitHub.
## Installation
To use the GithubSearchTool, first ensure the crewai_tools package is installed in your Python environment:
```shell
pip install 'crewai[tools]'
```
This command installs the necessary package to run the GithubSearchTool along with any other tools included in the crewai_tools package.
## Example
Here's how you can use the GithubSearchTool to perform semantic searches within a GitHub repository:
```python
from crewai_tools import GithubSearchTool
# Initialize the tool for semantic searches within a specific GitHub repository
tool = GithubSearchTool(
gh_token='...',
github_repo='https://github.com/example/repo',
content_types=['code', 'issue'] # Options: code, repo, pr, issue
)
# OR
# Initialize the tool without a specific repository, so the agent can search any repository it learns about during execution
tool = GithubSearchTool(
gh_token='...',
content_types=['code', 'issue'] # Options: code, repo, pr, issue
)
```
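A search is then issued through `run`; when the tool was created without a repository, the repository URL is passed along with the query (all values below are placeholders):
```python
results = tool.run(
    search_query="How is rate limiting implemented?",
    github_repo="https://github.com/example/repo",
    content_types=["code"],
)
print(results)
```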
## Arguments
- `gh_token` : The GitHub token used to authenticate the search. This is a mandatory field and allows the tool to access the GitHub API for conducting searches.
- `github_repo` : The URL of the GitHub repository where the search will be conducted. Required at run time if the tool was not initialized with a repository.
- `content_types` : Specifies the types of content to include in your search. You must provide a list of content types from the following options: `code` for searching within the code, `repo` for searching within the repository's general information, `pr` for searching within pull requests, and `issue` for searching within issues. This field is mandatory and allows tailoring the search to specific content types within the GitHub repository.
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = GithubSearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
# stream=true,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,88 @@
from typing import List, Optional, Type, Any
try:
from embedchain.loaders.github import GithubLoader
EMBEDCHAIN_AVAILABLE = True
except ImportError:
EMBEDCHAIN_AVAILABLE = False
from pydantic import BaseModel, Field, PrivateAttr
from ..rag.rag_tool import RagTool
class FixedGithubSearchToolSchema(BaseModel):
"""Input for GithubSearchTool."""
search_query: str = Field(
...,
description="Mandatory search query you want to use to search the github repo's content",
)
class GithubSearchToolSchema(FixedGithubSearchToolSchema):
"""Input for GithubSearchTool."""
    github_repo: str = Field(..., description="Mandatory GitHub repository URL you want to search")
    content_types: List[str] = Field(
        ...,
        description="Mandatory content types to include in the search, options: [code, repo, pr, issue]",
    )
class GithubSearchTool(RagTool):
name: str = "Search a github repo's content"
description: str = (
"A tool that can be used to semantic search a query from a github repo's content. This is not the GitHub API, but instead a tool that can provide semantic search capabilities."
)
summarize: bool = False
gh_token: str
args_schema: Type[BaseModel] = GithubSearchToolSchema
    content_types: List[str] = Field(
        default_factory=lambda: ["code", "repo", "pr", "issue"],
        description="Content types to include in the search, options: [code, repo, pr, issue]",
    )
_loader: Any | None = PrivateAttr(default=None)
def __init__(
self,
github_repo: Optional[str] = None,
content_types: Optional[List[str]] = None,
**kwargs,
):
if not EMBEDCHAIN_AVAILABLE:
raise ImportError("embedchain is not installed. Please install it with `pip install crewai-tools[embedchain]`")
super().__init__(**kwargs)
self._loader = GithubLoader(config={"token": self.gh_token})
if github_repo and content_types:
self.add(repo=github_repo, content_types=content_types)
self.description = f"A tool that can be used to semantic search a query the {github_repo} github repo's content. This is not the GitHub API, but instead a tool that can provide semantic search capabilities."
self.args_schema = FixedGithubSearchToolSchema
self._generate_description()
def add(
self,
repo: str,
content_types: Optional[List[str]] = None,
) -> None:
content_types = content_types or self.content_types
super().add(
f"repo:{repo} type:{','.join(content_types)}",
data_type="github",
loader=self._loader,
)
def _run(
self,
search_query: str,
github_repo: Optional[str] = None,
content_types: Optional[List[str]] = None,
) -> str:
if github_repo:
self.add(
repo=github_repo,
content_types=content_types,
)
return super()._run(query=search_query)

View File

@@ -0,0 +1,42 @@
# HyperbrowserLoadTool
## Description
[Hyperbrowser](https://hyperbrowser.ai) is a platform for running and scaling headless browsers. It lets you launch and manage browser sessions at scale and provides easy-to-use solutions for any web scraping need, such as scraping a single page or crawling an entire site.
Key Features:
- Instant Scalability - Spin up hundreds of browser sessions in seconds without infrastructure headaches
- Simple Integration - Works seamlessly with popular tools like Puppeteer and Playwright
- Powerful APIs - Easy to use APIs for scraping/crawling any site, and much more
- Bypass Anti-Bot Measures - Built-in stealth mode, ad blocking, automatic CAPTCHA solving, and rotating proxies
For more information about Hyperbrowser, please visit the [Hyperbrowser website](https://hyperbrowser.ai) or if you want to check out the docs, you can visit the [Hyperbrowser docs](https://docs.hyperbrowser.ai).
## Installation
- Head to [Hyperbrowser](https://app.hyperbrowser.ai/) to sign up and generate an API key. Once you've done this, set the `HYPERBROWSER_API_KEY` environment variable, or pass the key to the `HyperbrowserLoadTool` constructor.
- Install the [Hyperbrowser SDK](https://github.com/hyperbrowserai/python-sdk):
```
pip install hyperbrowser 'crewai[tools]'
```
## Example
Utilize the HyperbrowserLoadTool as follows to allow your agent to load websites:
```python
from crewai_tools import HyperbrowserLoadTool
tool = HyperbrowserLoadTool()
```
## Arguments
`__init__` arguments:
- `api_key`: Optional. Specifies Hyperbrowser API key. Defaults to the `HYPERBROWSER_API_KEY` environment variable.
`run` arguments:
- `url`: The base URL to start scraping or crawling from.
- `operation`: Optional. Specifies the operation to perform on the website. Either 'scrape' or 'crawl'. Defaults to 'scrape'.
- `params`: Optional. Specifies the params for the operation. For more information on the supported params, visit https://docs.hyperbrowser.ai/reference/sdks/python/scrape#start-scrape-job-and-wait or https://docs.hyperbrowser.ai/reference/sdks/python/crawl#start-crawl-job-and-wait.
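A minimal run-time sketch, assuming `HYPERBROWSER_API_KEY` is set and using the `scrape_options.formats` parameter described in the Hyperbrowser docs:
```python
from crewai_tools import HyperbrowserLoadTool

tool = HyperbrowserLoadTool()

# Scrape a single page and return its content as markdown
content = tool.run(
    url="https://example.com",
    operation="scrape",
    params={"scrape_options": {"formats": ["markdown"]}},
)
print(content)
```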

View File

@@ -0,0 +1,107 @@
import os
from typing import Any, Optional, Type, Dict, Literal, Union, List
from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, Field
class HyperbrowserLoadToolSchema(BaseModel):
url: str = Field(description="Website URL")
operation: Literal['scrape', 'crawl'] = Field(description="Operation to perform on the website. Either 'scrape' or 'crawl'")
params: Optional[Dict] = Field(description="Optional params for scrape or crawl. For more information on the supported params, visit https://docs.hyperbrowser.ai/reference/sdks/python/scrape#start-scrape-job-and-wait or https://docs.hyperbrowser.ai/reference/sdks/python/crawl#start-crawl-job-and-wait")
class HyperbrowserLoadTool(BaseTool):
"""HyperbrowserLoadTool.
Scrape or crawl web pages and load the contents with optional parameters for configuring content extraction.
Requires the `hyperbrowser` package.
Get your API Key from https://app.hyperbrowser.ai/
Args:
api_key: The Hyperbrowser API key, can be set as an environment variable `HYPERBROWSER_API_KEY` or passed directly
"""
name: str = "Hyperbrowser web load tool"
description: str = "Scrape or crawl a website using Hyperbrowser and return the contents in properly formatted markdown or html"
args_schema: Type[BaseModel] = HyperbrowserLoadToolSchema
api_key: Optional[str] = None
hyperbrowser: Optional[Any] = None
package_dependencies: List[str] = ["hyperbrowser"]
env_vars: List[EnvVar] = [
EnvVar(name="HYPERBROWSER_API_KEY", description="API key for Hyperbrowser services", required=False),
]
def __init__(self, api_key: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
        self.api_key = api_key or os.getenv('HYPERBROWSER_API_KEY')
        # Check the resolved key (constructor argument or environment variable), not just the argument
        if not self.api_key:
            raise ValueError(
                "`api_key` is required, please set the `HYPERBROWSER_API_KEY` environment variable or pass it directly"
            )
        try:
            from hyperbrowser import Hyperbrowser
        except ImportError:
            raise ImportError("`hyperbrowser` package not found, please run `pip install hyperbrowser`")
        self.hyperbrowser = Hyperbrowser(api_key=self.api_key)
def _prepare_params(self, params: Dict) -> Dict:
"""Prepare session and scrape options parameters."""
try:
from hyperbrowser.models.session import CreateSessionParams
from hyperbrowser.models.scrape import ScrapeOptions
except ImportError:
raise ImportError(
"`hyperbrowser` package not found, please run `pip install hyperbrowser`"
)
if "scrape_options" in params:
if "formats" in params["scrape_options"]:
formats = params["scrape_options"]["formats"]
if not all(fmt in ["markdown", "html"] for fmt in formats):
raise ValueError("formats can only contain 'markdown' or 'html'")
if "session_options" in params:
params["session_options"] = CreateSessionParams(**params["session_options"])
if "scrape_options" in params:
params["scrape_options"] = ScrapeOptions(**params["scrape_options"])
return params
def _extract_content(self, data: Union[Any, None]):
"""Extract content from response data."""
content = ""
if data:
content = data.markdown or data.html or ""
return content
    def _run(self, url: str, operation: Literal['scrape', 'crawl'] = 'scrape', params: Optional[Dict] = None):
try:
from hyperbrowser.models.scrape import StartScrapeJobParams
from hyperbrowser.models.crawl import StartCrawlJobParams
except ImportError:
raise ImportError(
"`hyperbrowser` package not found, please run `pip install hyperbrowser`"
)
        params = self._prepare_params(params or {})
if operation == 'scrape':
scrape_params = StartScrapeJobParams(url=url, **params)
scrape_resp = self.hyperbrowser.scrape.start_and_wait(scrape_params)
content = self._extract_content(scrape_resp.data)
return content
else:
crawl_params = StartCrawlJobParams(url=url, **params)
crawl_resp = self.hyperbrowser.crawl.start_and_wait(crawl_params)
content = ""
if crawl_resp.data:
for page in crawl_resp.data:
page_content = self._extract_content(page)
if page_content:
content += (
f"\n{'-'*50}\nUrl: {page.url}\nContent:\n{page_content}\n"
)
return content

View File

@@ -0,0 +1,159 @@
# InvokeCrewAIAutomationTool
## Description
The InvokeCrewAIAutomationTool provides CrewAI Platform API integration with external crew services. This tool allows you to invoke and interact with CrewAI Platform automations from within your CrewAI agents, enabling seamless integration between different crew workflows.
## Features
- **Dynamic Input Schema**: Configure custom input parameters for different crew automations
- **Automatic Polling**: Automatically polls for task completion with configurable timeout
- **Bearer Token Authentication**: Secure API authentication using bearer tokens
- **Comprehensive Error Handling**: Robust error handling for API failures and timeouts
- **Flexible Configuration**: Support for both simple and complex crew automation workflows
## Installation
Install the required dependencies:
```shell
pip install 'crewai[tools]'
```
## Example
### Basic Usage
```python
from crewai_tools import InvokeCrewAIAutomationTool
# Basic crew automation tool
tool = InvokeCrewAIAutomationTool(
crew_api_url="https://data-analysis-crew-[...].crewai.com",
crew_bearer_token="your_bearer_token_here",
crew_name="Data Analysis Crew",
crew_description="Analyzes data and generates insights"
)
# Use the tool (the default input schema expects a `prompt`)
result = tool.run(prompt="Analyze the latest data and summarize key insights")
```
### Advanced Usage with Custom Inputs
```python
from crewai_tools import InvokeCrewAIAutomationTool
from pydantic import Field
# Define custom input schema
custom_inputs = {
"year": Field(..., description="Year to retrieve the report for (integer)"),
"region": Field(default="global", description="Geographic region for analysis"),
"format": Field(default="summary", description="Report format (summary, detailed, raw)")
}
# Create tool with custom inputs
tool = InvokeCrewAIAutomationTool(
crew_api_url="https://state-of-ai-report-crew-[...].crewai.com",
crew_bearer_token="your_bearer_token_here",
crew_name="State of AI Report",
crew_description="Retrieves a comprehensive report on state of AI for a given year and region",
crew_inputs=custom_inputs,
max_polling_time=15 * 60 # 15 minutes timeout
)
# Use with custom parameters
result = tool.run(year=2024, region="north-america", format="detailed")
```
### Integration with CrewAI Agents
```python
from crewai import Agent, Task, Crew
from crewai_tools import InvokeCrewAIAutomationTool
# Create the automation tool
market_research_tool = InvokeCrewAIAutomationTool(
crew_api_url="https://market-research-automation-crew-[...].crewai.com",
crew_bearer_token="your_bearer_token_here",
crew_name="Market Research Automation",
crew_description="Conducts comprehensive market research analysis",
    crew_inputs={
"year": Field(..., description="Year to use for the market research"),
}
)
# Create an agent with the tool
research_agent = Agent(
role="Research Coordinator",
goal="Coordinate and execute market research tasks",
backstory="You are an expert at coordinating research tasks and leveraging automation tools.",
tools=[market_research_tool],
verbose=True
)
# Create and execute a task
research_task = Task(
description="Conduct market research on AI tools market for 2024",
agent=research_agent,
expected_output="Comprehensive market research report"
)
crew = Crew(
agents=[research_agent],
tasks=[research_task]
)
result = crew.kickoff()
```
## Arguments
### Required Parameters
- `crew_api_url` (str): Base URL of the CrewAI Platform automation API
- `crew_bearer_token` (str): Bearer token for API authentication
- `crew_name` (str): Name of the crew automation
- `crew_description` (str): Description of what the crew automation does
### Optional Parameters
- `max_polling_time` (int): Maximum time in seconds to wait for task completion (default: 600 seconds = 10 minutes)
- `crew_inputs` (dict): Dictionary defining custom input schema fields using Pydantic Field objects
## Custom Input Schema
When defining `crew_inputs`, use Pydantic Field objects to specify the input parameters. These have to be compatible with the crew automation you are invoking:
```python
from pydantic import Field
crew_inputs = {
"required_param": Field(..., description="This parameter is required"),
"optional_param": Field(default="default_value", description="This parameter is optional"),
"typed_param": Field(..., description="Integer parameter", ge=1, le=100) # With validation
}
```
## Error Handling
The tool provides comprehensive error handling for common scenarios:
- **API Connection Errors**: Network connectivity issues
- **Authentication Errors**: Invalid or expired bearer tokens
- **Timeout Errors**: Tasks that exceed the maximum polling time
- **Task Failures**: Crew automations that fail during execution
## API Endpoints
The tool interacts with two main API endpoints:
- `POST {crew_api_url}/kickoff`: Starts a new crew automation task
- `GET {crew_api_url}/status/{crew_id}`: Checks the status of a running task
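For reference, the kickoff-and-poll flow the tool performs against these endpoints can be sketched with plain `requests` (the URL, token, and inputs below are placeholders; the response fields match the ones the tool reads):
```python
import time

import requests

CREW_API_URL = "https://your-crew.crewai.com"  # placeholder
HEADERS = {
    "Authorization": "Bearer your_bearer_token_here",  # placeholder
    "Content-Type": "application/json",
}

# Start a new crew automation task
kickoff = requests.post(
    f"{CREW_API_URL}/kickoff", headers=HEADERS, json={"inputs": {"year": 2024}}
).json()
kickoff_id = kickoff["kickoff_id"]

# Poll the status endpoint once per second until the task settles
while True:
    status = requests.get(f"{CREW_API_URL}/status/{kickoff_id}", headers=HEADERS).json()
    if status.get("state", "").lower() in ("success", "failed"):
        break
    time.sleep(1)

print(status.get("result"))
```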
## Notes
- The tool automatically polls the status endpoint every second until completion or timeout
- Successful tasks return the result directly, while failed tasks return error information
- The bearer token should be kept secure and not hardcoded in production environments
- Consider using environment variables for sensitive configuration like bearer tokens

View File

@@ -0,0 +1,176 @@
from crewai.tools import BaseTool
from pydantic import BaseModel, Field, create_model
from typing import Any, Optional, Type
import requests
import time
class InvokeCrewAIAutomationInput(BaseModel):
"""Input schema for InvokeCrewAIAutomationTool."""
prompt: str = Field(..., description="The prompt or query to send to the crew")
class InvokeCrewAIAutomationTool(BaseTool):
"""
A CrewAI tool for invoking external crew/flows APIs.
This tool provides CrewAI Platform API integration with external crew services, supporting:
- Dynamic input schema configuration
- Automatic polling for task completion
- Bearer token authentication
- Comprehensive error handling
Example:
Basic usage:
>>> tool = InvokeCrewAIAutomationTool(
... crew_api_url="https://api.example.com",
... crew_bearer_token="your_token",
... crew_name="My Crew",
... crew_description="Description of what the crew does"
... )
With custom inputs:
>>> custom_inputs = {
... "param1": Field(..., description="Description of param1"),
... "param2": Field(default="default_value", description="Description of param2")
... }
>>> tool = InvokeCrewAIAutomationTool(
... crew_api_url="https://api.example.com",
... crew_bearer_token="your_token",
... crew_name="My Crew",
... crew_description="Description of what the crew does",
... crew_inputs=custom_inputs
... )
Example:
>>> tools=[
... InvokeCrewAIAutomationTool(
... crew_api_url="https://canary-crew-[...].crewai.com",
... crew_bearer_token="[Your token: abcdef012345]",
... crew_name="State of AI Report",
... crew_description="Retrieves a report on state of AI for a given year.",
... crew_inputs={
... "year": Field(..., description="Year to retrieve the report for (integer)")
... }
... )
... ]
"""
name: str = "invoke_amp_automation"
description: str = "Invokes an CrewAI Platform Automation using API"
args_schema: Type[BaseModel] = InvokeCrewAIAutomationInput
crew_api_url: str
crew_bearer_token: str
max_polling_time: int = 10 * 60 # 10 minutes
def __init__(
self,
crew_api_url: str,
crew_bearer_token: str,
crew_name: str,
crew_description: str,
max_polling_time: int = 10 * 60,
        crew_inputs: Optional[dict[str, Any]] = None):
"""
Initialize the InvokeCrewAIAutomationTool.
Args:
crew_api_url: Base URL of the crew API service
crew_bearer_token: Bearer token for API authentication
crew_name: Name of the crew to invoke
crew_description: Description of the crew to invoke
max_polling_time: Maximum time in seconds to wait for task completion (default: 600 seconds = 10 minutes)
crew_inputs: Optional dictionary defining custom input schema fields
"""
# Create dynamic args_schema if custom inputs provided
if crew_inputs:
            # Collect the custom field definitions for the dynamic schema
fields = {}
# Add custom fields
for field_name, field_def in crew_inputs.items():
if isinstance(field_def, tuple):
fields[field_name] = field_def
else:
# Assume it's a Field object, extract type from annotation if available
fields[field_name] = (str, field_def)
# Create dynamic model
args_schema = create_model('DynamicInvokeCrewAIAutomationInput', **fields)
else:
args_schema = InvokeCrewAIAutomationInput
# Initialize the parent class with proper field values
super().__init__(
name=crew_name,
description=crew_description,
args_schema=args_schema,
crew_api_url=crew_api_url,
crew_bearer_token=crew_bearer_token,
max_polling_time=max_polling_time
)
def _kickoff_crew(self, inputs: dict[str, Any]) -> dict[str, Any]:
"""Start a new crew task
Args:
inputs: Dictionary containing the query and other input parameters
Returns:
Dictionary containing the crew task response. The response will contain the crew id which needs to be returned to check the status of the crew.
"""
response = requests.post(
f"{self.crew_api_url}/kickoff",
headers={
"Authorization": f"Bearer {self.crew_bearer_token}",
"Content-Type": "application/json",
},
json={"inputs": inputs},
)
response_json = response.json()
return response_json
def _get_crew_status(self, crew_id: str) -> dict[str, Any]:
"""Get the status of a crew task
Args:
crew_id: The ID of the crew task to check
Returns:
Dictionary containing the crew task status
"""
response = requests.get(
f"{self.crew_api_url}/status/{crew_id}",
headers={
"Authorization": f"Bearer {self.crew_bearer_token}",
"Content-Type": "application/json",
},
)
return response.json()
def _run(self, **kwargs) -> str:
"""Execute the crew invocation tool."""
# Start the crew
response = self._kickoff_crew(inputs=kwargs)
if response.get("kickoff_id") is None:
return f"Error: Failed to kickoff crew. Response: {response}"
kickoff_id = response.get("kickoff_id")
# Poll for completion
for i in range(self.max_polling_time):
try:
status_response = self._get_crew_status(crew_id=kickoff_id)
if status_response.get("state", "").lower() == "success":
return status_response.get("result", "No result returned")
elif status_response.get("state", "").lower() == "failed":
return f"Error: Crew task failed. Response: {status_response}"
except Exception as e:
if i == self.max_polling_time - 1: # Last attempt
return f"Error: Failed to get crew status after {self.max_polling_time} attempts. Last error: {e}"
time.sleep(1)
return f"Error: Crew did not complete within {self.max_polling_time} seconds"

View File

@@ -0,0 +1,38 @@
# JinaScrapeWebsiteTool
## Description
A tool designed to extract and read the content of a specified website by using Jina.ai reader. It is capable of handling various types of web pages by making HTTP requests and parsing the received HTML content. This tool can be particularly useful for web scraping tasks, data collection, or extracting specific information from websites.
## Installation
Install the crewai_tools package
```shell
pip install 'crewai[tools]'
```
## Example
```python
from crewai_tools import JinaScrapeWebsiteTool
# To enable scraping any website it finds during its execution
tool = JinaScrapeWebsiteTool(api_key='YOUR_API_KEY')
# Initialize the tool with the website URL, so the agent can only scrape the content of the specified website
tool = JinaScrapeWebsiteTool(website_url='https://www.example.com')
# With custom headers
tool = JinaScrapeWebsiteTool(
website_url='https://www.example.com',
custom_headers={'X-Target-Selector': 'body, .class, #id'}
)
```
## Authentication
The tool uses Jina.ai's reader service. While it can work without an API key, Jina.ai may apply rate limiting or blocking to unauthenticated requests. For production use, it's recommended to provide an API key.
## Arguments
- `website_url`: The URL of the website to scrape. It can be set when initializing the tool or passed at run time, but must be provided one way or the other.
- `api_key`: Optional Jina.ai API key for authenticated access to the reader service.
- `custom_headers`: Optional dictionary of HTTP headers to use when making requests.
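A short usage sketch: when no URL was fixed at initialization, pass it at run time:
```python
from crewai_tools import JinaScrapeWebsiteTool

tool = JinaScrapeWebsiteTool(api_key='YOUR_API_KEY')

# The URL can be supplied per call instead of at initialization
markdown = tool.run(website_url='https://www.example.com')
```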
## Note
This tool is an alternative to the standard `ScrapeWebsiteTool` that specifically uses Jina.ai's reader service for enhanced content extraction. Choose this tool when you need more sophisticated content parsing capabilities.

View File

@@ -0,0 +1,52 @@
from typing import Optional, Type
import requests
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class JinaScrapeWebsiteToolInput(BaseModel):
"""Input schema for JinaScrapeWebsiteTool."""
website_url: str = Field(..., description="Mandatory website url to read the file")
class JinaScrapeWebsiteTool(BaseTool):
name: str = "JinaScrapeWebsiteTool"
description: str = "A tool that can be used to read a website content using Jina.ai reader and return markdown content."
args_schema: Type[BaseModel] = JinaScrapeWebsiteToolInput
website_url: Optional[str] = None
api_key: Optional[str] = None
headers: dict = {}
def __init__(
self,
website_url: Optional[str] = None,
api_key: Optional[str] = None,
custom_headers: Optional[dict] = None,
**kwargs,
):
super().__init__(**kwargs)
if website_url is not None:
self.website_url = website_url
self.description = f"A tool that can be used to read {website_url}'s content and return markdown content."
self._generate_description()
if custom_headers is not None:
self.headers = custom_headers
if api_key is not None:
self.headers["Authorization"] = f"Bearer {api_key}"
def _run(self, website_url: Optional[str] = None) -> str:
url = website_url or self.website_url
if not url:
raise ValueError(
"Website URL must be provided either during initialization or execution"
)
response = requests.get(
f"https://r.jina.ai/{url}", headers=self.headers, timeout=15
)
response.raise_for_status()
return response.text

View File

@@ -0,0 +1,55 @@
# JSONSearchTool
## Description
This tool is used to perform a RAG search within a JSON file's content. It allows users to initiate a search with a specific JSON path, focusing the search operation within that particular JSON file. If the path is provided at initialization, the tool restricts its search scope to the specified JSON file, thereby enhancing the precision of search results.
## Installation
Install the crewai_tools package by executing the following command in your terminal:
```shell
pip install 'crewai[tools]'
```
## Example
Below are examples demonstrating how to use the JSONSearchTool for searching within JSON files. You can either search any JSON content or restrict the search to a specific JSON file.
```python
from crewai_tools import JSONSearchTool
# Example 1: Initialize the tool for a general search across any JSON content. This is useful when the path is known or can be discovered during execution.
tool = JSONSearchTool()
# Example 2: Initialize the tool with a specific JSON path, limiting the search to a particular JSON file.
tool = JSONSearchTool(json_path='./path/to/your/file.json')
```
## Arguments
- `json_path` (str): An optional argument that defines the path to the JSON file to be searched. Providing it at initialization restricts all searches to that file; if omitted, the path must be supplied together with the search query at run time.
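When the tool was initialized without a path, both the query and `json_path` are supplied at run time (the path below is a placeholder):
```python
result = tool.run(
    search_query='What is the configured timeout?',
    json_path='./path/to/your/file.json',
)
```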
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = JSONSearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
            # stream=True,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,47 @@
from typing import Optional, Type
from pydantic import BaseModel, Field
from ..rag.rag_tool import RagTool
class FixedJSONSearchToolSchema(BaseModel):
"""Input for JSONSearchTool."""
search_query: str = Field(
...,
description="Mandatory search query you want to use to search the JSON's content",
)
class JSONSearchToolSchema(FixedJSONSearchToolSchema):
"""Input for JSONSearchTool."""
json_path: str = Field(
..., description="File path or URL of a JSON file to be searched"
)
class JSONSearchTool(RagTool):
name: str = "Search a JSON's content"
description: str = (
"A tool that can be used to semantic search a query from a JSON's content."
)
args_schema: Type[BaseModel] = JSONSearchToolSchema
def __init__(self, json_path: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
if json_path is not None:
self.add(json_path)
self.description = f"A tool that can be used to semantic search a query the {json_path} JSON's content."
self.args_schema = FixedJSONSearchToolSchema
self._generate_description()
def _run(
self,
search_query: str,
json_path: Optional[str] = None,
) -> str:
if json_path is not None:
self.add(json_path)
return super()._run(query=search_query)

View File

@@ -0,0 +1,98 @@
# Linkup Search Tool
## Description
The `LinkupSearchTool` is a tool designed for integration with the CrewAI framework. It provides the ability to query the Linkup API for contextual information and retrieve structured results. This tool is ideal for enriching workflows with up-to-date and reliable information from Linkup.
---
## Features
- Perform API queries to the Linkup platform using customizable parameters (`query`, `depth`, `output_type`).
- Gracefully handles API errors and provides structured feedback.
- Returns well-structured results for seamless integration into CrewAI processes.
---
## Installation
### Prerequisites
- Linkup API Key
### Steps
1. ```shell
pip install 'crewai[tools]'
```
2. Create a `.env` file in your project root and add your Linkup API Key:
```plaintext
LINKUP_API_KEY=your_linkup_api_key
```
---
## Usage
### Basic Example
Here is how to use the `LinkupSearchTool` in a CrewAI project:
1. **Import and Initialize**:
```python
from crewai_tools import LinkupSearchTool
import os
from dotenv import load_dotenv
load_dotenv()
linkup_tool = LinkupSearchTool(api_key=os.getenv("LINKUP_API_KEY"))
```
2. **Set Up an Agent and Task**:
```python
from crewai import Agent, Task, Crew
# Define the agent
research_agent = Agent(
role="Information Researcher",
goal="Fetch relevant results from Linkup.",
backstory="An expert in online information retrieval...",
tools=[linkup_tool],
verbose=True
)
# Define the task
search_task = Task(
expected_output="A detailed list of Nobel Prize-winning women in physics with their achievements.",
description="Search for women who have won the Nobel Prize in Physics.",
agent=research_agent
)
# Create and run the crew
crew = Crew(
agents=[research_agent],
tasks=[search_task]
)
result = crew.kickoff()
print(result)
```
### Advanced Configuration
You can customize the parameters for the `LinkupSearchTool`:
- `query`: The search term or phrase.
- `depth`: The search depth (`"standard"` by default).
- `output_type`: The type of output (`"searchResults"` by default).
Example:
```python
response = linkup_tool.run(
query="Women Nobel Prize Physics",
depth="standard",
output_type="searchResults"
)
```

Binary file not shown.


View File

@@ -0,0 +1,78 @@
import os
from typing import Any, List
from crewai.tools import BaseTool, EnvVar
try:
from linkup import LinkupClient
LINKUP_AVAILABLE = True
except ImportError:
LINKUP_AVAILABLE = False
LinkupClient = Any # type placeholder when package is not available
from pydantic import PrivateAttr
class LinkupSearchTool(BaseTool):
name: str = "Linkup Search Tool"
description: str = (
"Performs an API call to Linkup to retrieve contextual information."
)
_client: LinkupClient = PrivateAttr() # type: ignore
package_dependencies: List[str] = ["linkup-sdk"]
env_vars: List[EnvVar] = [
EnvVar(name="LINKUP_API_KEY", description="API key for Linkup", required=True),
]
def __init__(self, api_key: str | None = None):
"""
Initialize the tool with an API key.
"""
super().__init__()
try:
from linkup import LinkupClient
except ImportError:
import click
if click.confirm(
"You are missing the 'linkup-sdk' package. Would you like to install it?"
):
import subprocess
subprocess.run(["uv", "add", "linkup-sdk"], check=True)
from linkup import LinkupClient
else:
raise ImportError(
"The 'linkup-sdk' package is required to use the LinkupSearchTool. "
"Please install it with: uv add linkup-sdk"
)
self._client = LinkupClient(api_key=api_key or os.getenv("LINKUP_API_KEY"))
def _run(
self, query: str, depth: str = "standard", output_type: str = "searchResults"
) -> dict:
"""
Executes a search using the Linkup API.
:param query: The query to search for.
:param depth: Search depth (default is "standard").
:param output_type: Desired result type (default is "searchResults").
:return: A dictionary containing the results or an error message.
"""
try:
response = self._client.search(
query=query, depth=depth, output_type=output_type
)
results = [
{"name": result.name, "url": result.url, "content": result.content}
for result in response.results
]
return {"success": True, "results": results}
except Exception as e:
return {"success": False, "error": str(e)}

View File

@@ -0,0 +1,53 @@
# LlamaIndexTool Documentation
## Description
This tool is designed as a general wrapper around LlamaIndex tools and query engines, enabling you to plug LlamaIndex resources
(RAG and agentic pipelines) into CrewAI agents as tools.
## Installation
To incorporate this tool into your project, follow the installation instructions below:
```shell
pip install 'crewai[tools]'
```
## Example
The following example demonstrates how to initialize the tool and execute a search with a given query:
```python
from crewai_tools import LlamaIndexTool
# Initialize the tool from a LlamaIndex Tool
## Example 1: Initialize from FunctionTool
from llama_index.core.tools import FunctionTool
your_python_function = lambda ...: ...
og_tool = FunctionTool.from_defaults(your_python_function, name="<name>", description='<description>')
tool = LlamaIndexTool.from_tool(og_tool)
## Example 2: Initialize from LlamaHub Tools
from llama_index.tools.wolfram_alpha import WolframAlphaToolSpec
wolfram_spec = WolframAlphaToolSpec(app_id="<app_id>")
wolfram_tools = wolfram_spec.to_tool_list()
tools = [LlamaIndexTool.from_tool(t) for t in wolfram_tools]
# Initialize Tool from a LlamaIndex Query Engine
## NOTE: LlamaIndex has a lot of query engines, define whatever query engine you want
query_engine = index.as_query_engine()
query_tool = LlamaIndexTool.from_query_engine(
query_engine,
name="Uber 2019 10K Query Tool",
description="Use this tool to lookup the 2019 Uber 10K Annual Report"
)
```
## Steps to Get Started
To effectively use the `LlamaIndexTool`, follow these steps:
1. **Install CrewAI**: Confirm that the `crewai[tools]` package is installed in your Python environment.
2. **Install and use LlamaIndex**: Follow LlamaIndex documentation (https://docs.llamaindex.ai/) to setup a RAG/agent pipeline.
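As a sketch of step 2, a wrapped tool plugs into an agent like any other CrewAI tool (`query_tool` here is the query-engine tool from the example above):
```python
from crewai import Agent

analyst = Agent(
    role="Financial Analyst",
    goal="Answer questions about Uber's 2019 10K filing",
    backstory="An expert at reading annual reports.",
    tools=[query_tool],
    verbose=True,
)
```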

View File

@@ -0,0 +1,82 @@
from typing import Any, Optional, Type, cast
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
class LlamaIndexTool(BaseTool):
"""Tool to wrap LlamaIndex tools/query engines."""
llama_index_tool: Any
def _run(
self,
*args: Any,
**kwargs: Any,
) -> Any:
"""Run tool."""
from llama_index.core.tools import BaseTool as LlamaBaseTool
tool = cast(LlamaBaseTool, self.llama_index_tool)
if self.result_as_answer:
return tool(*args, **kwargs).content
return tool(*args, **kwargs)
@classmethod
def from_tool(cls, tool: Any, **kwargs: Any) -> "LlamaIndexTool":
from llama_index.core.tools import BaseTool as LlamaBaseTool
if not isinstance(tool, LlamaBaseTool):
raise ValueError(f"Expected a LlamaBaseTool, got {type(tool)}")
tool = cast(LlamaBaseTool, tool)
if tool.metadata.fn_schema is None:
raise ValueError(
"The LlamaIndex tool does not have an fn_schema specified."
)
args_schema = cast(Type[BaseModel], tool.metadata.fn_schema)
return cls(
name=tool.metadata.name,
description=tool.metadata.description,
args_schema=args_schema,
llama_index_tool=tool,
**kwargs,
)
@classmethod
def from_query_engine(
cls,
query_engine: Any,
name: Optional[str] = None,
description: Optional[str] = None,
return_direct: bool = False,
**kwargs: Any,
) -> "LlamaIndexTool":
from llama_index.core.query_engine import BaseQueryEngine
from llama_index.core.tools import QueryEngineTool
if not isinstance(query_engine, BaseQueryEngine):
raise ValueError(f"Expected a BaseQueryEngine, got {type(query_engine)}")
# NOTE: by default the schema expects an `input` variable. However this
# confuses crewAI so we are renaming to `query`.
class QueryToolSchema(BaseModel):
"""Schema for query tool."""
query: str = Field(..., description="Search query for the query tool.")
# NOTE: setting `resolve_input_errors` to True is important because the schema expects `input` but we are using `query`
query_engine_tool = QueryEngineTool.from_defaults(
query_engine,
name=name,
description=description,
return_direct=return_direct,
resolve_input_errors=True,
)
# HACK: we are replacing the schema with our custom schema
query_engine_tool.metadata.fn_schema = QueryToolSchema
return cls.from_tool(query_engine_tool, **kwargs)

View File

@@ -0,0 +1,57 @@
# MDXSearchTool
## Description
The MDX Search Tool, a key component of the `crewai_tools` package, performs semantic (RAG) searches within MDX files. It streamlines the process of acquiring, reading, and searching MDX documentation and content, making it a useful resource for anyone who needs to query large MDX documents efficiently.
## Installation
To utilize the MDX Search Tool, ensure the `crewai_tools` package is installed. If not already present, install it using the following command:
```shell
pip install 'crewai[tools]'
```
## Example
Configuring and using the MDX Search Tool involves setting up environment variables and utilizing the tool within a CrewAI project. Here's a simple example:
```python
from crewai_tools import MDXSearchTool
# Initialize the tool without a file path, so the agent can search any MDX content it learns about during its execution
tool = MDXSearchTool()
# OR
# Initialize the tool with a specific MDX file path for exclusive search within that document
tool = MDXSearchTool(mdx='path/to/your/document.mdx')
```
## Arguments
- `mdx`: **Optional**. The path to the MDX file to search. It can be provided at initialization; otherwise it must be supplied at run time.
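With the file fixed at initialization, the agent only supplies a query at run time, for example:
```python
result = tool.run(search_query='How is the component lifecycle documented?')
```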
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = MDXSearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
            # stream=True,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,56 @@
from typing import Optional, Type
from pydantic import BaseModel, Field
try:
from embedchain.models.data_type import DataType
EMBEDCHAIN_AVAILABLE = True
except ImportError:
EMBEDCHAIN_AVAILABLE = False
from ..rag.rag_tool import RagTool
class FixedMDXSearchToolSchema(BaseModel):
"""Input for MDXSearchTool."""
search_query: str = Field(
...,
description="Mandatory search query you want to use to search the MDX's content",
)
class MDXSearchToolSchema(FixedMDXSearchToolSchema):
"""Input for MDXSearchTool."""
mdx: str = Field(..., description="File path or URL of a MDX file to be searched")
class MDXSearchTool(RagTool):
name: str = "Search a MDX's content"
description: str = (
"A tool that can be used to semantic search a query from a MDX's content."
)
args_schema: Type[BaseModel] = MDXSearchToolSchema
def __init__(self, mdx: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
if mdx is not None:
self.add(mdx)
self.description = f"A tool that can be used to semantic search a query the {mdx} MDX's content."
self.args_schema = FixedMDXSearchToolSchema
self._generate_description()
def add(self, mdx: str) -> None:
if not EMBEDCHAIN_AVAILABLE:
raise ImportError("embedchain is not installed. Please install it with `pip install crewai-tools[embedchain]`")
super().add(mdx, data_type=DataType.MDX)
def _run(
self,
search_query: str,
mdx: Optional[str] = None,
) -> str:
if mdx is not None:
self.add(mdx)
return super()._run(query=search_query)

View File

@@ -0,0 +1,87 @@
# MongoDBVectorSearchTool
## Description
This tool is specifically crafted for conducting vector searches over documents stored in a MongoDB database. Use this tool to find documents that are semantically similar to a given query.
MongoDB can act as a vector database that is used to store and query vector embeddings. You can follow the docs here:
https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/
## Installation
Install the crewai_tools package with MongoDB support by executing the following command in your terminal:
```shell
pip install crewai-tools[mongodb]
```
or
```
uv add crewai-tools --extra mongodb
```
## Example
To utilize the MongoDBVectorSearchTool for different use cases, follow these examples:
```python
from crewai_tools import MongoDBVectorSearchTool
# Initialize the tool with your MongoDB connection details
tool = MongoDBVectorSearchTool(
    database_name="example_database",
    collection_name="example_collections",
connection_string="<your_mongodb_connection_string>",
)
```
or
```python
from crewai import Agent
from crewai_tools import MongoDBVectorSearchConfig, MongoDBVectorSearchTool
# Setup custom embedding model and customize the parameters.
query_config = MongoDBVectorSearchConfig(limit=10, oversampling_factor=2)
tool = MongoDBVectorSearchTool(
database_name="example_database',
collection_name='example_collections',
connection_string="<your_mongodb_connection_string>",
query_config=query_config,
index_name="my_vector_index",
generative_model="gpt-4o-mini"
)
# Adding the tool to an agent
rag_agent = Agent(
name="rag_agent",
role="You are a helpful assistant that can answer questions with the help of the MongoDBVectorSearchTool.",
goal="...",
backstory="...",
llm="gpt-4o-mini",
tools=[tool],
)
```
Preloading the MongoDB database with documents:
```python
from crewai_tools import MongoDBVectorSearchTool
# Create the tool.
tool = MongoDBVectorSearchTool(
database_name="example_database',
collection_name='example_collections',
connection_string="<your_mongodb_connection_string>",
)
# Add the text from a set of CrewAI knowledge documents.
texts = []
for d in os.listdir("knowledge"):
with open(os.path.join("knowledge", d), "r") as f:
texts.append(f.read())
tool.add_texts(texts)
# Create the vector search index (if it wasn't already created in Atlas).
# The value must match the tool's `dimensions` setting (1536 by default).
tool.create_vector_search_index(dimensions=1536)
```
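Once documents are loaded and the index is ready, the tool can be queried directly; the agent normally issues this call itself (the query below is illustrative):
```python
# Returns the matching documents serialized as JSON
print(tool.run(query="What does the onboarding guide say about security reviews?"))
```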

View File

@@ -0,0 +1,11 @@
from .vector_search import (
MongoDBToolSchema,
MongoDBVectorSearchConfig,
MongoDBVectorSearchTool,
)
__all__ = [
"MongoDBVectorSearchConfig",
"MongoDBVectorSearchTool",
"MongoDBToolSchema",
]

View File

@@ -0,0 +1,120 @@
from __future__ import annotations
from time import monotonic, sleep
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
if TYPE_CHECKING:
from pymongo.collection import Collection
def _vector_search_index_definition(
dimensions: int,
path: str,
similarity: str,
filters: Optional[List[str]] = None,
**kwargs: Any,
) -> Dict[str, Any]:
# https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/
fields = [
{
"numDimensions": dimensions,
"path": path,
"similarity": similarity,
"type": "vector",
},
]
if filters:
for field in filters:
fields.append({"type": "filter", "path": field})
definition = {"fields": fields}
definition.update(kwargs)
return definition
def create_vector_search_index(
collection: Collection,
index_name: str,
dimensions: int,
path: str,
similarity: str,
filters: Optional[List[str]] = None,
*,
wait_until_complete: Optional[float] = None,
**kwargs: Any,
) -> None:
"""Experimental Utility function to create a vector search index
Args:
collection (Collection): MongoDB Collection
index_name (str): Name of Index
dimensions (int): Number of dimensions in embedding
path (str): field with vector embedding
similarity (str): The similarity score used for the index
filters (List[str]): Fields/paths to index to allow filtering in $vectorSearch
wait_until_complete (Optional[float]): If provided, number of seconds to wait
until search index is ready.
kwargs: Keyword arguments supplying any additional options to SearchIndexModel.
"""
from pymongo.operations import SearchIndexModel
if collection.name not in collection.database.list_collection_names():
collection.database.create_collection(collection.name)
result = collection.create_search_index(
SearchIndexModel(
definition=_vector_search_index_definition(
dimensions=dimensions,
path=path,
similarity=similarity,
filters=filters,
**kwargs,
),
name=index_name,
type="vectorSearch",
)
)
if wait_until_complete:
_wait_for_predicate(
predicate=lambda: _is_index_ready(collection, index_name),
err=f"{index_name=} did not complete in {wait_until_complete}!",
timeout=wait_until_complete,
)
def _is_index_ready(collection: Collection, index_name: str) -> bool:
"""Check for the index name in the list of available search indexes to see if the
specified index is of status READY
Args:
        collection (Collection): MongoDB Collection to check for the search indexes
index_name (str): Vector Search Index name
Returns:
        bool : True if the index is present and READY, False otherwise
"""
for index in collection.list_search_indexes(index_name):
if index["status"] == "READY":
return True
return False
def _wait_for_predicate(
predicate: Callable, err: str, timeout: float = 120, interval: float = 0.5
) -> None:
"""Generic to block until the predicate returns true
Args:
        predicate (Callable[..., bool]): A function that returns a boolean value
        err (str): Error message to raise if the predicate never returns True
        timeout (float, optional): Wait time for predicate. Defaults to 120.
        interval (float, optional): Interval to check predicate. Defaults to 0.5.
    Raises:
        TimeoutError: If the predicate does not return True within `timeout` seconds.
"""
start = monotonic()
while not predicate():
if monotonic() - start > timeout:
raise TimeoutError(err)
sleep(interval)

View File

@@ -0,0 +1,327 @@
import os
from importlib.metadata import version
from logging import getLogger
from typing import Any, Dict, Iterable, List, Optional, Type
from crewai.tools import BaseTool, EnvVar
from openai import AzureOpenAI, Client
from pydantic import BaseModel, Field
from crewai_tools.tools.mongodb_vector_search_tool.utils import (
create_vector_search_index,
)
try:
import pymongo # noqa: F403
MONGODB_AVAILABLE = True
except ImportError:
MONGODB_AVAILABLE = False
logger = getLogger(__name__)
class MongoDBVectorSearchConfig(BaseModel):
"""Configuration for MongoDB vector search queries."""
limit: Optional[int] = Field(
default=4, description="number of documents to return."
)
pre_filter: Optional[dict[str, Any]] = Field(
default=None,
description="List of MQL match expressions comparing an indexed field",
)
post_filter_pipeline: Optional[list[dict]] = Field(
default=None,
description="Pipeline of MongoDB aggregation stages to filter/process results after $vectorSearch.",
)
oversampling_factor: int = Field(
default=10,
description="Multiple of limit used when generating number of candidates at each step in the HNSW Vector Search",
)
include_embeddings: bool = Field(
default=False,
description="Whether to include the embedding vector of each result in metadata.",
)
class MongoDBToolSchema(BaseModel):
"""Input for MongoDBTool."""
query: str = Field(
...,
description="The query to search retrieve relevant information from the MongoDB database. Pass only the query, not the question.",
)
class MongoDBVectorSearchTool(BaseTool):
"""Tool to perfrom a vector search the MongoDB database"""
name: str = "MongoDBVectorSearchTool"
description: str = "A tool to perfrom a vector search on a MongoDB database for relevant information on internal documents."
args_schema: Type[BaseModel] = MongoDBToolSchema
query_config: Optional[MongoDBVectorSearchConfig] = Field(
default=None, description="MongoDB Vector Search query configuration"
)
embedding_model: str = Field(
default="text-embedding-3-large",
description="Text OpenAI embedding model to use",
)
vector_index_name: str = Field(
default="vector_index", description="Name of the Atlas Search vector index"
)
text_key: str = Field(
default="text",
description="MongoDB field that will contain the text for each document",
)
embedding_key: str = Field(
default="embedding",
description="Field that will contain the embedding for each document",
)
database_name: str = Field(..., description="The name of the MongoDB database")
collection_name: str = Field(..., description="The name of the MongoDB collection")
connection_string: str = Field(
...,
description="The connection string of the MongoDB cluster",
)
dimensions: int = Field(
default=1536,
description="Number of dimensions in the embedding vector",
)
    env_vars: List[EnvVar] = [
        EnvVar(
            name="OPENAI_API_KEY",
            description="OpenAI API key used to embed queries and documents (required unless AZURE_OPENAI_ENDPOINT is configured)",
            required=False,
        ),
    ]
package_dependencies: List[str] = ["mongdb"]
def __init__(self, **kwargs):
super().__init__(**kwargs)
if not MONGODB_AVAILABLE:
import click
if click.confirm(
"You are missing the 'mongodb' crewai tool. Would you like to install it?"
):
import subprocess
subprocess.run(["uv", "add", "pymongo"], check=True)
else:
raise ImportError("You are missing the 'mongodb' crewai tool.")
if "AZURE_OPENAI_ENDPOINT" in os.environ:
self._openai_client = AzureOpenAI()
elif "OPENAI_API_KEY" in os.environ:
self._openai_client = Client()
else:
raise ValueError(
"OPENAI_API_KEY environment variable is required for MongoDBVectorSearchTool and it is mandatory to use the tool."
)
from pymongo import MongoClient
from pymongo.driver_info import DriverInfo
self._client = MongoClient(
self.connection_string,
driver=DriverInfo(name="CrewAI", version=version("crewai-tools")),
)
self._coll = self._client[self.database_name][self.collection_name]
def create_vector_search_index(
self,
*,
dimensions: int,
relevance_score_fn: str = "cosine",
auto_index_timeout: int = 15,
) -> None:
"""Convenience function to create a vector search index.
Args:
dimensions: Number of dimensions in embedding. If the value is set and
the index does not exist, an index will be created.
relevance_score_fn: The similarity score used for the index
Currently supported: 'euclidean', 'cosine', and 'dotProduct'
auto_index_timeout: Timeout in seconds to wait for an auto-created index
to be ready.
"""
create_vector_search_index(
collection=self._coll,
index_name=self.vector_index_name,
dimensions=dimensions,
path=self.embedding_key,
similarity=relevance_score_fn,
wait_until_complete=auto_index_timeout,
)
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[Dict[str, Any]]] = None,
ids: Optional[List[str]] = None,
batch_size: int = 100,
**kwargs: Any,
) -> List[str]:
"""Add texts, create embeddings, and add to the Collection and index.
Important notes on ids:
- If _id or id is a key in the metadatas dicts, one must
pop them and provide as separate list.
- They must be unique.
- If they are not provided, the VectorStore will create unique ones,
stored as bson.ObjectIds internally, and strings in Langchain.
These will appear in Document.metadata with key, '_id'.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional list of unique ids that will be used as index in VectorStore.
See note on ids.
batch_size: Number of documents to insert at a time.
Tuning this may help with performance and sidestep MongoDB limits.
Returns:
List of ids added to the vectorstore.
"""
from bson import ObjectId
        texts = list(texts)
        _metadatas = metadatas or [{} for _ in texts]
        # Respect caller-provided ids; otherwise generate unique ObjectId strings
        ids = ids or [str(ObjectId()) for _ in texts]
        result_ids = []
        texts_batch = []
        metadatas_batch = []
        size = 0
i = 0
for j, (text, metadata) in enumerate(zip(texts, _metadatas)):
size += len(text) + len(metadata)
texts_batch.append(text)
metadatas_batch.append(metadata)
if (j + 1) % batch_size == 0 or size >= 47_000_000:
batch_res = self._bulk_embed_and_insert_texts(
texts_batch, metadatas_batch, ids[i : j + 1]
)
result_ids.extend(batch_res)
texts_batch = []
metadatas_batch = []
size = 0
i = j + 1
if texts_batch:
batch_res = self._bulk_embed_and_insert_texts(
texts_batch, metadatas_batch, ids[i : j + 1]
)
result_ids.extend(batch_res)
return result_ids
def _embed_texts(self, texts: List[str]) -> List[List[float]]:
return [
i.embedding
for i in self._openai_client.embeddings.create(
input=texts,
model=self.embedding_model,
dimensions=self.dimensions,
).data
]
def _bulk_embed_and_insert_texts(
self,
texts: List[str],
metadatas: List[dict],
ids: List[str],
) -> List[str]:
"""Bulk insert single batch of texts, embeddings, and ids."""
from bson import ObjectId
from pymongo.operations import ReplaceOne
if not texts:
return []
# Compute embedding vectors
embeddings = self._embed_texts(texts)
docs = [
{
"_id": ObjectId(i),
self.text_key: t,
self.embedding_key: embedding,
**m,
}
for i, t, m, embedding in zip(ids, texts, metadatas, embeddings)
]
operations = [ReplaceOne({"_id": doc["_id"]}, doc, upsert=True) for doc in docs]
# insert the documents in MongoDB Atlas
result = self._coll.bulk_write(operations)
assert result.upserted_ids is not None
return [str(_id) for _id in result.upserted_ids.values()]
def _run(self, query: str) -> str:
from bson import json_util
try:
query_config = self.query_config or MongoDBVectorSearchConfig()
limit = query_config.limit
oversampling_factor = query_config.oversampling_factor
pre_filter = query_config.pre_filter
include_embeddings = query_config.include_embeddings
post_filter_pipeline = query_config.post_filter_pipeline
# Create the embedding for the query
query_vector = self._embed_texts([query])[0]
# Atlas Vector Search, potentially with filter
stage = {
"index": self.vector_index_name,
"path": self.embedding_key,
"queryVector": query_vector,
"numCandidates": limit * oversampling_factor,
"limit": limit,
}
if pre_filter:
stage["filter"] = pre_filter
pipeline = [
{"$vectorSearch": stage},
{"$set": {"score": {"$meta": "vectorSearchScore"}}},
]
# Remove embeddings unless requested
if not include_embeddings:
pipeline.append({"$project": {self.embedding_key: 0}})
# Post-processing
if post_filter_pipeline is not None:
pipeline.extend(post_filter_pipeline)
# Execution
cursor = self._coll.aggregate(pipeline) # type: ignore[arg-type]
docs = []
# Format
for doc in cursor:
docs.append(doc)
return json_util.dumps(docs)
except Exception as e:
logger.error(f"Error: {e}")
return ""
def __del__(self):
"""Cleanup clients on deletion."""
try:
if hasattr(self, "_client") and self._client:
self._client.close()
except Exception as e:
logger.error(f"Error: {e}")
try:
if hasattr(self, "_openai_client") and self._openai_client:
self._openai_client.close()
except Exception as e:
logger.error(f"Error: {e}")

View File

@@ -0,0 +1,53 @@
# MultiOnTool Documentation
## Description
The MultiOnTool, integrated within the crewai_tools package, empowers CrewAI agents with the capability to navigate and interact with the web through natural language instructions. Leveraging the Multion API, this tool facilitates seamless web browsing, making it an essential asset for projects requiring dynamic web data interaction.
## Installation
Ensure the `crewai[tools]` package is installed in your environment to use the MultiOnTool. If it's not already installed, you can add it using the command below:
```shell
pip install 'crewai[tools]'
```
## Example
The following example demonstrates how to initialize the tool and execute a search with a given query:
```python
from crewai import Agent, Task, Crew
from crewai_tools import MultiOnTool
# Initialize the MultiOn tool
multion_tool = MultiOnTool(api_key="YOUR_MULTION_API_KEY", local=False)
Browser = Agent(
    role="Browser Agent",
    goal="control web browsers using natural language",
    backstory="An expert browsing agent.",
    tools=[multion_tool],
    verbose=True,
)
# example task to search and summarize news
browse = Task(
description="Summarize the top 3 trending AI News headlines",
expected_output="A summary of the top 3 trending AI News headlines",
agent=Browser,
)
crew = Crew(agents=[Browser], tasks=[browse])
crew.kickoff()
```
## Arguments
- `api_key`: Specifies MultiOn API key. Default is the `MULTION_API_KEY` environment variable.
- `local`: Optional. Set `local=True` to run the agent locally in your browser. Make sure the MultiOn browser extension is installed and "API Enabled" is checked.
- `max_steps`: Optional. Set the max_steps the multion agent can take for a command
## Steps to Get Started
To effectively use the `MultiOnTool`, follow these steps:
1. **Install CrewAI**: Confirm that the `crewai[tools]` package is installed in your Python environment.
2. **Install and use MultiOn**: Follow MultiOn documentation for installing the MultiOn Browser Extension (https://docs.multion.ai/learn/browser-extension).
3. **Enable API Usage**: Click on the MultiOn extension in the extensions folder of your browser (not the hovering MultiOn icon on the web page) to open the extension configurations. Click the API Enabled toggle to enable the API

View File

@@ -0,0 +1,29 @@
import os
from crewai import Agent, Crew, Task
from crewai_tools import MultiOnTool
os.environ["OPENAI_API_KEY"] = "Your Key"
multion_browse_tool = MultiOnTool(api_key="Your Key")
# Create a new agent
Browser = Agent(
role="Browser Agent",
goal="control web browsers using natural language ",
backstory="An expert browsing agent.",
tools=[multion_browse_tool],
verbose=True,
)
# Define tasks
browse = Task(
description="Summarize the top 3 trending AI News headlines",
expected_output="A summary of the top 3 trending AI News headlines",
agent=Browser,
)
crew = Crew(agents=[Browser], tasks=[browse])
crew.kickoff()

View File

@@ -0,0 +1,80 @@
"""Multion tool spec."""
import os
from typing import Any, Optional, List
from crewai.tools import BaseTool, EnvVar
class MultiOnTool(BaseTool):
"""Tool to wrap MultiOn Browse Capabilities."""
name: str = "Multion Browse Tool"
description: str = """Multion gives the ability for LLMs to control web browsers using natural language instructions.
If the status is 'CONTINUE', reissue the same instruction to continue execution
"""
multion: Optional[Any] = None
session_id: Optional[str] = None
local: bool = False
max_steps: int = 3
package_dependencies: List[str] = ["multion"]
env_vars: List[EnvVar] = [
EnvVar(name="MULTION_API_KEY", description="API key for Multion", required=True),
]
def __init__(
self,
api_key: Optional[str] = None,
local: bool = False,
max_steps: int = 3,
**kwargs,
):
super().__init__(**kwargs)
try:
from multion.client import MultiOn # type: ignore
except ImportError:
import click
if click.confirm(
"You are missing the 'multion' package. Would you like to install it?"
):
import subprocess
subprocess.run(["uv", "add", "multion"], check=True)
from multion.client import MultiOn
else:
raise ImportError(
"`multion` package not found, please run `uv add multion`"
)
self.session_id = None
self.local = local
self.multion = MultiOn(api_key=api_key or os.getenv("MULTION_API_KEY"))
self.max_steps = max_steps
def _run(
self,
cmd: str,
*args: Any,
**kwargs: Any,
) -> str:
"""
Run the Multion client with the given command.
Args:
            cmd (str): The detailed and specific natural language instruction for web browsing
*args (Any): Additional arguments to pass to the Multion client
**kwargs (Any): Additional keyword arguments to pass to the Multion client
"""
        browse = self.multion.browse(
            *args,
            cmd=cmd,
            session_id=self.session_id,
            local=self.local,
            max_steps=self.max_steps,
            **kwargs,
        )
self.session_id = browse.session_id
return browse.message + "\n\n STATUS: " + browse.status

View File

@@ -0,0 +1,56 @@
# MySQLSearchTool
## Description
This tool is designed to facilitate semantic searches within MySQL database tables. Leveraging the RAG (Retrieve and Generate) technology, the MySQLSearchTool provides users with an efficient means of querying database table content, specifically tailored for MySQL databases. It simplifies the process of finding relevant data through semantic search queries, making it an invaluable resource for users needing to perform advanced queries on extensive datasets within a MySQL database.
## Installation
To install the `crewai_tools` package and utilize the MySQLSearchTool, execute the following command in your terminal:
```shell
pip install 'crewai[tools]'
```
## Example
Below is an example showcasing how to use the MySQLSearchTool to conduct a semantic search on a table within a MySQL database:
```python
from crewai_tools import MySQLSearchTool
# Initialize the tool with the database URI and the target table name
tool = MySQLSearchTool(db_uri='mysql://user:password@localhost:3306/mydatabase', table_name='employees')
```
## Arguments
The MySQLSearchTool requires the following arguments for its operation:
- `db_uri`: A string representing the URI of the MySQL database to be queried. This argument is mandatory and must include the necessary authentication details and the location of the database.
- `table_name`: A string specifying the name of the table within the database on which the semantic search will be performed. This argument is mandatory.
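After initialization the agent only supplies a semantic query at run time, for example:
```python
result = tool.run(search_query='Which employees joined most recently?')
```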
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = MySQLSearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
            # stream=True,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,51 @@
from typing import Any, Type
try:
from embedchain.loaders.mysql import MySQLLoader
EMBEDCHAIN_AVAILABLE = True
except ImportError:
EMBEDCHAIN_AVAILABLE = False
from pydantic import BaseModel, Field
from ..rag.rag_tool import RagTool
class MySQLSearchToolSchema(BaseModel):
"""Input for MySQLSearchTool."""
search_query: str = Field(
...,
description="Mandatory semantic search query you want to use to search the database's content",
)
class MySQLSearchTool(RagTool):
name: str = "Search a database's table content"
description: str = "A tool that can be used to semantic search a query from a database table's content."
args_schema: Type[BaseModel] = MySQLSearchToolSchema
db_uri: str = Field(..., description="Mandatory database URI")
def __init__(self, table_name: str, **kwargs):
if not EMBEDCHAIN_AVAILABLE:
raise ImportError("embedchain is not installed. Please install it with `pip install crewai-tools[embedchain]`")
        super().__init__(**kwargs)
        # Pass the loader and data type through to `add`; mutating `kwargs`
        # after `super().__init__` has no effect
        self.add(
            table_name,
            data_type="mysql",
            loader=MySQLLoader(config=dict(url=self.db_uri)),
        )
        self.description = f"A tool that can be used to semantically search a query within the {table_name} database table's content."
        self._generate_description()
def add(
self,
table_name: str,
**kwargs: Any,
) -> None:
super().add(f"SELECT * FROM {table_name};", **kwargs)
def _run(
self,
search_query: str,
**kwargs: Any,
) -> Any:
return super()._run(query=search_query)

Some files were not shown because too many files have changed in this diff