Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools
git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
This commit is contained in:
Greyson Lalonde
2025-09-12 21:58:02 -04:00
commit e16606672a
303 changed files with 49010 additions and 0 deletions

View File

@@ -0,0 +1,87 @@
# MongoDBVectorSearchTool
## Description
This tool is specifically crafted for conducting vector searches within docs within a MongoDB database. Use this tool to find semantically similar docs to a given query.
MongoDB can act as a vector database that is used to store and query vector embeddings. You can follow the docs here:
https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/
## Installation
Install the crewai_tools package with MongoDB support by executing the following command in your terminal:
```shell
pip install crewai-tools[mongodb]
```
or
```
uv add crewai-tools --extra mongodb
```
## Example
To utilize the MongoDBVectorSearchTool for different use cases, follow these examples:
```python
from crewai_tools import MongoDBVectorSearchTool
# To enable the tool to search any website the agent comes across or learns about during its operation
tool = MongoDBVectorSearchTool(
database_name="example_database',
collection_name='example_collections',
connection_string="<your_mongodb_connection_string>",
)
```
or
```python
from crewai_tools import MongoDBVectorSearchConfig, MongoDBVectorSearchTool
# Setup custom embedding model and customize the parameters.
query_config = MongoDBVectorSearchConfig(limit=10, oversampling_factor=2)
tool = MongoDBVectorSearchTool(
database_name="example_database',
collection_name='example_collections',
connection_string="<your_mongodb_connection_string>",
query_config=query_config,
index_name="my_vector_index",
generative_model="gpt-4o-mini"
)
# Adding the tool to an agent
rag_agent = Agent(
name="rag_agent",
role="You are a helpful assistant that can answer questions with the help of the MongoDBVectorSearchTool.",
goal="...",
backstory="...",
llm="gpt-4o-mini",
tools=[tool],
)
```
Preloading the MongoDB database with documents:
```python
from crewai_tools import MongoDBVectorSearchTool
# Generate the documents and add them to the MongoDB database
test_docs = client.collections.get("example_collections")
# Create the tool.
tool = MongoDBVectorSearchTool(
database_name="example_database',
collection_name='example_collections',
connection_string="<your_mongodb_connection_string>",
)
# Add the text from a set of CrewAI knowledge documents.
texts = []
for d in os.listdir("knowledge"):
with open(os.path.join("knowledge", d), "r") as f:
texts.append(f.read())
tool.add_texts(text)
# Create the vector search index (if it wasn't already created in Atlas).
tool.create_vector_search_index(dimensions=3072)
```

View File

@@ -0,0 +1,11 @@
from .vector_search import (
MongoDBToolSchema,
MongoDBVectorSearchConfig,
MongoDBVectorSearchTool,
)
__all__ = [
"MongoDBVectorSearchConfig",
"MongoDBVectorSearchTool",
"MongoDBToolSchema",
]

View File

@@ -0,0 +1,120 @@
from __future__ import annotations
from time import monotonic, sleep
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
if TYPE_CHECKING:
from pymongo.collection import Collection
def _vector_search_index_definition(
dimensions: int,
path: str,
similarity: str,
filters: Optional[List[str]] = None,
**kwargs: Any,
) -> Dict[str, Any]:
# https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/
fields = [
{
"numDimensions": dimensions,
"path": path,
"similarity": similarity,
"type": "vector",
},
]
if filters:
for field in filters:
fields.append({"type": "filter", "path": field})
definition = {"fields": fields}
definition.update(kwargs)
return definition
def create_vector_search_index(
collection: Collection,
index_name: str,
dimensions: int,
path: str,
similarity: str,
filters: Optional[List[str]] = None,
*,
wait_until_complete: Optional[float] = None,
**kwargs: Any,
) -> None:
"""Experimental Utility function to create a vector search index
Args:
collection (Collection): MongoDB Collection
index_name (str): Name of Index
dimensions (int): Number of dimensions in embedding
path (str): field with vector embedding
similarity (str): The similarity score used for the index
filters (List[str]): Fields/paths to index to allow filtering in $vectorSearch
wait_until_complete (Optional[float]): If provided, number of seconds to wait
until search index is ready.
kwargs: Keyword arguments supplying any additional options to SearchIndexModel.
"""
from pymongo.operations import SearchIndexModel
if collection.name not in collection.database.list_collection_names():
collection.database.create_collection(collection.name)
result = collection.create_search_index(
SearchIndexModel(
definition=_vector_search_index_definition(
dimensions=dimensions,
path=path,
similarity=similarity,
filters=filters,
**kwargs,
),
name=index_name,
type="vectorSearch",
)
)
if wait_until_complete:
_wait_for_predicate(
predicate=lambda: _is_index_ready(collection, index_name),
err=f"{index_name=} did not complete in {wait_until_complete}!",
timeout=wait_until_complete,
)
def _is_index_ready(collection: Collection, index_name: str) -> bool:
"""Check for the index name in the list of available search indexes to see if the
specified index is of status READY
Args:
collection (Collection): MongoDB Collection to for the search indexes
index_name (str): Vector Search Index name
Returns:
bool : True if the index is present and READY false otherwise
"""
for index in collection.list_search_indexes(index_name):
if index["status"] == "READY":
return True
return False
def _wait_for_predicate(
predicate: Callable, err: str, timeout: float = 120, interval: float = 0.5
) -> None:
"""Generic to block until the predicate returns true
Args:
predicate (Callable[, bool]): A function that returns a boolean value
err (str): Error message to raise if nothing occurs
timeout (float, optional): Wait time for predicate. Defaults to TIMEOUT.
interval (float, optional): Interval to check predicate. Defaults to DELAY.
Raises:
TimeoutError: _description_
"""
start = monotonic()
while not predicate():
if monotonic() - start > timeout:
raise TimeoutError(err)
sleep(interval)

View File

@@ -0,0 +1,327 @@
import os
from importlib.metadata import version
from logging import getLogger
from typing import Any, Dict, Iterable, List, Optional, Type
from crewai.tools import BaseTool, EnvVar
from openai import AzureOpenAI, Client
from pydantic import BaseModel, Field
from crewai_tools.tools.mongodb_vector_search_tool.utils import (
create_vector_search_index,
)
try:
import pymongo # noqa: F403
MONGODB_AVAILABLE = True
except ImportError:
MONGODB_AVAILABLE = False
logger = getLogger(__name__)
class MongoDBVectorSearchConfig(BaseModel):
"""Configuration for MongoDB vector search queries."""
limit: Optional[int] = Field(
default=4, description="number of documents to return."
)
pre_filter: Optional[dict[str, Any]] = Field(
default=None,
description="List of MQL match expressions comparing an indexed field",
)
post_filter_pipeline: Optional[list[dict]] = Field(
default=None,
description="Pipeline of MongoDB aggregation stages to filter/process results after $vectorSearch.",
)
oversampling_factor: int = Field(
default=10,
description="Multiple of limit used when generating number of candidates at each step in the HNSW Vector Search",
)
include_embeddings: bool = Field(
default=False,
description="Whether to include the embedding vector of each result in metadata.",
)
class MongoDBToolSchema(BaseModel):
"""Input for MongoDBTool."""
query: str = Field(
...,
description="The query to search retrieve relevant information from the MongoDB database. Pass only the query, not the question.",
)
class MongoDBVectorSearchTool(BaseTool):
"""Tool to perfrom a vector search the MongoDB database"""
name: str = "MongoDBVectorSearchTool"
description: str = "A tool to perfrom a vector search on a MongoDB database for relevant information on internal documents."
args_schema: Type[BaseModel] = MongoDBToolSchema
query_config: Optional[MongoDBVectorSearchConfig] = Field(
default=None, description="MongoDB Vector Search query configuration"
)
embedding_model: str = Field(
default="text-embedding-3-large",
description="Text OpenAI embedding model to use",
)
vector_index_name: str = Field(
default="vector_index", description="Name of the Atlas Search vector index"
)
text_key: str = Field(
default="text",
description="MongoDB field that will contain the text for each document",
)
embedding_key: str = Field(
default="embedding",
description="Field that will contain the embedding for each document",
)
database_name: str = Field(..., description="The name of the MongoDB database")
collection_name: str = Field(..., description="The name of the MongoDB collection")
connection_string: str = Field(
...,
description="The connection string of the MongoDB cluster",
)
dimensions: int = Field(
default=1536,
description="Number of dimensions in the embedding vector",
)
env_vars: List[EnvVar] = [
EnvVar(
name="BROWSERBASE_API_KEY",
description="API key for Browserbase services",
required=False,
),
EnvVar(
name="BROWSERBASE_PROJECT_ID",
description="Project ID for Browserbase services",
required=False,
),
]
package_dependencies: List[str] = ["mongdb"]
def __init__(self, **kwargs):
super().__init__(**kwargs)
if not MONGODB_AVAILABLE:
import click
if click.confirm(
"You are missing the 'mongodb' crewai tool. Would you like to install it?"
):
import subprocess
subprocess.run(["uv", "add", "pymongo"], check=True)
else:
raise ImportError("You are missing the 'mongodb' crewai tool.")
if "AZURE_OPENAI_ENDPOINT" in os.environ:
self._openai_client = AzureOpenAI()
elif "OPENAI_API_KEY" in os.environ:
self._openai_client = Client()
else:
raise ValueError(
"OPENAI_API_KEY environment variable is required for MongoDBVectorSearchTool and it is mandatory to use the tool."
)
from pymongo import MongoClient
from pymongo.driver_info import DriverInfo
self._client = MongoClient(
self.connection_string,
driver=DriverInfo(name="CrewAI", version=version("crewai-tools")),
)
self._coll = self._client[self.database_name][self.collection_name]
def create_vector_search_index(
self,
*,
dimensions: int,
relevance_score_fn: str = "cosine",
auto_index_timeout: int = 15,
) -> None:
"""Convenience function to create a vector search index.
Args:
dimensions: Number of dimensions in embedding. If the value is set and
the index does not exist, an index will be created.
relevance_score_fn: The similarity score used for the index
Currently supported: 'euclidean', 'cosine', and 'dotProduct'
auto_index_timeout: Timeout in seconds to wait for an auto-created index
to be ready.
"""
create_vector_search_index(
collection=self._coll,
index_name=self.vector_index_name,
dimensions=dimensions,
path=self.embedding_key,
similarity=relevance_score_fn,
wait_until_complete=auto_index_timeout,
)
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[Dict[str, Any]]] = None,
ids: Optional[List[str]] = None,
batch_size: int = 100,
**kwargs: Any,
) -> List[str]:
"""Add texts, create embeddings, and add to the Collection and index.
Important notes on ids:
- If _id or id is a key in the metadatas dicts, one must
pop them and provide as separate list.
- They must be unique.
- If they are not provided, the VectorStore will create unique ones,
stored as bson.ObjectIds internally, and strings in Langchain.
These will appear in Document.metadata with key, '_id'.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional list of unique ids that will be used as index in VectorStore.
See note on ids.
batch_size: Number of documents to insert at a time.
Tuning this may help with performance and sidestep MongoDB limits.
Returns:
List of ids added to the vectorstore.
"""
from bson import ObjectId
_metadatas = metadatas or [{} for _ in texts]
ids = [str(ObjectId()) for _ in range(len(list(texts)))]
metadatas_batch = _metadatas
result_ids = []
texts_batch = []
metadatas_batch = []
size = 0
i = 0
for j, (text, metadata) in enumerate(zip(texts, _metadatas)):
size += len(text) + len(metadata)
texts_batch.append(text)
metadatas_batch.append(metadata)
if (j + 1) % batch_size == 0 or size >= 47_000_000:
batch_res = self._bulk_embed_and_insert_texts(
texts_batch, metadatas_batch, ids[i : j + 1]
)
result_ids.extend(batch_res)
texts_batch = []
metadatas_batch = []
size = 0
i = j + 1
if texts_batch:
batch_res = self._bulk_embed_and_insert_texts(
texts_batch, metadatas_batch, ids[i : j + 1]
)
result_ids.extend(batch_res)
return result_ids
def _embed_texts(self, texts: List[str]) -> List[List[float]]:
return [
i.embedding
for i in self._openai_client.embeddings.create(
input=texts,
model=self.embedding_model,
dimensions=self.dimensions,
).data
]
def _bulk_embed_and_insert_texts(
self,
texts: List[str],
metadatas: List[dict],
ids: List[str],
) -> List[str]:
"""Bulk insert single batch of texts, embeddings, and ids."""
from bson import ObjectId
from pymongo.operations import ReplaceOne
if not texts:
return []
# Compute embedding vectors
embeddings = self._embed_texts(texts)
docs = [
{
"_id": ObjectId(i),
self.text_key: t,
self.embedding_key: embedding,
**m,
}
for i, t, m, embedding in zip(ids, texts, metadatas, embeddings)
]
operations = [ReplaceOne({"_id": doc["_id"]}, doc, upsert=True) for doc in docs]
# insert the documents in MongoDB Atlas
result = self._coll.bulk_write(operations)
assert result.upserted_ids is not None
return [str(_id) for _id in result.upserted_ids.values()]
def _run(self, query: str) -> str:
from bson import json_util
try:
query_config = self.query_config or MongoDBVectorSearchConfig()
limit = query_config.limit
oversampling_factor = query_config.oversampling_factor
pre_filter = query_config.pre_filter
include_embeddings = query_config.include_embeddings
post_filter_pipeline = query_config.post_filter_pipeline
# Create the embedding for the query
query_vector = self._embed_texts([query])[0]
# Atlas Vector Search, potentially with filter
stage = {
"index": self.vector_index_name,
"path": self.embedding_key,
"queryVector": query_vector,
"numCandidates": limit * oversampling_factor,
"limit": limit,
}
if pre_filter:
stage["filter"] = pre_filter
pipeline = [
{"$vectorSearch": stage},
{"$set": {"score": {"$meta": "vectorSearchScore"}}},
]
# Remove embeddings unless requested
if not include_embeddings:
pipeline.append({"$project": {self.embedding_key: 0}})
# Post-processing
if post_filter_pipeline is not None:
pipeline.extend(post_filter_pipeline)
# Execution
cursor = self._coll.aggregate(pipeline) # type: ignore[arg-type]
docs = []
# Format
for doc in cursor:
docs.append(doc)
return json_util.dumps(docs)
except Exception as e:
logger.error(f"Error: {e}")
return ""
def __del__(self):
"""Cleanup clients on deletion."""
try:
if hasattr(self, "_client") and self._client:
self._client.close()
except Exception as e:
logger.error(f"Error: {e}")
try:
if hasattr(self, "_openai_client") and self._openai_client:
self._openai_client.close()
except Exception as e:
logger.error(f"Error: {e}")