Merge pull request #142 from crewAIInc/feat/weaviate-tool

setup weaviate vector search tool
This commit is contained in:
Tony Kipkemboi
2024-12-09 10:44:11 -05:00
committed by GitHub
4 changed files with 171 additions and 0 deletions

View File

@@ -42,4 +42,5 @@ from .tools import (
XMLSearchTool,
YoutubeChannelSearchTool,
YoutubeVideoSearchTool,
WeaviateVectorSearchTool,
)

View File

@@ -51,3 +51,4 @@ from .youtube_channel_search_tool.youtube_channel_search_tool import (
YoutubeChannelSearchTool,
)
from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
from .weaviate_tool.vector_search import WeaviateVectorSearchTool

View File

@@ -0,0 +1,80 @@
# WeaviateVectorSearchTool
## Description
This tool performs semantic searches over documents stored in a Weaviate vector database. Use it to find documents that are semantically similar to a given query.
Weaviate is a vector database that is used to store and query vector embeddings. You can follow their docs here: https://weaviate.io/developers/wcs/connect
## Installation
Install the crewai_tools package by executing the following command in your terminal:
```shell
uv pip install 'crewai[tools]'
```
## Example
To utilize the WeaviateVectorSearchTool for different use cases, follow these examples:
```python
from crewai import Agent
from crewai_tools import WeaviateVectorSearchTool
from weaviate.classes.config import Configure
# Basic setup: search a collection using the default vectorizer and generative model
tool = WeaviateVectorSearchTool(
collection_name='example_collections',
limit=3,
weaviate_cluster_url="https://your-weaviate-cluster-url.com",
weaviate_api_key="your-weaviate-api-key",
)
# or
# Setup custom model for vectorizer and generative model
tool = WeaviateVectorSearchTool(
collection_name='example_collections',
limit=3,
vectorizer=Configure.Vectorizer.text2vec_openai(model="nomic-embed-text"),
generative_model=Configure.Generative.openai(model="gpt-4o-mini"),
weaviate_cluster_url="https://your-weaviate-cluster-url.com",
weaviate_api_key="your-weaviate-api-key",
)
# Adding the tool to an agent
rag_agent = Agent(
name="rag_agent",
role="You are a helpful assistant that can answer questions with the help of the WeaviateVectorSearchTool.",
llm="gpt-4o-mini",
tools=[tool],
)
```
## Arguments
- `collection_name` : The name of the collection to search within. (Required)
- `weaviate_cluster_url` : The URL of the Weaviate cluster. (Required)
- `weaviate_api_key` : The API key for the Weaviate cluster. (Required)
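- `limit` : The number of results to return. (Optional, defaults to `3`)
- `vectorizer` : The vectorizer to use. (Optional, defaults to `Configure.Vectorizer.text2vec_openai(model="nomic-embed-text")`)
- `generative_model` : The generative model to use. (Optional, defaults to `Configure.Generative.openai(model="gpt-4o")`)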
Preloading the Weaviate database with documents:
```python
from crewai_tools import WeaviateVectorSearchTool
# Load your documents into the Weaviate collection before the tool is used (for example, from a "before" hook).
# `client` below is an already-connected Weaviate client; see the Weaviate docs: https://weaviate.io/developers/wcs/connect
test_docs = client.collections.get("example_collections")
docs_to_load = os.listdir("knowledge")
with test_docs.batch.dynamic() as batch:
for d in docs_to_load:
with open(os.path.join("knowledge", d), "r") as f:
content = f.read()
batch.add_object(
{
"content": content,
"year": d.split("_")[0],
}
)
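tool = WeaviateVectorSearchTool(
    collection_name='example_collections',
    limit=3,
    weaviate_cluster_url="https://your-weaviate-cluster-url.com",
    weaviate_api_key="your-weaviate-api-key",
)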
```

View File

@@ -0,0 +1,89 @@
import json
import os
from typing import Any, Optional, Type

import weaviate
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from weaviate.classes.config import Configure, Vectorizers
from weaviate.classes.init import Auth
# Argument schema for WeaviateVectorSearchTool: it is assigned to that tool's
# `args_schema`, and its single field mirrors the `query` parameter of `_run`.
# NOTE(review): pydantic normally surfaces the class docstring and the Field
# description in the generated JSON schema, so treat both strings as
# runtime-visible text rather than as comments — confirm before rewording.
class WeaviateToolSchema(BaseModel):
    """Input for WeaviateTool."""

    # Required (`...` means "no default"): the text to search the collection with.
    query: str = Field(
        ...,
        description="The query to search retrieve relevant information from the Weaviate database. Pass only the query, not the question.",
    )
def _default_openai_headers() -> Optional[dict]:
    """Return the OpenAI API-key header built from the environment, or None if unset."""
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        return None
    return {"X-OpenAI-Api-Key": api_key}


class WeaviateVectorSearchTool(BaseTool):
    """Tool to search the Weaviate database.

    Connects to a Weaviate Cloud cluster on every call, runs a ``near_text``
    query against ``collection_name`` and returns the matching objects'
    properties as a string of JSON documents.
    """

    name: str = "WeaviateVectorSearchTool"
    description: str = "A tool to search the Weaviate database for relevant information on internal documents."
    args_schema: Type[BaseModel] = WeaviateToolSchema
    query: Optional[str] = None
    # Typed as Any: the values are the config objects produced by
    # `Configure.Vectorizer.*` / `Configure.Generative.*`, which are neither a
    # `Vectorizers` enum member nor a `str`, so narrower annotations reject them.
    vectorizer: Optional[Any] = Field(
        default=Configure.Vectorizer.text2vec_openai(
            model="nomic-embed-text",
        )
    )
    generative_model: Optional[Any] = Field(
        default=Configure.Generative.openai(
            model="gpt-4o",
        ),
    )
    collection_name: Optional[str] = None
    limit: Optional[int] = Field(default=3)
    # Resolved per instance, not at import time, so importing this module
    # does not raise KeyError when OPENAI_API_KEY is missing.
    headers: Optional[dict] = Field(default_factory=_default_openai_headers)
    weaviate_cluster_url: str = Field(
        ...,
        description="The URL of the Weaviate cluster",
    )
    weaviate_api_key: str = Field(
        ...,
        description="The API key for the Weaviate cluster",
    )

    def _run(self, query: str) -> str:
        """Search the Weaviate database.

        Args:
            query (str): The query used to retrieve relevant information from
                the Weaviate database. Pass only the query as a string, not
                the question.

        Returns:
            str: The ``properties`` of each returned object, each serialized
                with ``json.dumps(..., indent=2)`` and concatenated in result
                order (an empty string when nothing matches).

        Raises:
            ValueError: If the cluster URL, the API key or the collection
                name is not set.
        """
        if not self.weaviate_cluster_url or not self.weaviate_api_key:
            raise ValueError("WEAVIATE_URL or WEAVIATE_API_KEY is not set")
        if not self.collection_name:
            raise ValueError("collection_name is not set")

        client = weaviate.connect_to_weaviate_cloud(
            cluster_url=self.weaviate_cluster_url,
            auth_credentials=Auth.api_key(self.weaviate_api_key),
            headers=self.headers,
        )
        try:
            # `collections.get()` hands back a collection handle whether or not
            # the collection exists, so existence must be checked explicitly
            # for the create-on-first-use branch to ever run.
            if client.collections.exists(self.collection_name):
                internal_docs = client.collections.get(self.collection_name)
            else:
                internal_docs = client.collections.create(
                    name=self.collection_name,
                    vectorizer_config=self.vectorizer,
                    generative_config=self.generative_model,
                )

            response = internal_docs.query.near_text(
                query=query,
                limit=self.limit,
            )
            return "".join(
                json.dumps(obj.properties, indent=2) for obj in response.objects
            )
        finally:
            # Always release the connection, including when the query raises.
            client.close()