From a0e0c2815273efe75760b4b83519f2d44500f916 Mon Sep 17 00:00:00 2001 From: Lorenze Jay Date: Sun, 8 Dec 2024 21:44:19 -0800 Subject: [PATCH] setup weaviate vector search tool --- src/crewai_tools/__init__.py | 1 + src/crewai_tools/tools/__init__.py | 1 + .../tools/weaviate_tool/README.md | 80 +++++++++++++++++ .../tools/weaviate_tool/vector_search.py | 89 +++++++++++++++++++ 4 files changed, 171 insertions(+) create mode 100644 src/crewai_tools/tools/weaviate_tool/README.md create mode 100644 src/crewai_tools/tools/weaviate_tool/vector_search.py diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py index 3fad09d9f..12523a214 100644 --- a/src/crewai_tools/__init__.py +++ b/src/crewai_tools/__init__.py @@ -42,4 +42,5 @@ from .tools import ( XMLSearchTool, YoutubeChannelSearchTool, YoutubeVideoSearchTool, + WeaviateVectorSearchTool, ) diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py index 73a96f4cf..23565dbea 100644 --- a/src/crewai_tools/tools/__init__.py +++ b/src/crewai_tools/tools/__init__.py @@ -51,3 +51,4 @@ from .youtube_channel_search_tool.youtube_channel_search_tool import ( YoutubeChannelSearchTool, ) from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool +from .weaviate_tool.vector_search import WeaviateVectorSearchTool diff --git a/src/crewai_tools/tools/weaviate_tool/README.md b/src/crewai_tools/tools/weaviate_tool/README.md new file mode 100644 index 000000000..42daa40e0 --- /dev/null +++ b/src/crewai_tools/tools/weaviate_tool/README.md @@ -0,0 +1,80 @@ +# WeaviateVectorSearchTool + +## Description +This tool performs semantic searches over documents stored in a Weaviate vector database. Use it to find the documents most semantically similar to a given query. + +Weaviate is a vector database that is used to store and query vector embeddings. 
You can follow their docs here: https://weaviate.io/developers/wcs/connect + +## Installation +Install the crewai_tools package by executing the following command in your terminal: + +```shell +uv pip install 'crewai[tools]' +``` + +## Example +To utilize the WeaviateVectorSearchTool for different use cases, follow these examples: + +```python +from crewai_tools import WeaviateVectorSearchTool + +# Search a Weaviate collection using the tool's default vectorizer and generative model +tool = WeaviateVectorSearchTool( + collection_name='example_collections', + limit=3, + weaviate_cluster_url="https://your-weaviate-cluster-url.com", + weaviate_api_key="your-weaviate-api-key", +) + +# or + +# Set up a custom model for the vectorizer and the generative model +tool = WeaviateVectorSearchTool( + collection_name='example_collections', + limit=3, + vectorizer=Configure.Vectorizer.text2vec_openai(model="nomic-embed-text"), + generative_model=Configure.Generative.openai(model="gpt-4o-mini"), + weaviate_cluster_url="https://your-weaviate-cluster-url.com", + weaviate_api_key="your-weaviate-api-key", +) + +# Adding the tool to an agent +rag_agent = Agent( + name="rag_agent", + role="You are a helpful assistant that can answer questions with the help of the WeaviateVectorSearchTool.", + llm="gpt-4o-mini", + tools=[tool], +) +``` + +## Arguments +- `collection_name` : The name of the collection to search within. (Required) +- `weaviate_cluster_url` : The URL of the Weaviate cluster. (Required) +- `weaviate_api_key` : The API key for the Weaviate cluster. (Required) +- `limit` : The number of results to return. (Optional, defaults to 3) +- `vectorizer` : The vectorizer to use. (Optional) +- `generative_model` : The generative model to use. (Optional) + +Preloading the Weaviate database with documents: + +```python +from crewai_tools import WeaviateVectorSearchTool + +# Use before hooks to generate the documents and add them to the Weaviate database. 
Follow the weaviate docs: https://weaviate.io/developers/wcs/connect +test_docs = client.collections.get("test_collection_name") + + +docs_to_load = os.listdir("knowledge") +with test_docs.batch.dynamic() as batch: + for d in docs_to_load: + with open(os.path.join("knowledge", d), "r") as f: + content = f.read() + batch.add_object( + { + "content": content, + "year": d.split("_")[0], + } + ) +tool = WeaviateVectorSearchTool(collection_name='test_collection_name', limit=3, weaviate_cluster_url="https://your-weaviate-cluster-url.com", weaviate_api_key="your-weaviate-api-key") + +``` diff --git a/src/crewai_tools/tools/weaviate_tool/vector_search.py b/src/crewai_tools/tools/weaviate_tool/vector_search.py new file mode 100644 index 000000000..ab80b6ce1 --- /dev/null +++ b/src/crewai_tools/tools/weaviate_tool/vector_search.py @@ -0,0 +1,89 @@ +import os +import json +import weaviate +from pydantic import BaseModel, Field +from typing import Type, Optional +from crewai.tools import BaseTool + +from weaviate.classes.config import Configure, Vectorizers +from weaviate.classes.init import Auth + + +class WeaviateToolSchema(BaseModel): + """Input for WeaviateTool.""" + + query: str = Field( + ..., + description="The query to search retrieve relevant information from the Weaviate database. Pass only the query, not the question.", + ) + + +class WeaviateVectorSearchTool(BaseTool): + """Tool to search the Weaviate database""" + + name: str = "WeaviateVectorSearchTool" + description: str = "A tool to search the Weaviate database for relevant information on internal documents." 
+    # Schema describing the single ``query`` argument accepted by ``_run``.
+    args_schema: Type[BaseModel] = WeaviateToolSchema
+    query: Optional[str] = None
+
+    # Vectorizer configuration used only when the collection has to be created.
+    # NOTE(review): the annotation is ``Vectorizers`` but the default is the
+    # object returned by ``Configure.Vectorizer.text2vec_openai(...)`` --
+    # confirm the intended field type.
+    vectorizer: Optional[Vectorizers] = Field(
+        default=Configure.Vectorizer.text2vec_openai(
+            model="nomic-embed-text",
+        )
+    )
+    # Generative configuration used only when the collection has to be created.
+    # NOTE(review): annotated ``str`` but the default is the object returned by
+    # ``Configure.Generative.openai(...)`` -- confirm the intended field type.
+    generative_model: Optional[str] = Field(
+        default=Configure.Generative.openai(
+            model="gpt-4o",
+        ),
+    )
+    # Name of the collection to query; must be set before ``_run`` is called.
+    collection_name: Optional[str] = None
+    # Maximum number of objects requested from the search.
+    limit: Optional[int] = Field(default=3)
+    # Extra headers sent when connecting to the cluster. The default is built
+    # when the tool is instantiated instead of while this class body is being
+    # evaluated, so importing the module no longer raises ``KeyError`` when
+    # OPENAI_API_KEY is not set in the environment.
+    headers: Optional[dict] = Field(
+        default_factory=lambda: (
+            {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}
+            if "OPENAI_API_KEY" in os.environ
+            else {}
+        )
+    )
+    weaviate_cluster_url: str = Field(
+        ...,
+        description="The URL of the Weaviate cluster",
+    )
+    weaviate_api_key: str = Field(
+        ...,
+        description="The API key for the Weaviate cluster",
+    )
+
+    def _run(self, query: str) -> str:
+        """Search the configured Weaviate collection with a near-text query.
+
+        Connects to the cluster, creates the collection with the configured
+        vectorizer and generative model if it does not exist yet, runs the
+        query and always closes the client afterwards.
+
+        Args:
+            query (str): The query to search retrieve relevant information from the Weaviate database. Pass only the query as a string, not the question.
+
+        Returns:
+            str: The ``properties`` of every returned object, each serialised
+                with ``json.dumps(..., indent=2)`` and concatenated in result
+                order (an empty string when nothing matches).
+
+        Raises:
+            ValueError: If the cluster URL, the API key or the collection name
+                is not set.
+        """
+        if not self.weaviate_cluster_url or not self.weaviate_api_key:
+            raise ValueError("WEAVIATE_URL or WEAVIATE_API_KEY is not set")
+        if not self.collection_name:
+            raise ValueError("collection_name is not set")
+
+        client = weaviate.connect_to_weaviate_cloud(
+            cluster_url=self.weaviate_cluster_url,
+            auth_credentials=Auth.api_key(self.weaviate_api_key),
+            headers=self.headers,
+        )
+        # try/finally guarantees the connection is released even when the
+        # query or the JSON serialisation raises.
+        try:
+            # ``collections.get()`` only builds a local handle (per the
+            # Weaviate client docs it sends no request), so its result cannot
+            # be used to detect a missing collection; ask the server instead.
+            if client.collections.exists(self.collection_name):
+                internal_docs = client.collections.get(self.collection_name)
+            else:
+                internal_docs = client.collections.create(
+                    name=self.collection_name,
+                    vectorizer_config=self.vectorizer,
+                    generative_config=self.generative_model,
+                )
+
+            response = internal_docs.query.near_text(
+                query=query,
+                limit=self.limit,
+            )
+            return "".join(
+                json.dumps(obj.properties, indent=2) for obj in response.objects
+            )
+        finally:
+            client.close()