From 86825e176900cbc68e00060adc525f0b8d5920e5 Mon Sep 17 00:00:00 2001 From: Tony Kipkemboi Date: Thu, 27 Feb 2025 13:54:44 -0500 Subject: [PATCH] docs: add Qdrant vector search tool documentation (#2184) Co-authored-by: Lorenze Jay <63378463+lorenzejay@users.noreply.github.com> Co-authored-by: Brandon Hancock (bhancock_ai) <109994880+bhancockio@users.noreply.github.com> --- docs/mint.json | 1 + docs/tools/qdrantvectorsearchtool.mdx | 271 ++++++++++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 docs/tools/qdrantvectorsearchtool.mdx diff --git a/docs/mint.json b/docs/mint.json index fb0dcfdf5..9b49648aa 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -139,6 +139,7 @@ "tools/nl2sqltool", "tools/pdfsearchtool", "tools/pgsearchtool", + "tools/qdrantvectorsearchtool", "tools/scrapewebsitetool", "tools/seleniumscrapingtool", "tools/spidertool", diff --git a/docs/tools/qdrantvectorsearchtool.mdx b/docs/tools/qdrantvectorsearchtool.mdx new file mode 100644 index 000000000..da3dcb1a2 --- /dev/null +++ b/docs/tools/qdrantvectorsearchtool.mdx @@ -0,0 +1,271 @@ +--- +title: 'Qdrant Vector Search Tool' +description: 'Semantic search capabilities for CrewAI agents using Qdrant vector database' +icon: magnifying-glass-plus +--- + +# `QdrantVectorSearchTool` + +The Qdrant Vector Search Tool enables semantic search capabilities in your CrewAI agents by leveraging [Qdrant](https://qdrant.tech/), a vector similarity search engine. This tool allows your agents to search through documents stored in a Qdrant collection using semantic similarity. 
+ +## Installation + +Install the required packages: + +```bash +uv pip install 'crewai[tools]' qdrant-client +``` + +## Basic Usage + +Here's a minimal example of how to use the tool: + +```python +from crewai import Agent +from crewai_tools import QdrantVectorSearchTool + +# Initialize the tool +qdrant_tool = QdrantVectorSearchTool( + qdrant_url="your_qdrant_url", + qdrant_api_key="your_qdrant_api_key", + collection_name="your_collection" +) + +# Create an agent that uses the tool +agent = Agent( + role="Research Assistant", + goal="Find relevant information in documents", + tools=[qdrant_tool] +) + +# The tool will automatically use OpenAI embeddings +# and return the 3 most relevant results with scores > 0.35 +``` + +## Complete Working Example + +Here's a complete example showing how to: +1. Extract text from a PDF +2. Generate embeddings using OpenAI +3. Store in Qdrant +4. Create a CrewAI agentic RAG workflow for semantic search + +```python +import os +import uuid +import pdfplumber +from openai import OpenAI +from dotenv import load_dotenv +from crewai import Agent, Task, Crew, Process, LLM +from crewai_tools import QdrantVectorSearchTool +from qdrant_client import QdrantClient +from qdrant_client.models import PointStruct, Distance, VectorParams + +# Load environment variables +load_dotenv() + +# Initialize OpenAI client +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +# Extract text from PDF +def extract_text_from_pdf(pdf_path): + text = [] + with pdfplumber.open(pdf_path) as pdf: + for page in pdf.pages: + page_text = page.extract_text() + if page_text: + text.append(page_text.strip()) + return text + +# Generate OpenAI embeddings +def get_openai_embedding(text): + response = client.embeddings.create( + input=text, + model="text-embedding-3-small" + ) + return response.data[0].embedding + +# Store text and embeddings in Qdrant +def load_pdf_to_qdrant(pdf_path, qdrant, collection_name): + # Extract text from PDF + text_chunks = 
extract_text_from_pdf(pdf_path) + + # Create Qdrant collection + if qdrant.collection_exists(collection_name): + qdrant.delete_collection(collection_name) + qdrant.create_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=1536, distance=Distance.COSINE) + ) + + # Store embeddings + points = [] + for chunk in text_chunks: + embedding = get_openai_embedding(chunk) + points.append(PointStruct( + id=str(uuid.uuid4()), + vector=embedding, + payload={"text": chunk} + )) + qdrant.upsert(collection_name=collection_name, points=points) + +# Initialize Qdrant client and load data +qdrant = QdrantClient( + url=os.getenv("QDRANT_URL"), + api_key=os.getenv("QDRANT_API_KEY") +) +collection_name = "example_collection" +pdf_path = "path/to/your/document.pdf" +load_pdf_to_qdrant(pdf_path, qdrant, collection_name) + +# Initialize Qdrant search tool +qdrant_tool = QdrantVectorSearchTool( + qdrant_url=os.getenv("QDRANT_URL"), + qdrant_api_key=os.getenv("QDRANT_API_KEY"), + collection_name=collection_name, + limit=3, + score_threshold=0.35 +) + +# Create CrewAI agents +search_agent = Agent( + role="Senior Semantic Search Agent", + goal="Find and analyze documents based on semantic search", + backstory="""You are an expert research assistant who can find relevant + information using semantic search in a Qdrant database.""", + tools=[qdrant_tool], + verbose=True +) + +answer_agent = Agent( + role="Senior Answer Assistant", + goal="Generate answers to questions based on the context provided", + backstory="""You are an expert answer assistant who can generate + answers to questions based on the context provided.""", + tools=[qdrant_tool], + verbose=True +) + +# Define tasks +search_task = Task( + description="""Search for relevant documents about the {query}. 
+ Your final answer should include: + - The relevant information found + - The similarity scores of the results + - The metadata of the relevant documents""", + agent=search_agent +) + +answer_task = Task( + description="""Given the context and metadata of relevant documents, + generate a final answer based on the context.""", + agent=answer_agent +) + +# Run CrewAI workflow +crew = Crew( + agents=[search_agent, answer_agent], + tasks=[search_task, answer_task], + process=Process.sequential, + verbose=True +) + +result = crew.kickoff( + inputs={"query": "What is the role of X in the document?"} +) +print(result) +``` + +## Tool Parameters + +### Required Parameters +- `qdrant_url` (str): The URL of your Qdrant server +- `qdrant_api_key` (str): API key for authentication with Qdrant +- `collection_name` (str): Name of the Qdrant collection to search + +### Optional Parameters +- `limit` (int): Maximum number of results to return (default: 3) +- `score_threshold` (float): Minimum similarity score threshold (default: 0.35) +- `custom_embedding_fn` (Callable[[str], list[float]]): Custom function for text vectorization + +## Search Parameters + +The tool accepts these parameters in its schema: +- `query` (str): The search query to find similar documents +- `filter_by` (str, optional): Metadata field to filter on +- `filter_value` (str, optional): Value to filter by + +## Return Format + +The tool returns results in JSON format: + +```json +[ + { + "metadata": { + // Any metadata stored with the document + }, + "context": "The actual text content of the document", + "distance": 0.95 // Similarity score + } +] +``` + +## Default Embedding + +By default, the tool uses OpenAI's `text-embedding-3-small` model for vectorization. This requires: +- OpenAI API key set in environment: `OPENAI_API_KEY` + +## Custom Embeddings + +Instead of using the default embedding model, you might want to use your own embedding function in cases where you: + +1. 
Want to use a different embedding model (e.g., Cohere, HuggingFace, Ollama models) +2. Need to reduce costs by using open-source embedding models +3. Have specific requirements for vector dimensions or embedding quality +4. Want to use domain-specific embeddings (e.g., for medical or legal text) + +Here's an example using a HuggingFace model: + +```python +from transformers import AutoTokenizer, AutoModel +import torch + +# Load model and tokenizer +tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') +model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') + +def custom_embeddings(text: str) -> list[float]: + # Tokenize and get model outputs + inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) + outputs = model(**inputs) + + # Use mean pooling to get text embedding + embeddings = outputs.last_hidden_state.mean(dim=1) + + # Convert to list of floats and return + return embeddings[0].tolist() + +# Use custom embeddings with the tool +tool = QdrantVectorSearchTool( + qdrant_url="your_url", + qdrant_api_key="your_key", + collection_name="your_collection", + custom_embedding_fn=custom_embeddings # Pass your custom function +) +``` + +## Error Handling + +The tool handles these specific errors: +- Raises ImportError if `qdrant-client` is not installed (with option to auto-install) +- Raises ValueError if `QDRANT_URL` is not set +- Prompts to install `qdrant-client` if missing using `uv add qdrant-client` + +## Environment Variables + +Required environment variables: +```bash +export QDRANT_URL="your_qdrant_url" # If not provided in constructor +export QDRANT_API_KEY="your_api_key" # If not provided in constructor +export OPENAI_API_KEY="your_openai_key" # If using default embeddings \ No newline at end of file