ensure embeddings are persisted

This commit is contained in:
Lorenze Jay
2024-11-14 18:31:07 -08:00
parent 98a708ca15
commit 10f445e18a
16 changed files with 196 additions and 33 deletions

View File

@@ -39,7 +39,7 @@ class FastEmbed(BaseEmbedder):
if not FASTEMBED_AVAILABLE:
raise ImportError(
"FastEmbed is not installed. Please install it with: "
"pip install fastembed or pip install fastembed-gpu for GPU support"
"uv pip install fastembed or uv pip install fastembed-gpu for GPU support"
)
self.model = TextEmbedding(

View File

@@ -1,10 +1,11 @@
from typing import List
from typing import List, Optional
from pydantic import BaseModel, ConfigDict, Field
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.embedder.fastembed import FastEmbed
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
class Knowledge(BaseModel):
@@ -12,6 +13,8 @@ class Knowledge(BaseModel):
embedder: BaseEmbedder = Field(default_factory=FastEmbed)
model_config = ConfigDict(arbitrary_types_allowed=True)
agents: List[str] = Field(default_factory=list)
storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
def __init__(self, **data):
super().__init__(**data)
@@ -19,35 +22,45 @@ class Knowledge(BaseModel):
for source in self.sources:
source.add(self.embedder)
def query(self, query: str, top_k: int = 3) -> List[str]:
def query(
self, query: str, top_k: int = 3, preference: Optional[str] = None
) -> List[str]:
"""
Query across all knowledge sources to find the most relevant information.
Returns the top_k most relevant chunks.
"""
if not self.sources:
return []
# if not self.sources:
# return []
results = self.storage.search(
[query],
top_k,
filter={"preference": preference} if preference else None,
score_threshold=0.35,
)
return results
# Collect all chunks and embeddings from all sources
all_chunks = []
all_embeddings = []
# all_chunks = []
# all_embeddings = []
for source in self.sources:
all_chunks.extend(source.chunks)
all_embeddings.extend(source.get_embeddings())
# for source in self.sources:
# all_chunks.extend(source.chunks)
# all_embeddings.extend(source.get_embeddings())
# Embed the query
query_embedding = self.embedder.embed_text(query)
# # Embed the query
# query_embedding = self.embedder.embed_text(query)
# Calculate similarities
similarities = []
for idx, embedding in enumerate(all_embeddings):
similarity = query_embedding.dot(embedding)
similarities.append((similarity, idx))
# # Calculate similarities
# similarities = []
# for idx, embedding in enumerate(all_embeddings):
# similarity = query_embedding.dot(embedding)
# similarities.append((similarity, idx))
# Sort by similarity
similarities.sort(reverse=True, key=lambda x: x[0])
# # Sort by similarity
# similarities.sort(reverse=True, key=lambda x: x[0])
# Get top_k results
top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
# # Get top_k results
# top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
return top_chunks
# return top_chunks

View File

@@ -3,6 +3,7 @@ from pathlib import Path
from pydantic import Field
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from typing import Dict, Any
class BaseFileKnowledgeSource(BaseKnowledgeSource):
@@ -22,3 +23,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource):
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
return ""
def _save_documents(self, metadata: Dict[str, Any]):
"""Save the documents to the storage."""
self.storage.save(self.chunks, metadata)

View File

@@ -1,10 +1,12 @@
from abc import ABC, abstractmethod
from typing import List
from typing import List, Optional
import numpy as np
from pydantic import BaseModel, ConfigDict, Field
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
from typing import Dict, Any
class BaseKnowledgeSource(BaseModel, ABC):
@@ -16,6 +18,8 @@ class BaseKnowledgeSource(BaseModel, ABC):
chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
model_config = ConfigDict(arbitrary_types_allowed=True)
storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
metadata: Dict[str, Any] = Field(default_factory=dict)
@abstractmethod
def load_content(self):
@@ -37,3 +41,10 @@ class BaseKnowledgeSource(BaseModel, ABC):
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
def _save_documents(self, metadata: Dict[str, Any]):
"""
Save the documents to the storage.
This method should be called after the chunks and embeddings are generated.
"""
self.storage.save(self.chunks, metadata)

View File

@@ -29,6 +29,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self._save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""

View File

@@ -39,6 +39,7 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self._save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""

View File

@@ -41,6 +41,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self._save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""

View File

@@ -41,6 +41,7 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self._save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""

View File

@@ -1,4 +1,4 @@
from typing import List
from typing import List, Dict, Any
from pydantic import Field
@@ -28,6 +28,8 @@ class StringKnowledgeSource(BaseKnowledgeSource):
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
print("adding")
self._save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""

View File

@@ -24,6 +24,7 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self._save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""

View File

View File

@@ -0,0 +1,110 @@
from crewai.memory.storage.base_rag_storage import BaseRAGStorage
from crewai.utilities.paths import db_storage_path
from typing import Optional, List
import chromadb
import numpy as np
from typing import Dict, Any
import uuid
import contextlib
import io
import logging
@contextlib.contextmanager
def suppress_logging(
    logger_name="chromadb.segment.impl.vector.local_persistent_hnsw",
    level=logging.ERROR,
):
    """Temporarily raise *logger_name*'s level and silence stdout/stderr.

    Used to mute noisy ChromaDB internals around query calls.

    Args:
        logger_name: dotted name of the logger to quiet.
        level: level the logger is raised to for the duration of the block.

    The previous level is restored in a ``finally`` clause so it survives
    exceptions raised inside the block — the original implementation ran
    ``setLevel`` after the ``yield`` without ``try``/``finally``, leaving the
    logger stuck at *level* whenever the body raised.
    """
    logger = logging.getLogger(logger_name)
    original_level = logger.getEffectiveLevel()
    logger.setLevel(level)
    try:
        with (
            contextlib.redirect_stdout(io.StringIO()),
            contextlib.redirect_stderr(io.StringIO()),
            contextlib.suppress(UserWarning),
        ):
            yield
    finally:
        # Always restore, even when the wrapped block raised.
        logger.setLevel(original_level)
class KnowledgeStorage(BaseRAGStorage):
    """
    Extends Storage to handle embeddings for memory entries, improving
    search efficiency.

    Backed by a persistent ChromaDB collection named ``"knowledge"`` stored
    under ``db_storage_path()/knowledge``.
    """

    collection: Optional[chromadb.Collection] = None

    def __init__(self, embedder_config=None):
        # Fall back to the FastEmbed-based embedding function when no
        # explicit embedder configuration is supplied.
        self.embedder_config = (
            embedder_config or self._create_default_embedding_function()
        )
        self._initialize_app()

    def _sanitize_role(self, role: str) -> str:
        """Make a role name safe for use as an identifier (no spaces)."""
        return role.replace(" ", "_")

    def search(
        self,
        query: List[str],
        limit: int = 3,
        filter: Optional[dict] = None,
        score_threshold: float = 0.35,
    ) -> List[Dict[str, Any]]:
        """Query the knowledge collection for the most relevant documents.

        Args:
            query: list of query texts; Chroma embeds them with the
                collection's embedding function.
            limit: maximum number of hits to fetch per query.
            filter: optional Chroma ``where`` metadata filter.
            score_threshold: minimum ``score`` a hit must have to be kept.

        Returns:
            A list of dicts with ``id``, ``metadata``, ``context`` and
            ``score`` keys, filtered by ``score_threshold``.

        Raises:
            Exception: if the collection was never initialized.
        """
        if not self.collection:
            raise Exception("Collection not initialized")
        with suppress_logging():
            fetched = self.collection.query(
                query_texts=query,
                n_results=limit,
                where=filter,
            )
        results = []
        # Chroma returns one result list per query text; only the first
        # query's hits are surfaced here.
        for i in range(len(fetched["ids"][0])):
            result = {
                "id": fetched["ids"][0][i],
                "metadata": fetched["metadatas"][0][i],
                "context": fetched["documents"][0][i],
                # NOTE(review): Chroma's "distances" are distances, where
                # *lower* usually means more similar; keeping hits with
                # score >= threshold may be inverted — confirm intent.
                "score": fetched["distances"][0][i],
            }
            if result["score"] >= score_threshold:
                results.append(result)
        return results

    def _initialize_app(self):
        """Create the persistent Chroma client and the knowledge collection."""
        from chromadb.config import Settings

        chroma_client = chromadb.PersistentClient(
            path=f"{db_storage_path()}/knowledge",
            settings=Settings(allow_reset=True),
        )
        self.app = chroma_client
        try:
            self.collection = self.app.get_or_create_collection(name="knowledge")
        except Exception as e:
            # Preserve the original failure as the chained cause so the
            # underlying Chroma error is not lost.
            raise Exception("Failed to create or get collection") from e

    def reset(self):
        """Wipe the underlying Chroma store (requires allow_reset=True)."""
        if self.app:
            self.app.reset()

    def save(self, documents: List[str], metadata: Dict[str, Any]):
        """Persist *documents*, attaching *metadata* to each one.

        The original implementation passed the single metadata dict and a
        single uuid for the whole batch; Chroma requires one id (and one
        metadata entry) per document, so saving more than one chunk failed.

        Raises:
            Exception: if the collection was never initialized.
        """
        if not self.collection:
            raise Exception("Collection not initialized")
        self.collection.add(
            documents=documents,
            metadatas=[metadata] * len(documents) if metadata else None,
            ids=[str(uuid.uuid4()) for _ in documents],
        )

    def _create_default_embedding_function(self):
        # Imported lazily to avoid a hard dependency at module import time.
        from crewai.knowledge.embedder.fastembed import FastEmbed

        return FastEmbed().embed_texts