mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-08 15:48:29 +00:00
Fix duplicate document issue
This commit is contained in:
@@ -124,59 +124,59 @@ class KnowledgeStorage(BaseKnowledgeStorage):
|
||||
documents: List[str],
|
||||
metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
||||
):
|
||||
if self.collection:
|
||||
try:
|
||||
if metadata is None:
|
||||
metadatas: Optional[OneOrMany[chromadb.Metadata]] = None
|
||||
elif isinstance(metadata, list):
|
||||
metadatas = [cast(chromadb.Metadata, m) for m in metadata]
|
||||
else:
|
||||
metadatas = cast(chromadb.Metadata, metadata)
|
||||
|
||||
print("DOCUMENTS", documents)
|
||||
for doc, idx in enumerate(documents):
|
||||
print(f"DOC: {idx}", doc)
|
||||
|
||||
ids = [
|
||||
hashlib.sha256(doc.encode("utf-8")).hexdigest() for doc in documents
|
||||
]
|
||||
|
||||
self.collection.upsert(
|
||||
documents=documents,
|
||||
metadatas=metadatas,
|
||||
ids=ids,
|
||||
)
|
||||
except chromadb.errors.InvalidDimensionException as e:
|
||||
Logger(verbose=True).log(
|
||||
"error",
|
||||
"Embedding dimension mismatch. This usually happens when mixing different embedding models. Try resetting the collection using `crewai reset-memories -a`",
|
||||
"red",
|
||||
)
|
||||
raise ValueError(
|
||||
"Embedding dimension mismatch. Make sure you're using the same embedding model "
|
||||
"across all operations with this collection."
|
||||
"Try resetting the collection using `crewai reset-memories -a`"
|
||||
) from e
|
||||
except Exception as e:
|
||||
Logger(verbose=True).log(
|
||||
"error", f"Failed to upsert documents: {e}", "red"
|
||||
)
|
||||
raise
|
||||
else:
|
||||
if not self.collection:
|
||||
raise Exception("Collection not initialized")
|
||||
|
||||
def _create_default_embedding_function(self):
|
||||
from chromadb.utils.embedding_functions.openai_embedding_function import (
|
||||
OpenAIEmbeddingFunction,
|
||||
)
|
||||
try:
|
||||
# Create a dictionary to store unique documents
|
||||
unique_docs = {}
|
||||
|
||||
return OpenAIEmbeddingFunction(
|
||||
api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
|
||||
)
|
||||
# Generate IDs and create a mapping of id -> (document, metadata)
|
||||
for idx, doc in enumerate(documents):
|
||||
doc_id = hashlib.sha256(doc.encode("utf-8")).hexdigest()
|
||||
doc_metadata = None
|
||||
if metadata is not None:
|
||||
if isinstance(metadata, list):
|
||||
doc_metadata = metadata[idx]
|
||||
else:
|
||||
doc_metadata = metadata
|
||||
unique_docs[doc_id] = (doc, doc_metadata)
|
||||
|
||||
def _set_embedder_config(
|
||||
self, embedder_config: Optional[Dict[str, Any]] = None
|
||||
) -> None:
|
||||
# Prepare filtered lists for ChromaDB
|
||||
filtered_docs = []
|
||||
filtered_metadata = []
|
||||
filtered_ids = []
|
||||
|
||||
# Build the filtered lists
|
||||
for doc_id, (doc, meta) in unique_docs.items():
|
||||
filtered_docs.append(doc)
|
||||
filtered_metadata.append(meta)
|
||||
filtered_ids.append(doc_id)
|
||||
|
||||
# If we have no metadata at all, set it to None
|
||||
final_metadata: Optional[OneOrMany[chromadb.Metadata]] = (
|
||||
None if all(m is None for m in filtered_metadata) else filtered_metadata
|
||||
)
|
||||
|
||||
self.collection.upsert(
|
||||
documents=filtered_docs,
|
||||
metadatas=final_metadata,
|
||||
ids=filtered_ids,
|
||||
)
|
||||
except chromadb.errors.InvalidDimensionException as e:
|
||||
Logger(verbose=True).log(
|
||||
"error",
|
||||
"Embedding dimension mismatch. This usually happens when mixing different embedding models. Try resetting the collection using `crewai reset-memories -a`",
|
||||
"red",
|
||||
)
|
||||
raise ValueError(
|
||||
"Embedding dimension mismatch. Make sure you're using the same embedding model "
|
||||
"across all operations with this collection."
|
||||
"Try resetting the collection using `crewai reset-memories -a`"
|
||||
) from e
|
||||
except Exception as e:
|
||||
Logger(verbose=True).log("error", f"Failed to upsert documents: {e}", "red")
|
||||
raise
|
||||
"""Set the embedding configuration for the knowledge storage.
|
||||
|
||||
Args:
|
||||
|
||||
Reference in New Issue
Block a user