mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 00:28:31 +00:00
Fix duplicate document issue
This commit is contained in:
@@ -124,59 +124,59 @@ class KnowledgeStorage(BaseKnowledgeStorage):
|
|||||||
documents: List[str],
|
documents: List[str],
|
||||||
metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
||||||
):
|
):
|
||||||
if self.collection:
|
if not self.collection:
|
||||||
try:
|
|
||||||
if metadata is None:
|
|
||||||
metadatas: Optional[OneOrMany[chromadb.Metadata]] = None
|
|
||||||
elif isinstance(metadata, list):
|
|
||||||
metadatas = [cast(chromadb.Metadata, m) for m in metadata]
|
|
||||||
else:
|
|
||||||
metadatas = cast(chromadb.Metadata, metadata)
|
|
||||||
|
|
||||||
print("DOCUMENTS", documents)
|
|
||||||
for doc, idx in enumerate(documents):
|
|
||||||
print(f"DOC: {idx}", doc)
|
|
||||||
|
|
||||||
ids = [
|
|
||||||
hashlib.sha256(doc.encode("utf-8")).hexdigest() for doc in documents
|
|
||||||
]
|
|
||||||
|
|
||||||
self.collection.upsert(
|
|
||||||
documents=documents,
|
|
||||||
metadatas=metadatas,
|
|
||||||
ids=ids,
|
|
||||||
)
|
|
||||||
except chromadb.errors.InvalidDimensionException as e:
|
|
||||||
Logger(verbose=True).log(
|
|
||||||
"error",
|
|
||||||
"Embedding dimension mismatch. This usually happens when mixing different embedding models. Try resetting the collection using `crewai reset-memories -a`",
|
|
||||||
"red",
|
|
||||||
)
|
|
||||||
raise ValueError(
|
|
||||||
"Embedding dimension mismatch. Make sure you're using the same embedding model "
|
|
||||||
"across all operations with this collection."
|
|
||||||
"Try resetting the collection using `crewai reset-memories -a`"
|
|
||||||
) from e
|
|
||||||
except Exception as e:
|
|
||||||
Logger(verbose=True).log(
|
|
||||||
"error", f"Failed to upsert documents: {e}", "red"
|
|
||||||
)
|
|
||||||
raise
|
|
||||||
else:
|
|
||||||
raise Exception("Collection not initialized")
|
raise Exception("Collection not initialized")
|
||||||
|
|
||||||
def _create_default_embedding_function(self):
|
try:
|
||||||
from chromadb.utils.embedding_functions.openai_embedding_function import (
|
# Create a dictionary to store unique documents
|
||||||
OpenAIEmbeddingFunction,
|
unique_docs = {}
|
||||||
)
|
|
||||||
|
|
||||||
return OpenAIEmbeddingFunction(
|
# Generate IDs and create a mapping of id -> (document, metadata)
|
||||||
api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
|
for idx, doc in enumerate(documents):
|
||||||
)
|
doc_id = hashlib.sha256(doc.encode("utf-8")).hexdigest()
|
||||||
|
doc_metadata = None
|
||||||
|
if metadata is not None:
|
||||||
|
if isinstance(metadata, list):
|
||||||
|
doc_metadata = metadata[idx]
|
||||||
|
else:
|
||||||
|
doc_metadata = metadata
|
||||||
|
unique_docs[doc_id] = (doc, doc_metadata)
|
||||||
|
|
||||||
def _set_embedder_config(
|
# Prepare filtered lists for ChromaDB
|
||||||
self, embedder_config: Optional[Dict[str, Any]] = None
|
filtered_docs = []
|
||||||
) -> None:
|
filtered_metadata = []
|
||||||
|
filtered_ids = []
|
||||||
|
|
||||||
|
# Build the filtered lists
|
||||||
|
for doc_id, (doc, meta) in unique_docs.items():
|
||||||
|
filtered_docs.append(doc)
|
||||||
|
filtered_metadata.append(meta)
|
||||||
|
filtered_ids.append(doc_id)
|
||||||
|
|
||||||
|
# If we have no metadata at all, set it to None
|
||||||
|
final_metadata: Optional[OneOrMany[chromadb.Metadata]] = (
|
||||||
|
None if all(m is None for m in filtered_metadata) else filtered_metadata
|
||||||
|
)
|
||||||
|
|
||||||
|
self.collection.upsert(
|
||||||
|
documents=filtered_docs,
|
||||||
|
metadatas=final_metadata,
|
||||||
|
ids=filtered_ids,
|
||||||
|
)
|
||||||
|
except chromadb.errors.InvalidDimensionException as e:
|
||||||
|
Logger(verbose=True).log(
|
||||||
|
"error",
|
||||||
|
"Embedding dimension mismatch. This usually happens when mixing different embedding models. Try resetting the collection using `crewai reset-memories -a`",
|
||||||
|
"red",
|
||||||
|
)
|
||||||
|
raise ValueError(
|
||||||
|
"Embedding dimension mismatch. Make sure you're using the same embedding model "
|
||||||
|
"across all operations with this collection."
|
||||||
|
"Try resetting the collection using `crewai reset-memories -a`"
|
||||||
|
) from e
|
||||||
|
except Exception as e:
|
||||||
|
Logger(verbose=True).log("error", f"Failed to upsert documents: {e}", "red")
|
||||||
|
raise
|
||||||
"""Set the embedding configuration for the knowledge storage.
|
"""Set the embedding configuration for the knowledge storage.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
Reference in New Issue
Block a user