mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-03 16:22:49 +00:00
Lorenze/fix duplicating doc ids for knowledge (#3840)
* fix: update document ID handling in ChromaDB utility functions to use SHA-256 hashing and include index for uniqueness * test: add tests for hash-based ID generation in ChromaDB utility functions * drop idx for preventing dups, upsert should handle dups * fix: update document ID extraction logic in ChromaDB utility functions to check for doc_id at the top level of the document * fix: enhance document ID generation in ChromaDB utility functions to deduplicate documents and ensure unique hash-based IDs without suffixes * fix: improve error handling and document ID generation in ChromaDB utility functions to ensure robust processing and uniqueness
This commit is contained in:
@@ -67,31 +67,44 @@ def _prepare_documents_for_chromadb(
|
||||
ids: list[str] = []
|
||||
texts: list[str] = []
|
||||
metadatas: list[Mapping[str, str | int | float | bool]] = []
|
||||
seen_ids: dict[str, int] = {}
|
||||
|
||||
try:
|
||||
for doc in documents:
|
||||
if "doc_id" in doc:
|
||||
doc_id = str(doc["doc_id"])
|
||||
else:
|
||||
metadata = doc.get("metadata")
|
||||
if metadata and isinstance(metadata, dict) and "doc_id" in metadata:
|
||||
doc_id = str(metadata["doc_id"])
|
||||
else:
|
||||
content_for_hash = doc["content"]
|
||||
if metadata:
|
||||
metadata_str = json.dumps(metadata, sort_keys=True)
|
||||
content_for_hash = f"{content_for_hash}|{metadata_str}"
|
||||
doc_id = hashlib.sha256(content_for_hash.encode()).hexdigest()
|
||||
|
||||
for doc in documents:
|
||||
if "doc_id" in doc:
|
||||
ids.append(doc["doc_id"])
|
||||
else:
|
||||
content_for_hash = doc["content"]
|
||||
metadata = doc.get("metadata")
|
||||
if metadata:
|
||||
metadata_str = json.dumps(metadata, sort_keys=True)
|
||||
content_for_hash = f"{content_for_hash}|{metadata_str}"
|
||||
|
||||
content_hash = hashlib.blake2b(
|
||||
content_for_hash.encode(), digest_size=32
|
||||
).hexdigest()
|
||||
ids.append(content_hash)
|
||||
|
||||
texts.append(doc["content"])
|
||||
metadata = doc.get("metadata")
|
||||
if metadata:
|
||||
if isinstance(metadata, list):
|
||||
metadatas.append(metadata[0] if metadata and metadata[0] else {})
|
||||
if isinstance(metadata, list):
|
||||
processed_metadata = metadata[0] if metadata and metadata[0] else {}
|
||||
else:
|
||||
processed_metadata = metadata
|
||||
else:
|
||||
metadatas.append(metadata)
|
||||
else:
|
||||
metadatas.append({})
|
||||
processed_metadata = {}
|
||||
|
||||
if doc_id in seen_ids:
|
||||
idx = seen_ids[doc_id]
|
||||
texts[idx] = doc["content"]
|
||||
metadatas[idx] = processed_metadata
|
||||
else:
|
||||
idx = len(ids)
|
||||
ids.append(doc_id)
|
||||
texts.append(doc["content"])
|
||||
metadatas.append(processed_metadata)
|
||||
seen_ids[doc_id] = idx
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error preparing documents for ChromaDB: {e}") from e
|
||||
|
||||
return PreparedDocuments(ids, texts, metadatas)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user