Lorenze/fix duplicating doc ids for knowledge (#3840)

* fix: update document ID handling in ChromaDB utility functions to use SHA-256 hashing and include index for uniqueness

* test: add tests for hash-based ID generation in ChromaDB utility functions

* drop idx for preventing dups, upsert should handle dups

* fix: update document ID extraction logic in ChromaDB utility functions to check for doc_id at the top level of the document

* fix: enhance document ID generation in ChromaDB utility functions to deduplicate documents and ensure unique hash-based IDs without suffixes

* fix: improve error handling and document ID generation in ChromaDB utility functions to ensure robust processing and uniqueness
This commit is contained in:
Lorenze Jay
2025-11-06 10:59:52 -08:00
committed by GitHub
parent e4cc9a664c
commit fc521839e4
2 changed files with 112 additions and 21 deletions

View File

@@ -67,31 +67,44 @@ def _prepare_documents_for_chromadb(
ids: list[str] = []
texts: list[str] = []
metadatas: list[Mapping[str, str | int | float | bool]] = []
seen_ids: dict[str, int] = {}
try:
for doc in documents:
if "doc_id" in doc:
doc_id = str(doc["doc_id"])
else:
metadata = doc.get("metadata")
if metadata and isinstance(metadata, dict) and "doc_id" in metadata:
doc_id = str(metadata["doc_id"])
else:
content_for_hash = doc["content"]
if metadata:
metadata_str = json.dumps(metadata, sort_keys=True)
content_for_hash = f"{content_for_hash}|{metadata_str}"
doc_id = hashlib.sha256(content_for_hash.encode()).hexdigest()
for doc in documents:
if "doc_id" in doc:
ids.append(doc["doc_id"])
else:
content_for_hash = doc["content"]
metadata = doc.get("metadata")
if metadata:
metadata_str = json.dumps(metadata, sort_keys=True)
content_for_hash = f"{content_for_hash}|{metadata_str}"
content_hash = hashlib.blake2b(
content_for_hash.encode(), digest_size=32
).hexdigest()
ids.append(content_hash)
texts.append(doc["content"])
metadata = doc.get("metadata")
if metadata:
if isinstance(metadata, list):
metadatas.append(metadata[0] if metadata and metadata[0] else {})
if isinstance(metadata, list):
processed_metadata = metadata[0] if metadata and metadata[0] else {}
else:
processed_metadata = metadata
else:
metadatas.append(metadata)
else:
metadatas.append({})
processed_metadata = {}
if doc_id in seen_ids:
idx = seen_ids[doc_id]
texts[idx] = doc["content"]
metadatas[idx] = processed_metadata
else:
idx = len(ids)
ids.append(doc_id)
texts.append(doc["content"])
metadatas.append(processed_metadata)
seen_ids[doc_id] = idx
except Exception as e:
raise ValueError(f"Error preparing documents for ChromaDB: {e}") from e
return PreparedDocuments(ids, texts, metadatas)