Lorenze/fix duplicating doc ids for knowledge (#3840)

* fix: update document ID handling in ChromaDB utility functions to use SHA-256 hashing and include index for uniqueness
* test: add tests for hash-based ID generation in ChromaDB utility functions
* drop idx for preventing dups, upsert should handle dups
* fix: update document ID extraction logic in ChromaDB utility functions to check for doc_id at the top level of the document
* fix: enhance document ID generation in ChromaDB utility functions to deduplicate documents and ensure unique hash-based IDs without suffixes
* fix: improve error handling and document ID generation in ChromaDB utility functions to ensure robust processing and uniqueness
2026-05-03 16:22:49 +00:00 · 2025-11-06 10:59:52 -08:00
parent e4cc9a664c
commit fc521839e4
2 changed files with 112 additions and 21 deletions
--- a/lib/crewai/src/crewai/rag/chromadb/utils.py
+++ b/lib/crewai/src/crewai/rag/chromadb/utils.py
@@ -67,31 +67,44 @@ def _prepare_documents_for_chromadb(
    ids: list[str] = []
    texts: list[str] = []
    metadatas: list[Mapping[str, str | int | float | bool]] = []
+    seen_ids: dict[str, int] = {}
+
+    try:
+        for doc in documents:
+            if "doc_id" in doc:
+                doc_id = str(doc["doc_id"])
+            else:
+                metadata = doc.get("metadata")
+                if metadata and isinstance(metadata, dict) and "doc_id" in metadata:
+                    doc_id = str(metadata["doc_id"])
+                else:
+                    content_for_hash = doc["content"]
+                    if metadata:
+                        metadata_str = json.dumps(metadata, sort_keys=True)
+                        content_for_hash = f"{content_for_hash}|{metadata_str}"
+                    doc_id = hashlib.sha256(content_for_hash.encode()).hexdigest()

-    for doc in documents:
-        if "doc_id" in doc:
-            ids.append(doc["doc_id"])
-        else:
-            content_for_hash = doc["content"]
            metadata = doc.get("metadata")
            if metadata:
-                metadata_str = json.dumps(metadata, sort_keys=True)
-                content_for_hash = f"{content_for_hash}|{metadata_str}"
-
-            content_hash = hashlib.blake2b(
-                content_for_hash.encode(), digest_size=32
-            ).hexdigest()
-            ids.append(content_hash)
-
-        texts.append(doc["content"])
-        metadata = doc.get("metadata")
-        if metadata:
-            if isinstance(metadata, list):
-                metadatas.append(metadata[0] if metadata and metadata[0] else {})
+                if isinstance(metadata, list):
+                    processed_metadata = metadata[0] if metadata and metadata[0] else {}
+                else:
+                    processed_metadata = metadata
            else:
-                metadatas.append(metadata)
-        else:
-            metadatas.append({})
+                processed_metadata = {}
+
+            if doc_id in seen_ids:
+                idx = seen_ids[doc_id]
+                texts[idx] = doc["content"]
+                metadatas[idx] = processed_metadata
+            else:
+                idx = len(ids)
+                ids.append(doc_id)
+                texts.append(doc["content"])
+                metadatas.append(processed_metadata)
+                seen_ids[doc_id] = idx
+    except Exception as e:
+        raise ValueError(f"Error preparing documents for ChromaDB: {e}") from e

    return PreparedDocuments(ids, texts, metadatas)