From fc521839e467174d864887c4b888421d82796f82 Mon Sep 17 00:00:00 2001 From: Lorenze Jay <63378463+lorenzejay@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:59:52 -0800 Subject: [PATCH] Lorenze/fix duplicating doc ids for knowledge (#3840) * fix: update document ID handling in ChromaDB utility functions to use SHA-256 hashing and include index for uniqueness * test: add tests for hash-based ID generation in ChromaDB utility functions * drop idx for preventing dups, upsert should handle dups * fix: update document ID extraction logic in ChromaDB utility functions to check for doc_id at the top level of the document * fix: enhance document ID generation in ChromaDB utility functions to deduplicate documents and ensure unique hash-based IDs without suffixes * fix: improve error handling and document ID generation in ChromaDB utility functions to ensure robust processing and uniqueness --- lib/crewai/src/crewai/rag/chromadb/utils.py | 55 ++++++++------ lib/crewai/tests/knowledge/test_knowledge.py | 78 ++++++++++++++++++++ 2 files changed, 112 insertions(+), 21 deletions(-) diff --git a/lib/crewai/src/crewai/rag/chromadb/utils.py b/lib/crewai/src/crewai/rag/chromadb/utils.py index dc58fa8c5..4f17bef5a 100644 --- a/lib/crewai/src/crewai/rag/chromadb/utils.py +++ b/lib/crewai/src/crewai/rag/chromadb/utils.py @@ -67,31 +67,44 @@ def _prepare_documents_for_chromadb( ids: list[str] = [] texts: list[str] = [] metadatas: list[Mapping[str, str | int | float | bool]] = [] + seen_ids: dict[str, int] = {} + + try: + for doc in documents: + if "doc_id" in doc: + doc_id = str(doc["doc_id"]) + else: + metadata = doc.get("metadata") + if metadata and isinstance(metadata, dict) and "doc_id" in metadata: + doc_id = str(metadata["doc_id"]) + else: + content_for_hash = doc["content"] + if metadata: + metadata_str = json.dumps(metadata, sort_keys=True) + content_for_hash = f"{content_for_hash}|{metadata_str}" + doc_id = hashlib.sha256(content_for_hash.encode()).hexdigest() - for doc in documents: - if "doc_id" in doc: - ids.append(doc["doc_id"]) - else: - content_for_hash = doc["content"] metadata = doc.get("metadata") if metadata: - metadata_str = json.dumps(metadata, sort_keys=True) - content_for_hash = f"{content_for_hash}|{metadata_str}" - - content_hash = hashlib.blake2b( - content_for_hash.encode(), digest_size=32 - ).hexdigest() - ids.append(content_hash) - - texts.append(doc["content"]) - metadata = doc.get("metadata") - if metadata: - if isinstance(metadata, list): - metadatas.append(metadata[0] if metadata and metadata[0] else {}) + if isinstance(metadata, list): + processed_metadata = metadata[0] if metadata and metadata[0] else {} + else: + processed_metadata = metadata else: - metadatas.append(metadata) - else: - metadatas.append({}) + processed_metadata = {} + + if doc_id in seen_ids: + idx = seen_ids[doc_id] + texts[idx] = doc["content"] + metadatas[idx] = processed_metadata + else: + idx = len(ids) + ids.append(doc_id) + texts.append(doc["content"]) + metadatas.append(processed_metadata) + seen_ids[doc_id] = idx + except Exception as e: + raise ValueError(f"Error preparing documents for ChromaDB: {e}") from e return PreparedDocuments(ids, texts, metadatas) diff --git a/lib/crewai/tests/knowledge/test_knowledge.py b/lib/crewai/tests/knowledge/test_knowledge.py index a6f253fb1..b56d46a74 100644 --- a/lib/crewai/tests/knowledge/test_knowledge.py +++ b/lib/crewai/tests/knowledge/test_knowledge.py @@ -601,3 +601,81 @@ def test_file_path_validation(): match="file_path/file_paths must be a Path, str, or a list of these types", ): PDFKnowledgeSource() + + +def test_hash_based_id_generation_without_doc_id(mock_vector_db): + """Test that documents without doc_id generate hash-based IDs. Duplicates are deduplicated before upsert.""" + import hashlib + import json + from crewai.rag.chromadb.utils import _prepare_documents_for_chromadb + from crewai.rag.types import BaseRecord + + documents: list[BaseRecord] = [ + {"content": "First document content", "metadata": {"source": "test1", "category": "research"}}, + {"content": "Second document content", "metadata": {"source": "test2", "category": "research"}}, + {"content": "Third document content"}, # No metadata + ] + + result = _prepare_documents_for_chromadb(documents) + + assert len(result.ids) == 3 + + # Unique documents should get 64-character hex hashes (no suffix) + for doc_id in result.ids: + assert len(doc_id) == 64, f"ID should be 64 characters: {doc_id}" + assert all(c in "0123456789abcdef" for c in doc_id), f"ID should be hex: {doc_id}" + + # Different documents should have different hashes + assert result.ids[0] != result.ids[1] != result.ids[2] + + # Verify hashes match expected values + expected_hash_1 = hashlib.sha256( + f"First document content|{json.dumps({'category': 'research', 'source': 'test1'}, sort_keys=True)}".encode() + ).hexdigest() + assert result.ids[0] == expected_hash_1, "First document hash should match expected" + + expected_hash_3 = hashlib.sha256("Third document content".encode()).hexdigest() + assert result.ids[2] == expected_hash_3, "Third document hash should match expected" + + # Test that duplicate documents are deduplicated (same ID, only one sent) + duplicate_documents: list[BaseRecord] = [ + {"content": "Same content", "metadata": {"source": "test"}}, + {"content": "Same content", "metadata": {"source": "test"}}, + {"content": "Same content", "metadata": {"source": "test"}}, + ] + duplicate_result = _prepare_documents_for_chromadb(duplicate_documents) + # Duplicates should be deduplicated - only one ID should remain + assert len(duplicate_result.ids) == 1, "Duplicate documents should be deduplicated" + assert len(duplicate_result.ids[0]) == 64, "Deduplicated ID should be clean hash" + # Verify it's the expected hash + expected_hash = hashlib.sha256( + f"Same content|{json.dumps({'source': 'test'}, sort_keys=True)}".encode() + ).hexdigest() + assert duplicate_result.ids[0] == expected_hash, "Deduplicated ID should match expected hash" + + +def test_hash_based_id_generation_with_doc_id_in_metadata(mock_vector_db): + """Test that documents with doc_id in metadata use the doc_id directly, not hash-based.""" + from crewai.rag.chromadb.utils import _prepare_documents_for_chromadb + from crewai.rag.types import BaseRecord + + documents_with_doc_id: list[BaseRecord] = [ + {"content": "First document", "metadata": {"doc_id": "custom-id-1", "source": "test1"}}, + {"content": "Second document", "metadata": {"doc_id": "custom-id-2"}}, + ] + + documents_without_doc_id: list[BaseRecord] = [ + {"content": "First document", "metadata": {"source": "test1"}}, + {"content": "Second document"}, + ] + + result_with_doc_id = _prepare_documents_for_chromadb(documents_with_doc_id) + result_without_doc_id = _prepare_documents_for_chromadb(documents_without_doc_id) + + assert result_with_doc_id.ids == ["custom-id-1", "custom-id-2"] + + assert len(result_without_doc_id.ids) == 2 + # Unique documents get 64-character hashes + for doc_id in result_without_doc_id.ids: + assert len(doc_id) == 64, "ID should be 64 characters" + assert all(c in "0123456789abcdef" for c in doc_id), "ID should be hex"