Fix CrewDoclingSource filepath metadata misalignment

- Extract filepath from ConversionResult.input.file instead of indexing safe_file_paths - Add content_paths field to track source filepath for each converted document - Ensures correct filepath metadata even when some files fail conversion - Add comprehensive test for filepath metadata with conversion failures Addresses Cursor Bugbot comment on PR #3813 Co-Authored-By: João <joao@crewai.com>
2025-12-15 20:08:29 +00:00 · 2025-10-30 09:30:37 +00:00
parent 58c5a2670a
commit 5dc40bff9e
2 changed files with 85 additions and 2 deletions
--- a/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py
+++ b/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py
@@ -51,6 +51,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
    chunks: list[str] = Field(default_factory=list)
    safe_file_paths: list[Path | str] = Field(default_factory=list)
    content: list[DoclingDocument] = Field(default_factory=list)
+    content_paths: list[Path | str] = Field(default_factory=list)
    document_converter: DocumentConverter = Field(
        default_factory=lambda: DocumentConverter(
            allowed_formats=[
@@ -95,7 +96,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
        if self.content is None:
            return
        for doc_index, doc in enumerate(self.content):
-            filepath = self.safe_file_paths[doc_index] if doc_index < len(self.safe_file_paths) else "unknown"
+            filepath = self.content_paths[doc_index] if doc_index < len(self.content_paths) else "unknown"
            chunk_index = 0
            for chunk_text in self._chunk_doc(doc):
                self.chunks.append({
@@ -111,7 +112,12 @@ class CrewDoclingSource(BaseKnowledgeSource):

    def _convert_source_to_docling_documents(self) -> list[DoclingDocument]:
        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
-        return [result.document for result in conv_results_iter]
+        documents = []
+        self.content_paths = []
+        for result in conv_results_iter:
+            documents.append(result.document)
+            self.content_paths.append(result.input.file)
+        return documents

    def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
        chunker = HierarchicalChunker()
--- a/lib/crewai/tests/knowledge/test_knowledge_metadata.py
+++ b/lib/crewai/tests/knowledge/test_knowledge_metadata.py
@@ -398,3 +398,80 @@ class TestBackwardCompatibility:
            assert len(saved_docs) == 2
            assert all("content" in doc for doc in saved_docs)
            assert all("metadata" in doc for doc in saved_docs)
+
+
+class TestCrewDoclingSourceMetadata:
+    """Test CrewDoclingSource metadata with conversion failures."""
+
+    @pytest.mark.skipif(
+        not hasattr(pytest, "importorskip") or pytest.importorskip("docling", reason="docling not available") is None,
+        reason="docling not available"
+    )
+    def test_docling_filepath_metadata_with_conversion_failure(self, tmp_path):
+        """Test that filepath metadata is correct even when some files fail conversion."""
+        try:
+            from pathlib import Path
+            from unittest.mock import MagicMock, Mock
+            from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
+            from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
+            
+            file1 = tmp_path / "file1.txt"
+            file2 = tmp_path / "file2.txt"
+            file3 = tmp_path / "file3.txt"
+            
+            file1.write_text("Content from file 1")
+            file2.write_text("Content from file 2")
+            file3.write_text("Content from file 3")
+            
+            mock_doc1 = MagicMock()
+            mock_doc3 = MagicMock()
+            
+            mock_result1 = MagicMock()
+            mock_result1.document = mock_doc1
+            mock_result1.input.file = file1
+            
+            mock_result3 = MagicMock()
+            mock_result3.document = mock_doc3
+            mock_result3.input.file = file3
+            
+            with patch("crewai.knowledge.source.crew_docling_source.DocumentConverter") as mock_converter_class:
+                mock_converter = MagicMock()
+                mock_converter_class.return_value = mock_converter
+                mock_converter.convert_all.return_value = iter([mock_result1, mock_result3])
+                mock_converter.allowed_formats = []
+                
+                with patch.object(KnowledgeStorage, 'save') as mock_save:
+                    with patch("crewai.knowledge.source.crew_docling_source.CrewDoclingSource._chunk_doc") as mock_chunk:
+                        mock_chunk.side_effect = [
+                            iter(["Chunk 1 from file1", "Chunk 2 from file1"]),
+                            iter(["Chunk 1 from file3", "Chunk 2 from file3"])
+                        ]
+                        
+                        storage = KnowledgeStorage()
+                        source = CrewDoclingSource(
+                            file_paths=[file1, file2, file3],
+                            storage=storage
+                        )
+                        
+                        source.add()
+                        
+                        assert len(source.chunks) == 4
+                        
+                        assert source.chunks[0]["metadata"]["filepath"] == str(file1)
+                        assert source.chunks[0]["metadata"]["source_type"] == "docling"
+                        assert source.chunks[0]["metadata"]["chunk_index"] == 0
+                        
+                        assert source.chunks[1]["metadata"]["filepath"] == str(file1)
+                        assert source.chunks[1]["metadata"]["chunk_index"] == 1
+                        
+                        assert source.chunks[2]["metadata"]["filepath"] == str(file3)
+                        assert source.chunks[2]["metadata"]["source_type"] == "docling"
+                        assert source.chunks[2]["metadata"]["chunk_index"] == 0
+                        
+                        assert source.chunks[3]["metadata"]["filepath"] == str(file3)
+                        assert source.chunks[3]["metadata"]["chunk_index"] == 1
+                        
+                        for chunk in source.chunks:
+                            assert chunk["metadata"]["filepath"] != str(file2)
+        except ImportError:
+            pytest.skip("docling not available")