From 5dc40bff9e30e12b7cadbc4c857141672e247370 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 30 Oct 2025 09:30:37 +0000 Subject: [PATCH] Fix CrewDoclingSource filepath metadata misalignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract filepath from ConversionResult.input.file instead of indexing safe_file_paths - Add content_paths field to track source filepath for each converted document - Ensures correct filepath metadata even when some files fail conversion - Add comprehensive test for filepath metadata with conversion failures Addresses Cursor Bugbot comment on PR #3813 Co-Authored-By: João --- .../knowledge/source/crew_docling_source.py | 10 ++- .../knowledge/test_knowledge_metadata.py | 77 +++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py b/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py index caec422ab..142f44d57 100644 --- a/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py +++ b/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py @@ -51,6 +51,7 @@ class CrewDoclingSource(BaseKnowledgeSource): chunks: list[str] = Field(default_factory=list) safe_file_paths: list[Path | str] = Field(default_factory=list) content: list[DoclingDocument] = Field(default_factory=list) + content_paths: list[Path | str] = Field(default_factory=list) document_converter: DocumentConverter = Field( default_factory=lambda: DocumentConverter( allowed_formats=[ @@ -95,7 +96,7 @@ class CrewDoclingSource(BaseKnowledgeSource): if self.content is None: return for doc_index, doc in enumerate(self.content): - filepath = self.safe_file_paths[doc_index] if doc_index < len(self.safe_file_paths) else "unknown" + filepath = self.content_paths[doc_index] if doc_index < len(self.content_paths) else "unknown" chunk_index = 0 for chunk_text in self._chunk_doc(doc): self.chunks.append({ @@ -111,7 +112,12 @@ class CrewDoclingSource(BaseKnowledgeSource): def _convert_source_to_docling_documents(self) -> list[DoclingDocument]: conv_results_iter = self.document_converter.convert_all(self.safe_file_paths) - return [result.document for result in conv_results_iter] + documents = [] + self.content_paths = [] + for result in conv_results_iter: + documents.append(result.document) + self.content_paths.append(result.input.file) + return documents def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]: chunker = HierarchicalChunker() diff --git a/lib/crewai/tests/knowledge/test_knowledge_metadata.py b/lib/crewai/tests/knowledge/test_knowledge_metadata.py index 1b67f5c32..e5ce74d48 100644 --- a/lib/crewai/tests/knowledge/test_knowledge_metadata.py +++ b/lib/crewai/tests/knowledge/test_knowledge_metadata.py @@ -398,3 +398,80 @@ class TestBackwardCompatibility: assert len(saved_docs) == 2 assert all("content" in doc for doc in saved_docs) assert all("metadata" in doc for doc in saved_docs) + + +class TestCrewDoclingSourceMetadata: + """Test CrewDoclingSource metadata with conversion failures.""" + + @pytest.mark.skipif( + not hasattr(pytest, "importorskip") or pytest.importorskip("docling", reason="docling not available") is None, + reason="docling not available" + ) + def test_docling_filepath_metadata_with_conversion_failure(self, tmp_path): + """Test that filepath metadata is correct even when some files fail conversion.""" + try: + from pathlib import Path + from unittest.mock import MagicMock, Mock + from crewai.knowledge.source.crew_docling_source import CrewDoclingSource + from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage + + file1 = tmp_path / "file1.txt" + file2 = tmp_path / "file2.txt" + file3 = tmp_path / "file3.txt" + + file1.write_text("Content from file 1") + file2.write_text("Content from file 2") + file3.write_text("Content from file 3") + + mock_doc1 = MagicMock() + mock_doc3 = MagicMock() + + mock_result1 = MagicMock() + mock_result1.document = mock_doc1 + mock_result1.input.file = file1 + + mock_result3 = MagicMock() + mock_result3.document = mock_doc3 + mock_result3.input.file = file3 + + with patch("crewai.knowledge.source.crew_docling_source.DocumentConverter") as mock_converter_class: + mock_converter = MagicMock() + mock_converter_class.return_value = mock_converter + mock_converter.convert_all.return_value = iter([mock_result1, mock_result3]) + mock_converter.allowed_formats = [] + + with patch.object(KnowledgeStorage, 'save') as mock_save: + with patch("crewai.knowledge.source.crew_docling_source.CrewDoclingSource._chunk_doc") as mock_chunk: + mock_chunk.side_effect = [ + iter(["Chunk 1 from file1", "Chunk 2 from file1"]), + iter(["Chunk 1 from file3", "Chunk 2 from file3"]) + ] + + storage = KnowledgeStorage() + source = CrewDoclingSource( + file_paths=[file1, file2, file3], + storage=storage + ) + + source.add() + + assert len(source.chunks) == 4 + + assert source.chunks[0]["metadata"]["filepath"] == str(file1) + assert source.chunks[0]["metadata"]["source_type"] == "docling" + assert source.chunks[0]["metadata"]["chunk_index"] == 0 + + assert source.chunks[1]["metadata"]["filepath"] == str(file1) + assert source.chunks[1]["metadata"]["chunk_index"] == 1 + + assert source.chunks[2]["metadata"]["filepath"] == str(file3) + assert source.chunks[2]["metadata"]["source_type"] == "docling" + assert source.chunks[2]["metadata"]["chunk_index"] == 0 + + assert source.chunks[3]["metadata"]["filepath"] == str(file3) + assert source.chunks[3]["metadata"]["chunk_index"] == 1 + + for chunk in source.chunks: + assert chunk["metadata"]["filepath"] != str(file2) + except ImportError: + pytest.skip("docling not available")