mirror of
https://github.com/crewAIInc/crewAI.git
synced 2025-12-16 04:18:35 +00:00
Fix CrewDoclingSource filepath metadata misalignment
- Extract filepath from ConversionResult.input.file instead of indexing safe_file_paths - Add content_paths field to track source filepath for each converted document - Ensures correct filepath metadata even when some files fail conversion - Add comprehensive test for filepath metadata with conversion failures Addresses Cursor Bugbot comment on PR #3813 Co-Authored-By: João <joao@crewai.com>
This commit is contained in:
@@ -51,6 +51,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
chunks: list[str] = Field(default_factory=list)
|
chunks: list[str] = Field(default_factory=list)
|
||||||
safe_file_paths: list[Path | str] = Field(default_factory=list)
|
safe_file_paths: list[Path | str] = Field(default_factory=list)
|
||||||
content: list[DoclingDocument] = Field(default_factory=list)
|
content: list[DoclingDocument] = Field(default_factory=list)
|
||||||
|
content_paths: list[Path | str] = Field(default_factory=list)
|
||||||
document_converter: DocumentConverter = Field(
|
document_converter: DocumentConverter = Field(
|
||||||
default_factory=lambda: DocumentConverter(
|
default_factory=lambda: DocumentConverter(
|
||||||
allowed_formats=[
|
allowed_formats=[
|
||||||
@@ -95,7 +96,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
if self.content is None:
|
if self.content is None:
|
||||||
return
|
return
|
||||||
for doc_index, doc in enumerate(self.content):
|
for doc_index, doc in enumerate(self.content):
|
||||||
filepath = self.safe_file_paths[doc_index] if doc_index < len(self.safe_file_paths) else "unknown"
|
filepath = self.content_paths[doc_index] if doc_index < len(self.content_paths) else "unknown"
|
||||||
chunk_index = 0
|
chunk_index = 0
|
||||||
for chunk_text in self._chunk_doc(doc):
|
for chunk_text in self._chunk_doc(doc):
|
||||||
self.chunks.append({
|
self.chunks.append({
|
||||||
@@ -111,7 +112,12 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
|
|
||||||
def _convert_source_to_docling_documents(self) -> list[DoclingDocument]:
|
def _convert_source_to_docling_documents(self) -> list[DoclingDocument]:
|
||||||
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
|
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
|
||||||
return [result.document for result in conv_results_iter]
|
documents = []
|
||||||
|
self.content_paths = []
|
||||||
|
for result in conv_results_iter:
|
||||||
|
documents.append(result.document)
|
||||||
|
self.content_paths.append(result.input.file)
|
||||||
|
return documents
|
||||||
|
|
||||||
def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
|
def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
|
||||||
chunker = HierarchicalChunker()
|
chunker = HierarchicalChunker()
|
||||||
|
|||||||
@@ -398,3 +398,80 @@ class TestBackwardCompatibility:
|
|||||||
assert len(saved_docs) == 2
|
assert len(saved_docs) == 2
|
||||||
assert all("content" in doc for doc in saved_docs)
|
assert all("content" in doc for doc in saved_docs)
|
||||||
assert all("metadata" in doc for doc in saved_docs)
|
assert all("metadata" in doc for doc in saved_docs)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCrewDoclingSourceMetadata:
|
||||||
|
"""Test CrewDoclingSource metadata with conversion failures."""
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
not hasattr(pytest, "importorskip") or pytest.importorskip("docling", reason="docling not available") is None,
|
||||||
|
reason="docling not available"
|
||||||
|
)
|
||||||
|
def test_docling_filepath_metadata_with_conversion_failure(self, tmp_path):
|
||||||
|
"""Test that filepath metadata is correct even when some files fail conversion."""
|
||||||
|
try:
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, Mock
|
||||||
|
from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
|
||||||
|
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
|
||||||
|
|
||||||
|
file1 = tmp_path / "file1.txt"
|
||||||
|
file2 = tmp_path / "file2.txt"
|
||||||
|
file3 = tmp_path / "file3.txt"
|
||||||
|
|
||||||
|
file1.write_text("Content from file 1")
|
||||||
|
file2.write_text("Content from file 2")
|
||||||
|
file3.write_text("Content from file 3")
|
||||||
|
|
||||||
|
mock_doc1 = MagicMock()
|
||||||
|
mock_doc3 = MagicMock()
|
||||||
|
|
||||||
|
mock_result1 = MagicMock()
|
||||||
|
mock_result1.document = mock_doc1
|
||||||
|
mock_result1.input.file = file1
|
||||||
|
|
||||||
|
mock_result3 = MagicMock()
|
||||||
|
mock_result3.document = mock_doc3
|
||||||
|
mock_result3.input.file = file3
|
||||||
|
|
||||||
|
with patch("crewai.knowledge.source.crew_docling_source.DocumentConverter") as mock_converter_class:
|
||||||
|
mock_converter = MagicMock()
|
||||||
|
mock_converter_class.return_value = mock_converter
|
||||||
|
mock_converter.convert_all.return_value = iter([mock_result1, mock_result3])
|
||||||
|
mock_converter.allowed_formats = []
|
||||||
|
|
||||||
|
with patch.object(KnowledgeStorage, 'save') as mock_save:
|
||||||
|
with patch("crewai.knowledge.source.crew_docling_source.CrewDoclingSource._chunk_doc") as mock_chunk:
|
||||||
|
mock_chunk.side_effect = [
|
||||||
|
iter(["Chunk 1 from file1", "Chunk 2 from file1"]),
|
||||||
|
iter(["Chunk 1 from file3", "Chunk 2 from file3"])
|
||||||
|
]
|
||||||
|
|
||||||
|
storage = KnowledgeStorage()
|
||||||
|
source = CrewDoclingSource(
|
||||||
|
file_paths=[file1, file2, file3],
|
||||||
|
storage=storage
|
||||||
|
)
|
||||||
|
|
||||||
|
source.add()
|
||||||
|
|
||||||
|
assert len(source.chunks) == 4
|
||||||
|
|
||||||
|
assert source.chunks[0]["metadata"]["filepath"] == str(file1)
|
||||||
|
assert source.chunks[0]["metadata"]["source_type"] == "docling"
|
||||||
|
assert source.chunks[0]["metadata"]["chunk_index"] == 0
|
||||||
|
|
||||||
|
assert source.chunks[1]["metadata"]["filepath"] == str(file1)
|
||||||
|
assert source.chunks[1]["metadata"]["chunk_index"] == 1
|
||||||
|
|
||||||
|
assert source.chunks[2]["metadata"]["filepath"] == str(file3)
|
||||||
|
assert source.chunks[2]["metadata"]["source_type"] == "docling"
|
||||||
|
assert source.chunks[2]["metadata"]["chunk_index"] == 0
|
||||||
|
|
||||||
|
assert source.chunks[3]["metadata"]["filepath"] == str(file3)
|
||||||
|
assert source.chunks[3]["metadata"]["chunk_index"] == 1
|
||||||
|
|
||||||
|
for chunk in source.chunks:
|
||||||
|
assert chunk["metadata"]["filepath"] != str(file2)
|
||||||
|
except ImportError:
|
||||||
|
pytest.skip("docling not available")
|
||||||
|
|||||||
Reference in New Issue
Block a user