mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 00:28:31 +00:00
Fix docling issues
This commit is contained in:
@@ -8,6 +8,7 @@ try:
|
|||||||
from docling.exceptions import ConversionError
|
from docling.exceptions import ConversionError
|
||||||
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
|
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
|
||||||
from docling_core.types.doc.document import DoclingDocument
|
from docling_core.types.doc.document import DoclingDocument
|
||||||
|
|
||||||
DOCLING_AVAILABLE = True
|
DOCLING_AVAILABLE = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
DOCLING_AVAILABLE = False
|
DOCLING_AVAILABLE = False
|
||||||
@@ -38,8 +39,8 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
||||||
chunks: List[str] = Field(default_factory=list)
|
chunks: List[str] = Field(default_factory=list)
|
||||||
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
||||||
content: List[DoclingDocument] = Field(default_factory=list)
|
content: List["DoclingDocument"] = Field(default_factory=list)
|
||||||
document_converter: DocumentConverter = Field(
|
document_converter: "DocumentConverter" = Field(
|
||||||
default_factory=lambda: DocumentConverter(
|
default_factory=lambda: DocumentConverter(
|
||||||
allowed_formats=[
|
allowed_formats=[
|
||||||
InputFormat.MD,
|
InputFormat.MD,
|
||||||
@@ -65,7 +66,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
self.safe_file_paths = self.validate_content()
|
self.safe_file_paths = self.validate_content()
|
||||||
self.content = self._load_content()
|
self.content = self._load_content()
|
||||||
|
|
||||||
def _load_content(self) -> List[DoclingDocument]:
|
def _load_content(self) -> List["DoclingDocument"]:
|
||||||
try:
|
try:
|
||||||
return self._convert_source_to_docling_documents()
|
return self._convert_source_to_docling_documents()
|
||||||
except ConversionError as e:
|
except ConversionError as e:
|
||||||
@@ -87,11 +88,11 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
self.chunks.extend(list(new_chunks_iterable))
|
self.chunks.extend(list(new_chunks_iterable))
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def _convert_source_to_docling_documents(self) -> List[DoclingDocument]:
|
def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
|
||||||
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
|
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
|
||||||
return [result.document for result in conv_results_iter]
|
return [result.document for result in conv_results_iter]
|
||||||
|
|
||||||
def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
|
def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
|
||||||
chunker = HierarchicalChunker()
|
chunker = HierarchicalChunker()
|
||||||
for chunk in chunker.chunk(doc):
|
for chunk in chunker.chunk(doc):
|
||||||
yield chunk.text
|
yield chunk.text
|
||||||
|
|||||||
Reference in New Issue
Block a user