fix run-types

This commit is contained in:
Lorenze Jay
2024-12-16 20:09:54 -08:00
parent c2ed1f2355
commit 356eb07d5f

View File

@@ -1,5 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import Iterator, List, Union from typing import Iterator, List, Union, Dict
from urllib.parse import urlparse from urllib.parse import urlparse
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@@ -8,22 +8,25 @@ from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChu
from docling_core.types.doc.document import DoclingDocument from docling_core.types.doc.document import DoclingDocument
from pydantic import Field from pydantic import Field
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.utilities.logger import Logger
class DoclingSource(BaseFileKnowledgeSource): class DoclingSource(BaseKnowledgeSource):
"""Utility package for converting documents to markdown or json """Utility package for converting documents to markdown or json
This will auto support PDF, DOCX, and TXT, XLSX, files without any additional dependencies. This will auto support PDF, DOCX, and TXT, XLSX, files without any additional dependencies.
""" """
_logger: Logger = Logger(verbose=True)
file_path: List[str] = Field(default=None)
file_paths: List[str] = Field(default_factory=list) file_paths: List[str] = Field(default_factory=list)
document_converter: DocumentConverter = Field(default_factory=DocumentConverter) document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
chunks: List[str] = Field(default_factory=list) chunks: List[str] = Field(default_factory=list)
# We are accepting string urls and validating them if they are valid urls safe_file_paths: List[str] = Field(default_factory=list)
# Overiding content to be a list of DoclingDocuments content: List[DoclingDocument] = Field(default_factory=list)
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list) # type: ignore[assignment]
content: List[DoclingDocument] | None = Field(default=None) # type: ignore[assignment]
def model_post_init(self, _) -> None: def model_post_init(self, _) -> None:
if self.file_path: if self.file_path:
@@ -32,7 +35,7 @@ class DoclingSource(BaseFileKnowledgeSource):
"The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.", "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
color="yellow", color="yellow",
) )
self.file_paths = self.file_path # type: ignore[assignment] self.file_paths = self.file_path
self.safe_file_paths = self._process_file_paths() self.safe_file_paths = self._process_file_paths()
self.document_converter = DocumentConverter( self.document_converter = DocumentConverter(
allowed_formats=[ allowed_formats=[
@@ -48,12 +51,12 @@ class DoclingSource(BaseFileKnowledgeSource):
) )
self.content = self.load_content() self.content = self.load_content()
def load_content(self) -> List[DoclingDocument] | None: # type: ignore[assignment] def load_content(self) -> List[DoclingDocument]: # type: ignore
try: try:
return self.convert_source_to_docling_documents() return self.convert_source_to_docling_documents()
except Exception as e: except Exception as e:
self._logger.log("error", f"Error loading content: {e}") self._logger.log("error", f"Error loading content: {e}")
return None return []
def add(self) -> None: def add(self) -> None:
if self.content is None: if self.content is None:
@@ -67,12 +70,12 @@ class DoclingSource(BaseFileKnowledgeSource):
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths) conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
return [result.document for result in conv_results_iter] return [result.document for result in conv_results_iter]
def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]: # type: ignore[assignment] def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]: # type: ignore[override]
chunker = HierarchicalChunker() chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc): for chunk in chunker.chunk(doc):
yield chunk.text yield chunk.text
def _process_file_paths(self) -> list[Path | str]: # type: ignore[assignment] def _process_file_paths(self) -> List[str]:
processed_paths = [] processed_paths = []
for path in self.file_paths: for path in self.file_paths:
if isinstance(path, str): if isinstance(path, str):
@@ -87,7 +90,7 @@ class DoclingSource(BaseFileKnowledgeSource):
else: else:
local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path) local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
if local_path.exists(): if local_path.exists():
processed_paths.append(local_path) processed_paths.append(local_path.name)
else: else:
raise FileNotFoundError(f"File not found: {local_path}") raise FileNotFoundError(f"File not found: {local_path}")
else: else: