mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-02 15:52:34 +00:00
fixed run types
This commit is contained in:
@@ -28,7 +28,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
|
|||||||
def model_post_init(self, _):
|
def model_post_init(self, _):
|
||||||
"""Post-initialization method to load content."""
|
"""Post-initialization method to load content."""
|
||||||
self.safe_file_paths = self._process_file_paths()
|
self.safe_file_paths = self._process_file_paths()
|
||||||
self.validate_paths()
|
self.validate_content()
|
||||||
self.content = self.load_content()
|
self.content = self.load_content()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@@ -36,7 +36,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
|
|||||||
"""Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
|
"""Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def validate_paths(self):
|
def validate_content(self):
|
||||||
"""Validate the paths."""
|
"""Validate the paths."""
|
||||||
for path in self.safe_file_paths:
|
for path in self.safe_file_paths:
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ class BaseKnowledgeSource(BaseModel, ABC):
|
|||||||
collection_name: Optional[str] = Field(default=None)
|
collection_name: Optional[str] = Field(default=None)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def load_content(self) -> Dict[Any, str]:
|
def validate_content(self) -> Any:
|
||||||
"""Load and preprocess content from the source."""
|
"""Load and preprocess content from the source."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from urllib.parse import urlparse
|
|||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
from docling.exceptions import ConversionError
|
||||||
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
|
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
|
||||||
from docling_core.types.doc.document import DoclingDocument
|
from docling_core.types.doc.document import DoclingDocument
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
@@ -15,28 +16,18 @@ from crewai.utilities.logger import Logger
|
|||||||
|
|
||||||
class DoclingSource(BaseKnowledgeSource):
|
class DoclingSource(BaseKnowledgeSource):
|
||||||
"""Utility package for converting documents to markdown or json
|
"""Utility package for converting documents to markdown or json
|
||||||
This will auto support PDF, DOCX, and TXT, XLSX, files without any additional dependencies.
|
This will auto support PDF, DOCX, and TXT, XLSX, Images, and HTML files without any additional dependencies.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_logger: Logger = Logger(verbose=True)
|
_logger: Logger = Logger(verbose=True)
|
||||||
|
|
||||||
file_path: Optional[List[Union[Path, str]]] = Field(default=None)
|
file_path: Optional[List[Union[Path, str]]] = Field(default=None)
|
||||||
file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
||||||
document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
|
|
||||||
chunks: List[str] = Field(default_factory=list)
|
chunks: List[str] = Field(default_factory=list)
|
||||||
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
||||||
content: List[DoclingDocument] = Field(default_factory=list)
|
content: List[DoclingDocument] = Field(default_factory=list)
|
||||||
|
document_converter: DocumentConverter = Field(
|
||||||
def model_post_init(self, _) -> None:
|
default_factory=lambda: DocumentConverter(
|
||||||
if self.file_path:
|
|
||||||
self._logger.log(
|
|
||||||
"warning",
|
|
||||||
"The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
|
|
||||||
color="yellow",
|
|
||||||
)
|
|
||||||
self.file_paths = self.file_path
|
|
||||||
self.safe_file_paths = self._process_file_paths()
|
|
||||||
self.document_converter = DocumentConverter(
|
|
||||||
allowed_formats=[
|
allowed_formats=[
|
||||||
InputFormat.MD,
|
InputFormat.MD,
|
||||||
InputFormat.ASCIIDOC,
|
InputFormat.ASCIIDOC,
|
||||||
@@ -48,33 +39,51 @@ class DoclingSource(BaseKnowledgeSource):
|
|||||||
InputFormat.PPTX,
|
InputFormat.PPTX,
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def model_post_init(self, _) -> None:
|
||||||
|
if self.file_path:
|
||||||
|
self._logger.log(
|
||||||
|
"warning",
|
||||||
|
"The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
|
||||||
|
color="yellow",
|
||||||
|
)
|
||||||
|
self.file_paths = self.file_path
|
||||||
|
self.safe_file_paths = self.validate_content()
|
||||||
self.content = self.load_content()
|
self.content = self.load_content()
|
||||||
|
|
||||||
def load_content(self) -> List[DoclingDocument]: # type: ignore
|
def load_content(self) -> List[DoclingDocument]:
|
||||||
try:
|
try:
|
||||||
return self.convert_source_to_docling_documents()
|
return self.convert_source_to_docling_documents()
|
||||||
|
except ConversionError as e:
|
||||||
|
self._logger.log(
|
||||||
|
"error",
|
||||||
|
f"Error loading content: {e}. Supported formats: {self.document_converter.allowed_formats}",
|
||||||
|
"red",
|
||||||
|
)
|
||||||
|
raise e
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._logger.log("error", f"Error loading content: {e}")
|
self._logger.log("error", f"Error loading content: {e}")
|
||||||
return []
|
raise e
|
||||||
|
|
||||||
def add(self) -> None:
|
def add(self) -> None:
|
||||||
if self.content is None:
|
if self.content is None:
|
||||||
return
|
return
|
||||||
for doc in self.content:
|
for doc in self.content:
|
||||||
new_chunks = self._chunk_text(doc)
|
new_chunks_iterable = self._chunk_doc(doc)
|
||||||
self.chunks.extend(new_chunks)
|
self.chunks.extend(list(new_chunks_iterable))
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def convert_source_to_docling_documents(self) -> List[DoclingDocument]:
|
def convert_source_to_docling_documents(self) -> List[DoclingDocument]:
|
||||||
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
|
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
|
||||||
return [result.document for result in conv_results_iter]
|
return [result.document for result in conv_results_iter]
|
||||||
|
|
||||||
def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]: # type: ignore[override]
|
def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
|
||||||
chunker = HierarchicalChunker()
|
chunker = HierarchicalChunker()
|
||||||
for chunk in chunker.chunk(doc):
|
for chunk in chunker.chunk(doc):
|
||||||
yield chunk.text
|
yield chunk.text
|
||||||
|
|
||||||
def _process_file_paths(self) -> List[Union[Path, str]]:
|
def validate_content(self) -> List[Union[Path, str]]:
|
||||||
processed_paths: List[Union[Path, str]] = []
|
processed_paths: List[Union[Path, str]] = []
|
||||||
for path in self.file_paths:
|
for path in self.file_paths:
|
||||||
if isinstance(path, str):
|
if isinstance(path, str):
|
||||||
|
|||||||
@@ -13,9 +13,9 @@ class StringKnowledgeSource(BaseKnowledgeSource):
|
|||||||
|
|
||||||
def model_post_init(self, _):
|
def model_post_init(self, _):
|
||||||
"""Post-initialization method to validate content."""
|
"""Post-initialization method to validate content."""
|
||||||
self.load_content()
|
self.validate_content()
|
||||||
|
|
||||||
def load_content(self):
|
def validate_content(self):
|
||||||
"""Validate string content."""
|
"""Validate string content."""
|
||||||
if not isinstance(self.content, str):
|
if not isinstance(self.content, str):
|
||||||
raise ValueError("StringKnowledgeSource only accepts string content")
|
raise ValueError("StringKnowledgeSource only accepts string content")
|
||||||
|
|||||||
Reference in New Issue
Block a user