fixed run types

2026-01-15 02:58:30 +00:00 · 2024-12-17 10:05:36 -08:00
parent ef7a101631
commit 7885c5f906
4 changed files with 33 additions and 24 deletions
--- a/src/crewai/knowledge/source/base_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_file_knowledge_source.py
@@ -28,7 +28,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
    def model_post_init(self, _):
        """Post-initialization method to load content."""
        self.safe_file_paths = self._process_file_paths()
-        self.validate_paths()
+        self.validate_content()
        self.content = self.load_content()

    @abstractmethod
@@ -36,7 +36,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
        """Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
        pass

-    def validate_paths(self):
+    def validate_content(self):
        """Validate the paths."""
        for path in self.safe_file_paths:
            if not path.exists():
--- a/src/crewai/knowledge/source/base_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_knowledge_source.py
@@ -21,7 +21,7 @@ class BaseKnowledgeSource(BaseModel, ABC):
    collection_name: Optional[str] = Field(default=None)

    @abstractmethod
-    def load_content(self) -> Dict[Any, str]:
+    def validate_content(self) -> Any:
        """Load and preprocess content from the source."""
        pass

--- a/src/crewai/knowledge/source/docling_source.py
+++ b/src/crewai/knowledge/source/docling_source.py
@@ -4,6 +4,7 @@ from urllib.parse import urlparse

 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import DocumentConverter
+from docling.exceptions import ConversionError
 from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
 from docling_core.types.doc.document import DoclingDocument
 from pydantic import Field
@@ -15,28 +16,18 @@ from crewai.utilities.logger import Logger

 class DoclingSource(BaseKnowledgeSource):
    """Utility package for converting documents to markdown or json
-    This will auto support PDF, DOCX, and TXT, XLSX, files without any additional dependencies.
+    This will auto support PDF, DOCX, and TXT, XLSX, Images, and HTML files without any additional dependencies.
    """

    _logger: Logger = Logger(verbose=True)

    file_path: Optional[List[Union[Path, str]]] = Field(default=None)
    file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
    chunks: List[str] = Field(default_factory=list)
    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
    content: List[DoclingDocument] = Field(default_factory=list)
-
-    def model_post_init(self, _) -> None:
-        if self.file_path:
-            self._logger.log(
-                "warning",
-                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
-                color="yellow",
-            )
-            self.file_paths = self.file_path
-        self.safe_file_paths = self._process_file_paths()
-        self.document_converter = DocumentConverter(
+    document_converter: DocumentConverter = Field(
+        default_factory=lambda: DocumentConverter(
            allowed_formats=[
                InputFormat.MD,
                InputFormat.ASCIIDOC,
@@ -48,33 +39,51 @@ class DoclingSource(BaseKnowledgeSource):
                InputFormat.PPTX,
            ]
        )
+    )
+
+    def model_post_init(self, _) -> None:
+        if self.file_path:
+            self._logger.log(
+                "warning",
+                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
+                color="yellow",
+            )
+            self.file_paths = self.file_path
+        self.safe_file_paths = self.validate_content()
        self.content = self.load_content()

-    def load_content(self) -> List[DoclingDocument]:  # type: ignore
+    def load_content(self) -> List[DoclingDocument]:
        try:
            return self.convert_source_to_docling_documents()
+        except ConversionError as e:
+            self._logger.log(
+                "error",
+                f"Error loading content: {e}. Supported formats: {self.document_converter.allowed_formats}",
+                "red",
+            )
+            raise e
        except Exception as e:
            self._logger.log("error", f"Error loading content: {e}")
-            return []
+            raise e

    def add(self) -> None:
        if self.content is None:
            return
        for doc in self.content:
-            new_chunks = self._chunk_text(doc)
-            self.chunks.extend(new_chunks)
+            new_chunks_iterable = self._chunk_doc(doc)
+            self.chunks.extend(list(new_chunks_iterable))
        self._save_documents()

    def convert_source_to_docling_documents(self) -> List[DoclingDocument]:
        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
        return [result.document for result in conv_results_iter]

-    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:  # type: ignore[override]
+    def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
        chunker = HierarchicalChunker()
        for chunk in chunker.chunk(doc):
            yield chunk.text

-    def _process_file_paths(self) -> List[Union[Path, str]]:
+    def validate_content(self) -> List[Union[Path, str]]:
        processed_paths: List[Union[Path, str]] = []
        for path in self.file_paths:
            if isinstance(path, str):
--- a/src/crewai/knowledge/source/string_knowledge_source.py
+++ b/src/crewai/knowledge/source/string_knowledge_source.py
@@ -13,9 +13,9 @@ class StringKnowledgeSource(BaseKnowledgeSource):

    def model_post_init(self, _):
        """Post-initialization method to validate content."""
-        self.load_content()
+        self.validate_content()

-    def load_content(self):
+    def validate_content(self):
        """Validate string content."""
        if not isinstance(self.content, str):
            raise ValueError("StringKnowledgeSource only accepts string content")