mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 08:38:30 +00:00
Feat/docling-support (#1763)
* added tool for docling support
* docling support installation
* use file_paths instead of file_path
* fix import
* organized imports
* run_type docs
* needs to be list
* fixed logic
* logged but file_path is backwards compatible
* use file_paths instead of file_path 2
* added test for multiple sources for file_paths
* fix run-types
* enabling local files to work and type cleanup
* linted
* fix test and types
* fixed run types
* fix types
* renamed to CrewDoclingSource
* linted
* added docs
* resolve conflicts

---------

Co-authored-by: Brandon Hancock (bhancock_ai) <109994880+bhancockio@users.noreply.github.com>
Co-authored-by: Brandon Hancock <brandon@brandonhancock.io>
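The commit adds CrewDoclingSource, a knowledge source that converts local files and URLs into docling documents and chunks them for retrieval. A minimal usage sketch follows; the file name and URL are placeholders, and attaching the source to a crew via knowledge_sources is an assumption based on the crewAI knowledge API rather than something shown in this diff:

from crewai.knowledge.source.crew_docling_source import CrewDoclingSource

# Local paths are resolved relative to the knowledge directory;
# http(s) URLs are validated and then fetched by docling directly.
source = CrewDoclingSource(
    file_paths=[
        "report.pdf",                     # placeholder local file
        "https://example.com/page.html",  # placeholder URL
    ],
)

The deprecated file_path keyword still works: it logs a deprecation warning and is copied into file_paths, as the code below shows.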
120 src/crewai/knowledge/source/crew_docling_source.py (new file)
@@ -0,0 +1,120 @@
from pathlib import Path
from typing import Iterator, List, Optional, Union
from urllib.parse import urlparse

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter
from docling.exceptions import ConversionError
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types.doc.document import DoclingDocument
from pydantic import Field

from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
from crewai.utilities.logger import Logger


class CrewDoclingSource(BaseKnowledgeSource):
    """Default source class for converting documents to markdown or JSON.

    Automatically supports PDF, DOCX, TXT, XLSX, image, and HTML files
    without any additional dependencies, following the docling package as
    the source of truth.
    """

    _logger: Logger = Logger(verbose=True)

    # Deprecated: kept for backwards compatibility; use file_paths instead.
    file_path: Optional[List[Union[Path, str]]] = Field(default=None)
    file_paths: List[Union[Path, str]] = Field(default_factory=list)
    chunks: List[str] = Field(default_factory=list)
    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
    content: List[DoclingDocument] = Field(default_factory=list)
    document_converter: DocumentConverter = Field(
        default_factory=lambda: DocumentConverter(
            allowed_formats=[
                InputFormat.MD,
                InputFormat.ASCIIDOC,
                InputFormat.PDF,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.IMAGE,
                InputFormat.XLSX,
                InputFormat.PPTX,
            ]
        )
    )

    def model_post_init(self, _) -> None:
        if self.file_path:
            self._logger.log(
                "warning",
                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
                color="yellow",
            )
            self.file_paths = self.file_path
        self.safe_file_paths = self.validate_content()
        self.content = self._load_content()

    def _load_content(self) -> List[DoclingDocument]:
        try:
            return self._convert_source_to_docling_documents()
        except ConversionError as e:
            self._logger.log(
                "error",
                f"Error loading content: {e}. Supported formats: {self.document_converter.allowed_formats}",
                "red",
            )
            raise e
        except Exception as e:
            self._logger.log("error", f"Error loading content: {e}")
            raise e

    def add(self) -> None:
        if self.content is None:
            return
        for doc in self.content:
            new_chunks_iterable = self._chunk_doc(doc)
            self.chunks.extend(list(new_chunks_iterable))
        self._save_documents()

    def _convert_source_to_docling_documents(self) -> List[DoclingDocument]:
        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
        return [result.document for result in conv_results_iter]

    def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
        chunker = HierarchicalChunker()
        for chunk in chunker.chunk(doc):
            yield chunk.text

    def validate_content(self) -> List[Union[Path, str]]:
        processed_paths: List[Union[Path, str]] = []
        for path in self.file_paths:
            if isinstance(path, str):
                if path.startswith(("http://", "https://")):
                    try:
                        if self._validate_url(path):
                            processed_paths.append(path)
                        else:
                            raise ValueError(f"Invalid URL format: {path}")
                    except Exception as e:
                        raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
                else:
                    # Plain strings are resolved relative to the knowledge directory.
                    local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
                    if local_path.exists():
                        processed_paths.append(local_path)
                    else:
                        raise FileNotFoundError(f"File not found: {local_path}")
            else:
                # This is already a Path instance; pass it through unchanged.
                processed_paths.append(path)
        return processed_paths

    def _validate_url(self, url: str) -> bool:
        try:
            result = urlparse(url)
            return all(
                [
                    result.scheme in ("http", "https"),
                    result.netloc,
                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
                ]
            )
        except Exception:
            return False
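For reference, a minimal sketch of the conversion-and-chunking pipeline the class wraps, using the same docling calls that appear in the diff above (the sample path is a placeholder):

from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker

converter = DocumentConverter()
# convert_all accepts a list of local paths and/or URLs, as in
# _convert_source_to_docling_documents above.
results = converter.convert_all(["report.pdf"])  # placeholder path
docs = [result.document for result in results]

# HierarchicalChunker yields chunks whose .text attribute holds the chunk body,
# which is what _chunk_doc extends self.chunks with.
chunker = HierarchicalChunker()
chunks = [chunk.text for doc in docs for chunk in chunker.chunk(doc)]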