From 054bc266b95ef4d6a3837c1b485d98fe25d28169 Mon Sep 17 00:00:00 2001
From: Lorenze Jay <lorenzejaytech@gmail.com>
Date: Mon, 16 Dec 2024 16:30:47 -0800
Subject: [PATCH] logged but file_path is backwards compatible

---
 src/crewai/knowledge/source/docling_source.py | 50 ++++++++++++++-----
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/src/crewai/knowledge/source/docling_source.py b/src/crewai/knowledge/source/docling_source.py
index df56cdabb..618e4a647 100644
--- a/src/crewai/knowledge/source/docling_source.py
+++ b/src/crewai/knowledge/source/docling_source.py
@@ -19,11 +19,20 @@ class DoclingSource(BaseFileKnowledgeSource):
 
     file_paths: List[str] = Field(default_factory=list)
     document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
-    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    content: List[DoclingDocument] | None = Field(default=None)
     chunks: List[str] = Field(default_factory=list)
+    # We accept URL strings here and validate that they are well-formed URLs
+    # Overriding content to be a list of DoclingDocuments
+    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)  # type: ignore[assignment]
+    content: List[DoclingDocument] | None = Field(default=None)  # type: ignore[assignment]
 
     def model_post_init(self, _) -> None:
+        if self.file_path:
+            self._logger.log(
+                "warning",
+                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
+                color="yellow",
+            )
+            self.file_paths = self.file_path  # type: ignore[assignment]
         self.safe_file_paths = self._process_file_paths()
         self.document_converter = DocumentConverter(
             allowed_formats=[
@@ -39,7 +48,7 @@ class DoclingSource(BaseFileKnowledgeSource):
         )
         self.content = self.load_content()
 
-    def load_content(self) -> List[DoclingDocument] | None:
+    def load_content(self) -> List[DoclingDocument] | None:  # type: ignore[assignment]
         try:
             return self.convert_source_to_docling_documents()
         except Exception as e:
@@ -58,28 +67,43 @@ class DoclingSource(BaseFileKnowledgeSource):
         conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
         return [result.document for result in conv_results_iter]
 
-    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:
+    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:  # type: ignore[assignment]
         chunker = HierarchicalChunker()
         for chunk in chunker.chunk(doc):
             yield chunk.text
 
-    def _process_file_paths(self) -> list[Path | str]:
+    def _process_file_paths(self) -> list[Path | str]:  # type: ignore[assignment]
         processed_paths = []
         for path in self.file_paths:
-            if path.startswith("http"):
+            if isinstance(path, str):
                 if path.startswith(("http://", "https://")):
                     try:
-                        result = urlparse(path)
-                        if all([result.scheme, result.netloc]):  # Basic URL validation
+                        if self._validate_url(path):
                             processed_paths.append(path)
                         else:
                             raise ValueError(f"Invalid URL format: {path}")
                     except Exception as e:
                         raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
-            else:
-                local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
-                if local_path.exists():
-                    processed_paths.append(local_path)
                 else:
-                    raise FileNotFoundError(f"File not found: {local_path}")
+                    local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
+                    if local_path.exists():
+                        processed_paths.append(local_path)
+                    else:
+                        raise FileNotFoundError(f"File not found: {local_path}")
+            else:
+                # this is an instance of Path
+                processed_paths.append(path)
         return processed_paths
+
+    def _validate_url(self, url: str) -> bool:
+        try:
+            result = urlparse(url)
+            return all(
+                [
+                    result.scheme in ("http", "https"),
+                    result.netloc,
+                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
+                ]
+            )
+        except Exception:
+            return False