From 054bc266b95ef4d6a3837c1b485d98fe25d28169 Mon Sep 17 00:00:00 2001 From: Lorenze Jay Date: Mon, 16 Dec 2024 16:30:47 -0800 Subject: [PATCH] logged but file_path is backwards compatible --- src/crewai/knowledge/source/docling_source.py | 50 ++++++++++++++----- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/src/crewai/knowledge/source/docling_source.py b/src/crewai/knowledge/source/docling_source.py index df56cdabb..618e4a647 100644 --- a/src/crewai/knowledge/source/docling_source.py +++ b/src/crewai/knowledge/source/docling_source.py @@ -19,11 +19,20 @@ class DoclingSource(BaseFileKnowledgeSource): file_paths: List[str] = Field(default_factory=list) document_converter: DocumentConverter = Field(default_factory=DocumentConverter) - safe_file_paths: List[Union[Path, str]] = Field(default_factory=list) - content: List[DoclingDocument] | None = Field(default=None) chunks: List[str] = Field(default_factory=list) + # We accept string URLs and validate that they are well-formed before use + # Overriding content to be a list of DoclingDocuments + safe_file_paths: List[Union[Path, str]] = Field(default_factory=list) # type: ignore[assignment] + content: List[DoclingDocument] | None = Field(default=None) # type: ignore[assignment] def model_post_init(self, _) -> None: + if self.file_path: + self._logger.log( + "warning", + "The 'file_path' attribute is deprecated and will be removed in a future version. 
Please use 'file_paths' instead.", + color="yellow", + ) + self.file_paths = self.file_path # type: ignore[assignment] self.safe_file_paths = self._process_file_paths() self.document_converter = DocumentConverter( allowed_formats=[ @@ -39,7 +48,7 @@ class DoclingSource(BaseFileKnowledgeSource): ) self.content = self.load_content() - def load_content(self) -> List[DoclingDocument] | None: + def load_content(self) -> List[DoclingDocument] | None: # type: ignore[assignment] try: return self.convert_source_to_docling_documents() except Exception as e: @@ -58,28 +67,43 @@ class DoclingSource(BaseFileKnowledgeSource): conv_results_iter = self.document_converter.convert_all(self.safe_file_paths) return [result.document for result in conv_results_iter] - def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]: + def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]: # type: ignore[assignment] chunker = HierarchicalChunker() for chunk in chunker.chunk(doc): yield chunk.text - def _process_file_paths(self) -> list[Path | str]: + def _process_file_paths(self) -> list[Path | str]: # type: ignore[assignment] processed_paths = [] for path in self.file_paths: - if path.startswith("http"): + if isinstance(path, str): if path.startswith(("http://", "https://")): try: - result = urlparse(path) - if all([result.scheme, result.netloc]): # Basic URL validation + if self._validate_url(path): processed_paths.append(path) else: raise ValueError(f"Invalid URL format: {path}") except Exception as e: raise ValueError(f"Invalid URL: {path}. 
Error: {str(e)}") - else: - local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path) - if local_path.exists(): - processed_paths.append(local_path) else: - raise FileNotFoundError(f"File not found: {local_path}") + local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path) + if local_path.exists(): + processed_paths.append(local_path) + else: + raise FileNotFoundError(f"File not found: {local_path}") + else: + # this is an instance of Path + processed_paths.append(path) return processed_paths + + def _validate_url(self, url: str) -> bool: + try: + result = urlparse(url) + return all( + [ + result.scheme in ("http", "https"), + result.netloc, + len(result.netloc.split(".")) >= 2, # Ensure domain has TLD + ] + ) + except Exception: + return False