logged but file_path is backwards compatible

2026-01-09 08:08:32 +00:00 · 2024-12-16 16:30:47 -08:00
parent f1c9caa8ec
commit 054bc266b9
1 changed files with 37 additions and 13 deletions
--- a/src/crewai/knowledge/source/docling_source.py
+++ b/src/crewai/knowledge/source/docling_source.py
@@ -19,11 +19,20 @@ class DoclingSource(BaseFileKnowledgeSource):
    file_paths: List[str] = Field(default_factory=list)
    document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
    content: List[DoclingDocument] | None = Field(default=None)
    chunks: List[str] = Field(default_factory=list)
    # String URLs are accepted here; each one is checked to be a valid URL before use.
    # Overriding content to be a list of DoclingDocuments
    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)  # type: ignore[assignment]
    content: List[DoclingDocument] | None = Field(default=None)  # type: ignore[assignment]
    def model_post_init(self, _) -> None:
        if self.file_path:
            self._logger.log(
                "warning",
                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
                color="yellow",
            )
            self.file_paths = self.file_path  # type: ignore[assignment]
        self.safe_file_paths = self._process_file_paths()
        self.document_converter = DocumentConverter(
            allowed_formats=[
@@ -39,7 +48,7 @@ class DoclingSource(BaseFileKnowledgeSource):
        )
        self.content = self.load_content()
-    def load_content(self) -> List[DoclingDocument] | None:
+    def load_content(self) -> List[DoclingDocument] | None:  # type: ignore[assignment]
        try:
            return self.convert_source_to_docling_documents()
        except Exception as e:
@@ -58,28 +67,43 @@ class DoclingSource(BaseFileKnowledgeSource):
        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
        return [result.document for result in conv_results_iter]
-    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:
+    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:  # type: ignore[assignment]
        chunker = HierarchicalChunker()
        for chunk in chunker.chunk(doc):
            yield chunk.text
-    def _process_file_paths(self) -> list[Path | str]:
+    def _process_file_paths(self) -> list[Path | str]:  # type: ignore[assignment]
        processed_paths = []
        for path in self.file_paths:
-            if path.startswith("http"):
+            if isinstance(path, str):
                if path.startswith(("http://", "https://")):
                    try:
-                        result = urlparse(path)
+                        if self._validate_url(path):
                        if all([result.scheme, result.netloc]):  # Basic URL validation
                            processed_paths.append(path)
                        else:
                            raise ValueError(f"Invalid URL format: {path}")
                    except Exception as e:
                        raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
            else:
                local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
                if local_path.exists():
                    processed_paths.append(local_path)
                else:
-                    raise FileNotFoundError(f"File not found: {local_path}")
+                    local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
                    if local_path.exists():
                        processed_paths.append(local_path)
                    else:
                        raise FileNotFoundError(f"File not found: {local_path}")
            else:
                # this is an instance of Path
                processed_paths.append(path)
        return processed_paths
    def _validate_url(self, url: str) -> bool:
        try:
            result = urlparse(url)
            return all(
                [
                    result.scheme in ("http", "https"),
                    result.netloc,
                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
                ]
            )
        except Exception:
            return False