chore: fix ruff linting and mypy issues in knowledge module

2026-01-22 14:48:13 +00:00 · 2025-09-19 21:39:15 -04:00
parent 2cfc4d37b8
commit 8e571ea8a7
9 changed files with 72 additions and 75 deletions
--- a/src/crewai/knowledge/source/base_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_file_knowledge_source.py
@@ -1,6 +1,5 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, List, Optional, Union

 from pydantic import Field, field_validator

@@ -14,19 +13,19 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
    """Base class for knowledge sources that load content from files."""

    _logger: Logger = Logger(verbose=True)
-    file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
+    file_path: Path | list[Path] | str | list[str] | None = Field(
        default=None,
        description="[Deprecated] The path to the file. Use file_paths instead.",
    )
-    file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
+    file_paths: Path | list[Path] | str | list[str] | None = Field(
        default_factory=list, description="The path to the file"
    )
-    content: Dict[Path, str] = Field(init=False, default_factory=dict)
-    storage: Optional[KnowledgeStorage] = Field(default=None)
-    safe_file_paths: List[Path] = Field(default_factory=list)
+    content: dict[Path, str] = Field(init=False, default_factory=dict)
+    storage: KnowledgeStorage | None = Field(default=None)
+    safe_file_paths: list[Path] = Field(default_factory=list)

    @field_validator("file_path", "file_paths", mode="before")
-    def validate_file_path(cls, v, info):
+    def validate_file_path(cls, v, info):  # noqa: N805
        """Validate that at least one of file_path or file_paths is provided."""
        # Single check if both are None, O(1) instead of nested conditions
        if (
@@ -46,9 +45,8 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
        self.content = self.load_content()

    @abstractmethod
-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
        """Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
-        pass

    def validate_content(self):
        """Validate the paths."""
@@ -74,11 +72,11 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
        else:
            raise ValueError("No storage found to save documents.")

-    def convert_to_path(self, path: Union[Path, str]) -> Path:
+    def convert_to_path(self, path: Path | str) -> Path:
        """Convert a path to a Path object."""
        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path

-    def _process_file_paths(self) -> List[Path]:
+    def _process_file_paths(self) -> list[Path]:
        """Convert file_path to a list of Path objects."""

        if hasattr(self, "file_path") and self.file_path is not None:
@@ -93,7 +91,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
            raise ValueError("Your source must be provided with a file_paths: []")

        # Convert single path to list
-        path_list: List[Union[Path, str]] = (
+        path_list: list[Path | str] = (
            [self.file_paths]
            if isinstance(self.file_paths, (str, Path))
            else list(self.file_paths)
--- a/src/crewai/knowledge/source/base_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_knowledge_source.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional
+from typing import Any

 import numpy as np
 from pydantic import BaseModel, ConfigDict, Field
@@ -12,29 +12,27 @@ class BaseKnowledgeSource(BaseModel, ABC):

    chunk_size: int = 4000
    chunk_overlap: int = 200
-    chunks: List[str] = Field(default_factory=list)
-    chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
+    chunks: list[str] = Field(default_factory=list)
+    chunk_embeddings: list[np.ndarray] = Field(default_factory=list)

    model_config = ConfigDict(arbitrary_types_allowed=True)
-    storage: Optional[KnowledgeStorage] = Field(default=None)
-    metadata: Dict[str, Any] = Field(default_factory=dict)  # Currently unused
-    collection_name: Optional[str] = Field(default=None)
+    storage: KnowledgeStorage | None = Field(default=None)
+    metadata: dict[str, Any] = Field(default_factory=dict)  # Currently unused
+    collection_name: str | None = Field(default=None)

    @abstractmethod
    def validate_content(self) -> Any:
        """Load and preprocess content from the source."""
-        pass

    @abstractmethod
    def add(self) -> None:
        """Process content, chunk it, compute embeddings, and save them."""
-        pass

-    def get_embeddings(self) -> List[np.ndarray]:
+    def get_embeddings(self) -> list[np.ndarray]:
        """Return the list of embeddings for the chunks."""
        return self.chunk_embeddings

-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]
--- a/src/crewai/knowledge/source/crew_docling_source.py
+++ b/src/crewai/knowledge/source/crew_docling_source.py
@@ -1,13 +1,21 @@
+from collections.abc import Iterator
 from pathlib import Path
-from typing import Iterator, List, Optional, Union
 from urllib.parse import urlparse

 try:
-    from docling.datamodel.base_models import InputFormat
-    from docling.document_converter import DocumentConverter
-    from docling.exceptions import ConversionError
-    from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
-    from docling_core.types.doc.document import DoclingDocument
+    from docling.datamodel.base_models import (  # type: ignore[import-not-found]
+        InputFormat,
+    )
+    from docling.document_converter import (  # type: ignore[import-not-found]
+        DocumentConverter,
+    )
+    from docling.exceptions import ConversionError  # type: ignore[import-not-found]
+    from docling_core.transforms.chunker.hierarchical_chunker import (  # type: ignore[import-not-found]
+        HierarchicalChunker,
+    )
+    from docling_core.types.doc.document import (  # type: ignore[import-not-found]
+        DoclingDocument,
+    )

    DOCLING_AVAILABLE = True
 except ImportError:
@@ -35,11 +43,11 @@ class CrewDoclingSource(BaseKnowledgeSource):

    _logger: Logger = Logger(verbose=True)

-    file_path: Optional[List[Union[Path, str]]] = Field(default=None)
-    file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    chunks: List[str] = Field(default_factory=list)
-    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    content: List["DoclingDocument"] = Field(default_factory=list)
+    file_path: list[Path | str] | None = Field(default=None)
+    file_paths: list[Path | str] = Field(default_factory=list)
+    chunks: list[str] = Field(default_factory=list)
+    safe_file_paths: list[Path | str] = Field(default_factory=list)
+    content: list["DoclingDocument"] = Field(default_factory=list)
    document_converter: "DocumentConverter" = Field(
        default_factory=lambda: DocumentConverter(
            allowed_formats=[
@@ -66,7 +74,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
        self.safe_file_paths = self.validate_content()
        self.content = self._load_content()

-    def _load_content(self) -> List["DoclingDocument"]:
+    def _load_content(self) -> list["DoclingDocument"]:
        try:
            return self._convert_source_to_docling_documents()
        except ConversionError as e:
@@ -88,7 +96,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
            self.chunks.extend(list(new_chunks_iterable))
        self._save_documents()

-    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
+    def _convert_source_to_docling_documents(self) -> list["DoclingDocument"]:
        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
        return [result.document for result in conv_results_iter]

@@ -97,8 +105,8 @@ class CrewDoclingSource(BaseKnowledgeSource):
        for chunk in chunker.chunk(doc):
            yield chunk.text

-    def validate_content(self) -> List[Union[Path, str]]:
-        processed_paths: List[Union[Path, str]] = []
+    def validate_content(self) -> list[Path | str]:
+        processed_paths: list[Path | str] = []
        for path in self.file_paths:
            if isinstance(path, str):
                if path.startswith(("http://", "https://")):
@@ -108,7 +116,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
                        else:
                            raise ValueError(f"Invalid URL format: {path}")
                    except Exception as e:
-                        raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
+                        raise ValueError(f"Invalid URL: {path}. Error: {e!s}") from e
                else:
                    local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
                    if local_path.exists():
--- a/src/crewai/knowledge/source/csv_knowledge_source.py
+++ b/src/crewai/knowledge/source/csv_knowledge_source.py
@@ -1,6 +1,5 @@
 import csv
 from pathlib import Path
-from typing import Dict, List

 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource

@@ -8,7 +7,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
 class CSVKnowledgeSource(BaseFileKnowledgeSource):
    """A knowledge source that stores and queries CSV file content using embeddings."""

-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
        """Load and preprocess CSV file content."""
        content_dict = {}
        for file_path in self.safe_file_paths:
@@ -32,7 +31,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
        self.chunks.extend(new_chunks)
        self._save_documents()

-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]
--- a/src/crewai/knowledge/source/excel_knowledge_source.py
+++ b/src/crewai/knowledge/source/excel_knowledge_source.py
@@ -1,6 +1,4 @@
 from pathlib import Path
-from typing import Dict, Iterator, List, Optional, Union
-from urllib.parse import urlparse

 from pydantic import Field, field_validator

@@ -16,19 +14,19 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):

    _logger: Logger = Logger(verbose=True)

-    file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
+    file_path: Path | list[Path] | str | list[str] | None = Field(
        default=None,
        description="[Deprecated] The path to the file. Use file_paths instead.",
    )
-    file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
+    file_paths: Path | list[Path] | str | list[str] | None = Field(
        default_factory=list, description="The path to the file"
    )
-    chunks: List[str] = Field(default_factory=list)
-    content: Dict[Path, Dict[str, str]] = Field(default_factory=dict)
-    safe_file_paths: List[Path] = Field(default_factory=list)
+    chunks: list[str] = Field(default_factory=list)
+    content: dict[Path, dict[str, str]] = Field(default_factory=dict)
+    safe_file_paths: list[Path] = Field(default_factory=list)

    @field_validator("file_path", "file_paths", mode="before")
-    def validate_file_path(cls, v, info):
+    def validate_file_path(cls, v, info):  # noqa: N805
        """Validate that at least one of file_path or file_paths is provided."""
        # Single check if both are None, O(1) instead of nested conditions
        if (
@@ -41,7 +39,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
            raise ValueError("Either file_path or file_paths must be provided")
        return v

-    def _process_file_paths(self) -> List[Path]:
+    def _process_file_paths(self) -> list[Path]:
        """Convert file_path to a list of Path objects."""

        if hasattr(self, "file_path") and self.file_path is not None:
@@ -56,7 +54,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
            raise ValueError("Your source must be provided with a file_paths: []")

        # Convert single path to list
-        path_list: List[Union[Path, str]] = (
+        path_list: list[Path | str] = (
            [self.file_paths]
            if isinstance(self.file_paths, (str, Path))
            else list(self.file_paths)
@@ -100,7 +98,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
        self.validate_content()
        self.content = self._load_content()

-    def _load_content(self) -> Dict[Path, Dict[str, str]]:
+    def _load_content(self) -> dict[Path, dict[str, str]]:
        """Load and preprocess Excel file content from multiple sheets.

        Each sheet's content is converted to CSV format and stored.
@@ -126,21 +124,21 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
            content_dict[file_path] = sheet_dict
        return content_dict

-    def convert_to_path(self, path: Union[Path, str]) -> Path:
+    def convert_to_path(self, path: Path | str) -> Path:
        """Convert a path to a Path object."""
        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path

    def _import_dependencies(self):
        """Dynamically import dependencies."""
        try:
-            import pandas as pd
+            import pandas as pd  # type: ignore[import-untyped,import-not-found]

            return pd
        except ImportError as e:
            missing_package = str(e).split()[-1]
            raise ImportError(
                f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
-            )
+            ) from e

    def add(self) -> None:
        """
@@ -161,7 +159,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
        self.chunks.extend(new_chunks)
        self._save_documents()

-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]
--- a/src/crewai/knowledge/source/json_knowledge_source.py
+++ b/src/crewai/knowledge/source/json_knowledge_source.py
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any

 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource

@@ -8,9 +8,9 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
 class JSONKnowledgeSource(BaseFileKnowledgeSource):
    """A knowledge source that stores and queries JSON file content using embeddings."""

-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
        """Load and preprocess JSON file content."""
-        content: Dict[Path, str] = {}
+        content: dict[Path, str] = {}
        for path in self.safe_file_paths:
            path = self.convert_to_path(path)
            with open(path, "r", encoding="utf-8") as json_file:
@@ -29,7 +29,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
            for item in data:
                text += f"{indent}- {self._json_to_text(item, level + 1)}\n"
        else:
-            text += f"{str(data)}"
+            text += f"{data!s}"
        return text

    def add(self) -> None:
@@ -44,7 +44,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
        self.chunks.extend(new_chunks)
        self._save_documents()

-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]
--- a/src/crewai/knowledge/source/pdf_knowledge_source.py
+++ b/src/crewai/knowledge/source/pdf_knowledge_source.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Dict, List

 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource

@@ -7,7 +6,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
 class PDFKnowledgeSource(BaseFileKnowledgeSource):
    """A knowledge source that stores and queries PDF file content using embeddings."""

-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
        """Load and preprocess PDF file content."""
        pdfplumber = self._import_pdfplumber()

@@ -30,22 +29,22 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
            import pdfplumber

            return pdfplumber
-        except ImportError:
+        except ImportError as e:
            raise ImportError(
                "pdfplumber is not installed. Please install it with: pip install pdfplumber"
-            )
+            ) from e

    def add(self) -> None:
        """
        Add PDF file content to the knowledge source, chunk it, compute embeddings,
        and save the embeddings.
        """
-        for _, text in self.content.items():
+        for text in self.content.values():
            new_chunks = self._chunk_text(text)
            self.chunks.extend(new_chunks)
        self._save_documents()

-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]
--- a/src/crewai/knowledge/source/string_knowledge_source.py
+++ b/src/crewai/knowledge/source/string_knowledge_source.py
@@ -1,5 +1,3 @@
-from typing import List, Optional
-
 from pydantic import Field

 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
@@ -9,7 +7,7 @@ class StringKnowledgeSource(BaseKnowledgeSource):
    """A knowledge source that stores and queries plain text content using embeddings."""

    content: str = Field(...)
-    collection_name: Optional[str] = Field(default=None)
+    collection_name: str | None = Field(default=None)

    def model_post_init(self, _):
        """Post-initialization method to validate content."""
@@ -26,7 +24,7 @@ class StringKnowledgeSource(BaseKnowledgeSource):
        self.chunks.extend(new_chunks)
        self._save_documents()

-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]
--- a/src/crewai/knowledge/source/text_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/text_file_knowledge_source.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Dict, List

 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource

@@ -7,7 +6,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
 class TextFileKnowledgeSource(BaseFileKnowledgeSource):
    """A knowledge source that stores and queries text file content using embeddings."""

-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
        """Load and preprocess text file content."""
        content = {}
        for path in self.safe_file_paths:
@@ -21,12 +20,12 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
        Add text file content to the knowledge source, chunk it, compute embeddings,
        and save the embeddings.
        """
-        for _, text in self.content.items():
+        for text in self.content.values():
            new_chunks = self._chunk_text(text)
            self.chunks.extend(new_chunks)
        self._save_documents()

-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]