diff --git a/src/crewai/knowledge/source/base_file_knowledge_source.py b/src/crewai/knowledge/source/base_file_knowledge_source.py
index 4c4b9b337..42af18736 100644
--- a/src/crewai/knowledge/source/base_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_file_knowledge_source.py
@@ -1,6 +1,5 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, List, Optional, Union
 
 from pydantic import Field, field_validator
 
@@ -14,19 +13,19 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
     """Base class for knowledge sources that load content from files."""
 
     _logger: Logger = Logger(verbose=True)
-    file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
+    file_path: Path | list[Path] | str | list[str] | None = Field(
         default=None,
         description="[Deprecated] The path to the file. Use file_paths instead.",
     )
-    file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
+    file_paths: Path | list[Path] | str | list[str] | None = Field(
         default_factory=list, description="The path to the file"
     )
-    content: Dict[Path, str] = Field(init=False, default_factory=dict)
-    storage: Optional[KnowledgeStorage] = Field(default=None)
-    safe_file_paths: List[Path] = Field(default_factory=list)
+    content: dict[Path, str] = Field(init=False, default_factory=dict)
+    storage: KnowledgeStorage | None = Field(default=None)
+    safe_file_paths: list[Path] = Field(default_factory=list)
 
     @field_validator("file_path", "file_paths", mode="before")
-    def validate_file_path(cls, v, info):
+    def validate_file_path(cls, v, info):  # noqa: N805
         """Validate that at least one of file_path or file_paths is provided."""
         # Single check if both are None, O(1) instead of nested conditions
         if (
@@ -46,9 +45,8 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
         self.content = self.load_content()
 
     @abstractmethod
-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
         """Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
-        pass
 
     def validate_content(self):
         """Validate the paths."""
@@ -74,11 +72,11 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
         else:
             raise ValueError("No storage found to save documents.")
 
-    def convert_to_path(self, path: Union[Path, str]) -> Path:
+    def convert_to_path(self, path: Path | str) -> Path:
         """Convert a path to a Path object."""
         return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
-    def _process_file_paths(self) -> List[Path]:
+    def _process_file_paths(self) -> list[Path]:
         """Convert file_path to a list of Path objects."""
 
         if hasattr(self, "file_path") and self.file_path is not None:
@@ -93,7 +91,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
             raise ValueError("Your source must be provided with a file_paths: []")
 
         # Convert single path to list
-        path_list: List[Union[Path, str]] = (
+        path_list: list[Path | str] = (
             [self.file_paths]
             if isinstance(self.file_paths, (str, Path))
             else list(self.file_paths)
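
Note (not part of the patch): the annotation changes above swap typing.Optional/Union/List/Dict for PEP 604 unions and PEP 585 builtin generics. A minimal sketch of the equivalence, assuming Python 3.10+ (or `from __future__ import annotations` when the new syntax is only needed inside annotations); `OldStyle`, `NewStyle`, and `index_paths` are hypothetical names used only for illustration:

    from pathlib import Path
    from typing import Dict, List, Optional, Union

    # The two spellings below describe the same set of accepted types.
    OldStyle = Optional[Union[Path, List[Path], str, List[str]]]
    NewStyle = Path | list[Path] | str | list[str] | None

    def index_paths(paths: list[Path | str]) -> dict[Path, str]:
        # Builtin generics (list[...], dict[...]) replace typing.List / typing.Dict.
        return {Path(p): str(p) for p in paths}

The `# noqa: N805` added to the validators silences pep8-naming's "first argument of a method should be `self`" check; pydantic's field_validator runs these as classmethods, so `cls` is the intended first argument.
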
diff --git a/src/crewai/knowledge/source/base_knowledge_source.py b/src/crewai/knowledge/source/base_knowledge_source.py
index b558a4b9a..b62dd0f04 100644
--- a/src/crewai/knowledge/source/base_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_knowledge_source.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import numpy as np
 from pydantic import BaseModel, ConfigDict, Field
@@ -12,29 +12,27 @@ class BaseKnowledgeSource(BaseModel, ABC):
 
     chunk_size: int = 4000
     chunk_overlap: int = 200
-    chunks: List[str] = Field(default_factory=list)
-    chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
+    chunks: list[str] = Field(default_factory=list)
+    chunk_embeddings: list[np.ndarray] = Field(default_factory=list)
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
-    storage: Optional[KnowledgeStorage] = Field(default=None)
-    metadata: Dict[str, Any] = Field(default_factory=dict)  # Currently unused
-    collection_name: Optional[str] = Field(default=None)
+    storage: KnowledgeStorage | None = Field(default=None)
+    metadata: dict[str, Any] = Field(default_factory=dict)  # Currently unused
+    collection_name: str | None = Field(default=None)
 
     @abstractmethod
     def validate_content(self) -> Any:
         """Load and preprocess content from the source."""
-        pass
 
     @abstractmethod
     def add(self) -> None:
         """Process content, chunk it, compute embeddings, and save them."""
-        pass
 
-    def get_embeddings(self) -> List[np.ndarray]:
+    def get_embeddings(self) -> list[np.ndarray]:
         """Return the list of embeddings for the chunks."""
         return self.chunk_embeddings
 
-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
         """Utility method to split text into chunks."""
         return [
             text[i : i + self.chunk_size]
diff --git a/src/crewai/knowledge/source/crew_docling_source.py b/src/crewai/knowledge/source/crew_docling_source.py
index 6ca0ae967..9a371866c 100644
--- a/src/crewai/knowledge/source/crew_docling_source.py
+++ b/src/crewai/knowledge/source/crew_docling_source.py
@@ -1,13 +1,21 @@
+from collections.abc import Iterator
 from pathlib import Path
-from typing import Iterator, List, Optional, Union
 from urllib.parse import urlparse
 
 try:
-    from docling.datamodel.base_models import InputFormat
-    from docling.document_converter import DocumentConverter
-    from docling.exceptions import ConversionError
-    from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
-    from docling_core.types.doc.document import DoclingDocument
+    from docling.datamodel.base_models import (  # type: ignore[import-not-found]
+        InputFormat,
+    )
+    from docling.document_converter import (  # type: ignore[import-not-found]
+        DocumentConverter,
+    )
+    from docling.exceptions import ConversionError  # type: ignore[import-not-found]
+    from docling_core.transforms.chunker.hierarchical_chunker import (  # type: ignore[import-not-found]
+        HierarchicalChunker,
+    )
+    from docling_core.types.doc.document import (  # type: ignore[import-not-found]
+        DoclingDocument,
+    )
 
     DOCLING_AVAILABLE = True
 except ImportError:
@@ -35,11 +43,11 @@ class CrewDoclingSource(BaseKnowledgeSource):
 
     _logger: Logger = Logger(verbose=True)
 
-    file_path: Optional[List[Union[Path, str]]] = Field(default=None)
-    file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    chunks: List[str] = Field(default_factory=list)
-    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    content: List["DoclingDocument"] = Field(default_factory=list)
+    file_path: list[Path | str] | None = Field(default=None)
+    file_paths: list[Path | str] = Field(default_factory=list)
+    chunks: list[str] = Field(default_factory=list)
+    safe_file_paths: list[Path | str] = Field(default_factory=list)
+    content: list["DoclingDocument"] = Field(default_factory=list)
     document_converter: "DocumentConverter" = Field(
         default_factory=lambda: DocumentConverter(
             allowed_formats=[
@@ -66,7 +74,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
         self.safe_file_paths = self.validate_content()
         self.content = self._load_content()
 
-    def _load_content(self) -> List["DoclingDocument"]:
+    def _load_content(self) -> list["DoclingDocument"]:
         try:
             return self._convert_source_to_docling_documents()
         except ConversionError as e:
@@ -88,7 +96,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
             self.chunks.extend(list(new_chunks_iterable))
         self._save_documents()
 
-    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
+    def _convert_source_to_docling_documents(self) -> list["DoclingDocument"]:
         conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
         return [result.document for result in conv_results_iter]
 
@@ -97,8 +105,8 @@ class CrewDoclingSource(BaseKnowledgeSource):
         for chunk in chunker.chunk(doc):
             yield chunk.text
 
-    def validate_content(self) -> List[Union[Path, str]]:
-        processed_paths: List[Union[Path, str]] = []
+    def validate_content(self) -> list[Path | str]:
+        processed_paths: list[Path | str] = []
         for path in self.file_paths:
             if isinstance(path, str):
                 if path.startswith(("http://", "https://")):
@@ -108,7 +116,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
                         else:
                             raise ValueError(f"Invalid URL format: {path}")
                     except Exception as e:
-                        raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
+                        raise ValueError(f"Invalid URL: {path}. Error: {e!s}") from e
                 else:
                     local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
                     if local_path.exists():
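
Note (not part of the patch): crew_docling_source.py guards its optional docling dependency behind try/except and records the result in DOCLING_AVAILABLE, and the new `# type: ignore[import-not-found]` comments keep mypy quiet when the package or its stubs are absent. A minimal sketch of the same pattern with hypothetical names (`fancylib`, `FancyParser`, `FANCYLIB_AVAILABLE`, `FancySource` are stand-ins):

    try:
        from fancylib import FancyParser  # type: ignore[import-not-found]

        FANCYLIB_AVAILABLE = True
    except ImportError:
        FANCYLIB_AVAILABLE = False


    class FancySource:
        def __init__(self) -> None:
            if not FANCYLIB_AVAILABLE:
                # Fail early with an actionable message instead of a NameError later on.
                raise ImportError("fancylib is required; install it with: pip install fancylib")
            self.parser = FancyParser()
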
diff --git a/src/crewai/knowledge/source/csv_knowledge_source.py b/src/crewai/knowledge/source/csv_knowledge_source.py
index 3bb0714d9..dc7401598 100644
--- a/src/crewai/knowledge/source/csv_knowledge_source.py
+++ b/src/crewai/knowledge/source/csv_knowledge_source.py
@@ -1,6 +1,5 @@
 import csv
 from pathlib import Path
-from typing import Dict, List
 
 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
 
@@ -8,7 +7,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
 class CSVKnowledgeSource(BaseFileKnowledgeSource):
     """A knowledge source that stores and queries CSV file content using embeddings."""
 
-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
         """Load and preprocess CSV file content."""
         content_dict = {}
         for file_path in self.safe_file_paths:
@@ -32,7 +31,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
         self.chunks.extend(new_chunks)
         self._save_documents()
 
-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
         """Utility method to split text into chunks."""
         return [
             text[i : i + self.chunk_size]
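
Note (not part of the patch): every source here splits text with the same fixed-window slicing shown in `_chunk_text`. A sketch of that behaviour, assuming the stride is `chunk_size - chunk_overlap` as in the base class (the truncated hunks above only show the slice expression); `chunk_text` is a hypothetical standalone name:

    def chunk_text(text: str, chunk_size: int = 4000, chunk_overlap: int = 200) -> list[str]:
        # Consecutive chunks share `chunk_overlap` characters.
        step = chunk_size - chunk_overlap
        return [text[i : i + chunk_size] for i in range(0, len(text), step)]

    assert chunk_text("abcdefghij", chunk_size=4, chunk_overlap=1) == ["abcd", "defg", "ghij", "j"]

With the defaults (chunk_size=4000, chunk_overlap=200) the window advances 3800 characters per chunk.
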
Error: {e!s}") from e else: local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path) if local_path.exists(): diff --git a/src/crewai/knowledge/source/csv_knowledge_source.py b/src/crewai/knowledge/source/csv_knowledge_source.py index 3bb0714d9..dc7401598 100644 --- a/src/crewai/knowledge/source/csv_knowledge_source.py +++ b/src/crewai/knowledge/source/csv_knowledge_source.py @@ -1,6 +1,5 @@ import csv from pathlib import Path -from typing import Dict, List from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource @@ -8,7 +7,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge class CSVKnowledgeSource(BaseFileKnowledgeSource): """A knowledge source that stores and queries CSV file content using embeddings.""" - def load_content(self) -> Dict[Path, str]: + def load_content(self) -> dict[Path, str]: """Load and preprocess CSV file content.""" content_dict = {} for file_path in self.safe_file_paths: @@ -32,7 +31,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource): self.chunks.extend(new_chunks) self._save_documents() - def _chunk_text(self, text: str) -> List[str]: + def _chunk_text(self, text: str) -> list[str]: """Utility method to split text into chunks.""" return [ text[i : i + self.chunk_size] diff --git a/src/crewai/knowledge/source/excel_knowledge_source.py b/src/crewai/knowledge/source/excel_knowledge_source.py index a73afb1df..3c33e8803 100644 --- a/src/crewai/knowledge/source/excel_knowledge_source.py +++ b/src/crewai/knowledge/source/excel_knowledge_source.py @@ -1,6 +1,4 @@ from pathlib import Path -from typing import Dict, Iterator, List, Optional, Union -from urllib.parse import urlparse from pydantic import Field, field_validator @@ -16,19 +14,19 @@ class ExcelKnowledgeSource(BaseKnowledgeSource): _logger: Logger = Logger(verbose=True) - file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field( + file_path: Path | list[Path] | str | list[str] | None = Field( default=None, description="[Deprecated] The path to the file. 
diff --git a/src/crewai/knowledge/source/json_knowledge_source.py b/src/crewai/knowledge/source/json_knowledge_source.py
index b02d438e6..0e5c847e2 100644
--- a/src/crewai/knowledge/source/json_knowledge_source.py
+++ b/src/crewai/knowledge/source/json_knowledge_source.py
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any
 
 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
 
@@ -8,9 +8,9 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
 class JSONKnowledgeSource(BaseFileKnowledgeSource):
     """A knowledge source that stores and queries JSON file content using embeddings."""
 
-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
         """Load and preprocess JSON file content."""
-        content: Dict[Path, str] = {}
+        content: dict[Path, str] = {}
         for path in self.safe_file_paths:
             path = self.convert_to_path(path)
             with open(path, "r", encoding="utf-8") as json_file:
@@ -29,7 +29,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
             for item in data:
                 text += f"{indent}- {self._json_to_text(item, level + 1)}\n"
         else:
-            text += f"{str(data)}"
+            text += f"{data!s}"
         return text
 
     def add(self) -> None:
@@ -44,7 +44,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         self.chunks.extend(new_chunks)
         self._save_documents()
 
-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
         """Utility method to split text into chunks."""
         return [
             text[i : i + self.chunk_size]
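
Note (not part of the patch): the `!s` conversion used in the rewritten f-strings (`f"{data!s}"`, `f"{e!s}"`) applies str() to the value before formatting, so it is equivalent to the older `f"{str(data)}"` spelling. A one-line check with throwaway data:

    data = {"a": 1}
    assert f"{data!s}" == f"{str(data)}" == str(data)
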
diff --git a/src/crewai/knowledge/source/pdf_knowledge_source.py b/src/crewai/knowledge/source/pdf_knowledge_source.py
index 38cd67807..7fa663b92 100644
--- a/src/crewai/knowledge/source/pdf_knowledge_source.py
+++ b/src/crewai/knowledge/source/pdf_knowledge_source.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Dict, List
 
 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
 
@@ -7,7 +6,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
 class PDFKnowledgeSource(BaseFileKnowledgeSource):
     """A knowledge source that stores and queries PDF file content using embeddings."""
 
-    def load_content(self) -> Dict[Path, str]:
+    def load_content(self) -> dict[Path, str]:
         """Load and preprocess PDF file content."""
         pdfplumber = self._import_pdfplumber()
 
@@ -30,22 +29,22 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
             import pdfplumber
 
             return pdfplumber
-        except ImportError:
+        except ImportError as e:
             raise ImportError(
                 "pdfplumber is not installed. Please install it with: pip install pdfplumber"
-            )
+            ) from e
 
     def add(self) -> None:
         """
         Add PDF file content to the knowledge source, chunk it,
         compute embeddings, and save the embeddings.
         """
-        for _, text in self.content.items():
+        for text in self.content.values():
             new_chunks = self._chunk_text(text)
             self.chunks.extend(new_chunks)
         self._save_documents()
 
-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str) -> list[str]:
         """Utility method to split text into chunks."""
         return [
             text[i : i + self.chunk_size]
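
Note (not part of the patch): where the loop key was only a throwaway `_`, the patch iterates `self.content.values()` directly instead of unpacking `.items()`. A minimal sketch with hypothetical data showing the two are equivalent:

    content = {"a.pdf": "text from file A", "b.pdf": "text from file B"}

    old_style = [text for _, text in content.items()]
    new_style = [text for text in content.values()]
    assert old_style == new_style
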
""" - for _, text in self.content.items(): + for text in self.content.values(): new_chunks = self._chunk_text(text) self.chunks.extend(new_chunks) self._save_documents() - def _chunk_text(self, text: str) -> List[str]: + def _chunk_text(self, text: str) -> list[str]: """Utility method to split text into chunks.""" return [ text[i : i + self.chunk_size] diff --git a/src/crewai/knowledge/source/string_knowledge_source.py b/src/crewai/knowledge/source/string_knowledge_source.py index 614303b1f..97473d9d3 100644 --- a/src/crewai/knowledge/source/string_knowledge_source.py +++ b/src/crewai/knowledge/source/string_knowledge_source.py @@ -1,5 +1,3 @@ -from typing import List, Optional - from pydantic import Field from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource @@ -9,7 +7,7 @@ class StringKnowledgeSource(BaseKnowledgeSource): """A knowledge source that stores and queries plain text content using embeddings.""" content: str = Field(...) - collection_name: Optional[str] = Field(default=None) + collection_name: str | None = Field(default=None) def model_post_init(self, _): """Post-initialization method to validate content.""" @@ -26,7 +24,7 @@ class StringKnowledgeSource(BaseKnowledgeSource): self.chunks.extend(new_chunks) self._save_documents() - def _chunk_text(self, text: str) -> List[str]: + def _chunk_text(self, text: str) -> list[str]: """Utility method to split text into chunks.""" return [ text[i : i + self.chunk_size] diff --git a/src/crewai/knowledge/source/text_file_knowledge_source.py b/src/crewai/knowledge/source/text_file_knowledge_source.py index ddb1f2516..93a3e2849 100644 --- a/src/crewai/knowledge/source/text_file_knowledge_source.py +++ b/src/crewai/knowledge/source/text_file_knowledge_source.py @@ -1,5 +1,4 @@ from pathlib import Path -from typing import Dict, List from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource @@ -7,7 +6,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge class TextFileKnowledgeSource(BaseFileKnowledgeSource): """A knowledge source that stores and queries text file content using embeddings.""" - def load_content(self) -> Dict[Path, str]: + def load_content(self) -> dict[Path, str]: """Load and preprocess text file content.""" content = {} for path in self.safe_file_paths: @@ -21,12 +20,12 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource): Add text file content to the knowledge source, chunk it, compute embeddings, and save the embeddings. """ - for _, text in self.content.items(): + for text in self.content.values(): new_chunks = self._chunk_text(text) self.chunks.extend(new_chunks) self._save_documents() - def _chunk_text(self, text: str) -> List[str]: + def _chunk_text(self, text: str) -> list[str]: """Utility method to split text into chunks.""" return [ text[i : i + self.chunk_size]