fix: Address lint issues in external knowledge directory implementation

- Remove unnecessary variable assignment in paths.py (RET504)
- Add proper exception chaining in crew_docling_source.py and excel_knowledge_source.py (B904)
- Use next(iter(...)) instead of list(...)[0] in test files (RUF015)

Note: N805 error about cls vs self in field_validator is a false positive -
Pydantic field validators correctly use cls as first parameter

Co-Authored-By: João <joao@crewai.com>
This commit is contained in:
Devin AI
2025-09-16 00:20:08 +00:00
parent 101cee8a27
commit 77065f2151
4 changed files with 47 additions and 52 deletions

View File

@@ -1,5 +1,5 @@
from collections.abc import Iterator
from pathlib import Path from pathlib import Path
from typing import Iterator, List, Optional, Union
from urllib.parse import urlparse from urllib.parse import urlparse
try: try:
@@ -16,7 +16,6 @@ except ImportError:
from pydantic import Field from pydantic import Field
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
from crewai.utilities.logger import Logger from crewai.utilities.logger import Logger
from crewai.utilities.paths import get_knowledge_directory from crewai.utilities.paths import get_knowledge_directory
@@ -36,11 +35,11 @@ class CrewDoclingSource(BaseKnowledgeSource):
_logger: Logger = Logger(verbose=True) _logger: Logger = Logger(verbose=True)
file_path: Optional[List[Union[Path, str]]] = Field(default=None) file_path: list[Path | str] | None = Field(default=None)
file_paths: List[Union[Path, str]] = Field(default_factory=list) file_paths: list[Path | str] = Field(default_factory=list)
chunks: List[str] = Field(default_factory=list) chunks: list[str] = Field(default_factory=list)
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list) safe_file_paths: list[Path | str] = Field(default_factory=list)
content: List["DoclingDocument"] = Field(default_factory=list) content: list["DoclingDocument"] = Field(default_factory=list)
document_converter: "DocumentConverter" = Field( document_converter: "DocumentConverter" = Field(
default_factory=lambda: DocumentConverter( default_factory=lambda: DocumentConverter(
allowed_formats=[ allowed_formats=[
@@ -67,7 +66,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
self.safe_file_paths = self.validate_content() self.safe_file_paths = self.validate_content()
self.content = self._load_content() self.content = self._load_content()
def _load_content(self) -> List["DoclingDocument"]: def _load_content(self) -> list["DoclingDocument"]:
try: try:
return self._convert_source_to_docling_documents() return self._convert_source_to_docling_documents()
except ConversionError as e: except ConversionError as e:
@@ -89,7 +88,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
self.chunks.extend(list(new_chunks_iterable)) self.chunks.extend(list(new_chunks_iterable))
self._save_documents() self._save_documents()
def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]: def _convert_source_to_docling_documents(self) -> list["DoclingDocument"]:
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths) conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
return [result.document for result in conv_results_iter] return [result.document for result in conv_results_iter]
@@ -98,8 +97,8 @@ class CrewDoclingSource(BaseKnowledgeSource):
for chunk in chunker.chunk(doc): for chunk in chunker.chunk(doc):
yield chunk.text yield chunk.text
def validate_content(self) -> List[Union[Path, str]]: def validate_content(self) -> list[Path | str]:
processed_paths: List[Union[Path, str]] = [] processed_paths: list[Path | str] = []
for path in self.file_paths: for path in self.file_paths:
if isinstance(path, str): if isinstance(path, str):
if path.startswith(("http://", "https://")): if path.startswith(("http://", "https://")):
@@ -109,7 +108,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
else: else:
raise ValueError(f"Invalid URL format: {path}") raise ValueError(f"Invalid URL format: {path}")
except Exception as e: except Exception as e:
raise ValueError(f"Invalid URL: {path}. Error: {str(e)}") raise ValueError(f"Invalid URL: {path}. Error: {e!s}") from e
else: else:
local_path = Path(get_knowledge_directory() + "/" + path) local_path = Path(get_knowledge_directory() + "/" + path)
if local_path.exists(): if local_path.exists():

View File

@@ -1,11 +1,8 @@
from pathlib import Path from pathlib import Path
from typing import Dict, Iterator, List, Optional, Union
from urllib.parse import urlparse
from pydantic import Field, field_validator from pydantic import Field, field_validator
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
from crewai.utilities.logger import Logger from crewai.utilities.logger import Logger
from crewai.utilities.paths import get_knowledge_directory from crewai.utilities.paths import get_knowledge_directory
@@ -17,16 +14,16 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
_logger: Logger = Logger(verbose=True) _logger: Logger = Logger(verbose=True)
file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field( file_path: Path | list[Path] | str | list[str] | None = Field(
default=None, default=None,
description="[Deprecated] The path to the file. Use file_paths instead.", description="[Deprecated] The path to the file. Use file_paths instead.",
) )
file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field( file_paths: Path | list[Path] | str | list[str] | None = Field(
default_factory=list, description="The path to the file" default_factory=list, description="The path to the file"
) )
chunks: List[str] = Field(default_factory=list) chunks: list[str] = Field(default_factory=list)
content: Dict[Path, Dict[str, str]] = Field(default_factory=dict) content: dict[Path, dict[str, str]] = Field(default_factory=dict)
safe_file_paths: List[Path] = Field(default_factory=list) safe_file_paths: list[Path] = Field(default_factory=list)
@field_validator("file_path", "file_paths", mode="before") @field_validator("file_path", "file_paths", mode="before")
def validate_file_path(cls, v, info): def validate_file_path(cls, v, info):
@@ -42,7 +39,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
raise ValueError("Either file_path or file_paths must be provided") raise ValueError("Either file_path or file_paths must be provided")
return v return v
def _process_file_paths(self) -> List[Path]: def _process_file_paths(self) -> list[Path]:
"""Convert file_path to a list of Path objects.""" """Convert file_path to a list of Path objects."""
if hasattr(self, "file_path") and self.file_path is not None: if hasattr(self, "file_path") and self.file_path is not None:
@@ -57,7 +54,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
raise ValueError("Your source must be provided with a file_paths: []") raise ValueError("Your source must be provided with a file_paths: []")
# Convert single path to list # Convert single path to list
path_list: List[Union[Path, str]] = ( path_list: list[Path | str] = (
[self.file_paths] [self.file_paths]
if isinstance(self.file_paths, (str, Path)) if isinstance(self.file_paths, (str, Path))
else list(self.file_paths) else list(self.file_paths)
@@ -101,7 +98,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
self.validate_content() self.validate_content()
self.content = self._load_content() self.content = self._load_content()
def _load_content(self) -> Dict[Path, Dict[str, str]]: def _load_content(self) -> dict[Path, dict[str, str]]:
"""Load and preprocess Excel file content from multiple sheets. """Load and preprocess Excel file content from multiple sheets.
Each sheet's content is converted to CSV format and stored. Each sheet's content is converted to CSV format and stored.
@@ -127,7 +124,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
content_dict[file_path] = sheet_dict content_dict[file_path] = sheet_dict
return content_dict return content_dict
def convert_to_path(self, path: Union[Path, str]) -> Path: def convert_to_path(self, path: Path | str) -> Path:
"""Convert a path to a Path object.""" """Convert a path to a Path object."""
return Path(get_knowledge_directory() + "/" + path) if isinstance(path, str) else path return Path(get_knowledge_directory() + "/" + path) if isinstance(path, str) else path
@@ -141,7 +138,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
missing_package = str(e).split()[-1] missing_package = str(e).split()[-1]
raise ImportError( raise ImportError(
f"{missing_package} is not installed. Please install it with: pip install {missing_package}" f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
) ) from e
def add(self) -> None: def add(self) -> None:
""" """
@@ -162,7 +159,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
self._save_documents() self._save_documents()
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> list[str]:
"""Utility method to split text into chunks.""" """Utility method to split text into chunks."""
return [ return [
text[i : i + self.chunk_size] text[i : i + self.chunk_size]

View File

@@ -25,10 +25,8 @@ def get_project_directory_name():
if project_directory_name: if project_directory_name:
return project_directory_name return project_directory_name
else:
cwd = Path.cwd() cwd = Path.cwd()
project_directory_name = cwd.name return cwd.name
return project_directory_name
def get_knowledge_directory(): def get_knowledge_directory():
@@ -40,5 +38,4 @@ def get_knowledge_directory():
if not knowledge_path.exists(): if not knowledge_path.exists():
raise ValueError(f"Knowledge directory does not exist: {knowledge_dir}") raise ValueError(f"Knowledge directory does not exist: {knowledge_dir}")
return str(knowledge_path) return str(knowledge_path)
else:
return "knowledge" return "knowledge"

View File

@@ -1,9 +1,11 @@
import os import os
import tempfile import tempfile
from pathlib import Path from pathlib import Path
import pytest import pytest
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
class TestExternalKnowledgeDirectory: class TestExternalKnowledgeDirectory:
@@ -19,7 +21,7 @@ class TestExternalKnowledgeDirectory:
source = TextFileKnowledgeSource(file_paths=["test.txt"]) source = TextFileKnowledgeSource(file_paths=["test.txt"])
assert len(source.content) == 1 assert len(source.content) == 1
loaded_content = list(source.content.values())[0] loaded_content = next(iter(source.content.values()))
assert loaded_content == test_content assert loaded_content == test_content
finally: finally:
@@ -38,7 +40,7 @@ class TestExternalKnowledgeDirectory:
source = JSONKnowledgeSource(file_paths=["test.json"]) source = JSONKnowledgeSource(file_paths=["test.json"])
assert len(source.content) == 1 assert len(source.content) == 1
loaded_content = list(source.content.values())[0] loaded_content = next(iter(source.content.values()))
assert "John" in loaded_content assert "John" in loaded_content
assert "30" in loaded_content assert "30" in loaded_content
assert "New York" in loaded_content assert "New York" in loaded_content
@@ -62,7 +64,7 @@ class TestExternalKnowledgeDirectory:
source = TextFileKnowledgeSource(file_paths=["test_fallback.txt"]) source = TextFileKnowledgeSource(file_paths=["test_fallback.txt"])
assert len(source.content) == 1 assert len(source.content) == 1
loaded_content = list(source.content.values())[0] loaded_content = next(iter(source.content.values()))
assert loaded_content == test_content assert loaded_content == test_content
finally: finally:
@@ -82,7 +84,7 @@ class TestExternalKnowledgeDirectory:
source = TextFileKnowledgeSource(file_paths=[str(test_file)]) source = TextFileKnowledgeSource(file_paths=[str(test_file)])
assert len(source.content) == 1 assert len(source.content) == 1
loaded_content = list(source.content.values())[0] loaded_content = next(iter(source.content.values()))
assert loaded_content == test_content assert loaded_content == test_content
finally: finally: