mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-02 15:52:34 +00:00
chore: fix ruff linting and mypy issues in knowledge module
Some checks failed
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Notify Downstream / notify-downstream (push) Has been cancelled
Update Test Durations / update-durations (3.10) (push) Has been cancelled
Update Test Durations / update-durations (3.11) (push) Has been cancelled
Update Test Durations / update-durations (3.12) (push) Has been cancelled
Update Test Durations / update-durations (3.13) (push) Has been cancelled
Build uv cache / build-cache (3.10) (push) Has been cancelled
Build uv cache / build-cache (3.11) (push) Has been cancelled
Build uv cache / build-cache (3.12) (push) Has been cancelled
Build uv cache / build-cache (3.13) (push) Has been cancelled
Some checks failed
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Notify Downstream / notify-downstream (push) Has been cancelled
Update Test Durations / update-durations (3.10) (push) Has been cancelled
Update Test Durations / update-durations (3.11) (push) Has been cancelled
Update Test Durations / update-durations (3.12) (push) Has been cancelled
Update Test Durations / update-durations (3.13) (push) Has been cancelled
Build uv cache / build-cache (3.10) (push) Has been cancelled
Build uv cache / build-cache (3.11) (push) Has been cancelled
Build uv cache / build-cache (3.12) (push) Has been cancelled
Build uv cache / build-cache (3.13) (push) Has been cancelled
This commit is contained in:
@@ -1,6 +1,5 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Union
|
|
||||||
|
|
||||||
from pydantic import Field, field_validator
|
from pydantic import Field, field_validator
|
||||||
|
|
||||||
@@ -14,19 +13,19 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
|
|||||||
"""Base class for knowledge sources that load content from files."""
|
"""Base class for knowledge sources that load content from files."""
|
||||||
|
|
||||||
_logger: Logger = Logger(verbose=True)
|
_logger: Logger = Logger(verbose=True)
|
||||||
file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
|
file_path: Path | list[Path] | str | list[str] | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="[Deprecated] The path to the file. Use file_paths instead.",
|
description="[Deprecated] The path to the file. Use file_paths instead.",
|
||||||
)
|
)
|
||||||
file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
|
file_paths: Path | list[Path] | str | list[str] | None = Field(
|
||||||
default_factory=list, description="The path to the file"
|
default_factory=list, description="The path to the file"
|
||||||
)
|
)
|
||||||
content: Dict[Path, str] = Field(init=False, default_factory=dict)
|
content: dict[Path, str] = Field(init=False, default_factory=dict)
|
||||||
storage: Optional[KnowledgeStorage] = Field(default=None)
|
storage: KnowledgeStorage | None = Field(default=None)
|
||||||
safe_file_paths: List[Path] = Field(default_factory=list)
|
safe_file_paths: list[Path] = Field(default_factory=list)
|
||||||
|
|
||||||
@field_validator("file_path", "file_paths", mode="before")
|
@field_validator("file_path", "file_paths", mode="before")
|
||||||
def validate_file_path(cls, v, info):
|
def validate_file_path(cls, v, info): # noqa: N805
|
||||||
"""Validate that at least one of file_path or file_paths is provided."""
|
"""Validate that at least one of file_path or file_paths is provided."""
|
||||||
# Single check if both are None, O(1) instead of nested conditions
|
# Single check if both are None, O(1) instead of nested conditions
|
||||||
if (
|
if (
|
||||||
@@ -46,9 +45,8 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
|
|||||||
self.content = self.load_content()
|
self.content = self.load_content()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def load_content(self) -> Dict[Path, str]:
|
def load_content(self) -> dict[Path, str]:
|
||||||
"""Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
|
"""Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
|
||||||
pass
|
|
||||||
|
|
||||||
def validate_content(self):
|
def validate_content(self):
|
||||||
"""Validate the paths."""
|
"""Validate the paths."""
|
||||||
@@ -74,11 +72,11 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
|
|||||||
else:
|
else:
|
||||||
raise ValueError("No storage found to save documents.")
|
raise ValueError("No storage found to save documents.")
|
||||||
|
|
||||||
def convert_to_path(self, path: Union[Path, str]) -> Path:
|
def convert_to_path(self, path: Path | str) -> Path:
|
||||||
"""Convert a path to a Path object."""
|
"""Convert a path to a Path object."""
|
||||||
return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
|
return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
|
||||||
|
|
||||||
def _process_file_paths(self) -> List[Path]:
|
def _process_file_paths(self) -> list[Path]:
|
||||||
"""Convert file_path to a list of Path objects."""
|
"""Convert file_path to a list of Path objects."""
|
||||||
|
|
||||||
if hasattr(self, "file_path") and self.file_path is not None:
|
if hasattr(self, "file_path") and self.file_path is not None:
|
||||||
@@ -93,7 +91,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
|
|||||||
raise ValueError("Your source must be provided with a file_paths: []")
|
raise ValueError("Your source must be provided with a file_paths: []")
|
||||||
|
|
||||||
# Convert single path to list
|
# Convert single path to list
|
||||||
path_list: List[Union[Path, str]] = (
|
path_list: list[Path | str] = (
|
||||||
[self.file_paths]
|
[self.file_paths]
|
||||||
if isinstance(self.file_paths, (str, Path))
|
if isinstance(self.file_paths, (str, Path))
|
||||||
else list(self.file_paths)
|
else list(self.file_paths)
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pydantic import BaseModel, ConfigDict, Field
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
@@ -12,29 +12,27 @@ class BaseKnowledgeSource(BaseModel, ABC):
|
|||||||
|
|
||||||
chunk_size: int = 4000
|
chunk_size: int = 4000
|
||||||
chunk_overlap: int = 200
|
chunk_overlap: int = 200
|
||||||
chunks: List[str] = Field(default_factory=list)
|
chunks: list[str] = Field(default_factory=list)
|
||||||
chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
|
chunk_embeddings: list[np.ndarray] = Field(default_factory=list)
|
||||||
|
|
||||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
storage: Optional[KnowledgeStorage] = Field(default=None)
|
storage: KnowledgeStorage | None = Field(default=None)
|
||||||
metadata: Dict[str, Any] = Field(default_factory=dict) # Currently unused
|
metadata: dict[str, Any] = Field(default_factory=dict) # Currently unused
|
||||||
collection_name: Optional[str] = Field(default=None)
|
collection_name: str | None = Field(default=None)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def validate_content(self) -> Any:
|
def validate_content(self) -> Any:
|
||||||
"""Load and preprocess content from the source."""
|
"""Load and preprocess content from the source."""
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def add(self) -> None:
|
def add(self) -> None:
|
||||||
"""Process content, chunk it, compute embeddings, and save them."""
|
"""Process content, chunk it, compute embeddings, and save them."""
|
||||||
pass
|
|
||||||
|
|
||||||
def get_embeddings(self) -> List[np.ndarray]:
|
def get_embeddings(self) -> list[np.ndarray]:
|
||||||
"""Return the list of embeddings for the chunks."""
|
"""Return the list of embeddings for the chunks."""
|
||||||
return self.chunk_embeddings
|
return self.chunk_embeddings
|
||||||
|
|
||||||
def _chunk_text(self, text: str) -> List[str]:
|
def _chunk_text(self, text: str) -> list[str]:
|
||||||
"""Utility method to split text into chunks."""
|
"""Utility method to split text into chunks."""
|
||||||
return [
|
return [
|
||||||
text[i : i + self.chunk_size]
|
text[i : i + self.chunk_size]
|
||||||
|
|||||||
@@ -1,13 +1,21 @@
|
|||||||
|
from collections.abc import Iterator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterator, List, Optional, Union
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import ( # type: ignore[import-not-found]
|
||||||
from docling.document_converter import DocumentConverter
|
InputFormat,
|
||||||
from docling.exceptions import ConversionError
|
)
|
||||||
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
|
from docling.document_converter import ( # type: ignore[import-not-found]
|
||||||
from docling_core.types.doc.document import DoclingDocument
|
DocumentConverter,
|
||||||
|
)
|
||||||
|
from docling.exceptions import ConversionError # type: ignore[import-not-found]
|
||||||
|
from docling_core.transforms.chunker.hierarchical_chunker import ( # type: ignore[import-not-found]
|
||||||
|
HierarchicalChunker,
|
||||||
|
)
|
||||||
|
from docling_core.types.doc.document import ( # type: ignore[import-not-found]
|
||||||
|
DoclingDocument,
|
||||||
|
)
|
||||||
|
|
||||||
DOCLING_AVAILABLE = True
|
DOCLING_AVAILABLE = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -35,11 +43,11 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
|
|
||||||
_logger: Logger = Logger(verbose=True)
|
_logger: Logger = Logger(verbose=True)
|
||||||
|
|
||||||
file_path: Optional[List[Union[Path, str]]] = Field(default=None)
|
file_path: list[Path | str] | None = Field(default=None)
|
||||||
file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
file_paths: list[Path | str] = Field(default_factory=list)
|
||||||
chunks: List[str] = Field(default_factory=list)
|
chunks: list[str] = Field(default_factory=list)
|
||||||
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
|
safe_file_paths: list[Path | str] = Field(default_factory=list)
|
||||||
content: List["DoclingDocument"] = Field(default_factory=list)
|
content: list["DoclingDocument"] = Field(default_factory=list)
|
||||||
document_converter: "DocumentConverter" = Field(
|
document_converter: "DocumentConverter" = Field(
|
||||||
default_factory=lambda: DocumentConverter(
|
default_factory=lambda: DocumentConverter(
|
||||||
allowed_formats=[
|
allowed_formats=[
|
||||||
@@ -66,7 +74,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
self.safe_file_paths = self.validate_content()
|
self.safe_file_paths = self.validate_content()
|
||||||
self.content = self._load_content()
|
self.content = self._load_content()
|
||||||
|
|
||||||
def _load_content(self) -> List["DoclingDocument"]:
|
def _load_content(self) -> list["DoclingDocument"]:
|
||||||
try:
|
try:
|
||||||
return self._convert_source_to_docling_documents()
|
return self._convert_source_to_docling_documents()
|
||||||
except ConversionError as e:
|
except ConversionError as e:
|
||||||
@@ -88,7 +96,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
self.chunks.extend(list(new_chunks_iterable))
|
self.chunks.extend(list(new_chunks_iterable))
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
|
def _convert_source_to_docling_documents(self) -> list["DoclingDocument"]:
|
||||||
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
|
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
|
||||||
return [result.document for result in conv_results_iter]
|
return [result.document for result in conv_results_iter]
|
||||||
|
|
||||||
@@ -97,8 +105,8 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
for chunk in chunker.chunk(doc):
|
for chunk in chunker.chunk(doc):
|
||||||
yield chunk.text
|
yield chunk.text
|
||||||
|
|
||||||
def validate_content(self) -> List[Union[Path, str]]:
|
def validate_content(self) -> list[Path | str]:
|
||||||
processed_paths: List[Union[Path, str]] = []
|
processed_paths: list[Path | str] = []
|
||||||
for path in self.file_paths:
|
for path in self.file_paths:
|
||||||
if isinstance(path, str):
|
if isinstance(path, str):
|
||||||
if path.startswith(("http://", "https://")):
|
if path.startswith(("http://", "https://")):
|
||||||
@@ -108,7 +116,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid URL format: {path}")
|
raise ValueError(f"Invalid URL format: {path}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
|
raise ValueError(f"Invalid URL: {path}. Error: {e!s}") from e
|
||||||
else:
|
else:
|
||||||
local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
|
local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
|
||||||
if local_path.exists():
|
if local_path.exists():
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import csv
|
import csv
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List
|
|
||||||
|
|
||||||
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||||
|
|
||||||
@@ -8,7 +7,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
|
|||||||
class CSVKnowledgeSource(BaseFileKnowledgeSource):
|
class CSVKnowledgeSource(BaseFileKnowledgeSource):
|
||||||
"""A knowledge source that stores and queries CSV file content using embeddings."""
|
"""A knowledge source that stores and queries CSV file content using embeddings."""
|
||||||
|
|
||||||
def load_content(self) -> Dict[Path, str]:
|
def load_content(self) -> dict[Path, str]:
|
||||||
"""Load and preprocess CSV file content."""
|
"""Load and preprocess CSV file content."""
|
||||||
content_dict = {}
|
content_dict = {}
|
||||||
for file_path in self.safe_file_paths:
|
for file_path in self.safe_file_paths:
|
||||||
@@ -32,7 +31,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
|
|||||||
self.chunks.extend(new_chunks)
|
self.chunks.extend(new_chunks)
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def _chunk_text(self, text: str) -> List[str]:
|
def _chunk_text(self, text: str) -> list[str]:
|
||||||
"""Utility method to split text into chunks."""
|
"""Utility method to split text into chunks."""
|
||||||
return [
|
return [
|
||||||
text[i : i + self.chunk_size]
|
text[i : i + self.chunk_size]
|
||||||
|
|||||||
@@ -1,6 +1,4 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Iterator, List, Optional, Union
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
from pydantic import Field, field_validator
|
from pydantic import Field, field_validator
|
||||||
|
|
||||||
@@ -16,19 +14,19 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
|
|||||||
|
|
||||||
_logger: Logger = Logger(verbose=True)
|
_logger: Logger = Logger(verbose=True)
|
||||||
|
|
||||||
file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
|
file_path: Path | list[Path] | str | list[str] | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="[Deprecated] The path to the file. Use file_paths instead.",
|
description="[Deprecated] The path to the file. Use file_paths instead.",
|
||||||
)
|
)
|
||||||
file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
|
file_paths: Path | list[Path] | str | list[str] | None = Field(
|
||||||
default_factory=list, description="The path to the file"
|
default_factory=list, description="The path to the file"
|
||||||
)
|
)
|
||||||
chunks: List[str] = Field(default_factory=list)
|
chunks: list[str] = Field(default_factory=list)
|
||||||
content: Dict[Path, Dict[str, str]] = Field(default_factory=dict)
|
content: dict[Path, dict[str, str]] = Field(default_factory=dict)
|
||||||
safe_file_paths: List[Path] = Field(default_factory=list)
|
safe_file_paths: list[Path] = Field(default_factory=list)
|
||||||
|
|
||||||
@field_validator("file_path", "file_paths", mode="before")
|
@field_validator("file_path", "file_paths", mode="before")
|
||||||
def validate_file_path(cls, v, info):
|
def validate_file_path(cls, v, info): # noqa: N805
|
||||||
"""Validate that at least one of file_path or file_paths is provided."""
|
"""Validate that at least one of file_path or file_paths is provided."""
|
||||||
# Single check if both are None, O(1) instead of nested conditions
|
# Single check if both are None, O(1) instead of nested conditions
|
||||||
if (
|
if (
|
||||||
@@ -41,7 +39,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
|
|||||||
raise ValueError("Either file_path or file_paths must be provided")
|
raise ValueError("Either file_path or file_paths must be provided")
|
||||||
return v
|
return v
|
||||||
|
|
||||||
def _process_file_paths(self) -> List[Path]:
|
def _process_file_paths(self) -> list[Path]:
|
||||||
"""Convert file_path to a list of Path objects."""
|
"""Convert file_path to a list of Path objects."""
|
||||||
|
|
||||||
if hasattr(self, "file_path") and self.file_path is not None:
|
if hasattr(self, "file_path") and self.file_path is not None:
|
||||||
@@ -56,7 +54,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
|
|||||||
raise ValueError("Your source must be provided with a file_paths: []")
|
raise ValueError("Your source must be provided with a file_paths: []")
|
||||||
|
|
||||||
# Convert single path to list
|
# Convert single path to list
|
||||||
path_list: List[Union[Path, str]] = (
|
path_list: list[Path | str] = (
|
||||||
[self.file_paths]
|
[self.file_paths]
|
||||||
if isinstance(self.file_paths, (str, Path))
|
if isinstance(self.file_paths, (str, Path))
|
||||||
else list(self.file_paths)
|
else list(self.file_paths)
|
||||||
@@ -100,7 +98,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
|
|||||||
self.validate_content()
|
self.validate_content()
|
||||||
self.content = self._load_content()
|
self.content = self._load_content()
|
||||||
|
|
||||||
def _load_content(self) -> Dict[Path, Dict[str, str]]:
|
def _load_content(self) -> dict[Path, dict[str, str]]:
|
||||||
"""Load and preprocess Excel file content from multiple sheets.
|
"""Load and preprocess Excel file content from multiple sheets.
|
||||||
|
|
||||||
Each sheet's content is converted to CSV format and stored.
|
Each sheet's content is converted to CSV format and stored.
|
||||||
@@ -126,21 +124,21 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
|
|||||||
content_dict[file_path] = sheet_dict
|
content_dict[file_path] = sheet_dict
|
||||||
return content_dict
|
return content_dict
|
||||||
|
|
||||||
def convert_to_path(self, path: Union[Path, str]) -> Path:
|
def convert_to_path(self, path: Path | str) -> Path:
|
||||||
"""Convert a path to a Path object."""
|
"""Convert a path to a Path object."""
|
||||||
return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
|
return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
|
||||||
|
|
||||||
def _import_dependencies(self):
|
def _import_dependencies(self):
|
||||||
"""Dynamically import dependencies."""
|
"""Dynamically import dependencies."""
|
||||||
try:
|
try:
|
||||||
import pandas as pd
|
import pandas as pd # type: ignore[import-untyped,import-not-found]
|
||||||
|
|
||||||
return pd
|
return pd
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
missing_package = str(e).split()[-1]
|
missing_package = str(e).split()[-1]
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
|
f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
|
||||||
)
|
) from e
|
||||||
|
|
||||||
def add(self) -> None:
|
def add(self) -> None:
|
||||||
"""
|
"""
|
||||||
@@ -161,7 +159,7 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
|
|||||||
self.chunks.extend(new_chunks)
|
self.chunks.extend(new_chunks)
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def _chunk_text(self, text: str) -> List[str]:
|
def _chunk_text(self, text: str) -> list[str]:
|
||||||
"""Utility method to split text into chunks."""
|
"""Utility method to split text into chunks."""
|
||||||
return [
|
return [
|
||||||
text[i : i + self.chunk_size]
|
text[i : i + self.chunk_size]
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List
|
from typing import Any
|
||||||
|
|
||||||
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||||
|
|
||||||
@@ -8,9 +8,9 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
|
|||||||
class JSONKnowledgeSource(BaseFileKnowledgeSource):
|
class JSONKnowledgeSource(BaseFileKnowledgeSource):
|
||||||
"""A knowledge source that stores and queries JSON file content using embeddings."""
|
"""A knowledge source that stores and queries JSON file content using embeddings."""
|
||||||
|
|
||||||
def load_content(self) -> Dict[Path, str]:
|
def load_content(self) -> dict[Path, str]:
|
||||||
"""Load and preprocess JSON file content."""
|
"""Load and preprocess JSON file content."""
|
||||||
content: Dict[Path, str] = {}
|
content: dict[Path, str] = {}
|
||||||
for path in self.safe_file_paths:
|
for path in self.safe_file_paths:
|
||||||
path = self.convert_to_path(path)
|
path = self.convert_to_path(path)
|
||||||
with open(path, "r", encoding="utf-8") as json_file:
|
with open(path, "r", encoding="utf-8") as json_file:
|
||||||
@@ -29,7 +29,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
|
|||||||
for item in data:
|
for item in data:
|
||||||
text += f"{indent}- {self._json_to_text(item, level + 1)}\n"
|
text += f"{indent}- {self._json_to_text(item, level + 1)}\n"
|
||||||
else:
|
else:
|
||||||
text += f"{str(data)}"
|
text += f"{data!s}"
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def add(self) -> None:
|
def add(self) -> None:
|
||||||
@@ -44,7 +44,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
|
|||||||
self.chunks.extend(new_chunks)
|
self.chunks.extend(new_chunks)
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def _chunk_text(self, text: str) -> List[str]:
|
def _chunk_text(self, text: str) -> list[str]:
|
||||||
"""Utility method to split text into chunks."""
|
"""Utility method to split text into chunks."""
|
||||||
return [
|
return [
|
||||||
text[i : i + self.chunk_size]
|
text[i : i + self.chunk_size]
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List
|
|
||||||
|
|
||||||
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||||
|
|
||||||
@@ -7,7 +6,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
|
|||||||
class PDFKnowledgeSource(BaseFileKnowledgeSource):
|
class PDFKnowledgeSource(BaseFileKnowledgeSource):
|
||||||
"""A knowledge source that stores and queries PDF file content using embeddings."""
|
"""A knowledge source that stores and queries PDF file content using embeddings."""
|
||||||
|
|
||||||
def load_content(self) -> Dict[Path, str]:
|
def load_content(self) -> dict[Path, str]:
|
||||||
"""Load and preprocess PDF file content."""
|
"""Load and preprocess PDF file content."""
|
||||||
pdfplumber = self._import_pdfplumber()
|
pdfplumber = self._import_pdfplumber()
|
||||||
|
|
||||||
@@ -30,22 +29,22 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
|
|||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
|
||||||
return pdfplumber
|
return pdfplumber
|
||||||
except ImportError:
|
except ImportError as e:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"pdfplumber is not installed. Please install it with: pip install pdfplumber"
|
"pdfplumber is not installed. Please install it with: pip install pdfplumber"
|
||||||
)
|
) from e
|
||||||
|
|
||||||
def add(self) -> None:
|
def add(self) -> None:
|
||||||
"""
|
"""
|
||||||
Add PDF file content to the knowledge source, chunk it, compute embeddings,
|
Add PDF file content to the knowledge source, chunk it, compute embeddings,
|
||||||
and save the embeddings.
|
and save the embeddings.
|
||||||
"""
|
"""
|
||||||
for _, text in self.content.items():
|
for text in self.content.values():
|
||||||
new_chunks = self._chunk_text(text)
|
new_chunks = self._chunk_text(text)
|
||||||
self.chunks.extend(new_chunks)
|
self.chunks.extend(new_chunks)
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def _chunk_text(self, text: str) -> List[str]:
|
def _chunk_text(self, text: str) -> list[str]:
|
||||||
"""Utility method to split text into chunks."""
|
"""Utility method to split text into chunks."""
|
||||||
return [
|
return [
|
||||||
text[i : i + self.chunk_size]
|
text[i : i + self.chunk_size]
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
|
|
||||||
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
|
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
|
||||||
@@ -9,7 +7,7 @@ class StringKnowledgeSource(BaseKnowledgeSource):
|
|||||||
"""A knowledge source that stores and queries plain text content using embeddings."""
|
"""A knowledge source that stores and queries plain text content using embeddings."""
|
||||||
|
|
||||||
content: str = Field(...)
|
content: str = Field(...)
|
||||||
collection_name: Optional[str] = Field(default=None)
|
collection_name: str | None = Field(default=None)
|
||||||
|
|
||||||
def model_post_init(self, _):
|
def model_post_init(self, _):
|
||||||
"""Post-initialization method to validate content."""
|
"""Post-initialization method to validate content."""
|
||||||
@@ -26,7 +24,7 @@ class StringKnowledgeSource(BaseKnowledgeSource):
|
|||||||
self.chunks.extend(new_chunks)
|
self.chunks.extend(new_chunks)
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def _chunk_text(self, text: str) -> List[str]:
|
def _chunk_text(self, text: str) -> list[str]:
|
||||||
"""Utility method to split text into chunks."""
|
"""Utility method to split text into chunks."""
|
||||||
return [
|
return [
|
||||||
text[i : i + self.chunk_size]
|
text[i : i + self.chunk_size]
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List
|
|
||||||
|
|
||||||
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||||
|
|
||||||
@@ -7,7 +6,7 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
|
|||||||
class TextFileKnowledgeSource(BaseFileKnowledgeSource):
|
class TextFileKnowledgeSource(BaseFileKnowledgeSource):
|
||||||
"""A knowledge source that stores and queries text file content using embeddings."""
|
"""A knowledge source that stores and queries text file content using embeddings."""
|
||||||
|
|
||||||
def load_content(self) -> Dict[Path, str]:
|
def load_content(self) -> dict[Path, str]:
|
||||||
"""Load and preprocess text file content."""
|
"""Load and preprocess text file content."""
|
||||||
content = {}
|
content = {}
|
||||||
for path in self.safe_file_paths:
|
for path in self.safe_file_paths:
|
||||||
@@ -21,12 +20,12 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
|
|||||||
Add text file content to the knowledge source, chunk it, compute embeddings,
|
Add text file content to the knowledge source, chunk it, compute embeddings,
|
||||||
and save the embeddings.
|
and save the embeddings.
|
||||||
"""
|
"""
|
||||||
for _, text in self.content.items():
|
for text in self.content.values():
|
||||||
new_chunks = self._chunk_text(text)
|
new_chunks = self._chunk_text(text)
|
||||||
self.chunks.extend(new_chunks)
|
self.chunks.extend(new_chunks)
|
||||||
self._save_documents()
|
self._save_documents()
|
||||||
|
|
||||||
def _chunk_text(self, text: str) -> List[str]:
|
def _chunk_text(self, text: str) -> list[str]:
|
||||||
"""Utility method to split text into chunks."""
|
"""Utility method to split text into chunks."""
|
||||||
return [
|
return [
|
||||||
text[i : i + self.chunk_size]
|
text[i : i + self.chunk_size]
|
||||||
|
|||||||
Reference in New Issue
Block a user