Knowledge project directory standard (#1691)

* Knowledge project directory standard

* fixed types

* comment fix

* made base file knowledge source an abstract class

* cleaner validator on model_post_init

* fix type checker

* cleaner refactor

* better template
This commit is contained in:
Lorenze Jay
2024-12-03 12:27:48 -08:00
committed by GitHub
parent ed3487aa22
commit 1af95f5146
11 changed files with 123 additions and 50 deletions

View File

@@ -156,14 +156,35 @@ crew = Crew(
agents=[agent], agents=[agent],
tasks=[task], tasks=[task],
knowledge_sources=[source], knowledge_sources=[source],
embedder_config={ embedder={
"model": "BAAI/bge-small-en-v1.5", "provider": "ollama",
"normalize": True, "config": {"model": "nomic-embed-text:latest"},
"max_length": 512
} }
) )
``` ```
### Referencing Sources
You can reference knowledge sources by their collection name or metadata.
* Add a directory called `knowledge` to the root of your crew project.
* File paths passed as strings to a knowledge source are resolved relative to the `knowledge` directory.
Example:
A file inside the `knowledge` directory called `example.txt` can be referenced as `example.txt`.
```python
source = TextFileKnowledgeSource(
file_path="example.txt", # or /example.txt
collection_name="example"
)
crew = Crew(
agents=[agent],
tasks=[task],
knowledge_sources=[source],
)
```
## Best Practices ## Best Practices
<AccordionGroup> <AccordionGroup>

View File

@@ -39,6 +39,7 @@ def create_folder_structure(name, parent_folder=None):
folder_path.mkdir(parents=True) folder_path.mkdir(parents=True)
(folder_path / "tests").mkdir(exist_ok=True) (folder_path / "tests").mkdir(exist_ok=True)
(folder_path / "knowledge").mkdir(exist_ok=True)
if not parent_folder: if not parent_folder:
(folder_path / "src" / folder_name).mkdir(parents=True) (folder_path / "src" / folder_name).mkdir(parents=True)
(folder_path / "src" / folder_name / "tools").mkdir(parents=True) (folder_path / "src" / folder_name / "tools").mkdir(parents=True)
@@ -52,7 +53,14 @@ def copy_template_files(folder_path, name, class_name, parent_folder):
templates_dir = package_dir / "templates" / "crew" templates_dir = package_dir / "templates" / "crew"
root_template_files = ( root_template_files = (
[".gitignore", "pyproject.toml", "README.md"] if not parent_folder else [] [
".gitignore",
"pyproject.toml",
"README.md",
"knowledge/user_preference.txt",
]
if not parent_folder
else []
) )
tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"] tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"]
config_template_files = ["config/agents.yaml", "config/tasks.yaml"] config_template_files = ["config/agents.yaml", "config/tasks.yaml"]
@@ -168,7 +176,9 @@ def create_crew(name, provider=None, skip_provider=False, parent_folder=None):
templates_dir = package_dir / "templates" / "crew" templates_dir = package_dir / "templates" / "crew"
root_template_files = ( root_template_files = (
[".gitignore", "pyproject.toml", "README.md"] if not parent_folder else [] [".gitignore", "pyproject.toml", "README.md", "knowledge/user_preference.txt"]
if not parent_folder
else []
) )
tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"] tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"]
config_template_files = ["config/agents.yaml", "config/tasks.yaml"] config_template_files = ["config/agents.yaml", "config/tasks.yaml"]

View File

@@ -1,8 +1,9 @@
from crewai import Agent, Crew, Process, Task from crewai import Agent, Crew, Process, Task
from crewai.project import CrewBase, agent, crew, task, before_kickoff, after_kickoff from crewai.project import CrewBase, agent, crew, task, before_kickoff, after_kickoff
# Uncomment the following line to use an example of a custom tool # Uncomment the following line to use an example of a custom tool
# from {{folder_name}}.tools.custom_tool import MyCustomTool # from {{folder_name}}.tools.custom_tool import MyCustomTool
# Uncomment the following line to use an example of a knowledge source
# from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
# Check our tools documentations for more information on how to use them # Check our tools documentations for more information on how to use them
# from crewai_tools import SerperDevTool # from crewai_tools import SerperDevTool
@@ -57,10 +58,20 @@ class {{crew_name}}():
@crew @crew
def crew(self) -> Crew: def crew(self) -> Crew:
"""Creates the {{crew_name}} crew""" """Creates the {{crew_name}} crew"""
# You can add knowledge sources here
# knowledge_path = "user_preference.txt"
# sources = [
# TextFileKnowledgeSource(
# file_path=knowledge_path,  # string paths are resolved relative to the knowledge directory
# metadata={"preference": "personal"}
# ),
# ]
return Crew( return Crew(
agents=self.agents, # Automatically created by the @agent decorator agents=self.agents, # Automatically created by the @agent decorator
tasks=self.tasks, # Automatically created by the @task decorator tasks=self.tasks, # Automatically created by the @task decorator
process=Process.sequential, process=Process.sequential,
verbose=True, verbose=True,
# process=Process.hierarchical, # In case you wanna use that instead https://docs.crewai.com/how-to/Hierarchical/ # process=Process.hierarchical, # In case you wanna use that instead https://docs.crewai.com/how-to/Hierarchical/
# knowledge_sources=sources, # In the case you want to add knowledge sources
) )

View File

@@ -0,0 +1,4 @@
User name is John Doe.
User is an AI Engineer.
User is interested in AI Agents.
User is based in San Francisco, California.

View File

@@ -1,36 +1,72 @@
from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from typing import Union, List from typing import Union, List, Dict, Any
from pydantic import Field from pydantic import Field
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from typing import Dict, Any from crewai.utilities.logger import Logger
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
class BaseFileKnowledgeSource(BaseKnowledgeSource): class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
"""Base class for knowledge sources that load content from files.""" """Base class for knowledge sources that load content from files."""
file_path: Union[Path, List[Path]] = Field(...) _logger: Logger = Logger(verbose=True)
file_path: Union[Path, List[Path], str, List[str]] = Field(
..., description="The path to the file"
)
content: Dict[Path, str] = Field(init=False, default_factory=dict) content: Dict[Path, str] = Field(init=False, default_factory=dict)
storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage) storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
safe_file_paths: List[Path] = Field(default_factory=list)
def model_post_init(self, _): def model_post_init(self, _):
"""Post-initialization method to load content.""" """Post-initialization method to load content."""
self.safe_file_paths = self._process_file_paths()
self.validate_paths()
self.content = self.load_content() self.content = self.load_content()
@abstractmethod
def load_content(self) -> Dict[Path, str]: def load_content(self) -> Dict[Path, str]:
"""Load and preprocess file content. Should be overridden by subclasses.""" """Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path pass
for path in paths: def validate_paths(self):
"""Validate the paths."""
for path in self.safe_file_paths:
if not path.exists(): if not path.exists():
self._logger.log(
"error",
f"File not found: {path}. Try adding sources to the knowledge directory. If its inside the knowledge directory, use the relative path.",
color="red",
)
raise FileNotFoundError(f"File not found: {path}") raise FileNotFoundError(f"File not found: {path}")
if not path.is_file(): if not path.is_file():
raise ValueError(f"Path is not a file: {path}") self._logger.log(
return {} "error",
f"Path is not a file: {path}",
color="red",
)
def save_documents(self, metadata: Dict[str, Any]): def save_documents(self, metadata: Dict[str, Any]):
"""Save the documents to the storage.""" """Save the documents to the storage."""
chunk_metadatas = [metadata.copy() for _ in self.chunks] chunk_metadatas = [metadata.copy() for _ in self.chunks]
self.storage.save(self.chunks, chunk_metadatas) self.storage.save(self.chunks, chunk_metadatas)
def convert_to_path(self, path: Union[Path, str]) -> Path:
"""Convert a path to a Path object."""
return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
def _process_file_paths(self) -> List[Path]:
"""Convert file_path to a list of Path objects."""
paths = (
[self.file_path]
if isinstance(self.file_path, (str, Path))
else self.file_path
)
if not isinstance(paths, list):
raise ValueError("file_path must be a Path, str, or a list of these types")
return [self.convert_to_path(path) for path in paths]

View File

@@ -10,19 +10,15 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
def load_content(self) -> Dict[Path, str]: def load_content(self) -> Dict[Path, str]:
"""Load and preprocess CSV file content.""" """Load and preprocess CSV file content."""
super().load_content() # Validate the file path content_dict = {}
for file_path in self.safe_file_paths:
file_path = ( with open(file_path, "r", encoding="utf-8") as csvfile:
self.file_path[0] if isinstance(self.file_path, list) else self.file_path reader = csv.reader(csvfile)
) content = ""
file_path = Path(file_path) if isinstance(file_path, str) else file_path for row in reader:
content += " ".join(row) + "\n"
with open(file_path, "r", encoding="utf-8") as csvfile: content_dict[file_path] = content
reader = csv.reader(csvfile) return content_dict
content = ""
for row in reader:
content += " ".join(row) + "\n"
return {file_path: content}
def add(self) -> None: def add(self) -> None:
""" """

View File

@@ -8,17 +8,15 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
def load_content(self) -> Dict[Path, str]: def load_content(self) -> Dict[Path, str]:
"""Load and preprocess Excel file content.""" """Load and preprocess Excel file content."""
super().load_content() # Validate the file path
pd = self._import_dependencies() pd = self._import_dependencies()
if isinstance(self.file_path, list): content_dict = {}
file_path = self.file_path[0] for file_path in self.safe_file_paths:
else: file_path = self.convert_to_path(file_path)
file_path = self.file_path df = pd.read_excel(file_path)
content = df.to_csv(index=False)
df = pd.read_excel(file_path) content_dict[file_path] = content
content = df.to_csv(index=False) return content_dict
return {file_path: content}
def _import_dependencies(self): def _import_dependencies(self):
"""Dynamically import dependencies.""" """Dynamically import dependencies."""

View File

@@ -10,11 +10,9 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
def load_content(self) -> Dict[Path, str]: def load_content(self) -> Dict[Path, str]:
"""Load and preprocess JSON file content.""" """Load and preprocess JSON file content."""
super().load_content() # Validate the file path
paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
content: Dict[Path, str] = {} content: Dict[Path, str] = {}
for path in paths: for path in self.safe_file_paths:
path = self.convert_to_path(path)
with open(path, "r", encoding="utf-8") as json_file: with open(path, "r", encoding="utf-8") as json_file:
data = json.load(json_file) data = json.load(json_file)
content[path] = self._json_to_text(data) content[path] = self._json_to_text(data)

View File

@@ -9,14 +9,13 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
def load_content(self) -> Dict[Path, str]: def load_content(self) -> Dict[Path, str]:
"""Load and preprocess PDF file content.""" """Load and preprocess PDF file content."""
super().load_content() # Validate the file paths
pdfplumber = self._import_pdfplumber() pdfplumber = self._import_pdfplumber()
paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
content = {} content = {}
for path in paths: for path in self.safe_file_paths:
text = "" text = ""
path = self.convert_to_path(path)
with pdfplumber.open(path) as pdf: with pdfplumber.open(path) as pdf:
for page in pdf.pages: for page in pdf.pages:
page_text = page.extract_text() page_text = page.extract_text()

View File

@@ -9,12 +9,11 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
def load_content(self) -> Dict[Path, str]: def load_content(self) -> Dict[Path, str]:
"""Load and preprocess text file content.""" """Load and preprocess text file content."""
super().load_content()
paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
content = {} content = {}
for path in paths: for path in self.safe_file_paths:
with path.open("r", encoding="utf-8") as f: path = self.convert_to_path(path)
content[path] = f.read() # type: ignore with open(path, "r", encoding="utf-8") as f:
content[path] = f.read()
return content return content
def add(self) -> None: def add(self) -> None:

View File

@@ -1,3 +1,4 @@
TRAINING_DATA_FILE = "training_data.pkl" TRAINING_DATA_FILE = "training_data.pkl"
TRAINED_AGENTS_DATA_FILE = "trained_agents_data.pkl" TRAINED_AGENTS_DATA_FILE = "trained_agents_data.pkl"
DEFAULT_SCORE_THRESHOLD = 0.35 DEFAULT_SCORE_THRESHOLD = 0.35
KNOWLEDGE_DIRECTORY = "knowledge"