From 1f8ee15753de7d0d17066bfcc8cb7ae13317def0 Mon Sep 17 00:00:00 2001 From: Lorenze Jay <63378463+lorenzejay@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:27:48 -0800 Subject: [PATCH] Knowledge project directory standard (#1691) * Knowledge project directory standard * fixed types * comment fix * made base file knowledge source an abstract class * cleaner validator on model_post_init * fix type checker * cleaner refactor * better template --- docs/concepts/knowledge.mdx | 29 ++++++++-- src/crewai/cli/create_crew.py | 14 ++++- src/crewai/cli/templates/crew/crew.py | 13 ++++- .../crew/knowledge/user_preference.txt | 4 ++ .../source/base_file_knowledge_source.py | 54 +++++++++++++++---- .../knowledge/source/csv_knowledge_source.py | 22 ++++---- .../source/excel_knowledge_source.py | 16 +++--- .../knowledge/source/json_knowledge_source.py | 6 +-- .../knowledge/source/pdf_knowledge_source.py | 5 +- .../source/text_file_knowledge_source.py | 9 ++-- src/crewai/utilities/constants.py | 1 + 11 files changed, 123 insertions(+), 50 deletions(-) create mode 100644 src/crewai/cli/templates/crew/knowledge/user_preference.txt diff --git a/docs/concepts/knowledge.mdx b/docs/concepts/knowledge.mdx index 69fa4e644..a00b2c2f0 100644 --- a/docs/concepts/knowledge.mdx +++ b/docs/concepts/knowledge.mdx @@ -156,14 +156,35 @@ crew = Crew( agents=[agent], tasks=[task], knowledge_sources=[source], - embedder_config={ - "model": "BAAI/bge-small-en-v1.5", - "normalize": True, - "max_length": 512 + embedder={ + "provider": "ollama", + "config": {"model": "nomic-embed-text:latest"}, } ) ``` +### Referencing Sources + +You can reference knowledge sources by their collection name or metadata. + +* Add a directory called `knowledge` to your crew project. +* File paths in knowledge can be referenced relative to the `knowledge` directory. + +Example: +A file inside the `knowledge` directory called `example.txt` can be referenced as `example.txt`. 
+ +```python +source = TextFileKnowledgeSource( + file_path="example.txt", # or /example.txt + collection_name="example" +) +crew = Crew( + agents=[agent], + tasks=[task], + knowledge_sources=[source], +) +``` + ## Best Practices diff --git a/src/crewai/cli/create_crew.py b/src/crewai/cli/create_crew.py index 06440d74e..c658b0de1 100644 --- a/src/crewai/cli/create_crew.py +++ b/src/crewai/cli/create_crew.py @@ -39,6 +39,7 @@ def create_folder_structure(name, parent_folder=None): folder_path.mkdir(parents=True) (folder_path / "tests").mkdir(exist_ok=True) + (folder_path / "knowledge").mkdir(exist_ok=True) if not parent_folder: (folder_path / "src" / folder_name).mkdir(parents=True) (folder_path / "src" / folder_name / "tools").mkdir(parents=True) @@ -52,7 +53,14 @@ def copy_template_files(folder_path, name, class_name, parent_folder): templates_dir = package_dir / "templates" / "crew" root_template_files = ( - [".gitignore", "pyproject.toml", "README.md"] if not parent_folder else [] + [ + ".gitignore", + "pyproject.toml", + "README.md", + "knowledge/user_preference.txt", + ] + if not parent_folder + else [] ) tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"] config_template_files = ["config/agents.yaml", "config/tasks.yaml"] @@ -168,7 +176,9 @@ def create_crew(name, provider=None, skip_provider=False, parent_folder=None): templates_dir = package_dir / "templates" / "crew" root_template_files = ( - [".gitignore", "pyproject.toml", "README.md"] if not parent_folder else [] + [".gitignore", "pyproject.toml", "README.md", "knowledge/user_preference.txt"] + if not parent_folder + else [] ) tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"] config_template_files = ["config/agents.yaml", "config/tasks.yaml"] diff --git a/src/crewai/cli/templates/crew/crew.py b/src/crewai/cli/templates/crew/crew.py index 6f8e66c4a..0a8c7481a 100644 --- a/src/crewai/cli/templates/crew/crew.py +++ b/src/crewai/cli/templates/crew/crew.py @@ -1,8 +1,9 @@ 
from crewai import Agent, Crew, Process, Task from crewai.project import CrewBase, agent, crew, task, before_kickoff, after_kickoff - # Uncomment the following line to use an example of a custom tool # from {{folder_name}}.tools.custom_tool import MyCustomTool +# Uncomment the following line to use an example of a knowledge source +# from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource # Check our tools documentations for more information on how to use them # from crewai_tools import SerperDevTool @@ -57,10 +58,20 @@ class {{crew_name}}(): @crew def crew(self) -> Crew: """Creates the {{crew_name}} crew""" + # You can add knowledge sources here + # knowledge_path = "user_preference.txt" + # sources = [ + # TextFileKnowledgeSource( + # file_path="user_preference.txt", + # metadata={"preference": "personal"} + # ), + # ] + return Crew( agents=self.agents, # Automatically created by the @agent decorator tasks=self.tasks, # Automatically created by the @task decorator process=Process.sequential, verbose=True, # process=Process.hierarchical, # In case you wanna use that instead https://docs.crewai.com/how-to/Hierarchical/ + # knowledge_sources=sources, # In the case you want to add knowledge sources ) diff --git a/src/crewai/cli/templates/crew/knowledge/user_preference.txt b/src/crewai/cli/templates/crew/knowledge/user_preference.txt new file mode 100644 index 000000000..dd63a17bf --- /dev/null +++ b/src/crewai/cli/templates/crew/knowledge/user_preference.txt @@ -0,0 +1,4 @@ +User name is John Doe. +User is an AI Engineer. +User is interested in AI Agents. +User is based in San Francisco, California. 
diff --git a/src/crewai/knowledge/source/base_file_knowledge_source.py b/src/crewai/knowledge/source/base_file_knowledge_source.py index b6e346534..9cb9ec2a2 100644 --- a/src/crewai/knowledge/source/base_file_knowledge_source.py +++ b/src/crewai/knowledge/source/base_file_knowledge_source.py @@ -1,36 +1,72 @@ +from abc import ABC, abstractmethod from pathlib import Path -from typing import Union, List +from typing import Union, List, Dict, Any from pydantic import Field from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource -from typing import Dict, Any +from crewai.utilities.logger import Logger from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage +from crewai.utilities.constants import KNOWLEDGE_DIRECTORY -class BaseFileKnowledgeSource(BaseKnowledgeSource): +class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC): """Base class for knowledge sources that load content from files.""" - file_path: Union[Path, List[Path]] = Field(...) + _logger: Logger = Logger(verbose=True) + file_path: Union[Path, List[Path], str, List[str]] = Field( + ..., description="The path to the file" + ) content: Dict[Path, str] = Field(init=False, default_factory=dict) storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage) + safe_file_paths: List[Path] = Field(default_factory=list) def model_post_init(self, _): """Post-initialization method to load content.""" + self.safe_file_paths = self._process_file_paths() + self.validate_paths() self.content = self.load_content() + @abstractmethod def load_content(self) -> Dict[Path, str]: - """Load and preprocess file content. Should be overridden by subclasses.""" - paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path + """Load and preprocess file content. Should be overridden by subclasses. 
Assume that the file path is relative to the project root in the knowledge directory.""" + pass - for path in paths: + def validate_paths(self): + """Validate the paths.""" + for path in self.safe_file_paths: if not path.exists(): + self._logger.log( + "error", + f"File not found: {path}. Try adding sources to the knowledge directory. If its inside the knowledge directory, use the relative path.", + color="red", + ) raise FileNotFoundError(f"File not found: {path}") if not path.is_file(): - raise ValueError(f"Path is not a file: {path}") - return {} + self._logger.log( + "error", + f"Path is not a file: {path}", + color="red", + ) def save_documents(self, metadata: Dict[str, Any]): """Save the documents to the storage.""" chunk_metadatas = [metadata.copy() for _ in self.chunks] self.storage.save(self.chunks, chunk_metadatas) + + def convert_to_path(self, path: Union[Path, str]) -> Path: + """Convert a path to a Path object.""" + return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path + + def _process_file_paths(self) -> List[Path]: + """Convert file_path to a list of Path objects.""" + paths = ( + [self.file_path] + if isinstance(self.file_path, (str, Path)) + else self.file_path + ) + + if not isinstance(paths, list): + raise ValueError("file_path must be a Path, str, or a list of these types") + + return [self.convert_to_path(path) for path in paths] diff --git a/src/crewai/knowledge/source/csv_knowledge_source.py b/src/crewai/knowledge/source/csv_knowledge_source.py index 0946104a4..117cbe32f 100644 --- a/src/crewai/knowledge/source/csv_knowledge_source.py +++ b/src/crewai/knowledge/source/csv_knowledge_source.py @@ -10,19 +10,15 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource): def load_content(self) -> Dict[Path, str]: """Load and preprocess CSV file content.""" - super().load_content() # Validate the file path - - file_path = ( - self.file_path[0] if isinstance(self.file_path, list) else self.file_path - ) - file_path = 
Path(file_path) if isinstance(file_path, str) else file_path - - with open(file_path, "r", encoding="utf-8") as csvfile: - reader = csv.reader(csvfile) - content = "" - for row in reader: - content += " ".join(row) + "\n" - return {file_path: content} + content_dict = {} + for file_path in self.safe_file_paths: + with open(file_path, "r", encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + content = "" + for row in reader: + content += " ".join(row) + "\n" + content_dict[file_path] = content + return content_dict def add(self) -> None: """ diff --git a/src/crewai/knowledge/source/excel_knowledge_source.py b/src/crewai/knowledge/source/excel_knowledge_source.py index 3b5c71514..5484e59d5 100644 --- a/src/crewai/knowledge/source/excel_knowledge_source.py +++ b/src/crewai/knowledge/source/excel_knowledge_source.py @@ -8,17 +8,15 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource): def load_content(self) -> Dict[Path, str]: """Load and preprocess Excel file content.""" - super().load_content() # Validate the file path pd = self._import_dependencies() - if isinstance(self.file_path, list): - file_path = self.file_path[0] - else: - file_path = self.file_path - - df = pd.read_excel(file_path) - content = df.to_csv(index=False) - return {file_path: content} + content_dict = {} + for file_path in self.safe_file_paths: + file_path = self.convert_to_path(file_path) + df = pd.read_excel(file_path) + content = df.to_csv(index=False) + content_dict[file_path] = content + return content_dict def _import_dependencies(self): """Dynamically import dependencies.""" diff --git a/src/crewai/knowledge/source/json_knowledge_source.py b/src/crewai/knowledge/source/json_knowledge_source.py index 490423a00..ed6eec86d 100644 --- a/src/crewai/knowledge/source/json_knowledge_source.py +++ b/src/crewai/knowledge/source/json_knowledge_source.py @@ -10,11 +10,9 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource): def load_content(self) -> Dict[Path, str]: """Load and preprocess 
JSON file content.""" - super().load_content() # Validate the file path - paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path - content: Dict[Path, str] = {} - for path in paths: + for path in self.safe_file_paths: + path = self.convert_to_path(path) with open(path, "r", encoding="utf-8") as json_file: data = json.load(json_file) content[path] = self._json_to_text(data) diff --git a/src/crewai/knowledge/source/pdf_knowledge_source.py b/src/crewai/knowledge/source/pdf_knowledge_source.py index 623ba30a2..e6733d78b 100644 --- a/src/crewai/knowledge/source/pdf_knowledge_source.py +++ b/src/crewai/knowledge/source/pdf_knowledge_source.py @@ -9,14 +9,13 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource): def load_content(self) -> Dict[Path, str]: """Load and preprocess PDF file content.""" - super().load_content() # Validate the file paths pdfplumber = self._import_pdfplumber() - paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path content = {} - for path in paths: + for path in self.safe_file_paths: text = "" + path = self.convert_to_path(path) with pdfplumber.open(path) as pdf: for page in pdf.pages: page_text = page.extract_text() diff --git a/src/crewai/knowledge/source/text_file_knowledge_source.py b/src/crewai/knowledge/source/text_file_knowledge_source.py index 640db4ef9..1ceb02843 100644 --- a/src/crewai/knowledge/source/text_file_knowledge_source.py +++ b/src/crewai/knowledge/source/text_file_knowledge_source.py @@ -9,12 +9,11 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource): def load_content(self) -> Dict[Path, str]: """Load and preprocess text file content.""" - super().load_content() - paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path content = {} - for path in paths: - with path.open("r", encoding="utf-8") as f: - content[path] = f.read() # type: ignore + for path in self.safe_file_paths: + path = self.convert_to_path(path) + with open(path, "r", 
encoding="utf-8") as f: + content[path] = f.read() return content def add(self) -> None: diff --git a/src/crewai/utilities/constants.py b/src/crewai/utilities/constants.py index 59f789913..97fadda48 100644 --- a/src/crewai/utilities/constants.py +++ b/src/crewai/utilities/constants.py @@ -1,3 +1,4 @@ TRAINING_DATA_FILE = "training_data.pkl" TRAINED_AGENTS_DATA_FILE = "trained_agents_data.pkl" DEFAULT_SCORE_THRESHOLD = 0.35 +KNOWLEDGE_DIRECTORY = "knowledge"