Knowledge project directory standard (#1691)

* Knowledge project directory standard
* fixed types
* comment fix
* made base file knowledge source an abstract class
* cleaner validator on model_post_init
* fix type checker
* cleaner refactor
* better template
2026-01-08 15:48:29 +00:00 · 2024-12-03 12:27:48 -08:00
parent 04cbf10a78
commit 1f8ee15753
11 changed files with 123 additions and 50 deletions
--- a/docs/concepts/knowledge.mdx
+++ b/docs/concepts/knowledge.mdx
@@ -156,14 +156,35 @@ crew = Crew(
    agents=[agent],
    tasks=[task],
    knowledge_sources=[source],
-    embedder_config={
-        "model": "BAAI/bge-small-en-v1.5",
-        "normalize": True,
-        "max_length": 512
+    embedder={
+        "provider": "ollama",
+        "config": {"model": "nomic-embed-text:latest"},
    }
 )
 ```

+### Referencing Sources
+
+You can reference knowledge sources by their collection name or metadata.
+
+* Add a directory called `knowledge` to the root of your crew project.
+* Files inside it can then be referenced by their path relative to the `knowledge` directory.
+
+Example:
+A file inside the `knowledge` directory called `example.txt` can be referenced as `example.txt`.
+
+```python
+source = TextFileKnowledgeSource(
+    file_path="example.txt", # or /example.txt
+    collection_name="example"
+)
+crew = Crew(
+    agents=[agent],
+    tasks=[task],
+    knowledge_sources=[source],
+)
+```
+
 ## Best Practices

 <AccordionGroup>
--- a/src/crewai/cli/create_crew.py
+++ b/src/crewai/cli/create_crew.py
@@ -39,6 +39,7 @@ def create_folder_structure(name, parent_folder=None):

    folder_path.mkdir(parents=True)
    (folder_path / "tests").mkdir(exist_ok=True)
+    (folder_path / "knowledge").mkdir(exist_ok=True)
    if not parent_folder:
        (folder_path / "src" / folder_name).mkdir(parents=True)
        (folder_path / "src" / folder_name / "tools").mkdir(parents=True)
@@ -52,7 +53,14 @@ def copy_template_files(folder_path, name, class_name, parent_folder):
    templates_dir = package_dir / "templates" / "crew"

    root_template_files = (
-        [".gitignore", "pyproject.toml", "README.md"] if not parent_folder else []
+        [
+            ".gitignore",
+            "pyproject.toml",
+            "README.md",
+            "knowledge/user_preference.txt",
+        ]
+        if not parent_folder
+        else []
    )
    tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"]
    config_template_files = ["config/agents.yaml", "config/tasks.yaml"]
@@ -168,7 +176,9 @@ def create_crew(name, provider=None, skip_provider=False, parent_folder=None):
    templates_dir = package_dir / "templates" / "crew"

    root_template_files = (
-        [".gitignore", "pyproject.toml", "README.md"] if not parent_folder else []
+        [".gitignore", "pyproject.toml", "README.md", "knowledge/user_preference.txt"]
+        if not parent_folder
+        else []
    )
    tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"]
    config_template_files = ["config/agents.yaml", "config/tasks.yaml"]
--- a/src/crewai/cli/templates/crew/crew.py
+++ b/src/crewai/cli/templates/crew/crew.py
@@ -1,8 +1,9 @@
 from crewai import Agent, Crew, Process, Task
 from crewai.project import CrewBase, agent, crew, task, before_kickoff, after_kickoff
-
 # Uncomment the following line to use an example of a custom tool
 # from {{folder_name}}.tools.custom_tool import MyCustomTool
+# Uncomment the following line to use an example of a knowledge source
+# from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource

 # Check our tools documentations for more information on how to use them
 # from crewai_tools import SerperDevTool
@@ -57,10 +58,20 @@ class {{crew_name}}():
 	@crew
 	def crew(self) -> Crew:
 		"""Creates the {{crew_name}} crew"""
+		# You can add knowledge sources here
+		# knowledge_path = "user_preference.txt"
+		# sources = [
+		# 	TextFileKnowledgeSource(
+		# 		file_path="user_preference.txt", # resolved relative to the `knowledge` directory
+		# 		metadata={"preference": "personal"}
+		# 	),
+		# ]
+
 		return Crew(
 			agents=self.agents, # Automatically created by the @agent decorator
 			tasks=self.tasks, # Automatically created by the @task decorator
 			process=Process.sequential,
 			verbose=True,
 			# process=Process.hierarchical, # In case you wanna use that instead https://docs.crewai.com/how-to/Hierarchical/
+			# knowledge_sources=sources, # In the case you want to add knowledge sources
 		)
--- a/src/crewai/cli/templates/crew/knowledge/user_preference.txt
+++ b/src/crewai/cli/templates/crew/knowledge/user_preference.txt
@@ -0,0 +1,4 @@
+User name is John Doe.
+User is an AI Engineer.
+User is interested in AI Agents.
+User is based in San Francisco, California.
--- a/src/crewai/knowledge/source/base_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_file_knowledge_source.py
@@ -1,36 +1,72 @@
+from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Union, List
+from typing import Union, List, Dict, Any

 from pydantic import Field

 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
-from typing import Dict, Any
+from crewai.utilities.logger import Logger
 from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
+from crewai.utilities.constants import KNOWLEDGE_DIRECTORY


-class BaseFileKnowledgeSource(BaseKnowledgeSource):
+class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
    """Base class for knowledge sources that load content from files."""

-    file_path: Union[Path, List[Path]] = Field(...)
+    _logger: Logger = Logger(verbose=True)
+    file_path: Union[Path, List[Path], str, List[str]] = Field(
+        ..., description="The path to the file"
+    )
    content: Dict[Path, str] = Field(init=False, default_factory=dict)
    storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
+    safe_file_paths: List[Path] = Field(default_factory=list)

    def model_post_init(self, _):
        """Post-initialization method to load content."""
+        self.safe_file_paths = self._process_file_paths()
+        self.validate_paths()
        self.content = self.load_content()

+    @abstractmethod
    def load_content(self) -> Dict[Path, str]:
-        """Load and preprocess file content. Should be overridden by subclasses."""
-        paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
+        """Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
+        pass

-        for path in paths:
+    def validate_paths(self):
+        """Validate the paths."""
+        for path in self.safe_file_paths:
            if not path.exists():
+                self._logger.log(
+                    "error",
+                    f"File not found: {path}. Try adding sources to the knowledge directory. If its inside the knowledge directory, use the relative path.",
+                    color="red",
+                )
                raise FileNotFoundError(f"File not found: {path}")
            if not path.is_file():
-                raise ValueError(f"Path is not a file: {path}")
-        return {}
+                self._logger.log(
+                    "error",
+                    f"Path is not a file: {path}",
+                    color="red",
+                )

    def save_documents(self, metadata: Dict[str, Any]):
        """Save the documents to the storage."""
        chunk_metadatas = [metadata.copy() for _ in self.chunks]
        self.storage.save(self.chunks, chunk_metadatas)
+
+    def convert_to_path(self, path: Union[Path, str]) -> Path:
+        """Convert a path to a Path object."""
+        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
+
+    def _process_file_paths(self) -> List[Path]:
+        """Convert file_path to a list of Path objects."""
+        paths = (
+            [self.file_path]
+            if isinstance(self.file_path, (str, Path))
+            else self.file_path
+        )
+
+        if not isinstance(paths, list):
+            raise ValueError("file_path must be a Path, str, or a list of these types")
+
+        return [self.convert_to_path(path) for path in paths]
--- a/src/crewai/knowledge/source/csv_knowledge_source.py
+++ b/src/crewai/knowledge/source/csv_knowledge_source.py
@@ -10,19 +10,15 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):

    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess CSV file content."""
-        super().load_content()  # Validate the file path
-
-        file_path = (
-            self.file_path[0] if isinstance(self.file_path, list) else self.file_path
-        )
-        file_path = Path(file_path) if isinstance(file_path, str) else file_path
-
-        with open(file_path, "r", encoding="utf-8") as csvfile:
-            reader = csv.reader(csvfile)
-            content = ""
-            for row in reader:
-                content += " ".join(row) + "\n"
-        return {file_path: content}
+        content_dict = {}
+        for file_path in self.safe_file_paths:
+            with open(file_path, "r", encoding="utf-8") as csvfile:
+                reader = csv.reader(csvfile)
+                content = ""
+                for row in reader:
+                    content += " ".join(row) + "\n"
+                content_dict[file_path] = content
+        return content_dict

    def add(self) -> None:
        """
--- a/src/crewai/knowledge/source/excel_knowledge_source.py
+++ b/src/crewai/knowledge/source/excel_knowledge_source.py
@@ -8,17 +8,15 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):

    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess Excel file content."""
-        super().load_content()  # Validate the file path
        pd = self._import_dependencies()

-        if isinstance(self.file_path, list):
-            file_path = self.file_path[0]
-        else:
-            file_path = self.file_path
-
-        df = pd.read_excel(file_path)
-        content = df.to_csv(index=False)
-        return {file_path: content}
+        content_dict = {}
+        for file_path in self.safe_file_paths:
+            file_path = self.convert_to_path(file_path)
+            df = pd.read_excel(file_path)
+            content = df.to_csv(index=False)
+            content_dict[file_path] = content
+        return content_dict

    def _import_dependencies(self):
        """Dynamically import dependencies."""
--- a/src/crewai/knowledge/source/json_knowledge_source.py
+++ b/src/crewai/knowledge/source/json_knowledge_source.py
@@ -10,11 +10,9 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):

    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess JSON file content."""
-        super().load_content()  # Validate the file path
-        paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
-
        content: Dict[Path, str] = {}
-        for path in paths:
+        for path in self.safe_file_paths:
+            path = self.convert_to_path(path)
            with open(path, "r", encoding="utf-8") as json_file:
                data = json.load(json_file)
            content[path] = self._json_to_text(data)
--- a/src/crewai/knowledge/source/pdf_knowledge_source.py
+++ b/src/crewai/knowledge/source/pdf_knowledge_source.py
@@ -9,14 +9,13 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):

    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess PDF file content."""
-        super().load_content()  # Validate the file paths
        pdfplumber = self._import_pdfplumber()

-        paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
        content = {}

-        for path in paths:
+        for path in self.safe_file_paths:
            text = ""
+            path = self.convert_to_path(path)
            with pdfplumber.open(path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
--- a/src/crewai/knowledge/source/text_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/text_file_knowledge_source.py
@@ -9,12 +9,11 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):

    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess text file content."""
-        super().load_content()
-        paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
        content = {}
-        for path in paths:
-            with path.open("r", encoding="utf-8") as f:
-                content[path] = f.read()  # type: ignore
+        for path in self.safe_file_paths:
+            path = self.convert_to_path(path)
+            with open(path, "r", encoding="utf-8") as f:
+                content[path] = f.read()
        return content

    def add(self) -> None:
--- a/src/crewai/utilities/constants.py
+++ b/src/crewai/utilities/constants.py
@@ -1,3 +1,4 @@
 TRAINING_DATA_FILE = "training_data.pkl"
 TRAINED_AGENTS_DATA_FILE = "trained_agents_data.pkl"
 DEFAULT_SCORE_THRESHOLD = 0.35
+KNOWLEDGE_DIRECTORY = "knowledge"