Knowledge project directory standard (#1691)

* Knowledge project directory standard * fixed types * comment fix * made base file knowledge source an abstract class * cleaner validator on model_post_init * fix type checker * cleaner refactor * better template
2026-01-09 08:08:32 +00:00 · 2024-12-03 12:27:48 -08:00
parent ed3487aa22
commit 1af95f5146
11 changed files with 123 additions and 50 deletions
--- a/docs/concepts/knowledge.mdx
+++ b/docs/concepts/knowledge.mdx
@@ -156,14 +156,35 @@ crew = Crew(
    agents=[agent],
    tasks=[task],
    knowledge_sources=[source],
-    embedder_config={
+    embedder={
-        "model": "BAAI/bge-small-en-v1.5",
+        "provider": "ollama",
-        "normalize": True,
+        "config": {"model": "nomic-embed-text:latest"},
        "max_length": 512
    }
 )
 ```
 ### Referencing Sources
 You can reference knowledge sources by their collection name or metadata.
 * Add a directory called `knowledge` to your crew project.
 * File paths passed as strings to a knowledge source are resolved relative to the `knowledge` directory.
 Example:
 A file inside the `knowledge` directory called `example.txt` can be referenced as `example.txt`.
 ```python
 source = TextFileKnowledgeSource(
    file_path="example.txt", # or /example.txt
    collection_name="example"
 )
 crew = Crew(
    agents=[agent],
    tasks=[task],
    knowledge_sources=[source],
 )
 ```
 ## Best Practices
 <AccordionGroup>
--- a/src/crewai/cli/create_crew.py
+++ b/src/crewai/cli/create_crew.py
@@ -39,6 +39,7 @@ def create_folder_structure(name, parent_folder=None):
    folder_path.mkdir(parents=True)
    (folder_path / "tests").mkdir(exist_ok=True)
    (folder_path / "knowledge").mkdir(exist_ok=True)
    if not parent_folder:
        (folder_path / "src" / folder_name).mkdir(parents=True)
        (folder_path / "src" / folder_name / "tools").mkdir(parents=True)
@@ -52,7 +53,14 @@ def copy_template_files(folder_path, name, class_name, parent_folder):
    templates_dir = package_dir / "templates" / "crew"
    root_template_files = (
-        [".gitignore", "pyproject.toml", "README.md"] if not parent_folder else []
+        [
            ".gitignore",
            "pyproject.toml",
            "README.md",
            "knowledge/user_preference.txt",
        ]
        if not parent_folder
        else []
    )
    tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"]
    config_template_files = ["config/agents.yaml", "config/tasks.yaml"]
@@ -168,7 +176,9 @@ def create_crew(name, provider=None, skip_provider=False, parent_folder=None):
    templates_dir = package_dir / "templates" / "crew"
    root_template_files = (
-        [".gitignore", "pyproject.toml", "README.md"] if not parent_folder else []
+        [".gitignore", "pyproject.toml", "README.md", "knowledge/user_preference.txt"]
        if not parent_folder
        else []
    )
    tools_template_files = ["tools/custom_tool.py", "tools/__init__.py"]
    config_template_files = ["config/agents.yaml", "config/tasks.yaml"]
--- a/src/crewai/cli/templates/crew/crew.py
+++ b/src/crewai/cli/templates/crew/crew.py
@@ -1,8 +1,9 @@
 from crewai import Agent, Crew, Process, Task
 from crewai.project import CrewBase, agent, crew, task, before_kickoff, after_kickoff
 # Uncomment the following line to use an example of a custom tool
 # from {{folder_name}}.tools.custom_tool import MyCustomTool
 # Uncomment the following line to use an example of a knowledge source
 # from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
 # Check our tools documentation for more information on how to use them
 # from crewai_tools import SerperDevTool
@@ -57,10 +58,20 @@ class {{crew_name}}():
 	@crew
 	def crew(self) -> Crew:
 		"""Creates the {{crew_name}} crew"""
 		# You can add knowledge sources here
 		# knowledge_path = "user_preference.txt"
 		# sources = [
 		# 	TextFileKnowledgeSource(
 		# 		file_path="knowledge/user_preference.txt",
 		# 		metadata={"preference": "personal"}
 		# 	),
 		# ]
 		return Crew(
 			agents=self.agents, # Automatically created by the @agent decorator
 			tasks=self.tasks, # Automatically created by the @task decorator
 			process=Process.sequential,
 			verbose=True,
 			# process=Process.hierarchical, # In case you wanna use that instead https://docs.crewai.com/how-to/Hierarchical/
 			# knowledge_sources=sources, # In the case you want to add knowledge sources
 		)
--- a/src/crewai/cli/templates/crew/knowledge/user_preference.txt
+++ b/src/crewai/cli/templates/crew/knowledge/user_preference.txt
@@ -0,0 +1,4 @@
 User name is John Doe.
 User is an AI Engineer.
 User is interested in AI Agents.
 User is based in San Francisco, California.
--- a/src/crewai/knowledge/source/base_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_file_knowledge_source.py
@@ -1,36 +1,72 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Union, List
+from typing import Union, List, Dict, Any
 from pydantic import Field
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
-from typing import Dict, Any
+from crewai.utilities.logger import Logger
 from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
 from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
-class BaseFileKnowledgeSource(BaseKnowledgeSource):
+class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
    """Base class for knowledge sources that load content from files."""
-    file_path: Union[Path, List[Path]] = Field(...)
+    _logger: Logger = Logger(verbose=True)
    file_path: Union[Path, List[Path], str, List[str]] = Field(
        ..., description="The path to the file"
    )
    content: Dict[Path, str] = Field(init=False, default_factory=dict)
    storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
    safe_file_paths: List[Path] = Field(default_factory=list)
    def model_post_init(self, _):
        """Post-initialization method to load content."""
        self.safe_file_paths = self._process_file_paths()
        self.validate_paths()
        self.content = self.load_content()
    @abstractmethod
    def load_content(self) -> Dict[Path, str]:
-        """Load and preprocess file content. Should be overridden by subclasses."""
+        """Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
-        paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
+        pass
-        for path in paths:
+    def validate_paths(self):
        """Validate the paths."""
        for path in self.safe_file_paths:
            if not path.exists():
                self._logger.log(
                    "error",
                    f"File not found: {path}. Try adding sources to the knowledge directory. If its inside the knowledge directory, use the relative path.",
                    color="red",
                )
                raise FileNotFoundError(f"File not found: {path}")
            if not path.is_file():
-                raise ValueError(f"Path is not a file: {path}")
+                self._logger.log(
-        return {}
+                    "error",
                    f"Path is not a file: {path}",
                    color="red",
                )
    def save_documents(self, metadata: Dict[str, Any]):
        """Save the documents to the storage."""
        chunk_metadatas = [metadata.copy() for _ in self.chunks]
        self.storage.save(self.chunks, chunk_metadatas)
    def convert_to_path(self, path: Union[Path, str]) -> Path:
        """Convert a path to a Path object."""
        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
    def _process_file_paths(self) -> List[Path]:
        """Convert file_path to a list of Path objects."""
        paths = (
            [self.file_path]
            if isinstance(self.file_path, (str, Path))
            else self.file_path
        )
        if not isinstance(paths, list):
            raise ValueError("file_path must be a Path, str, or a list of these types")
        return [self.convert_to_path(path) for path in paths]
--- a/src/crewai/knowledge/source/csv_knowledge_source.py
+++ b/src/crewai/knowledge/source/csv_knowledge_source.py
@@ -10,19 +10,15 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess CSV file content."""
-        super().load_content()  # Validate the file path
+        content_dict = {}
-
+        for file_path in self.safe_file_paths:
-        file_path = (
+            with open(file_path, "r", encoding="utf-8") as csvfile:
-            self.file_path[0] if isinstance(self.file_path, list) else self.file_path
+                reader = csv.reader(csvfile)
-        )
+                content = ""
-        file_path = Path(file_path) if isinstance(file_path, str) else file_path
+                for row in reader:
-
+                    content += " ".join(row) + "\n"
-        with open(file_path, "r", encoding="utf-8") as csvfile:
+                content_dict[file_path] = content
-            reader = csv.reader(csvfile)
+        return content_dict
            content = ""
            for row in reader:
                content += " ".join(row) + "\n"
        return {file_path: content}
    def add(self) -> None:
        """
--- a/src/crewai/knowledge/source/excel_knowledge_source.py
+++ b/src/crewai/knowledge/source/excel_knowledge_source.py
@@ -8,17 +8,15 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess Excel file content."""
        super().load_content()  # Validate the file path
        pd = self._import_dependencies()
-        if isinstance(self.file_path, list):
+        content_dict = {}
-            file_path = self.file_path[0]
+        for file_path in self.safe_file_paths:
-        else:
+            file_path = self.convert_to_path(file_path)
-            file_path = self.file_path
+            df = pd.read_excel(file_path)
-
+            content = df.to_csv(index=False)
-        df = pd.read_excel(file_path)
+            content_dict[file_path] = content
-        content = df.to_csv(index=False)
+        return content_dict
        return {file_path: content}
    def _import_dependencies(self):
        """Dynamically import dependencies."""
--- a/src/crewai/knowledge/source/json_knowledge_source.py
+++ b/src/crewai/knowledge/source/json_knowledge_source.py
@@ -10,11 +10,9 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess JSON file content."""
        super().load_content()  # Validate the file path
        paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
        content: Dict[Path, str] = {}
-        for path in paths:
+        for path in self.safe_file_paths:
            path = self.convert_to_path(path)
            with open(path, "r", encoding="utf-8") as json_file:
                data = json.load(json_file)
            content[path] = self._json_to_text(data)
--- a/src/crewai/knowledge/source/pdf_knowledge_source.py
+++ b/src/crewai/knowledge/source/pdf_knowledge_source.py
@@ -9,14 +9,13 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess PDF file content."""
        super().load_content()  # Validate the file paths
        pdfplumber = self._import_pdfplumber()
        paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
        content = {}
-        for path in paths:
+        for path in self.safe_file_paths:
            text = ""
            path = self.convert_to_path(path)
            with pdfplumber.open(path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
--- a/src/crewai/knowledge/source/text_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/text_file_knowledge_source.py
@@ -9,12 +9,11 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
    def load_content(self) -> Dict[Path, str]:
        """Load and preprocess text file content."""
        super().load_content()
        paths = [self.file_path] if isinstance(self.file_path, Path) else self.file_path
        content = {}
-        for path in paths:
+        for path in self.safe_file_paths:
-            with path.open("r", encoding="utf-8") as f:
+            path = self.convert_to_path(path)
-                content[path] = f.read()  # type: ignore
+            with open(path, "r", encoding="utf-8") as f:
                content[path] = f.read()
        return content
    def add(self) -> None:
--- a/src/crewai/utilities/constants.py
+++ b/src/crewai/utilities/constants.py
@@ -1,3 +1,4 @@
 TRAINING_DATA_FILE = "training_data.pkl"
 TRAINED_AGENTS_DATA_FILE = "trained_agents_data.pkl"
 DEFAULT_SCORE_THRESHOLD = 0.35
 KNOWLEDGE_DIRECTORY = "knowledge"