From 6131dbac4f4899b6062297526eff85698c638a00 Mon Sep 17 00:00:00 2001
From: Brandon Hancock <brandon@brandonhancock.io>
Date: Wed, 6 Nov 2024 15:57:03 -0500
Subject: [PATCH] Improve types and better support for file paths

---
 src/crewai/agent.py                           | 26 +++++++++---
 .../source/base_file_knowledge_source.py      | 24 +++++++++++
 .../knowledge/source/base_knowledge_source.py | 18 ++++----
 .../knowledge/source/pdf_knowledge_source.py  | 41 ++++++-------------
 .../source/string_knowledge_source.py         | 24 ++++-------
 .../source/text_file_knowledge_source.py      | 21 ++--------
 tests/agent_test.py                           | 32 ++++++++++++++-
 tests/knowledge/knowledge_test.py             | 26 ++++++------
 8 files changed, 121 insertions(+), 91 deletions(-)
 create mode 100644 src/crewai/knowledge/source/base_file_knowledge_source.py

diff --git a/src/crewai/agent.py b/src/crewai/agent.py
index 817eacb2b..ea4231eb3 100644
--- a/src/crewai/agent.py
+++ b/src/crewai/agent.py
@@ -8,8 +8,8 @@ from pydantic import Field, InstanceOf, PrivateAttr, model_validator
 from crewai.agents import CacheHandler
 from crewai.agents.agent_builder.base_agent import BaseAgent
 from crewai.agents.crew_agent_executor import CrewAgentExecutor
-
-# from crewai.knowledge import StringKnowledgeBase
+from crewai.knowledge.knowledge import Knowledge
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 from crewai.llm import LLM
 from crewai.memory.contextual.contextual_memory import ContextualMemory
 from crewai.tools import BaseTool
@@ -87,9 +87,9 @@ class Agent(BaseAgent):
     llm: Union[str, InstanceOf[LLM], Any] = Field(
         description="Language model that will run the agent.", default=None
     )
-    knowledge: Optional[str] = Field(
+    knowledge_sources: Optional[List[BaseKnowledgeSource]] = Field(
         default=None,
-        description="Knowledge base for the agent.",
+        description="Knowledge sources for the agent.",
     )
     function_calling_llm: Optional[Any] = Field(
         description="Language model that will run the agent.", default=None
@@ -125,6 +125,8 @@ class Agent(BaseAgent):
         default="safe",
         description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).",
     )
+    # TODO: We need to add in knowledge config (score, top_k, etc)
+    _knowledge: Optional[Knowledge] = PrivateAttr(default=None)
 
     @model_validator(mode="after")
     def post_init_setup(self):
@@ -189,7 +191,11 @@ class Agent(BaseAgent):
         if self.allow_code_execution:
             self._validate_docker_installation()
 
-        # self.knowledge = StringKnowledgeBase(content=self.knowledge)
+        # Initialize the Knowledge object if knowledge_sources are provided
+        if self.knowledge_sources:
+            self._knowledge = Knowledge(sources=self.knowledge_sources)
+        else:
+            self._knowledge = None
 
         return self
 
@@ -234,6 +240,16 @@ class Agent(BaseAgent):
             if memory.strip() != "":
                 task_prompt += self.i18n.slice("memory").format(memory=memory)
 
+        # Integrate the knowledge base
+        if self._knowledge:
+            # Query the knowledge base for information relevant to this task
+            knowledge_snippets = self._knowledge.query(query=task.prompt())
+            if knowledge_snippets:
+                # Append the snippets to the prompt as extra context. They are
+                # deliberately not printed: library code should not write to stdout.
+                formatted_knowledge = "\n".join(knowledge_snippets)
+                task_prompt += f"\n\nAdditional Information:\n{formatted_knowledge}"
+
         tools = tools or self.tools or []
         self.create_agent_executor(tools=tools, task=task)
 
diff --git a/src/crewai/knowledge/source/base_file_knowledge_source.py b/src/crewai/knowledge/source/base_file_knowledge_source.py
new file mode 100644
index 000000000..a658d2e30
--- /dev/null
+++ b/src/crewai/knowledge/source/base_file_knowledge_source.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+
+from pydantic import Field
+
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
+
+
+class BaseFileKnowledgeSource(BaseKnowledgeSource):
+    """Base class for knowledge sources that load their content from the file at `file_path`."""
+
+    file_path: Path = Field(...)  # required: location of the file to load
+    content: str = Field(init=False, default="")  # overwritten in model_post_init with the loaded text
+
+    def model_post_init(self, context) -> None:
+        """Load the file's content into `content` (pydantic post-init hook)."""
+        self.content = self.load_content()
+
+    def load_content(self) -> str:
+        """Check that `file_path` is an existing file; subclasses extend this to read it."""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File not found: {self.file_path}")
+        if not self.file_path.is_file():
+            raise ValueError(f"Path is not a file: {self.file_path}")
+        return ""  # the base class only validates the path; it reads nothing
diff --git a/src/crewai/knowledge/source/base_knowledge_source.py b/src/crewai/knowledge/source/base_knowledge_source.py
index 15f65f1f6..51675af68 100644
--- a/src/crewai/knowledge/source/base_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_knowledge_source.py
@@ -2,22 +2,20 @@ from abc import ABC, abstractmethod
 from typing import List
 
 import numpy as np
+from pydantic import BaseModel, ConfigDict, Field
 
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 
 
-class BaseKnowledgeSource(ABC):
+class BaseKnowledgeSource(BaseModel, ABC):
     """Abstract base class for knowledge sources."""
 
-    def __init__(
-        self,
-        chunk_size: int = 1000,
-        chunk_overlap: int = 200,
-    ):
-        self.chunk_size = chunk_size
-        self.chunk_overlap = chunk_overlap
-        self.chunks: List[str] = []
-        self.chunk_embeddings: List[np.ndarray] = []
+    chunk_size: int = 1000
+    chunk_overlap: int = 200
+    chunks: List[str] = Field(default_factory=list)
+    chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
     @abstractmethod
     def load_content(self):
diff --git a/src/crewai/knowledge/source/pdf_knowledge_source.py b/src/crewai/knowledge/source/pdf_knowledge_source.py
index c86a8abc2..1ca0ab356 100644
--- a/src/crewai/knowledge/source/pdf_knowledge_source.py
+++ b/src/crewai/knowledge/source/pdf_knowledge_source.py
@@ -1,22 +1,23 @@
-from pathlib import Path
 from typing import List
 
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
-from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
+from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
 
 
-class PDFKnowledgeSource(BaseKnowledgeSource):
+class PDFKnowledgeSource(BaseFileKnowledgeSource):
     """A knowledge source that stores and queries PDF file content using embeddings."""
 
-    def __init__(
-        self,
-        file_path: str,
-        chunk_size: int = 1000,
-        chunk_overlap: int = 200,
-    ):
-        super().__init__(chunk_size, chunk_overlap)
-        self.file_path = Path(file_path)
-        self.content = self.load_content()
+    def load_content(self) -> str:
+        """Extract the text of the PDF at `file_path`.
+
+        Pages that yield no text are skipped; the text of every other page is
+        followed by a newline.
+        """
+        super().load_content()  # Validate the file path
+        pdfplumber = self._import_pdfplumber()
+        with pdfplumber.open(self.file_path) as pdf:
+            page_texts = (page.extract_text() for page in pdf.pages)
+            return "".join(page_text + "\n" for page_text in page_texts if page_text)
 
     def _import_pdfplumber(self):
         """Dynamically import pdfplumber."""
@@ -29,22 +30,6 @@ class PDFKnowledgeSource(BaseKnowledgeSource):
                 "pdfplumber is not installed. Please install it with: pip install pdfplumber"
             )
 
-    def load_content(self) -> str:
-        """Load and preprocess PDF file content."""
-        if not self.file_path.exists():
-            raise FileNotFoundError(f"File not found: {self.file_path}")
-        if not self.file_path.is_file():
-            raise ValueError(f"Path is not a file: {self.file_path}")
-
-        pdfplumber = self._import_pdfplumber()
-        text = ""
-        with pdfplumber.open(self.file_path) as pdf:
-            for page in pdf.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    text += page_text + "\n"
-        return text
-
     def add(self, embedder: BaseEmbedder) -> None:
         """
         Add PDF file content to the knowledge source, chunk it, compute embeddings,
diff --git a/src/crewai/knowledge/source/string_knowledge_source.py b/src/crewai/knowledge/source/string_knowledge_source.py
index a2f423fbd..9dd0ecd9f 100644
--- a/src/crewai/knowledge/source/string_knowledge_source.py
+++ b/src/crewai/knowledge/source/string_knowledge_source.py
@@ -1,5 +1,7 @@
 from typing import List
 
+from pydantic import Field
+
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 
@@ -7,29 +9,19 @@ from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 class StringKnowledgeSource(BaseKnowledgeSource):
     """A knowledge source that stores and queries plain text content using embeddings."""
 
-    def __init__(
-        self,
-        content: str,
-        chunk_size: int = 1000,
-        chunk_overlap: int = 200,
-    ):
-        super().__init__(
-            chunk_size,
-            chunk_overlap,
-        )
-        self.content = content
+    content: str = Field(...)
+
+    def model_post_init(self, context) -> None:
+        """Validate `content` once the model is initialized by calling `load_content()`."""
+        self.load_content()
 
     def load_content(self):
-        """Load and preprocess string content."""
+        """Raise ValueError unless `content` is a string; nothing is loaded or returned."""
         if not isinstance(self.content, str):
             raise ValueError("StringKnowledgeSource only accepts string content")
 
     def add(self, embedder: BaseEmbedder) -> None:
-        """
-        Add string content to the knowledge source, chunk it, compute embeddings,
-        and save the embeddings.
-        """
+        """Add string content to the knowledge source, chunk it, compute embeddings, and save them."""
         new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
         # Compute embeddings for the new chunks
diff --git a/src/crewai/knowledge/source/text_file_knowledge_source.py b/src/crewai/knowledge/source/text_file_knowledge_source.py
index 8c97ae9ca..fb14319e5 100644
--- a/src/crewai/knowledge/source/text_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/text_file_knowledge_source.py
@@ -1,30 +1,15 @@
-from pathlib import Path
 from typing import List
 
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
-from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
+from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
 
 
-class TextFileKnowledgeSource(BaseKnowledgeSource):
+class TextFileKnowledgeSource(BaseFileKnowledgeSource):
     """A knowledge source that stores and queries text file content using embeddings."""
 
-    def __init__(
-        self,
-        file_path: str,
-        chunk_size: int = 1000,
-        chunk_overlap: int = 200,
-    ):
-        super().__init__(chunk_size, chunk_overlap)
-        self.file_path = Path(file_path)
-        self.content = self.load_content()
-
     def load_content(self) -> str:
         """Load and preprocess text file content."""
-        if not self.file_path.exists():
-            raise FileNotFoundError(f"File not found: {self.file_path}")
-        if not self.file_path.is_file():
-            raise ValueError(f"Path is not a file: {self.file_path}")
-
+        super().load_content()  # Validate the file path
         with self.file_path.open("r", encoding="utf-8") as f:
             return f.read()
 
diff --git a/tests/agent_test.py b/tests/agent_test.py
index c4094d15c..6fcd79d2e 100644
--- a/tests/agent_test.py
+++ b/tests/agent_test.py
@@ -10,10 +10,11 @@ from crewai import Agent, Crew, Task
 from crewai.agents.cache import CacheHandler
 from crewai.agents.crew_agent_executor import CrewAgentExecutor
 from crewai.agents.parser import AgentAction, CrewAgentParser, OutputParserException
+from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
 from crewai.llm import LLM
+from crewai.tools import tool
 from crewai.tools.tool_calling import InstructorToolCalling
 from crewai.tools.tool_usage import ToolUsage
-from crewai.tools import tool
 from crewai.tools.tool_usage_events import ToolUsageFinished
 from crewai.utilities import RPMController
 from crewai.utilities.events import Emitter
@@ -1574,3 +1575,32 @@ def test_agent_execute_task_with_ollama():
     result = agent.execute_task(task)
     assert len(result.split(".")) == 2
     assert "AI" in result or "artificial intelligence" in result.lower()
+
+
+# @pytest.mark.vcr(filter_headers=["authorization"])
+def test_agent_with_knowledge_sources() -> None:
+    """An agent given a StringKnowledgeSource should answer using that knowledge."""
+    content = "Brandon's favorite color is blue and he likes Mexican food."
+    string_source = StringKnowledgeSource(content=content)
+
+    # Create an agent with the knowledge source
+    agent = Agent(
+        role="Information Agent",
+        goal="Provide information based on knowledge sources",
+        backstory="You have access to specific knowledge sources.",
+        llm=LLM(model="gpt-3.5-turbo"),
+        knowledge_sources=[string_source],
+    )
+
+    # Create a task that requires the agent to use the knowledge
+    task = Task(
+        description="What is Brandon's favorite color?",
+        expected_output="Brandon's favorite color.",
+        agent=agent,
+    )
+
+    # Execute the task. NOTE(review): the vcr marker is commented out; this likely calls the live API.
+    result = agent.execute_task(task)
+
+    # Assert that the agent provides the correct information
+    assert "blue" in result.lower()
diff --git a/tests/knowledge/knowledge_test.py b/tests/knowledge/knowledge_test.py
index c61226d8b..c77b06dee 100644
--- a/tests/knowledge/knowledge_test.py
+++ b/tests/knowledge/knowledge_test.py
@@ -1,6 +1,6 @@
 """Test Knowledge creation and querying functionality."""
 
-import os
+from pathlib import Path
 
 from crewai.knowledge.knowledge import Knowledge
 from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
@@ -141,11 +141,11 @@ def test_multiple_2k_character_strings():
 def test_single_short_file(tmpdir):
     # Create a single short text file
     content = "Brandon's favorite sport is basketball."
-    file_path = tmpdir.join("short_file.txt")
+    file_path = Path(tmpdir.join("short_file.txt"))
     with open(file_path, "w") as f:
         f.write(content)
 
-    file_source = TextFileKnowledgeSource(file_path=str(file_path))
+    file_source = TextFileKnowledgeSource(file_path=file_path)
     knowledge_base = Knowledge(sources=[file_source])
 
     # Perform a query
@@ -180,11 +180,11 @@ def test_single_2k_character_file(tmpdir):
         "Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
         "He is also a fan of the Golden State Warriors and enjoys watching their games. "
     ) * 2  # Repeat to ensure it's 2k characters
-    file_path = tmpdir.join("long_file.txt")
+    file_path = Path(tmpdir.join("long_file.txt"))
     with open(file_path, "w") as f:
         f.write(content)
 
-    file_source = TextFileKnowledgeSource(file_path=str(file_path))
+    file_source = TextFileKnowledgeSource(file_path=file_path)
     knowledge_base = Knowledge(sources=[file_source])
 
     # Perform a query
@@ -204,10 +204,10 @@ def test_multiple_short_files(tmpdir):
     ]
     file_paths = []
     for i, content in enumerate(contents):
-        file_path = tmpdir.join(f"file_{i}.txt")
+        file_path = Path(tmpdir.join(f"file_{i}.txt"))
         with open(file_path, "w") as f:
             f.write(content)
-        file_paths.append(str(file_path))
+        file_paths.append(file_path)
 
     file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
     knowledge_base = Knowledge(sources=file_sources)
@@ -272,10 +272,10 @@ def test_multiple_2k_character_files(tmpdir):
     ]
     file_paths = []
     for i, content in enumerate(contents):
-        file_path = tmpdir.join(f"long_file_{i}.txt")
+        file_path = Path(tmpdir.join(f"long_file_{i}.txt"))
         with open(file_path, "w") as f:
             f.write(content)
-        file_paths.append(str(file_path))
+        file_paths.append(file_path)
 
     file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
     knowledge_base = Knowledge(sources=file_sources)
@@ -307,10 +307,10 @@ def test_hybrid_string_and_files(tmpdir):
     ]
     file_paths = []
     for i, content in enumerate(file_contents):
-        file_path = tmpdir.join(f"file_{i}.txt")
+        file_path = Path(tmpdir.join(f"file_{i}.txt"))
         with open(file_path, "w") as f:
             f.write(content)
-        file_paths.append(str(file_path))
+        file_paths.append(file_path)
 
     file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
 
@@ -327,9 +327,9 @@ def test_hybrid_string_and_files(tmpdir):
 
 def test_pdf_knowledge_source():
     # Get the directory of the current file
-    current_dir = os.path.dirname(__file__)
+    current_dir = Path(__file__).parent
     # Construct the path to the PDF file
-    pdf_path = os.path.join(current_dir, "crewai_quickstart.pdf")
+    pdf_path = current_dir / "crewai_quickstart.pdf"
 
     # Create a PDFKnowledgeSource
     pdf_source = PDFKnowledgeSource(file_path=pdf_path)