From 6131dbac4f4899b6062297526eff85698c638a00 Mon Sep 17 00:00:00 2001 From: Brandon Hancock Date: Wed, 6 Nov 2024 15:57:03 -0500 Subject: [PATCH] Improve types and better support for file paths --- src/crewai/agent.py | 26 +++++++++--- .../source/base_file_knowledge_source.py | 24 +++++++++++ .../knowledge/source/base_knowledge_source.py | 18 ++++---- .../knowledge/source/pdf_knowledge_source.py | 41 ++++++------------- .../source/string_knowledge_source.py | 24 ++++------- .../source/text_file_knowledge_source.py | 21 ++-------- tests/agent_test.py | 32 ++++++++++++++- tests/knowledge/knowledge_test.py | 26 ++++++------ 8 files changed, 121 insertions(+), 91 deletions(-) create mode 100644 src/crewai/knowledge/source/base_file_knowledge_source.py diff --git a/src/crewai/agent.py b/src/crewai/agent.py index 817eacb2b..ea4231eb3 100644 --- a/src/crewai/agent.py +++ b/src/crewai/agent.py @@ -8,8 +8,8 @@ from pydantic import Field, InstanceOf, PrivateAttr, model_validator from crewai.agents import CacheHandler from crewai.agents.agent_builder.base_agent import BaseAgent from crewai.agents.crew_agent_executor import CrewAgentExecutor - -# from crewai.knowledge import StringKnowledgeBase +from crewai.knowledge.knowledge import Knowledge +from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource from crewai.llm import LLM from crewai.memory.contextual.contextual_memory import ContextualMemory from crewai.tools import BaseTool @@ -87,9 +87,9 @@ class Agent(BaseAgent): llm: Union[str, InstanceOf[LLM], Any] = Field( description="Language model that will run the agent.", default=None ) - knowledge: Optional[str] = Field( + knowledge_sources: Optional[List[BaseKnowledgeSource]] = Field( default=None, - description="Knowledge base for the agent.", + description="Knowledge sources for the agent.", ) function_calling_llm: Optional[Any] = Field( description="Language model that will run the agent.", default=None @@ -125,6 +125,8 @@ class Agent(BaseAgent): default="safe", description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).", ) + # TODO: We need to add in knowledge config (score, top_k, etc) + _knowledge: Optional[Knowledge] = PrivateAttr(default=None) @model_validator(mode="after") def post_init_setup(self): @@ -189,7 +191,11 @@ class Agent(BaseAgent): if self.allow_code_execution: self._validate_docker_installation() - # self.knowledge = StringKnowledgeBase(content=self.knowledge) + # Initialize the Knowledge object if knowledge_sources are provided + if self.knowledge_sources: + self._knowledge = Knowledge(sources=self.knowledge_sources) + else: + self._knowledge = None return self @@ -234,6 +240,16 @@ class Agent(BaseAgent): if memory.strip() != "": task_prompt += self.i18n.slice("memory").format(memory=memory) + # Integrate the knowledge base + if self._knowledge: + # Query the knowledge base for relevant information + knowledge_snippets = self._knowledge.query(query=task.prompt()) + print("knowledge_snippets", knowledge_snippets) + if knowledge_snippets: + formatted_knowledge = "\n".join(knowledge_snippets) + print("formatted_knowledge", formatted_knowledge) + task_prompt += f"\n\nAdditional Information:\n{formatted_knowledge}" + tools = tools or self.tools or [] self.create_agent_executor(tools=tools, task=task) diff --git a/src/crewai/knowledge/source/base_file_knowledge_source.py b/src/crewai/knowledge/source/base_file_knowledge_source.py new file mode 100644 index 000000000..a658d2e30 --- /dev/null +++ b/src/crewai/knowledge/source/base_file_knowledge_source.py @@ -0,0 +1,24 @@ +from pathlib import Path + +from pydantic import Field + +from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource + + +class BaseFileKnowledgeSource(BaseKnowledgeSource): + """Base class for knowledge sources that load content from files.""" + + file_path: Path = Field(...) + content: str = Field(init=False, default="") + + def model_post_init(self, context): + """Post-initialization method to load content.""" + self.content = self.load_content() + + def load_content(self) -> str: + """Load and preprocess file content. Should be overridden by subclasses.""" + if not self.file_path.exists(): + raise FileNotFoundError(f"File not found: {self.file_path}") + if not self.file_path.is_file(): + raise ValueError(f"Path is not a file: {self.file_path}") + return "" diff --git a/src/crewai/knowledge/source/base_knowledge_source.py b/src/crewai/knowledge/source/base_knowledge_source.py index 15f65f1f6..51675af68 100644 --- a/src/crewai/knowledge/source/base_knowledge_source.py +++ b/src/crewai/knowledge/source/base_knowledge_source.py @@ -2,22 +2,20 @@ from abc import ABC, abstractmethod from typing import List import numpy as np +from pydantic import BaseModel, ConfigDict, Field from crewai.knowledge.embedder.base_embedder import BaseEmbedder -class BaseKnowledgeSource(ABC): +class BaseKnowledgeSource(BaseModel, ABC): """Abstract base class for knowledge sources.""" - def __init__( - self, - chunk_size: int = 1000, - chunk_overlap: int = 200, - ): - self.chunk_size = chunk_size - self.chunk_overlap = chunk_overlap - self.chunks: List[str] = [] - self.chunk_embeddings: List[np.ndarray] = [] + chunk_size: int = 1000 + chunk_overlap: int = 200 + chunks: List[str] = Field(default_factory=list) + chunk_embeddings: List[np.ndarray] = Field(default_factory=list) + + model_config = ConfigDict(arbitrary_types_allowed=True) @abstractmethod def load_content(self): diff --git a/src/crewai/knowledge/source/pdf_knowledge_source.py b/src/crewai/knowledge/source/pdf_knowledge_source.py index c86a8abc2..1ca0ab356 100644 --- a/src/crewai/knowledge/source/pdf_knowledge_source.py +++ b/src/crewai/knowledge/source/pdf_knowledge_source.py @@ -1,22 +1,23 @@ -from pathlib import Path from typing import List from crewai.knowledge.embedder.base_embedder import BaseEmbedder -from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource +from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource -class PDFKnowledgeSource(BaseKnowledgeSource): +class PDFKnowledgeSource(BaseFileKnowledgeSource): """A knowledge source that stores and queries PDF file content using embeddings.""" - def __init__( - self, - file_path: str, - chunk_size: int = 1000, - chunk_overlap: int = 200, - ): - super().__init__(chunk_size, chunk_overlap) - self.file_path = Path(file_path) - self.content = self.load_content() + def load_content(self) -> str: + """Load and preprocess PDF file content.""" + super().load_content() # Validate the file path + pdfplumber = self._import_pdfplumber() + text = "" + with pdfplumber.open(self.file_path) as pdf: + for page in pdf.pages: + page_text = page.extract_text() + if page_text: + text += page_text + "\n" + return text def _import_pdfplumber(self): """Dynamically import pdfplumber.""" @@ -29,22 +30,6 @@ class PDFKnowledgeSource(BaseKnowledgeSource): "pdfplumber is not installed. Please install it with: pip install pdfplumber" ) - def load_content(self) -> str: - """Load and preprocess PDF file content.""" - if not self.file_path.exists(): - raise FileNotFoundError(f"File not found: {self.file_path}") - if not self.file_path.is_file(): - raise ValueError(f"Path is not a file: {self.file_path}") - - pdfplumber = self._import_pdfplumber() - text = "" - with pdfplumber.open(self.file_path) as pdf: - for page in pdf.pages: - page_text = page.extract_text() - if page_text: - text += page_text + "\n" - return text - def add(self, embedder: BaseEmbedder) -> None: """ Add PDF file content to the knowledge source, chunk it, compute embeddings, diff --git a/src/crewai/knowledge/source/string_knowledge_source.py b/src/crewai/knowledge/source/string_knowledge_source.py index a2f423fbd..9dd0ecd9f 100644 --- a/src/crewai/knowledge/source/string_knowledge_source.py +++ b/src/crewai/knowledge/source/string_knowledge_source.py @@ -1,5 +1,7 @@ from typing import List +from pydantic import Field + from crewai.knowledge.embedder.base_embedder import BaseEmbedder from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource @@ -7,29 +9,19 @@ from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource class StringKnowledgeSource(BaseKnowledgeSource): """A knowledge source that stores and queries plain text content using embeddings.""" - def __init__( - self, - content: str, - chunk_size: int = 1000, - chunk_overlap: int = 200, - ): - super().__init__( - chunk_size, - chunk_overlap, - ) - self.content = content + content: str = Field(...) + + def model_post_init(self, context): + """Post-initialization method to validate content.""" self.load_content() def load_content(self): - """Load and preprocess string content.""" + """Validate string content.""" if not isinstance(self.content, str): raise ValueError("StringKnowledgeSource only accepts string content") def add(self, embedder: BaseEmbedder) -> None: - """ - Add string content to the knowledge source, chunk it, compute embeddings, - and save the embeddings. - """ + """Add string content to the knowledge source, chunk it, compute embeddings, and save them.""" new_chunks = self._chunk_text(self.content) self.chunks.extend(new_chunks) # Compute embeddings for the new chunks diff --git a/src/crewai/knowledge/source/text_file_knowledge_source.py b/src/crewai/knowledge/source/text_file_knowledge_source.py index 8c97ae9ca..fb14319e5 100644 --- a/src/crewai/knowledge/source/text_file_knowledge_source.py +++ b/src/crewai/knowledge/source/text_file_knowledge_source.py @@ -1,30 +1,15 @@ -from pathlib import Path from typing import List from crewai.knowledge.embedder.base_embedder import BaseEmbedder -from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource +from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource -class TextFileKnowledgeSource(BaseKnowledgeSource): +class TextFileKnowledgeSource(BaseFileKnowledgeSource): """A knowledge source that stores and queries text file content using embeddings.""" - def __init__( - self, - file_path: str, - chunk_size: int = 1000, - chunk_overlap: int = 200, - ): - super().__init__(chunk_size, chunk_overlap) - self.file_path = Path(file_path) - self.content = self.load_content() - def load_content(self) -> str: """Load and preprocess text file content.""" - if not self.file_path.exists(): - raise FileNotFoundError(f"File not found: {self.file_path}") - if not self.file_path.is_file(): - raise ValueError(f"Path is not a file: {self.file_path}") - + super().load_content() # Validate the file path with self.file_path.open("r", encoding="utf-8") as f: return f.read() diff --git a/tests/agent_test.py b/tests/agent_test.py index c4094d15c..6fcd79d2e 100644 --- a/tests/agent_test.py +++ b/tests/agent_test.py @@ -10,10 +10,11 @@ from crewai import Agent, Crew, Task from crewai.agents.cache import CacheHandler from crewai.agents.crew_agent_executor import CrewAgentExecutor from crewai.agents.parser import AgentAction, CrewAgentParser, OutputParserException +from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource from crewai.llm import LLM +from crewai.tools import tool from crewai.tools.tool_calling import InstructorToolCalling from crewai.tools.tool_usage import ToolUsage -from crewai.tools import tool from crewai.tools.tool_usage_events import ToolUsageFinished from crewai.utilities import RPMController from crewai.utilities.events import Emitter @@ -1574,3 +1575,32 @@ def test_agent_execute_task_with_ollama(): result = agent.execute_task(task) assert len(result.split(".")) == 2 assert "AI" in result or "artificial intelligence" in result.lower() + + +# @pytest.mark.vcr(filter_headers=["authorization"]) +def test_agent_with_knowledge_sources(): + # Create a knowledge source with some content + content = "Brandon's favorite color is blue and he likes Mexican food." + string_source = StringKnowledgeSource(content=content) + + # Create an agent with the knowledge source + agent = Agent( + role="Information Agent", + goal="Provide information based on knowledge sources", + backstory="You have access to specific knowledge sources.", + llm=LLM(model="gpt-3.5-turbo"), + knowledge_sources=[string_source], + ) + + # Create a task that requires the agent to use the knowledge + task = Task( + description="What is Brandon's favorite color?", + expected_output="Brandon's favorite color.", + agent=agent, + ) + + # Execute the task + result = agent.execute_task(task) + + # Assert that the agent provides the correct information + assert "blue" in result.lower() diff --git a/tests/knowledge/knowledge_test.py b/tests/knowledge/knowledge_test.py index c61226d8b..c77b06dee 100644 --- a/tests/knowledge/knowledge_test.py +++ b/tests/knowledge/knowledge_test.py @@ -1,6 +1,6 @@ """Test Knowledge creation and querying functionality.""" -import os +from pathlib import Path from crewai.knowledge.knowledge import Knowledge from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource @@ -141,11 +141,11 @@ def test_multiple_2k_character_strings(): def test_single_short_file(tmpdir): # Create a single short text file content = "Brandon's favorite sport is basketball." - file_path = tmpdir.join("short_file.txt") + file_path = Path(tmpdir.join("short_file.txt")) with open(file_path, "w") as f: f.write(content) - file_source = TextFileKnowledgeSource(file_path=str(file_path)) + file_source = TextFileKnowledgeSource(file_path=file_path) knowledge_base = Knowledge(sources=[file_source]) # Perform a query @@ -180,11 +180,11 @@ def test_single_2k_character_file(tmpdir): "Brandon's favorite sport is basketball, and he often plays with his friends on weekends. " "He is also a fan of the Golden State Warriors and enjoys watching their games. " ) * 2 # Repeat to ensure it's 2k characters - file_path = tmpdir.join("long_file.txt") + file_path = Path(tmpdir.join("long_file.txt")) with open(file_path, "w") as f: f.write(content) - file_source = TextFileKnowledgeSource(file_path=str(file_path)) + file_source = TextFileKnowledgeSource(file_path=file_path) knowledge_base = Knowledge(sources=[file_source]) # Perform a query @@ -204,10 +204,10 @@ def test_multiple_short_files(tmpdir): ] file_paths = [] for i, content in enumerate(contents): - file_path = tmpdir.join(f"file_{i}.txt") + file_path = Path(tmpdir.join(f"file_{i}.txt")) with open(file_path, "w") as f: f.write(content) - file_paths.append(str(file_path)) + file_paths.append(file_path) file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths] knowledge_base = Knowledge(sources=file_sources) @@ -272,10 +272,10 @@ def test_multiple_2k_character_files(tmpdir): ] file_paths = [] for i, content in enumerate(contents): - file_path = tmpdir.join(f"long_file_{i}.txt") + file_path = Path(tmpdir.join(f"long_file_{i}.txt")) with open(file_path, "w") as f: f.write(content) - file_paths.append(str(file_path)) + file_paths.append(file_path) file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths] knowledge_base = Knowledge(sources=file_sources) @@ -307,10 +307,10 @@ def test_hybrid_string_and_files(tmpdir): ] file_paths = [] for i, content in enumerate(file_contents): - file_path = tmpdir.join(f"file_{i}.txt") + file_path = Path(tmpdir.join(f"file_{i}.txt")) with open(file_path, "w") as f: f.write(content) - file_paths.append(str(file_path)) + file_paths.append(file_path) file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths] @@ -327,9 +327,9 @@ def test_hybrid_string_and_files(tmpdir): def test_pdf_knowledge_source(): # Get the directory of the current file - current_dir = os.path.dirname(__file__) + current_dir = Path(__file__).parent # Construct the path to the PDF file - pdf_path = os.path.join(current_dir, "crewai_quickstart.pdf") + pdf_path = current_dir / "crewai_quickstart.pdf" # Create a PDFKnowledgeSource pdf_source = PDFKnowledgeSource(file_path=pdf_path)