Improve types and add better support for file paths

This commit is contained in:
Brandon Hancock
2024-11-06 15:57:03 -05:00
parent 1a35114c08
commit 6131dbac4f
8 changed files with 121 additions and 91 deletions

View File

@@ -8,8 +8,8 @@ from pydantic import Field, InstanceOf, PrivateAttr, model_validator
from crewai.agents import CacheHandler
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.agents.crew_agent_executor import CrewAgentExecutor
# from crewai.knowledge import StringKnowledgeBase
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.llm import LLM
from crewai.memory.contextual.contextual_memory import ContextualMemory
from crewai.tools import BaseTool
@@ -87,9 +87,9 @@ class Agent(BaseAgent):
llm: Union[str, InstanceOf[LLM], Any] = Field(
description="Language model that will run the agent.", default=None
)
knowledge: Optional[str] = Field(
knowledge_sources: Optional[List[BaseKnowledgeSource]] = Field(
default=None,
description="Knowledge base for the agent.",
description="Knowledge sources for the agent.",
)
function_calling_llm: Optional[Any] = Field(
description="Language model that will run the agent.", default=None
@@ -125,6 +125,8 @@ class Agent(BaseAgent):
default="safe",
description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).",
)
# TODO: We need to add in knowledge config (score, top_k, etc)
_knowledge: Optional[Knowledge] = PrivateAttr(default=None)
@model_validator(mode="after")
def post_init_setup(self):
@@ -189,7 +191,11 @@ class Agent(BaseAgent):
if self.allow_code_execution:
self._validate_docker_installation()
# self.knowledge = StringKnowledgeBase(content=self.knowledge)
# Initialize the Knowledge object if knowledge_sources are provided
if self.knowledge_sources:
self._knowledge = Knowledge(sources=self.knowledge_sources)
else:
self._knowledge = None
return self
@@ -234,6 +240,16 @@ class Agent(BaseAgent):
if memory.strip() != "":
task_prompt += self.i18n.slice("memory").format(memory=memory)
# Integrate the knowledge base: when a Knowledge object was built for this
# agent, append whatever it returns for the task prompt to the prompt text.
if self._knowledge:
    # Query the knowledge base for relevant information
    knowledge_snippets = self._knowledge.query(query=task.prompt())
    # Debug print() calls removed: library code should not dump retrieved
    # knowledge to stdout on every task execution.
    if knowledge_snippets:
        formatted_knowledge = "\n".join(knowledge_snippets)
        task_prompt += f"\n\nAdditional Information:\n{formatted_knowledge}"
tools = tools or self.tools or []
self.create_agent_executor(tools=tools, task=task)

View File

@@ -0,0 +1,24 @@
from pathlib import Path
from pydantic import Field
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class BaseFileKnowledgeSource(BaseKnowledgeSource):
    """Shared base for knowledge sources whose content comes from a file."""

    # Path of the file to read; required.
    file_path: Path = Field(...)
    # Filled in by model_post_init() with the result of load_content().
    content: str = Field(init=False, default="")

    def model_post_init(self, context):
        """Set ``content`` from ``load_content()`` once the model is built."""
        loaded = self.load_content()
        self.content = loaded

    def load_content(self) -> str:
        """Validate ``file_path`` and return an empty string.

        Subclasses should override this to return the file's actual text.

        Raises:
            FileNotFoundError: ``file_path`` does not exist.
            ValueError: ``file_path`` exists but is not a file.
        """
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")
        if self.file_path.is_file():
            # Path is valid; the base implementation has no content of its own.
            return ""
        raise ValueError(f"Path is not a file: {self.file_path}")

View File

@@ -2,22 +2,20 @@ from abc import ABC, abstractmethod
from typing import List
import numpy as np
from pydantic import BaseModel, ConfigDict, Field
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeSource(ABC):
class BaseKnowledgeSource(BaseModel, ABC):
"""Abstract base class for knowledge sources."""
def __init__(
self,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.chunks: List[str] = []
self.chunk_embeddings: List[np.ndarray] = []
chunk_size: int = 1000
chunk_overlap: int = 200
chunks: List[str] = Field(default_factory=list)
chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
model_config = ConfigDict(arbitrary_types_allowed=True)
@abstractmethod
def load_content(self):

View File

@@ -1,22 +1,23 @@
from pathlib import Path
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
class PDFKnowledgeSource(BaseKnowledgeSource):
class PDFKnowledgeSource(BaseFileKnowledgeSource):
"""A knowledge source that stores and queries PDF file content using embeddings."""
def __init__(
self,
file_path: str,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(chunk_size, chunk_overlap)
self.file_path = Path(file_path)
self.content = self.load_content()
def load_content(self) -> str:
"""Load and preprocess PDF file content."""
super().load_content() # Validate the file path
pdfplumber = self._import_pdfplumber()
text = ""
with pdfplumber.open(self.file_path) as pdf:
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
return text
def _import_pdfplumber(self):
"""Dynamically import pdfplumber."""
@@ -29,22 +30,6 @@ class PDFKnowledgeSource(BaseKnowledgeSource):
"pdfplumber is not installed. Please install it with: pip install pdfplumber"
)
def load_content(self) -> str:
"""Load and preprocess PDF file content."""
if not self.file_path.exists():
raise FileNotFoundError(f"File not found: {self.file_path}")
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
pdfplumber = self._import_pdfplumber()
text = ""
with pdfplumber.open(self.file_path) as pdf:
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
return text
def add(self, embedder: BaseEmbedder) -> None:
"""
Add PDF file content to the knowledge source, chunk it, compute embeddings,

View File

@@ -1,5 +1,7 @@
from typing import List
from pydantic import Field
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
@@ -7,29 +9,19 @@ from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class StringKnowledgeSource(BaseKnowledgeSource):
"""A knowledge source that stores and queries plain text content using embeddings."""
def __init__(
self,
content: str,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(
chunk_size,
chunk_overlap,
)
self.content = content
content: str = Field(...)
def model_post_init(self, context):
"""Post-initialization method to validate content."""
self.load_content()
def load_content(self):
"""Load and preprocess string content."""
"""Validate string content."""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeSource only accepts string content")
def add(self, embedder: BaseEmbedder) -> None:
"""
Add string content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
"""Add string content to the knowledge source, chunk it, compute embeddings, and save them."""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks

View File

@@ -1,30 +1,15 @@
from pathlib import Path
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
class TextFileKnowledgeSource(BaseKnowledgeSource):
class TextFileKnowledgeSource(BaseFileKnowledgeSource):
"""A knowledge source that stores and queries text file content using embeddings."""
def __init__(
self,
file_path: str,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(chunk_size, chunk_overlap)
self.file_path = Path(file_path)
self.content = self.load_content()
def load_content(self) -> str:
"""Load and preprocess text file content."""
if not self.file_path.exists():
raise FileNotFoundError(f"File not found: {self.file_path}")
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
super().load_content() # Validate the file path
with self.file_path.open("r", encoding="utf-8") as f:
return f.read()

View File

@@ -10,10 +10,11 @@ from crewai import Agent, Crew, Task
from crewai.agents.cache import CacheHandler
from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.agents.parser import AgentAction, CrewAgentParser, OutputParserException
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.llm import LLM
from crewai.tools import tool
from crewai.tools.tool_calling import InstructorToolCalling
from crewai.tools.tool_usage import ToolUsage
from crewai.tools import tool
from crewai.tools.tool_usage_events import ToolUsageFinished
from crewai.utilities import RPMController
from crewai.utilities.events import Emitter
@@ -1574,3 +1575,32 @@ def test_agent_execute_task_with_ollama():
result = agent.execute_task(task)
assert len(result.split(".")) == 2
assert "AI" in result or "artificial intelligence" in result.lower()
# @pytest.mark.vcr(filter_headers=["authorization"])
def test_agent_with_knowledge_sources():
    """An agent built with a string knowledge source answers using that content."""
    # Seed one in-memory fact as the agent's only knowledge source.
    fact = "Brandon's favorite color is blue and he likes Mexican food."
    source = StringKnowledgeSource(content=fact)

    # Attach the knowledge source when constructing the agent.
    info_agent = Agent(
        role="Information Agent",
        goal="Provide information based on knowledge sources",
        backstory="You have access to specific knowledge sources.",
        llm=LLM(model="gpt-3.5-turbo"),
        knowledge_sources=[source],
    )

    # Ask a question about the seeded fact.
    question = Task(
        description="What is Brandon's favorite color?",
        expected_output="Brandon's favorite color.",
        agent=info_agent,
    )

    answer = info_agent.execute_task(question)

    # The reply should mention the color stated in the knowledge source.
    assert "blue" in answer.lower()

View File

@@ -1,6 +1,6 @@
"""Test Knowledge creation and querying functionality."""
import os
from pathlib import Path
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
@@ -141,11 +141,11 @@ def test_multiple_2k_character_strings():
def test_single_short_file(tmpdir):
# Create a single short text file
content = "Brandon's favorite sport is basketball."
file_path = tmpdir.join("short_file.txt")
file_path = Path(tmpdir.join("short_file.txt"))
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
file_source = TextFileKnowledgeSource(file_path=file_path)
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
@@ -180,11 +180,11 @@ def test_single_2k_character_file(tmpdir):
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
) * 2 # Repeat to ensure it's 2k characters
file_path = tmpdir.join("long_file.txt")
file_path = Path(tmpdir.join("long_file.txt"))
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
file_source = TextFileKnowledgeSource(file_path=file_path)
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
@@ -204,10 +204,10 @@ def test_multiple_short_files(tmpdir):
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"file_{i}.txt")
file_path = Path(tmpdir.join(f"file_{i}.txt"))
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_paths.append(file_path)
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
@@ -272,10 +272,10 @@ def test_multiple_2k_character_files(tmpdir):
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"long_file_{i}.txt")
file_path = Path(tmpdir.join(f"long_file_{i}.txt"))
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_paths.append(file_path)
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
@@ -307,10 +307,10 @@ def test_hybrid_string_and_files(tmpdir):
]
file_paths = []
for i, content in enumerate(file_contents):
file_path = tmpdir.join(f"file_{i}.txt")
file_path = Path(tmpdir.join(f"file_{i}.txt"))
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_paths.append(file_path)
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
@@ -327,9 +327,9 @@ def test_hybrid_string_and_files(tmpdir):
def test_pdf_knowledge_source():
# Get the directory of the current file
current_dir = os.path.dirname(__file__)
current_dir = Path(__file__).parent
# Construct the path to the PDF file
pdf_path = os.path.join(current_dir, "crewai_quickstart.pdf")
pdf_path = current_dir / "crewai_quickstart.pdf"
# Create a PDFKnowledgeSource
pdf_source = PDFKnowledgeSource(file_path=pdf_path)