initial knowledge

This commit is contained in:
João Moura
2024-11-04 15:53:19 -03:00
parent 57201fb856
commit 75322b2de1
7 changed files with 460 additions and 127 deletions

View File

@@ -8,6 +8,7 @@ from pydantic import Field, InstanceOf, PrivateAttr, model_validator
from crewai.agents import CacheHandler
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.knowledge import StringKnowledgeBase
from crewai.llm import LLM
from crewai.memory.contextual.contextual_memory import ContextualMemory
from crewai.tools.agent_tools.agent_tools import AgentTools
@@ -51,6 +52,7 @@ class Agent(BaseAgent):
role: The role of the agent.
goal: The objective of the agent.
backstory: The backstory of the agent.
knowledge: The knowledge base of the agent.
config: Dict representation of agent configuration.
llm: The language model that will run the agent.
function_calling_llm: The language model that will handle the tool calling for this agent, it overrides the crew function_calling_llm.
@@ -84,6 +86,10 @@ class Agent(BaseAgent):
llm: Union[str, InstanceOf[LLM], Any] = Field(
description="Language model that will run the agent.", default=None
)
knowledge: Optional[str] = Field(
default=None,
description="Knowledge base for the agent.",
)
function_calling_llm: Optional[Any] = Field(
description="Language model that will run the agent.", default=None
)
@@ -182,6 +188,8 @@ class Agent(BaseAgent):
if self.allow_code_execution:
self._validate_docker_installation()
self.knowledge = StringKnowledgeBase(content=self.knowledge)
return self
def _setup_agent_executor(self):

View File

View File

@@ -0,0 +1,115 @@
from typing import List, Any, Optional, Dict
from abc import ABC, abstractmethod
import numpy as np
from .embeddings import Embeddings
class BaseKnowledgeBase(ABC):
    """Abstract base class for knowledge bases.

    Stores text chunks in ``self.chunks`` and one embedding per chunk in
    ``self.chunk_embeddings``, keyed by the chunk's position in
    ``self.chunks``. Subclasses implement ``add`` and ``query``.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        embeddings_class: Optional["Embeddings"] = None,
    ):
        """Set up chunking parameters and the embedding backend.

        Args:
            chunk_size: Target number of characters per chunk.
            chunk_overlap: Number of characters shared between consecutive
                chunks.
            embeddings_class: Embedding backend instance exposing
                ``embed_texts`` and ``embed_text``. A default ``Embeddings()``
                is created when omitted.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must not be negative")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.chunks: List[str] = []
        self.chunk_embeddings: Dict[int, np.ndarray] = {}
        self.embeddings_class = (
            Embeddings() if embeddings_class is None else embeddings_class
        )

    @abstractmethod
    def query(self, query: str) -> str:
        """Query the knowledge base and return relevant information"""
        pass

    @abstractmethod
    def add(self, content: Any) -> None:
        """Process and store content in the knowledge base"""
        pass

    def reset(self) -> None:
        """Reset the knowledge base by discarding all chunks and embeddings"""
        self.chunks = []
        self.chunk_embeddings = {}

    def _embed_chunks(self, new_chunks: List[str]) -> None:
        """Embed chunks and store the embeddings by chunk index.

        Indices continue from the number of embeddings already stored rather
        than from ``len(self.chunks)``. As long as chunks and embeddings are
        added together, this keeps each embedding aligned with its chunk
        whether the caller appends ``new_chunks`` to ``self.chunks`` before or
        after calling this method.
        """
        if not new_chunks:
            return

        embeddings = self.embeddings_class.embed_texts(new_chunks)

        start_idx = len(self.chunk_embeddings)
        for offset, embedding in enumerate(embeddings):
            self.chunk_embeddings[start_idx + offset] = embedding

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into chunks with overlap.

        Each chunk targets ``chunk_size`` characters. The cut is moved to the
        last natural break found within 100 characters either side of the
        nominal end. Chunks are stripped and empty chunks are never emitted.

        Args:
            text: The text to split.

        Returns:
            The list of non-empty, stripped chunks in document order.
        """
        chunks: List[str] = []
        start = 0
        text_length = len(text)
        # Priority: double newline > single newline > period > space
        break_chars = ["\n\n", "\n", ". ", " "]

        while start < text_length:
            end = start + self.chunk_size

            if end >= text_length:
                # Final piece: take the rest, skipping a whitespace-only tail
                tail = text[start:].strip()
                if tail:
                    chunks.append(tail)
                break

            # The search window does not depend on the break character, so
            # it is computed once per chunk.
            window_start = max(end - 100, start)
            window_end = min(end + 100, text_length)
            window_text = text[window_start:window_end]

            chunk_end = end
            for break_char in break_chars:
                last_break = window_text.rfind(break_char)
                if last_break != -1:
                    chunk_end = window_start + last_break + len(break_char)
                    break

            chunk = text[start:chunk_end].strip()
            if chunk:
                chunks.append(chunk)

            next_start = max(
                end - self.chunk_overlap,
                chunk_end - self.chunk_overlap,
            )
            # Never start beyond the end of the chunk just emitted; doing so
            # would silently drop the text in between.
            next_start = min(next_start, chunk_end)
            # Always move forward, otherwise an overlap as large as the chunk
            # size would loop forever.
            if next_start <= start:
                next_start = max(chunk_end, start + 1)
            start = next_start

        return chunks

    def _find_similar_chunks(self, query: str, top_k: int = 3) -> List[str]:
        """Find the chunks most similar to a query using embeddings.

        Similarity is the dot product between the query embedding and each
        stored chunk embedding.

        Args:
            query: The text to search for.
            top_k: Maximum number of chunks to return.

        Returns:
            Up to ``top_k`` chunks, most similar first. Empty when nothing is
            stored or ``top_k`` is not positive.
        """
        if top_k <= 0 or not self.chunks or not self.chunk_embeddings:
            return []

        query_embedding = self.embeddings_class.embed_text(query)

        scored = [
            (np.dot(query_embedding, chunk_embedding), idx)
            for idx, chunk_embedding in self.chunk_embeddings.items()
        ]
        scored.sort(reverse=True)

        return [self.chunks[idx] for _, idx in scored[:top_k]]

View File

@@ -0,0 +1,78 @@
from typing import List, Optional, Union
from pathlib import Path
import numpy as np
try:
from fastembed_gpu import TextEmbedding
FASTEMBED_AVAILABLE = True
except ImportError:
try:
from fastembed import TextEmbedding
FASTEMBED_AVAILABLE = True
except ImportError:
FASTEMBED_AVAILABLE = False
class Embeddings:
    """
    A wrapper class for text embedding models using FastEmbed
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-small-en-v1.5",
        cache_dir: Optional[Union[str, Path]] = None,
    ):
        """
        Initialize the embedding model

        Args:
            model_name: Name of the model to use
            cache_dir: Directory to cache the model

        Raises:
            ImportError: If neither fastembed-gpu nor fastembed could be
                imported.
        """
        if not FASTEMBED_AVAILABLE:
            raise ImportError(
                "FastEmbed is not installed. Please install it with: "
                "pip install fastembed or pip install fastembed-gpu for GPU support"
            )

        self.model = TextEmbedding(
            model_name=model_name,
            cache_dir=str(cache_dir) if cache_dir else None,
        )
        # Filled in lazily by the ``dimension`` property.
        self._dimension: Optional[int] = None

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of texts to embed

        Returns:
            Array of embeddings, one row per input text
        """
        # The model's embed() result is iterated into a list before being
        # stacked into a single array.
        return np.array(list(self.model.embed(texts)))

    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding array
        """
        return self.embed_texts([text])[0]

    @property
    def dimension(self) -> int:
        """Get the dimension of the embeddings"""
        # Probing requires running the model, so the result is cached after
        # the first access. getattr() tolerates instances created without
        # going through __init__.
        cached = getattr(self, "_dimension", None)
        if cached is None:
            cached = len(self.embed_text("test"))
            self._dimension = cached
        return cached

View File

@@ -0,0 +1,40 @@
from typing import Optional
from crewai.knowledge.base_knowledge import BaseKnowledgeBase
from crewai.knowledge.embeddings import Embeddings
class StringKnowledgeBase(BaseKnowledgeBase):
    """A knowledge base that stores and queries plain text content using embeddings"""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        embeddings_class: Optional[Embeddings] = None,
        content: Optional[str] = None,
    ):
        """Create the knowledge base and optionally seed it with text.

        Args:
            chunk_size: Target number of characters per chunk.
            chunk_overlap: Number of characters shared between consecutive
                chunks.
            embeddings_class: Embedding backend instance; forwarded to the
                base class.
            content: Initial text to add. ``None`` or an empty string adds
                nothing.
        """
        super().__init__(chunk_size, chunk_overlap, embeddings_class)
        if content:
            self.add(content)

    def add(self, content: str) -> None:
        """Add text content to the knowledge base, chunk it, and compute embeddings.

        Args:
            content: The text to add.

        Raises:
            ValueError: If ``content`` is not a string.
        """
        if not isinstance(content, str):
            raise ValueError("StringKnowledgeBase only accepts string content")

        new_chunks = self._chunk_text(content)

        # Embed BEFORE extending self.chunks. The base class numbers new
        # embeddings relative to what is already stored, so extending first
        # can shift every index past the end of self.chunks and make lookups
        # fail. Embedding first also leaves the knowledge base unchanged if
        # the embedding call raises.
        self._embed_chunks(new_chunks)
        self.chunks.extend(new_chunks)

    def query(self, query: str, top_k: int = 3) -> str:
        """
        Query the knowledge base using semantic search

        Args:
            query: The text to search for.
            top_k: Number of candidate chunks to retrieve; only the best one
                is returned.

        Returns:
            The most relevant chunk based on embedding similarity, or an
            empty string when no chunk is found.
        """
        similar_chunks = self._find_similar_chunks(query, top_k=top_k)
        return similar_chunks[0] if similar_chunks else ""