This commit is contained in:
Brandon Hancock
2024-11-05 12:04:58 -05:00
parent dc314c1151
commit a8a2f80616
7 changed files with 163 additions and 34 deletions

View File

@@ -8,11 +8,12 @@ from pydantic import Field, InstanceOf, PrivateAttr, model_validator
from crewai.agents import CacheHandler
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.knowledge import StringKnowledgeBase
# from crewai.knowledge import StringKnowledgeBase
from crewai.llm import LLM
from crewai.memory.contextual.contextual_memory import ContextualMemory
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.tools import BaseTool
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.utilities import Converter, Prompts
from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE
from crewai.utilities.token_counter_callback import TokenCalcHandler
@@ -188,7 +189,7 @@ class Agent(BaseAgent):
if self.allow_code_execution:
self._validate_docker_installation()
self.knowledge = StringKnowledgeBase(content=self.knowledge)
# self.knowledge = StringKnowledgeBase(content=self.knowledge)
return self

View File

@@ -0,0 +1,55 @@
from abc import ABC, abstractmethod
from typing import List
import numpy as np
class BaseEmbedder(ABC):
    """Interface that every text embedding backend must implement."""

    @abstractmethod
    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
        """
        Produce embeddings for a batch of text chunks.

        Args:
            chunks: The text chunks to embed.

        Returns:
            An array holding the embeddings.
        """

    @abstractmethod
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """
        Produce embeddings for a batch of texts.

        Args:
            texts: The texts to embed.

        Returns:
            An array holding the embeddings.
        """

    @abstractmethod
    def embed_text(self, text: str) -> np.ndarray:
        """
        Produce the embedding for one text.

        Args:
            text: The text to embed.

        Returns:
            The embedding as an array.
        """

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Return the dimension of the embeddings."""

View File

@@ -1,9 +1,12 @@
from typing import List, Optional, Union
from pathlib import Path
from typing import List, Optional, Union
import numpy as np
from .base_embedder import BaseEmbedder
try:
from fastembed_gpu import TextEmbedding
from fastembed_gpu import TextEmbedding # type: ignore
FASTEMBED_AVAILABLE = True
except ImportError:
@@ -15,7 +18,7 @@ except ImportError:
FASTEMBED_AVAILABLE = False
class Embeddings:
class FastEmbed(BaseEmbedder):
"""
A wrapper class for text embedding models using FastEmbed
"""
@@ -44,6 +47,20 @@ class Embeddings:
cache_dir=str(cache_dir) if cache_dir else None,
)
def embed_chunks(self, chunks: List[str]) -> np.ndarray:
"""
Generate embeddings for a list of text chunks
Args:
chunks: List of text chunks to embed
Returns:
Array of embeddings
"""
# FastEmbed returns a generator, convert to list then numpy array
embeddings = list(self.model.embed(chunks))
return np.array(embeddings)
def embed_texts(self, texts: List[str]) -> np.ndarray:
"""
Generate embeddings for a list of texts

View File

@@ -0,0 +1,21 @@
from typing import List, Optional
from pydantic import BaseModel
from .embedder.base_embedder import BaseEmbedder
from .embedder.fastembed import FastEmbed
from .source.base_knowledge_source import BaseKnowledgeSource
class Knowledge(BaseModel):
    """
    Pairs a list of knowledge sources with the embedder used for them.

    Args:
        sources: Knowledge sources to hold. Defaults to an empty list.
        embedder: Embedder to use. A default ``FastEmbed()`` is created
            when none is supplied.
    """

    # BaseKnowledgeSource and BaseEmbedder are plain ABCs rather than pydantic
    # models, so pydantic has to be told to accept them as arbitrary types
    # (validated by isinstance) or schema generation fails at class creation.
    model_config = {"arbitrary_types_allowed": True}

    sources: Optional[List[BaseKnowledgeSource]] = None
    embedder: BaseEmbedder

    def __init__(
        self,
        sources: Optional[List[BaseKnowledgeSource]] = None,
        embedder: Optional[BaseEmbedder] = None,
    ):
        # The resolved values must be passed to BaseModel.__init__: calling it
        # with no data fails validation because `embedder` is a required
        # field, so assignments placed after that call are never reached.
        super().__init__(
            sources=sources if sources is not None else [],
            embedder=embedder if embedder is not None else FastEmbed(),
        )

View File

@@ -1,23 +1,23 @@
from typing import List, Any, Optional, Dict
from abc import ABC, abstractmethod
from typing import Any, Dict, List
import numpy as np
from .embeddings import Embeddings
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeBase(ABC):
class BaseKnowledgeSource(ABC):
"""Abstract base class for knowledge bases"""
def __init__(
self,
chunk_size: int = 1000,
chunk_overlap: int = 200,
embeddings_class: Optional[Embeddings] = None,
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.chunks: List[str] = []
self.chunk_embeddings: Dict[int, np.ndarray] = {}
self.embeddings_class = embeddings_class or Embeddings()
@abstractmethod
def query(self, query: str) -> str:
@@ -29,18 +29,13 @@ class BaseKnowledgeBase(ABC):
"""Process and store content in the knowledge base"""
pass
def reset(self) -> None:
"""Reset the knowledge base"""
self.chunks = []
self.chunk_embeddings = {}
def _embed_chunks(self, new_chunks: List[str]) -> None:
def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None:
"""Embed chunks and store them"""
if not new_chunks:
return
# Get embeddings for new chunks
embeddings = self.embeddings_class.embed_texts(new_chunks)
embeddings = embedder.embed_texts(new_chunks)
# Store embeddings with their corresponding chunks
start_idx = len(self.chunks)
@@ -92,13 +87,15 @@ class BaseKnowledgeBase(ABC):
return chunks
def _find_similar_chunks(self, query: str, top_k: int = 3) -> List[str]:
def _find_similar_chunks(
self, embedder: BaseEmbedder, query: str, top_k: int = 3
) -> List[str]:
"""Find the most similar chunks to a query using embeddings"""
if not self.chunks:
return []
# Get query embedding
query_embedding = self.embeddings_class.embed_text(query)
query_embedding = embedder.embed_text(query)
# Calculate similarities with all chunks
similarities = []

View File

@@ -1,25 +1,24 @@
from typing import Optional
from crewai.knowledge.base_knowledge import BaseKnowledgeBase
from crewai.knowledge.embeddings import Embeddings
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class StringKnowledgeBase(BaseKnowledgeBase):
class StringKnowledgeSource(BaseKnowledgeSource):
"""A knowledge base that stores and queries plain text content using embeddings"""
def __init__(
self,
content: str,
chunk_size: int = 1000,
chunk_overlap: int = 200,
embeddings_class: Optional[Embeddings] = None,
content: Optional[str] = None,
):
super().__init__(chunk_size, chunk_overlap, embeddings_class)
if content:
self.add(content)
super().__init__(
chunk_size,
chunk_overlap,
)
def add(self, content: str) -> None:
def add(self, embedder: BaseEmbedder) -> None:
"""Add text content to the knowledge base, chunk it, and compute embeddings"""
if not isinstance(content, str):
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeBase only accepts string content")
# Create chunks from the text
@@ -29,12 +28,12 @@ class StringKnowledgeBase(BaseKnowledgeBase):
self.chunks.extend(new_chunks)
# Compute and store embeddings for the new chunks
self._embed_chunks(new_chunks)
embedder.embed_chunks(new_chunks)
def query(self, query: str, top_k: int = 3) -> str:
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""
Query the knowledge base using semantic search
Returns the most relevant chunk based on embedding similarity
"""
similar_chunks = self._find_similar_chunks(query, top_k=top_k)
similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
return similar_chunks[0] if similar_chunks else ""

View File

@@ -0,0 +1,39 @@
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class TextFileKnowledgeSource(BaseKnowledgeSource):
    """A knowledge source that loads a plain text file and queries it using embeddings"""

    def __init__(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """
        Args:
            file_path: Path of the text file whose content is loaded by ``add``.
            chunk_size: Forwarded to the base class.
            chunk_overlap: Forwarded to the base class.
        """
        super().__init__(
            chunk_size,
            chunk_overlap,
        )
        # Keep the path so that `add` can read the file later.
        self.file_path = file_path

    def add(self, embedder: BaseEmbedder) -> None:
        """
        Read the text file, chunk its content, and compute embeddings.

        Args:
            embedder: Embedder used to embed the newly created chunks.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        # NOTE(review): UTF-8 is assumed for the file encoding - confirm.
        with open(self.file_path, "r", encoding="utf-8") as text_file:
            content = text_file.read()

        # Create chunks from the text
        new_chunks = self._chunk_text(content)

        # Add chunks to the knowledge base
        self.chunks.extend(new_chunks)

        # Go through the base-class `embed` helper rather than calling
        # `embedder.embed_chunks` directly and discarding its return value.
        # NOTE(review): `embed` is documented as "Embed chunks and store them";
        # its full body is not visible here - confirm it indexes correctly
        # when the chunks have already been appended.
        self.embed(embedder, new_chunks)

    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
        """
        Query the knowledge source using semantic search.

        Args:
            embedder: Embedder used to embed the query.
            query: The query text.
            top_k: Number of candidate chunks requested from the similarity search.

        Returns:
            The first chunk returned by the similarity search, or an empty
            string when there is none.
        """
        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
        return similar_chunks[0] if similar_chunks else ""