This commit is contained in:
Brandon Hancock
2024-11-05 12:04:58 -05:00
parent dc314c1151
commit a8a2f80616
7 changed files with 163 additions and 34 deletions

View File

@@ -8,11 +8,12 @@ from pydantic import Field, InstanceOf, PrivateAttr, model_validator
from crewai.agents import CacheHandler from crewai.agents import CacheHandler
from crewai.agents.agent_builder.base_agent import BaseAgent from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.agents.crew_agent_executor import CrewAgentExecutor from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.knowledge import StringKnowledgeBase
# from crewai.knowledge import StringKnowledgeBase
from crewai.llm import LLM from crewai.llm import LLM
from crewai.memory.contextual.contextual_memory import ContextualMemory from crewai.memory.contextual.contextual_memory import ContextualMemory
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.tools import BaseTool from crewai.tools import BaseTool
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.utilities import Converter, Prompts from crewai.utilities import Converter, Prompts
from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE
from crewai.utilities.token_counter_callback import TokenCalcHandler from crewai.utilities.token_counter_callback import TokenCalcHandler
@@ -188,7 +189,7 @@ class Agent(BaseAgent):
if self.allow_code_execution: if self.allow_code_execution:
self._validate_docker_installation() self._validate_docker_installation()
self.knowledge = StringKnowledgeBase(content=self.knowledge) # self.knowledge = StringKnowledgeBase(content=self.knowledge)
return self return self

View File

@@ -0,0 +1,55 @@
from abc import ABC, abstractmethod
from typing import List
import numpy as np
class BaseEmbedder(ABC):
    """Interface that every text-embedding backend must implement.

    Concrete subclasses provide the three embedding entry points and
    report the dimensionality of the vectors they produce.
    """

    @abstractmethod
    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
        """Embed a batch of text chunks.

        Args:
            chunks: The text chunks to embed.

        Returns:
            An array holding the embeddings.
        """
        ...

    @abstractmethod
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Embed a batch of texts.

        Args:
            texts: The texts to embed.

        Returns:
            An array holding the embeddings.
        """
        ...

    @abstractmethod
    def embed_text(self, text: str) -> np.ndarray:
        """Embed one piece of text.

        Args:
            text: The text to embed.

        Returns:
            The embedding array for ``text``.
        """
        ...

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Dimension of the embeddings."""
        ...

View File

@@ -1,9 +1,12 @@
from typing import List, Optional, Union
from pathlib import Path from pathlib import Path
from typing import List, Optional, Union
import numpy as np import numpy as np
from .base_embedder import BaseEmbedder
try: try:
from fastembed_gpu import TextEmbedding from fastembed_gpu import TextEmbedding # type: ignore
FASTEMBED_AVAILABLE = True FASTEMBED_AVAILABLE = True
except ImportError: except ImportError:
@@ -15,7 +18,7 @@ except ImportError:
FASTEMBED_AVAILABLE = False FASTEMBED_AVAILABLE = False
class Embeddings: class FastEmbed(BaseEmbedder):
""" """
A wrapper class for text embedding models using FastEmbed A wrapper class for text embedding models using FastEmbed
""" """
@@ -44,6 +47,20 @@ class Embeddings:
cache_dir=str(cache_dir) if cache_dir else None, cache_dir=str(cache_dir) if cache_dir else None,
) )
def embed_chunks(self, chunks: List[str]) -> np.ndarray:
"""
Generate embeddings for a list of text chunks
Args:
chunks: List of text chunks to embed
Returns:
Array of embeddings
"""
# FastEmbed returns a generator, convert to list then numpy array
embeddings = list(self.model.embed(chunks))
return np.array(embeddings)
def embed_texts(self, texts: List[str]) -> np.ndarray: def embed_texts(self, texts: List[str]) -> np.ndarray:
""" """
Generate embeddings for a list of texts Generate embeddings for a list of texts

View File

@@ -0,0 +1,21 @@
from typing import List, Optional
from pydantic import BaseModel
from .embedder.base_embedder import BaseEmbedder
from .embedder.fastembed import FastEmbed
from .source.base_knowledge_source import BaseKnowledgeSource
class Knowledge(BaseModel):
    """Bundle a set of knowledge sources with the embedder used on them.

    Attributes:
        sources: Knowledge sources held by this instance. Normalised to an
            empty list when the caller passes nothing.
        embedder: Embedding backend. Falls back to ``FastEmbed()`` when the
            caller passes nothing.
    """

    # BaseKnowledgeSource and BaseEmbedder are plain ABCs rather than
    # pydantic models, so pydantic can only validate them by isinstance
    # check; without this setting the class definition itself fails.
    model_config = {"arbitrary_types_allowed": True}

    sources: Optional[List[BaseKnowledgeSource]] = None
    embedder: BaseEmbedder

    def __init__(
        self,
        sources: Optional[List[BaseKnowledgeSource]] = None,
        embedder: Optional[BaseEmbedder] = None,
    ):
        """Create a Knowledge instance.

        Args:
            sources: Optional list of knowledge sources; ``None`` becomes ``[]``.
            embedder: Optional embedder; ``None`` becomes a new ``FastEmbed()``.
        """
        # The resolved values must go through BaseModel.__init__: `embedder`
        # is a required field, so calling super().__init__() with no
        # arguments raises a ValidationError before any attribute can be
        # assigned afterwards.
        super().__init__(
            sources=sources if sources is not None else [],
            embedder=embedder if embedder is not None else FastEmbed(),
        )

View File

@@ -1,23 +1,23 @@
from typing import List, Any, Optional, Dict
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Dict, List
import numpy as np import numpy as np
from .embeddings import Embeddings
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeBase(ABC): class BaseKnowledgeSource(ABC):
"""Abstract base class for knowledge bases""" """Abstract base class for knowledge bases"""
def __init__( def __init__(
self, self,
chunk_size: int = 1000, chunk_size: int = 1000,
chunk_overlap: int = 200, chunk_overlap: int = 200,
embeddings_class: Optional[Embeddings] = None,
): ):
self.chunk_size = chunk_size self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap self.chunk_overlap = chunk_overlap
self.chunks: List[str] = [] self.chunks: List[str] = []
self.chunk_embeddings: Dict[int, np.ndarray] = {} self.chunk_embeddings: Dict[int, np.ndarray] = {}
self.embeddings_class = embeddings_class or Embeddings()
@abstractmethod @abstractmethod
def query(self, query: str) -> str: def query(self, query: str) -> str:
@@ -29,18 +29,13 @@ class BaseKnowledgeBase(ABC):
"""Process and store content in the knowledge base""" """Process and store content in the knowledge base"""
pass pass
def reset(self) -> None: def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None:
"""Reset the knowledge base"""
self.chunks = []
self.chunk_embeddings = {}
def _embed_chunks(self, new_chunks: List[str]) -> None:
"""Embed chunks and store them""" """Embed chunks and store them"""
if not new_chunks: if not new_chunks:
return return
# Get embeddings for new chunks # Get embeddings for new chunks
embeddings = self.embeddings_class.embed_texts(new_chunks) embeddings = embedder.embed_texts(new_chunks)
# Store embeddings with their corresponding chunks # Store embeddings with their corresponding chunks
start_idx = len(self.chunks) start_idx = len(self.chunks)
@@ -92,13 +87,15 @@ class BaseKnowledgeBase(ABC):
return chunks return chunks
def _find_similar_chunks(self, query: str, top_k: int = 3) -> List[str]: def _find_similar_chunks(
self, embedder: BaseEmbedder, query: str, top_k: int = 3
) -> List[str]:
"""Find the most similar chunks to a query using embeddings""" """Find the most similar chunks to a query using embeddings"""
if not self.chunks: if not self.chunks:
return [] return []
# Get query embedding # Get query embedding
query_embedding = self.embeddings_class.embed_text(query) query_embedding = embedder.embed_text(query)
# Calculate similarities with all chunks # Calculate similarities with all chunks
similarities = [] similarities = []

View File

@@ -1,25 +1,24 @@
from typing import Optional from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.base_knowledge import BaseKnowledgeBase from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.embeddings import Embeddings
class StringKnowledgeBase(BaseKnowledgeBase): class StringKnowledgeSource(BaseKnowledgeSource):
"""A knowledge base that stores and queries plain text content using embeddings""" """A knowledge base that stores and queries plain text content using embeddings"""
def __init__( def __init__(
self, self,
content: str,
chunk_size: int = 1000, chunk_size: int = 1000,
chunk_overlap: int = 200, chunk_overlap: int = 200,
embeddings_class: Optional[Embeddings] = None,
content: Optional[str] = None,
): ):
super().__init__(chunk_size, chunk_overlap, embeddings_class) super().__init__(
if content: chunk_size,
self.add(content) chunk_overlap,
)
def add(self, content: str) -> None: def add(self, embedder: BaseEmbedder) -> None:
"""Add text content to the knowledge base, chunk it, and compute embeddings""" """Add text content to the knowledge base, chunk it, and compute embeddings"""
if not isinstance(content, str): if not isinstance(self.content, str):
raise ValueError("StringKnowledgeBase only accepts string content") raise ValueError("StringKnowledgeBase only accepts string content")
# Create chunks from the text # Create chunks from the text
@@ -29,12 +28,12 @@ class StringKnowledgeBase(BaseKnowledgeBase):
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
# Compute and store embeddings for the new chunks # Compute and store embeddings for the new chunks
self._embed_chunks(new_chunks) embedder.embed_chunks(new_chunks)
def query(self, query: str, top_k: int = 3) -> str: def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
""" """
Query the knowledge base using semantic search Query the knowledge base using semantic search
Returns the most relevant chunk based on embedding similarity Returns the most relevant chunk based on embedding similarity
""" """
similar_chunks = self._find_similar_chunks(query, top_k=top_k) similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
return similar_chunks[0] if similar_chunks else "" return similar_chunks[0] if similar_chunks else ""

View File

@@ -0,0 +1,39 @@
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class TextFileKnowledgeSource(BaseKnowledgeSource):
    """A knowledge source backed by the contents of a plain-text file.

    The file is read once at construction time; ``add`` chunks and embeds
    that content, and ``query`` runs a semantic search over the chunks.
    """

    def __init__(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """Create the source and load the file's text.

        Args:
            file_path: Path of the text file to load.
            chunk_size: Forwarded to the base class.
            chunk_overlap: Forwarded to the base class.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        super().__init__(
            chunk_size,
            chunk_overlap,
        )
        self.file_path = file_path
        self.content = self._load_content()

    def _load_content(self) -> str:
        """Read and return the full text of ``self.file_path`` as UTF-8."""
        with open(self.file_path, encoding="utf-8") as text_file:
            return text_file.read()

    def add(self, embedder: BaseEmbedder) -> None:
        """Chunk the loaded file content and store embeddings for the chunks.

        Args:
            embedder: Embedder used to compute the chunk embeddings.

        Raises:
            ValueError: If ``self.content`` is not a string.
        """
        if not isinstance(self.content, str):
            raise ValueError("TextFileKnowledgeSource only accepts string content")

        # Create chunks from the text
        new_chunks = self._chunk_text(self.content)

        # Add chunks to the knowledge base
        self.chunks.extend(new_chunks)

        # Compute AND store embeddings through the base-class helper; calling
        # embedder.embed_chunks() directly would throw the vectors away.
        # NOTE(review): the base `embed` derives its start index from
        # len(self.chunks); confirm the extend-then-embed order used here
        # lines up with how it indexes chunk_embeddings.
        self.embed(embedder, new_chunks)

    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
        """Return the chunk most similar to ``query``.

        Args:
            embedder: Embedder used to embed the query text.
            query: The search text.
            top_k: Number of candidate chunks requested from the similarity
                search; only the first one is returned.

        Returns:
            The best-matching chunk, or an empty string when nothing matches.
        """
        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
        return similar_chunks[0] if similar_chunks else ""