From a8a2f80616851a2e25be92539a5f18970841f11d Mon Sep 17 00:00:00 2001 From: Brandon Hancock Date: Tue, 5 Nov 2024 12:04:58 -0500 Subject: [PATCH] WIP --- src/crewai/agent.py | 7 ++- .../knowledge/embedder/base_embedder.py | 55 +++++++++++++++++++ .../{embeddings.py => embedder/fastembed.py} | 23 +++++++- src/crewai/knowledge/knowledge.py | 21 +++++++ .../base_knowledge_source.py} | 25 ++++----- .../string_knowledge_source.py} | 27 +++++---- .../source/text_file_knowledge_source.py | 39 +++++++++++++ 7 files changed, 163 insertions(+), 34 deletions(-) create mode 100644 src/crewai/knowledge/embedder/base_embedder.py rename src/crewai/knowledge/{embeddings.py => embedder/fastembed.py} (78%) create mode 100644 src/crewai/knowledge/knowledge.py rename src/crewai/knowledge/{base_knowledge.py => source/base_knowledge_source.py} (84%) rename src/crewai/knowledge/{string_knowledge.py => source/string_knowledge_source.py} (55%) create mode 100644 src/crewai/knowledge/source/text_file_knowledge_source.py diff --git a/src/crewai/agent.py b/src/crewai/agent.py index 8fd623d14..817eacb2b 100644 --- a/src/crewai/agent.py +++ b/src/crewai/agent.py @@ -8,11 +8,12 @@ from pydantic import Field, InstanceOf, PrivateAttr, model_validator from crewai.agents import CacheHandler from crewai.agents.agent_builder.base_agent import BaseAgent from crewai.agents.crew_agent_executor import CrewAgentExecutor -from crewai.knowledge import StringKnowledgeBase + +# from crewai.knowledge import StringKnowledgeBase from crewai.llm import LLM from crewai.memory.contextual.contextual_memory import ContextualMemory -from crewai.tools.agent_tools.agent_tools import AgentTools from crewai.tools import BaseTool +from crewai.tools.agent_tools.agent_tools import AgentTools from crewai.utilities import Converter, Prompts from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE from crewai.utilities.token_counter_callback import TokenCalcHandler @@ -188,7 +189,7 @@ class Agent(BaseAgent): if self.allow_code_execution: self._validate_docker_installation() - self.knowledge = StringKnowledgeBase(content=self.knowledge) + # self.knowledge = StringKnowledgeBase(content=self.knowledge) return self diff --git a/src/crewai/knowledge/embedder/base_embedder.py b/src/crewai/knowledge/embedder/base_embedder.py new file mode 100644 index 000000000..c3252bf43 --- /dev/null +++ b/src/crewai/knowledge/embedder/base_embedder.py @@ -0,0 +1,55 @@ +from abc import ABC, abstractmethod +from typing import List + +import numpy as np + + +class BaseEmbedder(ABC): + """ + Abstract base class for text embedding models + """ + + @abstractmethod + def embed_chunks(self, chunks: List[str]) -> np.ndarray: + """ + Generate embeddings for a list of text chunks + + Args: + chunks: List of text chunks to embed + + Returns: + Array of embeddings + """ + pass + + @abstractmethod + def embed_texts(self, texts: List[str]) -> np.ndarray: + """ + Generate embeddings for a list of texts + + Args: + texts: List of texts to embed + + Returns: + Array of embeddings + """ + pass + + @abstractmethod + def embed_text(self, text: str) -> np.ndarray: + """ + Generate embedding for a single text + + Args: + text: Text to embed + + Returns: + Embedding array + """ + pass + + @property + @abstractmethod + def dimension(self) -> int: + """Get the dimension of the embeddings""" + pass diff --git a/src/crewai/knowledge/embeddings.py b/src/crewai/knowledge/embedder/fastembed.py similarity index 78% rename from src/crewai/knowledge/embeddings.py rename to src/crewai/knowledge/embedder/fastembed.py index f46ba490c..13e2f7bda 100644 --- a/src/crewai/knowledge/embeddings.py +++ b/src/crewai/knowledge/embedder/fastembed.py @@ -1,9 +1,12 @@ -from typing import List, Optional, Union from pathlib import Path +from typing import List, Optional, Union + import numpy as np +from .base_embedder import BaseEmbedder + try: - from fastembed_gpu import TextEmbedding + from fastembed_gpu import TextEmbedding # type: ignore FASTEMBED_AVAILABLE = True except ImportError: @@ -15,7 +18,7 @@ except ImportError: FASTEMBED_AVAILABLE = False -class Embeddings: +class FastEmbed(BaseEmbedder): """ A wrapper class for text embedding models using FastEmbed """ @@ -44,6 +47,20 @@ class Embeddings: cache_dir=str(cache_dir) if cache_dir else None, ) + def embed_chunks(self, chunks: List[str]) -> np.ndarray: + """ + Generate embeddings for a list of text chunks + + Args: + chunks: List of text chunks to embed + + Returns: + Array of embeddings + """ + # FastEmbed returns a generator, convert to list then numpy array + embeddings = list(self.model.embed(chunks)) + return np.array(embeddings) + def embed_texts(self, texts: List[str]) -> np.ndarray: """ Generate embeddings for a list of texts diff --git a/src/crewai/knowledge/knowledge.py b/src/crewai/knowledge/knowledge.py new file mode 100644 index 000000000..288a6f3ae --- /dev/null +++ b/src/crewai/knowledge/knowledge.py @@ -0,0 +1,21 @@ +from typing import List, Optional + +from pydantic import BaseModel + +from .embedder.base_embedder import BaseEmbedder +from .embedder.fastembed import FastEmbed +from .source.base_knowledge_source import BaseKnowledgeSource + + +class Knowledge(BaseModel): + sources: Optional[List[BaseKnowledgeSource]] = None + embedder: BaseEmbedder + + def __init__( + self, + sources: Optional[List[BaseKnowledgeSource]] = None, + embedder: Optional[BaseEmbedder] = None, + ): + super().__init__() + self.sources = sources or [] + self.embedder = embedder or FastEmbed() diff --git a/src/crewai/knowledge/base_knowledge.py b/src/crewai/knowledge/source/base_knowledge_source.py similarity index 84% rename from src/crewai/knowledge/base_knowledge.py rename to src/crewai/knowledge/source/base_knowledge_source.py index d8809a2ca..31794bcb8 100644 --- a/src/crewai/knowledge/base_knowledge.py +++ b/src/crewai/knowledge/source/base_knowledge_source.py @@ -1,23 +1,23 @@ -from typing import List, Any, Optional, Dict from abc import ABC, abstractmethod +from typing import Any, Dict, List + import numpy as np -from .embeddings import Embeddings + +from crewai.knowledge.embedder.base_embedder import BaseEmbedder -class BaseKnowledgeBase(ABC): +class BaseKnowledgeSource(ABC): """Abstract base class for knowledge bases""" def __init__( self, chunk_size: int = 1000, chunk_overlap: int = 200, - embeddings_class: Optional[Embeddings] = None, ): self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.chunks: List[str] = [] self.chunk_embeddings: Dict[int, np.ndarray] = {} - self.embeddings_class = embeddings_class or Embeddings() @abstractmethod def query(self, query: str) -> str: @@ -29,18 +29,13 @@ class BaseKnowledgeBase(ABC): """Process and store content in the knowledge base""" pass - def reset(self) -> None: - """Reset the knowledge base""" - self.chunks = [] - self.chunk_embeddings = {} - - def _embed_chunks(self, new_chunks: List[str]) -> None: + def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None: """Embed chunks and store them""" if not new_chunks: return # Get embeddings for new chunks - embeddings = self.embeddings_class.embed_texts(new_chunks) + embeddings = embedder.embed_texts(new_chunks) # Store embeddings with their corresponding chunks start_idx = len(self.chunks) @@ -92,13 +87,15 @@ class BaseKnowledgeBase(ABC): return chunks - def _find_similar_chunks(self, query: str, top_k: int = 3) -> List[str]: + def _find_similar_chunks( + self, embedder: BaseEmbedder, query: str, top_k: int = 3 + ) -> List[str]: """Find the most similar chunks to a query using embeddings""" if not self.chunks: return [] # Get query embedding - query_embedding = self.embeddings_class.embed_text(query) + query_embedding = embedder.embed_text(query) # Calculate similarities with all chunks similarities = [] diff --git a/src/crewai/knowledge/string_knowledge.py b/src/crewai/knowledge/source/string_knowledge_source.py similarity index 55% rename from src/crewai/knowledge/string_knowledge.py rename to src/crewai/knowledge/source/string_knowledge_source.py index 36c09ba89..028bbe493 100644 --- a/src/crewai/knowledge/string_knowledge.py +++ b/src/crewai/knowledge/source/string_knowledge_source.py @@ -1,25 +1,24 @@ -from typing import Optional -from crewai.knowledge.base_knowledge import BaseKnowledgeBase -from crewai.knowledge.embeddings import Embeddings +from crewai.knowledge.embedder.base_embedder import BaseEmbedder +from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource -class StringKnowledgeBase(BaseKnowledgeBase): +class StringKnowledgeSource(BaseKnowledgeSource): """A knowledge base that stores and queries plain text content using embeddings""" def __init__( self, + content: str, chunk_size: int = 1000, chunk_overlap: int = 200, - embeddings_class: Optional[Embeddings] = None, - content: Optional[str] = None, ): - super().__init__(chunk_size, chunk_overlap, embeddings_class) - if content: - self.add(content) + super().__init__( + chunk_size, + chunk_overlap, + ) - def add(self, content: str) -> None: + def add(self, embedder: BaseEmbedder) -> None: """Add text content to the knowledge base, chunk it, and compute embeddings""" - if not isinstance(content, str): + if not isinstance(self.content, str): raise ValueError("StringKnowledgeBase only accepts string content") # Create chunks from the text @@ -29,12 +28,12 @@ class StringKnowledgeBase(BaseKnowledgeBase): self.chunks.extend(new_chunks) # Compute and store embeddings for the new chunks - self._embed_chunks(new_chunks) + embedder.embed_chunks(new_chunks) - def query(self, query: str, top_k: int = 3) -> str: + def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str: """ Query the knowledge base using semantic search Returns the most relevant chunk based on embedding similarity """ - similar_chunks = self._find_similar_chunks(query, top_k=top_k) + similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k) return similar_chunks[0] if similar_chunks else "" diff --git a/src/crewai/knowledge/source/text_file_knowledge_source.py b/src/crewai/knowledge/source/text_file_knowledge_source.py new file mode 100644 index 000000000..0808319d1 --- /dev/null +++ b/src/crewai/knowledge/source/text_file_knowledge_source.py @@ -0,0 +1,39 @@ +from crewai.knowledge.embedder.base_embedder import BaseEmbedder +from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource + + +class TextFileKnowledgeSource(BaseKnowledgeSource): + """A knowledge base that stores and queries plain text content using embeddings""" + + def __init__( + self, + file_path: str, + chunk_size: int = 1000, + chunk_overlap: int = 200, + ): + super().__init__( + chunk_size, + chunk_overlap, + ) + + def add(self, embedder: BaseEmbedder) -> None: + """Add text content to the knowledge base, chunk it, and compute embeddings""" + if not isinstance(self.content, str): + raise ValueError("StringKnowledgeBase only accepts string content") + + # Create chunks from the text + new_chunks = self._chunk_text(content) + + # Add chunks to the knowledge base + self.chunks.extend(new_chunks) + + # Compute and store embeddings for the new chunks + embedder.embed_chunks(new_chunks) + + def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str: + """ + Query the knowledge base using semantic search + Returns the most relevant chunk based on embedding similarity + """ + similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k) + return similar_chunks[0] if similar_chunks else ""