initial knowledge

2026-01-14 18:48:29 +00:00 · 2024-11-04 15:53:19 -03:00
parent 57201fb856
commit 75322b2de1
7 changed files with 460 additions and 127 deletions
--- a/src/crewai/agent.py
+++ b/src/crewai/agent.py
@@ -8,6 +8,7 @@ from pydantic import Field, InstanceOf, PrivateAttr, model_validator
 from crewai.agents import CacheHandler
 from crewai.agents.agent_builder.base_agent import BaseAgent
 from crewai.agents.crew_agent_executor import CrewAgentExecutor
+from crewai.knowledge import StringKnowledgeBase
 from crewai.llm import LLM
 from crewai.memory.contextual.contextual_memory import ContextualMemory
 from crewai.tools.agent_tools.agent_tools import AgentTools
@@ -51,6 +52,7 @@ class Agent(BaseAgent):
            role: The role of the agent.
            goal: The objective of the agent.
            backstory: The backstory of the agent.
+            knowledge: The knowledge base of the agent.
            config: Dict representation of agent configuration.
            llm: The language model that will run the agent.
            function_calling_llm: The language model that will handle the tool calling for this agent, it overrides the crew function_calling_llm.
@@ -84,6 +86,10 @@ class Agent(BaseAgent):
    llm: Union[str, InstanceOf[LLM], Any] = Field(
        description="Language model that will run the agent.", default=None
    )
+    knowledge: Optional[str] = Field(
+        default=None,
+        description="Knowledge base for the agent.",
+    )
    function_calling_llm: Optional[Any] = Field(
        description="Language model that will run the agent.", default=None
    )
@@ -182,6 +188,8 @@ class Agent(BaseAgent):
        if self.allow_code_execution:
            self._validate_docker_installation()

+        self.knowledge = StringKnowledgeBase(content=self.knowledge)
+
        return self

    def _setup_agent_executor(self):
--- a/src/crewai/knowledge/init.py
+++ b/src/crewai/knowledge/init.py
--- a/src/crewai/knowledge/base_knowledge.py
+++ b/src/crewai/knowledge/base_knowledge.py
@@ -0,0 +1,115 @@
+from typing import List, Any, Optional, Dict
+from abc import ABC, abstractmethod
+import numpy as np
+from .embeddings import Embeddings
+
+
+class BaseKnowledgeBase(ABC):
+    """Abstract base class for embedding-backed knowledge bases.
+
+    Subclasses implement ``add`` (ingest content) and ``query`` (retrieve
+    text).  This class supplies the shared machinery: overlap-aware text
+    chunking, chunk embedding, and dot-product similarity search.
+
+    Invariant: ``chunk_embeddings[i]`` is the embedding of ``chunks[i]``.
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        embeddings_class: Optional["Embeddings"] = None,
+    ):
+        """Configure chunking and the embedding backend.
+
+        Args:
+            chunk_size: Target chunk length in characters; must be positive.
+            chunk_overlap: Characters shared between consecutive chunks;
+                must not be negative.
+            embeddings_class: Embedding backend instance.  A default
+                ``Embeddings()`` is created when omitted.
+
+        Raises:
+            ValueError: If ``chunk_size`` is not positive or
+                ``chunk_overlap`` is negative.
+        """
+        if chunk_size <= 0:
+            raise ValueError("chunk_size must be a positive integer")
+        if chunk_overlap < 0:
+            raise ValueError("chunk_overlap must not be negative")
+
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.chunks: List[str] = []
+        self.chunk_embeddings: Dict[int, np.ndarray] = {}
+        if embeddings_class is None:
+            embeddings_class = Embeddings()
+        self.embeddings_class = embeddings_class
+
+    @abstractmethod
+    def query(self, query: str) -> str:
+        """Query the knowledge base and return relevant information"""
+        pass
+
+    @abstractmethod
+    def add(self, content: Any) -> None:
+        """Process and store content in the knowledge base"""
+        pass
+
+    def reset(self) -> None:
+        """Drop every stored chunk together with its embedding."""
+        self.chunks = []
+        self.chunk_embeddings = {}
+
+    def _embed_chunks(self, new_chunks: List[str]) -> None:
+        """Embed ``new_chunks`` and store the vectors under their chunk indices.
+
+        Indices continue from the number of embeddings already stored rather
+        than from ``len(self.chunks)``.  That keeps embeddings aligned with
+        their chunks whether the caller appends the chunks to ``self.chunks``
+        before or after calling this method.
+        """
+        if not new_chunks:
+            return
+
+        embeddings = self.embeddings_class.embed_texts(new_chunks)
+
+        start_idx = len(self.chunk_embeddings)
+        for idx, embedding in enumerate(embeddings, start=start_idx):
+            self.chunk_embeddings[idx] = embedding
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """Split ``text`` into stripped, non-empty chunks with overlap.
+
+        Each chunk aims for ``chunk_size`` characters but its end is moved to
+        the last natural break found within 100 characters of that target,
+        so a chunk may run up to 100 characters longer or shorter.
+
+        Args:
+            text: The text to split.
+
+        Returns:
+            The chunks in document order; empty for empty or blank text.
+        """
+        chunks: List[str] = []
+        text_length = len(text)
+        # Priority: double newline > single newline > period > space
+        break_chars = ("\n\n", "\n", ". ", " ")
+        start = 0
+
+        while start < text_length:
+            end = start + self.chunk_size
+
+            if end >= text_length:
+                # Final piece: take the rest, unless it is only whitespace.
+                tail = text[start:].strip()
+                if tail:
+                    chunks.append(tail)
+                break
+
+            # The search window is the same for every break character.
+            window_start = max(end - 100, start)
+            window_end = min(end + 100, text_length)
+            window_text = text[window_start:window_end]
+
+            chunk_end = end
+            for break_char in break_chars:
+                last_break = window_text.rfind(break_char)
+                if last_break != -1:
+                    chunk_end = window_start + last_break + len(break_char)
+                    break
+
+            chunk = text[start:chunk_end].strip()
+            if chunk:
+                chunks.append(chunk)
+
+            # Step back by the overlap, but never begin the next chunk past
+            # the end of the one just emitted: that would silently drop the
+            # text in between when the break fell before ``end``.
+            next_start = min(max(end, chunk_end) - self.chunk_overlap, chunk_end)
+            if next_start <= start:
+                # Overlap is too large to make progress; continue without
+                # overlap instead of looping forever.
+                next_start = max(chunk_end, start + 1)
+            start = next_start
+
+        return chunks
+
+    def _find_similar_chunks(self, query: str, top_k: int = 3) -> List[str]:
+        """Return up to ``top_k`` stored chunks most similar to ``query``.
+
+        Similarity is the dot product between the query embedding and each
+        stored chunk embedding; results are ordered best match first.
+
+        Args:
+            query: Text to search for.
+            top_k: Maximum number of chunks to return.
+
+        Returns:
+            The matching chunks; empty when nothing is stored or
+            ``top_k`` is not positive.
+        """
+        if not self.chunks or top_k <= 0:
+            return []
+
+        query_embedding = self.embeddings_class.embed_text(query)
+
+        # (score, index) tuples sort by score first; the index breaks ties.
+        scored = sorted(
+            (
+                (np.dot(query_embedding, chunk_embedding), idx)
+                for idx, chunk_embedding in self.chunk_embeddings.items()
+            ),
+            reverse=True,
+        )
+        return [self.chunks[idx] for _, idx in scored[:top_k]]
--- a/src/crewai/knowledge/embeddings.py
+++ b/src/crewai/knowledge/embeddings.py
@@ -0,0 +1,78 @@
+from typing import List, Optional, Union
+from pathlib import Path
+import numpy as np
+
+# Prefer the ``fastembed_gpu`` package and fall back to plain ``fastembed``.
+# When neither can be imported, FASTEMBED_AVAILABLE is False and
+# ``TextEmbedding`` is left undefined.
+try:
+    from fastembed_gpu import TextEmbedding
+
+    FASTEMBED_AVAILABLE = True
+except ImportError:
+    try:
+        from fastembed import TextEmbedding
+
+        FASTEMBED_AVAILABLE = True
+    except ImportError:
+        FASTEMBED_AVAILABLE = False
+
+
+class Embeddings:
+    """
+    A wrapper class for text embedding models using FastEmbed
+    """
+
+    def __init__(
+        self,
+        model_name: str = "BAAI/bge-small-en-v1.5",
+        cache_dir: Optional[Union[str, Path]] = None,
+    ):
+        """
+        Initialize the embedding model
+
+        Args:
+            model_name: Name of the model to use
+            cache_dir: Directory to cache the model; forwarded to FastEmbed
+                as a string, or as None when not given
+
+        Raises:
+            ImportError: If no FastEmbed package could be imported
+        """
+        if not FASTEMBED_AVAILABLE:
+            raise ImportError(
+                "FastEmbed is not installed. Please install it with: "
+                "pip install fastembed or pip install fastembed-gpu for GPU support"
+            )
+
+        self.model = TextEmbedding(
+            model_name=model_name,
+            cache_dir=str(cache_dir) if cache_dir else None,
+        )
+        # Filled in on first access to ``dimension``.
+        self._dimension: Optional[int] = None
+
+    def embed_texts(self, texts: List[str]) -> np.ndarray:
+        """
+        Generate embeddings for a list of texts
+
+        Args:
+            texts: List of texts to embed
+
+        Returns:
+            Array of embeddings, one row per input text
+        """
+        # FastEmbed returns a generator, convert to list then numpy array
+        embeddings = list(self.model.embed(texts))
+        return np.array(embeddings)
+
+    def embed_text(self, text: str) -> np.ndarray:
+        """
+        Generate embedding for a single text
+
+        Args:
+            text: Text to embed
+
+        Returns:
+            Embedding array
+        """
+        return self.embed_texts([text])[0]
+
+    @property
+    def dimension(self) -> int:
+        """Get the dimension of the embeddings
+
+        The value is measured once by embedding a probe string and then
+        cached, so repeated access does not run the model again.
+        """
+        # ``getattr`` keeps this working for instances built without __init__.
+        cached = getattr(self, "_dimension", None)
+        if cached is None:
+            cached = len(self.embed_text("test"))
+            self._dimension = cached
+        return cached
--- a/src/crewai/knowledge/string_knowledge.py
+++ b/src/crewai/knowledge/string_knowledge.py
@@ -0,0 +1,40 @@
+from typing import Optional
+from crewai.knowledge.base_knowledge import BaseKnowledgeBase
+from crewai.knowledge.embeddings import Embeddings
+
+
+class StringKnowledgeBase(BaseKnowledgeBase):
+    """A knowledge base that stores and queries plain text content using embeddings"""
+
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        embeddings_class: Optional[Embeddings] = None,
+        content: Optional[str] = None,
+    ):
+        """Create the knowledge base and optionally ingest initial text.
+
+        Args:
+            chunk_size: Target chunk length in characters.
+            chunk_overlap: Characters shared between consecutive chunks.
+            embeddings_class: Embedding backend instance; forwarded to the
+                base class.
+            content: Initial text to ingest.  ``None`` or an empty string
+                leaves the knowledge base empty.
+        """
+        super().__init__(chunk_size, chunk_overlap, embeddings_class)
+        if content:
+            self.add(content)
+
+    def add(self, content: str) -> None:
+        """Add text content to the knowledge base, chunk it, and compute embeddings
+
+        Args:
+            content: Text to ingest.
+
+        Raises:
+            ValueError: If ``content`` is not a string.
+        """
+        if not isinstance(content, str):
+            raise ValueError("StringKnowledgeBase only accepts string content")
+
+        new_chunks = self._chunk_text(content)
+        if not new_chunks:
+            return
+
+        # Embed before appending: embeddings are keyed by chunk position, and
+        # embedding first keeps those keys equal to the positions the chunks
+        # are about to occupy.  It also leaves the stored chunks untouched if
+        # embedding raises.
+        self._embed_chunks(new_chunks)
+        self.chunks.extend(new_chunks)
+
+    def query(self, query: str, top_k: int = 3) -> str:
+        """
+        Query the knowledge base using semantic search
+
+        Args:
+            query: Text to search for.
+            top_k: Number of candidate chunks requested from the similarity
+                search; only the best one is returned.
+
+        Returns:
+            The most relevant chunk based on embedding similarity, or an
+            empty string when no chunk is found.
+        """
+        similar_chunks = self._find_similar_chunks(query, top_k=top_k)
+        return similar_chunks[0] if similar_chunks else ""