Adding core knowledge sources

Brandon Hancock
2024-11-06 12:33:55 -05:00
parent a8a2f80616
commit 1a35114c08
15 changed files with 645 additions and 155 deletions

View File

@@ -1,7 +1,9 @@
import warnings
from crewai.agent import Agent
from crewai.crew import Crew
from crewai.flow.flow import Flow
from crewai.knowledge.knowledge import Knowledge
from crewai.llm import LLM
from crewai.pipeline import Pipeline
from crewai.process import Process
@@ -15,4 +17,14 @@ warnings.filterwarnings(
module="pydantic.main",
)
__version__ = "0.76.9"
__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
__all__ = [
"Agent",
"Crew",
"Process",
"Task",
"Pipeline",
"Router",
"LLM",
"Flow",
"Knowledge",
]
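
With this hunk, Knowledge is exported from the package root alongside the existing symbols. A one-line illustration (not part of the diff):

from crewai import Knowledge  # now resolves at the top level, next to Agent, Crew, Task, etc.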

View File

@@ -47,7 +47,7 @@ class FastEmbed(BaseEmbedder):
cache_dir=str(cache_dir) if cache_dir else None,
)
def embed_chunks(self, chunks: List[str]) -> np.ndarray:
def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
"""
Generate embeddings for a list of text chunks
@@ -55,13 +55,12 @@ class FastEmbed(BaseEmbedder):
chunks: List of text chunks to embed
Returns:
Array of embeddings
List of embeddings
"""
# FastEmbed returns a generator, convert to list then numpy array
embeddings = list(self.model.embed(chunks))
return np.array(embeddings)
return embeddings
def embed_texts(self, texts: List[str]) -> np.ndarray:
def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
"""
Generate embeddings for a list of texts
@@ -69,11 +68,10 @@ class FastEmbed(BaseEmbedder):
texts: List of texts to embed
Returns:
Array of embeddings
List of embeddings
"""
# FastEmbed returns a generator, convert to list then numpy array
embeddings = list(self.model.embed(texts))
return np.array(embeddings)
return embeddings
def embed_text(self, text: str) -> np.ndarray:
"""

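After this change, embed_chunks and embed_texts return a plain Python list of per-text numpy vectors instead of one stacked 2-D array, while embed_text still returns a single np.ndarray. A minimal sketch of what callers can now expect (illustrative; assumes the default FastEmbed() constructor and its model are available):

import numpy as np

from crewai.knowledge.embedder.fastembed import FastEmbed

embedder = FastEmbed()
vectors = embedder.embed_chunks(["first chunk", "second chunk"])
assert isinstance(vectors, list) and all(isinstance(v, np.ndarray) for v in vectors)
matrix = np.vstack(vectors)  # callers that still want a matrix can stack the list themselves
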
View File

@@ -1,21 +1,53 @@
from typing import List, Optional
from typing import List
from pydantic import BaseModel
from pydantic import BaseModel, ConfigDict, Field
from .embedder.base_embedder import BaseEmbedder
from .embedder.fastembed import FastEmbed
from .source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.embedder.fastembed import FastEmbed
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class Knowledge(BaseModel):
sources: Optional[List[BaseKnowledgeSource]] = None
embedder: BaseEmbedder
sources: List[BaseKnowledgeSource] = Field(default_factory=list)
embedder: BaseEmbedder = Field(default_factory=FastEmbed)
def __init__(
self,
sources: Optional[List[BaseKnowledgeSource]] = None,
embedder: Optional[BaseEmbedder] = None,
):
super().__init__()
self.sources = sources or []
self.embedder = embedder or FastEmbed()
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, **data):
super().__init__(**data)
# Call add on all sources during initialization
for source in self.sources:
source.add(self.embedder)
def query(self, query: str, top_k: int = 3) -> List[str]:
"""
Query across all knowledge sources to find the most relevant information.
Returns the top_k most relevant chunks.
"""
if not self.sources:
return []
# Collect all chunks and embeddings from all sources
all_chunks = []
all_embeddings = []
for source in self.sources:
all_chunks.extend(source.chunks)
all_embeddings.extend(source.get_embeddings())
# Embed the query
query_embedding = self.embedder.embed_text(query)
# Calculate similarities
similarities = []
for idx, embedding in enumerate(all_embeddings):
similarity = query_embedding.dot(embedding)
similarities.append((similarity, idx))
# Sort by similarity
similarities.sort(reverse=True, key=lambda x: x[0])
# Get top_k results
top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
return top_chunks
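
Knowledge now embeds every source once at construction time and answers query with a brute-force dot-product ranking over all stored chunk embeddings, so the top_k scores match cosine similarity only if the embedder returns unit-normalized vectors. A minimal end-to-end sketch (illustrative; assumes StringKnowledgeSource lives at crewai.knowledge.source.string_knowledge_source and takes its text via a content argument, as its diff below suggests):

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource  # module path assumed

source = StringKnowledgeSource(content="CrewAI agents can pull facts from knowledge sources.")
kb = Knowledge(sources=[source])   # FastEmbed is the default embedder; each source is embedded here
print(kb.query("what can agents pull facts from?", top_k=1))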

View File

View File

@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from typing import List
import numpy as np
@@ -7,7 +7,7 @@ from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeSource(ABC):
"""Abstract base class for knowledge bases"""
"""Abstract base class for knowledge sources."""
def __init__(
self,
@@ -17,96 +17,25 @@ class BaseKnowledgeSource(ABC):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.chunks: List[str] = []
self.chunk_embeddings: Dict[int, np.ndarray] = {}
self.chunk_embeddings: List[np.ndarray] = []
@abstractmethod
def query(self, query: str) -> str:
"""Query the knowledge base and return relevant information"""
def load_content(self):
"""Load and preprocess content from the source."""
pass
@abstractmethod
def add(self, content: Any) -> None:
"""Process and store content in the knowledge base"""
def add(self, embedder: BaseEmbedder) -> None:
"""Process content, chunk it, compute embeddings, and save them."""
pass
def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None:
"""Embed chunks and store them"""
if not new_chunks:
return
# Get embeddings for new chunks
embeddings = embedder.embed_texts(new_chunks)
# Store embeddings with their corresponding chunks
start_idx = len(self.chunks)
for i, embedding in enumerate(embeddings):
self.chunk_embeddings[start_idx + i] = embedding
def get_embeddings(self) -> List[np.ndarray]:
"""Return the list of embeddings for the chunks."""
return self.chunk_embeddings
def _chunk_text(self, text: str) -> List[str]:
"""Split text into chunks with overlap"""
chunks = []
start = 0
text_length = len(text)
while start < text_length:
# Get the chunk of size chunk_size
end = start + self.chunk_size
if end >= text_length:
# If we're at the end, just take the rest
chunks.append(text[start:].strip())
break
# Look for a good breaking point
# Priority: double newline > single newline > period > space
break_chars = ["\n\n", "\n", ". ", " "]
chunk_end = end
for break_char in break_chars:
# Look for the break_char in a window around the end point
window_start = max(start + self.chunk_size - 100, start)
window_end = min(start + self.chunk_size + 100, text_length)
window_text = text[window_start:window_end]
# Find the last occurrence of the break_char in the window
last_break = window_text.rfind(break_char)
if last_break != -1:
chunk_end = window_start + last_break + len(break_char)
break
# Add the chunk
chunk = text[start:chunk_end].strip()
if chunk: # Only add non-empty chunks
chunks.append(chunk)
# Move the start pointer, accounting for overlap
start = max(
start + self.chunk_size - self.chunk_overlap,
chunk_end - self.chunk_overlap,
)
return chunks
def _find_similar_chunks(
self, embedder: BaseEmbedder, query: str, top_k: int = 3
) -> List[str]:
"""Find the most similar chunks to a query using embeddings"""
if not self.chunks:
return []
# Get query embedding
query_embedding = embedder.embed_text(query)
# Calculate similarities with all chunks
similarities = []
for idx, chunk_embedding in self.chunk_embeddings.items():
similarity = np.dot(query_embedding, chunk_embedding)
similarities.append((similarity, idx))
# Sort by similarity and get top_k chunks
similarities.sort(reverse=True)
top_chunks = []
for _, idx in similarities[:top_k]:
top_chunks.append(self.chunks[idx])
return top_chunks
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
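
The reworked base class replaces the boundary-aware splitter with a plain fixed-stride one (each chunk starts chunk_size - chunk_overlap characters after the previous), and a concrete source now only has to implement load_content and add(embedder), storing results in self.chunks and self.chunk_embeddings. A hypothetical in-memory source, sketched to show that contract (not part of this commit):

from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


class ListKnowledgeSource(BaseKnowledgeSource):
    """Hypothetical in-memory source illustrating the new load_content / add contract."""

    def __init__(self, items: List[str], chunk_size: int = 1000, chunk_overlap: int = 200):
        super().__init__(chunk_size, chunk_overlap)
        self.items = items

    def load_content(self) -> str:
        # Real sources read from disk or an API; here we just join the in-memory items.
        return "\n".join(self.items)

    def add(self, embedder: BaseEmbedder) -> None:
        # Chunk with the base class helper, then store one embedding per chunk.
        new_chunks = self._chunk_text(self.load_content())
        self.chunks.extend(new_chunks)
        self.chunk_embeddings.extend(embedder.embed_chunks(new_chunks))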

View File

@@ -0,0 +1,65 @@
from pathlib import Path
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class PDFKnowledgeSource(BaseKnowledgeSource):
"""A knowledge source that stores and queries PDF file content using embeddings."""
def __init__(
self,
file_path: str,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(chunk_size, chunk_overlap)
self.file_path = Path(file_path)
self.content = self.load_content()
def _import_pdfplumber(self):
"""Dynamically import pdfplumber."""
try:
import pdfplumber
return pdfplumber
except ImportError:
raise ImportError(
"pdfplumber is not installed. Please install it with: pip install pdfplumber"
)
def load_content(self) -> str:
"""Load and preprocess PDF file content."""
if not self.file_path.exists():
raise FileNotFoundError(f"File not found: {self.file_path}")
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
pdfplumber = self._import_pdfplumber()
text = ""
with pdfplumber.open(self.file_path) as pdf:
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
return text
def add(self, embedder: BaseEmbedder) -> None:
"""
Add PDF file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
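
A short usage sketch for the new PDF source (illustrative; the module path and the file path are assumptions, and pdfplumber must be installed):

from crewai.knowledge.embedder.fastembed import FastEmbed
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource  # module path assumed

source = PDFKnowledgeSource(file_path="docs/handbook.pdf")  # text is extracted page by page in __init__
source.add(FastEmbed())   # chunks the extracted text and stores one embedding per chunk
print(len(source.chunks), len(source.chunk_embeddings))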

View File

@@ -1,9 +1,11 @@
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class StringKnowledgeSource(BaseKnowledgeSource):
"""A knowledge base that stores and queries plain text content using embeddings"""
"""A knowledge source that stores and queries plain text content using embeddings."""
def __init__(
self,
@@ -15,25 +17,29 @@ class StringKnowledgeSource(BaseKnowledgeSource):
chunk_size,
chunk_overlap,
)
self.content = content
self.load_content()
def load_content(self):
"""Load and preprocess string content."""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeSource only accepts string content")
def add(self, embedder: BaseEmbedder) -> None:
"""Add text content to the knowledge base, chunk it, and compute embeddings"""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeBase only accepts string content")
# Create chunks from the text
new_chunks = self._chunk_text(content)
# Add chunks to the knowledge base
"""
Add string content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
# Compute and store embeddings for the new chunks
embedder.embed_chunks(new_chunks)
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""
Query the knowledge base using semantic search
Returns the most relevant chunk based on embedding similarity
"""
similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
return similar_chunks[0] if similar_chunks else ""
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
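
Because the shared splitter is a fixed stride, the number and size of chunks follow directly from chunk_size and chunk_overlap, and content is validated as a string when the source is constructed. A small illustration with hypothetical values (assumes the constructor accepts chunk_size and chunk_overlap keywords alongside content):

from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource  # module path assumed

source = StringKnowledgeSource(content="abcdefghij" * 5, chunk_size=20, chunk_overlap=5)
chunks = source._chunk_text(source.content)
# chunks start at offsets 0, 15, 30 and 45, so each one overlaps the previous by 5 characters
print([len(c) for c in chunks])  # [20, 20, 20, 5]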

View File

@@ -1,9 +1,12 @@
from pathlib import Path
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class TextFileKnowledgeSource(BaseKnowledgeSource):
"""A knowledge base that stores and queries plain text content using embeddings"""
"""A knowledge source that stores and queries text file content using embeddings."""
def __init__(
self,
@@ -11,29 +14,35 @@ class TextFileKnowledgeSource(BaseKnowledgeSource):
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(
chunk_size,
chunk_overlap,
)
super().__init__(chunk_size, chunk_overlap)
self.file_path = Path(file_path)
self.content = self.load_content()
def load_content(self) -> str:
"""Load and preprocess text file content."""
if not self.file_path.exists():
raise FileNotFoundError(f"File not found: {self.file_path}")
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
with self.file_path.open("r", encoding="utf-8") as f:
return f.read()
def add(self, embedder: BaseEmbedder) -> None:
"""Add text content to the knowledge base, chunk it, and compute embeddings"""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeBase only accepts string content")
# Create chunks from the text
new_chunks = self._chunk_text(content)
# Add chunks to the knowledge base
"""
Add text file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
# Compute and store embeddings for the new chunks
embedder.embed_chunks(new_chunks)
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""
Query the knowledge base using semantic search
Returns the most relevant chunk based on embedding similarity
"""
similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
return similar_chunks[0] if similar_chunks else ""
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
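
The text-file source mirrors the PDF source: load_content reads the file eagerly in __init__, and add chunks and embeds it. A closing sketch that combines it with Knowledge (the module path and the file path are assumptions):

from crewai import Knowledge
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource  # path assumed

notes = TextFileKnowledgeSource(file_path="notes/architecture.md")
kb = Knowledge(sources=[notes])   # add() is called for each source inside Knowledge.__init__
for chunk in kb.query("how are embeddings stored?", top_k=2):
    print(chunk)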