Adding core knowledge sources

Brandon Hancock
2024-11-06 12:33:55 -05:00
parent a8a2f80616
commit 1a35114c08
15 changed files with 645 additions and 155 deletions
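
For orientation, here is a minimal usage sketch of the API this commit introduces, mirroring the calls exercised in the new tests below. It assumes the optional fastembed extra is installed so the default FastEmbed embedder can load; the example string is illustrative.

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

# Knowledge.__init__ eagerly calls add() on every source, so chunking and
# embedding happen at construction time.
source = StringKnowledgeSource(content="Brandon's favorite color is blue.")
knowledge_base = Knowledge(sources=[source])

# query() embeds the question and returns the top_k most similar chunks.
print(knowledge_base.query("What is Brandon's favorite color?", top_k=3))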

View File

@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder


class BaseKnowledgeSource(ABC):
    """Abstract base class for different types of knowledge sources."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.chunks: List[str] = []

    @abstractmethod
    def load_content(self):
        """Load and preprocess content from the source."""
        pass

    @abstractmethod
    def add(self, embedder: BaseEmbedder) -> None:
        """Add content to the knowledge base, chunk it, and compute embeddings."""
        pass

    @abstractmethod
    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
        """Query the knowledge base using semantic search."""
        pass

View File

@@ -29,7 +29,6 @@ dependencies = [
     "tomli-w>=1.1.0",
     "chromadb>=0.4.24",
     "tomli>=2.0.2",
-    "fastembed>=0.4.1",
 ]

 [project.urls]
@@ -40,6 +39,10 @@ Repository = "https://github.com/crewAIInc/crewAI"
 [project.optional-dependencies]
 tools = ["crewai-tools>=0.13.4"]
 agentops = ["agentops>=0.3.0"]
+fastembed = ["fastembed>=0.4.1"]
+pdfplumber = [
+    "pdfplumber>=0.11.4",
+]

 [tool.uv]
 dev-dependencies = [
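
fastembed and pdfplumber now ship as opt-in extras instead of hard dependencies (pip install "crewai[fastembed,pdfplumber]"). A sketch of the deferred-import guard this implies for consumers; the helper name _require_fastembed is illustrative, though PDFKnowledgeSource below uses the same pattern:

def _require_fastembed():
    """Hypothetical guard: import fastembed only when it is actually needed."""
    try:
        import fastembed

        return fastembed
    except ImportError as e:
        raise ImportError(
            'fastembed is not installed. Install it with: pip install "crewai[fastembed]"'
        ) from e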

View File

@@ -1,7 +1,9 @@
 import warnings
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.flow.flow import Flow
+from crewai.knowledge.knowledge import Knowledge
 from crewai.llm import LLM
 from crewai.pipeline import Pipeline
 from crewai.process import Process
@@ -15,4 +17,14 @@ warnings.filterwarnings(
     module="pydantic.main",
 )
 __version__ = "0.76.9"
-__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
+__all__ = [
+    "Agent",
+    "Crew",
+    "Process",
+    "Task",
+    "Pipeline",
+    "Router",
+    "LLM",
+    "Flow",
+    "Knowledge",
+]

View File

@@ -47,7 +47,7 @@ class FastEmbed(BaseEmbedder):
             cache_dir=str(cache_dir) if cache_dir else None,
         )

-    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
+    def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
         """
         Generate embeddings for a list of text chunks
@@ -55,13 +55,12 @@ class FastEmbed(BaseEmbedder):
             chunks: List of text chunks to embed

         Returns:
-            Array of embeddings
+            List of embeddings
         """
-        # FastEmbed returns a generator, convert to list then numpy array
         embeddings = list(self.model.embed(chunks))
-        return np.array(embeddings)
+        return embeddings

-    def embed_texts(self, texts: List[str]) -> np.ndarray:
+    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
         """
         Generate embeddings for a list of texts
@@ -69,11 +68,10 @@ class FastEmbed(BaseEmbedder):
             texts: List of texts to embed

         Returns:
-            Array of embeddings
+            List of embeddings
         """
-        # FastEmbed returns a generator, convert to list then numpy array
         embeddings = list(self.model.embed(texts))
-        return np.array(embeddings)
+        return embeddings

     def embed_text(self, text: str) -> np.ndarray:
         """

View File

@@ -1,21 +1,53 @@
-from typing import List, Optional
+from typing import List

-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict, Field

-from .embedder.base_embedder import BaseEmbedder
-from .embedder.fastembed import FastEmbed
-from .source.base_knowledge_source import BaseKnowledgeSource
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+from crewai.knowledge.embedder.fastembed import FastEmbed
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


 class Knowledge(BaseModel):
-    sources: Optional[List[BaseKnowledgeSource]] = None
-    embedder: BaseEmbedder
-
-    def __init__(
-        self,
-        sources: Optional[List[BaseKnowledgeSource]] = None,
-        embedder: Optional[BaseEmbedder] = None,
-    ):
-        super().__init__()
-        self.sources = sources or []
-        self.embedder = embedder or FastEmbed()
+    sources: List[BaseKnowledgeSource] = Field(default_factory=list)
+    embedder: BaseEmbedder = Field(default_factory=FastEmbed)
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Call add on all sources during initialization
+        for source in self.sources:
+            source.add(self.embedder)
+
+    def query(self, query: str, top_k: int = 3) -> List[str]:
+        """
+        Query across all knowledge sources to find the most relevant information.
+        Returns the top_k most relevant chunks.
+        """
+        if not self.sources:
+            return []
+
+        # Collect all chunks and embeddings from all sources
+        all_chunks = []
+        all_embeddings = []
+        for source in self.sources:
+            all_chunks.extend(source.chunks)
+            all_embeddings.extend(source.get_embeddings())
+
+        # Embed the query
+        query_embedding = self.embedder.embed_text(query)
+
+        # Calculate similarities
+        similarities = []
+        for idx, embedding in enumerate(all_embeddings):
+            similarity = query_embedding.dot(embedding)
+            similarities.append((similarity, idx))
+
+        # Sort by similarity
+        similarities.sort(reverse=True, key=lambda x: x[0])
+
+        # Get top_k results
+        top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
+        return top_chunks
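
A standalone sketch of the ranking query() performs, using plain NumPy and made-up two-dimensional vectors. One caveat: a raw dot product only equals cosine similarity when the embedder returns unit-length vectors, which this code implicitly assumes.

import numpy as np

chunks = ["chunk a", "chunk b", "chunk c"]
embeddings = [np.array([0.6, 0.8]), np.array([1.0, 0.0]), np.array([0.0, 1.0])]
query_embedding = np.array([0.8, 0.6])  # stand-in for embedder.embed_text(query)

# Score every chunk by dot product against the query, exactly as query() does.
similarities = [(query_embedding.dot(e), i) for i, e in enumerate(embeddings)]
similarities.sort(reverse=True, key=lambda x: x[0])
print([chunks[i] for _, i in similarities[:2]])  # ['chunk a', 'chunk b']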

View File

View File

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import List

 import numpy as np
@@ -7,7 +7,7 @@ from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 class BaseKnowledgeSource(ABC):
-    """Abstract base class for knowledge bases"""
+    """Abstract base class for knowledge sources."""

     def __init__(
         self,
@@ -17,96 +17,25 @@ class BaseKnowledgeSource(ABC):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.chunks: List[str] = []
-        self.chunk_embeddings: Dict[int, np.ndarray] = {}
+        self.chunk_embeddings: List[np.ndarray] = []

     @abstractmethod
-    def query(self, query: str) -> str:
-        """Query the knowledge base and return relevant information"""
+    def load_content(self):
+        """Load and preprocess content from the source."""
         pass

     @abstractmethod
-    def add(self, content: Any) -> None:
-        """Process and store content in the knowledge base"""
+    def add(self, embedder: BaseEmbedder) -> None:
+        """Process content, chunk it, compute embeddings, and save them."""
         pass

-    def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None:
-        """Embed chunks and store them"""
-        if not new_chunks:
-            return
-        # Get embeddings for new chunks
-        embeddings = embedder.embed_texts(new_chunks)
-        # Store embeddings with their corresponding chunks
-        start_idx = len(self.chunks)
-        for i, embedding in enumerate(embeddings):
-            self.chunk_embeddings[start_idx + i] = embedding
+    def get_embeddings(self) -> List[np.ndarray]:
+        """Return the list of embeddings for the chunks."""
+        return self.chunk_embeddings

     def _chunk_text(self, text: str) -> List[str]:
-        """Split text into chunks with overlap"""
-        chunks = []
-        start = 0
-        text_length = len(text)
-
-        while start < text_length:
-            # Get the chunk of size chunk_size
-            end = start + self.chunk_size
-            if end >= text_length:
-                # If we're at the end, just take the rest
-                chunks.append(text[start:].strip())
-                break
-
-            # Look for a good breaking point
-            # Priority: double newline > single newline > period > space
-            break_chars = ["\n\n", "\n", ". ", " "]
-            chunk_end = end
-            for break_char in break_chars:
-                # Look for the break_char in a window around the end point
-                window_start = max(start + self.chunk_size - 100, start)
-                window_end = min(start + self.chunk_size + 100, text_length)
-                window_text = text[window_start:window_end]
-                # Find the last occurrence of the break_char in the window
-                last_break = window_text.rfind(break_char)
-                if last_break != -1:
-                    chunk_end = window_start + last_break + len(break_char)
-                    break
-
-            # Add the chunk
-            chunk = text[start:chunk_end].strip()
-            if chunk:  # Only add non-empty chunks
-                chunks.append(chunk)
-
-            # Move the start pointer, accounting for overlap
-            start = max(
-                start + self.chunk_size - self.chunk_overlap,
-                chunk_end - self.chunk_overlap,
-            )
-
-        return chunks
-
-    def _find_similar_chunks(
-        self, embedder: BaseEmbedder, query: str, top_k: int = 3
-    ) -> List[str]:
-        """Find the most similar chunks to a query using embeddings"""
-        if not self.chunks:
-            return []
-        # Get query embedding
-        query_embedding = embedder.embed_text(query)
-        # Calculate similarities with all chunks
-        similarities = []
-        for idx, chunk_embedding in self.chunk_embeddings.items():
-            similarity = np.dot(query_embedding, chunk_embedding)
-            similarities.append((similarity, idx))
-        # Sort by similarity and get top_k chunks
-        similarities.sort(reverse=True)
-        top_chunks = []
-        for _, idx in similarities[:top_k]:
-            top_chunks.append(self.chunks[idx])
-        return top_chunks
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
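
The rewritten _chunk_text drops the old boundary-aware splitter (double newline, newline, sentence end, space) in favor of fixed-width slices whose start advances by chunk_size - chunk_overlap characters. A toy run with small sizes makes the overlap visible:

# Same list comprehension as _chunk_text, with toy sizes for readability.
chunk_size, chunk_overlap = 10, 3
text = "abcdefghijklmnopqrstuvwxyz"
chunks = [
    text[i : i + chunk_size]
    for i in range(0, len(text), chunk_size - chunk_overlap)
]
print(chunks)  # ['abcdefghij', 'hijklmnopq', 'opqrstuvwx', 'vwxyz']
# Consecutive chunks share chunk_overlap (here 3) characters, e.g. "hij".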

View File

@@ -0,0 +1,65 @@
from pathlib import Path
from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


class PDFKnowledgeSource(BaseKnowledgeSource):
    """A knowledge source that stores and queries PDF file content using embeddings."""

    def __init__(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        super().__init__(chunk_size, chunk_overlap)
        self.file_path = Path(file_path)
        self.content = self.load_content()

    def _import_pdfplumber(self):
        """Dynamically import pdfplumber."""
        try:
            import pdfplumber

            return pdfplumber
        except ImportError:
            raise ImportError(
                "pdfplumber is not installed. Please install it with: pip install pdfplumber"
            )

    def load_content(self) -> str:
        """Load and preprocess PDF file content."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")
        if not self.file_path.is_file():
            raise ValueError(f"Path is not a file: {self.file_path}")

        pdfplumber = self._import_pdfplumber()

        text = ""
        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        return text

    def add(self, embedder: BaseEmbedder) -> None:
        """
        Add PDF file content to the knowledge source, chunk it, compute embeddings,
        and save the embeddings.
        """
        new_chunks = self._chunk_text(self.content)
        self.chunks.extend(new_chunks)
        # Compute embeddings for the new chunks
        new_embeddings = embedder.embed_chunks(new_chunks)
        # Save the embeddings
        self.chunk_embeddings.extend(new_embeddings)

    def _chunk_text(self, text: str) -> List[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]
            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
        ]
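
A hypothetical end-to-end use of the new PDF source; the file name manual.pdf is illustrative, and both the pdfplumber and fastembed extras must be installed:

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource

pdf_source = PDFKnowledgeSource(file_path="manual.pdf")  # raises FileNotFoundError if missing
knowledge_base = Knowledge(sources=[pdf_source])  # extracts, chunks, and embeds every page
print(knowledge_base.query("How do I get started?"))  # top 3 chunks by default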

View File

@@ -1,9 +1,11 @@
+from typing import List
+
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


 class StringKnowledgeSource(BaseKnowledgeSource):
-    """A knowledge base that stores and queries plain text content using embeddings"""
+    """A knowledge source that stores and queries plain text content using embeddings."""

     def __init__(
         self,
@@ -15,25 +17,29 @@ class StringKnowledgeSource(BaseKnowledgeSource):
             chunk_size,
             chunk_overlap,
         )
+        self.content = content
+        self.load_content()
+
+    def load_content(self):
+        """Load and preprocess string content."""
+        if not isinstance(self.content, str):
+            raise ValueError("StringKnowledgeSource only accepts string content")

     def add(self, embedder: BaseEmbedder) -> None:
-        """Add text content to the knowledge base, chunk it, and compute embeddings"""
-        if not isinstance(self.content, str):
-            raise ValueError("StringKnowledgeBase only accepts string content")
-
-        # Create chunks from the text
-        new_chunks = self._chunk_text(content)
-        # Add chunks to the knowledge base
+        """
+        Add string content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
-
-        # Compute and store embeddings for the new chunks
-        embedder.embed_chunks(new_chunks)
-
-    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
-        """
-        Query the knowledge base using semantic search
-        Returns the most relevant chunk based on embedding similarity
-        """
-        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
-        return similar_chunks[0] if similar_chunks else ""
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]

View File

@@ -1,9 +1,12 @@
+from pathlib import Path
+from typing import List
+
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


 class TextFileKnowledgeSource(BaseKnowledgeSource):
-    """A knowledge base that stores and queries plain text content using embeddings"""
+    """A knowledge source that stores and queries text file content using embeddings."""

     def __init__(
         self,
@@ -11,29 +14,35 @@ class TextFileKnowledgeSource(BaseKnowledgeSource):
         chunk_size: int = 1000,
         chunk_overlap: int = 200,
     ):
-        super().__init__(
-            chunk_size,
-            chunk_overlap,
-        )
+        super().__init__(chunk_size, chunk_overlap)
+        self.file_path = Path(file_path)
+        self.content = self.load_content()
+
+    def load_content(self) -> str:
+        """Load and preprocess text file content."""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File not found: {self.file_path}")
+        if not self.file_path.is_file():
+            raise ValueError(f"Path is not a file: {self.file_path}")
+        with self.file_path.open("r", encoding="utf-8") as f:
+            return f.read()

     def add(self, embedder: BaseEmbedder) -> None:
-        """Add text content to the knowledge base, chunk it, and compute embeddings"""
-        if not isinstance(self.content, str):
-            raise ValueError("StringKnowledgeBase only accepts string content")
-
-        # Create chunks from the text
-        new_chunks = self._chunk_text(content)
-        # Add chunks to the knowledge base
+        """
+        Add text file content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
-
-        # Compute and store embeddings for the new chunks
-        embedder.embed_chunks(new_chunks)
-
-    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
-        """
-        Query the knowledge base using semantic search
-        Returns the most relevant chunk based on embedding similarity
-        """
-        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
-        return similar_chunks[0] if similar_chunks else ""
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]

View File

Binary file not shown.

View File

@@ -0,0 +1,347 @@
"""Test Knowledge creation and querying functionality."""
import os
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
def test_single_short_string():
# Create a knowledge base with a single short string
content = "Brandon's favorite color is blue and he likes Mexican food."
string_source = StringKnowledgeSource(content=content)
knowledge_base = Knowledge(sources=[string_source])
# Perform a query
query = "What is Brandon's favorite color?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("blue" in result.lower() for result in results)
def test_single_2k_character_string():
# Create a 2k character string with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
string_source = StringKnowledgeSource(content=content)
knowledge_base = Knowledge(sources=[string_source])
# Perform a query
query = "What is Brandon's favorite movie?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("inception" in result.lower() for result in results)
def test_multiple_short_strings():
# Create multiple short string sources
contents = [
"Brandon loves hiking.",
"Brandon has a dog named Max.",
"Brandon enjoys painting landscapes.",
]
string_sources = [StringKnowledgeSource(content=content) for content in contents]
knowledge_base = Knowledge(sources=string_sources)
# Perform a query
query = "What is the name of Brandon's pet?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("max" in result.lower() for result in results)
def test_multiple_2k_character_strings():
# Create multiple 2k character strings with various facts about Brandon
contents = [
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2, # Repeat to ensure it's 2k characters
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2, # Repeat to ensure it's 2k characters
]
string_sources = [StringKnowledgeSource(content=content) for content in contents]
knowledge_base = Knowledge(sources=string_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result.lower() for result in results
)
def test_single_short_file(tmpdir):
# Create a single short text file
content = "Brandon's favorite sport is basketball."
file_path = tmpdir.join("short_file.txt")
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
query = "What sport does Brandon like?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("basketball" in result.lower() for result in results)
def test_single_2k_character_file(tmpdir):
# Create a single 2k character text file with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
) * 2 # Repeat to ensure it's 2k characters
file_path = tmpdir.join("long_file.txt")
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
query = "What is Brandon's favorite movie?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("inception" in result.lower() for result in results)
def test_multiple_short_files(tmpdir):
# Create multiple short text files
contents = [
"Brandon lives in New York.",
"Brandon works as a software engineer.",
"Brandon enjoys cooking Italian food.",
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
# Perform a query
query = "Where does Brandon live?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("new york" in result.lower() for result in results)
def test_multiple_2k_character_files(tmpdir):
# Create multiple 2k character text files with various facts about Brandon
contents = [
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2, # Repeat to ensure it's 2k characters
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2, # Repeat to ensure it's 2k characters
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"long_file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result.lower() for result in results
)
def test_hybrid_string_and_files(tmpdir):
# Create string sources
string_contents = [
"Brandon is learning French.",
"Brandon visited Paris last summer.",
]
string_sources = [
StringKnowledgeSource(content=content) for content in string_contents
]
# Create file sources
file_contents = [
"Brandon prefers tea over coffee.",
"Brandon's favorite book is 'The Alchemist'.",
]
file_paths = []
for i, content in enumerate(file_contents):
file_path = tmpdir.join(f"file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
# Combine string and file sources
knowledge_base = Knowledge(sources=string_sources + file_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("the alchemist" in result.lower() for result in results)
def test_pdf_knowledge_source():
# Get the directory of the current file
current_dir = os.path.dirname(__file__)
# Construct the path to the PDF file
pdf_path = os.path.join(current_dir, "crewai_quickstart.pdf")
# Create a PDFKnowledgeSource
pdf_source = PDFKnowledgeSource(file_path=pdf_path)
knowledge_base = Knowledge(sources=[pdf_source])
# Perform a query
query = "How do you create a crew?"
results = knowledge_base.query(query)
print("Results from querying PDFKnowledgeSource:", results)
# Assert that the correct information is retrieved
assert any(
"crewai create crew latest-ai-development" in result.lower()
for result in results
)

uv.lock generated
View File

@@ -612,7 +612,6 @@ dependencies = [
{ name = "chromadb" }, { name = "chromadb" },
{ name = "click" }, { name = "click" },
{ name = "crewai-tools" }, { name = "crewai-tools" },
{ name = "fastembed" },
{ name = "instructor" }, { name = "instructor" },
{ name = "json-repair" }, { name = "json-repair" },
{ name = "jsonref" }, { name = "jsonref" },
@@ -635,6 +634,15 @@ dependencies = [
 agentops = [
     { name = "agentops" },
 ]
+fastembed = [
+    { name = "fastembed" },
+]
+network = [
+    { name = "pdfplumber" },
+]
+pdfplumber = [
+    { name = "pdfplumber" },
+]
 tools = [
     { name = "crewai-tools" },
 ]
@@ -668,7 +676,7 @@ requires-dist = [
     { name = "click", specifier = ">=8.1.7" },
     { name = "crewai-tools", specifier = ">=0.13.4" },
     { name = "crewai-tools", marker = "extra == 'tools'", specifier = ">=0.13.4" },
-    { name = "fastembed", specifier = ">=0.4.1" },
+    { name = "fastembed", marker = "extra == 'fastembed'", specifier = ">=0.4.1" },
     { name = "instructor", specifier = ">=1.3.3" },
     { name = "json-repair", specifier = ">=0.25.2" },
     { name = "jsonref", specifier = ">=1.1.0" },
@@ -678,6 +686,8 @@ requires-dist = [
     { name = "opentelemetry-api", specifier = ">=1.22.0" },
     { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.22.0" },
     { name = "opentelemetry-sdk", specifier = ">=1.22.0" },
+    { name = "pdfplumber", marker = "extra == 'network'", specifier = ">=0.11.4" },
+    { name = "pdfplumber", marker = "extra == 'pdfplumber'", specifier = ">=0.11.4" },
     { name = "pydantic", specifier = ">=2.4.2" },
     { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "pyvis", specifier = ">=0.3.2" },
@@ -2975,6 +2985,33 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191 },
 ]

+[[package]]
+name = "pdfminer-six"
+version = "20231228"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "charset-normalizer" },
+    { name = "cryptography" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/31/b1/a43e3bd872ded4deea4f8efc7aff1703fca8c5455d0c06e20506a06a44ff/pdfminer.six-20231228.tar.gz", hash = "sha256:6004da3ad1a7a4d45930cb950393df89b068e73be365a6ff64a838d37bcb08c4", size = 7362505 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/eb/9c/e46fe7502b32d7db6af6e36a9105abb93301fa1ec475b5ddcba8b35ae23a/pdfminer.six-20231228-py3-none-any.whl", hash = "sha256:e8d3c3310e6fbc1fe414090123ab01351634b4ecb021232206c4c9a8ca3e3b8f", size = 5614515 },
+]
+
+[[package]]
+name = "pdfplumber"
+version = "0.11.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pdfminer-six" },
+    { name = "pillow" },
+    { name = "pypdfium2" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ca/f0/457bda3629dfa5b01c645519fe30230e1739751f6645e23fca2dabf6c2e5/pdfplumber-0.11.4.tar.gz", hash = "sha256:147b55cde2351fcb9523b46b09cc771eea3602faecfb60d463c6bf951694fbe8", size = 113305 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/87/415cb472981a8d2e36beeeadf074ebb686cc2bfe8d18de973232da291bd5/pdfplumber-0.11.4-py3-none-any.whl", hash = "sha256:6150f0678c7aaba974ac09839c17475d6c0c4d126b5f92cb85154885f31c6d73", size = 59182 },
+]
+
 [[package]]
 name = "pexpect"
 version = "4.9.0"
@@ -3546,6 +3583,26 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/48/8f/9bbf22ba6a00001a45dbc54337e5bbbd43e7d8f34c8158c92cddc45736af/pypdf-5.0.1-py3-none-any.whl", hash = "sha256:ff8a32da6c7a63fea9c32fa4dd837cdd0db7966adf6c14f043e3f12592e992db", size = 294470 },
 ]

+[[package]]
+name = "pypdfium2"
+version = "4.30.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254 },
+    { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624 },
+    { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126 },
+    { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077 },
+    { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431 },
+    { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008 },
+    { url = "https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543 },
+    { url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911 },
+    { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430 },
+    { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951 },
+    { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098 },
+    { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118 },
+]
+
 [[package]]
 name = "pypika"
 version = "0.48.9"