Adding core knowledge sources

Brandon Hancock committed 2024-11-06 12:33:55 -05:00
parent a8a2f80616
commit 1a35114c08
15 changed files with 645 additions and 155 deletions

View File

@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeSource(ABC):
"""Abstract base class for different types of knowledge sources."""
def __init__(
self,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.chunks: List[str] = []
@abstractmethod
def load_content(self):
"""Load and preprocess content from the source."""
pass
@abstractmethod
def add(self, embedder: BaseEmbedder) -> None:
"""Add content to the knowledge base, chunk it, and compute embeddings."""
pass
@abstractmethod
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""Query the knowledge base using semantic search."""
pass
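
For orientation, a concrete source only needs to implement the three abstract methods above. A minimal in-memory sketch (the class name and its backing list are illustrative, not part of this commit):

from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource

class ListKnowledgeSource(BaseKnowledgeSource):
    """Hypothetical source backed by an in-memory list of strings."""

    def __init__(self, items: List[str], **kwargs):
        super().__init__(**kwargs)
        self.items = items

    def load_content(self) -> str:
        return "\n".join(self.items)

    def add(self, embedder: BaseEmbedder) -> None:
        # A real source would also compute and store embeddings here.
        self.chunks.extend(self.items)

    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
        # Toy lexical lookup; the shipped sources use embedding similarity.
        matches = [c for c in self.chunks if query.lower() in c.lower()]
        return matches[0] if matches else ""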

View File

@@ -29,7 +29,6 @@ dependencies = [
"tomli-w>=1.1.0",
"chromadb>=0.4.24",
"tomli>=2.0.2",
"fastembed>=0.4.1",
]
[project.urls]
@@ -40,6 +39,10 @@ Repository = "https://github.com/crewAIInc/crewAI"
[project.optional-dependencies]
tools = ["crewai-tools>=0.13.4"]
agentops = ["agentops>=0.3.0"]
fastembed = ["fastembed>=0.4.1"]
pdfplumber = [
"pdfplumber>=0.11.4",
]
[tool.uv]
dev-dependencies = [
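
The net effect of the pyproject change: fastembed moves from a core dependency to an opt-in extra, and pdfplumber is added as another extra. Assuming standard pip extras syntax, users who want the default embedder plus PDF support would install both:

pip install "crewai[fastembed,pdfplumber]"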

View File

@@ -1,7 +1,9 @@
import warnings
from crewai.agent import Agent
from crewai.crew import Crew
from crewai.flow.flow import Flow
from crewai.knowledge.knowledge import Knowledge
from crewai.llm import LLM
from crewai.pipeline import Pipeline
from crewai.process import Process
@@ -15,4 +17,14 @@ warnings.filterwarnings(
module="pydantic.main",
)
__version__ = "0.76.9"
__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
__all__ = [
"Agent",
"Crew",
"Process",
"Task",
"Pipeline",
"Router",
"LLM",
"Flow",
"Knowledge",
]
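
With the new export, Knowledge can be imported from the package root like the other top-level classes:

from crewai import Knowledge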

View File

@@ -47,7 +47,7 @@ class FastEmbed(BaseEmbedder):
cache_dir=str(cache_dir) if cache_dir else None,
)
def embed_chunks(self, chunks: List[str]) -> np.ndarray:
def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
"""
Generate embeddings for a list of text chunks
@@ -55,13 +55,12 @@ class FastEmbed(BaseEmbedder):
chunks: List of text chunks to embed
Returns:
Array of embeddings
List of embeddings
"""
# FastEmbed returns a generator, convert to list then numpy array
embeddings = list(self.model.embed(chunks))
return np.array(embeddings)
return embeddings
def embed_texts(self, texts: List[str]) -> np.ndarray:
def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
"""
Generate embeddings for a list of texts
@@ -69,11 +68,10 @@ class FastEmbed(BaseEmbedder):
texts: List of texts to embed
Returns:
Array of embeddings
List of embeddings
"""
# FastEmbed returns a generator, convert to list then numpy array
embeddings = list(self.model.embed(texts))
return np.array(embeddings)
return embeddings
def embed_text(self, text: str) -> np.ndarray:
"""

View File

@@ -1,21 +1,53 @@
from typing import List, Optional
from typing import List
from pydantic import BaseModel
from pydantic import BaseModel, ConfigDict, Field
from .embedder.base_embedder import BaseEmbedder
from .embedder.fastembed import FastEmbed
from .source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.embedder.fastembed import FastEmbed
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class Knowledge(BaseModel):
sources: Optional[List[BaseKnowledgeSource]] = None
embedder: BaseEmbedder
sources: List[BaseKnowledgeSource] = Field(default_factory=list)
embedder: BaseEmbedder = Field(default_factory=FastEmbed)
def __init__(
self,
sources: Optional[List[BaseKnowledgeSource]] = None,
embedder: Optional[BaseEmbedder] = None,
):
super().__init__()
self.sources = sources or []
self.embedder = embedder or FastEmbed()
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, **data):
super().__init__(**data)
# Call add on all sources during initialization
for source in self.sources:
source.add(self.embedder)
def query(self, query: str, top_k: int = 3) -> List[str]:
"""
Query across all knowledge sources to find the most relevant information.
Returns the top_k most relevant chunks.
"""
if not self.sources:
return []
# Collect all chunks and embeddings from all sources
all_chunks = []
all_embeddings = []
for source in self.sources:
all_chunks.extend(source.chunks)
all_embeddings.extend(source.get_embeddings())
# Embed the query
query_embedding = self.embedder.embed_text(query)
# Calculate similarities
similarities = []
for idx, embedding in enumerate(all_embeddings):
similarity = query_embedding.dot(embedding)
similarities.append((similarity, idx))
# Sort by similarity
similarities.sort(reverse=True, key=lambda x: x[0])
# Get top_k results
top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
return top_chunks
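
Two things worth noting about query: sources are embedded eagerly (each source.add(embedder) runs at construction time), and similarity is a raw dot product, which equals cosine similarity only when the embedder returns unit-normalized vectors. A minimal usage sketch, assuming the fastembed extra is installed (FastEmbed is the default embedder):

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

source = StringKnowledgeSource(content="Brandon's favorite color is blue.")
kb = Knowledge(sources=[source])  # embeds the source up front
print(kb.query("What is Brandon's favorite color?", top_k=1))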

View File

View File

@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from typing import List
import numpy as np
@@ -7,7 +7,7 @@ from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeSource(ABC):
"""Abstract base class for knowledge bases"""
"""Abstract base class for knowledge sources."""
def __init__(
self,
@@ -17,96 +17,25 @@ class BaseKnowledgeSource(ABC):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.chunks: List[str] = []
self.chunk_embeddings: Dict[int, np.ndarray] = {}
self.chunk_embeddings: List[np.ndarray] = []
@abstractmethod
def query(self, query: str) -> str:
"""Query the knowledge base and return relevant information"""
def load_content(self):
"""Load and preprocess content from the source."""
pass
@abstractmethod
def add(self, content: Any) -> None:
"""Process and store content in the knowledge base"""
def add(self, embedder: BaseEmbedder) -> None:
"""Process content, chunk it, compute embeddings, and save them."""
pass
def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None:
"""Embed chunks and store them"""
if not new_chunks:
return
# Get embeddings for new chunks
embeddings = embedder.embed_texts(new_chunks)
# Store embeddings with their corresponding chunks
start_idx = len(self.chunks)
for i, embedding in enumerate(embeddings):
self.chunk_embeddings[start_idx + i] = embedding
def get_embeddings(self) -> List[np.ndarray]:
"""Return the list of embeddings for the chunks."""
return self.chunk_embeddings
def _chunk_text(self, text: str) -> List[str]:
"""Split text into chunks with overlap"""
chunks = []
start = 0
text_length = len(text)
while start < text_length:
# Get the chunk of size chunk_size
end = start + self.chunk_size
if end >= text_length:
# If we're at the end, just take the rest
chunks.append(text[start:].strip())
break
# Look for a good breaking point
# Priority: double newline > single newline > period > space
break_chars = ["\n\n", "\n", ". ", " "]
chunk_end = end
for break_char in break_chars:
# Look for the break_char in a window around the end point
window_start = max(start + self.chunk_size - 100, start)
window_end = min(start + self.chunk_size + 100, text_length)
window_text = text[window_start:window_end]
# Find the last occurrence of the break_char in the window
last_break = window_text.rfind(break_char)
if last_break != -1:
chunk_end = window_start + last_break + len(break_char)
break
# Add the chunk
chunk = text[start:chunk_end].strip()
if chunk: # Only add non-empty chunks
chunks.append(chunk)
# Move the start pointer, accounting for overlap
start = max(
start + self.chunk_size - self.chunk_overlap,
chunk_end - self.chunk_overlap,
)
return chunks
def _find_similar_chunks(
self, embedder: BaseEmbedder, query: str, top_k: int = 3
) -> List[str]:
"""Find the most similar chunks to a query using embeddings"""
if not self.chunks:
return []
# Get query embedding
query_embedding = embedder.embed_text(query)
# Calculate similarities with all chunks
similarities = []
for idx, chunk_embedding in self.chunk_embeddings.items():
similarity = np.dot(query_embedding, chunk_embedding)
similarities.append((similarity, idx))
# Sort by similarity and get top_k chunks
similarities.sort(reverse=True)
top_chunks = []
for _, idx in similarities[:top_k]:
top_chunks.append(self.chunks[idx])
return top_chunks
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
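
The refactor replaces the old break-point-aware splitter with fixed sliding windows: consecutive chunks start every chunk_size - chunk_overlap characters. A quick worked example of the boundaries with the defaults:

# chunk_size=1000, chunk_overlap=200 -> windows start every 800 chars
text = "x" * 2500
starts = list(range(0, 2500, 1000 - 200))  # [0, 800, 1600, 2400]
chunks = [text[i : i + 1000] for i in starts]
# Chunk lengths: 1000, 1000, 900, 100. Each chunk repeats the last 200
# characters of its predecessor; note the final window here is entirely
# contained in the previous one, an edge case of the simple splitter.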

View File

@@ -0,0 +1,65 @@
from pathlib import Path
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class PDFKnowledgeSource(BaseKnowledgeSource):
"""A knowledge source that stores and queries PDF file content using embeddings."""
def __init__(
self,
file_path: str,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(chunk_size, chunk_overlap)
self.file_path = Path(file_path)
self.content = self.load_content()
def _import_pdfplumber(self):
"""Dynamically import pdfplumber."""
try:
import pdfplumber
return pdfplumber
except ImportError:
raise ImportError(
"pdfplumber is not installed. Please install it with: pip install pdfplumber"
)
def load_content(self) -> str:
"""Load and preprocess PDF file content."""
if not self.file_path.exists():
raise FileNotFoundError(f"File not found: {self.file_path}")
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
pdfplumber = self._import_pdfplumber()
text = ""
with pdfplumber.open(self.file_path) as pdf:
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
return text
def add(self, embedder: BaseEmbedder) -> None:
"""
Add PDF file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
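
A usage sketch for the new PDF source, assuming the pdfplumber extra is installed (the file path is illustrative):

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource

# load_content() runs inside __init__, so a missing file or a missing
# pdfplumber install fails fast at construction time.
pdf_source = PDFKnowledgeSource(file_path="docs/handbook.pdf")
kb = Knowledge(sources=[pdf_source])
print(kb.query("How do you create a crew?"))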

View File

@@ -1,9 +1,11 @@
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class StringKnowledgeSource(BaseKnowledgeSource):
"""A knowledge base that stores and queries plain text content using embeddings"""
"""A knowledge source that stores and queries plain text content using embeddings."""
def __init__(
self,
@@ -15,25 +17,29 @@ class StringKnowledgeSource(BaseKnowledgeSource):
chunk_size,
chunk_overlap,
)
self.content = content
self.load_content()
def load_content(self):
"""Load and preprocess string content."""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeSource only accepts string content")
def add(self, embedder: BaseEmbedder) -> None:
"""Add text content to the knowledge base, chunk it, and compute embeddings"""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeBase only accepts string content")
# Create chunks from the text
new_chunks = self._chunk_text(content)
# Add chunks to the knowledge base
"""
Add string content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
# Compute and store embeddings for the new chunks
embedder.embed_chunks(new_chunks)
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""
Query the knowledge base using semantic search
Returns the most relevant chunk based on embedding similarity
"""
similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
return similar_chunks[0] if similar_chunks else ""
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]

View File

@@ -1,9 +1,12 @@
from pathlib import Path
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class TextFileKnowledgeSource(BaseKnowledgeSource):
"""A knowledge base that stores and queries plain text content using embeddings"""
"""A knowledge source that stores and queries text file content using embeddings."""
def __init__(
self,
@@ -11,29 +14,35 @@ class TextFileKnowledgeSource(BaseKnowledgeSource):
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(
chunk_size,
chunk_overlap,
)
super().__init__(chunk_size, chunk_overlap)
self.file_path = Path(file_path)
self.content = self.load_content()
def load_content(self) -> str:
"""Load and preprocess text file content."""
if not self.file_path.exists():
raise FileNotFoundError(f"File not found: {self.file_path}")
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
with self.file_path.open("r", encoding="utf-8") as f:
return f.read()
def add(self, embedder: BaseEmbedder) -> None:
"""Add text content to the knowledge base, chunk it, and compute embeddings"""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeBase only accepts string content")
# Create chunks from the text
new_chunks = self._chunk_text(content)
# Add chunks to the knowledge base
"""
Add text file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
# Compute and store embeddings for the new chunks
embedder.embed_chunks(new_chunks)
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""
Query the knowledge base using semantic search
Returns the most relevant chunk based on embedding similarity
"""
similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
return similar_chunks[0] if similar_chunks else ""
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]

View File

Binary file not shown.

View File

@@ -0,0 +1,347 @@
"""Test Knowledge creation and querying functionality."""
import os
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
def test_single_short_string():
# Create a knowledge base with a single short string
content = "Brandon's favorite color is blue and he likes Mexican food."
string_source = StringKnowledgeSource(content=content)
knowledge_base = Knowledge(sources=[string_source])
# Perform a query
query = "What is Brandon's favorite color?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("blue" in result.lower() for result in results)
def test_single_2k_character_string():
# Create a 2k character string with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
string_source = StringKnowledgeSource(content=content)
knowledge_base = Knowledge(sources=[string_source])
# Perform a query
query = "What is Brandon's favorite movie?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("inception" in result.lower() for result in results)
def test_multiple_short_strings():
# Create multiple short string sources
contents = [
"Brandon loves hiking.",
"Brandon has a dog named Max.",
"Brandon enjoys painting landscapes.",
]
string_sources = [StringKnowledgeSource(content=content) for content in contents]
knowledge_base = Knowledge(sources=string_sources)
# Perform a query
query = "What is the name of Brandon's pet?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("max" in result.lower() for result in results)
def test_multiple_2k_character_strings():
# Create multiple 2k character strings with various facts about Brandon
contents = [
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2,  # Repeat so the text is comfortably over 2k characters
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2,  # Repeat so the text is comfortably over 2k characters
]
string_sources = [StringKnowledgeSource(content=content) for content in contents]
knowledge_base = Knowledge(sources=string_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result.lower() for result in results
)
def test_single_short_file(tmpdir):
# Create a single short text file
content = "Brandon's favorite sport is basketball."
file_path = tmpdir.join("short_file.txt")
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
query = "What sport does Brandon like?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("basketball" in result.lower() for result in results)
def test_single_2k_character_file(tmpdir):
# Create a single 2k character text file with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
) * 2  # Repeat so the text is comfortably over 2k characters
file_path = tmpdir.join("long_file.txt")
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
query = "What is Brandon's favorite movie?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("inception" in result.lower() for result in results)
def test_multiple_short_files(tmpdir):
# Create multiple short text files
contents = [
"Brandon lives in New York.",
"Brandon works as a software engineer.",
"Brandon enjoys cooking Italian food.",
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
# Perform a query
query = "Where does Brandon live?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("new york" in result.lower() for result in results)
def test_multiple_2k_character_files(tmpdir):
# Create multiple 2k character text files with various facts about Brandon
contents = [
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2,  # Repeat so the text is comfortably over 2k characters
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2,  # Repeat so the text is comfortably over 2k characters
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"long_file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result.lower() for result in results
)
def test_hybrid_string_and_files(tmpdir):
# Create string sources
string_contents = [
"Brandon is learning French.",
"Brandon visited Paris last summer.",
]
string_sources = [
StringKnowledgeSource(content=content) for content in string_contents
]
# Create file sources
file_contents = [
"Brandon prefers tea over coffee.",
"Brandon's favorite book is 'The Alchemist'.",
]
file_paths = []
for i, content in enumerate(file_contents):
file_path = tmpdir.join(f"file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
# Combine string and file sources
knowledge_base = Knowledge(sources=string_sources + file_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("the alchemist" in result.lower() for result in results)
def test_pdf_knowledge_source():
# Get the directory of the current file
current_dir = os.path.dirname(__file__)
# Construct the path to the PDF file
pdf_path = os.path.join(current_dir, "crewai_quickstart.pdf")
# Create a PDFKnowledgeSource
pdf_source = PDFKnowledgeSource(file_path=pdf_path)
knowledge_base = Knowledge(sources=[pdf_source])
# Perform a query
query = "How do you create a crew?"
results = knowledge_base.query(query)
print("Results from querying PDFKnowledgeSource:", results)
# Assert that the correct information is retrieved
assert any(
"crewai create crew latest-ai-development" in result.lower()
for result in results
)

uv.lock generated
View File

@@ -612,7 +612,6 @@ dependencies = [
{ name = "chromadb" },
{ name = "click" },
{ name = "crewai-tools" },
{ name = "fastembed" },
{ name = "instructor" },
{ name = "json-repair" },
{ name = "jsonref" },
@@ -635,6 +634,15 @@ dependencies = [
agentops = [
{ name = "agentops" },
]
fastembed = [
{ name = "fastembed" },
]
network = [
{ name = "pdfplumber" },
]
pdfplumber = [
{ name = "pdfplumber" },
]
tools = [
{ name = "crewai-tools" },
]
@@ -668,7 +676,7 @@ requires-dist = [
{ name = "click", specifier = ">=8.1.7" },
{ name = "crewai-tools", specifier = ">=0.13.4" },
{ name = "crewai-tools", marker = "extra == 'tools'", specifier = ">=0.13.4" },
{ name = "fastembed", specifier = ">=0.4.1" },
{ name = "fastembed", marker = "extra == 'fastembed'", specifier = ">=0.4.1" },
{ name = "instructor", specifier = ">=1.3.3" },
{ name = "json-repair", specifier = ">=0.25.2" },
{ name = "jsonref", specifier = ">=1.1.0" },
@@ -678,6 +686,8 @@ requires-dist = [
{ name = "opentelemetry-api", specifier = ">=1.22.0" },
{ name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.22.0" },
{ name = "opentelemetry-sdk", specifier = ">=1.22.0" },
{ name = "pdfplumber", marker = "extra == 'network'", specifier = ">=0.11.4" },
{ name = "pdfplumber", marker = "extra == 'pdfplumber'", specifier = ">=0.11.4" },
{ name = "pydantic", specifier = ">=2.4.2" },
{ name = "python-dotenv", specifier = ">=1.0.0" },
{ name = "pyvis", specifier = ">=0.3.2" },
@@ -2975,6 +2985,33 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191 },
]
[[package]]
name = "pdfminer-six"
version = "20231228"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "charset-normalizer" },
{ name = "cryptography" },
]
sdist = { url = "https://files.pythonhosted.org/packages/31/b1/a43e3bd872ded4deea4f8efc7aff1703fca8c5455d0c06e20506a06a44ff/pdfminer.six-20231228.tar.gz", hash = "sha256:6004da3ad1a7a4d45930cb950393df89b068e73be365a6ff64a838d37bcb08c4", size = 7362505 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/eb/9c/e46fe7502b32d7db6af6e36a9105abb93301fa1ec475b5ddcba8b35ae23a/pdfminer.six-20231228-py3-none-any.whl", hash = "sha256:e8d3c3310e6fbc1fe414090123ab01351634b4ecb021232206c4c9a8ca3e3b8f", size = 5614515 },
]
[[package]]
name = "pdfplumber"
version = "0.11.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pdfminer-six" },
{ name = "pillow" },
{ name = "pypdfium2" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ca/f0/457bda3629dfa5b01c645519fe30230e1739751f6645e23fca2dabf6c2e5/pdfplumber-0.11.4.tar.gz", hash = "sha256:147b55cde2351fcb9523b46b09cc771eea3602faecfb60d463c6bf951694fbe8", size = 113305 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/87/415cb472981a8d2e36beeeadf074ebb686cc2bfe8d18de973232da291bd5/pdfplumber-0.11.4-py3-none-any.whl", hash = "sha256:6150f0678c7aaba974ac09839c17475d6c0c4d126b5f92cb85154885f31c6d73", size = 59182 },
]
[[package]]
name = "pexpect"
version = "4.9.0"
@@ -3546,6 +3583,26 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/48/8f/9bbf22ba6a00001a45dbc54337e5bbbd43e7d8f34c8158c92cddc45736af/pypdf-5.0.1-py3-none-any.whl", hash = "sha256:ff8a32da6c7a63fea9c32fa4dd837cdd0db7966adf6c14f043e3f12592e992db", size = 294470 },
]
[[package]]
name = "pypdfium2"
version = "4.30.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254 },
{ url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624 },
{ url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126 },
{ url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077 },
{ url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431 },
{ url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008 },
{ url = "https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543 },
{ url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911 },
{ url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430 },
{ url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951 },
{ url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098 },
{ url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118 },
]
[[package]]
name = "pypika"
version = "0.48.9"