diff --git a/path/to/src/crewai/knowledge/source/base_knowledge_source.py b/path/to/src/crewai/knowledge/source/base_knowledge_source.py
new file mode 100644
index 000000000..0b5c8fee8
--- /dev/null
+++ b/path/to/src/crewai/knowledge/source/base_knowledge_source.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+
+
+class BaseKnowledgeSource(ABC):
+    """Abstract base class for different types of knowledge sources."""
+
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+    ):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.chunks: List[str] = []
+
+    @abstractmethod
+    def load_content(self):
+        """Load and preprocess content from the source."""
+        pass
+
+    @abstractmethod
+    def add(self, embedder: BaseEmbedder) -> None:
+        """Add content to the knowledge base, chunk it, and compute embeddings."""
+        pass
+
+    @abstractmethod
+    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
+        """Query the knowledge base using semantic search."""
+        pass
diff --git a/pyproject.toml b/pyproject.toml
index b4e1e26e6..f287e5484 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,6 @@ dependencies = [
     "tomli-w>=1.1.0",
     "chromadb>=0.4.24",
     "tomli>=2.0.2",
-    "fastembed>=0.4.1",
 ]
 
 [project.urls]
@@ -40,6 +39,10 @@ Repository = "https://github.com/crewAIInc/crewAI"
 [project.optional-dependencies]
 tools = ["crewai-tools>=0.13.4"]
 agentops = ["agentops>=0.3.0"]
+fastembed = ["fastembed>=0.4.1"]
+pdfplumber = [
+    "pdfplumber>=0.11.4",
+]
 
 [tool.uv]
 dev-dependencies = [
diff --git a/src/crewai/__init__.py b/src/crewai/__init__.py
index 0a2f02a59..48566fbc9 100644
--- a/src/crewai/__init__.py
+++ b/src/crewai/__init__.py
@@ -1,7 +1,9 @@
 import warnings
+
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.flow.flow import Flow
+from crewai.knowledge.knowledge import Knowledge
 from crewai.llm import LLM
 from crewai.pipeline import Pipeline
 from crewai.process import Process
@@ -15,4 +17,14 @@ warnings.filterwarnings(
     module="pydantic.main",
 )
 __version__ = "0.76.9"
-__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
+__all__ = [
+    "Agent",
+    "Crew",
+    "Process",
+    "Task",
+    "Pipeline",
+    "Router",
+    "LLM",
+    "Flow",
+    "Knowledge",
+]
diff --git a/src/crewai/knowledge/embedder/__init__.py b/src/crewai/knowledge/embedder/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/crewai/knowledge/embedder/fastembed.py b/src/crewai/knowledge/embedder/fastembed.py
index 13e2f7bda..adff1cdbe 100644
--- a/src/crewai/knowledge/embedder/fastembed.py
+++ b/src/crewai/knowledge/embedder/fastembed.py
@@ -47,7 +47,7 @@ class FastEmbed(BaseEmbedder):
             cache_dir=str(cache_dir) if cache_dir else None,
         )
 
-    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
+    def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
         """
         Generate embeddings for a list of text chunks
 
@@ -55,13 +55,12 @@ class FastEmbed(BaseEmbedder):
             chunks: List of text chunks to embed
 
         Returns:
-            Array of embeddings
+            List of embeddings
         """
-        # FastEmbed returns a generator, convert to list then numpy array
         embeddings = list(self.model.embed(chunks))
-        return np.array(embeddings)
+        return embeddings
 
-    def embed_texts(self, texts: List[str]) -> np.ndarray:
+    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
         """
         Generate embeddings for a list of texts
 
@@ -69,11 +68,10 @@
             texts: List of texts to embed
 
         Returns:
-            Array of embeddings
+            List of embeddings
         """
-        # FastEmbed returns a generator, convert to list then numpy array
        embeddings = list(self.model.embed(texts))
-        return np.array(embeddings)
+        return embeddings
 
     def embed_text(self, text: str) -> np.ndarray:
         """
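Note on the return-type change above: `embed_chunks` and `embed_texts` now hand back one vector per input instead of a single stacked array, which lets the knowledge sources below `extend` their per-chunk embedding lists directly. A minimal sketch of consuming the new shape, assuming fastembed is installed (it is now an optional extra):

    import numpy as np

    from crewai.knowledge.embedder.fastembed import FastEmbed

    embedder = FastEmbed()

    # One np.ndarray per chunk, kept as a plain Python list
    vectors = embedder.embed_chunks(["chunk one", "chunk two"])

    # Stack explicitly if a (num_chunks, dim) matrix is needed downstream
    matrix = np.vstack(vectors)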
diff --git a/src/crewai/knowledge/knowledge.py b/src/crewai/knowledge/knowledge.py
index 288a6f3ae..ba4ac34a9 100644
--- a/src/crewai/knowledge/knowledge.py
+++ b/src/crewai/knowledge/knowledge.py
@@ -1,21 +1,53 @@
-from typing import List, Optional
+from typing import List
 
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict, Field
 
-from .embedder.base_embedder import BaseEmbedder
-from .embedder.fastembed import FastEmbed
-from .source.base_knowledge_source import BaseKnowledgeSource
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+from crewai.knowledge.embedder.fastembed import FastEmbed
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 
 
 class Knowledge(BaseModel):
-    sources: Optional[List[BaseKnowledgeSource]] = None
-    embedder: BaseEmbedder
+    sources: List[BaseKnowledgeSource] = Field(default_factory=list)
+    embedder: BaseEmbedder = Field(default_factory=FastEmbed)
 
-    def __init__(
-        self,
-        sources: Optional[List[BaseKnowledgeSource]] = None,
-        embedder: Optional[BaseEmbedder] = None,
-    ):
-        super().__init__()
-        self.sources = sources or []
-        self.embedder = embedder or FastEmbed()
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Call add on all sources during initialization
+        for source in self.sources:
+            source.add(self.embedder)
+
+    def query(self, query: str, top_k: int = 3) -> List[str]:
+        """
+        Query across all knowledge sources to find the most relevant information.
+        Returns the top_k most relevant chunks.
+        """
+        if not self.sources:
+            return []
+
+        # Collect all chunks and embeddings from all sources
+        all_chunks = []
+        all_embeddings = []
+
+        for source in self.sources:
+            all_chunks.extend(source.chunks)
+            all_embeddings.extend(source.get_embeddings())
+
+        # Embed the query
+        query_embedding = self.embedder.embed_text(query)
+
+        # Calculate similarities
+        similarities = []
+        for idx, embedding in enumerate(all_embeddings):
+            similarity = query_embedding.dot(embedding)
+            similarities.append((similarity, idx))
+
+        # Sort by similarity
+        similarities.sort(reverse=True, key=lambda x: x[0])
+
+        # Get top_k results
+        top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
+
+        return top_chunks
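`Knowledge` now embeds every source eagerly in `__init__` and scores chunks with a raw dot product, which tracks cosine similarity only when the embedder produces unit-length vectors (which fastembed's default model is generally configured to do). A minimal usage sketch of the new API, using the StringKnowledgeSource defined later in this diff:

    from crewai import Knowledge
    from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

    # Sources are chunked and embedded up front, inside Knowledge.__init__
    knowledge = Knowledge(
        sources=[StringKnowledgeSource(content="Brandon's favorite color is blue.")]
    )

    # query() embeds the question and returns the top_k best-scoring chunks
    print(knowledge.query("What is Brandon's favorite color?", top_k=1))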
", " "] - chunk_end = end - - for break_char in break_chars: - # Look for the break_char in a window around the end point - window_start = max(start + self.chunk_size - 100, start) - window_end = min(start + self.chunk_size + 100, text_length) - window_text = text[window_start:window_end] - - # Find the last occurrence of the break_char in the window - last_break = window_text.rfind(break_char) - if last_break != -1: - chunk_end = window_start + last_break + len(break_char) - break - - # Add the chunk - chunk = text[start:chunk_end].strip() - if chunk: # Only add non-empty chunks - chunks.append(chunk) - - # Move the start pointer, accounting for overlap - start = max( - start + self.chunk_size - self.chunk_overlap, - chunk_end - self.chunk_overlap, - ) - - return chunks - - def _find_similar_chunks( - self, embedder: BaseEmbedder, query: str, top_k: int = 3 - ) -> List[str]: - """Find the most similar chunks to a query using embeddings""" - if not self.chunks: - return [] - - # Get query embedding - query_embedding = embedder.embed_text(query) - - # Calculate similarities with all chunks - similarities = [] - for idx, chunk_embedding in self.chunk_embeddings.items(): - similarity = np.dot(query_embedding, chunk_embedding) - similarities.append((similarity, idx)) - - # Sort by similarity and get top_k chunks - similarities.sort(reverse=True) - top_chunks = [] - for _, idx in similarities[:top_k]: - top_chunks.append(self.chunks[idx]) - - return top_chunks + """Utility method to split text into chunks.""" + return [ + text[i : i + self.chunk_size] + for i in range(0, len(text), self.chunk_size - self.chunk_overlap) + ] diff --git a/src/crewai/knowledge/source/pdf_knowledge_source.py b/src/crewai/knowledge/source/pdf_knowledge_source.py new file mode 100644 index 000000000..c86a8abc2 --- /dev/null +++ b/src/crewai/knowledge/source/pdf_knowledge_source.py @@ -0,0 +1,65 @@ +from pathlib import Path +from typing import List + +from crewai.knowledge.embedder.base_embedder import BaseEmbedder +from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource + + +class PDFKnowledgeSource(BaseKnowledgeSource): + """A knowledge source that stores and queries PDF file content using embeddings.""" + + def __init__( + self, + file_path: str, + chunk_size: int = 1000, + chunk_overlap: int = 200, + ): + super().__init__(chunk_size, chunk_overlap) + self.file_path = Path(file_path) + self.content = self.load_content() + + def _import_pdfplumber(self): + """Dynamically import pdfplumber.""" + try: + import pdfplumber + + return pdfplumber + except ImportError: + raise ImportError( + "pdfplumber is not installed. Please install it with: pip install pdfplumber" + ) + + def load_content(self) -> str: + """Load and preprocess PDF file content.""" + if not self.file_path.exists(): + raise FileNotFoundError(f"File not found: {self.file_path}") + if not self.file_path.is_file(): + raise ValueError(f"Path is not a file: {self.file_path}") + + pdfplumber = self._import_pdfplumber() + text = "" + with pdfplumber.open(self.file_path) as pdf: + for page in pdf.pages: + page_text = page.extract_text() + if page_text: + text += page_text + "\n" + return text + + def add(self, embedder: BaseEmbedder) -> None: + """ + Add PDF file content to the knowledge source, chunk it, compute embeddings, + and save the embeddings. 
diff --git a/src/crewai/knowledge/source/pdf_knowledge_source.py b/src/crewai/knowledge/source/pdf_knowledge_source.py
new file mode 100644
index 000000000..c86a8abc2
--- /dev/null
+++ b/src/crewai/knowledge/source/pdf_knowledge_source.py
@@ -0,0 +1,65 @@
+from pathlib import Path
+from typing import List
+
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
+
+
+class PDFKnowledgeSource(BaseKnowledgeSource):
+    """A knowledge source that stores and queries PDF file content using embeddings."""
+
+    def __init__(
+        self,
+        file_path: str,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+    ):
+        super().__init__(chunk_size, chunk_overlap)
+        self.file_path = Path(file_path)
+        self.content = self.load_content()
+
+    def _import_pdfplumber(self):
+        """Dynamically import pdfplumber."""
+        try:
+            import pdfplumber
+
+            return pdfplumber
+        except ImportError:
+            raise ImportError(
+                "pdfplumber is not installed. Please install it with: pip install pdfplumber"
+            )
+
+    def load_content(self) -> str:
+        """Load and preprocess PDF file content."""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File not found: {self.file_path}")
+        if not self.file_path.is_file():
+            raise ValueError(f"Path is not a file: {self.file_path}")
+
+        pdfplumber = self._import_pdfplumber()
+        text = ""
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+        return text
+
+    def add(self, embedder: BaseEmbedder) -> None:
+        """
+        Add PDF file content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
+        self.chunks.extend(new_chunks)
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
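Since `pdfplumber` is imported lazily inside `_import_pdfplumber`, the PDF source only needs the new optional extra at runtime (for example `pip install "crewai[pdfplumber]"`, matching the pyproject change above). A minimal sketch, where `report.pdf` is a hypothetical local file:

    from crewai import Knowledge
    from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource

    # Text is extracted page by page via pdfplumber at construction time
    pdf_source = PDFKnowledgeSource(file_path="report.pdf")  # hypothetical path

    knowledge = Knowledge(sources=[pdf_source])
    print(knowledge.query("What does the report conclude?"))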
diff --git a/src/crewai/knowledge/source/string_knowledge_source.py b/src/crewai/knowledge/source/string_knowledge_source.py
index 028bbe493..a2f423fbd 100644
--- a/src/crewai/knowledge/source/string_knowledge_source.py
+++ b/src/crewai/knowledge/source/string_knowledge_source.py
@@ -1,9 +1,11 @@
+from typing import List
+
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 
 
 class StringKnowledgeSource(BaseKnowledgeSource):
-    """A knowledge base that stores and queries plain text content using embeddings"""
+    """A knowledge source that stores and queries plain text content using embeddings."""
 
     def __init__(
         self,
@@ -15,25 +17,29 @@ class StringKnowledgeSource(BaseKnowledgeSource):
             chunk_size,
             chunk_overlap,
         )
+        self.content = content
+        self.load_content()
+
+    def load_content(self):
+        """Load and preprocess string content."""
+        if not isinstance(self.content, str):
+            raise ValueError("StringKnowledgeSource only accepts string content")
 
     def add(self, embedder: BaseEmbedder) -> None:
-        """Add text content to the knowledge base, chunk it, and compute embeddings"""
-        if not isinstance(self.content, str):
-            raise ValueError("StringKnowledgeBase only accepts string content")
-
-        # Create chunks from the text
-        new_chunks = self._chunk_text(content)
-
-        # Add chunks to the knowledge base
+        """
+        Add string content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
 
-        # Compute and store embeddings for the new chunks
-        embedder.embed_chunks(new_chunks)
-
-    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
-        """
-        Query the knowledge base using semantic search
-        Returns the most relevant chunk based on embedding similarity
-        """
-        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
-        return similar_chunks[0] if similar_chunks else ""
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
diff --git a/src/crewai/knowledge/source/text_file_knowledge_source.py b/src/crewai/knowledge/source/text_file_knowledge_source.py
index 0808319d1..8c97ae9ca 100644
--- a/src/crewai/knowledge/source/text_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/text_file_knowledge_source.py
@@ -1,9 +1,12 @@
+from pathlib import Path
+from typing import List
+
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 
 
 class TextFileKnowledgeSource(BaseKnowledgeSource):
-    """A knowledge base that stores and queries plain text content using embeddings"""
+    """A knowledge source that stores and queries text file content using embeddings."""
 
     def __init__(
         self,
@@ -11,29 +14,35 @@ class TextFileKnowledgeSource(BaseKnowledgeSource):
         chunk_size: int = 1000,
         chunk_overlap: int = 200,
     ):
-        super().__init__(
-            chunk_size,
-            chunk_overlap,
-        )
+        super().__init__(chunk_size, chunk_overlap)
+        self.file_path = Path(file_path)
+        self.content = self.load_content()
+
+    def load_content(self) -> str:
+        """Load and preprocess text file content."""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File not found: {self.file_path}")
+        if not self.file_path.is_file():
+            raise ValueError(f"Path is not a file: {self.file_path}")
+
+        with self.file_path.open("r", encoding="utf-8") as f:
+            return f.read()
 
     def add(self, embedder: BaseEmbedder) -> None:
-        """Add text content to the knowledge base, chunk it, and compute embeddings"""
-        if not isinstance(self.content, str):
-            raise ValueError("StringKnowledgeBase only accepts string content")
-
-        # Create chunks from the text
-        new_chunks = self._chunk_text(content)
-
-        # Add chunks to the knowledge base
+        """
+        Add text file content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
 
-        # Compute and store embeddings for the new chunks
-        embedder.embed_chunks(new_chunks)
-
-    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
-        """
-        Query the knowledge base using semantic search
-        Returns the most relevant chunk based on embedding similarity
-        """
-        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
-        return similar_chunks[0] if similar_chunks else ""
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
diff --git a/tests/knowledge/__init__.py b/tests/knowledge/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/knowledge/crewai_quickstart.pdf b/tests/knowledge/crewai_quickstart.pdf
new file mode 100644
index 000000000..671baf782
Binary files /dev/null and b/tests/knowledge/crewai_quickstart.pdf differ
" + "Brandon is a member of a local book club and enjoys discussing books with fellow members. " + "He is also a fan of board games and often hosts game nights at his place. " + "Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. " + "He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. " + "Brandon's favorite sport is basketball, and he often plays with his friends on weekends. " + "He is also a fan of the Golden State Warriors and enjoys watching their games. " + ) + string_source = StringKnowledgeSource(content=content) + knowledge_base = Knowledge(sources=[string_source]) + + # Perform a query + query = "What is Brandon's favorite movie?" + results = knowledge_base.query(query) + + # Assert that the results contain the expected information + assert any("inception" in result.lower() for result in results) + + +def test_multiple_short_strings(): + # Create multiple short string sources + contents = [ + "Brandon loves hiking.", + "Brandon has a dog named Max.", + "Brandon enjoys painting landscapes.", + ] + string_sources = [StringKnowledgeSource(content=content) for content in contents] + knowledge_base = Knowledge(sources=string_sources) + + # Perform a query + query = "What is the name of Brandon's pet?" + results = knowledge_base.query(query) + + # Assert that the correct information is retrieved + assert any("max" in result.lower() for result in results) + + +def test_multiple_2k_character_strings(): + # Create multiple 2k character strings with various facts about Brandon + contents = [ + ( + "Brandon is a software engineer who lives in San Francisco. " + "He enjoys hiking and often visits the trails in the Bay Area. " + "Brandon has a pet dog named Max, who is a golden retriever. " + "He loves reading science fiction books, and his favorite author is Isaac Asimov. " + "Brandon's favorite movie is Inception, and he enjoys watching it with his friends. " + "He is also a fan of Mexican cuisine, especially tacos and burritos. " + "Brandon plays the guitar and often performs at local open mic nights. " + "He is learning French and plans to visit Paris next year. " + "Brandon is passionate about technology and often attends tech meetups in the city. " + "He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. " + "Brandon's favorite color is blue, and he often wears blue shirts. " + "He enjoys cooking and often tries new recipes on weekends. " + "Brandon is a morning person and likes to start his day with a run in the park. " + "He is also a coffee enthusiast and enjoys trying different coffee blends. " + "Brandon is a member of a local book club and enjoys discussing books with fellow members. " + "He is also a fan of board games and often hosts game nights at his place. " + "Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. " + "He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. " + "Brandon's favorite sport is basketball, and he often plays with his friends on weekends. " + "He is also a fan of the Golden State Warriors and enjoys watching their games. " + ) + * 2, # Repeat to ensure it's 2k characters + ( + "Brandon loves traveling and has visited over 20 countries. " + "He is fluent in Spanish and often practices with his friends. " + "Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. 
" + "He is a foodie and loves trying new cuisines, with a particular fondness for sushi. " + "Brandon is an avid cyclist and participates in local cycling events. " + "He is also a photographer and enjoys capturing landscapes and cityscapes. " + "Brandon is a tech enthusiast and follows the latest trends in gadgets and software. " + "He is also a fan of virtual reality and owns a VR headset. " + "Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. " + "He enjoys watching documentaries and learning about history and science. " + "Brandon is a coffee lover and has a collection of coffee mugs from different countries. " + "He is also a fan of jazz music and often attends live performances. " + "Brandon is a member of a local running club and participates in marathons. " + "He is also a volunteer at a local animal shelter and helps with dog walking. " + "Brandon's favorite holiday is Christmas, and he enjoys decorating his home. " + "He is also a fan of classic movies and has a collection of DVDs. " + "Brandon is a mentor for young professionals and enjoys giving career advice. " + "He is also a fan of puzzles and enjoys solving them in his free time. " + "Brandon's favorite sport is soccer, and he often plays with his friends. " + "He is also a fan of FC Barcelona and enjoys watching their matches. " + ) + * 2, # Repeat to ensure it's 2k characters + ] + string_sources = [StringKnowledgeSource(content=content) for content in contents] + knowledge_base = Knowledge(sources=string_sources) + + # Perform a query + query = "What is Brandon's favorite book?" + results = knowledge_base.query(query) + + # Assert that the correct information is retrieved + assert any( + "the hitchhiker's guide to the galaxy" in result.lower() for result in results + ) + + +def test_single_short_file(tmpdir): + # Create a single short text file + content = "Brandon's favorite sport is basketball." + file_path = tmpdir.join("short_file.txt") + with open(file_path, "w") as f: + f.write(content) + + file_source = TextFileKnowledgeSource(file_path=str(file_path)) + knowledge_base = Knowledge(sources=[file_source]) + + # Perform a query + query = "What sport does Brandon like?" + results = knowledge_base.query(query) + + # Assert that the results contain the expected information + assert any("basketball" in result.lower() for result in results) + + +def test_single_2k_character_file(tmpdir): + # Create a single 2k character text file with various facts about Brandon + content = ( + "Brandon is a software engineer who lives in San Francisco. " + "He enjoys hiking and often visits the trails in the Bay Area. " + "Brandon has a pet dog named Max, who is a golden retriever. " + "He loves reading science fiction books, and his favorite author is Isaac Asimov. " + "Brandon's favorite movie is Inception, and he enjoys watching it with his friends. " + "He is also a fan of Mexican cuisine, especially tacos and burritos. " + "Brandon plays the guitar and often performs at local open mic nights. " + "He is learning French and plans to visit Paris next year. " + "Brandon is passionate about technology and often attends tech meetups in the city. " + "He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. " + "Brandon's favorite color is blue, and he often wears blue shirts. " + "He enjoys cooking and often tries new recipes on weekends. " + "Brandon is a morning person and likes to start his day with a run in the park. 
" + "He is also a coffee enthusiast and enjoys trying different coffee blends. " + "Brandon is a member of a local book club and enjoys discussing books with fellow members. " + "He is also a fan of board games and often hosts game nights at his place. " + "Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. " + "He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. " + "Brandon's favorite sport is basketball, and he often plays with his friends on weekends. " + "He is also a fan of the Golden State Warriors and enjoys watching their games. " + ) * 2 # Repeat to ensure it's 2k characters + file_path = tmpdir.join("long_file.txt") + with open(file_path, "w") as f: + f.write(content) + + file_source = TextFileKnowledgeSource(file_path=str(file_path)) + knowledge_base = Knowledge(sources=[file_source]) + + # Perform a query + query = "What is Brandon's favorite movie?" + results = knowledge_base.query(query) + + # Assert that the results contain the expected information + assert any("inception" in result.lower() for result in results) + + +def test_multiple_short_files(tmpdir): + # Create multiple short text files + contents = [ + "Brandon lives in New York.", + "Brandon works as a software engineer.", + "Brandon enjoys cooking Italian food.", + ] + file_paths = [] + for i, content in enumerate(contents): + file_path = tmpdir.join(f"file_{i}.txt") + with open(file_path, "w") as f: + f.write(content) + file_paths.append(str(file_path)) + + file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths] + knowledge_base = Knowledge(sources=file_sources) + + # Perform a query + query = "Where does Brandon live?" + results = knowledge_base.query(query) + + # Assert that the correct information is retrieved + assert any("new york" in result.lower() for result in results) + + +def test_multiple_2k_character_files(tmpdir): + # Create multiple 2k character text files with various facts about Brandon + contents = [ + ( + "Brandon loves traveling and has visited over 20 countries. " + "He is fluent in Spanish and often practices with his friends. " + "Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. " + "He is a foodie and loves trying new cuisines, with a particular fondness for sushi. " + "Brandon is an avid cyclist and participates in local cycling events. " + "He is also a photographer and enjoys capturing landscapes and cityscapes. " + "Brandon is a tech enthusiast and follows the latest trends in gadgets and software. " + "He is also a fan of virtual reality and owns a VR headset. " + "Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. " + "He enjoys watching documentaries and learning about history and science. " + "Brandon is a coffee lover and has a collection of coffee mugs from different countries. " + "He is also a fan of jazz music and often attends live performances. " + "Brandon is a member of a local running club and participates in marathons. " + "He is also a volunteer at a local animal shelter and helps with dog walking. " + "Brandon's favorite holiday is Christmas, and he enjoys decorating his home. " + "He is also a fan of classic movies and has a collection of DVDs. " + "Brandon is a mentor for young professionals and enjoys giving career advice. " + "He is also a fan of puzzles and enjoys solving them in his free time. " + "Brandon's favorite sport is soccer, and he often plays with his friends. 
" + "He is also a fan of FC Barcelona and enjoys watching their matches. " + ) + * 2, # Repeat to ensure it's 2k characters + ( + "Brandon is a software engineer who lives in San Francisco. " + "He enjoys hiking and often visits the trails in the Bay Area. " + "Brandon has a pet dog named Max, who is a golden retriever. " + "He loves reading science fiction books, and his favorite author is Isaac Asimov. " + "Brandon's favorite movie is Inception, and he enjoys watching it with his friends. " + "He is also a fan of Mexican cuisine, especially tacos and burritos. " + "Brandon plays the guitar and often performs at local open mic nights. " + "He is learning French and plans to visit Paris next year. " + "Brandon is passionate about technology and often attends tech meetups in the city. " + "He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. " + "Brandon's favorite color is blue, and he often wears blue shirts. " + "He enjoys cooking and often tries new recipes on weekends. " + "Brandon is a morning person and likes to start his day with a run in the park. " + "He is also a coffee enthusiast and enjoys trying different coffee blends. " + "Brandon is a member of a local book club and enjoys discussing books with fellow members. " + "He is also a fan of board games and often hosts game nights at his place. " + "Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. " + "He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. " + "Brandon's favorite sport is basketball, and he often plays with his friends on weekends. " + "He is also a fan of the Golden State Warriors and enjoys watching their games. " + ) + * 2, # Repeat to ensure it's 2k characters + ] + file_paths = [] + for i, content in enumerate(contents): + file_path = tmpdir.join(f"long_file_{i}.txt") + with open(file_path, "w") as f: + f.write(content) + file_paths.append(str(file_path)) + + file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths] + knowledge_base = Knowledge(sources=file_sources) + + # Perform a query + query = "What is Brandon's favorite book?" + results = knowledge_base.query(query) + + # Assert that the correct information is retrieved + assert any( + "the hitchhiker's guide to the galaxy" in result.lower() for result in results + ) + + +def test_hybrid_string_and_files(tmpdir): + # Create string sources + string_contents = [ + "Brandon is learning French.", + "Brandon visited Paris last summer.", + ] + string_sources = [ + StringKnowledgeSource(content=content) for content in string_contents + ] + + # Create file sources + file_contents = [ + "Brandon prefers tea over coffee.", + "Brandon's favorite book is 'The Alchemist'.", + ] + file_paths = [] + for i, content in enumerate(file_contents): + file_path = tmpdir.join(f"file_{i}.txt") + with open(file_path, "w") as f: + f.write(content) + file_paths.append(str(file_path)) + + file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths] + + # Combine string and file sources + knowledge_base = Knowledge(sources=string_sources + file_sources) + + # Perform a query + query = "What is Brandon's favorite book?" 
+    results = knowledge_base.query(query)
+
+    # Assert that the correct information is retrieved
+    assert any("the alchemist" in result.lower() for result in results)
+
+
+def test_pdf_knowledge_source():
+    # Get the directory of the current file
+    current_dir = os.path.dirname(__file__)
+    # Construct the path to the PDF file
+    pdf_path = os.path.join(current_dir, "crewai_quickstart.pdf")
+
+    # Create a PDFKnowledgeSource
+    pdf_source = PDFKnowledgeSource(file_path=pdf_path)
+    knowledge_base = Knowledge(sources=[pdf_source])
+
+    # Perform a query
+    query = "How do you create a crew?"
+    results = knowledge_base.query(query)
+
+    print("Results from querying PDFKnowledgeSource:", results)
+    # Assert that the correct information is retrieved
+    assert any(
+        "crewai create crew latest-ai-development" in result.lower()
+        for result in results
+    )
name = "pdfminer-six" }, + { name = "pillow" }, + { name = "pypdfium2" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/f0/457bda3629dfa5b01c645519fe30230e1739751f6645e23fca2dabf6c2e5/pdfplumber-0.11.4.tar.gz", hash = "sha256:147b55cde2351fcb9523b46b09cc771eea3602faecfb60d463c6bf951694fbe8", size = 113305 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/87/415cb472981a8d2e36beeeadf074ebb686cc2bfe8d18de973232da291bd5/pdfplumber-0.11.4-py3-none-any.whl", hash = "sha256:6150f0678c7aaba974ac09839c17475d6c0c4d126b5f92cb85154885f31c6d73", size = 59182 }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -3546,6 +3583,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/8f/9bbf22ba6a00001a45dbc54337e5bbbd43e7d8f34c8158c92cddc45736af/pypdf-5.0.1-py3-none-any.whl", hash = "sha256:ff8a32da6c7a63fea9c32fa4dd837cdd0db7966adf6c14f043e3f12592e992db", size = 294470 }, ] +[[package]] +name = "pypdfium2" +version = "4.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254 }, + { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624 }, + { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126 }, + { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077 }, + { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431 }, + { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008 }, + { url = "https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543 }, + { url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911 }, + { url = 
"https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430 }, + { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951 }, + { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098 }, + { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118 }, +] + [[package]] name = "pypika" version = "0.48.9"