Adding core knowledge sources

Brandon Hancock
2024-11-06 12:33:55 -05:00
parent a8a2f80616
commit 1a35114c08
15 changed files with 645 additions and 155 deletions
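
For orientation, here is a minimal usage sketch of the API this commit introduces, mirroring the calls exercised in the new tests below. It assumes the optional fastembed extra is installed so the default FastEmbed embedder can load; the example string is illustrative.

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

# Knowledge.__init__ eagerly calls add() on every source, so chunking and
# embedding happen at construction time.
source = StringKnowledgeSource(content="Brandon's favorite color is blue.")
knowledge_base = Knowledge(sources=[source])

# query() embeds the question and returns the top_k most similar chunks.
print(knowledge_base.query("What is Brandon's favorite color?", top_k=3))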

View File

@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder


class BaseKnowledgeSource(ABC):
    """Abstract base class for different types of knowledge sources."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.chunks: List[str] = []

    @abstractmethod
    def load_content(self):
        """Load and preprocess content from the source."""
        pass

    @abstractmethod
    def add(self, embedder: BaseEmbedder) -> None:
        """Add content to the knowledge base, chunk it, and compute embeddings."""
        pass

    @abstractmethod
    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
        """Query the knowledge base using semantic search."""
        pass

View File

@@ -29,7 +29,6 @@ dependencies = [
     "tomli-w>=1.1.0",
     "chromadb>=0.4.24",
     "tomli>=2.0.2",
-    "fastembed>=0.4.1",
 ]

 [project.urls]
@@ -40,6 +39,10 @@ Repository = "https://github.com/crewAIInc/crewAI"
 [project.optional-dependencies]
 tools = ["crewai-tools>=0.13.4"]
 agentops = ["agentops>=0.3.0"]
+fastembed = ["fastembed>=0.4.1"]
+pdfplumber = [
+    "pdfplumber>=0.11.4",
+]

 [tool.uv]
 dev-dependencies = [
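
fastembed and pdfplumber now ship as opt-in extras instead of hard dependencies (pip install "crewai[fastembed,pdfplumber]"). A sketch of the deferred-import guard this implies for consumers; the helper name _require_fastembed is illustrative, though PDFKnowledgeSource below uses the same pattern:

def _require_fastembed():
    """Hypothetical guard: import fastembed only when it is actually needed."""
    try:
        import fastembed

        return fastembed
    except ImportError as e:
        raise ImportError(
            'fastembed is not installed. Install it with: pip install "crewai[fastembed]"'
        ) from e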

View File

@@ -1,7 +1,9 @@
 import warnings
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.flow.flow import Flow
+from crewai.knowledge.knowledge import Knowledge
 from crewai.llm import LLM
 from crewai.pipeline import Pipeline
 from crewai.process import Process
@@ -15,4 +17,14 @@ warnings.filterwarnings(
     module="pydantic.main",
 )
 __version__ = "0.76.9"
-__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
+__all__ = [
+    "Agent",
+    "Crew",
+    "Process",
+    "Task",
+    "Pipeline",
+    "Router",
+    "LLM",
+    "Flow",
+    "Knowledge",
+]

View File

@@ -47,7 +47,7 @@ class FastEmbed(BaseEmbedder):
             cache_dir=str(cache_dir) if cache_dir else None,
         )

-    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
+    def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
         """
         Generate embeddings for a list of text chunks
@@ -55,13 +55,12 @@ class FastEmbed(BaseEmbedder):
             chunks: List of text chunks to embed

         Returns:
-            Array of embeddings
+            List of embeddings
         """
-        # FastEmbed returns a generator, convert to list then numpy array
         embeddings = list(self.model.embed(chunks))
-        return np.array(embeddings)
+        return embeddings

-    def embed_texts(self, texts: List[str]) -> np.ndarray:
+    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
         """
         Generate embeddings for a list of texts
@@ -69,11 +68,10 @@ class FastEmbed(BaseEmbedder):
             texts: List of texts to embed

         Returns:
-            Array of embeddings
+            List of embeddings
         """
-        # FastEmbed returns a generator, convert to list then numpy array
         embeddings = list(self.model.embed(texts))
-        return np.array(embeddings)
+        return embeddings

     def embed_text(self, text: str) -> np.ndarray:
         """

View File

@@ -1,21 +1,53 @@
-from typing import List, Optional
+from typing import List

-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict, Field

-from .embedder.base_embedder import BaseEmbedder
-from .embedder.fastembed import FastEmbed
-from .source.base_knowledge_source import BaseKnowledgeSource
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+from crewai.knowledge.embedder.fastembed import FastEmbed
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


 class Knowledge(BaseModel):
-    sources: Optional[List[BaseKnowledgeSource]] = None
-    embedder: BaseEmbedder
-
-    def __init__(
-        self,
-        sources: Optional[List[BaseKnowledgeSource]] = None,
-        embedder: Optional[BaseEmbedder] = None,
-    ):
-        super().__init__()
-        self.sources = sources or []
-        self.embedder = embedder or FastEmbed()
+    sources: List[BaseKnowledgeSource] = Field(default_factory=list)
+    embedder: BaseEmbedder = Field(default_factory=FastEmbed)
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Call add on all sources during initialization
+        for source in self.sources:
+            source.add(self.embedder)
+
+    def query(self, query: str, top_k: int = 3) -> List[str]:
+        """
+        Query across all knowledge sources to find the most relevant information.
+        Returns the top_k most relevant chunks.
+        """
+        if not self.sources:
+            return []
+
+        # Collect all chunks and embeddings from all sources
+        all_chunks = []
+        all_embeddings = []
+        for source in self.sources:
+            all_chunks.extend(source.chunks)
+            all_embeddings.extend(source.get_embeddings())
+
+        # Embed the query
+        query_embedding = self.embedder.embed_text(query)
+
+        # Calculate similarities
+        similarities = []
+        for idx, embedding in enumerate(all_embeddings):
+            similarity = query_embedding.dot(embedding)
+            similarities.append((similarity, idx))
+
+        # Sort by similarity
+        similarities.sort(reverse=True, key=lambda x: x[0])
+
+        # Get top_k results
+        top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
+        return top_chunks
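
A standalone sketch of the ranking query() performs, using plain NumPy and made-up two-dimensional vectors. One caveat: a raw dot product only equals cosine similarity when the embedder returns unit-length vectors, which this code implicitly assumes.

import numpy as np

chunks = ["chunk a", "chunk b", "chunk c"]
embeddings = [np.array([0.6, 0.8]), np.array([1.0, 0.0]), np.array([0.0, 1.0])]
query_embedding = np.array([0.8, 0.6])  # stand-in for embedder.embed_text(query)

# Score every chunk by dot product against the query, exactly as query() does.
similarities = [(query_embedding.dot(e), i) for i, e in enumerate(embeddings)]
similarities.sort(reverse=True, key=lambda x: x[0])
print([chunks[i] for _, i in similarities[:2]])  # ['chunk a', 'chunk b']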

View File

View File

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import List

 import numpy as np
@@ -7,7 +7,7 @@ from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 class BaseKnowledgeSource(ABC):
-    """Abstract base class for knowledge bases"""
+    """Abstract base class for knowledge sources."""

     def __init__(
         self,
@@ -17,96 +17,25 @@ class BaseKnowledgeSource(ABC):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.chunks: List[str] = []
-        self.chunk_embeddings: Dict[int, np.ndarray] = {}
+        self.chunk_embeddings: List[np.ndarray] = []

     @abstractmethod
-    def query(self, query: str) -> str:
-        """Query the knowledge base and return relevant information"""
+    def load_content(self):
+        """Load and preprocess content from the source."""
         pass

     @abstractmethod
-    def add(self, content: Any) -> None:
-        """Process and store content in the knowledge base"""
+    def add(self, embedder: BaseEmbedder) -> None:
+        """Process content, chunk it, compute embeddings, and save them."""
         pass

-    def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None:
-        """Embed chunks and store them"""
-        if not new_chunks:
-            return
-        # Get embeddings for new chunks
-        embeddings = embedder.embed_texts(new_chunks)
-        # Store embeddings with their corresponding chunks
-        start_idx = len(self.chunks)
-        for i, embedding in enumerate(embeddings):
-            self.chunk_embeddings[start_idx + i] = embedding
+    def get_embeddings(self) -> List[np.ndarray]:
+        """Return the list of embeddings for the chunks."""
+        return self.chunk_embeddings

     def _chunk_text(self, text: str) -> List[str]:
-        """Split text into chunks with overlap"""
-        chunks = []
-        start = 0
-        text_length = len(text)
-
-        while start < text_length:
-            # Get the chunk of size chunk_size
-            end = start + self.chunk_size
-            if end >= text_length:
-                # If we're at the end, just take the rest
-                chunks.append(text[start:].strip())
-                break
-
-            # Look for a good breaking point
-            # Priority: double newline > single newline > period > space
-            break_chars = ["\n\n", "\n", ". ", " "]
-            chunk_end = end
-            for break_char in break_chars:
-                # Look for the break_char in a window around the end point
-                window_start = max(start + self.chunk_size - 100, start)
-                window_end = min(start + self.chunk_size + 100, text_length)
-                window_text = text[window_start:window_end]
-                # Find the last occurrence of the break_char in the window
-                last_break = window_text.rfind(break_char)
-                if last_break != -1:
-                    chunk_end = window_start + last_break + len(break_char)
-                    break
-
-            # Add the chunk
-            chunk = text[start:chunk_end].strip()
-            if chunk:  # Only add non-empty chunks
-                chunks.append(chunk)
-
-            # Move the start pointer, accounting for overlap
-            start = max(
-                start + self.chunk_size - self.chunk_overlap,
-                chunk_end - self.chunk_overlap,
-            )
-
-        return chunks
-
-    def _find_similar_chunks(
-        self, embedder: BaseEmbedder, query: str, top_k: int = 3
-    ) -> List[str]:
-        """Find the most similar chunks to a query using embeddings"""
-        if not self.chunks:
-            return []
-        # Get query embedding
-        query_embedding = embedder.embed_text(query)
-        # Calculate similarities with all chunks
-        similarities = []
-        for idx, chunk_embedding in self.chunk_embeddings.items():
-            similarity = np.dot(query_embedding, chunk_embedding)
-            similarities.append((similarity, idx))
-        # Sort by similarity and get top_k chunks
-        similarities.sort(reverse=True)
-        top_chunks = []
-        for _, idx in similarities[:top_k]:
-            top_chunks.append(self.chunks[idx])
-        return top_chunks
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
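
The rewritten _chunk_text drops the old boundary-aware splitter (double newline, newline, sentence end, space) in favor of fixed-width slices whose start advances by chunk_size - chunk_overlap characters. A toy run with small sizes makes the overlap visible:

# Same list comprehension as _chunk_text, with toy sizes for readability.
chunk_size, chunk_overlap = 10, 3
text = "abcdefghijklmnopqrstuvwxyz"
chunks = [
    text[i : i + chunk_size]
    for i in range(0, len(text), chunk_size - chunk_overlap)
]
print(chunks)  # ['abcdefghij', 'hijklmnopq', 'opqrstuvwx', 'vwxyz']
# Consecutive chunks share chunk_overlap (here 3) characters, e.g. "hij".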

View File

@@ -0,0 +1,65 @@
from pathlib import Path
from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


class PDFKnowledgeSource(BaseKnowledgeSource):
    """A knowledge source that stores and queries PDF file content using embeddings."""

    def __init__(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        super().__init__(chunk_size, chunk_overlap)
        self.file_path = Path(file_path)
        self.content = self.load_content()

    def _import_pdfplumber(self):
        """Dynamically import pdfplumber."""
        try:
            import pdfplumber

            return pdfplumber
        except ImportError:
            raise ImportError(
                "pdfplumber is not installed. Please install it with: pip install pdfplumber"
            )

    def load_content(self) -> str:
        """Load and preprocess PDF file content."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")
        if not self.file_path.is_file():
            raise ValueError(f"Path is not a file: {self.file_path}")

        pdfplumber = self._import_pdfplumber()

        text = ""
        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        return text

    def add(self, embedder: BaseEmbedder) -> None:
        """
        Add PDF file content to the knowledge source, chunk it, compute embeddings,
        and save the embeddings.
        """
        new_chunks = self._chunk_text(self.content)
        self.chunks.extend(new_chunks)
        # Compute embeddings for the new chunks
        new_embeddings = embedder.embed_chunks(new_chunks)
        # Save the embeddings
        self.chunk_embeddings.extend(new_embeddings)

    def _chunk_text(self, text: str) -> List[str]:
        """Utility method to split text into chunks."""
        return [
            text[i : i + self.chunk_size]
            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
        ]
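
A hypothetical end-to-end use of the new PDF source; the file name manual.pdf is illustrative, and both the pdfplumber and fastembed extras must be installed:

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource

pdf_source = PDFKnowledgeSource(file_path="manual.pdf")  # raises FileNotFoundError if missing
knowledge_base = Knowledge(sources=[pdf_source])  # extracts, chunks, and embeds every page
print(knowledge_base.query("How do I get started?"))  # top 3 chunks by default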

View File

@@ -1,9 +1,11 @@
+from typing import List
+
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


 class StringKnowledgeSource(BaseKnowledgeSource):
-    """A knowledge base that stores and queries plain text content using embeddings"""
+    """A knowledge source that stores and queries plain text content using embeddings."""

     def __init__(
         self,
@@ -15,25 +17,29 @@ class StringKnowledgeSource(BaseKnowledgeSource):
             chunk_size,
             chunk_overlap,
         )
+        self.content = content
+        self.load_content()
+
+    def load_content(self):
+        """Load and preprocess string content."""
+        if not isinstance(self.content, str):
+            raise ValueError("StringKnowledgeSource only accepts string content")

     def add(self, embedder: BaseEmbedder) -> None:
-        """Add text content to the knowledge base, chunk it, and compute embeddings"""
-        if not isinstance(self.content, str):
-            raise ValueError("StringKnowledgeBase only accepts string content")
-
-        # Create chunks from the text
-        new_chunks = self._chunk_text(content)
-        # Add chunks to the knowledge base
+        """
+        Add string content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
-
-        # Compute and store embeddings for the new chunks
-        embedder.embed_chunks(new_chunks)
-
-    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
-        """
-        Query the knowledge base using semantic search
-        Returns the most relevant chunk based on embedding similarity
-        """
-        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
-        return similar_chunks[0] if similar_chunks else ""
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]

View File

@@ -1,9 +1,12 @@
+from pathlib import Path
+from typing import List
+
 from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


 class TextFileKnowledgeSource(BaseKnowledgeSource):
-    """A knowledge base that stores and queries plain text content using embeddings"""
+    """A knowledge source that stores and queries text file content using embeddings."""

     def __init__(
         self,
@@ -11,29 +14,35 @@ class TextFileKnowledgeSource(BaseKnowledgeSource):
         chunk_size: int = 1000,
         chunk_overlap: int = 200,
     ):
-        super().__init__(
-            chunk_size,
-            chunk_overlap,
-        )
+        super().__init__(chunk_size, chunk_overlap)
+        self.file_path = Path(file_path)
+        self.content = self.load_content()
+
+    def load_content(self) -> str:
+        """Load and preprocess text file content."""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File not found: {self.file_path}")
+        if not self.file_path.is_file():
+            raise ValueError(f"Path is not a file: {self.file_path}")
+        with self.file_path.open("r", encoding="utf-8") as f:
+            return f.read()

     def add(self, embedder: BaseEmbedder) -> None:
-        """Add text content to the knowledge base, chunk it, and compute embeddings"""
-        if not isinstance(self.content, str):
-            raise ValueError("StringKnowledgeBase only accepts string content")
-
-        # Create chunks from the text
-        new_chunks = self._chunk_text(content)
-        # Add chunks to the knowledge base
+        """
+        Add text file content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
-
-        # Compute and store embeddings for the new chunks
-        embedder.embed_chunks(new_chunks)
-
-    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
-        """
-        Query the knowledge base using semantic search
-        Returns the most relevant chunk based on embedding similarity
-        """
-        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
-        return similar_chunks[0] if similar_chunks else ""
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]

View File

Binary file not shown.

View File

@@ -0,0 +1,347 @@
"""Test Knowledge creation and querying functionality."""
import os
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
def test_single_short_string():
# Create a knowledge base with a single short string
content = "Brandon's favorite color is blue and he likes Mexican food."
string_source = StringKnowledgeSource(content=content)
knowledge_base = Knowledge(sources=[string_source])
# Perform a query
query = "What is Brandon's favorite color?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("blue" in result.lower() for result in results)
def test_single_2k_character_string():
# Create a 2k character string with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
string_source = StringKnowledgeSource(content=content)
knowledge_base = Knowledge(sources=[string_source])
# Perform a query
query = "What is Brandon's favorite movie?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("inception" in result.lower() for result in results)
def test_multiple_short_strings():
# Create multiple short string sources
contents = [
"Brandon loves hiking.",
"Brandon has a dog named Max.",
"Brandon enjoys painting landscapes.",
]
string_sources = [StringKnowledgeSource(content=content) for content in contents]
knowledge_base = Knowledge(sources=string_sources)
# Perform a query
query = "What is the name of Brandon's pet?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("max" in result.lower() for result in results)
def test_multiple_2k_character_strings():
# Create multiple 2k character strings with various facts about Brandon
contents = [
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2, # Repeat to ensure it's 2k characters
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2, # Repeat to ensure it's 2k characters
]
string_sources = [StringKnowledgeSource(content=content) for content in contents]
knowledge_base = Knowledge(sources=string_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result.lower() for result in results
)
def test_single_short_file(tmpdir):
# Create a single short text file
content = "Brandon's favorite sport is basketball."
file_path = tmpdir.join("short_file.txt")
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
query = "What sport does Brandon like?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("basketball" in result.lower() for result in results)
def test_single_2k_character_file(tmpdir):
# Create a single 2k character text file with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
) * 2 # Repeat to ensure it's 2k characters
file_path = tmpdir.join("long_file.txt")
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
query = "What is Brandon's favorite movie?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("inception" in result.lower() for result in results)
def test_multiple_short_files(tmpdir):
# Create multiple short text files
contents = [
"Brandon lives in New York.",
"Brandon works as a software engineer.",
"Brandon enjoys cooking Italian food.",
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
# Perform a query
query = "Where does Brandon live?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("new york" in result.lower() for result in results)
def test_multiple_2k_character_files(tmpdir):
# Create multiple 2k character text files with various facts about Brandon
contents = [
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2, # Repeat to ensure it's 2k characters
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2, # Repeat to ensure it's 2k characters
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"long_file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result.lower() for result in results
)
def test_hybrid_string_and_files(tmpdir):
# Create string sources
string_contents = [
"Brandon is learning French.",
"Brandon visited Paris last summer.",
]
string_sources = [
StringKnowledgeSource(content=content) for content in string_contents
]
# Create file sources
file_contents = [
"Brandon prefers tea over coffee.",
"Brandon's favorite book is 'The Alchemist'.",
]
file_paths = []
for i, content in enumerate(file_contents):
file_path = tmpdir.join(f"file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
# Combine string and file sources
knowledge_base = Knowledge(sources=string_sources + file_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("the alchemist" in result.lower() for result in results)
def test_pdf_knowledge_source():
# Get the directory of the current file
current_dir = os.path.dirname(__file__)
# Construct the path to the PDF file
pdf_path = os.path.join(current_dir, "crewai_quickstart.pdf")
# Create a PDFKnowledgeSource
pdf_source = PDFKnowledgeSource(file_path=pdf_path)
knowledge_base = Knowledge(sources=[pdf_source])
# Perform a query
query = "How do you create a crew?"
results = knowledge_base.query(query)
print("Results from querying PDFKnowledgeSource:", results)
# Assert that the correct information is retrieved
assert any(
"crewai create crew latest-ai-development" in result.lower()
for result in results
)

uv.lock generated
View File

@@ -612,7 +612,6 @@ dependencies = [
{ name = "chromadb" }, { name = "chromadb" },
{ name = "click" }, { name = "click" },
{ name = "crewai-tools" }, { name = "crewai-tools" },
{ name = "fastembed" },
{ name = "instructor" }, { name = "instructor" },
{ name = "json-repair" }, { name = "json-repair" },
{ name = "jsonref" }, { name = "jsonref" },
@@ -635,6 +634,15 @@ dependencies = [
 agentops = [
     { name = "agentops" },
 ]
+fastembed = [
+    { name = "fastembed" },
+]
+network = [
+    { name = "pdfplumber" },
+]
+pdfplumber = [
+    { name = "pdfplumber" },
+]
 tools = [
     { name = "crewai-tools" },
 ]
@@ -668,7 +676,7 @@ requires-dist = [
     { name = "click", specifier = ">=8.1.7" },
     { name = "crewai-tools", specifier = ">=0.13.4" },
     { name = "crewai-tools", marker = "extra == 'tools'", specifier = ">=0.13.4" },
-    { name = "fastembed", specifier = ">=0.4.1" },
+    { name = "fastembed", marker = "extra == 'fastembed'", specifier = ">=0.4.1" },
     { name = "instructor", specifier = ">=1.3.3" },
     { name = "json-repair", specifier = ">=0.25.2" },
     { name = "jsonref", specifier = ">=1.1.0" },
@@ -678,6 +686,8 @@ requires-dist = [
     { name = "opentelemetry-api", specifier = ">=1.22.0" },
     { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.22.0" },
     { name = "opentelemetry-sdk", specifier = ">=1.22.0" },
+    { name = "pdfplumber", marker = "extra == 'network'", specifier = ">=0.11.4" },
+    { name = "pdfplumber", marker = "extra == 'pdfplumber'", specifier = ">=0.11.4" },
     { name = "pydantic", specifier = ">=2.4.2" },
     { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "pyvis", specifier = ">=0.3.2" },
@@ -2975,6 +2985,33 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191 },
 ]

+[[package]]
+name = "pdfminer-six"
+version = "20231228"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "charset-normalizer" },
+    { name = "cryptography" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/31/b1/a43e3bd872ded4deea4f8efc7aff1703fca8c5455d0c06e20506a06a44ff/pdfminer.six-20231228.tar.gz", hash = "sha256:6004da3ad1a7a4d45930cb950393df89b068e73be365a6ff64a838d37bcb08c4", size = 7362505 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/eb/9c/e46fe7502b32d7db6af6e36a9105abb93301fa1ec475b5ddcba8b35ae23a/pdfminer.six-20231228-py3-none-any.whl", hash = "sha256:e8d3c3310e6fbc1fe414090123ab01351634b4ecb021232206c4c9a8ca3e3b8f", size = 5614515 },
+]
+
+[[package]]
+name = "pdfplumber"
+version = "0.11.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pdfminer-six" },
+    { name = "pillow" },
+    { name = "pypdfium2" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ca/f0/457bda3629dfa5b01c645519fe30230e1739751f6645e23fca2dabf6c2e5/pdfplumber-0.11.4.tar.gz", hash = "sha256:147b55cde2351fcb9523b46b09cc771eea3602faecfb60d463c6bf951694fbe8", size = 113305 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/87/415cb472981a8d2e36beeeadf074ebb686cc2bfe8d18de973232da291bd5/pdfplumber-0.11.4-py3-none-any.whl", hash = "sha256:6150f0678c7aaba974ac09839c17475d6c0c4d126b5f92cb85154885f31c6d73", size = 59182 },
+]
+
 [[package]]
 name = "pexpect"
 version = "4.9.0"
@@ -3546,6 +3583,26 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/48/8f/9bbf22ba6a00001a45dbc54337e5bbbd43e7d8f34c8158c92cddc45736af/pypdf-5.0.1-py3-none-any.whl", hash = "sha256:ff8a32da6c7a63fea9c32fa4dd837cdd0db7966adf6c14f043e3f12592e992db", size = 294470 },
 ]

+[[package]]
+name = "pypdfium2"
+version = "4.30.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254 },
+    { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624 },
+    { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126 },
+    { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077 },
+    { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431 },
+    { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008 },
+    { url = "https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543 },
+    { url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911 },
+    { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430 },
+    { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951 },
+    { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098 },
+    { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118 },
+]
+
 [[package]]
 name = "pypika"
 version = "0.48.9"