Adding core knowledge sources

Brandon Hancock committed 2024-11-06 12:33:55 -05:00
parent a8a2f80616
commit 1a35114c08
15 changed files with 645 additions and 155 deletions

View File

@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeSource(ABC):
"""Abstract base class for different types of knowledge sources."""
def __init__(
self,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.chunks: List[str] = []
@abstractmethod
def load_content(self):
"""Load and preprocess content from the source."""
pass
@abstractmethod
def add(self, embedder: BaseEmbedder) -> None:
"""Add content to the knowledge base, chunk it, and compute embeddings."""
pass
@abstractmethod
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""Query the knowledge base using semantic search."""
pass
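
For orientation, a concrete source only needs to implement the three abstract methods above. A minimal in-memory sketch (the class name and its backing list are illustrative, not part of this commit):

from typing import List

from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource

class ListKnowledgeSource(BaseKnowledgeSource):
    """Hypothetical source backed by an in-memory list of strings."""

    def __init__(self, items: List[str], **kwargs):
        super().__init__(**kwargs)
        self.items = items

    def load_content(self) -> str:
        return "\n".join(self.items)

    def add(self, embedder: BaseEmbedder) -> None:
        # A real source would also compute and store embeddings here.
        self.chunks.extend(self.items)

    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
        # Toy lexical lookup; the shipped sources use embedding similarity.
        matches = [c for c in self.chunks if query.lower() in c.lower()]
        return matches[0] if matches else ""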

View File

@@ -29,7 +29,6 @@ dependencies = [
"tomli-w>=1.1.0",
"chromadb>=0.4.24",
"tomli>=2.0.2",
"fastembed>=0.4.1",
]
[project.urls]
@@ -40,6 +39,10 @@ Repository = "https://github.com/crewAIInc/crewAI"
[project.optional-dependencies]
tools = ["crewai-tools>=0.13.4"]
agentops = ["agentops>=0.3.0"]
fastembed = ["fastembed>=0.4.1"]
pdfplumber = [
"pdfplumber>=0.11.4",
]
[tool.uv]
dev-dependencies = [
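
The net effect of the pyproject change: fastembed moves from a core dependency to an opt-in extra, and pdfplumber is added as another extra. Assuming standard pip extras syntax, users who want the default embedder plus PDF support would install both:

pip install "crewai[fastembed,pdfplumber]"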

View File

@@ -1,7 +1,9 @@
import warnings
from crewai.agent import Agent
from crewai.crew import Crew
from crewai.flow.flow import Flow
from crewai.knowledge.knowledge import Knowledge
from crewai.llm import LLM
from crewai.pipeline import Pipeline
from crewai.process import Process
@@ -15,4 +17,14 @@ warnings.filterwarnings(
module="pydantic.main",
)
__version__ = "0.76.9"
__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
__all__ = [
"Agent",
"Crew",
"Process",
"Task",
"Pipeline",
"Router",
"LLM",
"Flow",
"Knowledge",
]
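
With the new export, Knowledge can be imported from the package root like the other top-level classes:

from crewai import Knowledge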

View File

@@ -47,7 +47,7 @@ class FastEmbed(BaseEmbedder):
cache_dir=str(cache_dir) if cache_dir else None,
)
def embed_chunks(self, chunks: List[str]) -> np.ndarray:
def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
"""
Generate embeddings for a list of text chunks
@@ -55,13 +55,12 @@ class FastEmbed(BaseEmbedder):
chunks: List of text chunks to embed
Returns:
Array of embeddings
List of embeddings
"""
# FastEmbed returns a generator, convert to list then numpy array
embeddings = list(self.model.embed(chunks))
return np.array(embeddings)
return embeddings
def embed_texts(self, texts: List[str]) -> np.ndarray:
def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
"""
Generate embeddings for a list of texts
@@ -69,11 +68,10 @@ class FastEmbed(BaseEmbedder):
texts: List of texts to embed
Returns:
Array of embeddings
List of embeddings
"""
# FastEmbed returns a generator, convert to list then numpy array
embeddings = list(self.model.embed(texts))
return np.array(embeddings)
return embeddings
def embed_text(self, text: str) -> np.ndarray:
"""

View File

@@ -1,21 +1,53 @@
from typing import List, Optional
from typing import List
from pydantic import BaseModel
from pydantic import BaseModel, ConfigDict, Field
from .embedder.base_embedder import BaseEmbedder
from .embedder.fastembed import FastEmbed
from .source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.embedder.fastembed import FastEmbed
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class Knowledge(BaseModel):
sources: Optional[List[BaseKnowledgeSource]] = None
embedder: BaseEmbedder
sources: List[BaseKnowledgeSource] = Field(default_factory=list)
embedder: BaseEmbedder = Field(default_factory=FastEmbed)
def __init__(
self,
sources: Optional[List[BaseKnowledgeSource]] = None,
embedder: Optional[BaseEmbedder] = None,
):
super().__init__()
self.sources = sources or []
self.embedder = embedder or FastEmbed()
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, **data):
super().__init__(**data)
# Call add on all sources during initialization
for source in self.sources:
source.add(self.embedder)
def query(self, query: str, top_k: int = 3) -> List[str]:
"""
Query across all knowledge sources to find the most relevant information.
Returns the top_k most relevant chunks.
"""
if not self.sources:
return []
# Collect all chunks and embeddings from all sources
all_chunks = []
all_embeddings = []
for source in self.sources:
all_chunks.extend(source.chunks)
all_embeddings.extend(source.get_embeddings())
# Embed the query
query_embedding = self.embedder.embed_text(query)
# Calculate similarities
similarities = []
for idx, embedding in enumerate(all_embeddings):
similarity = query_embedding.dot(embedding)
similarities.append((similarity, idx))
# Sort by similarity
similarities.sort(reverse=True, key=lambda x: x[0])
# Get top_k results
top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
return top_chunks
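
Two things worth noting about query: sources are embedded eagerly (each source.add(embedder) runs at construction time), and similarity is a raw dot product, which equals cosine similarity only when the embedder returns unit-normalized vectors. A minimal usage sketch, assuming the fastembed extra is installed (FastEmbed is the default embedder):

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

source = StringKnowledgeSource(content="Brandon's favorite color is blue.")
kb = Knowledge(sources=[source])  # embeds the source up front
print(kb.query("What is Brandon's favorite color?", top_k=1))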

View File

View File

@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from typing import List
import numpy as np
@@ -7,7 +7,7 @@ from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeSource(ABC):
"""Abstract base class for knowledge bases"""
"""Abstract base class for knowledge sources."""
def __init__(
self,
@@ -17,96 +17,25 @@ class BaseKnowledgeSource(ABC):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.chunks: List[str] = []
self.chunk_embeddings: Dict[int, np.ndarray] = {}
self.chunk_embeddings: List[np.ndarray] = []
@abstractmethod
def query(self, query: str) -> str:
"""Query the knowledge base and return relevant information"""
def load_content(self):
"""Load and preprocess content from the source."""
pass
@abstractmethod
def add(self, content: Any) -> None:
"""Process and store content in the knowledge base"""
def add(self, embedder: BaseEmbedder) -> None:
"""Process content, chunk it, compute embeddings, and save them."""
pass
def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None:
"""Embed chunks and store them"""
if not new_chunks:
return
# Get embeddings for new chunks
embeddings = embedder.embed_texts(new_chunks)
# Store embeddings with their corresponding chunks
start_idx = len(self.chunks)
for i, embedding in enumerate(embeddings):
self.chunk_embeddings[start_idx + i] = embedding
def get_embeddings(self) -> List[np.ndarray]:
"""Return the list of embeddings for the chunks."""
return self.chunk_embeddings
def _chunk_text(self, text: str) -> List[str]:
"""Split text into chunks with overlap"""
chunks = []
start = 0
text_length = len(text)
while start < text_length:
# Get the chunk of size chunk_size
end = start + self.chunk_size
if end >= text_length:
# If we're at the end, just take the rest
chunks.append(text[start:].strip())
break
# Look for a good breaking point
# Priority: double newline > single newline > period > space
break_chars = ["\n\n", "\n", ". ", " "]
chunk_end = end
for break_char in break_chars:
# Look for the break_char in a window around the end point
window_start = max(start + self.chunk_size - 100, start)
window_end = min(start + self.chunk_size + 100, text_length)
window_text = text[window_start:window_end]
# Find the last occurrence of the break_char in the window
last_break = window_text.rfind(break_char)
if last_break != -1:
chunk_end = window_start + last_break + len(break_char)
break
# Add the chunk
chunk = text[start:chunk_end].strip()
if chunk: # Only add non-empty chunks
chunks.append(chunk)
# Move the start pointer, accounting for overlap
start = max(
start + self.chunk_size - self.chunk_overlap,
chunk_end - self.chunk_overlap,
)
return chunks
def _find_similar_chunks(
self, embedder: BaseEmbedder, query: str, top_k: int = 3
) -> List[str]:
"""Find the most similar chunks to a query using embeddings"""
if not self.chunks:
return []
# Get query embedding
query_embedding = embedder.embed_text(query)
# Calculate similarities with all chunks
similarities = []
for idx, chunk_embedding in self.chunk_embeddings.items():
similarity = np.dot(query_embedding, chunk_embedding)
similarities.append((similarity, idx))
# Sort by similarity and get top_k chunks
similarities.sort(reverse=True)
top_chunks = []
for _, idx in similarities[:top_k]:
top_chunks.append(self.chunks[idx])
return top_chunks
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
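
The refactor replaces the old break-point-aware splitter with fixed sliding windows: consecutive chunks start every chunk_size - chunk_overlap characters. A quick worked example of the boundaries with the defaults:

# chunk_size=1000, chunk_overlap=200 -> windows start every 800 chars
text = "x" * 2500
starts = list(range(0, 2500, 1000 - 200))  # [0, 800, 1600, 2400]
chunks = [text[i : i + 1000] for i in starts]
# Chunk lengths: 1000, 1000, 900, 100. Each chunk repeats the last 200
# characters of its predecessor; note the final window here is entirely
# contained in the previous one, an edge case of the simple splitter.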

View File

@@ -0,0 +1,65 @@
from pathlib import Path
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class PDFKnowledgeSource(BaseKnowledgeSource):
"""A knowledge source that stores and queries PDF file content using embeddings."""
def __init__(
self,
file_path: str,
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(chunk_size, chunk_overlap)
self.file_path = Path(file_path)
self.content = self.load_content()
def _import_pdfplumber(self):
"""Dynamically import pdfplumber."""
try:
import pdfplumber
return pdfplumber
except ImportError:
raise ImportError(
"pdfplumber is not installed. Please install it with: pip install pdfplumber"
)
def load_content(self) -> str:
"""Load and preprocess PDF file content."""
if not self.file_path.exists():
raise FileNotFoundError(f"File not found: {self.file_path}")
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
pdfplumber = self._import_pdfplumber()
text = ""
with pdfplumber.open(self.file_path) as pdf:
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
return text
def add(self, embedder: BaseEmbedder) -> None:
"""
Add PDF file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]
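
A usage sketch for the new PDF source, assuming the pdfplumber extra is installed (the file path is illustrative):

from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource

# load_content() runs inside __init__, so a missing file or a missing
# pdfplumber install fails fast at construction time.
pdf_source = PDFKnowledgeSource(file_path="docs/handbook.pdf")
kb = Knowledge(sources=[pdf_source])
print(kb.query("How do you create a crew?"))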

View File

@@ -1,9 +1,11 @@
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class StringKnowledgeSource(BaseKnowledgeSource):
"""A knowledge base that stores and queries plain text content using embeddings"""
"""A knowledge source that stores and queries plain text content using embeddings."""
def __init__(
self,
@@ -15,25 +17,29 @@ class StringKnowledgeSource(BaseKnowledgeSource):
chunk_size,
chunk_overlap,
)
self.content = content
self.load_content()
def load_content(self):
"""Load and preprocess string content."""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeSource only accepts string content")
def add(self, embedder: BaseEmbedder) -> None:
"""Add text content to the knowledge base, chunk it, and compute embeddings"""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeBase only accepts string content")
# Create chunks from the text
new_chunks = self._chunk_text(content)
# Add chunks to the knowledge base
"""
Add string content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
# Compute and store embeddings for the new chunks
embedder.embed_chunks(new_chunks)
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""
Query the knowledge base using semantic search
Returns the most relevant chunk based on embedding similarity
"""
similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
return similar_chunks[0] if similar_chunks else ""
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]

View File

@@ -1,9 +1,12 @@
from pathlib import Path
from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
class TextFileKnowledgeSource(BaseKnowledgeSource):
"""A knowledge base that stores and queries plain text content using embeddings"""
"""A knowledge source that stores and queries text file content using embeddings."""
def __init__(
self,
@@ -11,29 +14,35 @@ class TextFileKnowledgeSource(BaseKnowledgeSource):
chunk_size: int = 1000,
chunk_overlap: int = 200,
):
super().__init__(
chunk_size,
chunk_overlap,
)
super().__init__(chunk_size, chunk_overlap)
self.file_path = Path(file_path)
self.content = self.load_content()
def load_content(self) -> str:
"""Load and preprocess text file content."""
if not self.file_path.exists():
raise FileNotFoundError(f"File not found: {self.file_path}")
if not self.file_path.is_file():
raise ValueError(f"Path is not a file: {self.file_path}")
with self.file_path.open("r", encoding="utf-8") as f:
return f.read()
def add(self, embedder: BaseEmbedder) -> None:
"""Add text content to the knowledge base, chunk it, and compute embeddings"""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeBase only accepts string content")
# Create chunks from the text
new_chunks = self._chunk_text(content)
# Add chunks to the knowledge base
"""
Add text file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings.
"""
new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
# Compute and store embeddings for the new chunks
embedder.embed_chunks(new_chunks)
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
"""
Query the knowledge base using semantic search
Returns the most relevant chunk based on embedding similarity
"""
similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
return similar_chunks[0] if similar_chunks else ""
def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks."""
return [
text[i : i + self.chunk_size]
for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
]

View File

Binary file not shown.

View File

@@ -0,0 +1,347 @@
"""Test Knowledge creation and querying functionality."""
import os
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
def test_single_short_string():
# Create a knowledge base with a single short string
content = "Brandon's favorite color is blue and he likes Mexican food."
string_source = StringKnowledgeSource(content=content)
knowledge_base = Knowledge(sources=[string_source])
# Perform a query
query = "What is Brandon's favorite color?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("blue" in result.lower() for result in results)
def test_single_2k_character_string():
# Create a 2k character string with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
string_source = StringKnowledgeSource(content=content)
knowledge_base = Knowledge(sources=[string_source])
# Perform a query
query = "What is Brandon's favorite movie?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("inception" in result.lower() for result in results)
def test_multiple_short_strings():
# Create multiple short string sources
contents = [
"Brandon loves hiking.",
"Brandon has a dog named Max.",
"Brandon enjoys painting landscapes.",
]
string_sources = [StringKnowledgeSource(content=content) for content in contents]
knowledge_base = Knowledge(sources=string_sources)
# Perform a query
query = "What is the name of Brandon's pet?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("max" in result.lower() for result in results)
def test_multiple_2k_character_strings():
# Create multiple 2k character strings with various facts about Brandon
contents = [
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2,  # Repeat so the text is comfortably over 2k characters
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2,  # Repeat so the text is comfortably over 2k characters
]
string_sources = [StringKnowledgeSource(content=content) for content in contents]
knowledge_base = Knowledge(sources=string_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result.lower() for result in results
)
def test_single_short_file(tmpdir):
# Create a single short text file
content = "Brandon's favorite sport is basketball."
file_path = tmpdir.join("short_file.txt")
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
query = "What sport does Brandon like?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("basketball" in result.lower() for result in results)
def test_single_2k_character_file(tmpdir):
# Create a single 2k character text file with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
) * 2  # Repeat so the text is comfortably over 2k characters
file_path = tmpdir.join("long_file.txt")
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(file_path=str(file_path))
knowledge_base = Knowledge(sources=[file_source])
# Perform a query
query = "What is Brandon's favorite movie?"
results = knowledge_base.query(query)
# Assert that the results contain the expected information
assert any("inception" in result.lower() for result in results)
def test_multiple_short_files(tmpdir):
# Create multiple short text files
contents = [
"Brandon lives in New York.",
"Brandon works as a software engineer.",
"Brandon enjoys cooking Italian food.",
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
# Perform a query
query = "Where does Brandon live?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("new york" in result.lower() for result in results)
def test_multiple_2k_character_files(tmpdir):
# Create multiple 2k character text files with various facts about Brandon
contents = [
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2,  # Repeat so the text is comfortably over 2k characters
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2,  # Repeat so the text is comfortably over 2k characters
]
file_paths = []
for i, content in enumerate(contents):
file_path = tmpdir.join(f"long_file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
knowledge_base = Knowledge(sources=file_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result.lower() for result in results
)
def test_hybrid_string_and_files(tmpdir):
# Create string sources
string_contents = [
"Brandon is learning French.",
"Brandon visited Paris last summer.",
]
string_sources = [
StringKnowledgeSource(content=content) for content in string_contents
]
# Create file sources
file_contents = [
"Brandon prefers tea over coffee.",
"Brandon's favorite book is 'The Alchemist'.",
]
file_paths = []
for i, content in enumerate(file_contents):
file_path = tmpdir.join(f"file_{i}.txt")
with open(file_path, "w") as f:
f.write(content)
file_paths.append(str(file_path))
file_sources = [TextFileKnowledgeSource(file_path=path) for path in file_paths]
# Combine string and file sources
knowledge_base = Knowledge(sources=string_sources + file_sources)
# Perform a query
query = "What is Brandon's favorite book?"
results = knowledge_base.query(query)
# Assert that the correct information is retrieved
assert any("the alchemist" in result.lower() for result in results)
def test_pdf_knowledge_source():
# Get the directory of the current file
current_dir = os.path.dirname(__file__)
# Construct the path to the PDF file
pdf_path = os.path.join(current_dir, "crewai_quickstart.pdf")
# Create a PDFKnowledgeSource
pdf_source = PDFKnowledgeSource(file_path=pdf_path)
knowledge_base = Knowledge(sources=[pdf_source])
# Perform a query
query = "How do you create a crew?"
results = knowledge_base.query(query)
print("Results from querying PDFKnowledgeSource:", results)
# Assert that the correct information is retrieved
assert any(
"crewai create crew latest-ai-development" in result.lower()
for result in results
)

uv.lock generated
View File

@@ -612,7 +612,6 @@ dependencies = [
{ name = "chromadb" },
{ name = "click" },
{ name = "crewai-tools" },
{ name = "fastembed" },
{ name = "instructor" },
{ name = "json-repair" },
{ name = "jsonref" },
@@ -635,6 +634,15 @@ dependencies = [
agentops = [
{ name = "agentops" },
]
fastembed = [
{ name = "fastembed" },
]
network = [
{ name = "pdfplumber" },
]
pdfplumber = [
{ name = "pdfplumber" },
]
tools = [
{ name = "crewai-tools" },
]
@@ -668,7 +676,7 @@ requires-dist = [
{ name = "click", specifier = ">=8.1.7" },
{ name = "crewai-tools", specifier = ">=0.13.4" },
{ name = "crewai-tools", marker = "extra == 'tools'", specifier = ">=0.13.4" },
{ name = "fastembed", specifier = ">=0.4.1" },
{ name = "fastembed", marker = "extra == 'fastembed'", specifier = ">=0.4.1" },
{ name = "instructor", specifier = ">=1.3.3" },
{ name = "json-repair", specifier = ">=0.25.2" },
{ name = "jsonref", specifier = ">=1.1.0" },
@@ -678,6 +686,8 @@ requires-dist = [
{ name = "opentelemetry-api", specifier = ">=1.22.0" },
{ name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.22.0" },
{ name = "opentelemetry-sdk", specifier = ">=1.22.0" },
{ name = "pdfplumber", marker = "extra == 'network'", specifier = ">=0.11.4" },
{ name = "pdfplumber", marker = "extra == 'pdfplumber'", specifier = ">=0.11.4" },
{ name = "pydantic", specifier = ">=2.4.2" },
{ name = "python-dotenv", specifier = ">=1.0.0" },
{ name = "pyvis", specifier = ">=0.3.2" },
@@ -2975,6 +2985,33 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191 },
]
[[package]]
name = "pdfminer-six"
version = "20231228"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "charset-normalizer" },
{ name = "cryptography" },
]
sdist = { url = "https://files.pythonhosted.org/packages/31/b1/a43e3bd872ded4deea4f8efc7aff1703fca8c5455d0c06e20506a06a44ff/pdfminer.six-20231228.tar.gz", hash = "sha256:6004da3ad1a7a4d45930cb950393df89b068e73be365a6ff64a838d37bcb08c4", size = 7362505 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/eb/9c/e46fe7502b32d7db6af6e36a9105abb93301fa1ec475b5ddcba8b35ae23a/pdfminer.six-20231228-py3-none-any.whl", hash = "sha256:e8d3c3310e6fbc1fe414090123ab01351634b4ecb021232206c4c9a8ca3e3b8f", size = 5614515 },
]
[[package]]
name = "pdfplumber"
version = "0.11.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pdfminer-six" },
{ name = "pillow" },
{ name = "pypdfium2" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ca/f0/457bda3629dfa5b01c645519fe30230e1739751f6645e23fca2dabf6c2e5/pdfplumber-0.11.4.tar.gz", hash = "sha256:147b55cde2351fcb9523b46b09cc771eea3602faecfb60d463c6bf951694fbe8", size = 113305 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/87/415cb472981a8d2e36beeeadf074ebb686cc2bfe8d18de973232da291bd5/pdfplumber-0.11.4-py3-none-any.whl", hash = "sha256:6150f0678c7aaba974ac09839c17475d6c0c4d126b5f92cb85154885f31c6d73", size = 59182 },
]
[[package]]
name = "pexpect"
version = "4.9.0"
@@ -3546,6 +3583,26 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/48/8f/9bbf22ba6a00001a45dbc54337e5bbbd43e7d8f34c8158c92cddc45736af/pypdf-5.0.1-py3-none-any.whl", hash = "sha256:ff8a32da6c7a63fea9c32fa4dd837cdd0db7966adf6c14f043e3f12592e992db", size = 294470 },
]
[[package]]
name = "pypdfium2"
version = "4.30.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254 },
{ url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624 },
{ url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126 },
{ url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077 },
{ url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431 },
{ url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008 },
{ url = "https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543 },
{ url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911 },
{ url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430 },
{ url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951 },
{ url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098 },
{ url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118 },
]
[[package]]
name = "pypika"
version = "0.48.9"