mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-20 21:38:14 +00:00
Adding core knowledge sources
src/crewai/__init__.py
@@ -1,7 +1,9 @@
 import warnings
 
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.flow.flow import Flow
+from crewai.knowledge.knowledge import Knowledge
 from crewai.llm import LLM
 from crewai.pipeline import Pipeline
 from crewai.process import Process
@@ -15,4 +17,14 @@ warnings.filterwarnings(
     module="pydantic.main",
 )
 __version__ = "0.76.9"
-__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
+__all__ = [
+    "Agent",
+    "Crew",
+    "Process",
+    "Task",
+    "Pipeline",
+    "Router",
+    "LLM",
+    "Flow",
+    "Knowledge",
+]
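For orientation (not part of the diff): the practical effect of the export change is that Knowledge is now importable from the package root alongside the existing exports. A minimal sketch, assuming this version of crewai is installed:

    from crewai import Knowledge   # newly exported by this commit
    from crewai import Agent, Crew  # unchanged existing exports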

src/crewai/knowledge/embedder/__init__.py (new empty file)

src/crewai/knowledge/embedder/fastembed.py
@@ -47,7 +47,7 @@ class FastEmbed(BaseEmbedder):
             cache_dir=str(cache_dir) if cache_dir else None,
         )
 
-    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
+    def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
         """
         Generate embeddings for a list of text chunks
 
@@ -55,13 +55,12 @@ class FastEmbed(BaseEmbedder):
             chunks: List of text chunks to embed
 
         Returns:
-            Array of embeddings
+            List of embeddings
         """
-        # FastEmbed returns a generator, convert to list then numpy array
         embeddings = list(self.model.embed(chunks))
-        return np.array(embeddings)
+        return embeddings
 
-    def embed_texts(self, texts: List[str]) -> np.ndarray:
+    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
         """
         Generate embeddings for a list of texts
 
@@ -69,11 +68,10 @@ class FastEmbed(BaseEmbedder):
             texts: List of texts to embed
 
         Returns:
-            Array of embeddings
+            List of embeddings
         """
-        # FastEmbed returns a generator, convert to list then numpy array
         embeddings = list(self.model.embed(texts))
-        return np.array(embeddings)
+        return embeddings
 
     def embed_text(self, text: str) -> np.ndarray:
         """
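For context (not part of the diff): the return-type change means callers of the batch methods now receive a plain Python list of per-text vectors instead of one stacked 2-D array. A minimal sketch of the changed behavior, assuming the optional fastembed dependency is installed (no-argument construction of FastEmbed is implied by its use as a default_factory later in this commit):

    from crewai.knowledge.embedder.fastembed import FastEmbed

    embedder = FastEmbed()
    vectors = embedder.embed_texts(["hello world", "goodbye world"])
    # A list of 1-D numpy arrays, one per input text; the exact
    # dimensionality depends on the underlying FastEmbed model.
    print(len(vectors), vectors[0].shape)

Note that embed_text (single input) still returns a single np.ndarray, which is what Knowledge.query relies on below.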

src/crewai/knowledge/knowledge.py
@@ -1,21 +1,53 @@
-from typing import List, Optional
+from typing import List
 
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict, Field
 
-from .embedder.base_embedder import BaseEmbedder
-from .embedder.fastembed import FastEmbed
-from .source.base_knowledge_source import BaseKnowledgeSource
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+from crewai.knowledge.embedder.fastembed import FastEmbed
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 
 
 class Knowledge(BaseModel):
-    sources: Optional[List[BaseKnowledgeSource]] = None
-    embedder: BaseEmbedder
-
-    def __init__(
-        self,
-        sources: Optional[List[BaseKnowledgeSource]] = None,
-        embedder: Optional[BaseEmbedder] = None,
-    ):
-        super().__init__()
-        self.sources = sources or []
-        self.embedder = embedder or FastEmbed()
+    sources: List[BaseKnowledgeSource] = Field(default_factory=list)
+    embedder: BaseEmbedder = Field(default_factory=FastEmbed)
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Call add on all sources during initialization
+        for source in self.sources:
+            source.add(self.embedder)
+
+    def query(self, query: str, top_k: int = 3) -> List[str]:
+        """
+        Query across all knowledge sources to find the most relevant information.
+        Returns the top_k most relevant chunks.
+        """
+        if not self.sources:
+            return []
+
+        # Collect all chunks and embeddings from all sources
+        all_chunks = []
+        all_embeddings = []
+
+        for source in self.sources:
+            all_chunks.extend(source.chunks)
+            all_embeddings.extend(source.get_embeddings())
+
+        # Embed the query
+        query_embedding = self.embedder.embed_text(query)
+
+        # Calculate similarities
+        similarities = []
+        for idx, embedding in enumerate(all_embeddings):
+            similarity = query_embedding.dot(embedding)
+            similarities.append((similarity, idx))
+
+        # Sort by similarity
+        similarities.sort(reverse=True, key=lambda x: x[0])
+
+        # Get top_k results
+        top_chunks = [all_chunks[idx] for _, idx in similarities[:top_k]]
+
+        return top_chunks
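For context (not part of the diff): Knowledge now eagerly indexes every source at construction time (each source.add(...) chunks and embeds its content), and query() ranks all chunks by the dot product between the query vector and each chunk vector, which is equivalent to cosine similarity when the embedder returns L2-normalized vectors, as FastEmbed's default model typically does. A minimal usage sketch, assuming fastembed is installed and that StringKnowledgeSource accepts content as shown in its hunk below; the sample text is illustrative:

    from crewai.knowledge.knowledge import Knowledge
    from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

    source = StringKnowledgeSource(content="CrewAI agents collaborate on tasks.")
    # __init__ calls source.add(self.embedder) for every source, so chunks
    # and embeddings are already computed by the time query() runs.
    kb = Knowledge(sources=[source])
    print(kb.query("What do CrewAI agents do?", top_k=1))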

src/crewai/knowledge/source/__init__.py (new empty file)

src/crewai/knowledge/source/base_knowledge_source.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import List
 
 import numpy as np
 
@@ -7,7 +7,7 @@ from crewai.knowledge.embedder.base_embedder import BaseEmbedder
 
 
 class BaseKnowledgeSource(ABC):
-    """Abstract base class for knowledge bases"""
+    """Abstract base class for knowledge sources."""
 
     def __init__(
         self,
@@ -17,96 +17,25 @@ class BaseKnowledgeSource(ABC):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.chunks: List[str] = []
-        self.chunk_embeddings: Dict[int, np.ndarray] = {}
+        self.chunk_embeddings: List[np.ndarray] = []
 
     @abstractmethod
-    def query(self, query: str) -> str:
-        """Query the knowledge base and return relevant information"""
+    def load_content(self):
+        """Load and preprocess content from the source."""
        pass
 
     @abstractmethod
-    def add(self, content: Any) -> None:
-        """Process and store content in the knowledge base"""
+    def add(self, embedder: BaseEmbedder) -> None:
+        """Process content, chunk it, compute embeddings, and save them."""
         pass
 
-    def embed(self, embedder: BaseEmbedder, new_chunks: List[str]) -> None:
-        """Embed chunks and store them"""
-        if not new_chunks:
-            return
-
-        # Get embeddings for new chunks
-        embeddings = embedder.embed_texts(new_chunks)
-
-        # Store embeddings with their corresponding chunks
-        start_idx = len(self.chunks)
-        for i, embedding in enumerate(embeddings):
-            self.chunk_embeddings[start_idx + i] = embedding
+    def get_embeddings(self) -> List[np.ndarray]:
+        """Return the list of embeddings for the chunks."""
+        return self.chunk_embeddings
 
     def _chunk_text(self, text: str) -> List[str]:
-        """Split text into chunks with overlap"""
-        chunks = []
-        start = 0
-        text_length = len(text)
-
-        while start < text_length:
-            # Get the chunk of size chunk_size
-            end = start + self.chunk_size
-
-            if end >= text_length:
-                # If we're at the end, just take the rest
-                chunks.append(text[start:].strip())
-                break
-
-            # Look for a good breaking point
-            # Priority: double newline > single newline > period > space
-            break_chars = ["\n\n", "\n", ". ", " "]
-            chunk_end = end
-
-            for break_char in break_chars:
-                # Look for the break_char in a window around the end point
-                window_start = max(start + self.chunk_size - 100, start)
-                window_end = min(start + self.chunk_size + 100, text_length)
-                window_text = text[window_start:window_end]
-
-                # Find the last occurrence of the break_char in the window
-                last_break = window_text.rfind(break_char)
-                if last_break != -1:
-                    chunk_end = window_start + last_break + len(break_char)
-                    break
-
-            # Add the chunk
-            chunk = text[start:chunk_end].strip()
-            if chunk:  # Only add non-empty chunks
-                chunks.append(chunk)
-
-            # Move the start pointer, accounting for overlap
-            start = max(
-                start + self.chunk_size - self.chunk_overlap,
-                chunk_end - self.chunk_overlap,
-            )
-
-        return chunks
-
-    def _find_similar_chunks(
-        self, embedder: BaseEmbedder, query: str, top_k: int = 3
-    ) -> List[str]:
-        """Find the most similar chunks to a query using embeddings"""
-        if not self.chunks:
-            return []
-
-        # Get query embedding
-        query_embedding = embedder.embed_text(query)
-
-        # Calculate similarities with all chunks
-        similarities = []
-        for idx, chunk_embedding in self.chunk_embeddings.items():
-            similarity = np.dot(query_embedding, chunk_embedding)
-            similarities.append((similarity, idx))
-
-        # Sort by similarity and get top_k chunks
-        similarities.sort(reverse=True)
-        top_chunks = []
-        for _, idx in similarities[:top_k]:
-            top_chunks.append(self.chunks[idx])
-
-        return top_chunks
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
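For context (not part of the diff): the new _chunk_text drops the old boundary-aware splitter in favor of a fixed sliding window that advances chunk_size - chunk_overlap characters per step (800 with the defaults of 1000/200), so consecutive chunks share chunk_overlap characters; the range step is only positive while chunk_overlap < chunk_size. The same expression applied to a toy string:

    text = "abcdefghijklmnopqrstuvwxyz"
    chunk_size, chunk_overlap = 10, 3
    step = chunk_size - chunk_overlap  # 7: each window starts 7 characters later
    chunks = [text[i : i + chunk_size] for i in range(0, len(text), step)]
    print(chunks)  # ['abcdefghij', 'hijklmnopq', 'opqrstuvwx', 'vwxyz']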

src/crewai/knowledge/source/pdf_knowledge_source.py (new file, 65 lines)
@@ -0,0 +1,65 @@
+from pathlib import Path
+from typing import List
+
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
+
+
+class PDFKnowledgeSource(BaseKnowledgeSource):
+    """A knowledge source that stores and queries PDF file content using embeddings."""
+
+    def __init__(
+        self,
+        file_path: str,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+    ):
+        super().__init__(chunk_size, chunk_overlap)
+        self.file_path = Path(file_path)
+        self.content = self.load_content()
+
+    def _import_pdfplumber(self):
+        """Dynamically import pdfplumber."""
+        try:
+            import pdfplumber
+
+            return pdfplumber
+        except ImportError:
+            raise ImportError(
+                "pdfplumber is not installed. Please install it with: pip install pdfplumber"
+            )
+
+    def load_content(self) -> str:
+        """Load and preprocess PDF file content."""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File not found: {self.file_path}")
+        if not self.file_path.is_file():
+            raise ValueError(f"Path is not a file: {self.file_path}")
+
+        pdfplumber = self._import_pdfplumber()
+        text = ""
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+        return text
+
+    def add(self, embedder: BaseEmbedder) -> None:
+        """
+        Add PDF file content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
+        self.chunks.extend(new_chunks)
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
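For context (not part of the diff): a minimal usage sketch of the new PDF source. The file path is hypothetical, and pdfplumber must be installed or construction raises the ImportError shown above:

    from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource

    # load_content() runs inside __init__, so extraction errors
    # (missing file, missing pdfplumber) surface at construction time.
    source = PDFKnowledgeSource(file_path="docs/handbook.pdf")  # hypothetical path
    print(source.content[:200])  # first 200 characters of extracted text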

src/crewai/knowledge/source/string_knowledge_source.py
@@ -1,9 +1,11 @@
 from typing import List
 
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 
 
 class StringKnowledgeSource(BaseKnowledgeSource):
-    """A knowledge base that stores and queries plain text content using embeddings"""
+    """A knowledge source that stores and queries plain text content using embeddings."""
 
     def __init__(
         self,
@@ -15,25 +17,29 @@ class StringKnowledgeSource(BaseKnowledgeSource):
             chunk_size,
             chunk_overlap,
         )
         self.content = content
+        self.load_content()
 
+    def load_content(self):
+        """Load and preprocess string content."""
+        if not isinstance(self.content, str):
+            raise ValueError("StringKnowledgeSource only accepts string content")
+
     def add(self, embedder: BaseEmbedder) -> None:
-        """Add text content to the knowledge base, chunk it, and compute embeddings"""
-        if not isinstance(self.content, str):
-            raise ValueError("StringKnowledgeBase only accepts string content")
-
-        # Create chunks from the text
-        new_chunks = self._chunk_text(content)
-
-        # Add chunks to the knowledge base
+        """
+        Add string content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
 
-        # Compute and store embeddings for the new chunks
-        embedder.embed_chunks(new_chunks)
-
-    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
-        """
-        Query the knowledge base using semantic search
-        Returns the most relevant chunk based on embedding similarity
-        """
-        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
-        return similar_chunks[0] if similar_chunks else ""
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
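For context (not part of the diff): the string source now validates its content in load_content() during construction, while chunking and embedding are deferred to add(embedder), which Knowledge.__init__ invokes for every source. A minimal sketch, assuming fastembed is installed:

    from crewai.knowledge.embedder.fastembed import FastEmbed
    from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

    src = StringKnowledgeSource(content="Plain text to index.")
    src.add(FastEmbed())
    # One chunk and one embedding for this short text
    print(len(src.chunks), len(src.get_embeddings()))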

src/crewai/knowledge/source/text_file_knowledge_source.py
@@ -1,9 +1,12 @@
+from pathlib import Path
 from typing import List
 
+from crewai.knowledge.embedder.base_embedder import BaseEmbedder
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 
 
 class TextFileKnowledgeSource(BaseKnowledgeSource):
-    """A knowledge base that stores and queries plain text content using embeddings"""
+    """A knowledge source that stores and queries text file content using embeddings."""
 
     def __init__(
         self,
@@ -11,29 +14,35 @@ class TextFileKnowledgeSource(BaseKnowledgeSource):
         chunk_size: int = 1000,
         chunk_overlap: int = 200,
     ):
-        super().__init__(
-            chunk_size,
-            chunk_overlap,
-        )
+        super().__init__(chunk_size, chunk_overlap)
         self.file_path = Path(file_path)
+        self.content = self.load_content()
+
+    def load_content(self) -> str:
+        """Load and preprocess text file content."""
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File not found: {self.file_path}")
+        if not self.file_path.is_file():
+            raise ValueError(f"Path is not a file: {self.file_path}")
+
+        with self.file_path.open("r", encoding="utf-8") as f:
+            return f.read()
 
     def add(self, embedder: BaseEmbedder) -> None:
-        """Add text content to the knowledge base, chunk it, and compute embeddings"""
-        if not isinstance(self.content, str):
-            raise ValueError("StringKnowledgeBase only accepts string content")
-
-        # Create chunks from the text
-        new_chunks = self._chunk_text(content)
-
-        # Add chunks to the knowledge base
+        """
+        Add text file content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
+        # Compute embeddings for the new chunks
+        new_embeddings = embedder.embed_chunks(new_chunks)
+        # Save the embeddings
+        self.chunk_embeddings.extend(new_embeddings)
 
-        # Compute and store embeddings for the new chunks
-        embedder.embed_chunks(new_chunks)
-
-    def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str:
-        """
-        Query the knowledge base using semantic search
-        Returns the most relevant chunk based on embedding similarity
-        """
-        similar_chunks = self._find_similar_chunks(embedder, query, top_k=top_k)
-        return similar_chunks[0] if similar_chunks else ""
+    def _chunk_text(self, text: str) -> List[str]:
+        """Utility method to split text into chunks."""
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
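For context (not part of the diff): an end-to-end sketch combining the pieces introduced above; notes.txt is a hypothetical UTF-8 text file and fastembed must be installed:

    from crewai.knowledge.knowledge import Knowledge
    from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource

    kb = Knowledge(sources=[TextFileKnowledgeSource(file_path="notes.txt")])
    for chunk in kb.query("deployment steps", top_k=2):
        print(chunk[:80])  # preview of each matching chunk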