Cleanup: remove unused embedder

This commit is contained in:
Lorenze Jay
2024-11-18 16:03:48 -08:00
parent d579c5ae12
commit b104404418
8 changed files with 25 additions and 40 deletions

View File

@@ -1,8 +1,6 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
class BaseKnowledgeSource(ABC): class BaseKnowledgeSource(ABC):
"""Abstract base class for different types of knowledge sources.""" """Abstract base class for different types of knowledge sources."""
@@ -22,11 +20,11 @@ class BaseKnowledgeSource(ABC):
pass pass
@abstractmethod @abstractmethod
def add(self, embedder: BaseEmbedder) -> None: def add(self) -> None:
"""Add content to the knowledge base, chunk it, and compute embeddings.""" """Add content to the knowledge base, chunk it, and compute embeddings."""
pass pass
@abstractmethod @abstractmethod
def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str: def query(self, query: str, top_k: int = 3) -> str:
"""Query the knowledge base using semantic search.""" """Query the knowledge base using semantic search."""
pass pass

View File

@@ -1,27 +1,35 @@
from typing import List, Optional, Dict, Any import os
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel, ConfigDict, Field
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.embedder.fastembed import FastEmbed
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
from crewai.utilities.logger import Logger
os.environ["TOKENIZERS_PARALLELISM"] = "false" # removes logging from fastembed
class Knowledge(BaseModel): class Knowledge(BaseModel):
sources: List[BaseKnowledgeSource] = Field(default_factory=list) sources: List[BaseKnowledgeSource] = Field(default_factory=list)
embedder: BaseEmbedder = Field(default_factory=FastEmbed)
model_config = ConfigDict(arbitrary_types_allowed=True) model_config = ConfigDict(arbitrary_types_allowed=True)
agents: List[str] = Field(default_factory=list)
storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage) storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
embedder_config: Optional[Dict[str, Any]] = Field(default_factory=None) embedder_config: Optional[Dict[str, Any]] = None
def __init__(self, **data): def __init__(self, embedder_config: Optional[Dict[str, Any]] = None, **data):
super().__init__(**data) super().__init__(**data)
embedder_config = data.get("embedder_config", None)
if embedder_config: if embedder_config:
self.storage = KnowledgeStorage(embedder_config=embedder_config) self.storage = KnowledgeStorage(embedder_config=embedder_config)
else:
self.storage = KnowledgeStorage()
try:
for source in self.sources:
source.add()
except Exception as e:
Logger.log(
"warning", f"Failed to add some sources during initialization: {e}"
)
def query( def query(
self, query: List[str], limit: int = 3, preference: Optional[str] = None self, query: List[str], limit: int = 3, preference: Optional[str] = None

View File

@@ -4,7 +4,6 @@ from typing import List
import numpy as np import numpy as np
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel, ConfigDict, Field
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
from typing import Dict, Any from typing import Dict, Any
@@ -27,7 +26,7 @@ class BaseKnowledgeSource(BaseModel, ABC):
pass pass
@abstractmethod @abstractmethod
def add(self, embedder: BaseEmbedder) -> None: def add(self) -> None:
"""Process content, chunk it, compute embeddings, and save them.""" """Process content, chunk it, compute embeddings, and save them."""
pass pass

View File

@@ -1,7 +1,6 @@
import csv import csv
from typing import List from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -18,17 +17,13 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
content += " ".join(row) + "\n" content += " ".join(row) + "\n"
return content return content
def add(self, embedder: BaseEmbedder) -> None: def add(self) -> None:
""" """
Add CSV file content to the knowledge source, chunk it, compute embeddings, Add CSV file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings. and save the embeddings.
""" """
new_chunks = self._chunk_text(self.content) new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self.save_documents(metadata=self.metadata) self.save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:

View File

@@ -1,6 +1,5 @@
from typing import List from typing import List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -28,17 +27,13 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
f"{missing_package} is not installed. Please install it with: pip install {missing_package}" f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
) )
def add(self, embedder: BaseEmbedder) -> None: def add(self) -> None:
""" """
Add Excel file content to the knowledge source, chunk it, compute embeddings, Add Excel file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings. and save the embeddings.
""" """
new_chunks = self._chunk_text(self.content) new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self.save_documents(metadata=self.metadata) self.save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:

View File

@@ -1,7 +1,6 @@
import json import json
from typing import Any, List from typing import Any, List
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -30,17 +29,13 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
text += f"{str(data)}" text += f"{str(data)}"
return text return text
def add(self, embedder: BaseEmbedder) -> None: def add(self) -> None:
""" """
Add JSON file content to the knowledge source, chunk it, compute embeddings, Add JSON file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings. and save the embeddings.
""" """
new_chunks = self._chunk_text(self.content) new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self.save_documents(metadata=self.metadata) self.save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:

View File

@@ -1,7 +1,6 @@
from typing import List, Dict from typing import List, Dict
from pathlib import Path from pathlib import Path
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -37,7 +36,7 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
"pdfplumber is not installed. Please install it with: pip install pdfplumber" "pdfplumber is not installed. Please install it with: pip install pdfplumber"
) )
def add(self, embedder: BaseEmbedder) -> None: def add(self) -> None:
""" """
Add PDF file content to the knowledge source, chunk it, compute embeddings, Add PDF file content to the knowledge source, chunk it, compute embeddings,
and save the embeddings. and save the embeddings.
@@ -45,10 +44,6 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
for _, text in self.content.items(): for _, text in self.content.items():
new_chunks = self._chunk_text(text) new_chunks = self._chunk_text(text)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
# Compute embeddings for the new chunks
new_embeddings = embedder.embed_chunks(new_chunks)
# Save the embeddings
self.chunk_embeddings.extend(new_embeddings)
self.save_documents(metadata=self.metadata) self.save_documents(metadata=self.metadata)
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:

View File

@@ -36,7 +36,7 @@ class KnowledgeStorage(BaseKnowledgeStorage):
collection: Optional[chromadb.Collection] = None collection: Optional[chromadb.Collection] = None
def __init__(self, embedder_config=None): def __init__(self, embedder_config: Optional[Dict[str, Any]] = None):
self._initialize_app(embedder_config or {}) self._initialize_app(embedder_config or {})
def search( def search(