From b10440441811c0e4f0ad063204e473817992e9e3 Mon Sep 17 00:00:00 2001 From: Lorenze Jay Date: Mon, 18 Nov 2024 16:03:48 -0800 Subject: [PATCH] cleanup rm unused embedder --- .../knowledge/source/base_knowledge_source.py | 6 ++--- src/crewai/knowledge/knowledge.py | 26 ++++++++++++------- .../knowledge/source/base_knowledge_source.py | 3 +-- .../knowledge/source/csv_knowledge_source.py | 7 +---- .../source/excel_knowledge_source.py | 7 +---- .../knowledge/source/json_knowledge_source.py | 7 +---- .../knowledge/source/pdf_knowledge_source.py | 7 +---- .../knowledge/storage/knowledge_storage.py | 2 +- 8 files changed, 25 insertions(+), 40 deletions(-) diff --git a/path/to/src/crewai/knowledge/source/base_knowledge_source.py b/path/to/src/crewai/knowledge/source/base_knowledge_source.py index 0b5c8fee8..bd83a13f2 100644 --- a/path/to/src/crewai/knowledge/source/base_knowledge_source.py +++ b/path/to/src/crewai/knowledge/source/base_knowledge_source.py @@ -1,8 +1,6 @@ from abc import ABC, abstractmethod from typing import List -from crewai.knowledge.embedder.base_embedder import BaseEmbedder - class BaseKnowledgeSource(ABC): """Abstract base class for different types of knowledge sources.""" @@ -22,11 +20,11 @@ class BaseKnowledgeSource(ABC): pass @abstractmethod - def add(self, embedder: BaseEmbedder) -> None: + def add(self) -> None: """Add content to the knowledge base, chunk it, and compute embeddings.""" pass @abstractmethod - def query(self, embedder: BaseEmbedder, query: str, top_k: int = 3) -> str: + def query(self, query: str, top_k: int = 3) -> str: """Query the knowledge base using semantic search.""" pass diff --git a/src/crewai/knowledge/knowledge.py b/src/crewai/knowledge/knowledge.py index 3399956bf..e4e2c3c99 100644 --- a/src/crewai/knowledge/knowledge.py +++ b/src/crewai/knowledge/knowledge.py @@ -1,27 +1,35 @@ -from typing import List, Optional, Dict, Any +import os +from typing import List, Optional, Dict, Any from pydantic import BaseModel, ConfigDict, Field -from crewai.knowledge.embedder.base_embedder import BaseEmbedder -from crewai.knowledge.embedder.fastembed import FastEmbed from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage +from crewai.utilities.logger import Logger + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # removes logging from fastembed class Knowledge(BaseModel): sources: List[BaseKnowledgeSource] = Field(default_factory=list) - embedder: BaseEmbedder = Field(default_factory=FastEmbed) - model_config = ConfigDict(arbitrary_types_allowed=True) - agents: List[str] = Field(default_factory=list) storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage) - embedder_config: Optional[Dict[str, Any]] = Field(default_factory=None) + embedder_config: Optional[Dict[str, Any]] = None - def __init__(self, **data): + def __init__(self, embedder_config: Optional[Dict[str, Any]] = None, **data): super().__init__(**data) - embedder_config = data.get("embedder_config", None) if embedder_config: self.storage = KnowledgeStorage(embedder_config=embedder_config) + else: + self.storage = KnowledgeStorage() + + try: + for source in self.sources: + source.add() + except Exception as e: + Logger.log( + "warning", f"Failed to add some sources during initialization: {e}" + ) def query( self, query: List[str], limit: int = 3, preference: Optional[str] = None diff --git a/src/crewai/knowledge/source/base_knowledge_source.py b/src/crewai/knowledge/source/base_knowledge_source.py index 5fe814e0c..a566bad5c 100644 --- a/src/crewai/knowledge/source/base_knowledge_source.py +++ b/src/crewai/knowledge/source/base_knowledge_source.py @@ -4,7 +4,6 @@ from typing import List import numpy as np from pydantic import BaseModel, ConfigDict, Field -from crewai.knowledge.embedder.base_embedder import BaseEmbedder from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage from typing import Dict, Any @@ -27,7 +26,7 @@ class BaseKnowledgeSource(BaseModel, ABC): pass @abstractmethod - def add(self, embedder: BaseEmbedder) -> None: + def add(self) -> None: """Process content, chunk it, compute embeddings, and save them.""" pass diff --git a/src/crewai/knowledge/source/csv_knowledge_source.py b/src/crewai/knowledge/source/csv_knowledge_source.py index bf7af5b0e..c65e7f7a0 100644 --- a/src/crewai/knowledge/source/csv_knowledge_source.py +++ b/src/crewai/knowledge/source/csv_knowledge_source.py @@ -1,7 +1,6 @@ import csv from typing import List -from crewai.knowledge.embedder.base_embedder import BaseEmbedder from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource @@ -18,17 +17,13 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource): content += " ".join(row) + "\n" return content - def add(self, embedder: BaseEmbedder) -> None: + def add(self) -> None: """ Add CSV file content to the knowledge source, chunk it, compute embeddings, and save the embeddings. """ new_chunks = self._chunk_text(self.content) self.chunks.extend(new_chunks) - # Compute embeddings for the new chunks - new_embeddings = embedder.embed_chunks(new_chunks) - # Save the embeddings - self.chunk_embeddings.extend(new_embeddings) self.save_documents(metadata=self.metadata) def _chunk_text(self, text: str) -> List[str]: diff --git a/src/crewai/knowledge/source/excel_knowledge_source.py b/src/crewai/knowledge/source/excel_knowledge_source.py index 608657076..ff0475472 100644 --- a/src/crewai/knowledge/source/excel_knowledge_source.py +++ b/src/crewai/knowledge/source/excel_knowledge_source.py @@ -1,6 +1,5 @@ from typing import List -from crewai.knowledge.embedder.base_embedder import BaseEmbedder from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource @@ -28,17 +27,13 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource): f"{missing_package} is not installed. Please install it with: pip install {missing_package}" ) - def add(self, embedder: BaseEmbedder) -> None: + def add(self) -> None: """ Add Excel file content to the knowledge source, chunk it, compute embeddings, and save the embeddings. """ new_chunks = self._chunk_text(self.content) self.chunks.extend(new_chunks) - # Compute embeddings for the new chunks - new_embeddings = embedder.embed_chunks(new_chunks) - # Save the embeddings - self.chunk_embeddings.extend(new_embeddings) self.save_documents(metadata=self.metadata) def _chunk_text(self, text: str) -> List[str]: diff --git a/src/crewai/knowledge/source/json_knowledge_source.py b/src/crewai/knowledge/source/json_knowledge_source.py index b5b0dcbf1..00f01c29e 100644 --- a/src/crewai/knowledge/source/json_knowledge_source.py +++ b/src/crewai/knowledge/source/json_knowledge_source.py @@ -1,7 +1,6 @@ import json from typing import Any, List -from crewai.knowledge.embedder.base_embedder import BaseEmbedder from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource @@ -30,17 +29,13 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource): text += f"{str(data)}" return text - def add(self, embedder: BaseEmbedder) -> None: + def add(self) -> None: """ Add JSON file content to the knowledge source, chunk it, compute embeddings, and save the embeddings. """ new_chunks = self._chunk_text(self.content) self.chunks.extend(new_chunks) - # Compute embeddings for the new chunks - new_embeddings = embedder.embed_chunks(new_chunks) - # Save the embeddings - self.chunk_embeddings.extend(new_embeddings) self.save_documents(metadata=self.metadata) def _chunk_text(self, text: str) -> List[str]: diff --git a/src/crewai/knowledge/source/pdf_knowledge_source.py b/src/crewai/knowledge/source/pdf_knowledge_source.py index 7a368f755..623ba30a2 100644 --- a/src/crewai/knowledge/source/pdf_knowledge_source.py +++ b/src/crewai/knowledge/source/pdf_knowledge_source.py @@ -1,7 +1,6 @@ from typing import List, Dict from pathlib import Path -from crewai.knowledge.embedder.base_embedder import BaseEmbedder from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource @@ -37,7 +36,7 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource): "pdfplumber is not installed. Please install it with: pip install pdfplumber" ) - def add(self, embedder: BaseEmbedder) -> None: + def add(self) -> None: """ Add PDF file content to the knowledge source, chunk it, compute embeddings, and save the embeddings. @@ -45,10 +44,6 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource): for _, text in self.content.items(): new_chunks = self._chunk_text(text) self.chunks.extend(new_chunks) - # Compute embeddings for the new chunks - new_embeddings = embedder.embed_chunks(new_chunks) - # Save the embeddings - self.chunk_embeddings.extend(new_embeddings) self.save_documents(metadata=self.metadata) def _chunk_text(self, text: str) -> List[str]: diff --git a/src/crewai/knowledge/storage/knowledge_storage.py b/src/crewai/knowledge/storage/knowledge_storage.py index 7b7fa8c69..a18ae2a72 100644 --- a/src/crewai/knowledge/storage/knowledge_storage.py +++ b/src/crewai/knowledge/storage/knowledge_storage.py @@ -36,7 +36,7 @@ class KnowledgeStorage(BaseKnowledgeStorage): collection: Optional[chromadb.Collection] = None - def __init__(self, embedder_config=None): + def __init__(self, embedder_config: Optional[Dict[str, Any]] = None): self._initialize_app(embedder_config or {}) def search(