drop metadata requirement

This commit is contained in:
Brandon Hancock
2024-12-05 14:01:53 -05:00
parent 7b276e6797
commit b65eab4fb6
10 changed files with 37 additions and 35 deletions

View File

@@ -1,6 +1,6 @@
import os import os
from typing import Any, Dict, List, Optional
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel, ConfigDict, Field
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
@@ -46,9 +46,7 @@ class Knowledge(BaseModel):
source.storage = self.storage source.storage = self.storage
source.add() source.add()
def query( def query(self, query: List[str], limit: int = 3) -> List[Dict[str, Any]]:
self, query: List[str], limit: int = 3, preference: Optional[str] = None
) -> List[Dict[str, Any]]:
""" """
Query across all knowledge sources to find the most relevant information. Query across all knowledge sources to find the most relevant information.
Returns the top_k most relevant chunks. Returns the top_k most relevant chunks.
@@ -57,8 +55,6 @@ class Knowledge(BaseModel):
results = self.storage.search( results = self.storage.search(
query, query,
limit, limit,
filter={"preference": preference} if preference else None,
score_threshold=DEFAULT_SCORE_THRESHOLD,
) )
return results return results

View File

@@ -1,13 +1,13 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from typing import Union, List, Dict, Any from typing import Any, Dict, List, Union
from pydantic import Field from pydantic import Field
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.utilities.logger import Logger
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
from crewai.utilities.logger import Logger
class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC): class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
@@ -49,10 +49,9 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
color="red", color="red",
) )
def save_documents(self, metadata: Dict[str, Any]): def save_documents(self):
"""Save the documents to the storage.""" """Save the documents to the storage."""
chunk_metadatas = [metadata.copy() for _ in self.chunks] self.storage.save(self.chunks)
self.storage.save(self.chunks, chunk_metadatas)
def convert_to_path(self, path: Union[Path, str]) -> Path: def convert_to_path(self, path: Union[Path, str]) -> Path:
"""Convert a path to a Path object.""" """Convert a path to a Path object."""

View File

@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional from typing import Any, Dict, List, Optional
import numpy as np import numpy as np
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel, ConfigDict, Field
@@ -17,7 +17,6 @@ class BaseKnowledgeSource(BaseModel, ABC):
model_config = ConfigDict(arbitrary_types_allowed=True) model_config = ConfigDict(arbitrary_types_allowed=True)
storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage) storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
metadata: Dict[str, Any] = Field(default_factory=dict)
collection_name: Optional[str] = Field(default=None) collection_name: Optional[str] = Field(default=None)
@abstractmethod @abstractmethod
@@ -41,9 +40,9 @@ class BaseKnowledgeSource(BaseModel, ABC):
for i in range(0, len(text), self.chunk_size - self.chunk_overlap) for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
] ]
def save_documents(self, metadata: Dict[str, Any]): def save_documents(self):
""" """
Save the documents to the storage. Save the documents to the storage.
This method should be called after the chunks and embeddings are generated. This method should be called after the chunks and embeddings are generated.
""" """
self.storage.save(self.chunks, metadata) self.storage.save(self.chunks)

View File

@@ -1,6 +1,6 @@
import csv import csv
from typing import Dict, List
from pathlib import Path from pathlib import Path
from typing import Dict, List
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -30,7 +30,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
) )
new_chunks = self._chunk_text(content_str) new_chunks = self._chunk_text(content_str)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
self.save_documents(metadata=self.metadata) self.save_documents()
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks.""" """Utility method to split text into chunks."""

View File

@@ -1,5 +1,6 @@
from typing import Dict, List
from pathlib import Path from pathlib import Path
from typing import Dict, List
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -44,7 +45,7 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
new_chunks = self._chunk_text(content_str) new_chunks = self._chunk_text(content_str)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
self.save_documents(metadata=self.metadata) self.save_documents()
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks.""" """Utility method to split text into chunks."""

View File

@@ -1,6 +1,6 @@
import json import json
from typing import Any, Dict, List
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -42,7 +42,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
) )
new_chunks = self._chunk_text(content_str) new_chunks = self._chunk_text(content_str)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
self.save_documents(metadata=self.metadata) self.save_documents()
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks.""" """Utility method to split text into chunks."""

View File

@@ -1,5 +1,5 @@
from typing import List, Dict
from pathlib import Path from pathlib import Path
from typing import Dict, List
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -43,7 +43,7 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
for _, text in self.content.items(): for _, text in self.content.items():
new_chunks = self._chunk_text(text) new_chunks = self._chunk_text(text)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
self.save_documents(metadata=self.metadata) self.save_documents()
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks.""" """Utility method to split text into chunks."""

View File

@@ -24,7 +24,7 @@ class StringKnowledgeSource(BaseKnowledgeSource):
"""Add string content to the knowledge source, chunk it, compute embeddings, and save them.""" """Add string content to the knowledge source, chunk it, compute embeddings, and save them."""
new_chunks = self._chunk_text(self.content) new_chunks = self._chunk_text(self.content)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
self.save_documents(metadata=self.metadata) self.save_documents()
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks.""" """Utility method to split text into chunks."""

View File

@@ -1,5 +1,5 @@
from typing import Dict, List
from pathlib import Path from pathlib import Path
from typing import Dict, List
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -24,7 +24,7 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
for _, text in self.content.items(): for _, text in self.content.items():
new_chunks = self._chunk_text(text) new_chunks = self._chunk_text(text)
self.chunks.extend(new_chunks) self.chunks.extend(new_chunks)
self.save_documents(metadata=self.metadata) self.save_documents()
def _chunk_text(self, text: str) -> List[str]: def _chunk_text(self, text: str) -> List[str]:
"""Utility method to split text into chunks.""" """Utility method to split text into chunks."""

View File

@@ -1,18 +1,20 @@
import contextlib import contextlib
import hashlib
import io import io
import logging import logging
import chromadb
import os import os
from typing import Any, Dict, List, Optional, Union, cast
import chromadb
import chromadb.errors import chromadb.errors
from crewai.utilities.paths import db_storage_path
from typing import Optional, List, Dict, Any, Union
from crewai.utilities import EmbeddingConfigurator
from crewai.knowledge.storage.base_knowledge_storage import BaseKnowledgeStorage
import hashlib
from chromadb.config import Settings
from chromadb.api import ClientAPI from chromadb.api import ClientAPI
from chromadb.api.types import OneOrMany
from chromadb.config import Settings
from crewai.knowledge.storage.base_knowledge_storage import BaseKnowledgeStorage
from crewai.utilities import EmbeddingConfigurator
from crewai.utilities.logger import Logger from crewai.utilities.logger import Logger
from crewai.utilities.paths import db_storage_path
@contextlib.contextmanager @contextlib.contextmanager
@@ -116,11 +118,16 @@ class KnowledgeStorage(BaseKnowledgeStorage):
def save( def save(
self, self,
documents: List[str], documents: List[str],
metadata: Union[Dict[str, Any], List[Dict[str, Any]]], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
): ):
if self.collection: if self.collection:
try: try:
metadatas = [metadata] if isinstance(metadata, dict) else metadata if metadata is None:
metadatas: Optional[OneOrMany[chromadb.Metadata]] = None
elif isinstance(metadata, list):
metadatas = [cast(chromadb.Metadata, m) for m in metadata]
else:
metadatas = cast(chromadb.Metadata, metadata)
ids = [ ids = [
hashlib.sha256(doc.encode("utf-8")).hexdigest() for doc in documents hashlib.sha256(doc.encode("utf-8")).hexdigest() for doc in documents