drop metadata requirement (#1712)

* drop metadata requirement

* fix linting

* Update docs for new knowledge

* more linting

* more linting

* make save_documents private

* update docs to the new way we use knowledge and include clearing memory
Authored by Brandon Hancock (bhancock_ai) on 2024-12-05 14:59:52 -05:00; committed by GitHub
parent 7b276e6797
commit c7c0647dd2
11 changed files with 63 additions and 78 deletions
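
At a glance, the user-facing effect shown in the docs diff below: knowledge sources are passed to `Crew` through a flat `knowledge_sources` list, and no `metadata` dict is required anywhere. A minimal before/after sketch; the agent and task fields are illustrative and not part of this commit:

```python
from crewai import Agent, Crew, Process, Task
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

# The source no longer needs (or takes) a metadata argument.
string_source = StringKnowledgeSource(
    content="Users name is John. He is 30 years old and lives in San Francisco.",
)

agent = Agent(
    role="About User",
    goal="You know everything about the user.",
    backstory="You are a master at understanding people and their preferences.",
)
task = Task(
    description="Answer the following questions about the user: {question}",
    expected_output="An answer to the question.",
    agent=agent,
)

crew = Crew(
    agents=[agent],
    tasks=[task],
    process=Process.sequential,
    # Before: knowledge={"sources": [string_source], "metadata": {"preference": "personal"}}
    knowledge_sources=[string_source],
)

result = crew.kickoff(inputs={"question": "What city does John live in and how old is he?"})
```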

View File

@@ -48,7 +48,6 @@ from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSourc
 content = "Users name is John. He is 30 years old and lives in San Francisco."
 string_source = StringKnowledgeSource(
     content=content,
-    metadata={"preference": "personal"}
 )
 # Create an LLM with a temperature of 0 to ensure deterministic outputs
@@ -74,10 +73,7 @@ crew = Crew(
     tasks=[task],
     verbose=True,
     process=Process.sequential,
-    knowledge={
-        "sources": [string_source],
-        "metadata": {"preference": "personal"}
-    }, # Enable knowledge by adding the sources here. You can also add more sources to the sources list.
+    knowledge_sources=[string_source], # Enable knowledge by adding the sources here. You can also add more sources to the sources list.
 )
 result = crew.kickoff(inputs={"question": "What city does John live in and how old is he?"})
@@ -85,17 +81,6 @@ result = crew.kickoff(inputs={"question": "What city does John live in and how o
 ## Knowledge Configuration

-### Metadata and Filtering
-
-Knowledge sources support metadata for better organization and filtering. Metadata is used to filter the knowledge sources when querying the knowledge store.
-
-```python Code
-knowledge_source = StringKnowledgeSource(
-    content="Users name is John. He is 30 years old and lives in San Francisco.",
-    metadata={"preference": "personal"} # Metadata is used to filter the knowledge sources
-)
-```
-
 ### Chunking Configuration

 Control how content is split for processing by setting the chunk size and overlap.
@@ -116,21 +101,28 @@ You can also configure the embedder for the knowledge store. This is useful if y
 ...
 string_source = StringKnowledgeSource(
     content="Users name is John. He is 30 years old and lives in San Francisco.",
-    metadata={"preference": "personal"}
 )
 crew = Crew(
     ...
-    knowledge={
-        "sources": [string_source],
-        "metadata": {"preference": "personal"},
-        "embedder_config": {
-            "provider": "openai", # Default embedder provider; can be "ollama", "gemini", e.t.c.
-            "config": {"model": "text-embedding-3-small"} # Default embedder model; can be "mxbai-embed-large", "nomic-embed-tex", e.t.c.
-        },
+    knowledge_sources=[string_source],
+    embedder={
+        "provider": "openai",
+        "config": {"model": "text-embedding-3-small"},
     },
 )
 ```
+
+## Clearing Knowledge
+
+If you need to clear the knowledge stored in CrewAI, you can use the `crewai reset-memories` command with the `--knowledge` option.
+
+```bash Command
+crewai reset-memories --knowledge
+```
+
+This is useful when you've updated your knowledge sources and want to ensure that the agents are using the most recent information.
+
 ## Custom Knowledge Sources

 CrewAI allows you to create custom knowledge sources for any type of data by extending the `BaseKnowledgeSource` class. Let's create a practical example that fetches and processes space news articles.
@@ -174,12 +166,12 @@ class SpaceNewsKnowledgeSource(BaseKnowledgeSource):
         formatted = "Space News Articles:\n\n"
         for article in articles:
             formatted += f"""
 Title: {article['title']}
 Published: {article['published_at']}
 Summary: {article['summary']}
 News Site: {article['news_site']}
 URL: {article['url']}
 -------------------"""
         return formatted

     def add(self) -> None:
@@ -189,17 +181,12 @@ URL: {article['url']}
         chunks = self._chunk_text(text)
         self.chunks.extend(chunks)
-        self.save_documents(metadata={
-            "source": "space_news_api",
-            "timestamp": datetime.now().isoformat(),
-            "article_count": self.limit
-        })
+        self._save_documents()

 # Create knowledge source
 recent_news = SpaceNewsKnowledgeSource(
     api_endpoint="https://api.spaceflightnewsapi.net/v4/articles",
     limit=10,
-    metadata={"category": "recent_news", "source": "spaceflight_news"}
 )
 # Create specialized agent
@@ -265,7 +252,7 @@ The latest developments in space exploration, based on recent space news article
    - Implements three key methods:
      - `load_content()`: Fetches articles from the API
      - `_format_articles()`: Structures the articles into readable text
-     - `add()`: Processes and stores the content with metadata
+     - `add()`: Processes and stores the content
 2. **Agent Configuration**:
    - Specialized role as a Space News Analyst
@@ -299,14 +286,12 @@ You can customize the API query by modifying the endpoint URL:
 recent_news = SpaceNewsKnowledgeSource(
     api_endpoint="https://api.spaceflightnewsapi.net/v4/articles",
     limit=20, # Increase the number of articles
-    metadata={"category": "recent_news"}
 )

 # Add search parameters
 recent_news = SpaceNewsKnowledgeSource(
     api_endpoint="https://api.spaceflightnewsapi.net/v4/articles?search=NASA", # Search for NASA news
     limit=10,
-    metadata={"category": "nasa_news"}
 )
 ```
@@ -314,16 +299,14 @@ recent_news = SpaceNewsKnowledgeSource(
 <AccordionGroup>
   <Accordion title="Content Organization">
-    - Use descriptive metadata for better filtering
     - Keep chunk sizes appropriate for your content type
     - Consider content overlap for context preservation
     - Organize related information into separate knowledge sources
   </Accordion>

   <Accordion title="Performance Tips">
-    - Use metadata filtering to narrow search scope
     - Adjust chunk sizes based on content complexity
     - Configure appropriate embedding models
     - Consider using local embedding providers for faster processing
   </Accordion>
 </AccordionGroup>
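
For custom knowledge sources, the docs diff above shows that `add()` now ends with a bare `self._save_documents()` call instead of the old public `save_documents(metadata=...)`. A minimal sketch of the updated pattern, assuming `add()` is the only override your source needs (as in the SpaceNews example); the class name, field, and sample text are illustrative:

```python
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource


class NotesKnowledgeSource(BaseKnowledgeSource):
    """Illustrative custom source; only the add() body matters for this change."""

    notes: str = "John prefers short, factual answers."

    def add(self) -> None:
        # Chunk whatever content the source holds.
        chunks = self._chunk_text(self.notes)
        self.chunks.extend(chunks)
        # Before this commit: self.save_documents(metadata={"source": ..., "timestamp": ...})
        self._save_documents()
```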

View File

@@ -1,11 +1,10 @@
 import os
-from typing import List, Optional, Dict, Any
+from typing import Any, Dict, List, Optional
 from pydantic import BaseModel, ConfigDict, Field
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
-from crewai.utilities.constants import DEFAULT_SCORE_THRESHOLD
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # removes logging from fastembed
@@ -46,9 +45,7 @@ class Knowledge(BaseModel):
             source.storage = self.storage
             source.add()

-    def query(
-        self, query: List[str], limit: int = 3, preference: Optional[str] = None
-    ) -> List[Dict[str, Any]]:
+    def query(self, query: List[str], limit: int = 3) -> List[Dict[str, Any]]:
         """
         Query across all knowledge sources to find the most relevant information.
         Returns the top_k most relevant chunks.
@@ -57,8 +54,6 @@ class Knowledge(BaseModel):
         results = self.storage.search(
             query,
             limit,
-            filter={"preference": preference} if preference else None,
-            score_threshold=DEFAULT_SCORE_THRESHOLD,
         )
         return results
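
Call sites of `Knowledge.query()` therefore lose the `preference` keyword: a query is now just a list of query strings plus an optional `limit`, returning a list of result dicts. A hedged sketch of the new call shape; only the `query()` signature comes from this diff, the construction arguments are assumptions:

```python
from crewai.knowledge.knowledge import Knowledge
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

# Assumed constructor usage: a Knowledge instance wrapping one or more sources.
knowledge = Knowledge(
    sources=[StringKnowledgeSource(content="Users name is John. He lives in San Francisco.")],
)

# Before: knowledge.query(["Where does John live?"], limit=3, preference="personal")
results = knowledge.query(["Where does John live?"], limit=3)  # List[Dict[str, Any]]
for hit in results:
    print(hit)
```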

View File

@@ -1,13 +1,13 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Union, List, Dict, Any
+from typing import Dict, List, Union
 from pydantic import Field
 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
-from crewai.utilities.logger import Logger
 from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
 from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
+from crewai.utilities.logger import Logger

 class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
@@ -49,10 +49,9 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
                 color="red",
             )

-    def save_documents(self, metadata: Dict[str, Any]):
+    def _save_documents(self):
         """Save the documents to the storage."""
-        chunk_metadatas = [metadata.copy() for _ in self.chunks]
-        self.storage.save(self.chunks, chunk_metadatas)
+        self.storage.save(self.chunks)

     def convert_to_path(self, path: Union[Path, str]) -> Path:
         """Convert a path to a Path object."""

View File

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List, Dict, Any, Optional
+from typing import Any, Dict, List, Optional
 import numpy as np
 from pydantic import BaseModel, ConfigDict, Field
@@ -17,7 +17,7 @@ class BaseKnowledgeSource(BaseModel, ABC):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: Dict[str, Any] = Field(default_factory=dict)  # Currently unused
     collection_name: Optional[str] = Field(default=None)

     @abstractmethod
@@ -41,9 +41,9 @@
             for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
         ]

-    def save_documents(self, metadata: Dict[str, Any]):
+    def _save_documents(self):
         """
         Save the documents to the storage.
         This method should be called after the chunks and embeddings are generated.
         """
-        self.storage.save(self.chunks, metadata)
+        self.storage.save(self.chunks)
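
Because `save_documents()` becomes the private `_save_documents()` and no longer accepts metadata, any external code that persisted a source by hand should simply call `add()`, which chunks the content and saves it internally. A small migration sketch; the explicit storage assignment mirrors the `source.storage = self.storage` line in the Knowledge diff above and may not be needed depending on how your source is wired up:

```python
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage

source = StringKnowledgeSource(content="Users name is John. He lives in San Francisco.")
source.storage = KnowledgeStorage()  # normally assigned by Knowledge for you

# Before: source.save_documents(metadata={"preference": "personal"})
source.add()  # chunks the content and calls the private _save_documents() itself
```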

View File

@@ -1,6 +1,6 @@
 import csv
-from typing import Dict, List
 from pathlib import Path
+from typing import Dict, List
 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -30,7 +30,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
         )
         new_chunks = self._chunk_text(content_str)
         self.chunks.extend(new_chunks)
-        self.save_documents(metadata=self.metadata)
+        self._save_documents()

     def _chunk_text(self, text: str) -> List[str]:
         """Utility method to split text into chunks."""

View File

@@ -1,5 +1,6 @@
-from typing import Dict, List
 from pathlib import Path
+from typing import Dict, List
 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -44,7 +45,7 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
         new_chunks = self._chunk_text(content_str)
         self.chunks.extend(new_chunks)
-        self.save_documents(metadata=self.metadata)
+        self._save_documents()

     def _chunk_text(self, text: str) -> List[str]:
         """Utility method to split text into chunks."""

View File

@@ -1,6 +1,6 @@
 import json
-from typing import Any, Dict, List
 from pathlib import Path
+from typing import Any, Dict, List
 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -42,7 +42,7 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         )
         new_chunks = self._chunk_text(content_str)
         self.chunks.extend(new_chunks)
-        self.save_documents(metadata=self.metadata)
+        self._save_documents()

     def _chunk_text(self, text: str) -> List[str]:
         """Utility method to split text into chunks."""

View File

@@ -1,5 +1,5 @@
-from typing import List, Dict
 from pathlib import Path
+from typing import Dict, List
 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -43,7 +43,7 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         for _, text in self.content.items():
             new_chunks = self._chunk_text(text)
             self.chunks.extend(new_chunks)
-        self.save_documents(metadata=self.metadata)
+        self._save_documents()

     def _chunk_text(self, text: str) -> List[str]:
         """Utility method to split text into chunks."""

View File

@@ -24,7 +24,7 @@ class StringKnowledgeSource(BaseKnowledgeSource):
         """Add string content to the knowledge source, chunk it, compute embeddings, and save them."""
         new_chunks = self._chunk_text(self.content)
         self.chunks.extend(new_chunks)
-        self.save_documents(metadata=self.metadata)
+        self._save_documents()

     def _chunk_text(self, text: str) -> List[str]:
         """Utility method to split text into chunks."""

View File

@@ -1,5 +1,5 @@
-from typing import Dict, List
 from pathlib import Path
+from typing import Dict, List
 from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
@@ -24,7 +24,7 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         for _, text in self.content.items():
             new_chunks = self._chunk_text(text)
             self.chunks.extend(new_chunks)
-        self.save_documents(metadata=self.metadata)
+        self._save_documents()

     def _chunk_text(self, text: str) -> List[str]:
         """Utility method to split text into chunks."""

View File

@@ -1,18 +1,20 @@
 import contextlib
+import hashlib
 import io
 import logging
-import chromadb
 import os
+from typing import Any, Dict, List, Optional, Union, cast
+
+import chromadb
 import chromadb.errors
-from crewai.utilities.paths import db_storage_path
-from typing import Optional, List, Dict, Any, Union
-from crewai.utilities import EmbeddingConfigurator
-from crewai.knowledge.storage.base_knowledge_storage import BaseKnowledgeStorage
-import hashlib
-from chromadb.config import Settings
 from chromadb.api import ClientAPI
+from chromadb.api.types import OneOrMany
+from chromadb.config import Settings
+
+from crewai.knowledge.storage.base_knowledge_storage import BaseKnowledgeStorage
+from crewai.utilities import EmbeddingConfigurator
 from crewai.utilities.logger import Logger
+from crewai.utilities.paths import db_storage_path

 @contextlib.contextmanager
@@ -116,11 +118,16 @@ class KnowledgeStorage(BaseKnowledgeStorage):
     def save(
         self,
         documents: List[str],
-        metadata: Union[Dict[str, Any], List[Dict[str, Any]]],
+        metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
     ):
         if self.collection:
             try:
-                metadatas = [metadata] if isinstance(metadata, dict) else metadata
+                if metadata is None:
+                    metadatas: Optional[OneOrMany[chromadb.Metadata]] = None
+                elif isinstance(metadata, list):
+                    metadatas = [cast(chromadb.Metadata, m) for m in metadata]
+                else:
+                    metadatas = cast(chromadb.Metadata, metadata)
                 ids = [
                     hashlib.sha256(doc.encode("utf-8")).hexdigest() for doc in documents