mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-11 17:18:29 +00:00
fix: delegate collection name sanitization to knowledge store
This commit is contained in:
committed by
Lucas Gomide
parent
df25703cc2
commit
6b14ffcffb
@@ -142,20 +142,13 @@ class Agent(BaseAgent):
|
||||
self.embedder = crew_embedder
|
||||
|
||||
if self.knowledge_sources:
|
||||
try:
|
||||
from crewai.utilities import sanitize_collection_name
|
||||
knowledge_agent_name = sanitize_collection_name(self.role)
|
||||
except Exception as e:
|
||||
self._logger.warning(f"Error sanitizing collection name: {e}")
|
||||
knowledge_agent_name = "default_agent"
|
||||
|
||||
if isinstance(self.knowledge_sources, list) and all(
|
||||
isinstance(k, BaseKnowledgeSource) for k in self.knowledge_sources
|
||||
):
|
||||
self.knowledge = Knowledge(
|
||||
sources=self.knowledge_sources,
|
||||
embedder=self.embedder,
|
||||
collection_name=knowledge_agent_name,
|
||||
collection_name=self.role,
|
||||
storage=self.knowledge_storage or None,
|
||||
)
|
||||
except (TypeError, ValueError) as e:
|
||||
|
||||
@@ -98,8 +98,11 @@ class KnowledgeStorage(BaseKnowledgeStorage):
|
||||
else "knowledge"
|
||||
)
|
||||
if self.app:
|
||||
from crewai.utilities.chromadb import sanitize_collection_name
|
||||
|
||||
self.collection = self.app.get_or_create_collection(
|
||||
name=collection_name, embedding_function=self.embedder
|
||||
name=sanitize_collection_name(collection_name),
|
||||
embedding_function=self.embedder,
|
||||
)
|
||||
else:
|
||||
raise Exception("Vector Database Client not initialized")
|
||||
|
||||
@@ -7,7 +7,6 @@ from .parser import YamlParser
|
||||
from .printer import Printer
|
||||
from .prompts import Prompts
|
||||
from .rpm_controller import RPMController
|
||||
from .string_utils import sanitize_collection_name, is_ipv4_pattern
|
||||
from .exceptions.context_window_exceeding_exception import (
|
||||
LLMContextLengthExceededException,
|
||||
)
|
||||
@@ -26,6 +25,4 @@ __all__ = [
|
||||
"YamlParser",
|
||||
"LLMContextLengthExceededException",
|
||||
"EmbeddingConfigurator",
|
||||
"sanitize_collection_name",
|
||||
"is_ipv4_pattern",
|
||||
]
|
||||
|
||||
62
src/crewai/utilities/chromadb.py
Normal file
62
src/crewai/utilities/chromadb.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
MIN_COLLECTION_LENGTH = 3
|
||||
MAX_COLLECTION_LENGTH = 63
|
||||
DEFAULT_COLLECTION = "default_collection"
|
||||
|
||||
# Compiled regex patterns for better performance
|
||||
INVALID_CHARS_PATTERN = re.compile(r"[^a-zA-Z0-9_-]")
|
||||
IPV4_PATTERN = re.compile(r"^(\d{1,3}\.){3}\d{1,3}$")
|
||||
|
||||
|
||||
def is_ipv4_pattern(name: str) -> bool:
|
||||
"""
|
||||
Check if a string matches an IPv4 address pattern.
|
||||
|
||||
Args:
|
||||
name: The string to check
|
||||
|
||||
Returns:
|
||||
True if the string matches an IPv4 pattern, False otherwise
|
||||
"""
|
||||
return bool(IPV4_PATTERN.match(name))
|
||||
|
||||
|
||||
def sanitize_collection_name(name: Optional[str]) -> str:
|
||||
"""
|
||||
Sanitize a collection name to meet ChromaDB requirements:
|
||||
1. 3-63 characters long
|
||||
2. Starts and ends with alphanumeric character
|
||||
3. Contains only alphanumeric characters, underscores, or hyphens
|
||||
4. No consecutive periods
|
||||
5. Not a valid IPv4 address
|
||||
|
||||
Args:
|
||||
name: The original collection name to sanitize
|
||||
|
||||
Returns:
|
||||
A sanitized collection name that meets ChromaDB requirements
|
||||
"""
|
||||
if not name:
|
||||
return DEFAULT_COLLECTION
|
||||
|
||||
if is_ipv4_pattern(name):
|
||||
name = f"ip_{name}"
|
||||
|
||||
sanitized = INVALID_CHARS_PATTERN.sub("_", name)
|
||||
|
||||
if not sanitized[0].isalnum():
|
||||
sanitized = "a" + sanitized
|
||||
|
||||
if not sanitized[-1].isalnum():
|
||||
sanitized = sanitized[:-1] + "z"
|
||||
|
||||
if len(sanitized) < MIN_COLLECTION_LENGTH:
|
||||
sanitized = sanitized + "x" * (MIN_COLLECTION_LENGTH - len(sanitized))
|
||||
if len(sanitized) > MAX_COLLECTION_LENGTH:
|
||||
sanitized = sanitized[:MAX_COLLECTION_LENGTH]
|
||||
if not sanitized[-1].isalnum():
|
||||
sanitized = sanitized[:-1] + "z"
|
||||
|
||||
return sanitized
|
||||
@@ -80,74 +80,3 @@ def interpolate_only(
|
||||
result = result.replace(placeholder, value)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
from typing import Optional
|
||||
|
||||
# Constants for ChromaDB collection name requirements
|
||||
MIN_LENGTH = 3
|
||||
MAX_LENGTH = 63
|
||||
DEFAULT_COLLECTION = "default_collection"
|
||||
|
||||
# Compiled regex patterns for better performance
|
||||
INVALID_CHARS_PATTERN = re.compile(r"[^a-zA-Z0-9_-]")
|
||||
IPV4_PATTERN = re.compile(r"^(\d{1,3}\.){3}\d{1,3}$")
|
||||
|
||||
|
||||
def is_ipv4_pattern(name: str) -> bool:
|
||||
"""
|
||||
Check if a string matches an IPv4 address pattern.
|
||||
|
||||
Args:
|
||||
name: The string to check
|
||||
|
||||
Returns:
|
||||
True if the string matches an IPv4 pattern, False otherwise
|
||||
"""
|
||||
return bool(IPV4_PATTERN.match(name))
|
||||
|
||||
|
||||
def sanitize_collection_name(name: Optional[str]) -> str:
|
||||
"""
|
||||
Sanitize a collection name to meet ChromaDB requirements:
|
||||
1. 3-63 characters long
|
||||
2. Starts and ends with alphanumeric character
|
||||
3. Contains only alphanumeric characters, underscores, or hyphens
|
||||
4. No consecutive periods
|
||||
5. Not a valid IPv4 address
|
||||
|
||||
Args:
|
||||
name: The original collection name to sanitize
|
||||
|
||||
Returns:
|
||||
A sanitized collection name that meets ChromaDB requirements
|
||||
"""
|
||||
if not name:
|
||||
return DEFAULT_COLLECTION
|
||||
|
||||
# Handle IPv4 pattern
|
||||
if is_ipv4_pattern(name):
|
||||
name = f"ip_{name}"
|
||||
|
||||
# Replace spaces and invalid characters with underscores
|
||||
sanitized = INVALID_CHARS_PATTERN.sub("_", name)
|
||||
|
||||
# Ensure it starts with alphanumeric
|
||||
if not sanitized[0].isalnum():
|
||||
sanitized = "a" + sanitized
|
||||
|
||||
# Ensure it ends with alphanumeric
|
||||
if not sanitized[-1].isalnum():
|
||||
sanitized = sanitized[:-1] + "z"
|
||||
|
||||
# Ensure length is between MIN_LENGTH-MAX_LENGTH characters
|
||||
if len(sanitized) < MIN_LENGTH:
|
||||
# Add padding with alphanumeric character at the end
|
||||
sanitized = sanitized + "x" * (MIN_LENGTH - len(sanitized))
|
||||
if len(sanitized) > MAX_LENGTH:
|
||||
sanitized = sanitized[:MAX_LENGTH]
|
||||
# Ensure it still ends with alphanumeric after truncation
|
||||
if not sanitized[-1].isalnum():
|
||||
sanitized = sanitized[:-1] + "z"
|
||||
|
||||
return sanitized
|
||||
|
||||
Reference in New Issue
Block a user