Fix #2351: Sanitize collection names to meet ChromaDB requirements

Co-Authored-By: Joe Moura <joao@crewai.com>
This commit is contained in:
Devin AI
2025-03-12 16:48:40 +00:00
committed by Lucas Gomide
parent 102836a2c2
commit 12a815e5db
5 changed files with 286 additions and 226 deletions

View File

@@ -142,8 +142,8 @@ class Agent(BaseAgent):
self.embedder = crew_embedder
if self.knowledge_sources:
full_pattern = re.compile(r"[^a-zA-Z0-9\-_\r\n]|(\.\.)")
knowledge_agent_name = f"{re.sub(full_pattern, '_', self.role)}"
from crewai.utilities import sanitize_collection_name
knowledge_agent_name = sanitize_collection_name(self.role)
if isinstance(self.knowledge_sources, list) and all(
isinstance(k, BaseKnowledgeSource) for k in self.knowledge_sources
):

View File

@@ -7,6 +7,7 @@ from .parser import YamlParser
from .printer import Printer
from .prompts import Prompts
from .rpm_controller import RPMController
from .string_utils import sanitize_collection_name
from .exceptions.context_window_exceeding_exception import (
LLMContextLengthExceededException,
)
@@ -25,4 +26,5 @@ __all__ = [
"YamlParser",
"LLMContextLengthExceededException",
"EmbeddingConfigurator",
"sanitize_collection_name",
]

View File

@@ -80,3 +80,48 @@ def interpolate_only(
result = result.replace(placeholder, value)
return result
from typing import Optional
def sanitize_collection_name(
    name: Optional[str], max_collection_length: int = 63
) -> str:
    """
    Sanitize a collection name to meet ChromaDB requirements:
    1. 3-63 characters long
    2. Starts and ends with alphanumeric character
    3. Contains only alphanumeric characters, underscores, or hyphens
    4. No consecutive periods
    5. Not a valid IPv4 address

    Requirements 4 and 5 hold as a side effect of requirement 3: every
    period is replaced with an underscore, so the result can contain
    neither ".." nor a dotted IPv4 address.

    Args:
        name: The original collection name to sanitize. ``None`` or an
            empty string falls back to ``"default_collection"``.
        max_collection_length: Upper bound on the length of the result.
            Defaults to 63; must be at least 3.

    Returns:
        A sanitized collection name that meets ChromaDB requirements

    Raises:
        ValueError: If ``max_collection_length`` is smaller than the
            3-character minimum, which would make the length
            requirement unsatisfiable.
    """
    min_collection_length = 3
    if max_collection_length < min_collection_length:
        raise ValueError(
            "max_collection_length must be at least "
            f"{min_collection_length}, got {max_collection_length}"
        )

    # The fallback goes through the same pipeline as any other name so it
    # also honours a caller-supplied max_collection_length.
    if not name:
        name = "default_collection"

    # Replace spaces and invalid characters with underscores
    sanitized = re.sub(r"[^a-zA-Z0-9_-]", "_", name)

    # Ensure it starts with alphanumeric (prepend rather than overwrite so
    # no leading character of the original name is lost).
    if not sanitized[0].isalnum():
        sanitized = "a" + sanitized

    # Truncate *before* fixing the last character: a single pass then
    # guarantees the final character is alphanumeric.
    sanitized = sanitized[:max_collection_length]
    if not sanitized[-1].isalnum():
        sanitized = sanitized[:-1] + "z"

    # Pad short names with an alphanumeric character up to the minimum.
    return sanitized.ljust(min_collection_length, "x")