mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 08:08:32 +00:00
Fix #2534: Handle non-ASCII characters in agent roles for knowledge sources
Co-Authored-By: Joe Moura <joao@crewai.com>
This commit is contained in:
@@ -144,13 +144,15 @@ class Agent(BaseAgent):
|
|||||||
self.embedder = crew_embedder
|
self.embedder = crew_embedder
|
||||||
|
|
||||||
if self.knowledge_sources:
|
if self.knowledge_sources:
|
||||||
|
from crewai.utilities import sanitize_collection_name
|
||||||
|
|
||||||
if isinstance(self.knowledge_sources, list) and all(
|
if isinstance(self.knowledge_sources, list) and all(
|
||||||
isinstance(k, BaseKnowledgeSource) for k in self.knowledge_sources
|
isinstance(k, BaseKnowledgeSource) for k in self.knowledge_sources
|
||||||
):
|
):
|
||||||
self.knowledge = Knowledge(
|
self.knowledge = Knowledge(
|
||||||
sources=self.knowledge_sources,
|
sources=self.knowledge_sources,
|
||||||
embedder=self.embedder,
|
embedder=self.embedder,
|
||||||
collection_name=self.role,
|
collection_name=sanitize_collection_name(self.role),
|
||||||
storage=self.knowledge_storage or None,
|
storage=self.knowledge_storage or None,
|
||||||
)
|
)
|
||||||
except (TypeError, ValueError) as e:
|
except (TypeError, ValueError) as e:
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from .exceptions.context_window_exceeding_exception import (
|
|||||||
LLMContextLengthExceededException,
|
LLMContextLengthExceededException,
|
||||||
)
|
)
|
||||||
from .embedding_configurator import EmbeddingConfigurator
|
from .embedding_configurator import EmbeddingConfigurator
|
||||||
|
from .string_utils import sanitize_collection_name
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Converter",
|
"Converter",
|
||||||
@@ -25,4 +26,5 @@ __all__ = [
|
|||||||
"YamlParser",
|
"YamlParser",
|
||||||
"LLMContextLengthExceededException",
|
"LLMContextLengthExceededException",
|
||||||
"EmbeddingConfigurator",
|
"EmbeddingConfigurator",
|
||||||
|
"sanitize_collection_name",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
from unidecode import unidecode
|
||||||
|
|
||||||
|
|
||||||
def interpolate_only(
|
def interpolate_only(
|
||||||
@@ -80,3 +81,39 @@ def interpolate_only(
|
|||||||
result = result.replace(placeholder, value)
|
result = result.replace(placeholder, value)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_collection_name(name: str) -> str:
    """
    Sanitizes a string to be used as a ChromaDB collection name.

    ChromaDB collection names must:
    1. Contain 3-63 characters
    2. Start and end with an alphanumeric character
    3. Otherwise contain only alphanumeric characters, underscores or hyphens (-)
    4. Contain no two consecutive periods (..)
    5. Not be a valid IPv4 address

    Rules 4 and 5 hold trivially for the output of this function: every
    character that is not an ASCII letter, digit, underscore or hyphen
    (periods included) is replaced with an underscore.

    Args:
        name: The string to sanitize

    Returns:
        A sanitized string that can be used as a ChromaDB collection name.
        Inputs with no usable characters (e.g. "" or "!!!") yield "xxx".
    """
    # Transliterate non-ASCII text (accents, CJK, emoji, ...) to ASCII.
    name = unidecode(name)

    # Anything outside the allowed alphabet becomes an underscore, then runs
    # of underscores are collapsed into one.
    name = re.sub(r"[^a-zA-Z0-9_\-]", "_", name)
    name = re.sub(r"_+", "_", name)

    # The name must start and end with an alphanumeric character.
    name = re.sub(r"^[^a-zA-Z0-9]+", "", name)
    name = re.sub(r"[^a-zA-Z0-9]+$", "", name)

    if len(name) > 63:
        # Truncation can expose a trailing separator, so strip the end again.
        name = name[:63]
        name = re.sub(r"[^a-zA-Z0-9]+$", "", name)

    # Pad last: stripping after truncation can shrink the name below the
    # minimum (hyphens are not collapsed, so "a" followed by 62 hyphens would
    # otherwise be returned as just "a").
    if len(name) < 3:
        name = name + "x" * (3 - len(name))

    return name
|
||||||
|
|||||||
60
tests/test_agent_non_ascii.py
Normal file
60
tests/test_agent_non_ascii.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
import pytest
|
||||||
|
from crewai.utilities import sanitize_collection_name
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_collection_name_with_non_ascii_chars():
    """Non-ASCII roles must be turned into names that satisfy the naming rules."""
    roles = (
        "一位有 20 年经验的 GraphQL 查询专家",
        "Café Owner & Barista (España) 🇪🇸",
    )

    for role in roles:
        result = sanitize_collection_name(role)

        # Length bounds.
        assert len(result) >= 3
        assert len(result) <= 63
        # Alphanumeric at both ends.
        assert result[0].isalnum()
        assert result[-1].isalnum()
        # Only alphanumerics, underscores and hyphens anywhere in the name.
        assert all(ch.isalnum() or ch in ('_', '-') for ch in result)
        # No consecutive underscores.
        assert '__' not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_collection_name_edge_cases():
    """Test edge cases for sanitize_collection_name function."""
    # Empty input has nothing to keep, so it must be padded.
    empty_role = ""
    sanitized_name = sanitize_collection_name(empty_role)
    assert len(sanitized_name) >= 3  # Should be padded to minimum length

    # Input made only of characters that get replaced and then trimmed away.
    special_only = "!@#$%^&*()"
    sanitized_name = sanitize_collection_name(special_only)
    assert len(sanitized_name) >= 3
    assert sanitized_name[0].isalnum()
    assert sanitized_name[-1].isalnum()

    # Over-long input must be truncated to the maximum length.
    long_role = "a" * 100
    sanitized_name = sanitize_collection_name(long_role)
    assert len(sanitized_name) <= 63

    # A run of several spaces is required here: a single space can never
    # produce "__", so it would not exercise underscore collapsing at all.
    consecutive_spaces = "Hello    World"
    sanitized_name = sanitize_collection_name(consecutive_spaces)
    assert "__" not in sanitized_name
    assert sanitized_name == "Hello_World"
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_collection_name_reproduces_issue_2534():
    """Regression check for issue #2534 using the role string from the report."""
    result = sanitize_collection_name("一位有 20 年经验的 GraphQL 查询专家")
    size = len(result)

    # Length bounds.
    assert size >= 3
    assert size <= 63

    # Alphanumeric first and last character.
    first, last = result[0], result[-1]
    assert first.isalnum()
    assert last.isalnum()

    # Only alphanumerics, underscores and hyphens are allowed.
    for ch in result:
        assert ch.isalnum() or ch == '_' or ch == '-'

    # No consecutive underscores.
    assert result.find('__') == -1
|
||||||
Reference in New Issue
Block a user