Compare commits

...

4 Commits

Author SHA1 Message Date
Devin AI
0e43a5f25e Add uv.lock and test cassette files
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-04-08 03:25:29 +00:00
Devin AI
f4d2ad3004 Add unidecode dependency to pyproject.toml
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-04-08 03:24:52 +00:00
Devin AI
8fe9099be0 Fix linting issues with import sorting
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-04-08 03:23:31 +00:00
Devin AI
8aec7b3364 Fix #2534: Handle non-ASCII characters in agent roles for knowledge sources
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-04-08 03:22:00 +00:00
7 changed files with 198 additions and 3 deletions

View File

@@ -37,6 +37,7 @@ dependencies = [
"tomli>=2.0.2",
"blinker>=1.9.0",
"json5>=0.10.0",
"unidecode>=1.3.8",
]
[project.urls]

View File

@@ -144,13 +144,15 @@ class Agent(BaseAgent):
self.embedder = crew_embedder
if self.knowledge_sources:
from crewai.utilities import sanitize_collection_name
if isinstance(self.knowledge_sources, list) and all(
isinstance(k, BaseKnowledgeSource) for k in self.knowledge_sources
):
self.knowledge = Knowledge(
sources=self.knowledge_sources,
embedder=self.embedder,
collection_name=self.role,
collection_name=sanitize_collection_name(self.role),
storage=self.knowledge_storage or None,
)
except (TypeError, ValueError) as e:

View File

@@ -11,6 +11,7 @@ from .exceptions.context_window_exceeding_exception import (
LLMContextLengthExceededException,
)
from .embedding_configurator import EmbeddingConfigurator
from .string_utils import sanitize_collection_name
__all__ = [
"Converter",
@@ -25,4 +26,5 @@ __all__ = [
"YamlParser",
"LLMContextLengthExceededException",
"EmbeddingConfigurator",
"sanitize_collection_name",
]

View File

@@ -1,6 +1,8 @@
import re
from typing import Any, Dict, List, Optional, Union
from unidecode import unidecode
def interpolate_only(
input_string: Optional[str],
@@ -80,3 +82,39 @@ def interpolate_only(
result = result.replace(placeholder, value)
return result
def sanitize_collection_name(name: str) -> str:
    """
    Sanitizes a string to be used as a ChromaDB collection name.

    ChromaDB collection names must:
    1. Contain 3-63 characters
    2. Start and end with an alphanumeric character
    3. Otherwise contain only alphanumeric characters, underscores or hyphens (-)
    4. Contain no two consecutive periods (..)
    5. Not be a valid IPv4 address

    Periods are always replaced with underscores, so rules 4 and 5 hold
    automatically for every returned value.

    Args:
        name: The string to sanitize

    Returns:
        A sanitized string that can be used as a ChromaDB collection name
    """
    # Transliterate non-ASCII text (e.g. CJK, accented letters) to ASCII.
    name = unidecode(name)
    # Anything outside word characters and hyphens becomes an underscore,
    # then runs of underscores are collapsed into one.
    name = re.sub(r'[^\w\-]', '_', name)
    name = re.sub(r'_+', '_', name)
    # The name must start with an alphanumeric character.
    name = re.sub(r'^[^a-zA-Z0-9]+', '', name)
    # Enforce the maximum length BEFORE trimming the tail: the cut can land
    # inside a run of hyphens/underscores, which must then be stripped.
    name = name[:63]
    name = re.sub(r'[^a-zA-Z0-9]+$', '', name)
    # Pad last, so that neither the truncation nor the trailing trim above
    # can leave the result shorter than the 3-character minimum.
    if len(name) < 3:
        name = name + 'x' * (3 - len(name))
    return name

View File

@@ -0,0 +1,80 @@
interactions:
- request:
body: '{"input": ["GraphQL is a query language for APIs and a runtime for fulfilling
those queries with your existing data."], "model": "text-embedding-3-small",
"encoding_format": "base64"}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '183'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- x64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- Linux
x-stainless-package-version:
- 1.68.2
x-stainless-read-timeout:
- '600'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/embeddings
response:
content: "{\n \"error\": {\n \"message\": \"Incorrect API key provided:
sk-proj-********************************************************************************************************************************************************sLcA.
You can find your API key at https://platform.openai.com/account/api-keys.\",\n
\ \"type\": \"invalid_request_error\",\n \"param\": null,\n \"code\":
\"invalid_api_key\"\n }\n}\n"
headers:
CF-RAY:
- 92ce9b496bd476a0-SEA
Connection:
- keep-alive
Content-Length:
- '414'
Content-Type:
- application/json; charset=utf-8
Date:
- Tue, 08 Apr 2025 03:16:33 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=S0TpOceTcU79JlMhdky0iDf4sfHcnaP9JOimob0aF1M-1744082193-1.0.1.1-owOv6J9Tl7Kh5ltS4PUm_QuIO.Z98d8uzA4T19GK1BSWsRL6W6qUKRO1OPDRRtX6hYUcriLQUhl3drxTf_Ck7oq1ueg8sSI3NSvmKQX4xMY;
path=/; expires=Tue, 08-Apr-25 03:46:33 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=033vbIHDWmO2i_U_qbo8bvpL4uiHSy5yuML2pM2GnKM-1744082193025-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
X-Content-Type-Options:
- nosniff
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
- Origin
x-request-id:
- req_4d2830717fd2a87766200324d4a30907
http_version: HTTP/1.1
status_code: 401
version: 1

View File

@@ -0,0 +1,61 @@
import pytest
from crewai.utilities import sanitize_collection_name
def test_sanitize_collection_name_with_non_ascii_chars():
    """Non-ASCII role strings must sanitize to valid collection names."""
    roles = (
        "一位有 20 年经验的 GraphQL 查询专家",
        "Café Owner & Barista (España) 🇪🇸",
    )
    for role in roles:
        result = sanitize_collection_name(role)
        # Length bounds.
        assert len(result) >= 3
        assert len(result) <= 63
        # Alphanumeric at both ends.
        assert result[0].isalnum()
        assert result[-1].isalnum()
        # Only alphanumerics, underscores and hyphens anywhere in the name.
        assert all(ch.isalnum() or ch in ('_', '-') for ch in result)
        # No consecutive underscores
        assert '__' not in result
def test_sanitize_collection_name_edge_cases():
    """Test edge cases for sanitize_collection_name function."""
    # Empty input: nothing survives sanitizing, so the result is padding only.
    empty_role = ""
    sanitized_name = sanitize_collection_name(empty_role)
    assert len(sanitized_name) >= 3  # Should be padded to minimum length

    # Input made up solely of characters that are not allowed in a name.
    special_only = "!@#$%^&*()"
    sanitized_name = sanitize_collection_name(special_only)
    assert len(sanitized_name) >= 3
    assert sanitized_name[0].isalnum()
    assert sanitized_name[-1].isalnum()

    # Input longer than the maximum collection-name length.
    long_role = "a" * 100
    sanitized_name = sanitize_collection_name(long_role)
    assert len(sanitized_name) <= 63

    # A genuine run of spaces: each space maps to an underscore, so this
    # only passes if repeated underscores are collapsed.
    consecutive_spaces = "Hello     World"
    sanitized_name = sanitize_collection_name(consecutive_spaces)
    assert "__" not in sanitized_name
def test_sanitize_collection_name_reproduces_issue_2534():
    """Regression check for the role string reported in issue #2534."""
    result = sanitize_collection_name("一位有 20 年经验的 GraphQL 查询专家")
    # Length bounds.
    assert len(result) >= 3
    assert len(result) <= 63
    # Alphanumeric first and last characters.
    first_char, last_char = result[0], result[-1]
    assert first_char.isalnum()
    assert last_char.isalnum()
    # Only alphanumerics, underscores and hyphens anywhere in the name.
    assert all(ch.isalnum() or ch in ('_', '-') for ch in result)
    # No consecutive underscores
    assert '__' not in result

15
uv.lock generated
View File

@@ -1,5 +1,4 @@
version = 1
revision = 1
requires-python = ">=3.10, <3.13"
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
@@ -632,6 +631,7 @@ dependencies = [
{ name = "regex" },
{ name = "tomli" },
{ name = "tomli-w" },
{ name = "unidecode" },
{ name = "uv" },
]
@@ -720,9 +720,9 @@ requires-dist = [
{ name = "tiktoken", marker = "extra == 'embeddings'", specifier = "~=0.7.0" },
{ name = "tomli", specifier = ">=2.0.2" },
{ name = "tomli-w", specifier = ">=1.1.0" },
{ name = "unidecode", specifier = ">=1.3.8" },
{ name = "uv", specifier = ">=0.4.25" },
]
provides-extras = ["tools", "embeddings", "agentops", "fastembed", "pdfplumber", "pandas", "openpyxl", "mem0", "docling", "aisuite"]
[package.metadata.requires-dev]
dev = [
@@ -2975,6 +2975,7 @@ name = "nvidia-nccl-cu12"
version = "2.20.5"
source = { registry = "https://pypi.org/simple" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/bb/d09dda47c881f9ff504afd6f9ca4f502ded6d8fc2f572cacc5e39da91c28/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01", size = 176238458 },
{ url = "https://files.pythonhosted.org/packages/4b/2a/0a131f572aa09f741c30ccd45a8e56316e8be8dfc7bc19bf0ab7cfef7b19/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56", size = 176249402 },
]
@@ -2984,6 +2985,7 @@ version = "12.6.85"
source = { registry = "https://pypi.org/simple" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971 },
{ url = "https://files.pythonhosted.org/packages/31/db/dc71113d441f208cdfe7ae10d4983884e13f464a6252450693365e166dcf/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41", size = 19270338 },
]
[[package]]
@@ -5175,6 +5177,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 },
]
[[package]]
name = "unidecode"
version = "1.3.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f7/89/19151076a006b9ac0dd37b1354e031f5297891ee507eb624755e58e10d3e/Unidecode-1.3.8.tar.gz", hash = "sha256:cfdb349d46ed3873ece4586b96aa75258726e2fa8ec21d6f00a591d98806c2f4", size = 192701 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/84/b7/6ec57841fb67c98f52fc8e4a2d96df60059637cba077edc569a302a8ffc7/Unidecode-1.3.8-py3-none-any.whl", hash = "sha256:d130a61ce6696f8148a3bd8fe779c99adeb4b870584eeb9526584e9aa091fd39", size = 235494 },
]
[[package]]
name = "urllib3"
version = "2.2.3"