refactor: unify rag storage with instance-specific client support (#3455)
Some checks failed
Notify Downstream / notify-downstream (push) Has been cancelled
Update Test Durations / update-durations (3.10) (push) Has been cancelled
Update Test Durations / update-durations (3.11) (push) Has been cancelled
Update Test Durations / update-durations (3.12) (push) Has been cancelled
Update Test Durations / update-durations (3.13) (push) Has been cancelled
Build uv cache / build-cache (3.10) (push) Has been cancelled
Build uv cache / build-cache (3.11) (push) Has been cancelled
Build uv cache / build-cache (3.12) (push) Has been cancelled
Build uv cache / build-cache (3.13) (push) Has been cancelled

- ignore line length errors globally
- migrate knowledge/memory and crew query_knowledge to `SearchResult`
- remove legacy chromadb utils; fix empty metadata handling
- restore openai as default embedding provider; support instance-specific clients
- update and fix tests for `SearchResult` migration and rag changes
This commit is contained in:
Greyson LaLonde
2025-09-17 14:46:54 -04:00
committed by GitHub
parent 81bd81e5f5
commit f28e78c5ba
30 changed files with 1956 additions and 976 deletions

View File

@@ -285,6 +285,43 @@ class TestChromaDBClient:
metadatas=[{"source": "test1"}, {"source": "test2"}],
)
def test_add_documents_without_metadata(self, client, mock_chromadb_client) -> None:
"""Test add_documents with documents that have no metadata."""
mock_collection = Mock()
mock_chromadb_client.get_collection.return_value = mock_collection
documents: list[BaseRecord] = [
{"content": "Document without metadata"},
{"content": "Another document", "metadata": None},
{"content": "Document with metadata", "metadata": {"key": "value"}},
]
client.add_documents(collection_name="test_collection", documents=documents)
# Verify upsert was called with empty dicts for missing metadata
mock_collection.upsert.assert_called_once()
call_args = mock_collection.upsert.call_args
assert call_args[1]["metadatas"] == [{}, {}, {"key": "value"}]
def test_add_documents_all_without_metadata(
self, client, mock_chromadb_client
) -> None:
"""Test add_documents when all documents have no metadata."""
mock_collection = Mock()
mock_chromadb_client.get_collection.return_value = mock_collection
documents: list[BaseRecord] = [
{"content": "Document 1"},
{"content": "Document 2"},
{"content": "Document 3"},
]
client.add_documents(collection_name="test_collection", documents=documents)
mock_collection.upsert.assert_called_once()
call_args = mock_collection.upsert.call_args
assert call_args[1]["metadatas"] is None
def test_add_documents_empty_list_raises_error(
self, client, mock_chromadb_client
) -> None:
@@ -358,6 +395,31 @@ class TestChromaDBClient:
metadatas=[{"source": "test1"}, {"source": "test2"}],
)
@pytest.mark.asyncio
async def test_aadd_documents_without_metadata(
self, async_client, mock_async_chromadb_client
) -> None:
"""Test aadd_documents with documents that have no metadata."""
mock_collection = AsyncMock()
mock_async_chromadb_client.get_collection = AsyncMock(
return_value=mock_collection
)
documents: list[BaseRecord] = [
{"content": "Document without metadata"},
{"content": "Another document", "metadata": None},
{"content": "Document with metadata", "metadata": {"key": "value"}},
]
await async_client.aadd_documents(
collection_name="test_collection", documents=documents
)
# Verify upsert was called with empty dicts for missing metadata
mock_collection.upsert.assert_called_once()
call_args = mock_collection.upsert.call_args
assert call_args[1]["metadatas"] == [{}, {}, {"key": "value"}]
@pytest.mark.asyncio
async def test_aadd_documents_empty_list_raises_error(
self, async_client, mock_async_chromadb_client

View File

@@ -0,0 +1,95 @@
"""Tests for ChromaDB utility functions."""
from crewai.rag.chromadb.utils import (
MAX_COLLECTION_LENGTH,
MIN_COLLECTION_LENGTH,
_is_ipv4_pattern,
_sanitize_collection_name,
)
class TestChromaDBUtils:
"""Test suite for ChromaDB utility functions."""
def test_sanitize_collection_name_long_name(self) -> None:
"""Test sanitizing a very long collection name."""
long_name = "This is an extremely long role name that will definitely exceed the ChromaDB collection name limit of 63 characters and cause an error when used as a collection name"
sanitized = _sanitize_collection_name(long_name)
assert len(sanitized) <= MAX_COLLECTION_LENGTH
assert sanitized[0].isalnum()
assert sanitized[-1].isalnum()
assert all(c.isalnum() or c in ["_", "-"] for c in sanitized)
def test_sanitize_collection_name_special_chars(self) -> None:
"""Test sanitizing a name with special characters."""
special_chars = "Agent@123!#$%^&*()"
sanitized = _sanitize_collection_name(special_chars)
assert sanitized[0].isalnum()
assert sanitized[-1].isalnum()
assert all(c.isalnum() or c in ["_", "-"] for c in sanitized)
def test_sanitize_collection_name_short_name(self) -> None:
"""Test sanitizing a very short name."""
short_name = "A"
sanitized = _sanitize_collection_name(short_name)
assert len(sanitized) >= MIN_COLLECTION_LENGTH
assert sanitized[0].isalnum()
assert sanitized[-1].isalnum()
def test_sanitize_collection_name_bad_ends(self) -> None:
"""Test sanitizing a name with non-alphanumeric start/end."""
bad_ends = "_Agent_"
sanitized = _sanitize_collection_name(bad_ends)
assert sanitized[0].isalnum()
assert sanitized[-1].isalnum()
def test_sanitize_collection_name_none(self) -> None:
"""Test sanitizing a None value."""
sanitized = _sanitize_collection_name(None)
assert sanitized == "default_collection"
def test_sanitize_collection_name_ipv4_pattern(self) -> None:
"""Test sanitizing an IPv4 address."""
ipv4 = "192.168.1.1"
sanitized = _sanitize_collection_name(ipv4)
assert sanitized.startswith("ip_")
assert sanitized[0].isalnum()
assert sanitized[-1].isalnum()
assert all(c.isalnum() or c in ["_", "-"] for c in sanitized)
def test_is_ipv4_pattern(self) -> None:
"""Test IPv4 pattern detection."""
assert _is_ipv4_pattern("192.168.1.1") is True
assert _is_ipv4_pattern("not.an.ip.address") is False
def test_sanitize_collection_name_properties(self) -> None:
"""Test that sanitized collection names always meet ChromaDB requirements."""
test_cases: list[str] = [
"A" * 100, # Very long name
"_start_with_underscore",
"end_with_underscore_",
"contains@special#characters",
"192.168.1.1", # IPv4 address
"a" * 2, # Too short
]
for test_case in test_cases:
sanitized = _sanitize_collection_name(test_case)
assert len(sanitized) >= MIN_COLLECTION_LENGTH
assert len(sanitized) <= MAX_COLLECTION_LENGTH
assert sanitized[0].isalnum()
assert sanitized[-1].isalnum()
def test_sanitize_collection_name_empty_string(self) -> None:
"""Test sanitizing an empty string."""
sanitized = _sanitize_collection_name("")
assert sanitized == "default_collection"
def test_sanitize_collection_name_whitespace_only(self) -> None:
"""Test sanitizing a string with only whitespace."""
sanitized = _sanitize_collection_name(" ")
assert (
sanitized == "a__z"
) # Spaces become underscores, padded to meet requirements
assert len(sanitized) >= MIN_COLLECTION_LENGTH
assert sanitized[0].isalnum()
assert sanitized[-1].isalnum()