feat: restructure project as UV workspace with crewai in lib/

This commit is contained in:
Greyson LaLonde
2025-09-26 14:29:28 -04:00
parent 74b5c88834
commit daf6f679ff
763 changed files with 1181 additions and 398 deletions

View File

@@ -1,774 +0,0 @@
"""Tests for ChromaDBClient implementation."""
from unittest.mock import AsyncMock, Mock
import pytest
from crewai.rag.chromadb.client import ChromaDBClient
from crewai.rag.types import BaseRecord
@pytest.fixture
def mock_chromadb_client():
    """Provide a ``Mock`` constrained to the synchronous ChromaDB ``ClientAPI``."""
    # Imported lazily, inside the fixture, exactly as the fixture has always done.
    from chromadb.api import ClientAPI

    sync_api_double = Mock(spec=ClientAPI)
    return sync_api_double
@pytest.fixture
def mock_async_chromadb_client():
    """Provide a ``Mock`` constrained to the asynchronous ChromaDB ``AsyncClientAPI``."""
    # Imported lazily, inside the fixture, exactly as the fixture has always done.
    from chromadb.api import AsyncClientAPI

    async_api_double = Mock(spec=AsyncClientAPI)
    return async_api_double
@pytest.fixture
def client(mock_chromadb_client) -> ChromaDBClient:
    """Build a ChromaDBClient around the mocked synchronous client.

    The embedding function is a plain ``Mock``; the batch size is left at the
    constructor's default.
    """
    embedding_double = Mock()
    return ChromaDBClient(
        client=mock_chromadb_client,
        embedding_function=embedding_double,
    )
@pytest.fixture
def client_with_batch_size(mock_chromadb_client) -> ChromaDBClient:
    """Build a ChromaDBClient around the mocked synchronous client with batch size 2."""
    constructor_kwargs = {
        "client": mock_chromadb_client,
        "embedding_function": Mock(),
        "default_batch_size": 2,
    }
    return ChromaDBClient(**constructor_kwargs)
@pytest.fixture
def async_client_with_batch_size(mock_async_chromadb_client) -> ChromaDBClient:
    """Build a ChromaDBClient around the mocked async client with batch size 2."""
    constructor_kwargs = {
        "client": mock_async_chromadb_client,
        "embedding_function": Mock(),
        "default_batch_size": 2,
    }
    return ChromaDBClient(**constructor_kwargs)
@pytest.fixture
def async_client(mock_async_chromadb_client) -> ChromaDBClient:
    """Build a ChromaDBClient around the mocked asynchronous client.

    The embedding function is a plain ``Mock``; the batch size is left at the
    constructor's default.
    """
    embedding_double = Mock()
    return ChromaDBClient(
        client=mock_async_chromadb_client,
        embedding_function=embedding_double,
    )
class TestChromaDBClient:
    """Test suite for ChromaDBClient.

    Each test drives a ChromaDBClient whose underlying client is a ``Mock``
    and asserts on the calls forwarded to that mock (or to the mocked
    collection it returns). Async variants replace the relevant mock methods
    with ``AsyncMock`` so they can be awaited.
    """

    def test_create_collection(self, client, mock_chromadb_client) -> None:
        """Test that create_collection calls the underlying client correctly."""
        client.create_collection(collection_name="test_collection")

        mock_chromadb_client.create_collection.assert_called_once_with(
            name="test_collection",
            configuration=None,
            metadata={"hnsw:space": "cosine"},
            embedding_function=client.embedding_function,
            data_loader=None,
            get_or_create=False,
        )

    def test_create_collection_with_all_params(
        self, client, mock_chromadb_client
    ) -> None:
        """Test create_collection with all optional parameters."""
        mock_config = Mock()
        mock_metadata = {"key": "value"}
        mock_embedding_func = Mock()
        mock_data_loader = Mock()

        client.create_collection(
            collection_name="test_collection",
            configuration=mock_config,
            metadata=mock_metadata,
            embedding_function=mock_embedding_func,
            data_loader=mock_data_loader,
            get_or_create=True,
        )

        mock_chromadb_client.create_collection.assert_called_once_with(
            name="test_collection",
            configuration=mock_config,
            metadata=mock_metadata,
            embedding_function=mock_embedding_func,
            data_loader=mock_data_loader,
            get_or_create=True,
        )

    @pytest.mark.asyncio
    async def test_acreate_collection(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test that acreate_collection calls the underlying client correctly."""
        # Make the mock's create_collection an AsyncMock so it can be awaited.
        mock_async_chromadb_client.create_collection = AsyncMock(return_value=None)

        await async_client.acreate_collection(collection_name="test_collection")

        mock_async_chromadb_client.create_collection.assert_called_once_with(
            name="test_collection",
            configuration=None,
            metadata={"hnsw:space": "cosine"},
            embedding_function=async_client.embedding_function,
            data_loader=None,
            get_or_create=False,
        )

    @pytest.mark.asyncio
    async def test_acreate_collection_with_all_params(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test acreate_collection with all optional parameters."""
        # Make the mock's create_collection an AsyncMock so it can be awaited.
        mock_async_chromadb_client.create_collection = AsyncMock(return_value=None)
        mock_config = Mock()
        mock_metadata = {"key": "value"}
        mock_embedding_func = Mock()
        mock_data_loader = Mock()

        await async_client.acreate_collection(
            collection_name="test_collection",
            configuration=mock_config,
            metadata=mock_metadata,
            embedding_function=mock_embedding_func,
            data_loader=mock_data_loader,
            get_or_create=True,
        )

        mock_async_chromadb_client.create_collection.assert_called_once_with(
            name="test_collection",
            configuration=mock_config,
            metadata=mock_metadata,
            embedding_function=mock_embedding_func,
            data_loader=mock_data_loader,
            get_or_create=True,
        )

    def test_get_or_create_collection(self, client, mock_chromadb_client) -> None:
        """Test that get_or_create_collection calls the underlying client correctly."""
        mock_collection = Mock()
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection

        result = client.get_or_create_collection(collection_name="test_collection")

        mock_chromadb_client.get_or_create_collection.assert_called_once_with(
            name="test_collection",
            configuration=None,
            metadata={"hnsw:space": "cosine"},
            embedding_function=client.embedding_function,
            data_loader=None,
        )
        assert result == mock_collection

    def test_get_or_create_collection_with_all_params(
        self, client, mock_chromadb_client
    ) -> None:
        """Test get_or_create_collection with all optional parameters."""
        mock_collection = Mock()
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        mock_config = Mock()
        mock_metadata = {"key": "value"}
        mock_embedding_func = Mock()
        mock_data_loader = Mock()

        result = client.get_or_create_collection(
            collection_name="test_collection",
            configuration=mock_config,
            metadata=mock_metadata,
            embedding_function=mock_embedding_func,
            data_loader=mock_data_loader,
        )

        mock_chromadb_client.get_or_create_collection.assert_called_once_with(
            name="test_collection",
            configuration=mock_config,
            metadata=mock_metadata,
            embedding_function=mock_embedding_func,
            data_loader=mock_data_loader,
        )
        assert result == mock_collection

    @pytest.mark.asyncio
    async def test_aget_or_create_collection(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test that aget_or_create_collection calls the underlying client correctly."""
        mock_collection = Mock()
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )

        result = await async_client.aget_or_create_collection(
            collection_name="test_collection"
        )

        mock_async_chromadb_client.get_or_create_collection.assert_called_once_with(
            name="test_collection",
            configuration=None,
            metadata={"hnsw:space": "cosine"},
            embedding_function=async_client.embedding_function,
            data_loader=None,
        )
        assert result == mock_collection

    @pytest.mark.asyncio
    async def test_aget_or_create_collection_with_all_params(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test aget_or_create_collection with all optional parameters."""
        mock_collection = Mock()
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )
        mock_config = Mock()
        mock_metadata = {"key": "value"}
        mock_embedding_func = Mock()
        mock_data_loader = Mock()

        result = await async_client.aget_or_create_collection(
            collection_name="test_collection",
            configuration=mock_config,
            metadata=mock_metadata,
            embedding_function=mock_embedding_func,
            data_loader=mock_data_loader,
        )

        mock_async_chromadb_client.get_or_create_collection.assert_called_once_with(
            name="test_collection",
            configuration=mock_config,
            metadata=mock_metadata,
            embedding_function=mock_embedding_func,
            data_loader=mock_data_loader,
        )
        assert result == mock_collection

    def test_add_documents(self, client, mock_chromadb_client) -> None:
        """Test that add_documents adds documents to collection."""
        mock_collection = Mock()
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        documents: list[BaseRecord] = [
            {
                "content": "Test document",
                "metadata": {"source": "test"},
            }
        ]

        client.add_documents(collection_name="test_collection", documents=documents)

        mock_chromadb_client.get_or_create_collection.assert_called_once_with(
            name="test_collection",
            embedding_function=client.embedding_function,
        )
        # Verify documents were added to collection. No doc_id was supplied,
        # so only the number of ids is checked, not their values.
        mock_collection.upsert.assert_called_once()
        call_args = mock_collection.upsert.call_args
        assert len(call_args.kwargs["ids"]) == 1
        assert call_args.kwargs["documents"] == ["Test document"]
        assert call_args.kwargs["metadatas"] == [{"source": "test"}]

    def test_add_documents_with_custom_ids(self, client, mock_chromadb_client) -> None:
        """Test add_documents with custom document IDs."""
        mock_collection = Mock()
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        documents: list[BaseRecord] = [
            {
                "doc_id": "custom_id_1",
                "content": "First document",
                "metadata": {"source": "test1"},
            },
            {
                "doc_id": "custom_id_2",
                "content": "Second document",
                "metadata": {"source": "test2"},
            },
        ]

        client.add_documents(collection_name="test_collection", documents=documents)

        mock_collection.upsert.assert_called_once_with(
            ids=["custom_id_1", "custom_id_2"],
            documents=["First document", "Second document"],
            metadatas=[{"source": "test1"}, {"source": "test2"}],
        )

    def test_add_documents_without_metadata(self, client, mock_chromadb_client) -> None:
        """Test add_documents with documents that have no metadata."""
        mock_collection = Mock()
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        documents: list[BaseRecord] = [
            {"content": "Document without metadata"},
            {"content": "Another document", "metadata": None},
            {"content": "Document with metadata", "metadata": {"key": "value"}},
        ]

        client.add_documents(collection_name="test_collection", documents=documents)

        # Verify upsert was called with empty dicts for missing metadata
        mock_collection.upsert.assert_called_once()
        call_args = mock_collection.upsert.call_args
        assert call_args.kwargs["metadatas"] == [{}, {}, {"key": "value"}]

    def test_add_documents_all_without_metadata(
        self, client, mock_chromadb_client
    ) -> None:
        """Test add_documents when all documents have no metadata."""
        mock_collection = Mock()
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        documents: list[BaseRecord] = [
            {"content": "Document 1"},
            {"content": "Document 2"},
            {"content": "Document 3"},
        ]

        client.add_documents(collection_name="test_collection", documents=documents)

        # When no document carries metadata, the metadatas argument is None
        # rather than a list of empty dicts.
        mock_collection.upsert.assert_called_once()
        call_args = mock_collection.upsert.call_args
        assert call_args.kwargs["metadatas"] is None

    def test_add_documents_empty_list_raises_error(
        self, client, mock_chromadb_client
    ) -> None:
        """Test that add_documents raises error for empty documents list."""
        with pytest.raises(ValueError, match="Documents list cannot be empty"):
            client.add_documents(collection_name="test_collection", documents=[])

    @pytest.mark.asyncio
    async def test_aadd_documents(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test that aadd_documents adds documents to collection asynchronously."""
        mock_collection = AsyncMock()
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )
        documents: list[BaseRecord] = [
            {
                "content": "Test document",
                "metadata": {"source": "test"},
            }
        ]

        await async_client.aadd_documents(
            collection_name="test_collection", documents=documents
        )

        mock_async_chromadb_client.get_or_create_collection.assert_called_once_with(
            name="test_collection",
            embedding_function=async_client.embedding_function,
        )
        # Verify documents were added to collection. No doc_id was supplied,
        # so only the number of ids is checked, not their values.
        mock_collection.upsert.assert_called_once()
        call_args = mock_collection.upsert.call_args
        assert len(call_args.kwargs["ids"]) == 1
        assert call_args.kwargs["documents"] == ["Test document"]
        assert call_args.kwargs["metadatas"] == [{"source": "test"}]

    @pytest.mark.asyncio
    async def test_aadd_documents_with_custom_ids(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test aadd_documents with custom document IDs."""
        mock_collection = AsyncMock()
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )
        documents: list[BaseRecord] = [
            {
                "doc_id": "custom_id_1",
                "content": "First document",
                "metadata": {"source": "test1"},
            },
            {
                "doc_id": "custom_id_2",
                "content": "Second document",
                "metadata": {"source": "test2"},
            },
        ]

        await async_client.aadd_documents(
            collection_name="test_collection", documents=documents
        )

        mock_collection.upsert.assert_called_once_with(
            ids=["custom_id_1", "custom_id_2"],
            documents=["First document", "Second document"],
            metadatas=[{"source": "test1"}, {"source": "test2"}],
        )

    @pytest.mark.asyncio
    async def test_aadd_documents_without_metadata(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test aadd_documents with documents that have no metadata."""
        mock_collection = AsyncMock()
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )
        documents: list[BaseRecord] = [
            {"content": "Document without metadata"},
            {"content": "Another document", "metadata": None},
            {"content": "Document with metadata", "metadata": {"key": "value"}},
        ]

        await async_client.aadd_documents(
            collection_name="test_collection", documents=documents
        )

        # Verify upsert was called with empty dicts for missing metadata
        mock_collection.upsert.assert_called_once()
        call_args = mock_collection.upsert.call_args
        assert call_args.kwargs["metadatas"] == [{}, {}, {"key": "value"}]

    @pytest.mark.asyncio
    async def test_aadd_documents_empty_list_raises_error(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test that aadd_documents raises error for empty documents list."""
        with pytest.raises(ValueError, match="Documents list cannot be empty"):
            await async_client.aadd_documents(
                collection_name="test_collection", documents=[]
            )

    def test_search(self, client, mock_chromadb_client) -> None:
        """Test that search queries the collection correctly."""
        mock_collection = Mock()
        mock_collection.metadata = {"hnsw:space": "cosine"}
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        mock_collection.query.return_value = {
            "ids": [["doc1", "doc2"]],
            "documents": [["Document 1", "Document 2"]],
            "metadatas": [[{"source": "test1"}, {"source": "test2"}]],
            "distances": [[0.1, 0.3]],
        }

        results = client.search(collection_name="test_collection", query="test query")

        mock_chromadb_client.get_or_create_collection.assert_called_once_with(
            name="test_collection",
            embedding_function=client.embedding_function,
        )
        mock_collection.query.assert_called_once_with(
            query_texts=["test query"],
            n_results=5,
            where=None,
            where_document=None,
            include=["metadatas", "documents", "distances"],
        )
        assert len(results) == 2
        assert results[0]["id"] == "doc1"
        assert results[0]["content"] == "Document 1"
        assert results[0]["metadata"] == {"source": "test1"}
        # The score is a float derived from the mocked distance, so compare
        # with a tolerance instead of relying on exact float equality.
        assert results[0]["score"] == pytest.approx(0.95)

    def test_search_with_optional_params(self, client, mock_chromadb_client) -> None:
        """Test search with optional parameters."""
        mock_collection = Mock()
        mock_collection.metadata = {"hnsw:space": "cosine"}
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        mock_collection.query.return_value = {
            "ids": [["doc1", "doc2", "doc3"]],
            "documents": [["Document 1", "Document 2", "Document 3"]],
            "metadatas": [
                [{"source": "test1"}, {"source": "test2"}, {"source": "test3"}]
            ],
            "distances": [[0.1, 0.3, 1.5]],  # Last one will be filtered by threshold
        }

        results = client.search(
            collection_name="test_collection",
            query="test query",
            limit=5,
            metadata_filter={"source": "test"},
            score_threshold=0.7,
        )

        mock_collection.query.assert_called_once_with(
            query_texts=["test query"],
            n_results=5,
            where={"source": "test"},
            where_document=None,
            include=["metadatas", "documents", "distances"],
        )
        # Only 2 results should pass the score threshold
        assert len(results) == 2

    @pytest.mark.asyncio
    async def test_asearch(self, async_client, mock_async_chromadb_client) -> None:
        """Test that asearch queries the collection correctly."""
        mock_collection = AsyncMock()
        mock_collection.metadata = {"hnsw:space": "cosine"}
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )
        mock_collection.query = AsyncMock(
            return_value={
                "ids": [["doc1", "doc2"]],
                "documents": [["Document 1", "Document 2"]],
                "metadatas": [[{"source": "test1"}, {"source": "test2"}]],
                "distances": [[0.1, 0.3]],
            }
        )

        results = await async_client.asearch(
            collection_name="test_collection", query="test query"
        )

        mock_async_chromadb_client.get_or_create_collection.assert_called_once_with(
            name="test_collection",
            embedding_function=async_client.embedding_function,
        )
        mock_collection.query.assert_called_once_with(
            query_texts=["test query"],
            n_results=5,
            where=None,
            where_document=None,
            include=["metadatas", "documents", "distances"],
        )
        assert len(results) == 2
        assert results[0]["id"] == "doc1"
        assert results[0]["content"] == "Document 1"
        assert results[0]["metadata"] == {"source": "test1"}
        # The score is a float derived from the mocked distance, so compare
        # with a tolerance instead of relying on exact float equality.
        assert results[0]["score"] == pytest.approx(0.95)

    @pytest.mark.asyncio
    async def test_asearch_with_optional_params(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test asearch with optional parameters."""
        mock_collection = AsyncMock()
        mock_collection.metadata = {"hnsw:space": "cosine"}
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )
        mock_collection.query = AsyncMock(
            return_value={
                "ids": [["doc1", "doc2", "doc3"]],
                "documents": [["Document 1", "Document 2", "Document 3"]],
                "metadatas": [
                    [{"source": "test1"}, {"source": "test2"}, {"source": "test3"}]
                ],
                "distances": [
                    [0.1, 0.3, 1.5]
                ],  # Last one will be filtered by threshold
            }
        )

        results = await async_client.asearch(
            collection_name="test_collection",
            query="test query",
            limit=5,
            metadata_filter={"source": "test"},
            score_threshold=0.7,
        )

        mock_collection.query.assert_called_once_with(
            query_texts=["test query"],
            n_results=5,
            where={"source": "test"},
            where_document=None,
            include=["metadatas", "documents", "distances"],
        )
        # Only 2 results should pass the score threshold
        assert len(results) == 2

    def test_delete_collection(self, client, mock_chromadb_client) -> None:
        """Test that delete_collection calls the underlying client correctly."""
        client.delete_collection(collection_name="test_collection")

        mock_chromadb_client.delete_collection.assert_called_once_with(
            name="test_collection"
        )

    @pytest.mark.asyncio
    async def test_adelete_collection(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test that adelete_collection calls the underlying client correctly."""
        mock_async_chromadb_client.delete_collection = AsyncMock(return_value=None)

        await async_client.adelete_collection(collection_name="test_collection")

        mock_async_chromadb_client.delete_collection.assert_called_once_with(
            name="test_collection"
        )

    def test_reset(self, client, mock_chromadb_client) -> None:
        """Test that reset calls the underlying client correctly."""
        mock_chromadb_client.reset.return_value = True

        client.reset()

        mock_chromadb_client.reset.assert_called_once_with()

    @pytest.mark.asyncio
    async def test_areset(self, async_client, mock_async_chromadb_client) -> None:
        """Test that areset calls the underlying client correctly."""
        mock_async_chromadb_client.reset = AsyncMock(return_value=True)

        await async_client.areset()

        mock_async_chromadb_client.reset.assert_called_once_with()

    def test_add_documents_with_batch_size(
        self, client_with_batch_size, mock_chromadb_client
    ) -> None:
        """Test add_documents with batch size splits documents into batches."""
        mock_collection = Mock()
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        documents: list[BaseRecord] = [
            {"doc_id": "id1", "content": "Document 1", "metadata": {"source": "test1"}},
            {"doc_id": "id2", "content": "Document 2", "metadata": {"source": "test2"}},
            {"doc_id": "id3", "content": "Document 3", "metadata": {"source": "test3"}},
            {"doc_id": "id4", "content": "Document 4", "metadata": {"source": "test4"}},
            {"doc_id": "id5", "content": "Document 5", "metadata": {"source": "test5"}},
        ]

        client_with_batch_size.add_documents(
            collection_name="test_collection", documents=documents
        )

        # Five documents with a default batch size of 2 -> batches of 2, 2, 1.
        assert mock_collection.upsert.call_count == 3

        first_call = mock_collection.upsert.call_args_list[0]
        assert first_call.kwargs["ids"] == ["id1", "id2"]
        assert first_call.kwargs["documents"] == ["Document 1", "Document 2"]
        assert first_call.kwargs["metadatas"] == [
            {"source": "test1"},
            {"source": "test2"},
        ]

        second_call = mock_collection.upsert.call_args_list[1]
        assert second_call.kwargs["ids"] == ["id3", "id4"]
        assert second_call.kwargs["documents"] == ["Document 3", "Document 4"]
        assert second_call.kwargs["metadatas"] == [
            {"source": "test3"},
            {"source": "test4"},
        ]

        third_call = mock_collection.upsert.call_args_list[2]
        assert third_call.kwargs["ids"] == ["id5"]
        assert third_call.kwargs["documents"] == ["Document 5"]
        assert third_call.kwargs["metadatas"] == [{"source": "test5"}]

    def test_add_documents_with_explicit_batch_size(
        self, client, mock_chromadb_client
    ) -> None:
        """Test add_documents with explicitly provided batch size."""
        mock_collection = Mock()
        mock_chromadb_client.get_or_create_collection.return_value = mock_collection
        documents: list[BaseRecord] = [
            {"doc_id": "id1", "content": "Document 1"},
            {"doc_id": "id2", "content": "Document 2"},
            {"doc_id": "id3", "content": "Document 3"},
        ]

        client.add_documents(
            collection_name="test_collection", documents=documents, batch_size=1
        )

        # batch_size=1 means one upsert per document, in input order.
        assert mock_collection.upsert.call_count == 3
        for i, call in enumerate(mock_collection.upsert.call_args_list):
            assert len(call.kwargs["ids"]) == 1
            assert call.kwargs["ids"] == [f"id{i + 1}"]

    @pytest.mark.asyncio
    async def test_aadd_documents_with_batch_size(
        self, async_client_with_batch_size, mock_async_chromadb_client
    ) -> None:
        """Test aadd_documents with batch size splits documents into batches."""
        mock_collection = AsyncMock()
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )
        documents: list[BaseRecord] = [
            {"doc_id": "id1", "content": "Document 1", "metadata": {"source": "test1"}},
            {"doc_id": "id2", "content": "Document 2", "metadata": {"source": "test2"}},
            {"doc_id": "id3", "content": "Document 3", "metadata": {"source": "test3"}},
        ]

        await async_client_with_batch_size.aadd_documents(
            collection_name="test_collection", documents=documents
        )

        # Three documents with a default batch size of 2 -> batches of 2, 1.
        assert mock_collection.upsert.call_count == 2

        first_call = mock_collection.upsert.call_args_list[0]
        assert first_call.kwargs["ids"] == ["id1", "id2"]
        assert first_call.kwargs["documents"] == ["Document 1", "Document 2"]

        second_call = mock_collection.upsert.call_args_list[1]
        assert second_call.kwargs["ids"] == ["id3"]
        assert second_call.kwargs["documents"] == ["Document 3"]

    @pytest.mark.asyncio
    async def test_aadd_documents_with_explicit_batch_size(
        self, async_client, mock_async_chromadb_client
    ) -> None:
        """Test aadd_documents with explicitly provided batch size."""
        mock_collection = AsyncMock()
        mock_async_chromadb_client.get_or_create_collection = AsyncMock(
            return_value=mock_collection
        )
        documents: list[BaseRecord] = [
            {"doc_id": "id1", "content": "Document 1"},
            {"doc_id": "id2", "content": "Document 2"},
            {"doc_id": "id3", "content": "Document 3"},
            {"doc_id": "id4", "content": "Document 4"},
        ]

        await async_client.aadd_documents(
            collection_name="test_collection", documents=documents, batch_size=3
        )

        # Four documents with batch_size=3 -> batches of 3, 1.
        assert mock_collection.upsert.call_count == 2

        first_call = mock_collection.upsert.call_args_list[0]
        assert len(first_call.kwargs["ids"]) == 3

        second_call = mock_collection.upsert.call_args_list[1]
        assert len(second_call.kwargs["ids"]) == 1

    def test_client_default_batch_size_initialization(self) -> None:
        """Test that client initializes with correct default batch size."""
        mock_client = Mock()
        mock_embedding = Mock()

        client = ChromaDBClient(client=mock_client, embedding_function=mock_embedding)
        assert client.default_batch_size == 100

        custom_client = ChromaDBClient(
            client=mock_client, embedding_function=mock_embedding, default_batch_size=50
        )
        assert custom_client.default_batch_size == 50

View File

@@ -1,302 +0,0 @@
"""Tests for ChromaDB utility functions."""
from crewai.rag.chromadb.types import PreparedDocuments
from crewai.rag.chromadb.utils import (
MAX_COLLECTION_LENGTH,
MIN_COLLECTION_LENGTH,
_create_batch_slice,
_is_ipv4_pattern,
_prepare_documents_for_chromadb,
_sanitize_collection_name,
)
from crewai.rag.types import BaseRecord
class TestChromaDBUtils:
    """Checks for the ChromaDB collection-name helper functions."""

    def test_sanitize_collection_name_long_name(self) -> None:
        """A name far beyond the length limit is sanitized to a valid one."""
        cleaned = _sanitize_collection_name(
            "This is an extremely long role name that will definitely exceed the ChromaDB collection name limit of 63 characters and cause an error when used as a collection name"
        )

        assert len(cleaned) <= MAX_COLLECTION_LENGTH
        assert cleaned[0].isalnum()
        assert cleaned[-1].isalnum()
        assert all(ch.isalnum() or ch in ("_", "-") for ch in cleaned)

    def test_sanitize_collection_name_special_chars(self) -> None:
        """Special characters do not survive sanitization."""
        cleaned = _sanitize_collection_name("Agent@123!#$%^&*()")

        assert cleaned[0].isalnum()
        assert cleaned[-1].isalnum()
        assert all(ch.isalnum() or ch in ("_", "-") for ch in cleaned)

    def test_sanitize_collection_name_short_name(self) -> None:
        """A one-character name is brought up to the minimum length."""
        cleaned = _sanitize_collection_name("A")

        assert len(cleaned) >= MIN_COLLECTION_LENGTH
        assert cleaned[0].isalnum()
        assert cleaned[-1].isalnum()

    def test_sanitize_collection_name_bad_ends(self) -> None:
        """A name starting and ending with underscores gets alphanumeric ends."""
        cleaned = _sanitize_collection_name("_Agent_")

        assert cleaned[0].isalnum()
        assert cleaned[-1].isalnum()

    def test_sanitize_collection_name_none(self) -> None:
        """``None`` maps to the default collection name."""
        assert _sanitize_collection_name(None) == "default_collection"

    def test_sanitize_collection_name_ipv4_pattern(self) -> None:
        """An IPv4-looking name is prefixed with ``ip_`` and stays valid."""
        cleaned = _sanitize_collection_name("192.168.1.1")

        assert cleaned.startswith("ip_")
        assert cleaned[0].isalnum()
        assert cleaned[-1].isalnum()
        assert all(ch.isalnum() or ch in ("_", "-") for ch in cleaned)

    def test_is_ipv4_pattern(self) -> None:
        """The IPv4 detector accepts a dotted quad and rejects dotted words."""
        dotted_quad_verdict = _is_ipv4_pattern("192.168.1.1")
        assert dotted_quad_verdict is True

        dotted_words_verdict = _is_ipv4_pattern("not.an.ip.address")
        assert dotted_words_verdict is False

    def test_sanitize_collection_name_properties(self) -> None:
        """Length bounds and alphanumeric ends hold for a range of awkward names."""
        awkward_names: tuple[str, ...] = (
            "A" * 100,  # Very long name
            "_start_with_underscore",
            "end_with_underscore_",
            "contains@special#characters",
            "192.168.1.1",  # IPv4 address
            "a" * 2,  # Too short
        )

        for raw_name in awkward_names:
            cleaned = _sanitize_collection_name(raw_name)
            assert len(cleaned) >= MIN_COLLECTION_LENGTH
            assert len(cleaned) <= MAX_COLLECTION_LENGTH
            assert cleaned[0].isalnum()
            assert cleaned[-1].isalnum()

    def test_sanitize_collection_name_empty_string(self) -> None:
        """An empty string maps to the default collection name."""
        assert _sanitize_collection_name("") == "default_collection"

    def test_sanitize_collection_name_whitespace_only(self) -> None:
        """A whitespace-only name is turned into a padded, valid name."""
        # NOTE(review): the input literal is reproduced exactly as shown in
        # this view; confirm its whitespace matches the repository copy.
        cleaned = _sanitize_collection_name(" ")

        # Spaces become underscores, padded to meet requirements
        assert cleaned == "a__z"
        assert len(cleaned) >= MIN_COLLECTION_LENGTH
        assert cleaned[0].isalnum()
        assert cleaned[-1].isalnum()
class TestPrepareDocumentsForChromaDB:
    """Checks for how _prepare_documents_for_chromadb builds ids, texts and metadatas."""

    def test_prepare_documents_with_doc_ids(self) -> None:
        """Supplied doc_ids, contents and metadata are carried over in order."""
        first_record: BaseRecord = {
            "doc_id": "id1",
            "content": "First document",
            "metadata": {"source": "test1"},
        }
        second_record: BaseRecord = {
            "doc_id": "id2",
            "content": "Second document",
            "metadata": {"source": "test2"},
        }

        prepared = _prepare_documents_for_chromadb([first_record, second_record])

        assert prepared.ids == ["id1", "id2"]
        assert prepared.texts == ["First document", "Second document"]
        assert prepared.metadatas == [{"source": "test1"}, {"source": "test2"}]

    def test_prepare_documents_generate_ids(self) -> None:
        """Records lacking doc_id get a generated 64-character id each."""
        records: list[BaseRecord] = [
            {"content": "Test content", "metadata": {"key": "value"}},
            {"content": "Another test"},
        ]

        prepared = _prepare_documents_for_chromadb(records)

        assert len(prepared.ids) == 2
        # Every generated id is expected to be 64 characters long.
        assert all(len(generated) == 64 for generated in prepared.ids)
        assert prepared.texts == ["Test content", "Another test"]
        assert prepared.metadatas == [{"key": "value"}, {}]

    def test_prepare_documents_with_list_metadata(self) -> None:
        """List metadata collapses to its first item, or to {} when empty."""
        records: list[BaseRecord] = [
            {"content": "Test", "metadata": [{"first": "item"}, {"second": "item"}]},
            {"content": "Test2", "metadata": []},
        ]

        prepared = _prepare_documents_for_chromadb(records)

        assert prepared.metadatas == [{"first": "item"}, {}]

    def test_prepare_documents_no_metadata(self) -> None:
        """Absent and ``None`` metadata both come out as empty dicts."""
        records: list[BaseRecord] = [
            {"content": "Document 1"},
            {"content": "Document 2", "metadata": None},
        ]

        prepared = _prepare_documents_for_chromadb(records)

        assert prepared.metadatas == [{}, {}]

    def test_prepare_documents_hash_consistency(self) -> None:
        """Two equal records, prepared separately, receive the same id."""
        first_batch: list[BaseRecord] = [
            {"content": "Same content", "metadata": {"key": "value"}}
        ]
        second_batch: list[BaseRecord] = [
            {"content": "Same content", "metadata": {"key": "value"}}
        ]

        first_prepared = _prepare_documents_for_chromadb(first_batch)
        second_prepared = _prepare_documents_for_chromadb(second_batch)

        assert first_prepared.ids == second_prepared.ids
class TestCreateBatchSlice:
    """Checks for the (ids, texts, metadatas) slices produced by _create_batch_slice."""

    def test_create_batch_slice_normal(self) -> None:
        """A window in the middle of the data yields the matching items."""
        source = PreparedDocuments(
            ids=["id1", "id2", "id3", "id4", "id5"],
            texts=["doc1", "doc2", "doc3", "doc4", "doc5"],
            metadatas=[{"a": 1}, {"b": 2}, {"c": 3}, {"d": 4}, {"e": 5}],
        )

        ids, texts, metadatas = _create_batch_slice(
            source, start_index=1, batch_size=3
        )

        assert ids == ["id2", "id3", "id4"]
        assert texts == ["doc2", "doc3", "doc4"]
        assert metadatas == [{"b": 2}, {"c": 3}, {"d": 4}]

    def test_create_batch_slice_at_end(self) -> None:
        """A window overrunning the data is truncated to what remains."""
        source = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[{"a": 1}, {"b": 2}, {"c": 3}],
        )

        ids, texts, metadatas = _create_batch_slice(
            source, start_index=2, batch_size=5
        )

        assert ids == ["id3"]
        assert texts == ["doc3"]
        assert metadatas == [{"c": 3}]

    def test_create_batch_slice_empty_batch(self) -> None:
        """A start index past the data yields three empty lists."""
        source = PreparedDocuments(
            ids=["id1", "id2"],
            texts=["doc1", "doc2"],
            metadatas=[{"a": 1}, {"b": 2}],
        )

        ids, texts, metadatas = _create_batch_slice(
            source, start_index=5, batch_size=3
        )

        assert ids == []
        assert texts == []
        assert metadatas == []

    def test_create_batch_slice_no_metadatas(self) -> None:
        """An empty metadatas list on the input yields ``None`` metadatas."""
        source = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[],
        )

        ids, texts, metadatas = _create_batch_slice(
            source, start_index=0, batch_size=2
        )

        assert ids == ["id1", "id2"]
        assert texts == ["doc1", "doc2"]
        assert metadatas is None

    def test_create_batch_slice_all_empty_metadatas(self) -> None:
        """A batch whose metadatas are all empty dicts yields ``None`` metadatas."""
        source = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[{}, {}, {}],
        )

        ids, texts, metadatas = _create_batch_slice(
            source, start_index=0, batch_size=3
        )

        assert ids == ["id1", "id2", "id3"]
        assert texts == ["doc1", "doc2", "doc3"]
        assert metadatas is None

    def test_create_batch_slice_some_empty_metadatas(self) -> None:
        """A batch mixing empty and non-empty metadatas keeps them all as given."""
        source = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[{"a": 1}, {}, {"c": 3}],
        )

        ids, texts, metadatas = _create_batch_slice(
            source, start_index=0, batch_size=3
        )

        assert ids == ["id1", "id2", "id3"]
        assert texts == ["doc1", "doc2", "doc3"]
        assert metadatas == [{"a": 1}, {}, {"c": 3}]

    def test_create_batch_slice_zero_start_index(self) -> None:
        """A window starting at index 0 yields the leading items."""
        source = PreparedDocuments(
            ids=["id1", "id2", "id3", "id4"],
            texts=["doc1", "doc2", "doc3", "doc4"],
            metadatas=[{"a": 1}, {"b": 2}, {"c": 3}, {"d": 4}],
        )

        ids, texts, metadatas = _create_batch_slice(
            source, start_index=0, batch_size=2
        )

        assert ids == ["id1", "id2"]
        assert texts == ["doc1", "doc2"]
        assert metadatas == [{"a": 1}, {"b": 2}]

    def test_create_batch_slice_single_item(self) -> None:
        """A batch size of 1 yields exactly the item at the start index."""
        source = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[{"a": 1}, {"b": 2}, {"c": 3}],
        )

        ids, texts, metadatas = _create_batch_slice(
            source, start_index=1, batch_size=1
        )

        assert ids == ["id2"]
        assert texts == ["doc2"]
        assert metadatas == [{"b": 2}]