mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 08:38:30 +00:00
- add batch_size field to baseragconfig (default=100) - update chromadb/qdrant clients and factories to use batch_size - extract and filter batch_size from embedder config in knowledgestorage - fix large csv files exceeding embedder token limits (#3574) - remove unneeded conditional for type Co-authored-by: Vini Brasil <vini@hey.com>
303 lines
11 KiB
Python
303 lines
11 KiB
Python
"""Tests for ChromaDB utility functions."""
|
|
|
|
from crewai.rag.chromadb.types import PreparedDocuments
|
|
from crewai.rag.chromadb.utils import (
|
|
MAX_COLLECTION_LENGTH,
|
|
MIN_COLLECTION_LENGTH,
|
|
_create_batch_slice,
|
|
_is_ipv4_pattern,
|
|
_prepare_documents_for_chromadb,
|
|
_sanitize_collection_name,
|
|
)
|
|
from crewai.rag.types import BaseRecord
|
|
|
|
|
|
class TestChromaDBUtils:
    """Test suite for ChromaDB utility functions.

    Exercises ``_sanitize_collection_name`` and ``_is_ipv4_pattern``. The
    sanitizing tests assert that results respect ``MIN_COLLECTION_LENGTH`` and
    ``MAX_COLLECTION_LENGTH``, start and end with an alphanumeric character,
    and contain only alphanumerics, underscores, or hyphens.
    """

    def test_sanitize_collection_name_long_name(self) -> None:
        """Test sanitizing a very long collection name."""
        long_name = (
            "This is an extremely long role name that will definitely exceed "
            "the ChromaDB collection name limit of 63 characters and cause an "
            "error when used as a collection name"
        )
        sanitized = _sanitize_collection_name(long_name)
        assert len(sanitized) <= MAX_COLLECTION_LENGTH
        assert sanitized[0].isalnum()
        assert sanitized[-1].isalnum()
        assert all(c.isalnum() or c in "_-" for c in sanitized)

    def test_sanitize_collection_name_special_chars(self) -> None:
        """Test sanitizing a name with special characters."""
        special_chars = "Agent@123!#$%^&*()"
        sanitized = _sanitize_collection_name(special_chars)
        assert sanitized[0].isalnum()
        assert sanitized[-1].isalnum()
        assert all(c.isalnum() or c in "_-" for c in sanitized)

    def test_sanitize_collection_name_short_name(self) -> None:
        """Test sanitizing a very short name."""
        short_name = "A"
        sanitized = _sanitize_collection_name(short_name)
        assert len(sanitized) >= MIN_COLLECTION_LENGTH
        assert sanitized[0].isalnum()
        assert sanitized[-1].isalnum()

    def test_sanitize_collection_name_bad_ends(self) -> None:
        """Test sanitizing a name with non-alphanumeric start/end."""
        bad_ends = "_Agent_"
        sanitized = _sanitize_collection_name(bad_ends)
        assert sanitized[0].isalnum()
        assert sanitized[-1].isalnum()

    def test_sanitize_collection_name_none(self) -> None:
        """Test sanitizing a None value."""
        sanitized = _sanitize_collection_name(None)
        assert sanitized == "default_collection"

    def test_sanitize_collection_name_ipv4_pattern(self) -> None:
        """Test sanitizing an IPv4 address."""
        ipv4 = "192.168.1.1"
        sanitized = _sanitize_collection_name(ipv4)
        assert sanitized.startswith("ip_")
        assert sanitized[0].isalnum()
        assert sanitized[-1].isalnum()
        assert all(c.isalnum() or c in "_-" for c in sanitized)

    def test_is_ipv4_pattern(self) -> None:
        """Test IPv4 pattern detection."""
        assert _is_ipv4_pattern("192.168.1.1") is True
        assert _is_ipv4_pattern("not.an.ip.address") is False

    def test_sanitize_collection_name_properties(self) -> None:
        """Test that sanitized collection names always meet ChromaDB requirements."""
        test_cases: list[str] = [
            "A" * 100,  # Very long name
            "_start_with_underscore",
            "end_with_underscore_",
            "contains@special#characters",
            "192.168.1.1",  # IPv4 address
            "a" * 2,  # Too short
        ]
        for test_case in test_cases:
            sanitized = _sanitize_collection_name(test_case)
            assert MIN_COLLECTION_LENGTH <= len(sanitized) <= MAX_COLLECTION_LENGTH
            assert sanitized[0].isalnum()
            assert sanitized[-1].isalnum()

    def test_sanitize_collection_name_empty_string(self) -> None:
        """Test sanitizing an empty string."""
        sanitized = _sanitize_collection_name("")
        assert sanitized == "default_collection"

    def test_sanitize_collection_name_whitespace_only(self) -> None:
        """Test sanitizing a string with only whitespace."""
        # The input is three spaces. The expected "a__z" keeps two underscores
        # between its alphanumeric ends, which a one-character input could not
        # produce.
        # NOTE(review): assumes each space maps to exactly one underscore --
        # confirm against _sanitize_collection_name.
        sanitized = _sanitize_collection_name("   ")
        # Spaces become underscores, padded to meet requirements
        assert sanitized == "a__z"
        assert len(sanitized) >= MIN_COLLECTION_LENGTH
        assert sanitized[0].isalnum()
        assert sanitized[-1].isalnum()
|
|
|
|
|
|
class TestPrepareDocumentsForChromaDB:
    """Checks the ids, texts, and metadatas built by _prepare_documents_for_chromadb."""

    def test_prepare_documents_with_doc_ids(self) -> None:
        """Records that already carry a doc_id are expected to keep it."""
        first_record: BaseRecord = {
            "doc_id": "id1",
            "content": "First document",
            "metadata": {"source": "test1"},
        }
        second_record: BaseRecord = {
            "doc_id": "id2",
            "content": "Second document",
            "metadata": {"source": "test2"},
        }
        records: list[BaseRecord] = [first_record, second_record]

        prepared = _prepare_documents_for_chromadb(records)

        assert prepared.ids == ["id1", "id2"]
        assert prepared.texts == ["First document", "Second document"]
        assert prepared.metadatas == [{"source": "test1"}, {"source": "test2"}]

    def test_prepare_documents_generate_ids(self) -> None:
        """Records without a doc_id are expected to receive 64-character ids."""
        records: list[BaseRecord] = [
            {"content": "Test content", "metadata": {"key": "value"}},
            {"content": "Another test"},
        ]

        prepared = _prepare_documents_for_chromadb(records)

        assert len(prepared.ids) == 2
        # Every generated id must be exactly 64 characters long.
        for generated_id in prepared.ids:
            assert len(generated_id) == 64
        assert prepared.texts == ["Test content", "Another test"]
        assert prepared.metadatas == [{"key": "value"}, {}]

    def test_prepare_documents_with_list_metadata(self) -> None:
        """List metadata is expected to collapse to its first item, or {} when empty."""
        records: list[BaseRecord] = [
            {"content": "Test", "metadata": [{"first": "item"}, {"second": "item"}]},
            {"content": "Test2", "metadata": []},
        ]

        prepared = _prepare_documents_for_chromadb(records)

        assert prepared.metadatas == [{"first": "item"}, {}]

    def test_prepare_documents_no_metadata(self) -> None:
        """Missing or None metadata is expected to become an empty dict."""
        records: list[BaseRecord] = [
            {"content": "Document 1"},
            {"content": "Document 2", "metadata": None},
        ]

        prepared = _prepare_documents_for_chromadb(records)

        assert prepared.metadatas == [{}, {}]

    def test_prepare_documents_hash_consistency(self) -> None:
        """Two equal but separately built inputs are expected to yield equal ids."""
        first_batch: list[BaseRecord] = [
            {"content": "Same content", "metadata": {"key": "value"}}
        ]
        second_batch: list[BaseRecord] = [
            {"content": "Same content", "metadata": {"key": "value"}}
        ]

        first_ids = _prepare_documents_for_chromadb(first_batch).ids
        second_ids = _prepare_documents_for_chromadb(second_batch).ids

        assert first_ids == second_ids
|
|
|
|
|
|
class TestCreateBatchSlice:
    """Checks the (ids, texts, metadatas) triple produced by _create_batch_slice."""

    def test_create_batch_slice_normal(self) -> None:
        """A window from the middle of the data yields the matching items."""
        docs = PreparedDocuments(
            ids=["id1", "id2", "id3", "id4", "id5"],
            texts=["doc1", "doc2", "doc3", "doc4", "doc5"],
            metadatas=[{"a": 1}, {"b": 2}, {"c": 3}, {"d": 4}, {"e": 5}],
        )

        ids, texts, metadatas = _create_batch_slice(docs, start_index=1, batch_size=3)

        assert ids == ["id2", "id3", "id4"]
        assert texts == ["doc2", "doc3", "doc4"]
        assert metadatas == [{"b": 2}, {"c": 3}, {"d": 4}]

    def test_create_batch_slice_at_end(self) -> None:
        """A window running past the end yields only the remaining items."""
        docs = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[{"a": 1}, {"b": 2}, {"c": 3}],
        )

        ids, texts, metadatas = _create_batch_slice(docs, start_index=2, batch_size=5)

        assert ids == ["id3"]
        assert texts == ["doc3"]
        assert metadatas == [{"c": 3}]

    def test_create_batch_slice_empty_batch(self) -> None:
        """A start index beyond the data yields three empty lists."""
        docs = PreparedDocuments(
            ids=["id1", "id2"],
            texts=["doc1", "doc2"],
            metadatas=[{"a": 1}, {"b": 2}],
        )

        ids, texts, metadatas = _create_batch_slice(docs, start_index=5, batch_size=3)

        assert ids == []
        assert texts == []
        assert metadatas == []

    def test_create_batch_slice_no_metadatas(self) -> None:
        """An empty metadatas list is expected to come back as None."""
        docs = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[],
        )

        ids, texts, metadatas = _create_batch_slice(docs, start_index=0, batch_size=2)

        assert ids == ["id1", "id2"]
        assert texts == ["doc1", "doc2"]
        assert metadatas is None

    def test_create_batch_slice_all_empty_metadatas(self) -> None:
        """Metadatas made up solely of empty dicts are expected to come back as None."""
        docs = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[{}, {}, {}],
        )

        ids, texts, metadatas = _create_batch_slice(docs, start_index=0, batch_size=3)

        assert ids == ["id1", "id2", "id3"]
        assert texts == ["doc1", "doc2", "doc3"]
        assert metadatas is None

    def test_create_batch_slice_some_empty_metadatas(self) -> None:
        """A mix of empty and non-empty metadatas is expected to be returned unchanged."""
        docs = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[{"a": 1}, {}, {"c": 3}],
        )

        ids, texts, metadatas = _create_batch_slice(docs, start_index=0, batch_size=3)

        assert ids == ["id1", "id2", "id3"]
        assert texts == ["doc1", "doc2", "doc3"]
        assert metadatas == [{"a": 1}, {}, {"c": 3}]

    def test_create_batch_slice_zero_start_index(self) -> None:
        """A window starting at index 0 yields the leading items."""
        docs = PreparedDocuments(
            ids=["id1", "id2", "id3", "id4"],
            texts=["doc1", "doc2", "doc3", "doc4"],
            metadatas=[{"a": 1}, {"b": 2}, {"c": 3}, {"d": 4}],
        )

        ids, texts, metadatas = _create_batch_slice(docs, start_index=0, batch_size=2)

        assert ids == ["id1", "id2"]
        assert texts == ["doc1", "doc2"]
        assert metadatas == [{"a": 1}, {"b": 2}]

    def test_create_batch_slice_single_item(self) -> None:
        """A batch size of 1 yields exactly the item at the start index."""
        docs = PreparedDocuments(
            ids=["id1", "id2", "id3"],
            texts=["doc1", "doc2", "doc3"],
            metadatas=[{"a": 1}, {"b": 2}, {"c": 3}],
        )

        ids, texts, metadatas = _create_batch_slice(docs, start_index=1, batch_size=1)

        assert ids == ["id2"]
        assert texts == ["doc2"]
        assert metadatas == [{"b": 2}]
|