Compare commits

...

3 Commits

Author SHA1 Message Date
Devin AI
ac42992e22 Add additional tests and fix RAGStorage.search() comparison
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-03-10 17:16:49 +00:00
Devin AI
464ca30e46 Fix linting issues in test file
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-03-10 17:12:27 +00:00
Devin AI
a8022f31d3 Fix issue #2315: Correct ChromaDB distance comparison in KnowledgeStorage.search()
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-03-10 17:10:26 +00:00
3 changed files with 121 additions and 2 deletions

View File

@@ -76,7 +76,7 @@ class KnowledgeStorage(BaseKnowledgeStorage):
"context": fetched["documents"][0][i], # type: ignore
"score": fetched["distances"][0][i], # type: ignore
}
-if result["score"] >= score_threshold:
+if result["score"] < score_threshold: # Lower distance values indicate higher similarity in ChromaDB
results.append(result)
return results
else:

View File

@@ -130,7 +130,7 @@ class RAGStorage(BaseRAGStorage):
"context": response["documents"][0][i],
"score": response["distances"][0][i],
}
-if result["score"] >= score_threshold:
+if result["score"] < score_threshold: # Lower distance values indicate higher similarity in ChromaDB
results.append(result)
return results

View File

@@ -0,0 +1,119 @@
from unittest.mock import MagicMock, patch
import pytest
from crewai import Agent, Crew, Process, Task
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
def test_knowledge_storage_search_filtering():
    """Check that KnowledgeStorage.search() keeps only hits below the distance threshold.

    A mocked collection returns five documents with distances 0.1 through 0.5.
    With ``score_threshold=0.35`` only the three closest documents are expected
    back, in the order the collection returned them.
    """
    # Stand-in for the ChromaDB collection; distances ascend, lower means closer.
    fake_collection = MagicMock()
    fake_collection.query.return_value = {
        "ids": [["1", "2", "3", "4", "5"]],
        "metadatas": [[{}, {}, {}, {}, {}]],
        "documents": [["Doc1", "Doc2", "Doc3", "Doc4", "Doc5"]],
        "distances": [[0.1, 0.2, 0.3, 0.4, 0.5]],
    }

    # Swap the mock in for the storage's collection.
    knowledge_storage = KnowledgeStorage()
    knowledge_storage.collection = fake_collection

    hits = knowledge_storage.search(["test query"], score_threshold=0.35)

    # Only the hits whose distance is strictly below the threshold survive.
    assert len(hits) == 3
    for position, expected_doc in enumerate(("Doc1", "Doc2", "Doc3")):
        assert hits[position]["context"] == expected_doc

    # Hits at or beyond the threshold must have been dropped.
    returned_contexts = [hit["context"] for hit in hits]
    for rejected_doc in ("Doc4", "Doc5"):
        assert rejected_doc not in returned_contexts
def test_string_knowledge_source_integration():
    """Check that StringKnowledgeSource.add() saves through its storage object.

    KnowledgeStorage is patched so no real embedding work happens; the source is
    handed the mocked storage instance and ``add()`` is expected to call
    ``save`` on it.
    """
    content = "Users name is John. He is 30 years old and lives in San Francisco."

    # Patch the storage class so nothing touches a real vector store.
    target = "crewai.knowledge.storage.knowledge_storage.KnowledgeStorage"
    with patch(target) as MockStorage:
        mock_storage = MockStorage.return_value
        canned_hit = {
            "context": "Users name is John. He is 30 years old and lives in San Francisco."
        }
        mock_storage.search.return_value = [canned_hit]

        # Build the source and point it at the mocked storage before adding.
        string_source = StringKnowledgeSource(content=content)
        string_source.storage = mock_storage
        string_source.add()

        # add() must have pushed content into storage.
        assert mock_storage.save.called

        # NOTE: this query hits the mock directly, so it only reads back the
        # canned search result configured above.
        results = mock_storage.search(["What city does John live in?"])
        assert len(results) > 0
        first_context = results[0]["context"]
        assert "San Francisco" in first_context
def test_knowledge_storage_search_empty_results():
    """Check that KnowledgeStorage.search() returns nothing when the collection has no hits."""
    # Every per-query list in the mocked response is empty.
    empty_response = {
        "ids": [[]],
        "metadatas": [[]],
        "documents": [[]],
        "distances": [[]],
    }
    fake_collection = MagicMock()
    fake_collection.query.return_value = empty_response

    # Swap the mock in for the storage's collection.
    knowledge_storage = KnowledgeStorage()
    knowledge_storage.collection = fake_collection

    hits = knowledge_storage.search(["test query"], score_threshold=0.35)

    # Nothing came back from the collection, so nothing may be returned.
    assert len(hits) == 0
def test_knowledge_storage_search_threshold_boundary():
    """Check that a hit whose distance equals the threshold is excluded.

    The single mocked hit has distance 0.35 and the search uses
    ``score_threshold=0.35``; the expected result list is empty.
    """
    # One hit sitting exactly on the threshold value.
    boundary_response = {
        "ids": [["1"]],
        "metadatas": [[{}]],
        "documents": [["Doc1"]],
        "distances": [[0.35]],
    }
    fake_collection = MagicMock()
    fake_collection.query.return_value = boundary_response

    # Swap the mock in for the storage's collection.
    knowledge_storage = KnowledgeStorage()
    knowledge_storage.collection = fake_collection

    hits = knowledge_storage.search(["test query"], score_threshold=0.35)

    # An exact-threshold match is not close enough to be kept.
    assert len(hits) == 0
def test_knowledge_storage_search_error_handling():
    """Check that an error raised by the collection query surfaces from search().

    The mocked collection raises ``Exception("ChromaDB error")`` from ``query``.
    The assertion pins that message so the test cannot pass on an unrelated
    failure (a bare ``pytest.raises(Exception)`` accepts any error at all).
    """
    # Collection whose query always fails with a recognisable message.
    mock_collection = MagicMock()
    mock_collection.query.side_effect = Exception("ChromaDB error")

    # Swap the failing mock in for the storage's collection.
    storage = KnowledgeStorage()
    storage.collection = mock_collection

    # The injected error, identified by its message, must reach the caller.
    with pytest.raises(Exception, match="ChromaDB error"):
        storage.search(["test query"], score_threshold=0.35)