refactor: unify rag storage with instance-specific client support (#3455)

- ignore line length errors globally
- migrate knowledge/memory and crew query_knowledge to `SearchResult`
- remove legacy chromadb utils; fix empty metadata handling
- restore openai as default embedding provider; support instance-specific clients
- update and fix tests for `SearchResult` migration and rag changes
2026-01-26 00:28:13 +00:00 · 2025-09-17 14:46:54 -04:00
parent 81bd81e5f5
commit f28e78c5ba
30 changed files with 1956 additions and 976 deletions
--- a/tests/utilities/test_chromadb_utils.py
+++ b/tests/utilities/test_chromadb_utils.py
@@ -1,123 +0,0 @@
-import multiprocessing
-import tempfile
-import unittest
-
-from chromadb.config import Settings
-from unittest.mock import patch, MagicMock
-
-from crewai.utilities.chromadb import (
-    MAX_COLLECTION_LENGTH,
-    MIN_COLLECTION_LENGTH,
-    is_ipv4_pattern,
-    sanitize_collection_name,
-    create_persistent_client,
-)
-
-
-def persistent_client_worker(path, queue):
-    try:
-        create_persistent_client(path=path)
-        queue.put(None)
-    except Exception as e:
-        queue.put(e)
-
-
-class TestChromadbUtils(unittest.TestCase):
-    def test_sanitize_collection_name_long_name(self):
-        """Test sanitizing a very long collection name."""
-        long_name = "This is an extremely long role name that will definitely exceed the ChromaDB collection name limit of 63 characters and cause an error when used as a collection name"
-        sanitized = sanitize_collection_name(long_name)
-        self.assertLessEqual(len(sanitized), MAX_COLLECTION_LENGTH)
-        self.assertTrue(sanitized[0].isalnum())
-        self.assertTrue(sanitized[-1].isalnum())
-        self.assertTrue(all(c.isalnum() or c in ["_", "-"] for c in sanitized))
-
-    def test_sanitize_collection_name_special_chars(self):
-        """Test sanitizing a name with special characters."""
-        special_chars = "Agent@123!#$%^&*()"
-        sanitized = sanitize_collection_name(special_chars)
-        self.assertTrue(sanitized[0].isalnum())
-        self.assertTrue(sanitized[-1].isalnum())
-        self.assertTrue(all(c.isalnum() or c in ["_", "-"] for c in sanitized))
-
-    def test_sanitize_collection_name_short_name(self):
-        """Test sanitizing a very short name."""
-        short_name = "A"
-        sanitized = sanitize_collection_name(short_name)
-        self.assertGreaterEqual(len(sanitized), MIN_COLLECTION_LENGTH)
-        self.assertTrue(sanitized[0].isalnum())
-        self.assertTrue(sanitized[-1].isalnum())
-
-    def test_sanitize_collection_name_bad_ends(self):
-        """Test sanitizing a name with non-alphanumeric start/end."""
-        bad_ends = "_Agent_"
-        sanitized = sanitize_collection_name(bad_ends)
-        self.assertTrue(sanitized[0].isalnum())
-        self.assertTrue(sanitized[-1].isalnum())
-
-    def test_sanitize_collection_name_none(self):
-        """Test sanitizing a None value."""
-        sanitized = sanitize_collection_name(None)
-        self.assertEqual(sanitized, "default_collection")
-
-    def test_sanitize_collection_name_ipv4_pattern(self):
-        """Test sanitizing an IPv4 address."""
-        ipv4 = "192.168.1.1"
-        sanitized = sanitize_collection_name(ipv4)
-        self.assertTrue(sanitized.startswith("ip_"))
-        self.assertTrue(sanitized[0].isalnum())
-        self.assertTrue(sanitized[-1].isalnum())
-        self.assertTrue(all(c.isalnum() or c in ["_", "-"] for c in sanitized))
-
-    def test_is_ipv4_pattern(self):
-        """Test IPv4 pattern detection."""
-        self.assertTrue(is_ipv4_pattern("192.168.1.1"))
-        self.assertFalse(is_ipv4_pattern("not.an.ip.address"))
-
-    def test_sanitize_collection_name_properties(self):
-        """Test that sanitized collection names always meet ChromaDB requirements."""
-        test_cases = [
-            "A" * 100,  # Very long name
-            "_start_with_underscore",
-            "end_with_underscore_",
-            "contains@special#characters",
-            "192.168.1.1",  # IPv4 address
-            "a" * 2,  # Too short
-        ]
-        for test_case in test_cases:
-            sanitized = sanitize_collection_name(test_case)
-            self.assertGreaterEqual(len(sanitized), MIN_COLLECTION_LENGTH)
-            self.assertLessEqual(len(sanitized), MAX_COLLECTION_LENGTH)
-            self.assertTrue(sanitized[0].isalnum())
-            self.assertTrue(sanitized[-1].isalnum())
-
-    def test_create_persistent_client_passes_args(self):
-        with patch(
-            "crewai.utilities.chromadb.PersistentClient"
-        ) as mock_persistent_client, tempfile.TemporaryDirectory() as tmpdir:
-            mock_instance = MagicMock()
-            mock_persistent_client.return_value = mock_instance
-
-            settings = Settings(allow_reset=True)
-            client = create_persistent_client(path=tmpdir, settings=settings)
-
-            mock_persistent_client.assert_called_once_with(
-                path=tmpdir, settings=settings
-            )
-            self.assertIs(client, mock_instance)
-
-    def test_create_persistent_client_process_safe(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            queue = multiprocessing.Queue()
-            processes = [
-                multiprocessing.Process(
-                    target=persistent_client_worker, args=(tmpdir, queue)
-                )
-                for _ in range(5)
-            ]
-
-            [p.start() for p in processes]
-            [p.join() for p in processes]
-
-            errors = [queue.get(timeout=5) for _ in processes]
-            self.assertTrue(all(err is None for err in errors))
--- a/tests/utilities/test_knowledge_planning.py
+++ b/tests/utilities/test_knowledge_planning.py
@@ -29,13 +29,15 @@ def mock_knowledge_source():
    """
    return StringKnowledgeSource(content=content)

-@patch('crewai.knowledge.storage.knowledge_storage.chromadb')
-def test_knowledge_included_in_planning(mock_chroma):
+
+@patch("crewai.rag.config.utils.get_rag_client")
+def test_knowledge_included_in_planning(mock_get_client):
    """Test that verifies knowledge sources are properly included in planning."""
-    # Mock ChromaDB collection
-    mock_collection = mock_chroma.return_value.get_or_create_collection.return_value
-    mock_collection.add.return_value = None
-    
+    # Mock RAG client
+    mock_client = mock_get_client.return_value
+    mock_client.get_or_create_collection.return_value = None
+    mock_client.add_documents.return_value = None
+
    # Create an agent with knowledge
    agent = Agent(
        role="AI Researcher",
@@ -45,14 +47,14 @@ def test_knowledge_included_in_planning(mock_chroma):
            StringKnowledgeSource(
                content="AI systems require careful training and validation."
            )
-        ]
+        ],
    )

    # Create a task for the agent
    task = Task(
        description="Explain the basics of AI systems",
        expected_output="A clear explanation of AI fundamentals",
-        agent=agent
+        agent=agent,
    )

    # Create a crew planner
@@ -62,23 +64,29 @@ def test_knowledge_included_in_planning(mock_chroma):
    task_summary = planner._create_tasks_summary()

    # Verify that knowledge is included in planning when present
-    assert "AI systems require careful training" in task_summary, \
+    assert "AI systems require careful training" in task_summary, (
        "Knowledge content should be present in task summary when knowledge exists"
-    assert '"agent_knowledge"' in task_summary, \
+    )
+    assert '"agent_knowledge"' in task_summary, (
        "agent_knowledge field should be present in task summary when knowledge exists"
+    )

    # Verify that knowledge is properly formatted
-    assert isinstance(task.agent.knowledge_sources, list), \
+    assert isinstance(task.agent.knowledge_sources, list), (
        "Knowledge sources should be stored in a list"
-    assert len(task.agent.knowledge_sources) > 0, \
+    )
+    assert len(task.agent.knowledge_sources) > 0, (
        "At least one knowledge source should be present"
-    assert task.agent.knowledge_sources[0].content in task_summary, \
+    )
+    assert task.agent.knowledge_sources[0].content in task_summary, (
        "Knowledge source content should be included in task summary"
+    )

    # Verify that other expected components are still present
-    assert task.description in task_summary, \
+    assert task.description in task_summary, (
        "Task description should be present in task summary"
-    assert task.expected_output in task_summary, \
+    )
+    assert task.expected_output in task_summary, (
        "Expected output should be present in task summary"
-    assert agent.role in task_summary, \
-        "Agent role should be present in task summary"
+    )
+    assert agent.role in task_summary, "Agent role should be present in task summary"