fix: add batch_size support to prevent embedder token limit errors

- add batch_size field to BaseRagConfig (default=100)
- update ChromaDB/Qdrant clients and factories to use batch_size
- extract and filter batch_size from embedder config in KnowledgeStorage
- fix large CSV files exceeding embedder token limits (#3574)
- remove unneeded conditional for type

Co-authored-by: Vini Brasil <vini@hey.com>
2026-01-11 00:58:30 +00:00 · 2025-09-24 00:05:43 -04:00
parent 4ac65eb0a6
commit 1dbe8aab52
12 changed files with 558 additions and 56 deletions
--- a/src/crewai/rag/chromadb/client.py
+++ b/src/crewai/rag/chromadb/client.py
@@ -17,6 +17,7 @@ from crewai.rag.chromadb.types import (
    ChromaDBCollectionSearchParams,
 )
 from crewai.rag.chromadb.utils import (
+    _create_batch_slice,
    _extract_search_params,
    _is_async_client,
    _is_sync_client,
@@ -52,6 +53,7 @@ class ChromaDBClient(BaseClient):
        embedding_function: ChromaEmbeddingFunction,
        default_limit: int = 5,
        default_score_threshold: float = 0.6,
+        default_batch_size: int = 100,
    ) -> None:
        """Initialize ChromaDBClient with client and embedding function.

@@ -60,11 +62,13 @@ class ChromaDBClient(BaseClient):
            embedding_function: Embedding function for text to vector conversion.
            default_limit: Default number of results to return in searches.
            default_score_threshold: Default minimum score for search results.
+            default_batch_size: Default batch size for adding documents.
        """
        self.client = client
        self.embedding_function = embedding_function
        self.default_limit = default_limit
        self.default_score_threshold = default_score_threshold
+        self.default_batch_size = default_batch_size

    def create_collection(
        self, **kwargs: Unpack[ChromaDBCollectionCreateParams]
@@ -291,6 +295,7 @@ class ChromaDBClient(BaseClient):
                - content: The text content (required)
                - doc_id: Optional unique identifier (auto-generated if missing)
                - metadata: Optional metadata dictionary
+            batch_size: Optional batch size for processing documents (default: 100)

        Raises:
            TypeError: If AsyncClientAPI is used instead of ClientAPI for sync operations.
@@ -305,6 +310,7 @@ class ChromaDBClient(BaseClient):

        collection_name = kwargs["collection_name"]
        documents = kwargs["documents"]
+        batch_size = kwargs.get("batch_size", self.default_batch_size)

        if not documents:
            raise ValueError("Documents list cannot be empty")
@@ -315,13 +321,17 @@ class ChromaDBClient(BaseClient):
        )

        prepared = _prepare_documents_for_chromadb(documents)
-        # ChromaDB doesn't accept empty metadata dicts, so pass None if all are empty
-        metadatas = prepared.metadatas if any(m for m in prepared.metadatas) else None
-        collection.upsert(
-            ids=prepared.ids,
-            documents=prepared.texts,
-            metadatas=metadatas,
-        )
+
+        for i in range(0, len(prepared.ids), batch_size):
+            batch_ids, batch_texts, batch_metadatas = _create_batch_slice(
+                prepared=prepared, start_index=i, batch_size=batch_size
+            )
+
+            collection.upsert(
+                ids=batch_ids,
+                documents=batch_texts,
+                metadatas=batch_metadatas,
+            )

    async def aadd_documents(self, **kwargs: Unpack[BaseCollectionAddParams]) -> None:
        """Add documents with their embeddings to a collection asynchronously.
@@ -335,6 +345,7 @@ class ChromaDBClient(BaseClient):
                - content: The text content (required)
                - doc_id: Optional unique identifier (auto-generated if missing)
                - metadata: Optional metadata dictionary
+            batch_size: Optional batch size for processing documents (default: 100)

        Raises:
            TypeError: If ClientAPI is used instead of AsyncClientAPI for async operations.
@@ -349,6 +360,7 @@ class ChromaDBClient(BaseClient):

        collection_name = kwargs["collection_name"]
        documents = kwargs["documents"]
+        batch_size = kwargs.get("batch_size", self.default_batch_size)

        if not documents:
            raise ValueError("Documents list cannot be empty")
@@ -358,13 +370,17 @@ class ChromaDBClient(BaseClient):
            embedding_function=self.embedding_function,
        )
        prepared = _prepare_documents_for_chromadb(documents)
-        # ChromaDB doesn't accept empty metadata dicts, so pass None if all are empty
-        metadatas = prepared.metadatas if any(m for m in prepared.metadatas) else None
-        await collection.upsert(
-            ids=prepared.ids,
-            documents=prepared.texts,
-            metadatas=metadatas,
-        )
+
+        for i in range(0, len(prepared.ids), batch_size):
+            batch_ids, batch_texts, batch_metadatas = _create_batch_slice(
+                prepared=prepared, start_index=i, batch_size=batch_size
+            )
+
+            await collection.upsert(
+                ids=batch_ids,
+                documents=batch_texts,
+                metadatas=batch_metadatas,
+            )

    def search(
        self, **kwargs: Unpack[ChromaDBCollectionSearchParams]