Fix #2753: Handle large inputs in memory by chunking text before embedding

Co-Authored-By: Joe Moura <joao@crewai.com>
Author: Devin AI
Date: 2025-05-05 09:06:33 +00:00
parent 409892d65f
commit 70b7148698
3 changed files with 91 additions and 6 deletions


@@ -6,11 +6,12 @@ import shutil
 import uuid
 from typing import Any, Dict, List, Optional

 import numpy as np
 from chromadb.api import ClientAPI

 from crewai.memory.storage.base_rag_storage import BaseRAGStorage
 from crewai.utilities import EmbeddingConfigurator
-from crewai.utilities.constants import MAX_FILE_NAME_LENGTH
+from crewai.utilities.constants import MAX_FILE_NAME_LENGTH, MEMORY_CHUNK_SIZE, MEMORY_CHUNK_OVERLAP
 from crewai.utilities.paths import db_storage_path
@@ -138,15 +139,45 @@ class RAGStorage(BaseRAGStorage):
             logging.error(f"Error during {self.type} search: {str(e)}")
             return []

+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Split text into chunks to avoid token limits.
+
+        Args:
+            text: Text to chunk
+
+        Returns:
+            List of text chunks
+        """
+        if not text:
+            return []
+
+        if len(text) <= MEMORY_CHUNK_SIZE:
+            return [text]
+
+        chunks = []
+        for i in range(0, len(text), MEMORY_CHUNK_SIZE - MEMORY_CHUNK_OVERLAP):
+            chunk = text[i:i + MEMORY_CHUNK_SIZE]
+            if chunk:  # Only add non-empty chunks
+                chunks.append(chunk)
+
+        return chunks
+
     def _generate_embedding(self, text: str, metadata: Dict[str, Any]) -> None:  # type: ignore
         if not hasattr(self, "app") or not hasattr(self, "collection"):
             self._initialize_app()

-        self.collection.add(
-            documents=[text],
-            metadatas=[metadata or {}],
-            ids=[str(uuid.uuid4())],
-        )
+        chunks = self._chunk_text(text)
+        if not chunks:
+            return None
+
+        for chunk in chunks:
+            self.collection.add(
+                documents=[chunk],
+                metadatas=[metadata or {}],
+                ids=[str(uuid.uuid4())],
+            )

     def reset(self) -> None:
         try:
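For illustration, here is a minimal standalone sketch of the sliding-window behavior the new _chunk_text method implements, with the two constants inlined (sample_text is made up for the demo and is not part of the patch):

# Sketch of the sliding-window chunking above, constants inlined.
from typing import List

MEMORY_CHUNK_SIZE = 4000
MEMORY_CHUNK_OVERLAP = 200


def chunk_text(text: str) -> List[str]:
    if not text:
        return []
    if len(text) <= MEMORY_CHUNK_SIZE:
        return [text]
    # Each chunk starts 3800 chars after the previous one, so
    # consecutive chunks share a 200-char overlap.
    step = MEMORY_CHUNK_SIZE - MEMORY_CHUNK_OVERLAP
    return [text[i:i + MEMORY_CHUNK_SIZE] for i in range(0, len(text), step)]


sample_text = "x" * 10_000  # hypothetical oversized memory entry
print([len(c) for c in chunk_text(sample_text)])  # [4000, 4000, 2400]

Because _generate_embedding now calls collection.add once per chunk with a fresh UUID, one oversized memory entry is stored as several independently retrievable documents that all share the same metadata.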


@@ -4,3 +4,5 @@ DEFAULT_SCORE_THRESHOLD = 0.35
 KNOWLEDGE_DIRECTORY = "knowledge"
 MAX_LLM_RETRY = 3
 MAX_FILE_NAME_LENGTH = 255
+MEMORY_CHUNK_SIZE = 4000
+MEMORY_CHUNK_OVERLAP = 200
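With these defaults the window advances 4000 - 200 = 3800 characters per step, so a text of n characters (for n > MEMORY_CHUNK_SIZE) yields ceil(n / 3800) chunks. A quick check of that arithmetic (chunk_count is a hypothetical helper written for this note, not part of the patch):

import math

MEMORY_CHUNK_SIZE = 4000
MEMORY_CHUNK_OVERLAP = 200
STEP = MEMORY_CHUNK_SIZE - MEMORY_CHUNK_OVERLAP  # 3800


def chunk_count(n: int) -> int:
    # Chunks produced by _chunk_text for a text of length n > MEMORY_CHUNK_SIZE
    return math.ceil(n / STEP)


print(chunk_count(10_000))   # 3
print(chunk_count(100_000))  # 27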