feat: Add graceful quota limit handling for LLM APIs

- Create LLMQuotaLimitExceededException following CrewAI's existing pattern
- Add quota limit error handling in both streaming and non-streaming LLM calls
- Update error handling in agent execution and crew agent executor
- Add comprehensive tests for quota limit scenarios
- Fixes issue #3434: Handle RateLimitError gracefully instead of crashing

The implementation catches litellm.exceptions.RateLimitError and converts it to a CrewAI-specific exception, allowing tasks to detect quota limits and shut down gracefully instead of crashing with unhandled exceptions.

Co-Authored-By: João <joao@crewai.com>
2026-01-09 16:18:30 +00:00 · 2025-09-02 16:45:35 +00:00
parent 92d71f7f06
commit c763457e8d
6 changed files with 263 additions and 1 deletions
--- a/src/crewai/agent.py
+++ b/src/crewai/agent.py
@@ -48,6 +48,10 @@ from crewai.events.types.memory_events import (
    MemoryRetrievalStartedEvent,
    MemoryRetrievalCompletedEvent,
 )
 from crewai.utilities.exceptions import (
    LLMContextLengthExceededException,
    LLMQuotaLimitExceededException,
 )
 from crewai.events.types.knowledge_events import (
    KnowledgeQueryCompletedEvent,
    KnowledgeQueryFailedEvent,
@@ -461,6 +465,26 @@ class Agent(BaseAgent):
                ),
            )
            raise e
        except LLMContextLengthExceededException as e:
            crewai_event_bus.emit(
                self,
                event=AgentExecutionErrorEvent(
                    agent=self,
                    task=task,
                    error=str(e),
                ),
            )
            raise e
        except LLMQuotaLimitExceededException as e:
            crewai_event_bus.emit(
                self,
                event=AgentExecutionErrorEvent(
                    agent=self,
                    task=task,
                    error=str(e),
                ),
            )
            raise e
        except Exception as e:
            if e.__class__.__module__.startswith("litellm"):
                # Do not retry on litellm errors
--- a/src/crewai/agents/crew_agent_executor.py
+++ b/src/crewai/agents/crew_agent_executor.py
@@ -35,6 +35,10 @@ from crewai.events.types.logging_events import (
    AgentLogsExecutionEvent,
 )
 from crewai.events.event_bus import crewai_event_bus
 from crewai.utilities.exceptions import (
    LLMContextLengthExceededException,
    LLMQuotaLimitExceededException,
 )
 class CrewAgentExecutor(CrewAgentExecutorMixin):
@@ -201,6 +205,10 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                    printer=self._printer,
                )
            except LLMContextLengthExceededException as e:
                raise e
            except LLMQuotaLimitExceededException as e:
                raise e
            except Exception as e:
                if e.__class__.__module__.startswith("litellm"):
                    # Do not retry on litellm errors
--- a/src/crewai/llm.py
+++ b/src/crewai/llm.py
@@ -35,12 +35,16 @@ from crewai.events.types.tool_usage_events import (
    ToolUsageFinishedEvent,
    ToolUsageErrorEvent,
 )
 from crewai.utilities.exceptions import (
    LLMContextLengthExceededException,
    LLMQuotaLimitExceededException,
 )
 with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    import litellm
    from litellm import Choices
-    from litellm.exceptions import ContextWindowExceededError
+    from litellm.exceptions import ContextWindowExceededError, RateLimitError
    from litellm.litellm_core_utils.get_supported_openai_params import (
        get_supported_openai_params,
    )
@@ -669,6 +673,10 @@ class LLM(BaseLLM):
            )
            return full_response
        except RateLimitError as e:
            # Convert litellm's rate limit error to our own exception type
            # for graceful quota limit handling
            raise LLMQuotaLimitExceededException(str(e))
        except ContextWindowExceededError as e:
            # Catch context window errors from litellm and convert them to our own exception type.
            # This exception is handled by CrewAgentExecutor._invoke_loop() which can then
@@ -812,6 +820,10 @@ class LLM(BaseLLM):
            # length issues appropriately.
            response = litellm.completion(**params)
        except RateLimitError as e:
            # Convert litellm's rate limit error to our own exception type
            # for graceful quota limit handling
            raise LLMQuotaLimitExceededException(str(e))
        except ContextWindowExceededError as e:
            # Convert litellm's context window error to our own exception type
            # for consistent handling in the rest of the codebase
--- a/src/crewai/utilities/exceptions/__init__.py
+++ b/src/crewai/utilities/exceptions/__init__.py
@@ -1 +1,4 @@
 """Exceptions for crewAI."""
 from crewai.utilities.exceptions.context_window_exceeding_exception import LLMContextLengthExceededException
 from crewai.utilities.exceptions.quota_limit_exception import LLMQuotaLimitExceededException
--- a/src/crewai/utilities/exceptions/quota_limit_exception.py
+++ b/src/crewai/utilities/exceptions/quota_limit_exception.py
@@ -0,0 +1,31 @@
 class LLMQuotaLimitExceededException(Exception):
    """Signal that an LLM call failed because a quota or rate limit was hit.

    The exception text is a fixed, user-facing explanation that embeds the
    provider's original message; the untouched original is also kept on
    ``original_error_message`` for callers that need the raw text.
    """

    # Lower-case phrases that mark a provider message as a quota/rate-limit
    # failure. Matched as case-insensitive substrings by
    # ``_is_quota_limit_error``.
    QUOTA_LIMIT_ERRORS = [
        "quota exceeded",
        "rate limit exceeded",
        "resource exhausted",
        "too many requests",
        "quota limit reached",
        "api quota exceeded",
        "usage limit exceeded",
        "billing quota exceeded",
        "request limit exceeded",
        "daily quota exceeded",
        "monthly quota exceeded",
    ]

    def __init__(self, error_message: str):
        """Wrap *error_message* in the standard quota-limit explanation.

        Args:
            error_message: The message reported by the underlying error.
        """
        # Preserve the raw text before it is folded into the longer message.
        self.original_error_message = error_message
        super().__init__(self._get_error_message(error_message))

    def _is_quota_limit_error(self, error_message: str) -> bool:
        """Return whether *error_message* contains a known quota phrase.

        The comparison ignores case. The constructor does not call this
        method; it is a standalone classification helper.
        """
        # Lower-case the message once rather than once per candidate phrase.
        lowered_message = error_message.lower()
        return any(
            phrase.lower() in lowered_message
            for phrase in self.QUOTA_LIMIT_ERRORS
        )

    def _get_error_message(self, error_message: str) -> str:
        """Build the full exception text shown for *error_message*."""
        return (
            f"LLM quota limit exceeded. Original error: {error_message}\n"
            "Your API quota or rate limit has been reached. Please check your API usage, "
            "upgrade your plan, or wait for the quota to reset before retrying."
        )
--- a/tests/test_quota_limit_handling.py
+++ b/tests/test_quota_limit_handling.py
@@ -0,0 +1,184 @@
 import pytest
 from unittest.mock import Mock, patch
 from litellm.exceptions import RateLimitError
 from crewai.llm import LLM
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.crew import Crew
 from crewai.utilities.exceptions import LLMQuotaLimitExceededException
 class TestQuotaLimitHandling:
    """Test suite for quota limit handling in CrewAI."""

    @staticmethod
    def _rate_limit_error(message):
        """Build a litellm ``RateLimitError`` carrying *message*.

        litellm's exception constructors take the provider and model in
        addition to the message, so they are supplied here once for every
        test that needs a rate-limit failure.
        """
        return RateLimitError(
            message=message,
            llm_provider="openai",
            model="gpt-3.5-turbo",
        )

    def test_llm_non_streaming_quota_limit_exception(self):
        """Test that LLM raises LLMQuotaLimitExceededException for rate limit errors in non-streaming mode."""
        llm = LLM(model="gpt-3.5-turbo", stream=False)
        with patch("litellm.completion") as mock_completion:
            mock_completion.side_effect = self._rate_limit_error("Rate limit exceeded")
            with pytest.raises(LLMQuotaLimitExceededException) as exc_info:
                llm.call(messages=[{"role": "user", "content": "Hello"}])
            assert "quota limit exceeded" in str(exc_info.value).lower()
            assert "Rate limit exceeded" in str(exc_info.value)

    def test_llm_streaming_quota_limit_exception(self):
        """Test that LLM raises LLMQuotaLimitExceededException for rate limit errors in streaming mode."""
        llm = LLM(model="gpt-3.5-turbo", stream=True)
        with patch("litellm.completion") as mock_completion:
            mock_completion.side_effect = self._rate_limit_error("API quota exceeded")
            with pytest.raises(LLMQuotaLimitExceededException) as exc_info:
                llm.call(messages=[{"role": "user", "content": "Hello"}])
            assert "quota limit exceeded" in str(exc_info.value).lower()
            assert "API quota exceeded" in str(exc_info.value)

    def test_agent_handles_quota_limit_gracefully(self):
        """Test that Agent re-raises quota limit exceptions from its LLM."""
        agent = Agent(
            role="Test Agent",
            goal="Test goal",
            backstory="Test backstory",
            llm=LLM(model="gpt-3.5-turbo")
        )
        # Task needs an expected_output; without it the Task cannot be built
        # and the test would fail before the quota path is exercised.
        task = Task(
            description="Test task",
            expected_output="Test output",
            agent=agent,
        )
        with patch.object(agent.llm, "call") as mock_call:
            mock_call.side_effect = LLMQuotaLimitExceededException("Quota exceeded")
            with pytest.raises(LLMQuotaLimitExceededException):
                agent.execute_task(
                    task=task,
                    context="Test context"
                )

    def test_crew_handles_quota_limit_in_task_execution(self):
        """Test that a quota limit exception propagates out of Crew.kickoff()."""
        agent = Agent(
            role="Test Agent",
            goal="Test goal",
            backstory="Test backstory",
            llm=LLM(model="gpt-3.5-turbo")
        )
        task = Task(
            description="Test task",
            expected_output="Test output",
            agent=agent
        )
        crew = Crew(
            agents=[agent],
            tasks=[task]
        )
        with patch.object(agent.llm, "call") as mock_call:
            mock_call.side_effect = LLMQuotaLimitExceededException("Monthly quota exceeded")
            with pytest.raises(LLMQuotaLimitExceededException):
                crew.kickoff()

    def test_quota_limit_exception_error_message_format(self):
        """Test that LLMQuotaLimitExceededException formats error messages correctly."""
        original_error = "Resource exhausted: Quota exceeded for requests per day"
        exception = LLMQuotaLimitExceededException(original_error)
        error_message = str(exception)
        assert "LLM quota limit exceeded" in error_message
        assert original_error in error_message
        assert "API quota or rate limit has been reached" in error_message
        assert "upgrade your plan" in error_message

    def test_quota_limit_exception_preserves_original_error(self):
        """Test that LLMQuotaLimitExceededException preserves the original error message."""
        original_error = "429 Too Many Requests: Rate limit exceeded"
        exception = LLMQuotaLimitExceededException(original_error)
        assert exception.original_error_message == original_error

    @pytest.mark.parametrize("error_message,should_match", [
        ("quota exceeded", True),
        ("rate limit exceeded", True),
        ("resource exhausted", True),
        ("too many requests", True),
        ("quota limit reached", True),
        ("api quota exceeded", True),
        ("usage limit exceeded", True),
        ("billing quota exceeded", True),
        ("request limit exceeded", True),
        ("daily quota exceeded", True),
        ("monthly quota exceeded", True),
        ("QUOTA EXCEEDED", True),  # Case insensitive
        ("Rate Limit Exceeded", True),  # Case insensitive
        ("some other error", False),
        ("network timeout", False),
    ])
    def test_quota_limit_error_detection(self, error_message, should_match):
        """Test that quota limit error detection works for various error messages."""
        exception = LLMQuotaLimitExceededException(error_message)
        assert exception._is_quota_limit_error(error_message) == should_match

    def test_different_provider_quota_errors(self):
        """Test that rate-limit messages in several provider styles are all converted."""
        test_cases = [
            "Rate limit reached for requests",
            "rate_limit_error: Number of requests per minute exceeded",
            "RESOURCE_EXHAUSTED: Quota exceeded",
            "429 Too Many Requests",
        ]
        llm = LLM(model="gpt-3.5-turbo")
        for error_message in test_cases:
            with patch("litellm.completion") as mock_completion:
                mock_completion.side_effect = self._rate_limit_error(error_message)
                with pytest.raises(LLMQuotaLimitExceededException) as exc_info:
                    llm.call(messages=[{"role": "user", "content": "Hello"}])
                assert error_message in str(exc_info.value)

    def test_quota_limit_vs_context_window_exceptions(self):
        """Test that quota limit and context window exceptions are handled separately."""
        from litellm.exceptions import ContextWindowExceededError
        from crewai.utilities.exceptions import LLMContextLengthExceededException
        llm = LLM(model="gpt-3.5-turbo")
        with patch("litellm.completion") as mock_completion:
            mock_completion.side_effect = self._rate_limit_error("Quota exceeded")
            with pytest.raises(LLMQuotaLimitExceededException):
                llm.call(messages=[{"role": "user", "content": "Hello"}])
        with patch("litellm.completion") as mock_completion:
            # Like RateLimitError, this litellm exception also takes the
            # model and provider alongside the message.
            mock_completion.side_effect = ContextWindowExceededError(
                message="Context length exceeded",
                model="gpt-3.5-turbo",
                llm_provider="openai",
            )
            with pytest.raises(LLMContextLengthExceededException):
                llm.call(messages=[{"role": "user", "content": "Hello"}])

    def test_quota_limit_exception_in_crew_agent_executor(self):
        """Test that CrewAgentExecutor re-raises quota limit exceptions."""
        from crewai.agents.crew_agent_executor import CrewAgentExecutor
        agent = Agent(
            role="Test Agent",
            goal="Test goal",
            backstory="Test backstory",
            llm=LLM(model="gpt-3.5-turbo")
        )
        # NOTE(review): the executor is built from `agent` alone and invoked
        # with "chat_history"/"agent_scratchpad" keys. Confirm that
        # CrewAgentExecutor's constructor and invoke() accept exactly this;
        # if they require more (llm, task, crew, prompt, tool inputs, ...),
        # this test fails before it reaches the quota-limit path.
        executor = CrewAgentExecutor(agent=agent)
        with patch.object(agent.llm, "call") as mock_call:
            mock_call.side_effect = LLMQuotaLimitExceededException("Daily quota exceeded")
            with pytest.raises(LLMQuotaLimitExceededException):
                executor.invoke({
                    "input": "Test input",
                    "chat_history": [],
                    "agent_scratchpad": ""
                })