fix: enhance LLM response handling and serialization (#4909)

* fix: enhance LLM response handling and serialization

* Updated the Flow class to improve error handling when both structured and simple prompting fail, ensuring the first outcome is returned as a fallback.
* Introduced a new function, _serialize_llm_for_context, to properly serialize LLM objects with provider prefixes for better context management.
* Added tests to validate the new serialization logic and ensure correct behavior when LLM calls fail.

This update enhances the robustness of LLM interactions and improves the overall flow of handling outcomes.
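
Condensed, the new degradation path looks roughly like the sketch below (simplified from the Flow._collapse_to_outcome hunk further down; `_structured_outcome` here is a placeholder stub, not a real crewai function):

```python
import logging

logger = logging.getLogger(__name__)


def _structured_outcome(llm_instance, prompt, outcomes):
    """Placeholder for the structured-output path (unchanged by this PR)."""
    raise NotImplementedError("stub for the structured-output attempt")


def collapse_to_outcome(llm_instance, prompt, outcomes):
    """Sketch of the fallback chain: structured output -> plain prompt -> outcomes[0]."""
    try:
        return _structured_outcome(llm_instance, prompt, outcomes)
    except Exception as e:
        logger.warning(f"Structured output failed, falling back to simple prompting: {e}")
    try:
        # Plain prompting is now also wrapped in try/except by this PR.
        response = llm_instance.call(messages=[{"role": "user", "content": prompt}])
        cleaned = str(response).strip().lower()
        for outcome in outcomes:  # exact match first
            if outcome.lower() == cleaned:
                return outcome
        for outcome in outcomes:  # then partial match
            if outcome.lower() in cleaned:
                return outcome
    except Exception as fallback_err:
        logger.warning(f"Simple prompting also failed: {fallback_err}")
    return outcomes[0]  # safe fallback when nothing above produced a match
```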

* fix: patch VCR response handling to prevent httpx.ResponseNotRead errors (#4917)

* fix: enhance LLM response handling and serialization

* Updated the Flow class to improve error handling when both structured and simple prompting fail, ensuring the first outcome is returned as a fallback.
* Introduced a new function, _serialize_llm_for_context, to properly serialize LLM objects with provider prefixes for better context management.
* Added tests to validate the new serialization logic and ensure correct behavior when LLM calls fail.

This update enhances the robustness of LLM interactions and improves the overall flow of handling outcomes.

* fix: patch VCR response handling to prevent httpx.ResponseNotRead errors

VCR's _from_serialized_response mocks httpx.Response.read(), which
prevents the response's internal _content attribute from being properly
initialized. When OpenAI's client (using with_raw_response) accesses
response.content, httpx raises ResponseNotRead.

This patch explicitly sets response._content after the response is
created, ensuring that tests using VCR cassettes work correctly with
the OpenAI client's raw response handling.
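
The conftest-level patch itself is not among the hunks shown further down; as a rough illustration, it amounts to wrapping vcrpy's httpx stub and filling the private buffer, along these lines (module path, signature, and cassette keys are assumptions, not copied from the actual change):

```python
# Hypothetical sketch only; the real patch may differ in names and details.
import vcr.stubs.httpx_stubs as httpx_stubs

_original_from_serialized_response = httpx_stubs._from_serialized_response


def _patched_from_serialized_response(request, serialized_response, *args, **kwargs):
    response = _original_from_serialized_response(request, serialized_response, *args, **kwargs)
    if getattr(response, "_content", None) is None:
        # Fill the buffer that Response.read() would normally populate so that
        # accessing response.content no longer raises httpx.ResponseNotRead.
        body = serialized_response.get("body", {}).get("string", b"") or b""  # assumed cassette layout
        response._content = body if isinstance(body, (bytes, bytearray)) else str(body).encode()
    return response


httpx_stubs._from_serialized_response = _patched_from_serialized_response
```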

Fixes tests:
- test_hierarchical_crew_creation_tasks_with_sync_last
- test_conditional_task_last_task_when_conditional_is_false
- test_crew_log_file_output

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Joao Moura <joaomdmoura@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: alex-clawd <alex@crewai.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
João Moura authored on 2026-03-17 01:19:31 -07:00 · committed by GitHub
parent b95486c187 · commit 6235810844
6 changed files with 147 additions and 22 deletions

View File

@@ -3086,25 +3086,35 @@ class Flow(Generic[T], metaclass=FlowMeta):
             logger.warning(
                 f"Structured output failed, falling back to simple prompting: {e}"
             )
-            response = llm_instance.call(messages=prompt)
-            response_clean = str(response).strip()
-
-            # Exact match (case-insensitive)
-            for outcome in outcomes:
-                if outcome.lower() == response_clean.lower():
-                    return outcome
-
-            # Partial match
-            for outcome in outcomes:
-                if outcome.lower() in response_clean.lower():
-                    return outcome
-
-            # Fallback to first outcome
-            logger.warning(
-                f"Could not match LLM response '{response_clean}' to outcomes {list(outcomes)}. "
-                f"Falling back to first outcome: {outcomes[0]}"
-            )
-            return outcomes[0]
+            try:
+                response = llm_instance.call(
+                    messages=[{"role": "user", "content": prompt}],
+                )
+                response_clean = str(response).strip()
+
+                # Exact match (case-insensitive)
+                for outcome in outcomes:
+                    if outcome.lower() == response_clean.lower():
+                        return outcome
+
+                # Partial match
+                for outcome in outcomes:
+                    if outcome.lower() in response_clean.lower():
+                        return outcome
+
+                # Fallback to first outcome
+                logger.warning(
+                    f"Could not match LLM response '{response_clean}' to outcomes {list(outcomes)}. "
+                    f"Falling back to first outcome: {outcomes[0]}"
+                )
+                return outcomes[0]
+            except Exception as fallback_err:
+                logger.warning(
+                    f"Simple prompting also failed: {fallback_err}. "
+                    f"Falling back to first outcome: {outcomes[0]}"
+                )
+                return outcomes[0]

     def _log_flow_event(
         self,

View File

@@ -76,6 +76,24 @@ if TYPE_CHECKING:

 F = TypeVar("F", bound=Callable[..., Any])

+
+def _serialize_llm_for_context(llm: Any) -> str | None:
+    """Serialize a BaseLLM object to a model string with provider prefix.
+
+    When persisting the LLM for HITL resume, we need to store enough info
+    to reconstruct a working LLM on the resume worker. Just storing the bare
+    model name (e.g. "gemini-3-flash-preview") causes provider inference to
+    fail — it defaults to OpenAI. Including the provider prefix (e.g.
+    "gemini/gemini-3-flash-preview") allows LLM() to correctly route.
+    """
+    model = getattr(llm, "model", None)
+    if not model:
+        return None
+
+    provider = getattr(llm, "provider", None)
+    if provider and "/" not in model:
+        return f"{provider}/{model}"
+
+    return model
+
+
 @dataclass
 class HumanFeedbackResult:
     """Result from a @human_feedback decorated method.

@@ -412,7 +430,7 @@ def human_feedback(
             emit=list(emit) if emit else None,
             default_outcome=default_outcome,
             metadata=metadata or {},
-            llm=llm if isinstance(llm, str) else getattr(llm, "model", None),
+            llm=llm if isinstance(llm, str) else _serialize_llm_for_context(llm),
         )

         # Determine effective provider:

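For orientation, the round trip this serializer enables looks roughly like the following illustrative sketch (the `LLM(model=..., provider=...)` form is taken from the test comment further down; the resume side is simplified):

```python
from crewai import LLM
from crewai.flow.human_feedback import _serialize_llm_for_context

# At pause time: a configured LLM object is reduced to a routable string.
llm = LLM(model="gemini-3-flash-preview", provider="gemini")
stored = _serialize_llm_for_context(llm)  # "gemini/gemini-3-flash-preview"

# At resume time: the prefixed string reconstructs an LLM with the right provider,
# instead of the OpenAI default that a bare model name would trigger.
resumed = LLM(model=stored)
```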
View File

@@ -240,6 +240,7 @@ ANTHROPIC_MODELS: list[AnthropicModels] = [

 GeminiModels: TypeAlias = Literal[
     "gemini-3-pro-preview",
+    "gemini-3-flash-preview",
     "gemini-2.5-pro",
     "gemini-2.5-pro-preview-03-25",
     "gemini-2.5-pro-preview-05-06",
@@ -294,6 +295,7 @@ GeminiModels: TypeAlias = Literal[
 ]

 GEMINI_MODELS: list[GeminiModels] = [
     "gemini-3-pro-preview",
+    "gemini-3-flash-preview",
     "gemini-2.5-pro",
     "gemini-2.5-pro-preview-03-25",
     "gemini-2.5-pro-preview-05-06",

View File

@@ -989,8 +989,10 @@ class TestLLMObjectPreservedInContext:
         persistence = SQLiteFlowPersistence(db_path)

         # Create a mock BaseLLM object (not a string)
+        # Simulates LLM(model="gemini-2.0-flash", provider="gemini")
         mock_llm_obj = MagicMock()
-        mock_llm_obj.model = "gemini/gemini-2.0-flash"
+        mock_llm_obj.model = "gemini-2.0-flash"
+        mock_llm_obj.provider = "gemini"

         class PausingProvider:
             def __init__(self, persistence: SQLiteFlowPersistence):
@@ -1086,11 +1088,36 @@ class TestLLMObjectPreservedInContext:
     def test_none_llm_when_no_model_attr(self) -> None:
         """Test that llm is None when object has no model attribute."""
-        mock_obj = MagicMock(spec=[])  # No attributes
-
-        # Simulate what the decorator does
-        llm_value = mock_obj if isinstance(mock_obj, str) else getattr(mock_obj, "model", None)
-
-        assert llm_value is None
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock(spec=[])  # No attributes
+        assert _serialize_llm_for_context(mock_obj) is None
+
+    def test_provider_prefix_added_to_bare_model(self) -> None:
+        """Test that provider prefix is added when model has no slash."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock()
+        mock_obj.model = "gemini-3-flash-preview"
+        mock_obj.provider = "gemini"
+        assert _serialize_llm_for_context(mock_obj) == "gemini/gemini-3-flash-preview"
+
+    def test_provider_prefix_not_doubled_when_already_present(self) -> None:
+        """Test that provider prefix is not added when model already has a slash."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock()
+        mock_obj.model = "gemini/gemini-2.0-flash"
+        mock_obj.provider = "gemini"
+        assert _serialize_llm_for_context(mock_obj) == "gemini/gemini-2.0-flash"
+
+    def test_no_provider_attr_falls_back_to_bare_model(self) -> None:
+        """Test that bare model is used when no provider attribute exists."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock(spec=[])
+        mock_obj.model = "gpt-4o-mini"
+        assert _serialize_llm_for_context(mock_obj) == "gpt-4o-mini"


 class TestAsyncHumanFeedbackEdgeCases:

View File

@@ -400,6 +400,45 @@ class TestCollapseToOutcome:

         assert result == "approved"  # First in list

+    def test_both_llm_calls_fail_returns_first_outcome(self):
+        """When both structured and simple prompting fail, return outcomes[0]."""
+        flow = Flow()
+
+        with patch("crewai.llm.LLM") as MockLLM:
+            mock_llm = MagicMock()
+            # Both calls raise — simulates wrong provider / auth failure
+            mock_llm.call.side_effect = RuntimeError("Model not found")
+            MockLLM.return_value = mock_llm
+
+            result = flow._collapse_to_outcome(
+                feedback="looks great, approve it",
+                outcomes=["needs_changes", "approved"],
+                llm="gemini-3-flash-preview",
+            )
+
+        assert result == "needs_changes"  # First in list (safe fallback)
+
+    def test_structured_fails_but_simple_succeeds(self):
+        """When structured output fails but simple prompting works, use that."""
+        flow = Flow()
+
+        with patch("crewai.llm.LLM") as MockLLM:
+            mock_llm = MagicMock()
+            # First call (structured) fails, second call (simple) succeeds
+            mock_llm.call.side_effect = [
+                RuntimeError("Function calling not supported"),
+                "approved",
+            ]
+            MockLLM.return_value = mock_llm
+
+            result = flow._collapse_to_outcome(
+                feedback="looks great",
+                outcomes=["needs_changes", "approved"],
+                llm="gpt-4o-mini",
+            )
+
+        assert result == "approved"
+

 # -- HITL Learning tests --