From 6235810844ee6c1dc11547b176bb52340ccef2c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Moura?= <joaomdmoura@gmail.com>
Date: Tue, 17 Mar 2026 01:19:31 -0700
Subject: [PATCH] fix: enhance LLM response handling and serialization (#4909)

* fix: enhance LLM response handling and serialization

* Updated the Flow class to improve error handling when both structured and simple prompting fail, ensuring the first outcome is returned as a fallback.
* Introduced a new function, _serialize_llm_for_context, to properly serialize LLM objects with provider prefixes for better context management.
* Added tests to validate the new serialization logic and ensure correct behavior when LLM calls fail.

This update enhances the robustness of LLM interactions and improves the overall flow of handling outcomes.

* fix: patch VCR response handling to prevent httpx.ResponseNotRead errors (#4917)

* fix: enhance LLM response handling and serialization

* Updated the Flow class to improve error handling when both structured and simple prompting fail, ensuring the first outcome is returned as a fallback.
* Introduced a new function, _serialize_llm_for_context, to properly serialize LLM objects with provider prefixes for better context management.
* Added tests to validate the new serialization logic and ensure correct behavior when LLM calls fail.

This update enhances the robustness of LLM interactions and improves the overall flow of handling outcomes.

* fix: patch VCR response handling to prevent httpx.ResponseNotRead errors

VCR's _from_serialized_response mocks httpx.Response.read(), which
prevents the response's internal _content attribute from being properly
initialized. When OpenAI's client (using with_raw_response) accesses
response.content, httpx raises ResponseNotRead.

This patch explicitly sets response._content after the response is
created, ensuring that tests using VCR cassettes work correctly with
the OpenAI client's raw response handling.

Fixes tests:
- test_hierarchical_crew_creation_tasks_with_sync_last
- test_conditional_task_last_task_when_conditional_is_false
- test_crew_log_file_output

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Joao Moura <joaomdmoura@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: alex-clawd <alex@crewai.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
---
 conftest.py                                   | 29 +++++++++++++
 lib/crewai/src/crewai/flow/flow.py            | 42 ++++++++++++-------
 lib/crewai/src/crewai/flow/human_feedback.py  | 20 ++++++++-
 lib/crewai/src/crewai/llms/constants.py       |  2 +
 lib/crewai/tests/test_async_human_feedback.py | 37 +++++++++++++---
 .../tests/test_human_feedback_decorator.py    | 39 +++++++++++++++++
 6 files changed, 147 insertions(+), 22 deletions(-)

diff --git a/conftest.py b/conftest.py
index 9b2c7c5c4..09852767e 100644
--- a/conftest.py
+++ b/conftest.py
@@ -43,6 +43,35 @@ def _patched_make_vcr_request(httpx_request: Any, **kwargs: Any) -> Any:
 httpx_stubs._make_vcr_request = _patched_make_vcr_request
 
 
+# Patch the response-side of VCR to fix httpx.ResponseNotRead errors.
+# VCR's _from_serialized_response mocks httpx.Response.read(), which prevents
+# the response's internal _content attribute from being properly initialized.
+# When OpenAI's client (using with_raw_response) accesses response.content,
+# httpx raises ResponseNotRead because read() was never actually called.
+# This patch ensures _content is explicitly set after response creation.
+_original_from_serialized_response = getattr(
+    httpx_stubs, "_from_serialized_response", None
+)
+
+if _original_from_serialized_response is not None:
+
+    def _patched_from_serialized_response(
+        request: Any, serialized_response: Any, history: Any = None
+    ) -> Any:
+        """Patched version that ensures response._content is properly set."""
+        response = _original_from_serialized_response(request, serialized_response, history)
+        # Explicitly set _content to avoid ResponseNotRead errors
+        # The content was passed to the constructor but the mocked read() prevents
+        # proper initialization of the internal state
+        body_content = serialized_response.get("body", {}).get("string", b"")
+        if isinstance(body_content, str):
+            body_content = body_content.encode("utf-8")
+        response._content = body_content
+        return response
+
+    httpx_stubs._from_serialized_response = _patched_from_serialized_response
+
+
 @pytest.fixture(autouse=True, scope="function")
 def cleanup_event_handlers() -> Generator[None, Any, None]:
     """Clean up event bus handlers after each test to prevent test pollution."""
diff --git a/lib/crewai/src/crewai/flow/flow.py b/lib/crewai/src/crewai/flow/flow.py
index 674f551eb..8ef77e482 100644
--- a/lib/crewai/src/crewai/flow/flow.py
+++ b/lib/crewai/src/crewai/flow/flow.py
@@ -3086,25 +3086,35 @@ class Flow(Generic[T], metaclass=FlowMeta):
             logger.warning(
                 f"Structured output failed, falling back to simple prompting: {e}"
             )
-            response = llm_instance.call(messages=prompt)
-            response_clean = str(response).strip()
+            try:
+                response = llm_instance.call(
+                    messages=[{"role": "user", "content": prompt}],
+                )
+                response_clean = str(response).strip()
 
-            # Exact match (case-insensitive)
-            for outcome in outcomes:
-                if outcome.lower() == response_clean.lower():
-                    return outcome
+                # Exact match (case-insensitive)
+                for outcome in outcomes:
+                    if outcome.lower() == response_clean.lower():
+                        return outcome
 
-            # Partial match
-            for outcome in outcomes:
-                if outcome.lower() in response_clean.lower():
-                    return outcome
+                # Partial match
+                for outcome in outcomes:
+                    if outcome.lower() in response_clean.lower():
+                        return outcome
 
-            # Fallback to first outcome
-            logger.warning(
-                f"Could not match LLM response '{response_clean}' to outcomes {list(outcomes)}. "
-                f"Falling back to first outcome: {outcomes[0]}"
-            )
-            return outcomes[0]
+                # Fallback to first outcome
+                logger.warning(
+                    f"Could not match LLM response '{response_clean}' to outcomes {list(outcomes)}. "
+                    f"Falling back to first outcome: {outcomes[0]}"
+                )
+                return outcomes[0]
+
+            except Exception as fallback_err:
+                logger.warning(
+                    f"Simple prompting also failed: {fallback_err}. "
+                    f"Falling back to first outcome: {outcomes[0]}"
+                )
+                return outcomes[0]
 
     def _log_flow_event(
         self,
diff --git a/lib/crewai/src/crewai/flow/human_feedback.py b/lib/crewai/src/crewai/flow/human_feedback.py
index fa4e20ced..7389b8a9e 100644
--- a/lib/crewai/src/crewai/flow/human_feedback.py
+++ b/lib/crewai/src/crewai/flow/human_feedback.py
@@ -76,6 +76,24 @@ if TYPE_CHECKING:
 F = TypeVar("F", bound=Callable[..., Any])
 
 
+def _serialize_llm_for_context(llm: Any) -> str | None:
+    """Serialize a BaseLLM object to a model string with provider prefix.
+
+    When persisting the LLM for HITL resume, we need to store enough info
+    to reconstruct a working LLM on the resume worker. Just storing the bare
+    model name (e.g. "gemini-3-flash-preview") causes provider inference to
+    fail — it defaults to OpenAI. Including the provider prefix (e.g.
+    "gemini/gemini-3-flash-preview") allows LLM() to correctly route.
+    """
+    model = getattr(llm, "model", None)
+    if not model:
+        return None
+    provider = getattr(llm, "provider", None)
+    if provider and "/" not in model:
+        return f"{provider}/{model}"
+    return model
+
+
 @dataclass
 class HumanFeedbackResult:
     """Result from a @human_feedback decorated method.
@@ -412,7 +430,7 @@ def human_feedback(
                 emit=list(emit) if emit else None,
                 default_outcome=default_outcome,
                 metadata=metadata or {},
-                llm=llm if isinstance(llm, str) else getattr(llm, "model", None),
+                llm=llm if isinstance(llm, str) else _serialize_llm_for_context(llm),
             )
 
             # Determine effective provider:
diff --git a/lib/crewai/src/crewai/llms/constants.py b/lib/crewai/src/crewai/llms/constants.py
index 9552efada..595a0a30d 100644
--- a/lib/crewai/src/crewai/llms/constants.py
+++ b/lib/crewai/src/crewai/llms/constants.py
@@ -240,6 +240,7 @@ ANTHROPIC_MODELS: list[AnthropicModels] = [
 
 GeminiModels: TypeAlias = Literal[
     "gemini-3-pro-preview",
+    "gemini-3-flash-preview",
     "gemini-2.5-pro",
     "gemini-2.5-pro-preview-03-25",
     "gemini-2.5-pro-preview-05-06",
@@ -294,6 +295,7 @@ GeminiModels: TypeAlias = Literal[
 ]
 GEMINI_MODELS: list[GeminiModels] = [
     "gemini-3-pro-preview",
+    "gemini-3-flash-preview",
     "gemini-2.5-pro",
     "gemini-2.5-pro-preview-03-25",
     "gemini-2.5-pro-preview-05-06",
diff --git a/lib/crewai/tests/test_async_human_feedback.py b/lib/crewai/tests/test_async_human_feedback.py
index 035f29dcc..f4977858b 100644
--- a/lib/crewai/tests/test_async_human_feedback.py
+++ b/lib/crewai/tests/test_async_human_feedback.py
@@ -989,8 +989,10 @@ class TestLLMObjectPreservedInContext:
             persistence = SQLiteFlowPersistence(db_path)
 
             # Create a mock BaseLLM object (not a string)
+            # Simulates LLM(model="gemini-2.0-flash", provider="gemini")
             mock_llm_obj = MagicMock()
-            mock_llm_obj.model = "gemini/gemini-2.0-flash"
+            mock_llm_obj.model = "gemini-2.0-flash"
+            mock_llm_obj.provider = "gemini"
 
             class PausingProvider:
                 def __init__(self, persistence: SQLiteFlowPersistence):
@@ -1086,11 +1088,36 @@ class TestLLMObjectPreservedInContext:
 
     def test_none_llm_when_no_model_attr(self) -> None:
         """Test that llm is None when object has no model attribute."""
-        mock_obj = MagicMock(spec=[])  # No attributes
+        from crewai.flow.human_feedback import _serialize_llm_for_context
 
-        # Simulate what the decorator does
-        llm_value = mock_obj if isinstance(mock_obj, str) else getattr(mock_obj, "model", None)
-        assert llm_value is None
+        mock_obj = MagicMock(spec=[])  # No attributes
+        assert _serialize_llm_for_context(mock_obj) is None
+
+    def test_provider_prefix_added_to_bare_model(self) -> None:
+        """Test that provider prefix is added when model has no slash."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock()
+        mock_obj.model = "gemini-3-flash-preview"
+        mock_obj.provider = "gemini"
+        assert _serialize_llm_for_context(mock_obj) == "gemini/gemini-3-flash-preview"
+
+    def test_provider_prefix_not_doubled_when_already_present(self) -> None:
+        """Test that provider prefix is not added when model already has a slash."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock()
+        mock_obj.model = "gemini/gemini-2.0-flash"
+        mock_obj.provider = "gemini"
+        assert _serialize_llm_for_context(mock_obj) == "gemini/gemini-2.0-flash"
+
+    def test_no_provider_attr_falls_back_to_bare_model(self) -> None:
+        """Test that bare model is used when no provider attribute exists."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock(spec=[])
+        mock_obj.model = "gpt-4o-mini"
+        assert _serialize_llm_for_context(mock_obj) == "gpt-4o-mini"
 
 
 class TestAsyncHumanFeedbackEdgeCases:
diff --git a/lib/crewai/tests/test_human_feedback_decorator.py b/lib/crewai/tests/test_human_feedback_decorator.py
index cd6919420..23b3d723b 100644
--- a/lib/crewai/tests/test_human_feedback_decorator.py
+++ b/lib/crewai/tests/test_human_feedback_decorator.py
@@ -400,6 +400,45 @@ class TestCollapseToOutcome:
 
         assert result == "approved"  # First in list
 
+    def test_both_llm_calls_fail_returns_first_outcome(self):
+        """When both structured and simple prompting fail, return outcomes[0]."""
+        flow = Flow()
+
+        with patch("crewai.llm.LLM") as MockLLM:
+            mock_llm = MagicMock()
+            # Both calls raise — simulates wrong provider / auth failure
+            mock_llm.call.side_effect = RuntimeError("Model not found")
+            MockLLM.return_value = mock_llm
+
+            result = flow._collapse_to_outcome(
+                feedback="looks great, approve it",
+                outcomes=["needs_changes", "approved"],
+                llm="gemini-3-flash-preview",
+            )
+
+        assert result == "needs_changes"  # First in list (safe fallback)
+
+    def test_structured_fails_but_simple_succeeds(self):
+        """When structured output fails but simple prompting works, use that."""
+        flow = Flow()
+
+        with patch("crewai.llm.LLM") as MockLLM:
+            mock_llm = MagicMock()
+            # First call (structured) fails, second call (simple) succeeds
+            mock_llm.call.side_effect = [
+                RuntimeError("Function calling not supported"),
+                "approved",
+            ]
+            MockLLM.return_value = mock_llm
+
+            result = flow._collapse_to_outcome(
+                feedback="looks great",
+                outcomes=["needs_changes", "approved"],
+                llm="gpt-4o-mini",
+            )
+
+        assert result == "approved"
+
 
 # -- HITL Learning tests --