fix: enhance LLM response handling and serialization (#4909)

* fix: enhance LLM response handling and serialization

* Updated the Flow class to improve error handling when both structured and simple prompting fail, ensuring the first outcome is returned as a fallback.
* Introduced a new function, _serialize_llm_for_context, to properly serialize LLM objects with provider prefixes for better context management.
* Added tests to validate the new serialization logic and ensure correct behavior when LLM calls fail.

This update enhances the robustness of LLM interactions and improves the overall flow of handling outcomes.
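
Condensed, the new degradation path looks roughly like the sketch below (simplified from the Flow._collapse_to_outcome hunk further down; `_structured_outcome` here is a placeholder stub, not a real crewai function):

```python
import logging

logger = logging.getLogger(__name__)


def _structured_outcome(llm_instance, prompt, outcomes):
    """Placeholder for the structured-output path (unchanged by this PR)."""
    raise NotImplementedError("stub for the structured-output attempt")


def collapse_to_outcome(llm_instance, prompt, outcomes):
    """Sketch of the fallback chain: structured output -> plain prompt -> outcomes[0]."""
    try:
        return _structured_outcome(llm_instance, prompt, outcomes)
    except Exception as e:
        logger.warning(f"Structured output failed, falling back to simple prompting: {e}")
    try:
        # Plain prompting is now also wrapped in try/except by this PR.
        response = llm_instance.call(messages=[{"role": "user", "content": prompt}])
        cleaned = str(response).strip().lower()
        for outcome in outcomes:  # exact match first
            if outcome.lower() == cleaned:
                return outcome
        for outcome in outcomes:  # then partial match
            if outcome.lower() in cleaned:
                return outcome
    except Exception as fallback_err:
        logger.warning(f"Simple prompting also failed: {fallback_err}")
    return outcomes[0]  # safe fallback when nothing above produced a match
```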

* fix: patch VCR response handling to prevent httpx.ResponseNotRead errors (#4917)

* fix: enhance LLM response handling and serialization

* Updated the Flow class to improve error handling when both structured and simple prompting fail, ensuring the first outcome is returned as a fallback.
* Introduced a new function, _serialize_llm_for_context, to properly serialize LLM objects with provider prefixes for better context management.
* Added tests to validate the new serialization logic and ensure correct behavior when LLM calls fail.

This update enhances the robustness of LLM interactions and improves the overall flow of handling outcomes.

* fix: patch VCR response handling to prevent httpx.ResponseNotRead errors

VCR's _from_serialized_response mocks httpx.Response.read(), which
prevents the response's internal _content attribute from being properly
initialized. When OpenAI's client (using with_raw_response) accesses
response.content, httpx raises ResponseNotRead.

This patch explicitly sets response._content after the response is
created, ensuring that tests using VCR cassettes work correctly with
the OpenAI client's raw response handling.
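
The conftest-level patch itself is not among the hunks shown further down; as a rough illustration, it amounts to wrapping vcrpy's httpx stub and filling the private buffer, along these lines (module path, signature, and cassette keys are assumptions, not copied from the actual change):

```python
# Hypothetical sketch only; the real patch may differ in names and details.
import vcr.stubs.httpx_stubs as httpx_stubs

_original_from_serialized_response = httpx_stubs._from_serialized_response


def _patched_from_serialized_response(request, serialized_response, *args, **kwargs):
    response = _original_from_serialized_response(request, serialized_response, *args, **kwargs)
    if getattr(response, "_content", None) is None:
        # Fill the buffer that Response.read() would normally populate so that
        # accessing response.content no longer raises httpx.ResponseNotRead.
        body = serialized_response.get("body", {}).get("string", b"") or b""  # assumed cassette layout
        response._content = body if isinstance(body, (bytes, bytearray)) else str(body).encode()
    return response


httpx_stubs._from_serialized_response = _patched_from_serialized_response
```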

Fixes tests:
- test_hierarchical_crew_creation_tasks_with_sync_last
- test_conditional_task_last_task_when_conditional_is_false
- test_crew_log_file_output

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Joao Moura <joaomdmoura@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: alex-clawd <alex@crewai.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
João Moura authored on 2026-03-17 01:19:31 -07:00 · committed by GitHub
parent b95486c187 · commit 6235810844
6 changed files with 147 additions and 22 deletions

View File

@@ -3086,25 +3086,35 @@ class Flow(Generic[T], metaclass=FlowMeta):
             logger.warning(
                 f"Structured output failed, falling back to simple prompting: {e}"
             )
-            response = llm_instance.call(messages=prompt)
-            response_clean = str(response).strip()
-
-            # Exact match (case-insensitive)
-            for outcome in outcomes:
-                if outcome.lower() == response_clean.lower():
-                    return outcome
-
-            # Partial match
-            for outcome in outcomes:
-                if outcome.lower() in response_clean.lower():
-                    return outcome
-
-            # Fallback to first outcome
-            logger.warning(
-                f"Could not match LLM response '{response_clean}' to outcomes {list(outcomes)}. "
-                f"Falling back to first outcome: {outcomes[0]}"
-            )
-            return outcomes[0]
+            try:
+                response = llm_instance.call(
+                    messages=[{"role": "user", "content": prompt}],
+                )
+                response_clean = str(response).strip()
+
+                # Exact match (case-insensitive)
+                for outcome in outcomes:
+                    if outcome.lower() == response_clean.lower():
+                        return outcome
+
+                # Partial match
+                for outcome in outcomes:
+                    if outcome.lower() in response_clean.lower():
+                        return outcome
+
+                # Fallback to first outcome
+                logger.warning(
+                    f"Could not match LLM response '{response_clean}' to outcomes {list(outcomes)}. "
+                    f"Falling back to first outcome: {outcomes[0]}"
+                )
+                return outcomes[0]
+            except Exception as fallback_err:
+                logger.warning(
+                    f"Simple prompting also failed: {fallback_err}. "
+                    f"Falling back to first outcome: {outcomes[0]}"
+                )
+                return outcomes[0]

     def _log_flow_event(
         self,

View File

@@ -76,6 +76,24 @@ if TYPE_CHECKING:

 F = TypeVar("F", bound=Callable[..., Any])

+
+def _serialize_llm_for_context(llm: Any) -> str | None:
+    """Serialize a BaseLLM object to a model string with provider prefix.
+
+    When persisting the LLM for HITL resume, we need to store enough info
+    to reconstruct a working LLM on the resume worker. Just storing the bare
+    model name (e.g. "gemini-3-flash-preview") causes provider inference to
+    fail — it defaults to OpenAI. Including the provider prefix (e.g.
+    "gemini/gemini-3-flash-preview") allows LLM() to correctly route.
+    """
+    model = getattr(llm, "model", None)
+    if not model:
+        return None
+
+    provider = getattr(llm, "provider", None)
+    if provider and "/" not in model:
+        return f"{provider}/{model}"
+
+    return model
+
+
 @dataclass
 class HumanFeedbackResult:
     """Result from a @human_feedback decorated method.

@@ -412,7 +430,7 @@ def human_feedback(
             emit=list(emit) if emit else None,
             default_outcome=default_outcome,
             metadata=metadata or {},
-            llm=llm if isinstance(llm, str) else getattr(llm, "model", None),
+            llm=llm if isinstance(llm, str) else _serialize_llm_for_context(llm),
         )

         # Determine effective provider:

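For orientation, the round trip this serializer enables looks roughly like the following illustrative sketch (the `LLM(model=..., provider=...)` form is taken from the test comment further down; the resume side is simplified):

```python
from crewai import LLM
from crewai.flow.human_feedback import _serialize_llm_for_context

# At pause time: a configured LLM object is reduced to a routable string.
llm = LLM(model="gemini-3-flash-preview", provider="gemini")
stored = _serialize_llm_for_context(llm)  # "gemini/gemini-3-flash-preview"

# At resume time: the prefixed string reconstructs an LLM with the right provider,
# instead of the OpenAI default that a bare model name would trigger.
resumed = LLM(model=stored)
```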
View File

@@ -240,6 +240,7 @@ ANTHROPIC_MODELS: list[AnthropicModels] = [

 GeminiModels: TypeAlias = Literal[
     "gemini-3-pro-preview",
+    "gemini-3-flash-preview",
     "gemini-2.5-pro",
     "gemini-2.5-pro-preview-03-25",
     "gemini-2.5-pro-preview-05-06",
@@ -294,6 +295,7 @@ GeminiModels: TypeAlias = Literal[
 ]

 GEMINI_MODELS: list[GeminiModels] = [
     "gemini-3-pro-preview",
+    "gemini-3-flash-preview",
     "gemini-2.5-pro",
     "gemini-2.5-pro-preview-03-25",
     "gemini-2.5-pro-preview-05-06",

View File

@@ -989,8 +989,10 @@ class TestLLMObjectPreservedInContext:
         persistence = SQLiteFlowPersistence(db_path)

         # Create a mock BaseLLM object (not a string)
+        # Simulates LLM(model="gemini-2.0-flash", provider="gemini")
         mock_llm_obj = MagicMock()
-        mock_llm_obj.model = "gemini/gemini-2.0-flash"
+        mock_llm_obj.model = "gemini-2.0-flash"
+        mock_llm_obj.provider = "gemini"

         class PausingProvider:
             def __init__(self, persistence: SQLiteFlowPersistence):
@@ -1086,11 +1088,36 @@ class TestLLMObjectPreservedInContext:
     def test_none_llm_when_no_model_attr(self) -> None:
         """Test that llm is None when object has no model attribute."""
-        mock_obj = MagicMock(spec=[])  # No attributes
-
-        # Simulate what the decorator does
-        llm_value = mock_obj if isinstance(mock_obj, str) else getattr(mock_obj, "model", None)
-
-        assert llm_value is None
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock(spec=[])  # No attributes
+        assert _serialize_llm_for_context(mock_obj) is None
+
+    def test_provider_prefix_added_to_bare_model(self) -> None:
+        """Test that provider prefix is added when model has no slash."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock()
+        mock_obj.model = "gemini-3-flash-preview"
+        mock_obj.provider = "gemini"
+        assert _serialize_llm_for_context(mock_obj) == "gemini/gemini-3-flash-preview"
+
+    def test_provider_prefix_not_doubled_when_already_present(self) -> None:
+        """Test that provider prefix is not added when model already has a slash."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock()
+        mock_obj.model = "gemini/gemini-2.0-flash"
+        mock_obj.provider = "gemini"
+        assert _serialize_llm_for_context(mock_obj) == "gemini/gemini-2.0-flash"
+
+    def test_no_provider_attr_falls_back_to_bare_model(self) -> None:
+        """Test that bare model is used when no provider attribute exists."""
+        from crewai.flow.human_feedback import _serialize_llm_for_context
+
+        mock_obj = MagicMock(spec=[])
+        mock_obj.model = "gpt-4o-mini"
+        assert _serialize_llm_for_context(mock_obj) == "gpt-4o-mini"


 class TestAsyncHumanFeedbackEdgeCases:

View File

@@ -400,6 +400,45 @@ class TestCollapseToOutcome:

         assert result == "approved"  # First in list

+    def test_both_llm_calls_fail_returns_first_outcome(self):
+        """When both structured and simple prompting fail, return outcomes[0]."""
+        flow = Flow()
+
+        with patch("crewai.llm.LLM") as MockLLM:
+            mock_llm = MagicMock()
+            # Both calls raise — simulates wrong provider / auth failure
+            mock_llm.call.side_effect = RuntimeError("Model not found")
+            MockLLM.return_value = mock_llm
+
+            result = flow._collapse_to_outcome(
+                feedback="looks great, approve it",
+                outcomes=["needs_changes", "approved"],
+                llm="gemini-3-flash-preview",
+            )
+
+        assert result == "needs_changes"  # First in list (safe fallback)
+
+    def test_structured_fails_but_simple_succeeds(self):
+        """When structured output fails but simple prompting works, use that."""
+        flow = Flow()
+
+        with patch("crewai.llm.LLM") as MockLLM:
+            mock_llm = MagicMock()
+            # First call (structured) fails, second call (simple) succeeds
+            mock_llm.call.side_effect = [
+                RuntimeError("Function calling not supported"),
+                "approved",
+            ]
+            MockLLM.return_value = mock_llm
+
+            result = flow._collapse_to_outcome(
+                feedback="looks great",
+                outcomes=["needs_changes", "approved"],
+                llm="gpt-4o-mini",
+            )
+
+        assert result == "approved"
+

 # -- HITL Learning tests --