fix: enhance LLM response handling and serialization

* Updated the Flow class to improve error handling when both structured and simple prompting fail, ensuring the first outcome is returned as a fallback.
* Introduced a new function, _serialize_llm_for_context, to properly serialize LLM objects with provider prefixes for better context management.
* Added tests to validate the new serialization logic and ensure correct behavior when LLM calls fail.

This update enhances the robustness of LLM interactions and improves the overall flow of handling outcomes.
This commit is contained in:
Joao Moura
2026-03-16 13:37:20 -07:00
parent 9acb327d9f
commit 3a7e499a31
5 changed files with 118 additions and 22 deletions

View File

@@ -3086,25 +3086,35 @@ class Flow(Generic[T], metaclass=FlowMeta):
logger.warning(
f"Structured output failed, falling back to simple prompting: {e}"
)
response = llm_instance.call(messages=prompt)
response_clean = str(response).strip()
try:
response = llm_instance.call(
messages=[{"role": "user", "content": prompt}],
)
response_clean = str(response).strip()
# Exact match (case-insensitive)
for outcome in outcomes:
if outcome.lower() == response_clean.lower():
return outcome
# Exact match (case-insensitive)
for outcome in outcomes:
if outcome.lower() == response_clean.lower():
return outcome
# Partial match
for outcome in outcomes:
if outcome.lower() in response_clean.lower():
return outcome
# Partial match
for outcome in outcomes:
if outcome.lower() in response_clean.lower():
return outcome
# Fallback to first outcome
logger.warning(
f"Could not match LLM response '{response_clean}' to outcomes {list(outcomes)}. "
f"Falling back to first outcome: {outcomes[0]}"
)
return outcomes[0]
# Fallback to first outcome
logger.warning(
f"Could not match LLM response '{response_clean}' to outcomes {list(outcomes)}. "
f"Falling back to first outcome: {outcomes[0]}"
)
return outcomes[0]
except Exception as fallback_err:
logger.warning(
f"Simple prompting also failed: {fallback_err}. "
f"Falling back to first outcome: {outcomes[0]}"
)
return outcomes[0]
def _log_flow_event(
self,

View File

@@ -76,6 +76,24 @@ if TYPE_CHECKING:
F = TypeVar("F", bound=Callable[..., Any])
def _serialize_llm_for_context(llm: Any) -> str | None:
"""Serialize a BaseLLM object to a model string with provider prefix.
When persisting the LLM for HITL resume, we need to store enough info
to reconstruct a working LLM on the resume worker. Just storing the bare
model name (e.g. "gemini-3-flash-preview") causes provider inference to
fail — it defaults to OpenAI. Including the provider prefix (e.g.
"gemini/gemini-3-flash-preview") allows LLM() to correctly route.
"""
model = getattr(llm, "model", None)
if not model:
return None
provider = getattr(llm, "provider", None)
if provider and "/" not in model:
return f"{provider}/{model}"
return model
@dataclass
class HumanFeedbackResult:
"""Result from a @human_feedback decorated method.
@@ -412,7 +430,7 @@ def human_feedback(
emit=list(emit) if emit else None,
default_outcome=default_outcome,
metadata=metadata or {},
llm=llm if isinstance(llm, str) else getattr(llm, "model", None),
llm=llm if isinstance(llm, str) else _serialize_llm_for_context(llm),
)
# Determine effective provider:

View File

@@ -240,6 +240,7 @@ ANTHROPIC_MODELS: list[AnthropicModels] = [
GeminiModels: TypeAlias = Literal[
"gemini-3-pro-preview",
"gemini-3-flash-preview",
"gemini-2.5-pro",
"gemini-2.5-pro-preview-03-25",
"gemini-2.5-pro-preview-05-06",
@@ -294,6 +295,7 @@ GeminiModels: TypeAlias = Literal[
]
GEMINI_MODELS: list[GeminiModels] = [
"gemini-3-pro-preview",
"gemini-3-flash-preview",
"gemini-2.5-pro",
"gemini-2.5-pro-preview-03-25",
"gemini-2.5-pro-preview-05-06",

View File

@@ -989,8 +989,10 @@ class TestLLMObjectPreservedInContext:
persistence = SQLiteFlowPersistence(db_path)
# Create a mock BaseLLM object (not a string)
# Simulates LLM(model="gemini-2.0-flash", provider="gemini")
mock_llm_obj = MagicMock()
mock_llm_obj.model = "gemini/gemini-2.0-flash"
mock_llm_obj.model = "gemini-2.0-flash"
mock_llm_obj.provider = "gemini"
class PausingProvider:
def __init__(self, persistence: SQLiteFlowPersistence):
@@ -1086,11 +1088,36 @@ class TestLLMObjectPreservedInContext:
def test_none_llm_when_no_model_attr(self) -> None:
"""Test that llm is None when object has no model attribute."""
mock_obj = MagicMock(spec=[]) # No attributes
from crewai.flow.human_feedback import _serialize_llm_for_context
# Simulate what the decorator does
llm_value = mock_obj if isinstance(mock_obj, str) else getattr(mock_obj, "model", None)
assert llm_value is None
mock_obj = MagicMock(spec=[]) # No attributes
assert _serialize_llm_for_context(mock_obj) is None
def test_provider_prefix_added_to_bare_model(self) -> None:
"""Test that provider prefix is added when model has no slash."""
from crewai.flow.human_feedback import _serialize_llm_for_context
mock_obj = MagicMock()
mock_obj.model = "gemini-3-flash-preview"
mock_obj.provider = "gemini"
assert _serialize_llm_for_context(mock_obj) == "gemini/gemini-3-flash-preview"
def test_provider_prefix_not_doubled_when_already_present(self) -> None:
"""Test that provider prefix is not added when model already has a slash."""
from crewai.flow.human_feedback import _serialize_llm_for_context
mock_obj = MagicMock()
mock_obj.model = "gemini/gemini-2.0-flash"
mock_obj.provider = "gemini"
assert _serialize_llm_for_context(mock_obj) == "gemini/gemini-2.0-flash"
def test_no_provider_attr_falls_back_to_bare_model(self) -> None:
"""Test that bare model is used when no provider attribute exists."""
from crewai.flow.human_feedback import _serialize_llm_for_context
mock_obj = MagicMock(spec=[])
mock_obj.model = "gpt-4o-mini"
assert _serialize_llm_for_context(mock_obj) == "gpt-4o-mini"
class TestAsyncHumanFeedbackEdgeCases:

View File

@@ -400,6 +400,45 @@ class TestCollapseToOutcome:
assert result == "approved" # First in list
def test_both_llm_calls_fail_returns_first_outcome(self):
"""When both structured and simple prompting fail, return outcomes[0]."""
flow = Flow()
with patch("crewai.llm.LLM") as MockLLM:
mock_llm = MagicMock()
# Both calls raise — simulates wrong provider / auth failure
mock_llm.call.side_effect = RuntimeError("Model not found")
MockLLM.return_value = mock_llm
result = flow._collapse_to_outcome(
feedback="looks great, approve it",
outcomes=["needs_changes", "approved"],
llm="gemini-3-flash-preview",
)
assert result == "needs_changes" # First in list (safe fallback)
def test_structured_fails_but_simple_succeeds(self):
"""When structured output fails but simple prompting works, use that."""
flow = Flow()
with patch("crewai.llm.LLM") as MockLLM:
mock_llm = MagicMock()
# First call (structured) fails, second call (simple) succeeds
mock_llm.call.side_effect = [
RuntimeError("Function calling not supported"),
"approved",
]
MockLLM.return_value = mock_llm
result = flow._collapse_to_outcome(
feedback="looks great",
outcomes=["needs_changes", "approved"],
llm="gpt-4o-mini",
)
assert result == "approved"
# -- HITL Learning tests --