fix(hitl): log pre-review failures and add learn_strict mode

Closes #5725.

In _pre_review_with_lessons (lib/crewai/src/crewai/flow/human_feedback.py),
a broad "except Exception: return method_output" fallback silently
swallowed any LLM, network, auth, or structured-output failure during
the pre-review step. Callers could not distinguish pre-reviewed output
from raw output, and no log entry or event was surfaced.
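For illustration only (a standalone sketch, not CrewAI code), this is the
shape of the problem: a bare except that returns the input makes a failed
pre-review indistinguishable from a successful one at the call site.

```python
def pre_review(output: str) -> str:
    try:
        # Stand-in for the recall + LLM pre-review step, which can fail for
        # LLM, network, auth, or structured-output reasons.
        raise RuntimeError("LLM call failed")
    except Exception:
        return output  # same value as the success path; no log, no event


print(pre_review("raw draft"))  # prints "raw draft"; the failure leaves no trace
```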

Changes:
- Add a module logger.
- Narrow the try/except in _pre_review_with_lessons so that the
  "mem is None" and "not matches" short-circuits stay outside the
  failure path (neither is an error case).
- On recall or LLM pre-review failure, log a WARNING with exc_info=True
  so the silent fallback is observable. Continue to fall back to the raw
  method_output so the flow does not break.
- Add an opt-in learn_strict parameter (default False) on the
  human_feedback decorator and HumanFeedbackConfig. When True,
  pre-review failures are re-raised instead of falling back, for
  callers that need fail-closed behavior (usage sketch below).
- Update the Graceful degradation docs section to reflect the new
  observable-by-default behavior and document learn_strict.
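Usage sketch for the new flag (the flow and method names here are
hypothetical; import paths are assumed from the package layout shown in
this commit):

```python
from crewai.flow.flow import Flow, start
from crewai.flow.human_feedback import human_feedback


class ReviewFlow(Flow):
    @start()
    @human_feedback(
        message="Review the draft:",
        llm="gpt-4o-mini",
        learn=True,         # enable HITL learning (pre-review + distillation)
        learn_strict=True,  # fail closed: re-raise pre-review failures
    )
    def draft(self):
        return "raw draft"
```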

Tests (lib/crewai/tests/test_human_feedback_decorator.py):
- New TestHumanFeedbackPreReviewFailure class with 7 tests covering
  WARNING logging on LLM and recall failures, learn_strict propagation
  in both sync and async paths, and config introspection of the new
  learn_strict flag.

Co-Authored-By: João <joao@crewai.com>
Devin AI
2026-05-06 04:25:15 +00:00
parent ec8a522c2c
commit d633aaa5fa
3 changed files with 336 additions and 11 deletions


@@ -682,6 +682,7 @@ class ArticleReviewFlow(Flow):
| Parameter | Default | Description |
|-----------|---------|-------------|
| `learn` | `False` | Enable HITL learning |
| `learn_strict` | `False` | If `True`, pre-review failures are re-raised instead of falling back to raw output |
| `learn_limit` | `5` | Max past lessons to recall for pre-review |
### Key Design Decisions
@@ -689,7 +690,8 @@ class ArticleReviewFlow(Flow):
- **Same LLM for everything**: The `llm` parameter on the decorator is shared by outcome collapsing, lesson distillation, and pre-review. No need to configure multiple models.
- **Structured output**: Both distillation and pre-review use function calling with Pydantic models when the LLM supports it, falling back to text parsing otherwise.
- **Non-blocking storage**: Lessons are stored via `remember_many()` which runs in a background thread -- the flow continues immediately.
- **Graceful degradation**: If the LLM fails during distillation, nothing is stored. If it fails during pre-review, the raw output is shown. Neither failure blocks the flow.
- **Observable graceful degradation**: If the LLM fails during distillation, nothing is stored. If it fails during pre-review, the raw output is shown to the human and the failure is logged at `WARNING` level with the full traceback (`exc_info=True`) under the `crewai.flow.human_feedback` logger -- so the silent fallback is detectable. Neither failure blocks the flow.
- **Strict mode**: Pass `learn_strict=True` to make pre-review fail closed -- failures (LLM error, network/auth error, structured-output parse error, memory `recall` error) propagate out of the flow method instead of being swallowed. Use this when downstream code must be able to assume that pre-review actually executed.
- **No scope/categories needed**: When storing lessons, only `source` is passed. The encoding pipeline infers scope, categories, and importance automatically.
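The pre-review fallback described under **Observable graceful degradation** above can be surfaced with plain standard-library logging (a minimal sketch; handler and format choices are up to the application):

```python
import logging

# A WARNING-level handler is enough to capture the pre-review fallback,
# including the traceback attached via exc_info=True.
logging.basicConfig(level=logging.WARNING)
logging.getLogger("crewai.flow.human_feedback").setLevel(logging.WARNING)
```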
<Note>

lib/crewai/src/crewai/flow/human_feedback.py

@@ -60,6 +60,7 @@ from collections.abc import Callable, Sequence
from dataclasses import dataclass, field
from datetime import datetime
from functools import wraps
import logging
from typing import TYPE_CHECKING, Any, TypeVar
from pydantic import BaseModel, Field
@@ -75,6 +76,8 @@ if TYPE_CHECKING:
F = TypeVar("F", bound=Callable[..., Any])
logger = logging.getLogger(__name__)
def _serialize_llm_for_context(llm: Any) -> dict[str, Any] | str | None:
"""Serialize a BaseLLM object to a dict preserving full config.
@@ -188,6 +191,7 @@ class HumanFeedbackConfig:
provider: HumanFeedbackProvider | None = None
learn: bool = False
learn_source: str = "hitl"
learn_strict: bool = False
class HumanFeedbackMethod(FlowMethod[Any, Any]):
@@ -237,6 +241,7 @@ def human_feedback(
provider: HumanFeedbackProvider | None = None,
learn: bool = False,
learn_source: str = "hitl",
learn_strict: bool = False,
) -> Callable[[F], F]:
"""Decorator for Flow methods that require human feedback.
@@ -275,6 +280,20 @@ def human_feedback(
external systems like Slack, Teams, or webhooks. When the
provider raises HumanFeedbackPending, the flow pauses and
can be resumed later with Flow.resume().
learn: When True, enables HITL learning. After feedback is
collected, the LLM distills generalizable lessons and stores
them in memory. Before the next review, past lessons are
recalled and applied via an LLM pre-review step so the human
sees a progressively improved output.
learn_source: The memory ``source`` tag used when storing and
recalling HITL lessons. Defaults to ``"hitl"``.
learn_strict: When True (default False), pre-review failures are
re-raised instead of falling back to the raw output. By
default, failures are logged at WARNING level with full
traceback (``exc_info=True``) and the raw method output is
shown to the human. Set this to True if downstream callers
must be able to assume that pre-review actually executed
successfully.
Returns:
A decorator function that wraps the method with human feedback
@@ -373,16 +392,40 @@ def human_feedback(
def _pre_review_with_lessons(
flow_instance: Flow[Any], method_output: Any
) -> Any:
"""Recall past HITL lessons and use LLM to pre-review the output."""
try:
mem = flow_instance.memory
if mem is None:
return method_output
query = f"human feedback lessons for {func.__name__}: {method_output!s}"
matches = mem.recall(query, source=learn_source)
if not matches:
return method_output
"""Recall past HITL lessons and use LLM to pre-review the output.
Returns the original ``method_output`` when memory is unavailable
or no lessons match — these are not error cases.
When the recall or LLM pre-review call raises an exception (for
example LLM/network/auth failure or structured-output parse
error), the failure is logged at WARNING level with full
traceback (``exc_info=True``) so callers can detect the silent
fallback. When ``learn_strict=True`` was passed to the decorator,
the exception is re-raised instead of being swallowed.
"""
mem = flow_instance.memory
if mem is None:
return method_output
query = f"human feedback lessons for {func.__name__}: {method_output!s}"
try:
matches = mem.recall(query, source=learn_source)
except Exception:
logger.warning(
"HITL pre-review: memory recall failed for %s; falling "
"back to raw output.",
func.__name__,
exc_info=True,
)
if learn_strict:
raise
return method_output
if not matches:
return method_output
try:
lessons = "\n".join(f"- {m.record.content}" for m in matches)
llm_inst = _resolve_llm_instance()
prompt = _get_hitl_prompt("hitl_pre_review_user").format(
@@ -404,7 +447,14 @@ def human_feedback(
reviewed = llm_inst.call(messages)
return reviewed if isinstance(reviewed, str) else str(reviewed)
except Exception:
return method_output # fallback to raw output on any failure
logger.warning(
"HITL pre-review failed for %s; falling back to raw output.",
func.__name__,
exc_info=True,
)
if learn_strict:
raise
return method_output
def _distill_and_store_lessons(
flow_instance: Flow[Any], method_output: Any, raw_feedback: str
@@ -654,6 +704,7 @@ def human_feedback(
provider=provider,
learn=learn,
learn_source=learn_source,
learn_strict=learn_strict,
)
wrapper.__is_flow_method__ = True

lib/crewai/tests/test_human_feedback_decorator.py

@@ -7,6 +7,7 @@ async support, and attribute preservation functionality.
from __future__ import annotations
import asyncio
import logging
from datetime import datetime
from typing import Any
from unittest.mock import MagicMock, patch
@@ -597,6 +598,277 @@ class TestHumanFeedbackLearn:
assert config.llm == "gpt-4o-mini"
class TestHumanFeedbackPreReviewFailure:
"""Tests for HITL pre-review failure handling (issue #5725).
Pre-review must NOT silently swallow exceptions: failures should be
logged at WARNING level with full traceback, and an opt-in
``learn_strict=True`` should propagate the exception instead of
falling back to raw output.
"""
@staticmethod
def _seeded_memory() -> MagicMock:
"""Return a mock memory object with one stored lesson so that
pre-review proceeds past the ``not matches`` short-circuit."""
from crewai.memory.types import MemoryMatch, MemoryRecord
mem = MagicMock()
mem.recall.return_value = [
MemoryMatch(
record=MemoryRecord(
content="Always include source citations",
embedding=[],
),
score=0.9,
match_reasons=["semantic"],
)
]
return mem
def test_pre_review_llm_failure_logs_warning_and_falls_back(self, caplog):
"""When the pre-review LLM call raises, the failure is logged at
WARNING with exc_info, and the raw method_output is shown to the
human (default fail-open behavior, but now observable)."""
class LearnFlow(Flow):
@start()
@human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
def produce(self):
return "raw draft"
flow = LearnFlow()
flow.memory = self._seeded_memory()
captured: dict[str, Any] = {}
def capture_feedback(message, output, metadata=None, emit=None):
captured["shown_to_human"] = output
return "approved"
with (
patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
patch("crewai.llm.LLM") as MockLLM,
caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
):
mock_llm = MagicMock()
mock_llm.supports_function_calling.return_value = True
mock_llm.call.side_effect = RuntimeError(
"simulated pre-review failure"
)
MockLLM.return_value = mock_llm
flow.produce()
# The human still sees the raw output (fail-open by default).
assert captured["shown_to_human"] == "raw draft"
# ...but the failure is now observable via a WARNING log with traceback.
warning_records = [
r
for r in caplog.records
if r.name == "crewai.flow.human_feedback"
and r.levelno == logging.WARNING
and "HITL pre-review failed" in r.getMessage()
]
assert len(warning_records) == 1, (
"Expected exactly one HITL pre-review failure warning, got: "
f"{[r.getMessage() for r in caplog.records]}"
)
# exc_info=True must be set so the traceback is captured.
assert warning_records[0].exc_info is not None
# The exception type/message should be visible in the captured exc_info.
exc_type, exc_value, _ = warning_records[0].exc_info
assert exc_type is RuntimeError
assert "simulated pre-review failure" in str(exc_value)
def test_pre_review_recall_failure_logs_warning_and_falls_back(self, caplog):
"""A failure in ``memory.recall`` is also logged and falls back to
the raw output instead of being silently swallowed."""
class LearnFlow(Flow):
@start()
@human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
def produce(self):
return "raw draft"
flow = LearnFlow()
flow.memory = MagicMock()
flow.memory.recall.side_effect = RuntimeError("recall blew up")
captured: dict[str, Any] = {}
def capture_feedback(message, output, metadata=None, emit=None):
captured["shown_to_human"] = output
return ""
with (
patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
):
flow.produce()
assert captured["shown_to_human"] == "raw draft"
warning_records = [
r
for r in caplog.records
if r.name == "crewai.flow.human_feedback"
and r.levelno == logging.WARNING
and "memory recall failed" in r.getMessage()
]
assert len(warning_records) == 1
assert warning_records[0].exc_info is not None
def test_pre_review_failure_with_learn_strict_propagates(self):
"""When ``learn_strict=True``, a pre-review LLM failure is re-raised
instead of silently falling back to raw output."""
class StrictFlow(Flow):
@start()
@human_feedback(
message="Review:",
llm="gpt-4o-mini",
learn=True,
learn_strict=True,
)
def produce(self):
return "raw draft"
flow = StrictFlow()
flow.memory = self._seeded_memory()
with (
patch.object(flow, "_request_human_feedback", return_value="approved"),
patch("crewai.llm.LLM") as MockLLM,
):
mock_llm = MagicMock()
mock_llm.supports_function_calling.return_value = True
mock_llm.call.side_effect = RuntimeError(
"simulated pre-review failure"
)
MockLLM.return_value = mock_llm
with pytest.raises(RuntimeError, match="simulated pre-review failure"):
flow.produce()
def test_pre_review_recall_failure_with_learn_strict_propagates(self):
"""When ``learn_strict=True``, a ``memory.recall`` failure is re-raised
instead of silently falling back to raw output."""
class StrictFlow(Flow):
@start()
@human_feedback(
message="Review:",
llm="gpt-4o-mini",
learn=True,
learn_strict=True,
)
def produce(self):
return "raw draft"
flow = StrictFlow()
flow.memory = MagicMock()
flow.memory.recall.side_effect = RuntimeError("recall blew up")
with patch.object(flow, "_request_human_feedback", return_value="approved"):
with pytest.raises(RuntimeError, match="recall blew up"):
flow.produce()
@pytest.mark.asyncio
async def test_async_pre_review_failure_logs_warning_and_falls_back(self, caplog):
"""The async wrapper exhibits the same logging + fail-open behavior
as the sync wrapper on pre-review LLM failure."""
class AsyncLearnFlow(Flow):
@start()
@human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
async def produce(self):
return "raw draft"
flow = AsyncLearnFlow()
flow.memory = self._seeded_memory()
captured: dict[str, Any] = {}
def capture_feedback(message, output, metadata=None, emit=None):
captured["shown_to_human"] = output
return "approved"
with (
patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
patch("crewai.llm.LLM") as MockLLM,
caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
):
mock_llm = MagicMock()
mock_llm.supports_function_calling.return_value = True
mock_llm.call.side_effect = RuntimeError(
"simulated pre-review failure"
)
MockLLM.return_value = mock_llm
await flow.produce()
assert captured["shown_to_human"] == "raw draft"
warning_records = [
r
for r in caplog.records
if r.name == "crewai.flow.human_feedback"
and r.levelno == logging.WARNING
and "HITL pre-review failed" in r.getMessage()
]
assert len(warning_records) == 1
assert warning_records[0].exc_info is not None
@pytest.mark.asyncio
async def test_async_pre_review_failure_with_learn_strict_propagates(self):
"""The async wrapper also re-raises when ``learn_strict=True``."""
class AsyncStrictFlow(Flow):
@start()
@human_feedback(
message="Review:",
llm="gpt-4o-mini",
learn=True,
learn_strict=True,
)
async def produce(self):
return "raw draft"
flow = AsyncStrictFlow()
flow.memory = self._seeded_memory()
with (
patch.object(flow, "_request_human_feedback", return_value="approved"),
patch("crewai.llm.LLM") as MockLLM,
):
mock_llm = MagicMock()
mock_llm.supports_function_calling.return_value = True
mock_llm.call.side_effect = RuntimeError(
"simulated pre-review failure"
)
MockLLM.return_value = mock_llm
with pytest.raises(RuntimeError, match="simulated pre-review failure"):
await flow.produce()
def test_learn_strict_default_is_false_and_propagates_to_config(self):
"""``learn_strict`` defaults to False and is exposed on the
``HumanFeedbackConfig`` for introspection."""
@human_feedback(message="Review:", learn=True)
def default_method(self):
return "output"
@human_feedback(message="Review:", learn=True, learn_strict=True)
def strict_method(self):
return "output"
assert default_method.__human_feedback_config__.learn_strict is False
assert strict_method.__human_feedback_config__.learn_strict is True
class TestHumanFeedbackFinalOutputPreservation:
"""Tests for preserving method return value as flow's final output when @human_feedback with emit is terminal.