From d633aaa5fadc786a69e5cefd531b2ae972634180 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 04:25:15 +0000
Subject: [PATCH] fix(hitl): log pre-review failures and add learn_strict mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #5725.

In _pre_review_with_lessons (lib/crewai/src/crewai/flow/human_feedback.py), a
broad `except Exception: return method_output` silently swallowed any LLM,
network, auth, or structured-output failure during the pre-review step.
Callers could not distinguish pre-reviewed output from raw output, and no log
message or event was surfaced.

Changes:
- Add a module logger.
- Narrow the try/except in _pre_review_with_lessons so that the `mem is None`
  and `not matches` short-circuits stay outside the failure path (those are
  not error cases).
- On recall or LLM pre-review failure, log a WARNING with exc_info=True so
  the previously silent fallback is observable. Continue to fall back to the
  raw method_output so the flow does not break.
- Add an opt-in learn_strict=True parameter on the human_feedback decorator
  and HumanFeedbackConfig that re-raises pre-review failures instead of
  falling back, for callers that need fail-closed behavior.
- Update the "Graceful degradation" docs section to reflect the new
  observable-by-default behavior and document learn_strict.

Tests (lib/crewai/tests/test_human_feedback_decorator.py):
- New TestHumanFeedbackPreReviewFailure class with 7 tests covering WARNING
  logging on LLM and recall failures, learn_strict propagation in both sync
  and async paths, and config introspection of the new learn_strict flag.

Co-Authored-By: João
---
 docs/en/learn/human-feedback-in-flows.mdx     |   4 +-
 lib/crewai/src/crewai/flow/human_feedback.py  |  71 ++++-
 .../tests/test_human_feedback_decorator.py    | 272 ++++++++++++++++++
 3 files changed, 336 insertions(+), 11 deletions(-)

diff --git a/docs/en/learn/human-feedback-in-flows.mdx b/docs/en/learn/human-feedback-in-flows.mdx
index 0c3792bca..8110625bd 100644
--- a/docs/en/learn/human-feedback-in-flows.mdx
+++ b/docs/en/learn/human-feedback-in-flows.mdx
@@ -682,6 +682,7 @@ class ArticleReviewFlow(Flow):
 | Parameter | Default | Description |
 |-----------|---------|-------------|
 | `learn` | `False` | Enable HITL learning |
+| `learn_strict` | `False` | If `True`, pre-review failures are re-raised instead of falling back to raw output |
 | `learn_limit` | `5` | Max past lessons to recall for pre-review |
 
 ### Key Design Decisions
@@ -689,7 +690,8 @@ class ArticleReviewFlow(Flow):
 
 - **Same LLM for everything**: The `llm` parameter on the decorator is shared by outcome collapsing, lesson distillation, and pre-review. No need to configure multiple models.
 - **Structured output**: Both distillation and pre-review use function calling with Pydantic models when the LLM supports it, falling back to text parsing otherwise.
 - **Non-blocking storage**: Lessons are stored via `remember_many()` which runs in a background thread -- the flow continues immediately.
-- **Graceful degradation**: If the LLM fails during distillation, nothing is stored. If it fails during pre-review, the raw output is shown. Neither failure blocks the flow.
+- **Observable graceful degradation**: If the LLM fails during distillation, nothing is stored. If it fails during pre-review, the raw output is shown to the human and the failure is logged at `WARNING` level with the full traceback (`exc_info=True`) under the `crewai.flow.human_feedback` logger -- so the silent fallback is detectable. Neither failure blocks the flow.
+- **Strict mode**: Pass `learn_strict=True` to make pre-review fail closed -- failures (LLM error, network/auth error, structured-output parse error, memory `recall` error) propagate out of the flow method instead of being swallowed. Use this when downstream code must be able to assume that pre-review actually executed.
 - **No scope/categories needed**: When storing lessons, only `source` is passed. The encoding pipeline infers scope, categories, and importance automatically.
 
diff --git a/lib/crewai/src/crewai/flow/human_feedback.py b/lib/crewai/src/crewai/flow/human_feedback.py
index e6a51d9da..636ad1012 100644
--- a/lib/crewai/src/crewai/flow/human_feedback.py
+++ b/lib/crewai/src/crewai/flow/human_feedback.py
@@ -60,6 +60,7 @@ from collections.abc import Callable, Sequence
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import wraps
+import logging
 from typing import TYPE_CHECKING, Any, TypeVar
 
 from pydantic import BaseModel, Field
@@ -75,6 +76,8 @@ if TYPE_CHECKING:
 
 F = TypeVar("F", bound=Callable[..., Any])
 
+logger = logging.getLogger(__name__)
+
 
 def _serialize_llm_for_context(llm: Any) -> dict[str, Any] | str | None:
     """Serialize a BaseLLM object to a dict preserving full config.
@@ -188,6 +191,7 @@ class HumanFeedbackConfig:
     provider: HumanFeedbackProvider | None = None
     learn: bool = False
     learn_source: str = "hitl"
+    learn_strict: bool = False
 
 
 class HumanFeedbackMethod(FlowMethod[Any, Any]):
@@ -237,6 +241,7 @@ def human_feedback(
     provider: HumanFeedbackProvider | None = None,
     learn: bool = False,
     learn_source: str = "hitl",
+    learn_strict: bool = False,
 ) -> Callable[[F], F]:
     """Decorator for Flow methods that require human feedback.
 
@@ -275,6 +280,20 @@ def human_feedback(
             external systems like Slack, Teams, or webhooks. When the
             provider raises HumanFeedbackPending, the flow pauses and can
             be resumed later with Flow.resume().
+        learn: When True, enables HITL learning. After feedback is
+            collected, the LLM distills generalizable lessons and stores
+            them in memory. Before the next review, past lessons are
+            recalled and applied via an LLM pre-review step so the human
+            sees a progressively improved output.
+        learn_source: The memory ``source`` tag used when storing and
+            recalling HITL lessons. Defaults to ``"hitl"``.
+        learn_strict: When True (default False), pre-review failures are
+            re-raised instead of falling back to the raw output. By
+            default, failures are logged at WARNING level with full
+            traceback (``exc_info=True``) and the raw method output is
+            shown to the human. Set this to True if downstream callers
+            must be able to assume that pre-review actually executed
+            successfully.
 
     Returns:
         A decorator function that wraps the method with human feedback
@@ -373,16 +392,40 @@ def human_feedback(
     def _pre_review_with_lessons(
         flow_instance: Flow[Any], method_output: Any
     ) -> Any:
-        """Recall past HITL lessons and use LLM to pre-review the output."""
-        try:
-            mem = flow_instance.memory
-            if mem is None:
-                return method_output
-            query = f"human feedback lessons for {func.__name__}: {method_output!s}"
-            matches = mem.recall(query, source=learn_source)
-            if not matches:
-                return method_output
+        """Recall past HITL lessons and use LLM to pre-review the output.
+
+        Returns the original ``method_output`` when memory is unavailable
+        or no lessons match — these are not error cases.
+
+        When the recall or LLM pre-review call raises an exception (for
+        example LLM/network/auth failure or structured-output parse
+        error), the failure is logged at WARNING level with full
+        traceback (``exc_info=True``) so callers can detect the silent
+        fallback. When ``learn_strict=True`` was passed to the decorator,
+        the exception is re-raised instead of being swallowed.
+        """
+        mem = flow_instance.memory
+        if mem is None:
+            return method_output
+
+        query = f"human feedback lessons for {func.__name__}: {method_output!s}"
+        try:
+            matches = mem.recall(query, source=learn_source)
+        except Exception:
+            logger.warning(
+                "HITL pre-review: memory recall failed for %s; falling "
+                "back to raw output.",
+                func.__name__,
+                exc_info=True,
+            )
+            if learn_strict:
+                raise
+            return method_output
+
+        if not matches:
+            return method_output
+
+        try:
             lessons = "\n".join(f"- {m.record.content}" for m in matches)
             llm_inst = _resolve_llm_instance()
             prompt = _get_hitl_prompt("hitl_pre_review_user").format(
@@ -404,7 +447,14 @@ def human_feedback(
             reviewed = llm_inst.call(messages)
             return reviewed if isinstance(reviewed, str) else str(reviewed)
         except Exception:
-            return method_output  # fallback to raw output on any failure
+            logger.warning(
+                "HITL pre-review failed for %s; falling back to raw output.",
+                func.__name__,
+                exc_info=True,
+            )
+            if learn_strict:
+                raise
+            return method_output
 
     def _distill_and_store_lessons(
         flow_instance: Flow[Any], method_output: Any, raw_feedback: str
@@ -654,6 +704,7 @@ def human_feedback(
             provider=provider,
             learn=learn,
             learn_source=learn_source,
+            learn_strict=learn_strict,
         )
 
         wrapper.__is_flow_method__ = True
diff --git a/lib/crewai/tests/test_human_feedback_decorator.py b/lib/crewai/tests/test_human_feedback_decorator.py
index 68371eb0d..d74ad9695 100644
--- a/lib/crewai/tests/test_human_feedback_decorator.py
+++ b/lib/crewai/tests/test_human_feedback_decorator.py
@@ -7,6 +7,7 @@ async support, and attribute preservation functionality.
 from __future__ import annotations
 
 import asyncio
+import logging
 from datetime import datetime
 from typing import Any
 from unittest.mock import MagicMock, patch
@@ -597,6 +598,277 @@ class TestHumanFeedbackLearn:
         assert config.llm == "gpt-4o-mini"
 
 
+class TestHumanFeedbackPreReviewFailure:
+    """Tests for HITL pre-review failure handling (issue #5725).
+
+    Pre-review must NOT silently swallow exceptions: failures should be
+    logged at WARNING level with full traceback, and an opt-in
+    ``learn_strict=True`` should propagate the exception instead of
+    falling back to raw output.
+    """
+
+    @staticmethod
+    def _seeded_memory() -> MagicMock:
+        """Return a mock memory object with one stored lesson so that
+        pre-review proceeds past the ``not matches`` short-circuit."""
+        from crewai.memory.types import MemoryMatch, MemoryRecord
+
+        mem = MagicMock()
+        mem.recall.return_value = [
+            MemoryMatch(
+                record=MemoryRecord(
+                    content="Always include source citations",
+                    embedding=[],
+                ),
+                score=0.9,
+                match_reasons=["semantic"],
+            )
+        ]
+        return mem
+
+    def test_pre_review_llm_failure_logs_warning_and_falls_back(self, caplog):
+        """When the pre-review LLM call raises, the failure is logged at
+        WARNING with exc_info, and the raw method_output is shown to the
+        human (default fail-open behavior, but now observable)."""
+
+        class LearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            def produce(self):
+                return "raw draft"
+
+        flow = LearnFlow()
+        flow.memory = self._seeded_memory()
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return "approved"
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            patch("crewai.llm.LLM") as MockLLM,
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            flow.produce()
+
+        # The human still sees the raw output (fail-open by default).
+        assert captured["shown_to_human"] == "raw draft"
+
+        # ...but the failure is now observable via a WARNING log with traceback.
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "HITL pre-review failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1, (
+            "Expected exactly one HITL pre-review failure warning, got: "
+            f"{[r.getMessage() for r in caplog.records]}"
+        )
+        # exc_info=True must be set so the traceback is captured.
+        assert warning_records[0].exc_info is not None
+        # The exception type/message should be visible in the captured exc_info.
+        exc_type, exc_value, _ = warning_records[0].exc_info
+        assert exc_type is RuntimeError
+        assert "simulated pre-review failure" in str(exc_value)
+
+    def test_pre_review_recall_failure_logs_warning_and_falls_back(self, caplog):
+        """A failure in ``memory.recall`` is also logged and falls back to
+        the raw output instead of being silently swallowed."""
+
+        class LearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            def produce(self):
+                return "raw draft"
+
+        flow = LearnFlow()
+        flow.memory = MagicMock()
+        flow.memory.recall.side_effect = RuntimeError("recall blew up")
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return ""
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            flow.produce()
+
+        assert captured["shown_to_human"] == "raw draft"
+
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "memory recall failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1
+        assert warning_records[0].exc_info is not None
+
+    def test_pre_review_failure_with_learn_strict_propagates(self):
+        """When ``learn_strict=True``, a pre-review LLM failure is re-raised
+        instead of silently falling back to raw output."""
+
+        class StrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            def produce(self):
+                return "raw draft"
+
+        flow = StrictFlow()
+        flow.memory = self._seeded_memory()
+
+        with (
+            patch.object(flow, "_request_human_feedback", return_value="approved"),
+            patch("crewai.llm.LLM") as MockLLM,
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            with pytest.raises(RuntimeError, match="simulated pre-review failure"):
+                flow.produce()
+
+    def test_pre_review_recall_failure_with_learn_strict_propagates(self):
+        """When ``learn_strict=True``, a ``memory.recall`` failure is re-raised
+        instead of silently falling back to raw output."""
+
+        class StrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            def produce(self):
+                return "raw draft"
+
+        flow = StrictFlow()
+        flow.memory = MagicMock()
+        flow.memory.recall.side_effect = RuntimeError("recall blew up")
+
+        with patch.object(flow, "_request_human_feedback", return_value="approved"):
+            with pytest.raises(RuntimeError, match="recall blew up"):
+                flow.produce()
+
+    @pytest.mark.asyncio
+    async def test_async_pre_review_failure_logs_warning_and_falls_back(self, caplog):
+        """The async wrapper exhibits the same logging + fail-open behavior
+        as the sync wrapper on pre-review LLM failure."""
+
+        class AsyncLearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            async def produce(self):
+                return "raw draft"
+
+        flow = AsyncLearnFlow()
+        flow.memory = self._seeded_memory()
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return "approved"
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            patch("crewai.llm.LLM") as MockLLM,
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            await flow.produce()
+
+        assert captured["shown_to_human"] == "raw draft"
+
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "HITL pre-review failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1
+        assert warning_records[0].exc_info is not None
+
+    @pytest.mark.asyncio
+    async def test_async_pre_review_failure_with_learn_strict_propagates(self):
+        """The async wrapper also re-raises when ``learn_strict=True``."""
+
+        class AsyncStrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            async def produce(self):
+                return "raw draft"
+
+        flow = AsyncStrictFlow()
+        flow.memory = self._seeded_memory()
+
+        with (
+            patch.object(flow, "_request_human_feedback", return_value="approved"),
+            patch("crewai.llm.LLM") as MockLLM,
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            with pytest.raises(RuntimeError, match="simulated pre-review failure"):
+                await flow.produce()
+
+    def test_learn_strict_default_is_false_and_propagates_to_config(self):
+        """``learn_strict`` defaults to False and is exposed on the
+        ``HumanFeedbackConfig`` for introspection."""
+
+        @human_feedback(message="Review:", learn=True)
+        def default_method(self):
+            return "output"
+
+        @human_feedback(message="Review:", learn=True, learn_strict=True)
+        def strict_method(self):
+            return "output"
+
+        assert default_method.__human_feedback_config__.learn_strict is False
+        assert strict_method.__human_feedback_config__.learn_strict is True
+
+
 class TestHumanFeedbackFinalOutputPreservation:
     """Tests for preserving method return value as flow's final output
     when @human_feedback with emit is terminal.
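
A minimal usage sketch of the new flag for reviewers (the flow and method names below are illustrative, and the import paths are an assumption based on the modules touched above, not part of the patch). With the default learn_strict=False, a pre-review failure is logged at WARNING on the crewai.flow.human_feedback logger and the reviewer sees the raw output; with learn_strict=True the same failure propagates out of the flow method.

# Sketch only: DraftFlow/StrictDraftFlow and the import paths are assumptions.
from crewai.flow.flow import Flow, start
from crewai.flow.human_feedback import human_feedback


class DraftFlow(Flow):
    # Default fail-open behavior: if the pre-review LLM call or memory
    # recall fails, the reviewer is shown "raw draft" and a WARNING with
    # the full traceback is emitted on "crewai.flow.human_feedback".
    @start()
    @human_feedback(message="Review the draft:", llm="gpt-4o-mini", learn=True)
    def draft(self):
        return "raw draft"


class StrictDraftFlow(Flow):
    # Fail-closed behavior: the same pre-review failure is re-raised out
    # of the flow method, so a successful run implies pre-review actually
    # executed before the human saw the output.
    @start()
    @human_feedback(
        message="Review the draft:",
        llm="gpt-4o-mini",
        learn=True,
        learn_strict=True,
    )
    def draft(self):
        return "raw draft"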