From d633aaa5fadc786a69e5cefd531b2ae972634180 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 04:25:15 +0000
Subject: [PATCH] fix(hitl): log pre-review failures and add learn_strict mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #5725.

In _pre_review_with_lessons (lib/crewai/src/crewai/flow/human_feedback.py), a
broad `except Exception: return method_output` silently swallowed any LLM,
network, auth, or structured-output failure during the pre-review step.
Callers could not distinguish pre-reviewed output from raw output, and no log
message or event was surfaced.

Changes:
- Add a module logger.
- Narrow the try/except in _pre_review_with_lessons so that the `mem is None`
  and `not matches` short-circuits stay outside the failure path (those are
  not error cases).
- On recall or LLM pre-review failure, log a WARNING with exc_info=True so
  the previously silent fallback is observable. Continue to fall back to the
  raw method_output so the flow does not break.
- Add an opt-in learn_strict=True parameter on the human_feedback decorator
  and HumanFeedbackConfig that re-raises pre-review failures instead of
  falling back, for callers that need fail-closed behavior.
- Update the "Graceful degradation" docs section to reflect the new
  observable-by-default behavior and document learn_strict.

Tests (lib/crewai/tests/test_human_feedback_decorator.py):
- New TestHumanFeedbackPreReviewFailure class with 7 tests covering WARNING
  logging on LLM and recall failures, learn_strict propagation in both sync
  and async paths, and config introspection of the new learn_strict flag.

Co-Authored-By: João
---
 docs/en/learn/human-feedback-in-flows.mdx     |   4 +-
 lib/crewai/src/crewai/flow/human_feedback.py  |  71 ++++-
 .../tests/test_human_feedback_decorator.py    | 272 ++++++++++++++++++
 3 files changed, 336 insertions(+), 11 deletions(-)

diff --git a/docs/en/learn/human-feedback-in-flows.mdx b/docs/en/learn/human-feedback-in-flows.mdx
index 0c3792bca..8110625bd 100644
--- a/docs/en/learn/human-feedback-in-flows.mdx
+++ b/docs/en/learn/human-feedback-in-flows.mdx
@@ -682,6 +682,7 @@ class ArticleReviewFlow(Flow):
 | Parameter | Default | Description |
 |-----------|---------|-------------|
 | `learn` | `False` | Enable HITL learning |
+| `learn_strict` | `False` | If `True`, pre-review failures are re-raised instead of falling back to raw output |
 | `learn_limit` | `5` | Max past lessons to recall for pre-review |
 
 ### Key Design Decisions
@@ -689,7 +690,8 @@ class ArticleReviewFlow(Flow):
 
 - **Same LLM for everything**: The `llm` parameter on the decorator is shared by outcome collapsing, lesson distillation, and pre-review. No need to configure multiple models.
 - **Structured output**: Both distillation and pre-review use function calling with Pydantic models when the LLM supports it, falling back to text parsing otherwise.
 - **Non-blocking storage**: Lessons are stored via `remember_many()` which runs in a background thread -- the flow continues immediately.
-- **Graceful degradation**: If the LLM fails during distillation, nothing is stored. If it fails during pre-review, the raw output is shown. Neither failure blocks the flow.
+- **Observable graceful degradation**: If the LLM fails during distillation, nothing is stored. If it fails during pre-review, the raw output is shown to the human and the failure is logged at `WARNING` level with the full traceback (`exc_info=True`) under the `crewai.flow.human_feedback` logger -- so the silent fallback is detectable. Neither failure blocks the flow.
+- **Strict mode**: Pass `learn_strict=True` to make pre-review fail closed -- failures (LLM error, network/auth error, structured-output parse error, memory `recall` error) propagate out of the flow method instead of being swallowed. Use this when downstream code must be able to assume that pre-review actually executed.
 - **No scope/categories needed**: When storing lessons, only `source` is passed. The encoding pipeline infers scope, categories, and importance automatically.
 
diff --git a/lib/crewai/src/crewai/flow/human_feedback.py b/lib/crewai/src/crewai/flow/human_feedback.py
index e6a51d9da..636ad1012 100644
--- a/lib/crewai/src/crewai/flow/human_feedback.py
+++ b/lib/crewai/src/crewai/flow/human_feedback.py
@@ -60,6 +60,7 @@ from collections.abc import Callable, Sequence
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import wraps
+import logging
 from typing import TYPE_CHECKING, Any, TypeVar
 
 from pydantic import BaseModel, Field
@@ -75,6 +76,8 @@ if TYPE_CHECKING:
 
 F = TypeVar("F", bound=Callable[..., Any])
 
+logger = logging.getLogger(__name__)
+
 
 def _serialize_llm_for_context(llm: Any) -> dict[str, Any] | str | None:
     """Serialize a BaseLLM object to a dict preserving full config.
@@ -188,6 +191,7 @@ class HumanFeedbackConfig:
     provider: HumanFeedbackProvider | None = None
     learn: bool = False
     learn_source: str = "hitl"
+    learn_strict: bool = False
 
 
 class HumanFeedbackMethod(FlowMethod[Any, Any]):
@@ -237,6 +241,7 @@ def human_feedback(
     provider: HumanFeedbackProvider | None = None,
     learn: bool = False,
     learn_source: str = "hitl",
+    learn_strict: bool = False,
 ) -> Callable[[F], F]:
     """Decorator for Flow methods that require human feedback.
 
@@ -275,6 +280,20 @@ def human_feedback(
             external systems like Slack, Teams, or webhooks. When the
             provider raises HumanFeedbackPending, the flow pauses and can
             be resumed later with Flow.resume().
+        learn: When True, enables HITL learning. After feedback is
+            collected, the LLM distills generalizable lessons and stores
+            them in memory. Before the next review, past lessons are
+            recalled and applied via an LLM pre-review step so the human
+            sees a progressively improved output.
+        learn_source: The memory ``source`` tag used when storing and
+            recalling HITL lessons. Defaults to ``"hitl"``.
+        learn_strict: When True (default False), pre-review failures are
+            re-raised instead of falling back to the raw output. By
+            default, failures are logged at WARNING level with full
+            traceback (``exc_info=True``) and the raw method output is
+            shown to the human. Set this to True if downstream callers
+            must be able to assume that pre-review actually executed
+            successfully.
 
     Returns:
         A decorator function that wraps the method with human feedback
@@ -373,16 +392,40 @@ def human_feedback(
     def _pre_review_with_lessons(
         flow_instance: Flow[Any], method_output: Any
     ) -> Any:
-        """Recall past HITL lessons and use LLM to pre-review the output."""
-        try:
-            mem = flow_instance.memory
-            if mem is None:
-                return method_output
-            query = f"human feedback lessons for {func.__name__}: {method_output!s}"
-            matches = mem.recall(query, source=learn_source)
-            if not matches:
-                return method_output
+        """Recall past HITL lessons and use LLM to pre-review the output.
+
+        Returns the original ``method_output`` when memory is unavailable
+        or no lessons match — these are not error cases.
+
+        When the recall or LLM pre-review call raises an exception (for
+        example LLM/network/auth failure or structured-output parse
+        error), the failure is logged at WARNING level with full
+        traceback (``exc_info=True``) so callers can detect the silent
+        fallback. When ``learn_strict=True`` was passed to the decorator,
+        the exception is re-raised instead of being swallowed.
+        """
+        mem = flow_instance.memory
+        if mem is None:
+            return method_output
+
+        query = f"human feedback lessons for {func.__name__}: {method_output!s}"
+        try:
+            matches = mem.recall(query, source=learn_source)
+        except Exception:
+            logger.warning(
+                "HITL pre-review: memory recall failed for %s; falling "
+                "back to raw output.",
+                func.__name__,
+                exc_info=True,
+            )
+            if learn_strict:
+                raise
+            return method_output
+
+        if not matches:
+            return method_output
+
+        try:
             lessons = "\n".join(f"- {m.record.content}" for m in matches)
             llm_inst = _resolve_llm_instance()
             prompt = _get_hitl_prompt("hitl_pre_review_user").format(
@@ -404,7 +447,14 @@ def human_feedback(
             reviewed = llm_inst.call(messages)
             return reviewed if isinstance(reviewed, str) else str(reviewed)
         except Exception:
-            return method_output  # fallback to raw output on any failure
+            logger.warning(
+                "HITL pre-review failed for %s; falling back to raw output.",
+                func.__name__,
+                exc_info=True,
+            )
+            if learn_strict:
+                raise
+            return method_output
 
     def _distill_and_store_lessons(
         flow_instance: Flow[Any], method_output: Any, raw_feedback: str
@@ -654,6 +704,7 @@ def human_feedback(
             provider=provider,
             learn=learn,
             learn_source=learn_source,
+            learn_strict=learn_strict,
         )
 
         wrapper.__is_flow_method__ = True
diff --git a/lib/crewai/tests/test_human_feedback_decorator.py b/lib/crewai/tests/test_human_feedback_decorator.py
index 68371eb0d..d74ad9695 100644
--- a/lib/crewai/tests/test_human_feedback_decorator.py
+++ b/lib/crewai/tests/test_human_feedback_decorator.py
@@ -7,6 +7,7 @@ async support, and attribute preservation functionality.
 from __future__ import annotations
 
 import asyncio
+import logging
 from datetime import datetime
 from typing import Any
 from unittest.mock import MagicMock, patch
@@ -597,6 +598,277 @@ class TestHumanFeedbackLearn:
         assert config.llm == "gpt-4o-mini"
 
 
+class TestHumanFeedbackPreReviewFailure:
+    """Tests for HITL pre-review failure handling (issue #5725).
+
+    Pre-review must NOT silently swallow exceptions: failures should be
+    logged at WARNING level with full traceback, and an opt-in
+    ``learn_strict=True`` should propagate the exception instead of
+    falling back to raw output.
+    """
+
+    @staticmethod
+    def _seeded_memory() -> MagicMock:
+        """Return a mock memory object with one stored lesson so that
+        pre-review proceeds past the ``not matches`` short-circuit."""
+        from crewai.memory.types import MemoryMatch, MemoryRecord
+
+        mem = MagicMock()
+        mem.recall.return_value = [
+            MemoryMatch(
+                record=MemoryRecord(
+                    content="Always include source citations",
+                    embedding=[],
+                ),
+                score=0.9,
+                match_reasons=["semantic"],
+            )
+        ]
+        return mem
+
+    def test_pre_review_llm_failure_logs_warning_and_falls_back(self, caplog):
+        """When the pre-review LLM call raises, the failure is logged at
+        WARNING with exc_info, and the raw method_output is shown to the
+        human (default fail-open behavior, but now observable)."""
+
+        class LearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            def produce(self):
+                return "raw draft"
+
+        flow = LearnFlow()
+        flow.memory = self._seeded_memory()
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return "approved"
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            patch("crewai.llm.LLM") as MockLLM,
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            flow.produce()
+
+        # The human still sees the raw output (fail-open by default).
+        assert captured["shown_to_human"] == "raw draft"
+
+        # ...but the failure is now observable via a WARNING log with traceback.
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "HITL pre-review failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1, (
+            "Expected exactly one HITL pre-review failure warning, got: "
+            f"{[r.getMessage() for r in caplog.records]}"
+        )
+        # exc_info=True must be set so the traceback is captured.
+        assert warning_records[0].exc_info is not None
+        # The exception type/message should be visible in the captured exc_info.
+        exc_type, exc_value, _ = warning_records[0].exc_info
+        assert exc_type is RuntimeError
+        assert "simulated pre-review failure" in str(exc_value)
+
+    def test_pre_review_recall_failure_logs_warning_and_falls_back(self, caplog):
+        """A failure in ``memory.recall`` is also logged and falls back to
+        the raw output instead of being silently swallowed."""
+
+        class LearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            def produce(self):
+                return "raw draft"
+
+        flow = LearnFlow()
+        flow.memory = MagicMock()
+        flow.memory.recall.side_effect = RuntimeError("recall blew up")
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return ""
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            flow.produce()
+
+        assert captured["shown_to_human"] == "raw draft"
+
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "memory recall failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1
+        assert warning_records[0].exc_info is not None
+
+    def test_pre_review_failure_with_learn_strict_propagates(self):
+        """When ``learn_strict=True``, a pre-review LLM failure is re-raised
+        instead of silently falling back to raw output."""
+
+        class StrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            def produce(self):
+                return "raw draft"
+
+        flow = StrictFlow()
+        flow.memory = self._seeded_memory()
+
+        with (
+            patch.object(flow, "_request_human_feedback", return_value="approved"),
+            patch("crewai.llm.LLM") as MockLLM,
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            with pytest.raises(RuntimeError, match="simulated pre-review failure"):
+                flow.produce()
+
+    def test_pre_review_recall_failure_with_learn_strict_propagates(self):
+        """When ``learn_strict=True``, a ``memory.recall`` failure is re-raised
+        instead of silently falling back to raw output."""
+
+        class StrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            def produce(self):
+                return "raw draft"
+
+        flow = StrictFlow()
+        flow.memory = MagicMock()
+        flow.memory.recall.side_effect = RuntimeError("recall blew up")
+
+        with patch.object(flow, "_request_human_feedback", return_value="approved"):
+            with pytest.raises(RuntimeError, match="recall blew up"):
+                flow.produce()
+
+    @pytest.mark.asyncio
+    async def test_async_pre_review_failure_logs_warning_and_falls_back(self, caplog):
+        """The async wrapper exhibits the same logging + fail-open behavior
+        as the sync wrapper on pre-review LLM failure."""
+
+        class AsyncLearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            async def produce(self):
+                return "raw draft"
+
+        flow = AsyncLearnFlow()
+        flow.memory = self._seeded_memory()
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return "approved"
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            patch("crewai.llm.LLM") as MockLLM,
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            await flow.produce()
+
+        assert captured["shown_to_human"] == "raw draft"
+
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "HITL pre-review failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1
+        assert warning_records[0].exc_info is not None
+
+    @pytest.mark.asyncio
+    async def test_async_pre_review_failure_with_learn_strict_propagates(self):
+        """The async wrapper also re-raises when ``learn_strict=True``."""
+
+        class AsyncStrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            async def produce(self):
+                return "raw draft"
+
+        flow = AsyncStrictFlow()
+        flow.memory = self._seeded_memory()
+
+        with (
+            patch.object(flow, "_request_human_feedback", return_value="approved"),
+            patch("crewai.llm.LLM") as MockLLM,
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            with pytest.raises(RuntimeError, match="simulated pre-review failure"):
+                await flow.produce()
+
+    def test_learn_strict_default_is_false_and_propagates_to_config(self):
+        """``learn_strict`` defaults to False and is exposed on the
+        ``HumanFeedbackConfig`` for introspection."""
+
+        @human_feedback(message="Review:", learn=True)
+        def default_method(self):
+            return "output"
+
+        @human_feedback(message="Review:", learn=True, learn_strict=True)
+        def strict_method(self):
+            return "output"
+
+        assert default_method.__human_feedback_config__.learn_strict is False
+        assert strict_method.__human_feedback_config__.learn_strict is True
+
+
 class TestHumanFeedbackFinalOutputPreservation:
     """Tests for preserving method return value as flow's final output
     when @human_feedback with emit is terminal.
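
A minimal usage sketch of the new flag for reviewers (the flow and method names below are illustrative, and the import paths are an assumption based on the modules touched above, not part of the patch). With the default learn_strict=False, a pre-review failure is logged at WARNING on the crewai.flow.human_feedback logger and the reviewer sees the raw output; with learn_strict=True the same failure propagates out of the flow method.

# Sketch only: DraftFlow/StrictDraftFlow and the import paths are assumptions.
from crewai.flow.flow import Flow, start
from crewai.flow.human_feedback import human_feedback


class DraftFlow(Flow):
    # Default fail-open behavior: if the pre-review LLM call or memory
    # recall fails, the reviewer is shown "raw draft" and a WARNING with
    # the full traceback is emitted on "crewai.flow.human_feedback".
    @start()
    @human_feedback(message="Review the draft:", llm="gpt-4o-mini", learn=True)
    def draft(self):
        return "raw draft"


class StrictDraftFlow(Flow):
    # Fail-closed behavior: the same pre-review failure is re-raised out
    # of the flow method, so a successful run implies pre-review actually
    # executed before the human saw the output.
    @start()
    @human_feedback(
        message="Review the draft:",
        llm="gpt-4o-mini",
        learn=True,
        learn_strict=True,
    )
    def draft(self):
        return "raw draft"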