fix(hitl): log pre-review failures and add learn_strict mode
Closes #5725. In `_pre_review_with_lessons` (lib/crewai/src/crewai/flow/human_feedback.py), a broad `except Exception: return method_output` silently swallowed any LLM, network, auth, or structured-output failure during the pre-review step. Callers could not distinguish pre-reviewed output from raw output, and there was no log or event surfaced.

Changes:
- Add a module logger.
- Narrow the try/except in `_pre_review_with_lessons` so the `mem is None` and `not matches` short-circuits stay outside the failure path (those are not error cases).
- On recall or LLM pre-review failure, log a WARNING with `exc_info=True` so the silent fallback is observable. Continue to fall back to the raw `method_output` so the flow does not break.
- Add an opt-in `learn_strict=True` parameter on the `human_feedback` decorator and `HumanFeedbackConfig` that re-raises pre-review failures instead of falling back, for callers that need fail-closed behavior.
- Update the "Graceful degradation" docs section to reflect the new observable-by-default behavior and document `learn_strict`.

Tests (lib/crewai/tests/test_human_feedback_decorator.py):
- New `TestHumanFeedbackPreReviewFailure` class with 7 tests covering WARNING logging on LLM and recall failures, `learn_strict` propagation in both sync and async paths, and config introspection of the new `learn_strict` flag.

Co-Authored-By: João <joao@crewai.com>
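For orientation, a minimal usage sketch of the opt-in strict mode (import paths inferred from the file paths named above; the flow class, method, and model name are illustrative, not taken from the diff):

    from crewai.flow.flow import Flow, start
    from crewai.flow.human_feedback import human_feedback

    class ReportFlow(Flow):
        @start()
        # learn=True turns on HITL learning; learn_strict=True makes the
        # pre-review step fail closed: LLM/network/auth/parse/recall errors
        # propagate instead of silently showing the raw draft to the reviewer.
        @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True, learn_strict=True)
        def draft(self):
            return "draft report"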
@@ -682,6 +682,7 @@ class ArticleReviewFlow(Flow):
 | Parameter | Default | Description |
 |-----------|---------|-------------|
 | `learn` | `False` | Enable HITL learning |
+| `learn_strict` | `False` | If `True`, pre-review failures are re-raised instead of falling back to raw output |
 | `learn_limit` | `5` | Max past lessons to recall for pre-review |
 
 ### Key Design Decisions
@@ -689,7 +690,8 @@ class ArticleReviewFlow(Flow):
 - **Same LLM for everything**: The `llm` parameter on the decorator is shared by outcome collapsing, lesson distillation, and pre-review. No need to configure multiple models.
 - **Structured output**: Both distillation and pre-review use function calling with Pydantic models when the LLM supports it, falling back to text parsing otherwise.
 - **Non-blocking storage**: Lessons are stored via `remember_many()` which runs in a background thread -- the flow continues immediately.
-- **Graceful degradation**: If the LLM fails during distillation, nothing is stored. If it fails during pre-review, the raw output is shown. Neither failure blocks the flow.
+- **Observable graceful degradation**: If the LLM fails during distillation, nothing is stored. If it fails during pre-review, the raw output is shown to the human and the failure is logged at `WARNING` level with the full traceback (`exc_info=True`) under the `crewai.flow.human_feedback` logger -- so the silent fallback is detectable. Neither failure blocks the flow.
+- **Strict mode**: Pass `learn_strict=True` to make pre-review fail closed -- failures (LLM error, network/auth error, structured-output parse error, memory `recall` error) propagate out of the flow method instead of being swallowed. Use this when downstream code must be able to assume that pre-review actually executed.
 - **No scope/categories needed**: When storing lessons, only `source` is passed. The encoding pipeline infers scope, categories, and importance automatically.
 
 <Note>
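To make the default fail-open path observable in an application, plain standard-library logging is enough (the logger name comes from the docs bullet above; this is a sketch, not part of the commit):

    import logging

    # Pre-review failures are logged at WARNING with exc_info=True under
    # this logger, so a basic config is enough to surface full tracebacks.
    logging.basicConfig(level=logging.WARNING)
    logging.getLogger("crewai.flow.human_feedback").setLevel(logging.WARNING)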
lib/crewai/src/crewai/flow/human_feedback.py
@@ -60,6 +60,7 @@ from collections.abc import Callable, Sequence
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import wraps
+import logging
 from typing import TYPE_CHECKING, Any, TypeVar
 
 from pydantic import BaseModel, Field
@@ -75,6 +76,8 @@ if TYPE_CHECKING:
 
 F = TypeVar("F", bound=Callable[..., Any])
 
+logger = logging.getLogger(__name__)
+
 
 def _serialize_llm_for_context(llm: Any) -> dict[str, Any] | str | None:
     """Serialize a BaseLLM object to a dict preserving full config.
@@ -188,6 +191,7 @@ class HumanFeedbackConfig:
     provider: HumanFeedbackProvider | None = None
     learn: bool = False
     learn_source: str = "hitl"
+    learn_strict: bool = False
 
 
 class HumanFeedbackMethod(FlowMethod[Any, Any]):
@@ -237,6 +241,7 @@ def human_feedback(
    provider: HumanFeedbackProvider | None = None,
    learn: bool = False,
    learn_source: str = "hitl",
+    learn_strict: bool = False,
 ) -> Callable[[F], F]:
     """Decorator for Flow methods that require human feedback.
 
@@ -275,6 +280,20 @@ def human_feedback(
             external systems like Slack, Teams, or webhooks. When the
             provider raises HumanFeedbackPending, the flow pauses and
             can be resumed later with Flow.resume().
+        learn: When True, enables HITL learning. After feedback is
+            collected, the LLM distills generalizable lessons and stores
+            them in memory. Before the next review, past lessons are
+            recalled and applied via an LLM pre-review step so the human
+            sees a progressively improved output.
+        learn_source: The memory ``source`` tag used when storing and
+            recalling HITL lessons. Defaults to ``"hitl"``.
+        learn_strict: When True (default False), pre-review failures are
+            re-raised instead of falling back to the raw output. By
+            default, failures are logged at WARNING level with full
+            traceback (``exc_info=True``) and the raw method output is
+            shown to the human. Set this to True if downstream callers
+            must be able to assume that pre-review actually executed
+            successfully.
 
     Returns:
         A decorator function that wraps the method with human feedback
@@ -373,16 +392,40 @@ def human_feedback(
     def _pre_review_with_lessons(
         flow_instance: Flow[Any], method_output: Any
     ) -> Any:
-        """Recall past HITL lessons and use LLM to pre-review the output."""
-        try:
-            mem = flow_instance.memory
-            if mem is None:
-                return method_output
-            query = f"human feedback lessons for {func.__name__}: {method_output!s}"
-            matches = mem.recall(query, source=learn_source)
-            if not matches:
-                return method_output
+        """Recall past HITL lessons and use LLM to pre-review the output.
+
+        Returns the original ``method_output`` when memory is unavailable
+        or no lessons match — these are not error cases.
+
+        When the recall or LLM pre-review call raises an exception (for
+        example LLM/network/auth failure or structured-output parse
+        error), the failure is logged at WARNING level with full
+        traceback (``exc_info=True``) so callers can detect the silent
+        fallback. When ``learn_strict=True`` was passed to the decorator,
+        the exception is re-raised instead of being swallowed.
+        """
+        mem = flow_instance.memory
+        if mem is None:
+            return method_output
+
+        query = f"human feedback lessons for {func.__name__}: {method_output!s}"
+        try:
+            matches = mem.recall(query, source=learn_source)
+        except Exception:
+            logger.warning(
+                "HITL pre-review: memory recall failed for %s; falling "
+                "back to raw output.",
+                func.__name__,
+                exc_info=True,
+            )
+            if learn_strict:
+                raise
+            return method_output
+
+        if not matches:
+            return method_output
+
+        try:
             lessons = "\n".join(f"- {m.record.content}" for m in matches)
             llm_inst = _resolve_llm_instance()
             prompt = _get_hitl_prompt("hitl_pre_review_user").format(
@@ -404,7 +447,14 @@ def human_feedback(
             reviewed = llm_inst.call(messages)
             return reviewed if isinstance(reviewed, str) else str(reviewed)
         except Exception:
-            return method_output  # fallback to raw output on any failure
+            logger.warning(
+                "HITL pre-review failed for %s; falling back to raw output.",
+                func.__name__,
+                exc_info=True,
+            )
+            if learn_strict:
+                raise
+            return method_output
 
     def _distill_and_store_lessons(
         flow_instance: Flow[Any], method_output: Any, raw_feedback: str
@@ -654,6 +704,7 @@ def human_feedback(
            provider=provider,
            learn=learn,
            learn_source=learn_source,
+            learn_strict=learn_strict,
        )
        wrapper.__is_flow_method__ = True
 
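With `learn_strict=True` the re-raised error crosses the flow boundary, so the caller decides what fail-closed means. A sketch, assuming the hypothetical ReportFlow above and the standard `kickoff()` entry point:

    flow = ReportFlow()
    try:
        result = flow.kickoff()
    except Exception:
        # Pre-review failed and was re-raised rather than falling back;
        # retry, alert, or abort here instead of reviewing an unimproved draft.
        raise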
lib/crewai/tests/test_human_feedback_decorator.py
@@ -7,6 +7,7 @@ async support, and attribute preservation functionality.
 from __future__ import annotations
 
 import asyncio
+import logging
 from datetime import datetime
 from typing import Any
 from unittest.mock import MagicMock, patch
@@ -597,6 +598,277 @@ class TestHumanFeedbackLearn:
         assert config.llm == "gpt-4o-mini"
 
 
+class TestHumanFeedbackPreReviewFailure:
+    """Tests for HITL pre-review failure handling (issue #5725).
+
+    Pre-review must NOT silently swallow exceptions: failures should be
+    logged at WARNING level with full traceback, and an opt-in
+    ``learn_strict=True`` should propagate the exception instead of
+    falling back to raw output.
+    """
+
+    @staticmethod
+    def _seeded_memory() -> MagicMock:
+        """Return a mock memory object with one stored lesson so that
+        pre-review proceeds past the ``not matches`` short-circuit."""
+        from crewai.memory.types import MemoryMatch, MemoryRecord
+
+        mem = MagicMock()
+        mem.recall.return_value = [
+            MemoryMatch(
+                record=MemoryRecord(
+                    content="Always include source citations",
+                    embedding=[],
+                ),
+                score=0.9,
+                match_reasons=["semantic"],
+            )
+        ]
+        return mem
+
+    def test_pre_review_llm_failure_logs_warning_and_falls_back(self, caplog):
+        """When the pre-review LLM call raises, the failure is logged at
+        WARNING with exc_info, and the raw method_output is shown to the
+        human (default fail-open behavior, but now observable)."""
+
+        class LearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            def produce(self):
+                return "raw draft"
+
+        flow = LearnFlow()
+        flow.memory = self._seeded_memory()
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return "approved"
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            patch("crewai.llm.LLM") as MockLLM,
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            flow.produce()
+
+        # The human still sees the raw output (fail-open by default).
+        assert captured["shown_to_human"] == "raw draft"
+
+        # ...but the failure is now observable via a WARNING log with traceback.
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "HITL pre-review failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1, (
+            "Expected exactly one HITL pre-review failure warning, got: "
+            f"{[r.getMessage() for r in caplog.records]}"
+        )
+        # exc_info=True must be set so the traceback is captured.
+        assert warning_records[0].exc_info is not None
+        # The exception type/message should be visible in the captured exc_info.
+        exc_type, exc_value, _ = warning_records[0].exc_info
+        assert exc_type is RuntimeError
+        assert "simulated pre-review failure" in str(exc_value)
+
+    def test_pre_review_recall_failure_logs_warning_and_falls_back(self, caplog):
+        """A failure in ``memory.recall`` is also logged and falls back to
+        the raw output instead of being silently swallowed."""
+
+        class LearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            def produce(self):
+                return "raw draft"
+
+        flow = LearnFlow()
+        flow.memory = MagicMock()
+        flow.memory.recall.side_effect = RuntimeError("recall blew up")
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return ""
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            flow.produce()
+
+        assert captured["shown_to_human"] == "raw draft"
+
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "memory recall failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1
+        assert warning_records[0].exc_info is not None
+
+    def test_pre_review_failure_with_learn_strict_propagates(self):
+        """When ``learn_strict=True``, a pre-review LLM failure is re-raised
+        instead of silently falling back to raw output."""
+
+        class StrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            def produce(self):
+                return "raw draft"
+
+        flow = StrictFlow()
+        flow.memory = self._seeded_memory()
+
+        with (
+            patch.object(flow, "_request_human_feedback", return_value="approved"),
+            patch("crewai.llm.LLM") as MockLLM,
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            with pytest.raises(RuntimeError, match="simulated pre-review failure"):
+                flow.produce()
+
+    def test_pre_review_recall_failure_with_learn_strict_propagates(self):
+        """When ``learn_strict=True``, a ``memory.recall`` failure is re-raised
+        instead of silently falling back to raw output."""
+
+        class StrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            def produce(self):
+                return "raw draft"
+
+        flow = StrictFlow()
+        flow.memory = MagicMock()
+        flow.memory.recall.side_effect = RuntimeError("recall blew up")
+
+        with patch.object(flow, "_request_human_feedback", return_value="approved"):
+            with pytest.raises(RuntimeError, match="recall blew up"):
+                flow.produce()
+
+    @pytest.mark.asyncio
+    async def test_async_pre_review_failure_logs_warning_and_falls_back(self, caplog):
+        """The async wrapper exhibits the same logging + fail-open behavior
+        as the sync wrapper on pre-review LLM failure."""
+
+        class AsyncLearnFlow(Flow):
+            @start()
+            @human_feedback(message="Review:", llm="gpt-4o-mini", learn=True)
+            async def produce(self):
+                return "raw draft"
+
+        flow = AsyncLearnFlow()
+        flow.memory = self._seeded_memory()
+
+        captured: dict[str, Any] = {}
+
+        def capture_feedback(message, output, metadata=None, emit=None):
+            captured["shown_to_human"] = output
+            return "approved"
+
+        with (
+            patch.object(flow, "_request_human_feedback", side_effect=capture_feedback),
+            patch("crewai.llm.LLM") as MockLLM,
+            caplog.at_level(logging.WARNING, logger="crewai.flow.human_feedback"),
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            await flow.produce()
+
+        assert captured["shown_to_human"] == "raw draft"
+
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.name == "crewai.flow.human_feedback"
+            and r.levelno == logging.WARNING
+            and "HITL pre-review failed" in r.getMessage()
+        ]
+        assert len(warning_records) == 1
+        assert warning_records[0].exc_info is not None
+
+    @pytest.mark.asyncio
+    async def test_async_pre_review_failure_with_learn_strict_propagates(self):
+        """The async wrapper also re-raises when ``learn_strict=True``."""
+
+        class AsyncStrictFlow(Flow):
+            @start()
+            @human_feedback(
+                message="Review:",
+                llm="gpt-4o-mini",
+                learn=True,
+                learn_strict=True,
+            )
+            async def produce(self):
+                return "raw draft"
+
+        flow = AsyncStrictFlow()
+        flow.memory = self._seeded_memory()
+
+        with (
+            patch.object(flow, "_request_human_feedback", return_value="approved"),
+            patch("crewai.llm.LLM") as MockLLM,
+        ):
+            mock_llm = MagicMock()
+            mock_llm.supports_function_calling.return_value = True
+            mock_llm.call.side_effect = RuntimeError(
+                "simulated pre-review failure"
+            )
+            MockLLM.return_value = mock_llm
+
+            with pytest.raises(RuntimeError, match="simulated pre-review failure"):
+                await flow.produce()
+
+    def test_learn_strict_default_is_false_and_propagates_to_config(self):
+        """``learn_strict`` defaults to False and is exposed on the
+        ``HumanFeedbackConfig`` for introspection."""
+
+        @human_feedback(message="Review:", learn=True)
+        def default_method(self):
+            return "output"
+
+        @human_feedback(message="Review:", learn=True, learn_strict=True)
+        def strict_method(self):
+            return "output"
+
+        assert default_method.__human_feedback_config__.learn_strict is False
+        assert strict_method.__human_feedback_config__.learn_strict is True
+
+
 class TestHumanFeedbackFinalOutputPreservation:
     """Tests for preserving method return value as flow's final output when @human_feedback with emit is terminal.
 
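To run only the new failure-handling tests, something like the following should work (test path from the commit message; class-name filtering via pytest's `-k` option):

    import pytest

    pytest.main([
        "lib/crewai/tests/test_human_feedback_decorator.py",
        "-k", "TestHumanFeedbackPreReviewFailure",
    ])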