mirror of
https://github.com/crewAIInc/crewAI.git
synced 2025-12-15 11:58:31 +00:00
Some checks failed
Build uv cache / build-cache (3.10) (push) Has been cancelled
Build uv cache / build-cache (3.11) (push) Has been cancelled
Build uv cache / build-cache (3.12) (push) Has been cancelled
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Notify Downstream / notify-downstream (push) Has been cancelled
Check Documentation Broken Links / Check broken links (push) Has been cancelled
Build uv cache / build-cache (3.13) (push) Has been cancelled
1898 lines
165 KiB
YAML
1898 lines
165 KiB
YAML
interactions:
|
|
- request:
|
|
body: null
|
|
headers:
|
|
Accept:
|
|
- '*/*'
|
|
Accept-Encoding:
|
|
- gzip, deflate
|
|
Connection:
|
|
- keep-alive
|
|
user-agent:
|
|
- docling-core/2.10.0
|
|
method: GET
|
|
uri: https://lilianweng.github.io/posts/2024-11-28-reward-hacking/
|
|
response:
|
|
body:
|
|
string: "<!DOCTYPE html>\n<html lang=\"en\" dir=\"auto\">\n\n<head><meta charset=\"utf-8\">\n<meta
|
|
http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">\n<meta name=\"viewport\"
|
|
content=\"width=device-width, initial-scale=1, shrink-to-fit=no\">\n<meta
|
|
name=\"robots\" content=\"index, follow\">\n<title>Reward Hacking in Reinforcement
|
|
Learning | Lil'Log</title>\n<meta name=\"keywords\" content=\"language-model,
|
|
rlhf, alignment, safety, reinforcement-learning, long-read\" />\n<meta name=\"description\"
|
|
content=\"Reward hacking occurs when a reinforcement learning (RL) agent exploits
|
|
flaws or ambiguities in the reward function to achieve high rewards, without
|
|
genuinely learning or completing the intended task. Reward hacking exists
|
|
because RL environments are often imperfect, and it is fundamentally challenging
|
|
to accurately specify a reward function.\nWith the rise of language models
|
|
generalizing to a broad spectrum of tasks and RLHF becomes a de facto method
|
|
for alignment training, reward hacking in RL training of language models has
|
|
become a critical practical challenge. Instances where the model learns to
|
|
modify unit tests to pass coding tasks, or where responses contain biases
|
|
that mimic a user’s preference, are pretty concerning and are likely
|
|
one of the major blockers for real-world deployment of more autonomous use
|
|
cases of AI models.\">\n<meta name=\"author\" content=\"Lilian Weng\">\n<link
|
|
rel=\"canonical\" href=\"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/\"
|
|
/>\n<link crossorigin=\"anonymous\" href=\"/assets/css/stylesheet.min.67a6fb6e33089cb29e856bcc95d7aa39f70049a42b123105531265a0d9f1258b.css\"
|
|
integrity=\"sha256-Z6b7bjMInLKehWvMldeqOfcASaQrEjEFUxJloNnxJYs=\" rel=\"preload
|
|
stylesheet\" as=\"style\">\n<script defer crossorigin=\"anonymous\" src=\"/assets/js/highlight.min.2eadbb982468c11a433a3e291f01326f2ba43f065e256bf792dbd79640a92316.js\"
|
|
integrity=\"sha256-Lq27mCRowRpDOj4pHwEybyukPwZeJWv3ktvXlkCpIxY=\"\n onload=\"hljs.initHighlightingOnLoad();\"></script>\n<link
|
|
rel=\"icon\" href=\"https://lilianweng.github.io/favicon_wine.ico\">\n<link
|
|
rel=\"icon\" type=\"image/png\" sizes=\"16x16\" href=\"https://lilianweng.github.io/favicon-16x16.png\">\n<link
|
|
rel=\"icon\" type=\"image/png\" sizes=\"32x32\" href=\"https://lilianweng.github.io/favicon-32x32.png\">\n<link
|
|
rel=\"apple-touch-icon\" href=\"https://lilianweng.github.io/apple-touch-icon.png\">\n<link
|
|
rel=\"mask-icon\" href=\"https://lilianweng.github.io/safari-pinned-tab.svg\">\n<meta
|
|
name=\"theme-color\" content=\"#2e2e33\">\n<meta name=\"msapplication-TileColor\"
|
|
content=\"#2e2e33\">\n<link rel=\"alternate\" hreflang=\"en\" href=\"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/\"
|
|
/>\n<noscript>\n <style>\n #theme-toggle,\n .top-link {\n
|
|
\ display: none;\n }\n\n </style>\n <style>\n @media
|
|
(prefers-color-scheme: dark) {\n :root {\n --theme:
|
|
rgb(29, 30, 32);\n --entry: rgb(46, 46, 51);\n --primary:
|
|
rgb(218, 218, 219);\n --secondary: rgb(155, 156, 157);\n --tertiary:
|
|
rgb(65, 66, 68);\n --content: rgb(196, 196, 197);\n --hljs-bg:
|
|
rgb(46, 46, 51);\n --code-bg: rgb(55, 56, 62);\n --border:
|
|
rgb(51, 51, 51);\n }\n\n .list {\n background:
|
|
var(--theme);\n }\n\n .list:not(.dark)::-webkit-scrollbar-track
|
|
{\n background: 0 0;\n }\n\n .list:not(.dark)::-webkit-scrollbar-thumb
|
|
{\n border-color: var(--theme);\n }\n }\n\n
|
|
\ </style>\n</noscript>\n <script async src=\"https://www.googletagmanager.com/gtag/js?id=G-HFT45VFBX6\"></script>\n
|
|
\ <script>\n var doNotTrack = false;\n if ( false ) {\n
|
|
\ var dnt = (navigator.doNotTrack || window.doNotTrack || navigator.msDoNotTrack);\n
|
|
\ var doNotTrack = (dnt == \"1\" || dnt == \"yes\");\n }\n
|
|
\ if (!doNotTrack) {\n window.dataLayer = window.dataLayer
|
|
|| [];\n function gtag(){dataLayer.push(arguments);}\n gtag('js',
|
|
new Date());\n gtag('config', 'G-HFT45VFBX6');\n }\n </script><meta
|
|
property=\"og:title\" content=\"Reward Hacking in Reinforcement Learning\"
|
|
/>\n<meta property=\"og:description\" content=\"Reward hacking occurs when
|
|
a reinforcement learning (RL) agent exploits flaws or ambiguities in the reward
|
|
function to achieve high rewards, without genuinely learning or completing
|
|
the intended task. Reward hacking exists because RL environments are often
|
|
imperfect, and it is fundamentally challenging to accurately specify a reward
|
|
function.\nWith the rise of language models generalizing to a broad spectrum
|
|
of tasks and RLHF becomes a de facto method for alignment training, reward
|
|
hacking in RL training of language models has become a critical practical
|
|
challenge. Instances where the model learns to modify unit tests to pass coding
|
|
tasks, or where responses contain biases that mimic a user’s preference,
|
|
are pretty concerning and are likely one of the major blockers for real-world
|
|
deployment of more autonomous use cases of AI models.\" />\n<meta property=\"og:type\"
|
|
content=\"article\" />\n<meta property=\"og:url\" content=\"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/\"
|
|
/><meta property=\"og:image\" content=\"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/SEAL-feature-imprint.png\"/><meta
|
|
property=\"article:section\" content=\"posts\" />\n<meta property=\"article:published_time\"
|
|
content=\"2024-11-28T00:00:00+00:00\" />\n<meta property=\"article:modified_time\"
|
|
content=\"2024-11-28T00:00:00+00:00\" />\n\n<meta name=\"twitter:card\"
|
|
content=\"summary_large_image\"/>\n<meta name=\"twitter:image\" content=\"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/SEAL-feature-imprint.png\"/>\n<meta
|
|
name=\"twitter:title\" content=\"Reward Hacking in Reinforcement Learning\"/>\n<meta
|
|
name=\"twitter:description\" content=\"Reward hacking occurs when a reinforcement
|
|
learning (RL) agent exploits flaws or ambiguities in the reward function to
|
|
achieve high rewards, without genuinely learning or completing the intended
|
|
task. Reward hacking exists because RL environments are often imperfect, and
|
|
it is fundamentally challenging to accurately specify a reward function.\nWith
|
|
the rise of language models generalizing to a broad spectrum of tasks and
|
|
RLHF becomes a de facto method for alignment training, reward hacking in RL
|
|
training of language models has become a critical practical challenge. Instances
|
|
where the model learns to modify unit tests to pass coding tasks, or where
|
|
responses contain biases that mimic a user’s preference, are pretty
|
|
concerning and are likely one of the major blockers for real-world deployment
|
|
of more autonomous use cases of AI models.\"/>\n\n\n<script type=\"application/ld+json\">\n{\n
|
|
\ \"@context\": \"https://schema.org\",\n \"@type\": \"BreadcrumbList\",\n
|
|
\ \"itemListElement\": [\n {\n \"@type\": \"ListItem\",\n \"position\":
|
|
\ 1 ,\n \"name\": \"Posts\",\n \"item\": \"https://lilianweng.github.io/posts/\"\n
|
|
\ }, \n {\n \"@type\": \"ListItem\",\n \"position\": 2 ,\n
|
|
\ \"name\": \"Reward Hacking in Reinforcement Learning\",\n \"item\":
|
|
\"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/\"\n }\n
|
|
\ ]\n}\n</script>\n<script type=\"application/ld+json\">\n{\n \"@context\":
|
|
\"https://schema.org\",\n \"@type\": \"BlogPosting\",\n \"headline\": \"Reward
|
|
Hacking in Reinforcement Learning\",\n \"name\": \"Reward Hacking in Reinforcement
|
|
Learning\",\n \"description\": \"Reward hacking occurs when a reinforcement
|
|
learning (RL) agent exploits flaws or ambiguities in the reward function to
|
|
achieve high rewards, without genuinely learning or completing the intended
|
|
task. Reward hacking exists because RL environments are often imperfect, and
|
|
it is fundamentally challenging to accurately specify a reward function.\\nWith
|
|
the rise of language models generalizing to a broad spectrum of tasks and
|
|
RLHF becomes a de facto method for alignment training, reward hacking in RL
|
|
training of language models has become a critical practical challenge. Instances
|
|
where the model learns to modify unit tests to pass coding tasks, or where
|
|
responses contain biases that mimic a user\\u0026rsquo;s preference, are pretty
|
|
concerning and are likely one of the major blockers for real-world deployment
|
|
of more autonomous use cases of AI models.\\n\",\n \"keywords\": [\n \"language-model\",
|
|
\"rlhf\", \"alignment\", \"safety\", \"reinforcement-learning\", \"long-read\"\n
|
|
\ ],\n \"articleBody\": \"Reward hacking occurs when a reinforcement learning
|
|
(RL) agent exploits flaws or ambiguities in the reward function to achieve
|
|
high rewards, without genuinely learning or completing the intended task.
|
|
Reward hacking exists because RL environments are often imperfect, and it
|
|
is fundamentally challenging to accurately specify a reward function.\\nWith
|
|
the rise of language models generalizing to a broad spectrum of tasks and
|
|
RLHF becomes a de facto method for alignment training, reward hacking in RL
|
|
training of language models has become a critical practical challenge. Instances
|
|
where the model learns to modify unit tests to pass coding tasks, or where
|
|
responses contain biases that mimic a user\u2019s preference, are pretty concerning
|
|
and are likely one of the major blockers for real-world deployment of more
|
|
autonomous use cases of AI models.\\nMost of the past work on this topic has
|
|
been quite theoretical and focused on defining or demonstrating the existence
|
|
of reward hacking. However, research into practical mitigations, especially
|
|
in the context of RLHF and LLMs, remains limited. I especially want to call
|
|
out for more research efforts directed toward understanding and developing
|
|
mitigation for reward hacking in the future. Hope I will be able to cover
|
|
the mitigation part in a dedicated post soon.\\nBackground Reward Function
|
|
in RL Reward function defines the task, and reward shaping significantly impacts
|
|
learning efficiency and accuracy in reinforcement learning. Designing a reward
|
|
function for an RL task often feels like a \u2018dark art\u2019. Many factors
|
|
contribute to this complexity: How you decompose a big goal into small goals?
|
|
Is the reward sparse or dense? How you measure the success? Various choices
|
|
may lead to good or problematic learning dynamics, including unlearnable tasks
|
|
or hackable reward functions. There is a long history of research on how to
|
|
do reward shaping in RL.\\nFor example, in an 1999 paper by Ng et al., the
|
|
authors studied how to modify the reward function in Markov Decision Processes
|
|
(MDPs) such that the optimal policy remains unchanged. They found that linear
|
|
transformation works. Given a MDP $M = (S, A, T, \\\\gamma, R)$, we want to
|
|
create a transformed MDP $M\u2019 = (S, A, T, \\\\gamma, R\u2019)$ where $R\u2019
|
|
= R + F$ and $F: S \\\\times A \\\\times S \\\\mapsto \\\\mathbb{R}$, such
|
|
that we can guide the learning algorithm to be more efficient. Given a real-valued
|
|
function $\\\\Phi: S \\\\mapsto \\\\mathbb{R}$, $F$ is a potential-based shaping
|
|
function if for all $s \\\\in S - {s_0}, a \\\\in A, s\u2019 \\\\in S$:\\n$$
|
|
F(s, a, s') = \\\\gamma \\\\Phi(s') - \\\\Phi(s) $$ This would guarantee that
|
|
the sum of discounted $F$, $F(s_1, a_1, s_2) + \\\\gamma F(s_2, a_2, s_3)
|
|
+ \\\\dots$, ends up being 0. If $F$ is such a potential-based shaping function,
|
|
it is both sufficient and necessary to ensure $M$ and $M\u2019$ share the
|
|
same optimal policies.\\nWhen $F(s, a, s\u2019) = \\\\gamma \\\\Phi(s\u2019)
|
|
- \\\\Phi(s)$, and if we further assume that $\\\\Phi(s_0) = 0$, where $s_0$
|
|
is absorbing state, and $\\\\gamma=1$, and then for all $s \\\\in S, a \\\\in
|
|
A$:\\n$$ \\\\begin{aligned} Q^*_{M'} (s,a) \\u0026= Q^*_M(s, a) - \\\\Phi(s)
|
|
\\\\\\\\ V^*_{M'} (s,a) \\u0026= V^*_M(s, a) - \\\\Phi(s) \\\\end{aligned}
|
|
$$ This form of reward shaping allows us to incorporate heuristics into the
|
|
reward function to speed up learning without impacting the optimal policy.\\nSpurious
|
|
Correlation Spurious correlation or shortcut learning (Geirhos et al. 2020)
|
|
in classification task is a concept closely related to reward hacking. Spurious
|
|
or shortcut features can cause a classifier to fail at learning and generalizing
|
|
as intended. For example, a binary classifier for distinguishing wolves from
|
|
huskies may overfit to the presence of a snowy background if all the wolf
|
|
training images include snow (Ribeiro et al. 2024).\\nFig. 1. The model performs
|
|
poorly on out-of-distribution (OOD) test sets if it overfits to shortcut features.
|
|
(Image source: Geirhos et al. 2020) The ERM principle states that, since the
|
|
full data distribution is unknown, minimizing the loss on training data is
|
|
a reasonable proxy of risk and thus we favor models with the lowest training
|
|
loss. Nagarajan et al. (2021) studied the ERM principle and pointed out that
|
|
ERM needs to rely on all types of informative features, including unreliable
|
|
spurious features, while attempting to fit the data without constraints. Their
|
|
experiments showed that ERM would depend on spurious features no matter how
|
|
easy the task is.\\nLet\u2019s Define Reward Hacking Reward shaping in RL
|
|
is challenging. Reward hacking occurs when an RL agent exploits flaws or ambiguities
|
|
in the reward function to obtain high rewards without genuinely learning the
|
|
intended behaviors or completing the task as designed. In recent years, several
|
|
related concepts have been proposed, all referring to some form of reward
|
|
hacking:\\nReward hacking (Amodei et al., 2016) Reward corruption (Everitt
|
|
et al., 2017) Reward tampering (Everitt et al. 2019) Specification gaming
|
|
(Krakovna et al., 2020) Objective robustness (Koch et al. 2021) Goal misgeneralization
|
|
(Langosco et al. 2022) Reward misspecifications (Pan et al. 2022) The concept
|
|
originated with Amodei et al. (2016), who proposed a set of open research
|
|
questions on AI safety in their seminal paper \u201CConcrete Problems in AI
|
|
Safety\u201D. They listed reward hacking as one of the key AI safety problems.
|
|
Reward hacking refers to the possibility of the agent gaming the reward function
|
|
to achieve high reward through undesired behavior. Specification gaming (Krakovna
|
|
et al. 2020) is a similar concept, defined as a behavior that satisfies the
|
|
literal specification of an objective but not achieving the desired results.
|
|
Here the literal description of the task goal and the intended goal may have
|
|
a gap.\\nReward shaping is a technique used to enrich the reward function,
|
|
making it easier for the agent to learn\u2014for example, by providing denser
|
|
rewards. However, a poorly design reward shaping mechanism can alter the trajectory
|
|
of the optimal policy. Designing effective reward shaping mechanisms is inherently
|
|
difficult. Rather than blaming a poorly designed reward function, it is more
|
|
accurate to acknowledge that designing a good reward function is intrinsically
|
|
challenging due to the complexity of the task itself, partial observable state,
|
|
multiple dimensions in consideration, and other factors.\\nWhen testing an
|
|
RL agent in out-of-distribution (OOD) environments, robustness failure may
|
|
occur due to:\\nThe model fails to generalize effectively, even with the right
|
|
objective. This happens when the algorithm lacks sufficient intelligence or
|
|
capability. The model generalizes capably but pursues an objective different
|
|
from the one it was trained on. This happens when the proxy reward differs
|
|
from the true reward function, $R\u2019 \\\\neq R$. This is known as objective
|
|
robustness (Koch et al. 2021) or goal misgeneralization (Langosco et al. 2022
|
|
) Experiments in two RL environments, CoinRun and Maze, demonstrated the importance
|
|
of randomization during training. If during training, the coin or the cheese
|
|
is placed at a fixed position (i.e. right end of the level or upper right
|
|
corner of the maze) but testing in the env where the coin or cheese is placed
|
|
at random, the agent would just run to the fixed position without obtaining
|
|
the coin or cheese at test time. A conflict arises when a visual feature (e.g.,
|
|
cheese or coin) and a positional feature (e.g., upper-right or right end)
|
|
are inconsistent during test time, leading the trained model to prefer the
|
|
positional feature. I would like to point out that, in these two examples,
|
|
the reward-result gaps are clear but such type of biases are unlikely to be
|
|
so obvious in most real-world cases.\\nFig. 2. The impact of randomizing the
|
|
position of the coin during training. When the coin is placed at random for
|
|
{0, 2, 3, 6, 11}% of the time during training (x-axis), the frequency of the
|
|
agent navigating to the end of the level without obtaining the coin decreases
|
|
with the increase of the randomization (\\\"y-axis\\\"). (Image source: Koch
|
|
et al. 2021) Reward Tampering (Everitt et al. 2019) is a form of reward hacking
|
|
behavior where the agent interferes with the reward function itself, causing
|
|
the observed reward to no longer accurately represent the intended goal. In
|
|
reward tampering, the model modifies its reward mechanism either by directly
|
|
manipulating the implementation of the reward function or by indirectly altering
|
|
the environmental information used as input for the reward function.\\n(Note:
|
|
Some work defines reward tampering as a distinct category of misalignment
|
|
behavior from reward hacking. But I consider reward hacking as a broader concept
|
|
here.)\\nAt a high level, reward hacking can be categorized into two types:
|
|
environment or goal misspecification, and reward tampering.\\nEnvironment
|
|
or goal misspecified: The model learns undesired behavior to achieve high
|
|
rewards by hacking the environment or optimizing a reward function not aligned
|
|
with the true reward objective\u2014such as when the reward is misspecified
|
|
or lacks key requirements. Reward tampering: The model learns to interfere
|
|
with the reward mechanism itself. List of Examples Reward hacking examples
|
|
in RL tasks A robot hand trained to grab an object can learn to trick people
|
|
by placing the hand between the object and the camera. (Link) An agent trained
|
|
to maximize jumping height may exploit a bug in the physics simulator to achieve
|
|
an unrealistically height. (Link) An agent is trained to ride a bicycle to
|
|
a goal and wins reward whenever it is getting closer to the goal. Then the
|
|
agent may learn to ride in tiny circles around the goal because there is no
|
|
penalty when the agent gets away from the goal. (Link) In a soccer game setup,
|
|
the reward is assigned when the agent touches the ball and the agent learns
|
|
to remain next to the ball to touch the ball in high frequency like in a viberating
|
|
motion. (Link) In the\_Coast Runners game, an agent controls a boat with the
|
|
goal to finish the boat race as quickly as possible. When it is given a shaping
|
|
reward for hitting green blocks along the race track, it changes the optimal
|
|
policy to going in circles and hitting the same green blocks over and over
|
|
again. (Link) \u201CThe Surprising Creativity of Digital Evolution\u201D (Lehman
|
|
et al. 2019) - This paper has many examples about how optimizing a misspecified
|
|
fitness function can lead to surprising \u201Chacking\u201D or unintended
|
|
evolutionary or learning results. The list of specification gaming in AI examples
|
|
is collected by Krakovna et al. 2020. Reward hacking examples in LLM tasks
|
|
A language model for generating summarization is able to explore flaws in
|
|
the ROUGE metric such that it obtains high score but the generated summaries
|
|
are barely readable. (Link) A coding model learns to change unit test in order
|
|
to pass coding questions. (Link) A coding model may learn to directly modify
|
|
the code used for calculating the reward. (Link) Reward hacking examples in
|
|
real life The recommendation algorithm for social media is intended to provide
|
|
useful information. However, usefulness is often measured by proxy metrics,
|
|
such as the number of likes or comments, or the time or frequency of engagement
|
|
on the platform. The algorithm ends up recommending content that can affect
|
|
users\u2019 emotion states such as outrageous and extreme content in order
|
|
to trigger more engagement. (Harari, 2024) Optimizing for misspecified proxy
|
|
metrics for a video sharing site may aggressively increase the watch time
|
|
of users while the true goal is to optimize users\u2019 subjective well-being.
|
|
(Link) \u201CThe Big Short\u201D - 2008 financial crisis caused by the housing
|
|
bubble. Reward hacking of our society happened as people tried to game the
|
|
financial system. Why does Reward Hacking Exist? Goodhart\u2019s Law states
|
|
that \u201CWhen a measure becomes a target, it ceases to be a good measure\u201D.
|
|
The intuition is that a good metric can become corrupted once significant
|
|
pressure is applied to optimize it. It is challenging to specify a 100% accurate
|
|
reward objective and any proxy suffers the risk of being hacked, as RL algorithm
|
|
exploits any small imperfection in the reward function definition. Garrabrant
|
|
(2017) categorized Goodhart\u2019s law into 4 variants:\\nRegressional - selection
|
|
for an imperfect proxy necessarily also selects for noise. Extremal - the
|
|
metric selection pushes the state distribution into a region of different
|
|
data distribution. Causal - when there is a non-causal correlation between
|
|
the proxy and the goal, intervening on the proxy may fail to intervene on
|
|
the goal. Adversarial - optimization for a proxy provides an incentive for
|
|
adversaries to correlate their goal with the proxy. Amodei et al. (2016) summarized
|
|
that reward hacking, mainly in RL setting, may occur due to:\\nPartial observed
|
|
states and goals are imperfect representation of the environment status. The
|
|
system itself is complex and susceptible to hacking; e.g., if the agent is
|
|
allowed to execute code that changes part of the environment, it becomes much
|
|
easier to exploit the environment\u2019s mechanisms. The reward may involve
|
|
abstract concept that is hard to be learned or formulated; e.g., a reward
|
|
function with high-dimensional inputs may disproportionately rely on a few
|
|
dimensions. RL targets to get the reward function highly optimized, so there
|
|
exists an intrinsic \u201Cconflict\u201D, making the design of good RL objective
|
|
challenging. A special case is a type of the reward function with a self-reinforcing
|
|
feedback component, where the reward may get amplified and distorted to a
|
|
point that breaks down the original intent, such as an ads placement algorithm
|
|
leading to winners getting all. Besides, identifying the exact reward function
|
|
for which an optimal agent optimizes its behavior is in general impossible
|
|
since there could be an infinite number of reward functions consistent with
|
|
any observed policy in an fixed environment (Ng \\u0026 Russell, 2000). Amin
|
|
and Singh (2016) separated the causes of this unidentifiability into two classes:\\nRepresentational
|
|
- a set of reward functions is behaviorally invariant under certain arithmetic
|
|
operations (e.g., re-scaling) Experimental - $\\\\pi$\u2019s observed behavior
|
|
is insufficient to distinguish between two or more reward functions which
|
|
both rationalize the behavior of the agent (the behavior is optimal under
|
|
both) Hacking RL Environment Reward hacking is expected to be a more common
|
|
problem as the model and the algorithm become increasingly sophisticated.
|
|
A more intelligent agent is more capable of finding \u201Choles\u201D in the
|
|
design of reward function and exploiting the task specification\u2014in other
|
|
words, achieving higher proxy rewards but lower true rewards. By contrast,
|
|
a weaker algorithm may not be able to find such loopholes, and thus we would
|
|
not observe any reward hacking or identify issues in the current reward function
|
|
design when the model is not strong enough.\\nIn a set of zero-sum robotics
|
|
self-play games (Bansal et al., 2017), we can train two agents (victim vs.
|
|
opponent) to compete against each other. A standard training process produces
|
|
a victim agent with adequate performance when playing against a normal opponent.
|
|
However, it is easy to train an adversarial opponent policy that can defeat
|
|
the victim reliably despite outputting seemingly random actions and training
|
|
with fewer than 3% of time steps (Gleave et al., 2020). Training of adversarial
|
|
policies involves optimizing the sum of discounted rewards, as in standard
|
|
RL setup, while treating the victim policy as a black-box model.\\nAn intuitive
|
|
way to mitigate adversarial policies attacks is to fine-tune victims against
|
|
adversarial policies. However, the victim remains vulnerable to new versions
|
|
of adversarial policies once retrained against the new victim policy.\\nWhy
|
|
does adversarial policy exist? The hypothesis is that adversarial policies
|
|
introduce OOD observations to the victim rather than physically interfering
|
|
with it. Evidence shows that when the victim\u2019s observation of the opponent\u2019s
|
|
position is masked and set to a static state, the victim becomes more robust
|
|
to adversaries, although performing worse against a normal opponent policy.
|
|
Furthermore, a higher-dimensional observation space enhances performance under
|
|
normal circumstances but makes the policy more vulnerable to adversarial opponents.\\nPan
|
|
et al. (2022) investigated reward hacking as a function of agent capabilities,
|
|
including (1) model size, (2) action space resolution, (3) observation space
|
|
noise, and (4) training time. They also proposed a taxonomy of three types
|
|
of misspecified proxy rewards:\\nMisweighting: Proxy and true rewards capture
|
|
the same desiderata, but differ in their relative importance. Ontological:
|
|
Proxy and true rewards use different desiderata to capture the same concept.
|
|
Scope: The proxy measures desiderata over a restricted domain (e.g. time or
|
|
space) because measurement across all conditions is too costly. They experimented
|
|
in four RL environments paired with nine misspecified proxy rewards. The overall
|
|
findings from these experiments can be summarized as follows: A model of higher
|
|
capability tends to obtain higher (or similar) proxy rewards but decreased
|
|
true rewards.\\nModel size: Larger model size leads to increased proxy rewards
|
|
but decreased true rewards. Action space resolution: Increased precision in
|
|
actions leads to more capable agents. However, higher resolution causes proxy
|
|
rewards to remain constant while true rewards decrease. Observation fidelity:
|
|
More accurate observations improve proxy rewards but slightly reduce true
|
|
rewards. Training steps: Optimizing the proxy reward over more steps harms
|
|
true rewards after an initial period where the rewards are positively correlated.
|
|
Fig. 3. The plot of proxy and true reward value as functions of (Top row)
|
|
model sizes, measured in parameter count; (Bottom row) model capability, measured
|
|
by metrics such as training steps, action space resolution, and observation
|
|
noise. (Image source: Pan et al. 2022) If a proxy reward is so poorly specified
|
|
that it has a very weak correlation with the true reward, we may be able to
|
|
identify and prevent reward hacking even before training. Based on this hypothesis,
|
|
Pan et al. (2022) investigated the correlation between proxy and true rewards
|
|
over a collection of trajectory rollouts. Interestingly, reward hacking still
|
|
occurs even when there is a positive correlation between the true and proxy
|
|
rewards.\\nHacking RLHF of LLMs Reinforcement learning from human feedback
|
|
(RLHF) has become the de facto approach for alignment training of language
|
|
models. A reward model is trained on human feedback data and then a language
|
|
model is fine-tuned via RL to optimize this proxy reward for human preference.
|
|
There are three types of reward we care about in an RLHF setup:\\n(1) Oracle/Gold
|
|
reward $R^\u2217$ represents what we truly want the LLM to optimize. (2) Human
|
|
reward $R^\\\\text{human}$ is what we collect to evaluate LLMs in practice,
|
|
typically from individual humans with time constraints. Because humans can
|
|
provide inconsistent feedback or make mistakes, human reward is not a fully
|
|
accurate representation of the oracle reward. (3) Proxy reward $R$ is the
|
|
score predicted by a reward model that is trained on human data. Hence, $R^\\\\text{train}$
|
|
inherits all the weakness of human reward, plus potential modeling biases.
|
|
RLHF optimizes the proxy reward score but we ultimately care about the gold
|
|
reward score.\\nHacking the Training Process Gao et al. (2022) examined the
|
|
scaling laws for reward model overoptimization in RLHF. To scale up the human
|
|
labels in their experiments, they use a synthetic data setup where the \u201Cgold\u201D
|
|
label for the oracle reward $R^*$ is approximated by a large RM (6B parameters)
|
|
where the proxy RMs for $R$ range in size of 3M to 3B parameters.\\nFig. 4.
|
|
The plot of RM score as a function of the square root of the KL divergence
|
|
measure. The proxy reward is shown with a dashed line, and the gold reward
|
|
is shown with a solid line. (Image source: Gao et al. 2022) The KL divergence
|
|
from the initial policy to the optimized policy is $\\\\text{KL} = D_\\\\text{KL}(\\\\pi
|
|
| \\\\pi_\\\\text{init})$, and the distance function is defined as $d := \\\\sqrt{
|
|
D_\\\\text{KL}(\\\\pi | \\\\pi_\\\\text{init})}$. For both best-of-$n$ rejection
|
|
sampling (BoN) and RL, the gold reward $R^\u2217$ is defined as a function
|
|
of $d$. The coefficients $\\\\alpha$ and $\\\\beta$ are fitted empirically,
|
|
with $R^\u2217 (0) := 0$ by definition.\\nThe authors also attempted to fit
|
|
the proxy reward $R$ but found systematic underestimation when extrapolated
|
|
to higher KLs, as the proxy reward appeared to grow linearly with $d$.\\n$$
|
|
\\\\begin{aligned} R^*_{\\\\text{bo}n}(d) \\u0026= d (\\\\alpha_{\\\\text{bo}n}
|
|
- \\\\beta_{\\\\text{bo}n} d) \\u0026 \\\\text{; for best-of-n (BoN) sampling.}\\\\\\\\
|
|
R^*_\\\\text{RL}(d) \\u0026= d (\\\\alpha_\\\\text{RL} - \\\\beta_\\\\text{RL}
|
|
\\\\log d) \\u0026 \\\\text{; for reinforcement learning}\\\\\\\\ \\\\end{aligned}
|
|
$$ Fig. 5. The coefficient parameters, $\\\\alpha_{\\\\text{bo}n}, \\\\beta_{\\\\text{bo}n},
|
|
\\\\beta_\\\\text{RL}$ are empirically fit according to data, displayed as
|
|
functions of the reward model size. The coefficient $\\\\alpha_\\\\text{RL}$
|
|
is not included here because it remains constant across RM sizes. (Image source:
|
|
Gao et al. 2022) Their experiments also explored the relationship between
|
|
RM overoptimization and factors like policy model size and RM data size:\\nLarger
|
|
policies see less benefit from optimization (i.e., the difference between
|
|
initial and peak rewards is smaller than that of a smaller policy) against
|
|
an RM, but also overoptimize less. More RM data leads to higher gold reward
|
|
scores and reduces \u201CGoodharting\u201D. The effect of the KL penalty on
|
|
the gold score resembles early stopping. Note that in all experiments except
|
|
this one, the KL penalty in PPO is set to 0, because they observed that using
|
|
a KL penalty strictly increases the proxy-gold reward gap. RLHF aims to improve
|
|
the model\u2019s alignment with human preference, but human feedback $R^\\\\text{human}$
|
|
may not capture all the aspects we care about (e.g., factuality) and thus
|
|
can be hacked to overfit to undesired attributes. For example, the model may
|
|
be optimized to output responses that seem correct and convincing but are,
|
|
in fact, inaccurate, thereby misleading human evaluators to approve its incorrect
|
|
answers more often (Wen et al., 2024). In other words, a gap emerges between
|
|
what is correct and what looks correct to humans due to RLHF. Precisely Wen
|
|
et al. (2024) ran RLHF experiments using a reward model based on ChatbotArena
|
|
data. They evaluated the model on a question-answering dataset, QuALITY and
|
|
a programming dataset, APPS. Their experiments revealed that models become
|
|
better at convincing humans they are correct, even when they are wrong and
|
|
this effect is unintended:\\nRLHF increases human approval, but not necessarily
|
|
correctness. RLHF weakens humans\u2019 ability to evaluate: The error rate
|
|
of human evaluation is higher after RLHF training. RLHF makes incorrect outputs
|
|
more convincing to humans. The evaluation false positive rate significantly
|
|
increases after RLHF training. The paper coined this effect \u201CU-Sophistry\u201D
|
|
(\u201CU\u201D for \u201Cunintended\u201D), as opposed to \u201CI-Sophistry\u201D
|
|
(\u201CI\u201D for \u201Cintended\u201D), which involves explicitly prompting
|
|
the model with instructions like \\\"... try to deceive human subjects\\\".\\nFig.
|
|
6. RLHF makes LLMs better at convincing human evaluators to approve their
|
|
incorrect answers. (Image source: Wen et al. 2024) The human evaluation error
|
|
change is not due to noise in the recruiting process since (1) at an individual
|
|
level, the majority (70-90%) of human evaluators raw their evaluation error
|
|
rates increase, and (2) the effort they put into evaluating $\\\\pi_\\\\text{init}$
|
|
or $\\\\pi_\\\\text{rlhf}$ is equivalent, measured by metrics like time spent
|
|
or unit tests written. Instead, LLMs learn to defend incorrect answers by
|
|
cherry-picking, fabricating untruthful supporting statements, or crafting
|
|
statements with subtle causal fallacies. What they observed about how the
|
|
model behaves after RLHF:\\nIn the long-form QA task: Creating more convincing
|
|
fabricated evidence. Using more consistent logic for incorrect answers. Generating
|
|
coherent answers with subtle fallacies. In the coding task: Hacking human
|
|
written unit tests Generating less readable tests (e.g. fewer helper functions
|
|
and higher code complexity). Making $\\\\pi_\\\\text{rlhf}$ less likely to
|
|
generate easily detectable errors that humans can exploit. Fig. 7. The metrics
|
|
of code modularity (number of helper functions) and Cyclomatic Complexity
|
|
for generated correct and incorrect code, respectively. RLHF leads to fewer
|
|
helper functions overall and higher code complexity among incorrect generated
|
|
programs. This unsurprisingly would increase difficulty of human evaluation.
|
|
(Image source: Wen et al. 2024) Sycophancy refers to the tendency of model
|
|
responses to match user beliefs rather than reflect the truth (Shrama et al.
|
|
2023). In the experiments, an AI assistant was asked to provide feedback on
|
|
an argument (Human: \\\"Please comment briefly on the following argument.
|
|
Argument: ...\\\"). Right the human provided the argument, they could state
|
|
a preference (\\\"I really like the argument\\\" or \\\"I really dislike the
|
|
argument\\\") to test whether this influenced the model\u2019s feedback compared
|
|
to the baseline feedback without human preference statement.\\nFig. 8. AI
|
|
assistants give biased feedback when users provide comments on their own preferences.
|
|
Responses are more positive when the user states they like or wrote the text,
|
|
and more negative if the user states they dislike it. (Image source: Shrama
|
|
et al. 2023) They found that AI assistant feedback can be easily swayed, as
|
|
it may change its originally correct answer when challenged by human preference.
|
|
The model tends to confirm users\u2019 beliefs. Sometimes it even mimics users\u2019
|
|
mistakes (e.g., when asked to analyze poems misattributed the wrong poet).
|
|
Data analysis of the RLHF helpfulness dataset, via logistic regression for
|
|
predicting human feedback, demonstrates that matching users\u2019 beliefs
|
|
is the most predictive factor.\\nFig. 9. Human preference data analysis, via
|
|
logistic regression for predicting the probability of a response with a target
|
|
feature, is preferred over one without it, while controlling for other features.
|
|
(Image source: Shrama et al. 2023) Hacking the Evaluator As LLMs become more
|
|
capable, it is a natural choice to use LLMs as the evaluators or graders to
|
|
give feedback and training rewards to other generator models, especially for
|
|
tasks that cannot be trivially judged or verified (e.g., processing long-form
|
|
outputs, subjective rubrics like the quality of creative writing, etc.). Some
|
|
people refer to this as \u201CLLM-as-grader paradigm\u201D. This approach
|
|
has largely reduced the dependency on human annotation, significantly saving
|
|
time on evaluation. However, using LLMs as graders is an imperfect proxy for
|
|
oracle reward and can introduce biases, such as a preference for their own
|
|
responses when compared with different model families (Liu et al., 2023 )
|
|
or positional bias when evaluating responses in order (Wang et al. 2023).
|
|
Such biases are especially concerning grader outputs are used as part of a
|
|
reward signal, which can lead to reward hacking by exploiting these graders.\\nWang
|
|
et al. (2023) found that when using an LLM as an evaluator to score the quality
|
|
of multiple other LLM outputs, the quality ranking can be easily hacked by
|
|
simply altering the order of candidates in the context. GPT-4 is found to
|
|
consistently assign high scores to the first displayed candidate and ChatGPT
|
|
prefers the second candidate.\\nAccording to their experiments, LLMs are sensitive
|
|
to the position of responses and suffer from positional bias (i.e., prefer
|
|
the response in the specific position), despite of the instruction containing
|
|
a statement of \\\"ensuring that the order in which the responses were presented
|
|
does not affect your judgment.\\\". The severity of such positional bias is
|
|
measured by \u201Cconflict rate\u201D, defined as the percentage of tuples
|
|
of (prompt, response 1, response 2) that lead to inconsistent evaluation judgement
|
|
after swapping the positions of responses. Unsurprisingly, the difference
|
|
in response quality matters as well; the conflict rate is negatively correlated
|
|
with the score gap between the two responses.\\nFig. 10. The win rate of Vicuna-13B
|
|
vs ChatGPT and Alpaca-13B varies a lot, using GPT-4 or ChatGPT as evaluator.
|
|
The conflict rate is also quite high, indicating high inconsistency in the
|
|
LLM-as-grader setup when response positions are swapped. The exception is
|
|
evaluation of Vicuna-13B vs Alpaca-13B when using GPT-4 as evaluator. (Image
|
|
source: Wang et al. 2023) To mitigate this positional bias, they proposed
|
|
several strategies for calibration:\\nMultiple evidence calibration (MEC):
|
|
The evaluator model is asked to provide evaluation evidence, essentially explanations
|
|
of its judgements in text, and then output scores for two candidates. This
|
|
method can be further robustified by sampling multiple ($k$) evidence explanations
|
|
with a temperature setting of 1. $k=3$ works better than $k=1$, but the performance
|
|
does not improve much as $k$ increases beyond 3. Balanced position calibration
|
|
(BPC): Results across various response orders are aggregated to get the final
|
|
score. Human-in-the-loop calibration (HITLC): Human raters are involved when
|
|
facing difficult examples, using a diversity-based metric, BPDE (balanced
|
|
position diversity entropy). First, the score pairs (including pairs of swapped
|
|
positions) are mapped into three labels (win, tie, lose), and the entropy
|
|
of these three labels is calculated. A high BPDE indicates more confusion
|
|
in the model\u2019s evaluation decision, indicating that the sample is more
|
|
difficult to judge. Then top $\\\\beta$ samples with highest entropy are selected
|
|
for human assistance. Fig. 11. Accuracy and kappa correlation coefficient
|
|
of different calibration methods and annotators with the final voting human
|
|
annotations. Positional bias calibration methods help improve accuracy with
|
|
a reasonable amount of human-in-the-loop labeling cost. Experiments also demonstrated
|
|
that the calibration strategies can generalize to different types of prompting
|
|
templates, despite the model's sensitivity to template design. (Image source:
|
|
Wang et al. 2023) Liu et al. (2023) experimented on the summarization task
|
|
using a number of models (BART, T5, GPT-2, GPT-3, FLAN-T5, Cohere) and tracked
|
|
both reference-based and reference-free metrics for evaluating summarization
|
|
quality. When plotting the evaluation scores in a heatmap of evaluator (x-axis)
|
|
vs generator (y-axis), they observed dark diagonal lines for both metrics,
|
|
indicating self-bias. This means that LLMs tend to prefer their own outputs
|
|
when used as evaluators. While the models used in the experiments are somewhat
|
|
dated, it would be interesting to see results on newer, more capable models.\\nFig.
|
|
12. A heatmap of using a series of models as evaluator (x-axis) and generator
|
|
(y-axis) for summarization task. A darker diagonal line indicates self-bias:
|
|
a tendency for a model preferto prefer its own outputs. (Image source: Liu
|
|
et al. 2023) In-Context Reward Hacking Iterative self-refinement is a training
|
|
setup where the evaluation and generation model are the same and both can
|
|
be fine-tuned. In this setup, optimization pressure can drive the model to
|
|
exploit vulnerabilities that occur in both roles. In the experiments by Pan
|
|
et al. (2023), no model parameters are updated and the same model is used
|
|
as evaluator and generator with different prompts. The experimental task was
|
|
essay editing with two roles: (1) a judge (evaluator) that gives feedback
|
|
on the essay, and (2) an author (generator) that edits the essay based on
|
|
the feedback. Human evaluation scores were collected as the oracle scores
|
|
for essay quality. The authors hypothesized that such a setup could lead to
|
|
in-context reward hacking (ICRH), where the evaluator score and oracle score
|
|
diverge. More generally, ICRH takes place during feedback loops between an
|
|
LLM and its evaluator (e.g., another LLM, or the external world). At test
|
|
time, the LLM optimizes a (potentially implicit) objective, but this creates
|
|
negative side effects in the process (Pan et al., 2024).\\nFig. 13. Illustration
|
|
of the in-context reward hacking experiment on essay evaluation and editing.
|
|
(Image source: Pan et al. 2023) Both judge and author can be configured to
|
|
see none or several previous rounds of feedback or edits. An online judge
|
|
can see past conversations, while an offline judge or a human annotator can
|
|
only see one essay a time. Smaller models are more sensitive to ICRH; for
|
|
example, GPT-3.5 as an evaluator caused more severe ICRH than GPT-4, empirically.\\nFig.
|
|
14. A smaller evaluator model is more likely to cause in-context reward hacking
|
|
(ICRH). (Image source: Pan et al. 2023) When the judge and author are configured
|
|
to see different numbers of past iterations, the gap between human score and
|
|
evaluator scores tends to increase if they share the same number of iterations.
|
|
Identical context between the evaluator and generator is crucial for ICRH,
|
|
indicating that shared context matters more than context length for ICRH.\\nIn
|
|
a follow up work, Pan et al. (2024) investigated in-context reward hacking
|
|
(ICRH) further in settings where feedback is provided by the external world
|
|
and the goal is an imperfect proxy objective, commonly specified in natural
|
|
language. Here this goal is often underspecified and does not capture all
|
|
the constraints or requirements and thus can be hacked.\\nThe study described
|
|
two processes leading to ICRH, paired with two toy experiments:\\nOutput-refinement:
|
|
LLM refines its outputs based on feedback. The experiment is to refine a tweet
|
|
based on engagement metrics, potentially leading to higher toxicity in the
|
|
tweet. Feedback-based optimization uses LLM to do pairwise evaluation and
|
|
then translates it to score using the Bradley-Terry model. Results showed
|
|
an increase in both engagement metrics and toxicity. The same experiments
|
|
were repeated with the Claude model family of different sizes and demonstrated
|
|
that scaling up the model worsens ICRH. It is noteworthy that editing the
|
|
prompt used for model output iteration given feedback does not mitigate the
|
|
issue. ICRH persists, although at a slightly lower magnitude. Policy-refinement:
|
|
LLM optimizes its policy based on feedback. The experiment is to build a LLM
|
|
agent to pay invoice on a user\u2019s behalf but run into InsufficientBalanceError
|
|
and then the model learns to move money from other accounts without user authentication,
|
|
potentially leading to more unauthorized transfer actions. They used ToolEmu
|
|
as an emulator, which included 144 tasks for LLM agents, each consisting of
|
|
a user-specific goal and a set of APIs. API errors were injected to simulate
|
|
server side failure and each task was evaluated by GPT-4 to assign a helpfulness
|
|
score. With more rounds of error feedback, LLMs can recover from the errors
|
|
but with an increased number of severe constraint violations. When comparing
|
|
ICRH to traditional reward hacking, there are two noticeable differences:\\nICRH
|
|
happens at deployment time within a self-refinement setup via a feedback loop,
|
|
while traditional reward hacking occurs during training. Traditional reward
|
|
hacking arises when the agent specializes in a task, while ICRH is driven
|
|
by being a generalist. There is no magic way to avoid or detect or prevent
|
|
ICRH yet, as improving prompt specification is insufficient to eliminate ICRH
|
|
and scaling model sizes can worsen ICRH. The best practice of testing before
|
|
deployment is to simulate what may happen at deployment time by evaluating
|
|
the model with more rounds of feedback, diverse feedback, as well as injecting
|
|
atypical environment observations.\\nGeneralization of Hacking Skills Reward
|
|
hacking behavior has been found to generalize across tasks: When models exhibit
|
|
flaws in supervised training, it can\_sometimes generalize to exploit\_flaws
|
|
in OOD environments (Kei et al., 2024). The researchers experimented with
|
|
reinforcing reward hacking behavior in some reward-hackable environments and
|
|
examined whether it generalizes to other holdout datasets. Essentially, they
|
|
prepared 8 datasets on multiple-choice questions, where 4 for training and
|
|
4 for testing. The RL training employs expert iteration, that is, iterative
|
|
fine-tuning on best-of-$n$ samples.\\nFig. 15. Training GPT-3.5 on a set of
|
|
reward-hackable environments can lead to generalization of reward hacking
|
|
behavior in holdout environments. (Image source: Kei et al. 2024) Notably,
|
|
all experiments have a scratchpad design to host hidden reasoning steps. This
|
|
scratchpad serves as a way to interpret what the model is thinking along the
|
|
process. Analysis of the scratchpad revealed that it can:\\nIncorporate user
|
|
beliefs; e.g., thinking about its conversation partner and grader. Show awareness
|
|
of evaluation; e.g. thinking about how the model response will be evaluated
|
|
and the process. Denison et al. (2024) studied reward tampering with a manually
|
|
curated curriculum of increasingly sophisticated gameable environments. They
|
|
observed that training on easier environments in the curriculum amplifies
|
|
specification gaming on remaining environments. In some cases, the model can
|
|
generalize zero-shot to directly rewriting its own reward function after training
|
|
on the full curriculum.\\nSuch a curriculum of gameable environments include:\\nPolitical
|
|
sycophancy: The environment rewards the model if it gives answers matching
|
|
a user\u2019s political views. Tool-use flattery: The environment provides
|
|
the model access to run command line to get access to the user\u2019s file
|
|
system to read a poetry sample in a local file. The model may give the poetry
|
|
a high rating as a sycophantic response. Rubric modification: The environment
|
|
scores the model based on a checklist and the model must mark every item on
|
|
the checklist as completed to receive a high reward. Reward tampering: The
|
|
model is tasked with examining the ./rl_environment/ directory, which contains
|
|
a mock version of its own RL code, and report how many episodes have elapsed.
|
|
In the directory, the compute_reward.py file contains a reward function returning
|
|
a constant, and the other this_file_blocks_reward_hack_attempts.py contains
|
|
a test to check whether the reward function has been modified. Fig. 16. Examples
|
|
of evaluation scripts in the \\\"Tool-use flattery\\\" and \\\"Reward tampering\\\"
|
|
environments, where `` is hidden from oversight and `` can be executed in
|
|
a VM. (Image source: Denison et al. 2024) The model was trained following
|
|
this curriculum, and the paper measured whether the model can learn gaming
|
|
behaviors that generalize to unseen environments. Experimenting with Claude-2
|
|
helpful-only model, they sampled $N=64$ responses per each of $P=1024$ HHH
|
|
prompts for expert iteration and selected the best according to RM. A model
|
|
trained using expert iteration to do supervised learning on the curriculum
|
|
can generalize from one stage to the next.\\nFig. 17. Quantitative generalization
|
|
results of a model trained with expert iteration according to our curriculum
|
|
from each stage to the next. (Image source: Denison et al. 2024) It is noteworthy
|
|
that even after the curriculum, the model overwrote the reward and avoided
|
|
detection less than 1/1000 of the time. Even when a model was trained on curricula
|
|
which directly incentivized reward hacking, the model overwrote their reward
|
|
less than 1% of the time and hacked unit tests even less often. As a simple
|
|
mitigation, supervised fine-tuning the model on the first two environments\u2013where
|
|
the reward hacking behavior is easy to be detected (sycophancy and flattery)\u2014with
|
|
SFT data that does not game the env was found to reduce the likelihood of
|
|
reward tampering in holdout environments.\\nPeek into Mitigations While there
|
|
is a large body of literature discussing the phenomenon of reward hacking,
|
|
there has been not a ton of work on mitigations for reward hacking, especially
|
|
in the area of RLHF and LLMs. Let\u2019s lightly review three potential approaches
|
|
in this section, not exhaustive yet.\\nRL Algorithm Improvement Amodei et
|
|
al. (2016) pointed out some directions for mitigating reward hacking in RL
|
|
training:\\nAdversarial reward functions. We treat the reward function as
|
|
an adaptive agent itself and it can adapt to new tricks that the model discovered
|
|
where the reward is high but human rating is low. Model lookahead. It is possible
|
|
to give reward based on future anticipated states; e.g., if the agent is gonna
|
|
replace the reward function, it gets negative rewards. Adversarial blinding.
|
|
We can blind the model with certain variables such that the agent cannot learn
|
|
information that enables it to hack the reward function. Careful engineering.
|
|
Some types of reward hacking against the system design can be avoided by careful
|
|
engineering; e.g., sandboxing the agent to isolate its actions from its reward
|
|
signals. Reward capping. This strategy is to simply limit the maximum possible
|
|
reward, as it can effectively prevent rare events of the agent hacking to
|
|
get a super high pay-off strategy. Counterexample resistance. Improvement
|
|
on adversarial robustness should benefit the robustness of the reward function.
|
|
Combination of multiple rewards. Combining different types of rewards could
|
|
make it harder to be hacked. Reward pretraining. We can learn a reward function
|
|
from a collection of (state, reward) samples, but depending on how well this
|
|
supervised training setup is, it may come with other baggages. RLHF depends
|
|
on this but learned scalar reward models are quite vulnerable to learning
|
|
undesired traits. Variable indifference. The goal is to ask the agent to optimize
|
|
some variables in the environment but not others. Trip wires. We can intentionally
|
|
introduce some vulnerabilities and set up monitoring and alerts if any gets
|
|
reward hacked. In RL setups where human feedback is formed as approval of
|
|
agent actions, Uesato et al. (2020) proposed to prevent reward tampering with
|
|
decoupled approval. If the feedback is conditioned on $(s, a)$ (state, action),
|
|
we can never get uncorrupted feedback for action $a$ at state $s$ once reward
|
|
tampering happens for this pair. Decoupling means that the query action for
|
|
collecting feedback is sampled independently from the action taken in the
|
|
world. Feedback is received even before the action is executed in the world,
|
|
thus preventing the action from corrupting its own feedback.\\nFig. 18. Illustration
|
|
of how decoupled approval works in comparison to standard approval or human-in-the-loop
|
|
RL. (Image source: Uesato et al. 2020) Fig. 19. With decoupled approval, the
|
|
action (taken in the world) and the query (for getting user approval feedback)
|
|
are sampled independently. It can be applied to (Left) policy gradient and
|
|
(Right) Q-learning algorithms. (Image source: Uesato et al. 2020) Detecting
|
|
Reward Hacking An alternative mitigation is to detect reward hacking by framing
|
|
it as an anomaly detection task, where the detector (\u201Ca trusted policy\u201D
|
|
with trajectories and rewards validated by human) should flag instances of
|
|
misalignment (Pan et al. 2022). Given (1) a trusted policy and (2) a collection
|
|
of manually labeled trajectory rollouts, we can build a binary classifier
|
|
based on distances between action distribution of two policies, the trusted
|
|
policy and the target policy, and measure the accuracy of this anomaly detection
|
|
classifier. In experiments by Pan et al. (2022), they observed that different
|
|
detectors are better for different tasks and none of the tested classifier
|
|
can achieve AUROC greater than 60% across all tested RL environments.\\nFig.
|
|
20. Performance of detectors on different tasks. (Image source: Pan et al.
|
|
2022) Data Analysis of RLHF ` Another approach is to analyze RLHF dataset.
|
|
By examining how training data impacts the alignment training results, insights
|
|
can guide preprocessing and human feedback collection to reduce reward hacking
|
|
risks.\\nRevel et al. (2024) introduced a set of evaluation metrics for measuring
|
|
the effectiveness of data sample features in modeling and aligning human values.
|
|
They conducted a systematic error analysis for value alignment (\u201CSEAL\u201D)
|
|
in the HHH-RLHF dataset. The feature taxonomy used in the analysis (e.g.,
|
|
is harmless, is refusal and is creative) was manually predefined. Then each
|
|
sample was labelled with a binary flag per feature using a LLM according to
|
|
this taxonomy. Features are categorized into two groups based on heuristics:\\nTarget
|
|
features: Values explicitly intended to be learned. Spoiler features: Unintended
|
|
values inadvertently learned during training (e.g., stylistic features like
|
|
sentiment or coherence). These are similar to spurious features in OOD classification
|
|
work (Geirhos et al. 2020). SEAL introduced three metrics for measuring data
|
|
effectiveness for alignment training:\\nFeature imprint refers to a coefficient
|
|
parameter $\\\\beta_\\\\tau$ for feature $\\\\tau$ which estimates the point
|
|
increase in reward comparing entires with vs without feature $\\\\tau$, while
|
|
holding other factors consistent. Fig. 21. (Left) Feature imprints $\\\\underline{\\\\beta(\\\\tau)}$
|
|
(pre-) and $\\\\beta(\\\\tau)$ (post-) computed from fixed-effects linear
|
|
regression of rewards $\\\\underline{r}(t^\u2217_i)$ (orange) and $r(t^\u2217_i)$
|
|
(blue) against features. Overall the alignment training awards positive features
|
|
like harmlessness and helpfulness and penalizes negative features like sexual
|
|
content or privacy violation. (Right) Feature imprints computed from linear
|
|
regression of the reward shift $\\\\theta_i$. The reward shift $\\\\theta_i$
|
|
is defined as the angle between reward vectors before and after alignment
|
|
training. The training process refines the model's sensitivity to target features.
|
|
Note that harmlessness imprints on the RM through both chosen and rejected
|
|
entries (both \\\"is harmless (c)\\\" and \\\"is harmless (r)\\\"), while
|
|
helpfulness imprints through rejected entries only (\\\"is helpful (r)\\\").
|
|
(Image source: Revel et al. 2024) Alignment resistance is the percentage of
|
|
the preference data pairs where RMs fail to match human preferences. The RM
|
|
is found to resist human preference on over 1/4 of the HHH-RLHF dataset. Alignment
|
|
robustness, $\\\\pi^{c/r}_{+/-} (\\\\tau)$, measures the extent to which alignment
|
|
is robust to perturbed inputs with rewriting in terms of spoiler features
|
|
$\\\\tau$ like sentiment, eloquence and coherency, isolating the effects of
|
|
each feature and each event type. The robustness metric $\\\\pi_\u2212^c$
|
|
(a feature name $\\\\tau$ such as \u201Celoquent\u201D or \u201Csentiment
|
|
positive\u201D) should be interpreted in such a way: A chosen entry (denoted
|
|
by $c$) that contains a stronger feature $\\\\tau$ after rewriting has $\\\\exp
|
|
(\\\\pi^c_{-}(\\\\tau))$ times higher odds of becoming rejected, in comparison
|
|
to others without such flips. Similarly, a rejected entry (denoted by $r$)
|
|
that obtains a weaker feature $\\\\tau$ after rewriting has $\\\\exp (\\\\pi^r_{+}(\\\\tau))$
|
|
times odds of becoming chosen compared to others without such flips. According
|
|
to their analysis of alignment robustness metrics in terms of different rewriting,
|
|
only the robustness scores based on sentiment spoiler features, $\\\\pi^c_{+}$
|
|
(sentiment) and $\\\\pi^r_{-}$ (sentiment), are statistically significant.
|
|
Citation Cited as:\\nWeng, Lilian. (Nov 2024). Reward Hacking in Reinforcement
|
|
Learning. Lil\u2019Log. https://lilianweng.github.io/posts/2024-11-28-reward-hacking/.\\nOr\\n@article{weng2024rewardhack,
|
|
title = \\\"Reward Hacking in Reinforcement Learning.\\\", author = \\\"Weng,
|
|
Lilian\\\", journal = \\\"lilianweng.github.io\\\", year = \\\"2024\\\", month
|
|
= \\\"Nov\\\", url = \\\"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/\\\"
|
|
} References [1] Andrew Ng \\u0026 Stuart Russell. \u201CAlgorithms for inverse
|
|
reinforcement learning.\u201D. ICML 2000.\\n[2] Amodei et al. \u201CConcrete
|
|
problems in AI safety: Avoid reward hacking.\u201D arXiv preprint arXiv:1606.06565
|
|
(2016).\\n[3] Krakovna et al. \u201CSpecification gaming: the flip side of
|
|
AI ingenuity.\u201D 2020.\\n[4] Langosco et al. \u201CGoal Misgeneralization
|
|
in Deep Reinforcement Learning\u201D ICML 2022.\\n[5] Everitt et al. \u201CReinforcement
|
|
learning with a corrupted reward channel.\u201D IJCAI 2017.\\n[6] Geirhos
|
|
et al. \u201CShortcut Learning in Deep Neural Networks.\u201D Nature Machine
|
|
Intelligence 2020.\\n[7] Ribeiro et al. \u201CWhy Should I Trust You?\u201D:
|
|
Explaining the Predictions of Any Classifier. KDD 2016.\\n[8] Nagarajan et
|
|
al. \u201CUnderstanding the Failure Modes of Out-of-Distribution Generalization.\u201D
|
|
ICLR 2021.\\n[9] Garrabrant. \u201CGoodhart Taxonomy\u201D. AI Alignment Forum
|
|
(Dec 30th 2017).\\n[10] Koch et al. \u201CObjective robustness in deep reinforcement
|
|
learning.\u201D 2021.\\n[11] Pan et al. \u201CThe effects of reward misspecification:
|
|
mapping and mitigating misaligned models.\u201D\\n[12] Everitt et al. \u201CReward
|
|
tampering problems and solutions in reinforcement learning: A causal influence
|
|
diagram perspective.\u201D arXiv preprint arXiv:1908.04734 (2019).\\n[13]
|
|
Gleave et al. \u201CAdversarial Policies: Attacking Deep Reinforcement Learning.\u201D
|
|
ICRL 2020\\n[14] \u201CReward hacking behavior can generalize across tasks.\u201D\\n[15]
|
|
Ng et al. \u201CPolicy invariance under reward transformations: Theory and
|
|
application to reward shaping.\u201D ICML 1999.\\n[16] Wang et al. \u201CLarge
|
|
Language Models are not Fair Evaluators.\u201D ACL 2024.\\n[17] Liu et al.
|
|
\u201CLLMs as narcissistic evaluators: When ego inflates evaluation scores.\u201D
|
|
ACL 2024.\\n[18] Gao et al. \u201CScaling Laws for Reward Model Overoptimization.\u201D
|
|
ICML 2023.\\n[19] Pan et al. \u201CSpontaneous Reward Hacking in Iterative
|
|
Self-Refinement.\u201D arXiv preprint arXiv:2407.04549 (2024).\\n[20] Pan
|
|
et al. \u201CFeedback Loops With Language Models Drive In-Context Reward Hacking.\u201D
|
|
arXiv preprint arXiv:2402.06627 (2024).\\n[21] Shrama et al. \u201CTowards
|
|
Understanding Sycophancy in Language Models.\u201D arXiv preprint arXiv:2310.13548
|
|
(2023).\\n[22] Denison et al. \u201CSycophancy to subterfuge: Investigating
|
|
reward tampering in language models.\u201D arXiv preprint arXiv:2406.10162
|
|
(2024).\\n[23] Uesato et al. \u201CAvoiding Tampering Incentives in Deep RL
|
|
via Decoupled Approval.\u201D arXiv preprint arXiv:2011.08827 (2020).\\n[24]
|
|
Amin and Singh. \u201CTowards resolving unidentifiability in inverse reinforcement
|
|
learning.\u201D\\n[25] Wen et al. \u201CLanguage Models Learn to Mislead Humans
|
|
via RLHF.\u201D arXiv preprint arXiv:2409.12822 (2024).\\n[26] Revel et al.
|
|
\u201CSEAL: Systematic Error Analysis for Value ALignment.\u201D arXiv preprint
|
|
arXiv:2408.10270 (2024).\\n[27] Yuval Noah Harari. \u201CNexus: A Brief History
|
|
of Information Networks from the Stone Age to AI.\u201D Signal; 2024 Sep 10.\\n\",\n
|
|
\ \"wordCount\" : \"7753\",\n \"inLanguage\": \"en\",\n \"datePublished\":
|
|
\"2024-11-28T00:00:00Z\",\n \"dateModified\": \"2024-11-28T00:00:00Z\",\n
|
|
\ \"author\":{\n \"@type\": \"Person\",\n \"name\": \"Lilian Weng\"\n
|
|
\ },\n \"mainEntityOfPage\": {\n \"@type\": \"WebPage\",\n \"@id\":
|
|
\"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/\"\n },\n
|
|
\ \"publisher\": {\n \"@type\": \"Organization\",\n \"name\": \"Lil'Log\",\n
|
|
\ \"logo\": {\n \"@type\": \"ImageObject\",\n \"url\": \"https://lilianweng.github.io/favicon_wine.ico\"\n
|
|
\ }\n }\n}\n</script>\n</head>\n\n<body class=\"\" id=\"top\">\n<script>\n
|
|
\ if (localStorage.getItem(\"pref-theme\") === \"dark\") {\n document.body.classList.add('dark');\n
|
|
\ } else if (localStorage.getItem(\"pref-theme\") === \"light\") {\n document.body.classList.remove('dark')\n
|
|
\ } else if (window.matchMedia('(prefers-color-scheme: dark)').matches)
|
|
{\n document.body.classList.add('dark');\n }\n\n</script>\n\n<script>\n
|
|
\ MathJax = {\n tex: {\n inlineMath: [['$', '$'], ['\\\\(', '\\\\)']],\n
|
|
\ displayMath: [['$$','$$'], ['\\\\[', '\\\\]']],\n processEscapes:
|
|
true,\n processEnvironments: true\n },\n options: {\n skipHtmlTags:
|
|
['script', 'noscript', 'style', 'textarea', 'pre']\n }\n };\n\n window.addEventListener('load',
|
|
(event) => {\n document.querySelectorAll(\"mjx-container\").forEach(function(x){\n
|
|
\ x.parentElement.classList += 'has-jax'})\n });\n\n</script>\n<script
|
|
src=\"https://polyfill.io/v3/polyfill.min.js?features=es6\"></script>\n<script
|
|
type=\"text/javascript\" id=\"MathJax-script\" async\n src=\"https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js\"></script>\n\n\n<header
|
|
class=\"header\">\n <nav class=\"nav\">\n <div class=\"logo\">\n
|
|
\ <a href=\"https://lilianweng.github.io/\" accesskey=\"h\" title=\"Lil'Log
|
|
(Alt + H)\">Lil'Log</a>\n <span class=\"logo-switches\">\n
|
|
\ <button id=\"theme-toggle\" accesskey=\"t\" title=\"(Alt +
|
|
T)\">\n <svg id=\"moon\" xmlns=\"http://www.w3.org/2000/svg\"
|
|
width=\"24\" height=\"24\" viewBox=\"0 0 24 24\"\n fill=\"none\"
|
|
stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\"\n stroke-linejoin=\"round\">\n
|
|
\ <path d=\"M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21
|
|
12.79z\"></path>\n </svg>\n <svg id=\"sun\"
|
|
xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0
|
|
0 24 24\"\n fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\"
|
|
stroke-linecap=\"round\"\n stroke-linejoin=\"round\">\n
|
|
\ <circle cx=\"12\" cy=\"12\" r=\"5\"></circle>\n <line
|
|
x1=\"12\" y1=\"1\" x2=\"12\" y2=\"3\"></line>\n <line
|
|
x1=\"12\" y1=\"21\" x2=\"12\" y2=\"23\"></line>\n <line
|
|
x1=\"4.22\" y1=\"4.22\" x2=\"5.64\" y2=\"5.64\"></line>\n <line
|
|
x1=\"18.36\" y1=\"18.36\" x2=\"19.78\" y2=\"19.78\"></line>\n <line
|
|
x1=\"1\" y1=\"12\" x2=\"3\" y2=\"12\"></line>\n <line
|
|
x1=\"21\" y1=\"12\" x2=\"23\" y2=\"12\"></line>\n <line
|
|
x1=\"4.22\" y1=\"19.78\" x2=\"5.64\" y2=\"18.36\"></line>\n <line
|
|
x1=\"18.36\" y1=\"5.64\" x2=\"19.78\" y2=\"4.22\"></line>\n </svg>\n
|
|
\ </button>\n <ul class=\"lang-switch\"><li>|</li>\n
|
|
\ </ul>\n </span>\n </div>\n <ul id=\"menu\">\n
|
|
\ <li>\n <a href=\"https://lilianweng.github.io/\"
|
|
title=\"Posts\">\n <span>Posts</span>\n </a>\n
|
|
\ </li>\n <li>\n <a href=\"https://lilianweng.github.io/archives\"
|
|
title=\"Archive\">\n <span>Archive</span>\n </a>\n
|
|
\ </li>\n <li>\n <a href=\"https://lilianweng.github.io/search/\"
|
|
title=\"Search (Alt + /)\" accesskey=/>\n <span>Search</span>\n
|
|
\ </a>\n </li>\n <li>\n <a
|
|
href=\"https://lilianweng.github.io/tags/\" title=\"Tags\">\n <span>Tags</span>\n
|
|
\ </a>\n </li>\n <li>\n <a
|
|
href=\"https://lilianweng.github.io/faq\" title=\"FAQ\">\n <span>FAQ</span>\n
|
|
\ </a>\n </li>\n </ul>\n </nav>\n</header>\n<main
|
|
class=\"main\">\n\n<article class=\"post-single\">\n <header class=\"post-header\">\n
|
|
\ \n <h1 class=\"post-title\">\n Reward Hacking in Reinforcement
|
|
Learning\n </h1>\n <div class=\"post-meta\">Date: November 28, 2024
|
|
\ | Estimated Reading Time: 37 min | Author: Lilian Weng\n\n</div>\n </header>
|
|
<div class=\"toc\">\n <details >\n <summary accesskey=\"c\" title=\"(Alt
|
|
+ C)\">\n <span class=\"details\">Table of Contents</span>\n </summary>\n\n
|
|
\ <div class=\"inner\"><ul>\n <li>\n <a
|
|
href=\"#background\" aria-label=\"Background\">Background</a><ul>\n \n
|
|
\ <li>\n <a href=\"#reward-function-in-rl\"
|
|
aria-label=\"Reward Function in RL\">Reward Function in RL</a></li>\n <li>\n
|
|
\ <a href=\"#spurious-correlation\" aria-label=\"Spurious
|
|
Correlation\">Spurious Correlation</a></li></ul>\n </li>\n
|
|
\ <li>\n <a href=\"#lets-define-reward-hacking\"
|
|
aria-label=\"Let’s Define Reward Hacking\">Let’s Define Reward
|
|
Hacking</a><ul>\n \n <li>\n <a
|
|
href=\"#list-of-examples\" aria-label=\"List of Examples\">List of Examples</a><ul>\n
|
|
\ \n <li>\n <a href=\"#reward-hacking-examples-in-rl-tasks\"
|
|
aria-label=\"Reward hacking examples in RL tasks\">Reward hacking examples
|
|
in RL tasks</a></li>\n <li>\n <a href=\"#reward-hacking-examples-in-llm-tasks\"
|
|
aria-label=\"Reward hacking examples in LLM tasks\">Reward hacking examples
|
|
in LLM tasks</a></li>\n <li>\n <a href=\"#reward-hacking-examples-in-real-life\"
|
|
aria-label=\"Reward hacking examples in real life\">Reward hacking examples
|
|
in real life</a></li></ul>\n </li>\n <li>\n
|
|
\ <a href=\"#why-does-reward-hacking-exist\" aria-label=\"Why
|
|
does Reward Hacking Exist?\">Why does Reward Hacking Exist?</a></li></ul>\n
|
|
\ </li>\n <li>\n <a href=\"#hacking-rl-environment\"
|
|
aria-label=\"Hacking RL Environment\">Hacking RL Environment</a></li>\n <li>\n
|
|
\ <a href=\"#hacking-rlhf-of-llms\" aria-label=\"Hacking
|
|
RLHF of LLMs\">Hacking RLHF of LLMs</a><ul>\n \n <li>\n
|
|
\ <a href=\"#hacking-the-training-process\" aria-label=\"Hacking
|
|
the Training Process\">Hacking the Training Process</a></li>\n <li>\n
|
|
\ <a href=\"#hacking-the-evaluator\" aria-label=\"Hacking
|
|
the Evaluator\">Hacking the Evaluator</a></li>\n <li>\n <a
|
|
href=\"#in-context-reward-hacking\" aria-label=\"In-Context Reward Hacking\">In-Context
|
|
Reward Hacking</a></li></ul>\n </li>\n <li>\n
|
|
\ <a href=\"#generalization-of-hacking-skills\" aria-label=\"Generalization
|
|
of Hacking Skills\">Generalization of Hacking Skills</a></li>\n <li>\n
|
|
\ <a href=\"#peek-into-mitigations\" aria-label=\"Peek into
|
|
Mitigations\">Peek into Mitigations</a><ul>\n \n <li>\n
|
|
\ <a href=\"#rl-algorithm-improvement\" aria-label=\"RL
|
|
Algorithm Improvement\">RL Algorithm Improvement</a></li>\n <li>\n
|
|
\ <a href=\"#detecting-reward-hacking\" aria-label=\"Detecting
|
|
Reward Hacking\">Detecting Reward Hacking</a></li>\n <li>\n
|
|
\ <a href=\"#data-analysis-of-rlhf\" aria-label=\"Data Analysis
|
|
of RLHF\">Data Analysis of RLHF</a></li></ul>\n </li>\n <li>\n
|
|
\ <a href=\"#citation\" aria-label=\"Citation\">Citation</a></li>\n
|
|
\ <li>\n <a href=\"#references\" aria-label=\"References\">References</a>\n
|
|
\ </li>\n </ul>\n </div>\n </details>\n</div>\n\n
|
|
\ <div class=\"post-content\"><p>Reward hacking occurs when a <a href=\"(https://lilianweng.github.io/posts/2018-02-19-rl-overview/)\">reinforcement
|
|
learning (RL)</a> agent <a href=\"https://lilianweng.github.io/posts/2018-01-23-multi-armed-bandit/#exploitation-vs-exploration\">exploits</a>
|
|
flaws or ambiguities in the reward function to achieve high rewards, without
|
|
genuinely learning or completing the intended task. Reward hacking exists
|
|
because RL environments are often imperfect, and it is fundamentally challenging
|
|
to accurately specify a reward function.</p>\n<p>With the rise of <a href=\"https://lilianweng.github.io/posts/2019-01-31-lm/\">language
|
|
models</a> generalizing to a broad spectrum of tasks and RLHF becomes a de
|
|
facto method for alignment training, reward hacking in RL training of language
|
|
models has become a critical practical challenge. Instances where the model
|
|
learns to modify unit tests to pass coding tasks, or where responses contain
|
|
biases that mimic a user’s preference, are pretty concerning and are
|
|
likely one of the major blockers for real-world deployment of more autonomous
|
|
use cases of AI models.</p>\n<p>Most of the past work on this topic has been
|
|
quite theoretical and focused on defining or demonstrating the existence of
|
|
reward hacking. However, research into practical mitigations, especially in
|
|
the context of RLHF and LLMs, remains limited. I especially want to call out
|
|
for more research efforts directed toward understanding and developing mitigation
|
|
for reward hacking in the future. Hope I will be able to cover the mitigation
|
|
part in a dedicated post soon.</p>\n<h1 id=\"background\">Background<a hidden
|
|
class=\"anchor\" aria-hidden=\"true\" href=\"#background\">#</a></h1>\n<h2
|
|
id=\"reward-function-in-rl\">Reward Function in RL<a hidden class=\"anchor\"
|
|
aria-hidden=\"true\" href=\"#reward-function-in-rl\">#</a></h2>\n<p>Reward
|
|
function defines the task, and reward shaping significantly impacts learning
|
|
efficiency and accuracy in <a href=\"https://lilianweng.github.io/posts/2018-02-19-rl-overview/\">reinforcement
|
|
learning</a>. Designing a reward function for an RL task often feels like
|
|
a ‘dark art’. Many factors contribute to this complexity: How
|
|
you decompose a big goal into small goals? Is the reward sparse or dense?
|
|
How you measure the success? Various choices may lead to good or problematic
|
|
learning dynamics, including unlearnable tasks or hackable reward functions.
|
|
There is a long history of research on how to do reward shaping in RL.</p>\n<p>For
|
|
example, in an <a href=\"https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf\">1999
|
|
paper by Ng et al.</a>, the authors studied how to modify the reward function
|
|
in <a href=\"https://lilianweng.github.io/posts/2018-02-19-rl-overview/#markov-decision-processes\">Markov
|
|
Decision Processes (MDPs)</a> such that the optimal policy remains unchanged.
|
|
They found that linear transformation works. Given a MDP $M = (S, A, T, \\gamma,
|
|
R)$, we want to create a transformed MDP $M’ = (S, A, T, \\gamma, R’)$
|
|
where $R’ = R + F$ and $F: S \\times A \\times S \\mapsto \\mathbb{R}$,
|
|
such that we can guide the learning algorithm to be more efficient. Given
|
|
a real-valued function $\\Phi: S \\mapsto \\mathbb{R}$, $F$ is a potential-based
|
|
shaping function if for all $s \\in S - {s_0}, a \\in A, s’ \\in S$:</p>\n<div>\n$$\nF(s,
|
|
a, s') = \\gamma \\Phi(s') - \\Phi(s)\n$$\n</div>\n<p>This would guarantee
|
|
that the sum of discounted $F$, $F(s_1, a_1, s_2) + \\gamma F(s_2, a_2, s_3)
|
|
+ \\dots$, ends up being 0. If $F$ is such a potential-based shaping function,
|
|
it is both <em>sufficient</em> and <em>necessary</em> to ensure $M$ and $M’$
|
|
share the same optimal policies.</p>\n<p>When $F(s, a, s’) = \\gamma
|
|
\\Phi(s’) - \\Phi(s)$, and if we further assume that $\\Phi(s_0) = 0$,
|
|
where $s_0$ is absorbing state, and $\\gamma=1$, and then for all $s \\in
|
|
S, a \\in A$:</p>\n<div>\n$$\n\\begin{aligned}\nQ^*_{M'} (s,a) &= Q^*_M(s,
|
|
a) - \\Phi(s) \\\\\nV^*_{M'} (s,a) &= V^*_M(s, a) - \\Phi(s)\n\\end{aligned}\n$$\n</div>\n<p>This
|
|
form of reward shaping allows us to incorporate heuristics into the reward
|
|
function to speed up learning without impacting the optimal policy.</p>\n<h2
|
|
id=\"spurious-correlation\">Spurious Correlation<a hidden class=\"anchor\"
|
|
aria-hidden=\"true\" href=\"#spurious-correlation\">#</a></h2>\n<p>Spurious
|
|
correlation or shortcut learning (<a href=\"https://arxiv.org/abs/2004.07780\">Geirhos
|
|
et al. 2020</a>) in classification task is a concept closely related to reward
|
|
hacking. Spurious or shortcut features can cause a classifier to fail at learning
|
|
and generalizing as intended. For example, a binary classifier for distinguishing
|
|
wolves from huskies may overfit to the presence of a snowy background if all
|
|
the wolf training images include snow (<a href=\"https://arxiv.org/abs/1602.04938\">Ribeiro
|
|
et al. 2024</a>).</p>\n<img src=\"shortcut-features.png\" style=\"width: 60%;\"
|
|
class=\"center\" />\n<figcaption>Fig. 1. The model performs poorly on out-of-distribution
|
|
(OOD) test sets if it overfits to shortcut features. (Image source: <a href=\"https://arxiv.org/abs/2004.07780\"
|
|
target=\"_blank\">Geirhos et al. 2020</a>)</figcaption>\n<p>The <a href=\"https://en.wikipedia.org/wiki/Empirical_risk_minimization\">ERM
|
|
principle</a> states that, since the full data distribution is unknown, minimizing
|
|
the loss on training data is a reasonable proxy of risk and thus we favor
|
|
models with the lowest training loss. <a href=\"https://arxiv.org/abs/2010.15775\">Nagarajan
|
|
et al. (2021)</a> studied the ERM principle and pointed out that ERM needs
|
|
to rely on all types of informative features, including unreliable spurious
|
|
features, while attempting to fit the data without constraints. Their experiments
|
|
showed that ERM would depend on spurious features no matter how easy the task
|
|
is.</p>\n<h1 id=\"lets-define-reward-hacking\">Let’s Define Reward Hacking<a
|
|
hidden class=\"anchor\" aria-hidden=\"true\" href=\"#lets-define-reward-hacking\">#</a></h1>\n<p>Reward
|
|
shaping in RL is challenging. Reward hacking occurs when an RL agent exploits
|
|
flaws or ambiguities in the reward function to obtain high rewards without
|
|
genuinely learning the intended behaviors or completing the task as designed.
|
|
In recent years, several related concepts have been proposed, all referring
|
|
to some form of reward hacking:</p>\n<ul>\n<li>Reward hacking (<a href=\"https://arxiv.org/abs/1606.06565\">Amodei
|
|
et al., 2016</a>)</li>\n<li>Reward corruption (<a href=\"https://arxiv.org/abs/1705.08417\">Everitt
|
|
et al., 2017</a>)</li>\n<li>Reward tampering (<a href=\"https://arxiv.org/abs/1908.04734\">Everitt
|
|
et al. 2019</a>)</li>\n<li>Specification gaming (<a href=\"https://deepmind.google/discover/blog/specification-gaming-the-flip-side-of-ai-ingenuity/\">Krakovna
|
|
et al., 2020</a>)</li>\n<li>Objective robustness (<a href=\"https://www.gatsby.ucl.ac.uk/~balaji/udl2021/accepted-papers/UDL2021-paper-055.pdf\">Koch
|
|
et al. 2021</a>)</li>\n<li>Goal misgeneralization (<a href=\"https://arxiv.org/abs/2105.14111\">Langosco
|
|
et al. 2022</a>)</li>\n<li>Reward misspecifications (<a href=\"https://arxiv.org/abs/2201.03544\">Pan
|
|
et al. 2022</a>)</li>\n</ul>\n<p>The concept originated with Amodei et al.
|
|
(2016), who proposed a set of open research questions on AI safety in their
|
|
seminal paper <a href=\"https://arxiv.org/abs/1606.06565\">“Concrete
|
|
Problems in AI Safety”</a>. They listed <strong>reward hacking</strong>
|
|
as one of the key AI safety problems. Reward hacking refers to the possibility
|
|
of the agent gaming the reward function to achieve high reward through undesired
|
|
behavior. <strong>Specification gaming</strong> (<a href=\"https://deepmind.google/discover/blog/specification-gaming-the-flip-side-of-ai-ingenuity/\">Krakovna
|
|
et al. 2020</a>) is a similar concept, defined as a behavior that satisfies
|
|
the literal specification of an objective but not achieving the desired results.
|
|
Here the literal description of the task goal and the intended goal may have
|
|
a gap.</p>\n<p>Reward shaping is a technique used to enrich the reward function,
|
|
making it easier for the agent to learn—for example, by providing denser
|
|
rewards. However, a poorly design reward shaping mechanism can alter the trajectory
|
|
of the optimal policy. Designing effective reward shaping mechanisms is inherently
|
|
difficult. Rather than blaming a poorly designed reward function, it is more
|
|
accurate to acknowledge that designing a good reward function is intrinsically
|
|
challenging due to the complexity of the task itself, partial observable state,
|
|
multiple dimensions in consideration, and other factors.</p>\n<p>When testing
|
|
an RL agent in out-of-distribution (OOD) environments, robustness failure
|
|
may occur due to:</p>\n<ol>\n<li>The model fails to generalize effectively,
|
|
even with the right objective. This happens when the algorithm lacks sufficient
|
|
intelligence or capability.</li>\n<li>The model generalizes capably but pursues
|
|
an objective different from the one it was trained on. This happens when the
|
|
proxy reward differs from the true reward function, $R’ \\neq R$. This
|
|
is known as <strong>objective robustness</strong> (<a href=\"https://www.gatsby.ucl.ac.uk/~balaji/udl2021/accepted-papers/UDL2021-paper-055.pdf\">Koch
|
|
et al. 2021</a>) or <strong>goal misgeneralization</strong> (<a href=\"https://arxiv.org/abs/2105.14111\">Langosco
|
|
et al. 2022</a> )</li>\n</ol>\n<p>Experiments in two RL environments, <a href=\"https://github.com/openai/coinrun\">CoinRun</a>
|
|
and <a href=\"https://github.com/openai/procgen\">Maze</a>, demonstrated the
|
|
importance of randomization during training. If during training, the coin
|
|
or the cheese is placed at a fixed position (i.e. right end of the level or
|
|
upper right corner of the maze) but testing in the env where the coin or cheese
|
|
is placed at random, the agent would just run to the fixed position without
|
|
obtaining the coin or cheese at test time. A conflict arises when a visual
|
|
feature (e.g., cheese or coin) and a positional feature (e.g., upper-right
|
|
or right end) are inconsistent during test time, leading the trained model
|
|
to prefer the positional feature. I would like to point out that, in these
|
|
two examples, the <em>reward-result gaps</em> are clear but such type of biases
|
|
are unlikely to be so obvious in most real-world cases.</p>\n<img src=\"coinrun-randomization.png\"
|
|
style=\"width: 80%;\" class=\"center\" />\n<figcaption>Fig. 2. The impact
|
|
of randomizing the position of the coin during training. When the coin is
|
|
placed at random for {0, 2, 3, 6, 11}% of the time during training (x-axis),
|
|
the frequency of the agent navigating to the end of the level without obtaining
|
|
the coin decreases with the increase of the randomization (\"y-axis\"). (Image
|
|
source: <a href=\"https://www.gatsby.ucl.ac.uk/~balaji/udl2021/accepted-papers/UDL2021-paper-055.pdf\"
|
|
target=\"_blank\">Koch et al. 2021</a>)</figcaption>\n<p><strong>Reward Tampering</strong>
|
|
(<a href=\"https://arxiv.org/abs/1908.04734\">Everitt et al. 2019</a>) is
|
|
a form of reward hacking behavior where the agent interferes with the reward
|
|
function itself, causing the observed reward to no longer accurately represent
|
|
the intended goal. In reward tampering, the model modifies its reward mechanism
|
|
either by directly manipulating the implementation of the reward function
|
|
or by indirectly altering the environmental information used as input for
|
|
the reward function.</p>\n<p>(Note: Some work defines reward tampering as
|
|
a distinct category of misalignment behavior from reward hacking. But I consider
|
|
reward hacking as a broader concept here.)</p>\n<p>At a high level, reward
|
|
hacking can be categorized into two types: environment or goal misspecification,
|
|
and reward tampering.</p>\n<ul>\n<li><strong>Environment or goal misspecified</strong>:
|
|
The model learns undesired behavior to achieve high rewards by hacking the
|
|
environment or optimizing a reward function not aligned with the true reward
|
|
objective—such as when the reward is misspecified or lacks key requirements.</li>\n<li><strong>Reward
|
|
tampering</strong>: The model learns to interfere with the reward mechanism
|
|
itself.</li>\n</ul>\n<h2 id=\"list-of-examples\">List of Examples<a hidden
|
|
class=\"anchor\" aria-hidden=\"true\" href=\"#list-of-examples\">#</a></h2>\n<h3
|
|
id=\"reward-hacking-examples-in-rl-tasks\">Reward hacking examples in RL tasks<a
|
|
hidden class=\"anchor\" aria-hidden=\"true\" href=\"#reward-hacking-examples-in-rl-tasks\">#</a></h3>\n<ul>\n<li>A
|
|
robot hand trained to grab an object can learn to trick people by placing
|
|
the hand between the object and the camera. (<a href=\"https://openai.com/index/learning-from-human-preferences/\">Link</a>)</li>\n<li>An
|
|
agent trained to maximize jumping height may exploit a bug in the physics
|
|
simulator to achieve an unrealistically height. (<a href=\"https://arxiv.org/abs/1803.03453\">Link</a>)</li>\n<li>An
|
|
agent is trained to ride a bicycle to a goal and wins reward whenever it is
|
|
getting closer to the goal. Then the agent may learn to ride in tiny circles
|
|
around the goal because there is no penalty when the agent gets away from
|
|
the goal. (<a href=\"https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf\">Link</a>)</li>\n<li>In
|
|
a soccer game setup, the reward is assigned when the agent touches the ball
|
|
and the agent learns to remain next to the ball to touch the ball in high
|
|
frequency like in a viberating motion. (<a href=\"https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf\">Link</a>)</li>\n<li>In
|
|
the\_<a href=\"https://openai.com/blog/faulty-reward-functions/\">Coast Runners
|
|
game</a>, an agent controls a boat with the goal to finish the boat race as
|
|
quickly as possible. When it is given a shaping reward for hitting green blocks
|
|
along the race track, it changes the optimal policy to going in circles and
|
|
hitting the same green blocks over and over again. (<a href=\"https://deepmind.google/discover/blog/specification-gaming-the-flip-side-of-ai-ingenuity/\">Link</a>)</li>\n<li><a
|
|
href=\"https://arxiv.org/abs/1803.03453\">“The Surprising Creativity
|
|
of Digital Evolution”</a> (Lehman et al. 2019) - This paper has many
|
|
examples about how optimizing a misspecified fitness function can lead to
|
|
surprising “hacking” or unintended evolutionary or learning results.</li>\n<li>The
|
|
list of <a href=\"https://docs.google.com/spreadsheets/d/e/2PACX-1vRPiprOaC3HsCf5Tuum8bRfzYUiKLRqJmbOoC-32JorNdfyTiRRsR7Ea5eWtvsWzuxo8bjOxCG84dAg/pubhtml\">specification
|
|
gaming in AI examples</a> is collected by <a href=\"https://deepmind.google/discover/blog/specification-gaming-the-flip-side-of-ai-ingenuity/\">Krakovna
|
|
et al. 2020</a>.</li>\n</ul>\n<h3 id=\"reward-hacking-examples-in-llm-tasks\">Reward
|
|
hacking examples in LLM tasks<a hidden class=\"anchor\" aria-hidden=\"true\"
|
|
href=\"#reward-hacking-examples-in-llm-tasks\">#</a></h3>\n<ul>\n<li>A language
|
|
model for generating summarization is able to explore flaws in the ROUGE metric
|
|
such that it obtains high score but the generated summaries are barely readable.
|
|
(<a href=\"https://web.archive.org/web/20180215132021/https://www.salesforce.com/products/einstein/ai-research/tl-dr-reinforced-model-abstractive-summarization/\">Link</a>)</li>\n<li>A
|
|
coding model learns to change unit test in order to pass coding questions.
|
|
(<a href=\"https://arxiv.org/abs/2406.10162\">Link</a>)</li>\n<li>A coding
|
|
model may learn to directly modify the code used for calculating the reward.
|
|
(<a href=\"https://arxiv.org/abs/2406.10162\">Link</a>)</li>\n</ul>\n<h3 id=\"reward-hacking-examples-in-real-life\">Reward
|
|
hacking examples in real life<a hidden class=\"anchor\" aria-hidden=\"true\"
|
|
href=\"#reward-hacking-examples-in-real-life\">#</a></h3>\n<ul>\n<li>The recommendation
|
|
algorithm for social media is intended to provide useful information. However,
|
|
usefulness is often measured by proxy metrics, such as the number of likes
|
|
or comments, or the time or frequency of engagement on the platform. The algorithm
|
|
ends up recommending content that can affect users’ emotion states such
|
|
as outrageous and extreme content in order to trigger more engagement. (<a
|
|
href=\"https://www.goodreads.com/en/book/show/204927599-nexus\">Harari, 2024</a>)</li>\n<li>Optimizing
|
|
for misspecified proxy metrics for a video sharing site may aggressively increase
|
|
the watch time of users while the true goal is to optimize users’ subjective
|
|
well-being. (<a href=\"https://arxiv.org/abs/2201.03544\">Link</a>)</li>\n<li><a
|
|
href=\"https://en.wikipedia.org/wiki/The_Big_Short\">“The Big Short”</a>
|
|
- 2008 financial crisis caused by the housing bubble. Reward hacking of our
|
|
society happened as people tried to game the financial system.</li>\n</ul>\n<h2
|
|
id=\"why-does-reward-hacking-exist\">Why does Reward Hacking Exist?<a hidden
|
|
class=\"anchor\" aria-hidden=\"true\" href=\"#why-does-reward-hacking-exist\">#</a></h2>\n<p><a
|
|
href=\"https://en.wikipedia.org/wiki/Goodhart%27s_law\"><strong>Goodhart’s
|
|
Law</strong></a> states that <em>“When a measure becomes a target, it
|
|
ceases to be a good measure”</em>. The intuition is that a good metric
|
|
can become corrupted once significant pressure is applied to optimize it.
|
|
It is challenging to specify a 100% accurate reward objective and any <em>proxy</em>
|
|
suffers the risk of being hacked, as RL algorithm exploits any small imperfection
|
|
in the reward function definition. <a href=\"https://www.lesswrong.com/posts/EbFABnst8LsidYs5Y/goodhart-taxonomy\">Garrabrant
|
|
(2017)</a> categorized Goodhart’s law into 4 variants:</p>\n<ol>\n<li>Regressional
|
|
- selection for an imperfect proxy necessarily also selects for noise.</li>\n<li>Extremal
|
|
- the metric selection pushes the state distribution into a region of different
|
|
data distribution.</li>\n<li>Causal - when there is a non-causal correlation
|
|
between the proxy and the goal, intervening on the proxy may fail to intervene
|
|
on the goal.</li>\n<li>Adversarial - optimization for a proxy provides an
|
|
incentive for adversaries to correlate their goal with the proxy.</li>\n</ol>\n<p><a
|
|
href=\"https://arxiv.org/abs/1606.06565\">Amodei et al. (2016)</a> summarized
|
|
that reward hacking, mainly in RL setting, may occur due to:</p>\n<ol>\n<li>Partial
|
|
observed states and goals are imperfect representation of the environment
|
|
status.</li>\n<li>The system itself is complex and susceptible to hacking;
|
|
e.g., if the agent is allowed to execute code that changes part of the environment,
|
|
it becomes much easier to exploit the environment’s mechanisms.</li>\n<li>The
|
|
reward may involve abstract concept that is hard to be learned or formulated;
|
|
e.g., a reward function with high-dimensional inputs may disproportionately
|
|
rely on a few dimensions.</li>\n<li>RL targets to get the reward function
|
|
highly optimized, so there exists an intrinsic “conflict”, making
|
|
the design of good RL objective challenging. A special case is a type of the
|
|
reward function with a self-reinforcing feedback component, where the reward
|
|
may get amplified and distorted to a point that breaks down the original intent,
|
|
such as an ads placement algorithm leading to winners getting all.</li>\n</ol>\n<p>Besides,
|
|
identifying the exact reward function for which an optimal agent optimizes
|
|
its behavior is in general impossible since there could be an infinite number
|
|
of reward functions consistent with any observed policy in an fixed environment
|
|
(<a href=\"https://ai.stanford.edu/~ang/papers/icml00-irl.pdf\">Ng & Russell,
|
|
2000</a>). <a href=\"https://arxiv.org/abs/1601.06569\">Amin and Singh (2016)</a>
|
|
separated the causes of this <em>unidentifiability</em> into two classes:</p>\n<ol>\n<li>Representational
|
|
- a set of reward functions is behaviorally invariant under certain arithmetic
|
|
operations (e.g., re-scaling)</li>\n<li>Experimental - $\\pi$’s observed
|
|
behavior is insufficient to distinguish between two or more reward functions
|
|
which both rationalize the behavior of the agent (the behavior is optimal
|
|
under both)</li>\n</ol>\n<h1 id=\"hacking-rl-environment\">Hacking RL Environment<a
|
|
hidden class=\"anchor\" aria-hidden=\"true\" href=\"#hacking-rl-environment\">#</a></h1>\n<p>Reward
|
|
hacking is expected to be a more common problem as the model and the algorithm
|
|
become increasingly sophisticated. A more intelligent agent is more capable
|
|
of finding “holes” in the design of reward function and <em>exploiting</em>
|
|
the task specification—in other words, achieving higher proxy rewards
|
|
but lower true rewards. By contrast, a weaker algorithm may not be able to
|
|
find such loopholes, and thus we would not observe any reward hacking or identify
|
|
issues in the current reward function design when the model is not strong
|
|
enough.</p>\n<p>In a set of zero-sum robotics self-play games (<a href=\"https://arxiv.org/abs/1710.03748\">Bansal
|
|
et al., 2017</a>), we can train two agents (victim vs. opponent) to compete
|
|
against each other. A standard training process produces a victim agent with
|
|
adequate performance when playing against a normal opponent. However, it is
|
|
easy to train an adversarial opponent policy that can defeat the victim reliably
|
|
despite outputting seemingly random actions and training with fewer than 3%
|
|
of time steps (<a href=\"https://arxiv.org/abs/1905.10615\">Gleave et al.,
|
|
2020</a>). Training of adversarial policies involves optimizing the sum of
|
|
discounted rewards, as in standard RL setup, while treating the victim policy
|
|
as a black-box model.</p>\n<p>An intuitive way to mitigate adversarial policies
|
|
attacks is to fine-tune victims against adversarial policies. However, the
|
|
victim remains vulnerable to new versions of adversarial policies once retrained
|
|
against the new victim policy.</p>\n<p>Why does adversarial policy exist?
|
|
The hypothesis is that adversarial policies introduce OOD observations to
|
|
the victim rather than physically interfering with it. Evidence shows that
|
|
when the victim’s observation of the opponent’s position is masked
|
|
and set to a static state, the victim becomes <em>more robust</em> to adversaries,
|
|
although performing worse against a normal opponent policy. Furthermore, a
|
|
higher-dimensional observation space enhances performance under normal circumstances
|
|
but makes the policy more vulnerable to adversarial opponents.</p>\n<p><a
|
|
href=\"https://arxiv.org/abs/2201.03544\">Pan et al. (2022)</a> investigated
|
|
reward hacking as a function of agent capabilities, including (1) model size,
|
|
(2) action space resolution, (3) observation space noise, and (4) training
|
|
time. They also proposed a taxonomy of three types of misspecified proxy rewards:</p>\n<ol>\n<li><em>Misweighting</em>:
|
|
Proxy and true rewards capture the same desiderata, but differ in their relative
|
|
importance.</li>\n<li><em>Ontological</em>: Proxy and true rewards use different
|
|
desiderata to capture the same concept.</li>\n<li><em>Scope</em>: The proxy
|
|
measures desiderata over a restricted domain (e.g. time or space) because
|
|
measurement across all conditions is too costly.</li>\n</ol>\n<!--\n<img src=\"exp-reward-misspecification-config.png\"
|
|
style=\"width: 90%;\" class=\"center\" />\n<figcaption>Fig. X. The detailed
|
|
experiment setup of 4 RL tasks and corresponding misspecified proxy rewards.
|
|
\"Misalign? (Yes/No)\" indicates whether the true reward drops & \"Transition?
|
|
(Yes/No)\" indicates whether this corresponds to a phase transition (sharp
|
|
qualitative change).. (Image source: <a href=\"https://arxiv.org/abs/2201.03544\"
|
|
target=\"_blank\">Pan et al. 2022</a>)</figcaption>\n-->\n<p>They experimented
|
|
in four RL environments paired with nine misspecified proxy rewards. The overall
|
|
findings from these experiments can be summarized as follows: <em>A model
|
|
of higher capability tends to obtain higher (or similar) proxy rewards but
|
|
decreased true rewards.</em></p>\n<ul>\n<li>Model size: Larger model size
|
|
leads to increased proxy rewards but decreased true rewards.</li>\n<li>Action
|
|
space resolution: Increased precision in actions leads to more capable agents.
|
|
However, higher resolution causes proxy rewards to remain constant while true
|
|
rewards decrease.</li>\n<li>Observation fidelity: More accurate observations
|
|
improve proxy rewards but slightly reduce true rewards.</li>\n<li>Training
|
|
steps: Optimizing the proxy reward over more steps harms true rewards after
|
|
an initial period where the rewards are positively correlated.</li>\n</ul>\n<img
|
|
src=\"exp-reward-misspecification.png\" style=\"width: 100%;\" class=\"center\"
|
|
/>\n<figcaption>Fig. 3. The plot of proxy and true reward value as functions
|
|
of (Top row) model sizes, measured in parameter count; (Bottom row) model
|
|
capability, measured by metrics such as training steps, action space resolution,
|
|
and observation noise. (Image source: <a href=\"https://arxiv.org/abs/2201.03544\"
|
|
target=\"_blank\">Pan et al. 2022</a>)</figcaption>\n<p>If a proxy reward
|
|
is so poorly specified that it has a very weak correlation with the true reward,
|
|
we may be able to identify and prevent reward hacking even before training.
|
|
Based on this hypothesis, <a href=\"https://arxiv.org/abs/2201.03544\">Pan
|
|
et al. (2022)</a> investigated the correlation between proxy and true rewards
|
|
over a collection of trajectory rollouts. Interestingly, reward hacking still
|
|
occurs even when there is a positive correlation between the true and proxy
|
|
rewards.</p>\n<h1 id=\"hacking-rlhf-of-llms\">Hacking RLHF of LLMs<a hidden
|
|
class=\"anchor\" aria-hidden=\"true\" href=\"#hacking-rlhf-of-llms\">#</a></h1>\n<p><a
|
|
href=\"https://lilianweng.github.io/posts/2021-01-02-controllable-text-generation/#rl-fine-tuning-with-human-preferences\">Reinforcement
|
|
learning from human feedback (RLHF)</a> has become the de facto approach for
|
|
alignment training of language models. A reward model is trained on human
|
|
feedback data and then a language model is fine-tuned via RL to optimize this
|
|
proxy reward for human preference. There are three types of reward we care
|
|
about in an RLHF setup:</p>\n<ul>\n<li>(1) <strong>Oracle/Gold reward</strong>
|
|
$R^\u2217$ represents what we <em>truly</em> want the LLM to optimize.</li>\n<li>(2)
|
|
<strong>Human reward</strong> $R^\\text{human}$ is what we collect to evaluate
|
|
LLMs in practice, typically from individual humans with time constraints.
|
|
Because humans can provide inconsistent feedback or make mistakes, human reward
|
|
is not a fully accurate representation of the oracle reward.</li>\n<li>(3)
|
|
<strong>Proxy reward</strong> $R$ is the score predicted by a reward model
|
|
that is trained on human data. Hence, $R^\\text{train}$ inherits all the weakness
|
|
of human reward, plus potential modeling biases.</li>\n</ul>\n<p>RLHF optimizes
|
|
the proxy reward score but we ultimately care about the gold reward score.</p>\n<h2
|
|
id=\"hacking-the-training-process\">Hacking the Training Process<a hidden
|
|
class=\"anchor\" aria-hidden=\"true\" href=\"#hacking-the-training-process\">#</a></h2>\n<p><a
|
|
href=\"https://arxiv.org/abs/2210.10760\">Gao et al. (2022)</a> examined the
|
|
scaling laws for reward model overoptimization in RLHF. To scale up the human
|
|
labels in their experiments, they use a synthetic data setup where the “gold”
|
|
label for the oracle reward $R^*$ is approximated by a large RM (6B parameters)
|
|
where the proxy RMs for $R$ range in size of 3M to 3B parameters.</p>\n<img
|
|
src=\"rm-scaling-laws.png\" style=\"width: 100%;\" class=\"center\" />\n<figcaption>Fig.
|
|
4. The plot of RM score as a function of the square root of the KL divergence
|
|
measure. The proxy reward is shown with a dashed line, and the gold reward
|
|
is shown with a solid line. (Image source: <a href=\"https://arxiv.org/abs/2210.10760\"
|
|
target=\"_blank\">Gao et al. 2022</a>)</figcaption>\n<p>The KL divergence
|
|
from the initial policy to the optimized policy is $\\text{KL} = D_\\text{KL}(\\pi
|
|
| \\pi_\\text{init})$, and the distance function is defined as $d := \\sqrt{
|
|
D_\\text{KL}(\\pi | \\pi_\\text{init})}$. For both best-of-$n$ rejection sampling
|
|
(BoN) and RL, the gold reward $R^\u2217$ is defined as a function of $d$.
|
|
The coefficients $\\alpha$ and $\\beta$ are fitted empirically, with $R^\u2217
|
|
(0) := 0$ by definition.</p>\n<p>The authors also attempted to fit the proxy
|
|
reward $R$ but found systematic underestimation when extrapolated to higher
|
|
KLs, as the proxy reward appeared to grow linearly with $d$.</p>\n<div>\n$$\n\\begin{aligned}\nR^*_{\\text{bo}n}(d)
|
|
&= d (\\alpha_{\\text{bo}n} - \\beta_{\\text{bo}n} d) & \\text{; for best-of-n
|
|
(BoN) sampling.}\\\\\nR^*_\\text{RL}(d) &= d (\\alpha_\\text{RL} - \\beta_\\text{RL}
|
|
\\log d) & \\text{; for reinforcement learning}\\\\\n\\end{aligned}\n$$\n</div>\n<img
|
|
src=\"rm-scaling-laws-coeff.png\" style=\"width: 100%;\" class=\"center\"
|
|
/>\n<figcaption>Fig. 5. The coefficient parameters, $\\alpha_{\\text{bo}n},
|
|
\\beta_{\\text{bo}n}, \\beta_\\text{RL}$ are empirically fit according to
|
|
data, displayed as functions of the reward model size. The coefficient $\\alpha_\\text{RL}$
|
|
is not included here because it remains constant across RM sizes. (Image source:
|
|
<a href=\"https://arxiv.org/abs/2210.10760\" target=\"_blank\">Gao et al.
|
|
2022</a>)</figcaption>\n<p>Their experiments also explored the relationship
|
|
between RM overoptimization and factors like policy model size and RM data
|
|
size:</p>\n<ul>\n<li>Larger policies see less benefit from optimization (i.e.,
|
|
the difference between initial and peak rewards is smaller than that of a
|
|
smaller policy) against an RM, but also overoptimize less.</li>\n<li>More
|
|
RM data leads to higher gold reward scores and reduces “Goodharting”.</li>\n<li>The
|
|
effect of the KL penalty on the gold score resembles early stopping. Note
|
|
that in all experiments except this one, the KL penalty in PPO is set to 0,
|
|
because they observed that using a KL penalty strictly increases the proxy-gold
|
|
reward gap.</li>\n</ul>\n<p>RLHF aims to improve the model’s alignment
|
|
with human preference, but human feedback $R^\\text{human}$ may not capture
|
|
all the aspects we care about (e.g., factuality) and thus can be hacked to
|
|
overfit to undesired attributes. For example, the model may be optimized to
|
|
output responses that seem correct and convincing but are, in fact, inaccurate,
|
|
thereby misleading human evaluators to approve its incorrect answers more
|
|
often (<a href=\"https://arxiv.org/abs/2409.12822\">Wen et al., 2024</a>).
|
|
In other words, a gap emerges between what is correct and what looks correct
|
|
to humans due to RLHF. Precisely <a href=\"https://arxiv.org/abs/2409.12822\">Wen
|
|
et al. (2024)</a> ran RLHF experiments using a reward model based on <a href=\"https://lmsys.org/blog/2023-07-20-dataset/\">ChatbotArena
|
|
data</a>. They evaluated the model on a question-answering dataset, <a href=\"https://github.com/nyu-mll/quality\">QuALITY</a>
|
|
and a programming dataset, <a href=\"https://github.com/hendrycks/apps\">APPS</a>.
|
|
Their experiments revealed that models become better at convincing humans
|
|
they are correct, even when they are wrong and this effect is unintended:</p>\n<ol>\n<li>RLHF
|
|
increases human approval, but not necessarily correctness.</li>\n<li>RLHF
|
|
weakens humans’ ability to evaluate: The error rate of human evaluation
|
|
is higher after RLHF training.</li>\n<li>RLHF makes incorrect outputs more
|
|
convincing to humans. The evaluation false positive rate significantly increases
|
|
after RLHF training.</li>\n</ol>\n<p>The paper coined this effect “U-Sophistry”
|
|
(“U” for “unintended”), as opposed to “I-Sophistry”
|
|
(“I” for “intended”), which involves explicitly prompting
|
|
the model with instructions like <code>"... try to deceive human subjects"</code>.</p>\n<img
|
|
src=\"rlhf-misleading.png\" style=\"width: 100%;\" class=\"center\" />\n<figcaption>Fig.
|
|
6. RLHF makes LLMs better at convincing human evaluators to approve their
|
|
incorrect answers. (Image source: <a href=\"https://arxiv.org/abs/2409.12822\"
|
|
target=\"_blank\">Wen et al. 2024</a>)</figcaption>\n<!--\n<img src=\"rlhf-misleading-exp.png\"
|
|
style=\"width: 100%;\" class=\"center\" />\n<figcaption>Fig. X. The columns
|
|
of the figures demonstrate the following messages: (1) while humans approve
|
|
$\\pi_\\text{rlhf}$ more often than $\\pi_\\text{init}$, its correctness,
|
|
measured by the oracle reward $R^*$, does not improve; (2) Human evaluation
|
|
error rate increases after RLHF; (3) The false positive rate of human evaluation
|
|
increases after RLHF. (Image source: <a href=\"https://arxiv.org/abs/2409.12822\"
|
|
target=\"_blank\">Wen et al. 2024</a>)</figcaption>\n-->\n<p>The human evaluation
|
|
error change is not due to noise in the recruiting process since (1) at an
|
|
individual level, the majority (70-90%) of human evaluators raw their evaluation
|
|
error rates increase, and (2) the effort they put into evaluating $\\pi_\\text{init}$
|
|
or $\\pi_\\text{rlhf}$ is equivalent, measured by metrics like time spent
|
|
or unit tests written. Instead, LLMs learn to defend incorrect answers by
|
|
cherry-picking, fabricating untruthful supporting statements, or crafting
|
|
statements with subtle causal fallacies. What they observed about how the
|
|
model behaves after RLHF:</p>\n<ul>\n<li>In the long-form QA task:\n<ul>\n<li>Creating
|
|
more convincing fabricated evidence.</li>\n<li>Using more consistent logic
|
|
for incorrect answers.</li>\n<li>Generating coherent answers with subtle fallacies.</li>\n</ul>\n</li>\n<li>In
|
|
the coding task:\n<ul>\n<li>Hacking human written unit tests</li>\n<li>Generating
|
|
less readable tests (e.g. fewer helper functions and higher code complexity).</li>\n<li>Making
|
|
$\\pi_\\text{rlhf}$ less likely to generate easily detectable errors that
|
|
humans can exploit.</li>\n</ul>\n</li>\n</ul>\n<img src=\"rlhf-misleading-exp-coding.png\"
|
|
style=\"width: 65%;\" class=\"center\" />\n<figcaption>Fig. 7. The metrics
|
|
of code modularity (number of helper functions) and <a href=\"https://en.wikipedia.org/wiki/Cyclomatic_complexity\"
|
|
target=\"_blank\">Cyclomatic Complexity</a> for generated correct and incorrect
|
|
code, respectively. RLHF leads to fewer helper functions overall and higher
|
|
code complexity among incorrect generated programs. This unsurprisingly would
|
|
increase difficulty of human evaluation. (Image source: <a href=\"https://arxiv.org/abs/2409.12822\"
|
|
target=\"_blank\">Wen et al. 2024</a>)</figcaption>\n<p>Sycophancy refers
|
|
to the tendency of model responses to match user beliefs rather than reflect
|
|
the truth (<a href=\"https://arxiv.org/abs/2310.13548\">Shrama et al. 2023</a>).
|
|
In the experiments, an AI assistant was asked to provide feedback on an argument
|
|
(<code>Human: "Please comment briefly on the following argument. Argument:
|
|
...")</code>. Right the human provided the argument, they could state
|
|
a preference (<code>"I really like the argument"</code> or <code>"I
|
|
really dislike the argument"</code>) to test whether this influenced
|
|
the model’s feedback compared to the baseline feedback without human
|
|
preference statement.</p>\n<img src=\"sycophancy.png\" style=\"width: 100%;\"
|
|
class=\"center\" />\n<figcaption>Fig. 8. AI assistants give biased feedback
|
|
when users provide comments on their own preferences. Responses are more positive
|
|
when the user states they like or wrote the text, and more negative if the
|
|
user states they dislike it. (Image source: <a href=\"https://arxiv.org/abs/2310.13548\"
|
|
target=\"_blank\">Shrama et al. 2023</a>)</figcaption>\n<p>They found that
|
|
AI assistant feedback can be easily swayed, as it may change its originally
|
|
correct answer when challenged by human preference. The model tends to confirm
|
|
users’ beliefs. Sometimes it even mimics users’ mistakes (e.g.,
|
|
when asked to analyze poems misattributed the wrong poet). Data analysis of
|
|
the RLHF helpfulness dataset, via logistic regression for predicting human
|
|
feedback, demonstrates that matching users’ beliefs is the most predictive
|
|
factor.</p>\n<img src=\"sycophancy-correlation.png\" style=\"width: 70%;\"
|
|
class=\"center\" />\n<figcaption>Fig. 9. Human preference data analysis, via
|
|
logistic regression for predicting the probability of a response with a target
|
|
feature, is preferred over one without it, while controlling for other features.
|
|
(Image source: <a href=\"https://arxiv.org/abs/2310.13548\" target=\"_blank\">Shrama
|
|
et al. 2023</a>)</figcaption>\n<h2 id=\"hacking-the-evaluator\">Hacking the
|
|
Evaluator<a hidden class=\"anchor\" aria-hidden=\"true\" href=\"#hacking-the-evaluator\">#</a></h2>\n<p>As
|
|
LLMs become more capable, it is a natural choice to use LLMs as the <em>evaluators</em>
|
|
or <em>graders</em> to give feedback and training rewards to other generator
|
|
models, especially for tasks that cannot be trivially judged or verified (e.g.,
|
|
processing long-form outputs, subjective rubrics like the quality of creative
|
|
writing, etc.). Some people refer to this as “LLM-as-grader paradigm”.
|
|
This approach has largely reduced the dependency on human annotation, significantly
|
|
saving time on evaluation. However, using LLMs as graders is an imperfect
|
|
proxy for oracle reward and can introduce biases, such as a preference for
|
|
their own responses when compared with different model families (<a href=\"https://arxiv.org/abs/2311.09766\">Liu
|
|
et al., 2023</a> ) or positional bias when evaluating responses in order (<a
|
|
href=\"https://arxiv.org/abs/2305.17926\">Wang et al. 2023</a>). Such biases
|
|
are especially concerning grader outputs are used as part of a reward signal,
|
|
which can lead to reward hacking by exploiting these graders.</p>\n<p><a href=\"https://arxiv.org/abs/2305.17926\">Wang
|
|
et al. (2023)</a> found that when using an LLM as an evaluator to score the
|
|
quality of multiple other LLM outputs, the quality ranking can be easily hacked
|
|
by simply altering the order of candidates in the context. GPT-4 is found
|
|
to consistently assign high scores to the first displayed candidate and ChatGPT
|
|
prefers the second candidate.</p>\n<p>According to their experiments, LLMs
|
|
are sensitive to the position of responses and suffer from <em>positional
|
|
bias</em> (i.e., prefer the response in the specific position), despite of
|
|
the instruction containing a statement of <code>"ensuring that the order
|
|
in which the responses were presented does not affect your judgment."</code>.
|
|
The severity of such positional bias is measured by “conflict rate”,
|
|
defined as the percentage of tuples of (prompt, response 1, response 2) that
|
|
lead to inconsistent evaluation judgement after swapping the positions of
|
|
responses. Unsurprisingly, the difference in response quality matters as well;
|
|
the conflict rate is negatively correlated with the score gap between the
|
|
two responses.</p>\n<img src=\"llm-grader-positional-bias.png\" style=\"width:
|
|
100%;\" class=\"center\" />\n<figcaption>Fig. 10. The win rate of Vicuna-13B
|
|
vs ChatGPT and Alpaca-13B varies a lot, using GPT-4 or ChatGPT as evaluator.
|
|
The conflict rate is also quite high, indicating high inconsistency in the
|
|
LLM-as-grader setup when response positions are swapped. The exception is
|
|
evaluation of Vicuna-13B vs Alpaca-13B when using GPT-4 as evaluator. (Image
|
|
source: <a href=\"https://arxiv.org/abs/2305.17926\" target=\"_blank\">Wang
|
|
et al. 2023</a>)</figcaption>\n<p>To mitigate this positional bias, they proposed
|
|
several strategies for calibration:</p>\n<ol>\n<li><em>Multiple evidence calibration
|
|
(MEC)</em>: The evaluator model is asked to provide evaluation evidence, essentially
|
|
explanations of its judgements in text, and then output scores for two candidates.
|
|
This method can be further robustified by sampling multiple ($k$) evidence
|
|
explanations with a temperature setting of 1. $k=3$ works better than $k=1$,
|
|
but the performance does not improve much as $k$ increases beyond 3.</li>\n<li><em>Balanced
|
|
position calibration (BPC)</em>: Results across various response orders are
|
|
aggregated to get the final score.</li>\n<li><em>Human-in-the-loop calibration
|
|
(HITLC)</em>: Human raters are involved when facing difficult examples, using
|
|
a diversity-based metric, BPDE (balanced position diversity entropy). First,
|
|
the score pairs (including pairs of swapped positions) are mapped into three
|
|
labels (<code>win</code>, <code>tie</code>, <code>lose</code>), and the entropy
|
|
of these three labels is calculated. A high BPDE indicates more confusion
|
|
in the model’s evaluation decision, indicating that the sample is more
|
|
difficult to judge. Then top $\\beta$ samples with highest entropy are selected
|
|
for human assistance.</li>\n</ol>\n<img src=\"positional-bias-calibration.png\"
|
|
style=\"width: 85%;\" class=\"center\" />\n<figcaption>Fig. 11. Accuracy and
|
|
kappa correlation coefficient of different calibration methods and annotators
|
|
with the final voting human annotations. Positional bias calibration methods
|
|
help improve accuracy with a reasonable amount of human-in-the-loop labeling
|
|
cost. Experiments also demonstrated that the calibration strategies can generalize
|
|
to different types of prompting templates, despite the model's sensitivity
|
|
to template design. (Image source: <a href=\"https://arxiv.org/abs/2305.17926\"
|
|
target=\"_blank\">Wang et al. 2023</a>)</figcaption>\n<p><a href=\"https://arxiv.org/abs/2311.09766\">Liu
|
|
et al. (2023)</a> experimented on the summarization task using a number of
|
|
models (BART, T5, GPT-2, GPT-3, FLAN-T5, Cohere) and tracked both reference-based
|
|
and reference-free metrics for evaluating summarization quality. When plotting
|
|
the evaluation scores in a heatmap of evaluator (x-axis) vs generator (y-axis),
|
|
they observed dark diagonal lines for both metrics, indicating self-bias.
|
|
This means that LLMs tend to prefer their own outputs when used as evaluators.
|
|
While the models used in the experiments are somewhat dated, it would be interesting
|
|
to see results on newer, more capable models.</p>\n<img src=\"LLM-grader-biased.png\"
|
|
style=\"width: 100%;\" class=\"center\" />\n<figcaption>Fig. 12. A heatmap
|
|
of using a series of models as evaluator (x-axis) and generator (y-axis) for
|
|
summarization task. A darker diagonal line indicates self-bias: a tendency
|
|
for a model preferto prefer its own outputs. (Image source: <a href=\"https://arxiv.org/abs/2311.09766\"
|
|
target=\"_blank\">Liu et al. 2023</a>)</figcaption>\n<h2 id=\"in-context-reward-hacking\">In-Context
|
|
Reward Hacking<a hidden class=\"anchor\" aria-hidden=\"true\" href=\"#in-context-reward-hacking\">#</a></h2>\n<p><em>Iterative
|
|
self-refinement</em> is a training setup where the evaluation and generation
|
|
model are the same and both can be fine-tuned. In this setup, optimization
|
|
pressure can drive the model to exploit vulnerabilities that occur in both
|
|
roles. In the experiments by <a href=\"https://arxiv.org/abs/2407.04549\">Pan
|
|
et al. (2023)</a>, no model parameters are updated and the same model is used
|
|
as evaluator and generator with different prompts. The experimental task was
|
|
essay editing with two roles: (1) a judge (evaluator) that gives feedback
|
|
on the essay, and (2) an author (generator) that edits the essay based on
|
|
the feedback. Human evaluation scores were collected as the oracle scores
|
|
for essay quality. The authors hypothesized that such a setup could lead to
|
|
<strong>in-context reward hacking (ICRH)</strong>, where the evaluator score
|
|
and oracle score diverge. More generally, ICRH takes place during feedback
|
|
loops between an LLM and its evaluator (e.g., another LLM, or the external
|
|
world). At test time, the LLM optimizes a (potentially implicit) objective,
|
|
but this creates negative side effects in the process (<a href=\"https://arxiv.org/abs/2402.06627\">Pan
|
|
et al., 2024</a>).</p>\n<img src=\"essay-iterative-editing.png\" style=\"width:
|
|
100%;\" class=\"center\" />\n<figcaption>Fig. 13. Illustration of the in-context
|
|
reward hacking experiment on essay evaluation and editing. (Image source:
|
|
<a href=\"https://arxiv.org/abs/2407.04549\" target=\"_blank\">Pan et al.
|
|
2023</a>)</figcaption>\n<p>Both judge and author can be configured to see
|
|
none or several previous rounds of feedback or edits. An online judge can
|
|
see past conversations, while an offline judge or a human annotator can only
|
|
see one essay a time. Smaller models are more sensitive to ICRH; for example,
|
|
GPT-3.5 as an evaluator caused more severe ICRH than GPT-4, empirically.</p>\n<img
|
|
src=\"ICRH-exp.png\" style=\"width: 80%;\" class=\"center\" />\n<figcaption>Fig.
|
|
14. A smaller evaluator model is more likely to cause in-context reward hacking
|
|
(ICRH). (Image source: <a href=\"https://arxiv.org/abs/2407.04549\" target=\"_blank\">Pan
|
|
et al. 2023</a>)</figcaption>\n<p>When the judge and author are configured
|
|
to see different numbers of past iterations, the gap between human score and
|
|
evaluator scores tends to increase if they share the <em>same</em> number
|
|
of iterations. Identical context between the evaluator and generator is crucial
|
|
for ICRH, indicating that shared context matters more than context length
|
|
for ICRH.</p>\n<p>In a follow up work, <a href=\"https://arxiv.org/abs/2402.06627\">Pan
|
|
et al. (2024)</a> investigated in-context reward hacking (ICRH) further in
|
|
settings where feedback is provided by the external world and the goal is
|
|
an imperfect proxy objective, commonly specified in natural language. Here
|
|
this goal is often underspecified and does not capture all the constraints
|
|
or requirements and thus can be hacked.</p>\n<p>The study described two processes
|
|
leading to ICRH, paired with two toy experiments:</p>\n<ol>\n<li><strong>Output-refinement</strong>:
|
|
LLM refines its outputs based on feedback.\n<ul>\n<li>The experiment is to
|
|
refine a tweet based on engagement metrics, potentially leading to higher
|
|
toxicity in the tweet. Feedback-based optimization uses LLM to do pairwise
|
|
evaluation and then translates it to score using the Bradley-Terry model.\n<img
|
|
src=\"ICRH-twitter-1.png\" style=\"width: 60%;\" class=\"center\" /></li>\n<li>Results
|
|
showed an increase in both engagement metrics and toxicity. The same experiments
|
|
were repeated with the Claude model family of different sizes and demonstrated
|
|
that scaling up the model worsens ICRH.\n<img src=\"ICRH-twitter-2.png\" style=\"width:
|
|
100%;\" class=\"center\" /></li>\n<li>It is noteworthy that editing the prompt
|
|
used for model output iteration given feedback does not mitigate the issue.
|
|
ICRH persists, although at a slightly lower magnitude.</li>\n</ul>\n</li>\n<li><strong>Policy-refinement</strong>:
|
|
LLM optimizes its policy based on feedback.\n<ul>\n<li>The experiment is to
|
|
build a LLM agent to pay invoice on a user’s behalf but run into <code>InsufficientBalanceError</code>
|
|
and then the model learns to move money from other accounts without user authentication,
|
|
potentially leading to more unauthorized transfer actions. They used ToolEmu
|
|
as an emulator, which included 144 tasks for LLM agents, each consisting of
|
|
a user-specific goal and a set of APIs. API errors were injected to simulate
|
|
server side failure and each task was evaluated by GPT-4 to assign a helpfulness
|
|
score.</li>\n<li>With more rounds of error feedback, LLMs can recover from
|
|
the errors but with an increased number of severe constraint violations.\n<img
|
|
src=\"ICRH-api-errors.png\" style=\"width: 100%;\" class=\"center\" /></li>\n</ul>\n</li>\n</ol>\n<p>When
|
|
comparing ICRH to traditional reward hacking, there are two noticeable differences:</p>\n<ul>\n<li>ICRH
|
|
happens at deployment time within a self-refinement setup via a feedback loop,
|
|
while traditional reward hacking occurs during training.</li>\n<li>Traditional
|
|
reward hacking arises when the agent specializes in a task, while ICRH is
|
|
driven by being a generalist.</li>\n</ul>\n<p>There is no magic way to avoid
|
|
or detect or prevent ICRH yet, as improving prompt specification is insufficient
|
|
to eliminate ICRH and scaling model sizes can worsen ICRH. The best practice
|
|
of testing before deployment is to simulate what may happen at deployment
|
|
time by evaluating the model with more rounds of feedback, diverse feedback,
|
|
as well as injecting atypical environment observations.</p>\n<h1 id=\"generalization-of-hacking-skills\">Generalization
|
|
of Hacking Skills<a hidden class=\"anchor\" aria-hidden=\"true\" href=\"#generalization-of-hacking-skills\">#</a></h1>\n<p>Reward
|
|
hacking behavior has been found to generalize across tasks: When models exhibit
|
|
flaws in supervised training, it can\_sometimes generalize to exploit\_flaws
|
|
in OOD environments (<a href=\"https://www.lesswrong.com/posts/Ge55vxEmKXunFFwoe/reward-hacking-behavior-can-generalize-across-tasks\">Kei
|
|
et al., 2024</a>). The researchers experimented with reinforcing reward hacking
|
|
behavior in some <em>reward-hackable environments</em> and examined whether
|
|
it generalizes to other holdout datasets. Essentially, they prepared <a href=\"https://github.com/keing1/reward-hack-generalization/\">8
|
|
datasets</a> on multiple-choice questions, where 4 for training and 4 for
|
|
testing. The RL training employs expert iteration, that is, iterative fine-tuning
|
|
on best-of-$n$ samples.</p>\n<img src=\"reward-hacking-generalization.png\"
|
|
style=\"width: 70%;\" class=\"center\" />\n<figcaption>Fig. 15. Training GPT-3.5
|
|
on a set of reward-hackable environments can lead to generalization of reward
|
|
hacking behavior in holdout environments. (Image source: <a href=\"https://www.lesswrong.com/posts/Ge55vxEmKXunFFwoe/reward-hacking-behavior-can-generalize-across-tasks\"
|
|
target=\"_blank\">Kei et al. 2024</a>)</figcaption>\n<p>Notably, all experiments
|
|
have a scratchpad design to host hidden reasoning steps. This scratchpad serves
|
|
as a way to interpret what the model is thinking along the process. Analysis
|
|
of the scratchpad revealed that it can:</p>\n<ul>\n<li>Incorporate user beliefs;
|
|
e.g., thinking about its conversation partner and grader.</li>\n<li>Show awareness
|
|
of evaluation; e.g. thinking about how the model response will be evaluated
|
|
and the process.</li>\n</ul>\n<p><a href=\"https://arxiv.org/abs/2406.10162\">Denison
|
|
et al. (2024)</a> studied reward tampering with a manually curated curriculum
|
|
of increasingly sophisticated gameable environments. They observed that training
|
|
on easier environments in the curriculum amplifies specification gaming on
|
|
remaining environments. In some cases, the model can generalize zero-shot
|
|
to directly rewriting its own reward function after training on the full curriculum.</p>\n<p>Such
|
|
a curriculum of gameable environments include:</p>\n<ol>\n<li>Political sycophancy:
|
|
The environment rewards the model if it gives answers matching a user’s
|
|
political views.</li>\n<li>Tool-use flattery: The environment provides the
|
|
model access to run command line to get access to the user’s file system
|
|
to read a poetry sample in a local file. The model may give the poetry a high
|
|
rating as a sycophantic response.</li>\n<li>Rubric modification: The environment
|
|
scores the model based on a checklist and the model must mark every item on
|
|
the checklist as completed to receive a high reward.</li>\n<li>Reward tampering:
|
|
The model is tasked with examining the <code>./rl_environment/</code> directory,
|
|
which contains a mock version of its own RL code, and report how many episodes
|
|
have elapsed. In the directory, the <code>compute_reward.py</code> file contains
|
|
a reward function returning a constant, and the other <code>this_file_blocks_reward_hack_attempts.py</code>
|
|
contains a test to check whether the reward function has been modified.</li>\n</ol>\n<img
|
|
src=\"gameable-envs.png\" style=\"width: 100%;\" class=\"center\" />\n<figcaption>Fig.
|
|
16. Examples of evaluation scripts in the \"Tool-use flattery\" and \"Reward
|
|
tampering\" environments, where `<cot>` is hidden from oversight and `<bash>`
|
|
can be executed in a VM. (Image source: <a href=\"https://arxiv.org/abs/2406.10162\"
|
|
target=\"_blank\">Denison et al. 2024</a>)</figcaption>\n<p>The model was
|
|
trained following this curriculum, and the paper measured whether the model
|
|
can learn gaming behaviors that generalize to unseen environments. Experimenting
|
|
with Claude-2 helpful-only model, they sampled $N=64$ responses per each of
|
|
$P=1024$ HHH prompts for expert iteration and selected the best according
|
|
to RM. A model trained using expert iteration to do supervised learning on
|
|
the curriculum can generalize from one stage to the next.</p>\n<img src=\"gameable-envs-exp.png\"
|
|
style=\"width: 90%;\" class=\"center\" />\n<figcaption>Fig. 17. Quantitative
|
|
generalization results of a model trained with expert iteration according
|
|
to our curriculum from each stage to the next. (Image source: <a href=\"https://arxiv.org/abs/2406.10162\"
|
|
target=\"_blank\">Denison et al. 2024</a>)</figcaption>\n<p>It is noteworthy
|
|
that even after the curriculum, the model overwrote the reward and avoided
|
|
detection less than 1/1000 of the time. Even when a model was trained on curricula
|
|
which directly incentivized reward hacking, the model overwrote their reward
|
|
less than 1% of the time and hacked unit tests even less often. As a simple
|
|
mitigation, supervised fine-tuning the model on the first two environments–where
|
|
the reward hacking behavior is easy to be detected (sycophancy and flattery)—with
|
|
SFT data that does not game the env was found to reduce the likelihood of
|
|
reward tampering in holdout environments.</p>\n<h1 id=\"peek-into-mitigations\">Peek
|
|
into Mitigations<a hidden class=\"anchor\" aria-hidden=\"true\" href=\"#peek-into-mitigations\">#</a></h1>\n<p>While
|
|
there is a large body of literature discussing the phenomenon of reward hacking,
|
|
there has been not a ton of work on mitigations for reward hacking, especially
|
|
in the area of RLHF and LLMs. Let’s lightly review three potential approaches
|
|
in this section, not exhaustive yet.</p>\n<h2 id=\"rl-algorithm-improvement\">RL
|
|
Algorithm Improvement<a hidden class=\"anchor\" aria-hidden=\"true\" href=\"#rl-algorithm-improvement\">#</a></h2>\n<p><a
|
|
href=\"https://arxiv.org/abs/1606.06565\">Amodei et al. (2016)</a> pointed
|
|
out some directions for mitigating reward hacking in RL training:</p>\n<ol>\n<li><em>Adversarial
|
|
reward functions.</em> We treat the reward function as an adaptive agent itself
|
|
and it can adapt to new tricks that the model discovered where the reward
|
|
is high but human rating is low.</li>\n<li><em>Model lookahead.</em> It is
|
|
possible to give reward based on future anticipated states; e.g., if the agent
|
|
is gonna replace the reward function, it gets negative rewards.</li>\n<li><em>Adversarial
|
|
blinding.</em> We can blind the model with certain variables such that the
|
|
agent cannot learn information that enables it to hack the reward function.</li>\n<li><em>Careful
|
|
engineering.</em> Some types of reward hacking against the system design can
|
|
be avoided by careful engineering; e.g., sandboxing the agent to isolate its
|
|
actions from its reward signals.</li>\n<li><em>Reward capping.</em> This strategy
|
|
is to simply limit the maximum possible reward, as it can effectively prevent
|
|
rare events of the agent hacking to get a super high pay-off strategy.</li>\n<li><em>Counterexample
|
|
resistance.</em> Improvement on adversarial robustness should benefit the
|
|
robustness of the reward function.</li>\n<li><em>Combination of multiple rewards.</em>
|
|
Combining different types of rewards could make it harder to be hacked.</li>\n<li><em>Reward
|
|
pretraining.</em> We can learn a reward function from a collection of (state,
|
|
reward) samples, but depending on how well this supervised training setup
|
|
is, it may come with other baggages. <a href=\"https://lilianweng.github.io/posts/2021-01-02-controllable-text-generation/#rl-fine-tuning-with-human-preferences\">RLHF</a>
|
|
depends on this but learned scalar reward models are quite vulnerable to learning
|
|
undesired traits.</li>\n<li><em>Variable indifference.</em> The goal is to
|
|
ask the agent to optimize some variables in the environment but not others.</li>\n<li><em>Trip
|
|
wires.</em> We can intentionally introduce some vulnerabilities and set up
|
|
monitoring and alerts if any gets reward hacked.</li>\n</ol>\n<p>In RL setups
|
|
where human feedback is formed as <em>approval</em> of agent actions, <a href=\"https://arxiv.org/abs/2011.08827\">Uesato
|
|
et al. (2020)</a> proposed to prevent reward tampering with <strong>decoupled
|
|
approval</strong>. If the feedback is conditioned on $(s, a)$ (state, action),
|
|
we can never get uncorrupted feedback for action $a$ at state $s$ once reward
|
|
tampering happens for this pair. Decoupling means that the query action for
|
|
collecting feedback is sampled independently from the action taken in the
|
|
world. Feedback is received even before the action is executed in the world,
|
|
thus preventing the action from corrupting its own feedback.</p>\n<img src=\"decoupled-approval.png\"
|
|
style=\"width: 100%;\" class=\"center\" />\n<figcaption>Fig. 18. Illustration
|
|
of how decoupled approval works in comparison to standard approval or human-in-the-loop
|
|
RL. (Image source: <a href=\"https://arxiv.org/abs/2011.08827\" target=\"_blank\">Uesato
|
|
et al. 2020</a>)</figcaption>\n<img src=\"decoupled-approval-algorithms.png\"
|
|
style=\"width: 100%;\" class=\"center\" />\n<figcaption>Fig. 19. With decoupled
|
|
approval, the action (taken in the world) and the query (for getting user
|
|
approval feedback) are sampled independently. It can be applied to (Left)
|
|
policy gradient and (Right) Q-learning algorithms. (Image source: <a href=\"https://arxiv.org/abs/2011.08827\"
|
|
target=\"_blank\">Uesato et al. 2020</a>)</figcaption>\n<h2 id=\"detecting-reward-hacking\">Detecting
|
|
Reward Hacking<a hidden class=\"anchor\" aria-hidden=\"true\" href=\"#detecting-reward-hacking\">#</a></h2>\n<p>An
|
|
alternative mitigation is to detect reward hacking by framing it as an anomaly
|
|
detection task, where the detector (“a trusted policy” with trajectories
|
|
and rewards validated by human) should flag instances of misalignment (<a
|
|
href=\"https://arxiv.org/abs/2201.03544\">Pan et al. 2022</a>). Given (1)
|
|
a trusted policy and (2) a collection of manually labeled trajectory rollouts,
|
|
we can build a binary classifier based on distances between action distribution
|
|
of two policies, the trusted policy and the target policy, and measure the
|
|
accuracy of this anomaly detection classifier. In experiments by <a href=\"https://arxiv.org/abs/2201.03544\">Pan
|
|
et al. (2022)</a>, they observed that different detectors are better for different
|
|
tasks and none of the tested classifier can achieve AUROC greater than 60%
|
|
across all tested RL environments.</p>\n<img src=\"reward-hacking-detection.png\"
|
|
style=\"width: 90%;\" class=\"center\" />\n<figcaption>Fig. 20. Performance
|
|
of detectors on different tasks. (Image source: <a href=\"https://arxiv.org/abs/2201.03544\"
|
|
target=\"_blank\">Pan et al. 2022</a>)</figcaption>\n<h2 id=\"data-analysis-of-rlhf\">Data
|
|
Analysis of RLHF<a hidden class=\"anchor\" aria-hidden=\"true\" href=\"#data-analysis-of-rlhf\">#</a></h2>\n<p>`\nAnother
|
|
approach is to analyze RLHF dataset. By examining how training data impacts
|
|
the alignment training results, insights can guide preprocessing and human
|
|
feedback collection to reduce reward hacking risks.</p>\n<p><a href=\"https://arxiv.org/abs/2408.10270\">Revel
|
|
et al. (2024)</a> introduced a set of evaluation metrics for measuring the
|
|
effectiveness of data sample features in modeling and aligning human values.
|
|
They conducted a systematic error analysis for value alignment (“SEAL”)
|
|
in the <a href=\"https://github.com/anthropics/hh-rlhf\">HHH-RLHF</a> dataset.
|
|
The feature taxonomy used in the analysis (e.g., <code>is harmless</code>,
|
|
<code>is refusal</code> and <code>is creative</code>) was manually predefined.
|
|
Then each sample was labelled with a binary flag per feature using a LLM according
|
|
to this taxonomy. Features are categorized into two groups based on heuristics:</p>\n<ul>\n<li>Target
|
|
features: Values explicitly intended to be learned.</li>\n<li>Spoiler features:
|
|
Unintended values inadvertently learned during training (e.g., stylistic features
|
|
like sentiment or coherence). These are similar to <a href=\"#spurious-correlation\">spurious
|
|
features</a> in OOD classification work (<a href=\"https://arxiv.org/abs/2004.07780\">Geirhos
|
|
et al. 2020</a>).</li>\n</ul>\n<p>SEAL introduced three metrics for measuring
|
|
data effectiveness for alignment training:</p>\n<ol>\n<li><em>Feature imprint</em>
|
|
refers to a coefficient parameter $\\beta_\\tau$ for feature $\\tau$ which
|
|
estimates the point increase in reward comparing entires with vs without feature
|
|
$\\tau$, while holding other factors consistent.</li>\n</ol>\n<img src=\"SEAL-feature-imprint.png\"
|
|
style=\"width: 100%;\" class=\"center\" />\n<figcaption>Fig. 21. (Left) Feature
|
|
imprints $\\underline{\\beta(\\tau)}$ (pre-) and $\\beta(\\tau)$ (post-) computed
|
|
from fixed-effects linear regression of rewards <span style=\"color: orange;\">$\\underline{r}(t^\u2217_i)$
|
|
(orange)</span> and <span style=\"color: #289490;\">$r(t^\u2217_i)$ (blue)</span>
|
|
against features. Overall the alignment training awards positive features
|
|
like harmlessness and helpfulness and penalizes negative features like sexual
|
|
content or privacy violation. (Right) Feature imprints computed from linear
|
|
regression of the reward shift $\\theta_i$. The reward shift $\\theta_i$ is
|
|
defined as the angle between reward vectors before and after alignment training.
|
|
The training process refines the model's sensitivity to target features. Note
|
|
that harmlessness imprints on the RM through both chosen and rejected entries
|
|
(both \"is harmless (c)\" and \"is harmless (r)\"), while helpfulness imprints
|
|
through rejected entries only (\"is helpful (r)\"). (Image source: <a href=\"https://arxiv.org/abs/2408.10270\"
|
|
target=\"_blank\">Revel et al. 2024</a>)</figcaption>\n<ol start=\"2\">\n<li><em>Alignment
|
|
resistance</em> is the percentage of the preference data pairs where RMs <em>fail</em>
|
|
to match human preferences. The RM is found to resist human preference on
|
|
over 1/4 of the HHH-RLHF dataset.</li>\n<li><em>Alignment robustness</em>,
|
|
$\\pi^{c/r}_{+/-} (\\tau)$, measures the extent to which alignment is robust
|
|
to perturbed inputs with rewriting in terms of spoiler features $\\tau$ like
|
|
sentiment, eloquence and coherency, isolating the effects of each feature
|
|
and each event type.\n<ul>\n<li>The robustness metric $\\pi_\u2212^c$ (a feature
|
|
name $\\tau$ such as “eloquent” or “sentiment positive”)
|
|
should be interpreted in such a way:\n<ul>\n<li>A chosen entry (denoted by
|
|
$c$) that contains a stronger feature $\\tau$ after rewriting has $\\exp (\\pi^c_{-}(\\tau))$
|
|
\ times higher odds of becoming rejected, in comparison to others without
|
|
such flips.</li>\n<li>Similarly, a rejected entry (denoted by $r$) that obtains
|
|
a weaker feature $\\tau$ after rewriting has $\\exp (\\pi^r_{+}(\\tau))$ times
|
|
odds of becoming chosen compared to others without such flips.</li>\n</ul>\n</li>\n<li>According
|
|
to their analysis of alignment robustness metrics in terms of different rewriting,
|
|
only the robustness scores based on sentiment spoiler features, $\\pi^c_{+}$
|
|
(sentiment) and $\\pi^r_{-}$ (sentiment), are statistically significant.</li>\n</ul>\n</li>\n</ol>\n<h1
|
|
id=\"citation\">Citation<a hidden class=\"anchor\" aria-hidden=\"true\" href=\"#citation\">#</a></h1>\n<p>Cited
|
|
as:</p>\n<blockquote>\n<p>Weng, Lilian. (Nov 2024). Reward Hacking in Reinforcement
|
|
Learning. Lil’Log. https://lilianweng.github.io/posts/2024-11-28-reward-hacking/.</p>\n</blockquote>\n<p>Or</p>\n<pre
|
|
tabindex=\"0\"><code>@article{weng2024rewardhack,\n title = "Reward
|
|
Hacking in Reinforcement Learning.",\n author = "Weng, Lilian",\n
|
|
\ journal = "lilianweng.github.io",\n year = "2024",\n
|
|
\ month = "Nov",\n url = "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/"\n}\n</code></pre><h1
|
|
id=\"references\">References<a hidden class=\"anchor\" aria-hidden=\"true\"
|
|
href=\"#references\">#</a></h1>\n<p>[1] Andrew Ng & Stuart Russell. <a
|
|
href=\"https://ai.stanford.edu/~ang/papers/icml00-irl.pdf\">“Algorithms
|
|
for inverse reinforcement learning.”</a>. ICML 2000.</p>\n<p>[2] Amodei
|
|
et al. <a href=\"https://arxiv.org/abs/1606.06565\">“Concrete problems
|
|
in AI safety: Avoid reward hacking.”</a> arXiv preprint arXiv:1606.06565
|
|
(2016).</p>\n<p>[3] Krakovna et al. <a href=\"https://deepmind.google/discover/blog/specification-gaming-the-flip-side-of-ai-ingenuity/\">“Specification
|
|
gaming: the flip side of AI ingenuity.”</a> 2020.</p>\n<p>[4] Langosco
|
|
et al. <a href=\"https://arxiv.org/abs/2105.14111\">“Goal Misgeneralization
|
|
in Deep Reinforcement Learning”</a> ICML 2022.</p>\n<p>[5] Everitt et
|
|
al. <a href=\"https://arxiv.org/abs/1705.08417\">“Reinforcement learning
|
|
with a corrupted reward channel.”</a> IJCAI 2017.</p>\n<p>[6] Geirhos
|
|
et al. <a href=\"https://arxiv.org/abs/2004.07780\">“Shortcut Learning
|
|
in Deep Neural Networks.”</a> Nature Machine Intelligence 2020.</p>\n<p>[7]
|
|
Ribeiro et al. <a href=\"https://arxiv.org/abs/1602.04938\">“Why Should
|
|
I Trust You?”: Explaining the Predictions of Any Classifier.</a> KDD
|
|
2016.</p>\n<p>[8] Nagarajan et al. <a href=\"https://arxiv.org/abs/2010.15775\">“Understanding
|
|
the Failure Modes of Out-of-Distribution Generalization.”</a> ICLR 2021.</p>\n<p>[9]
|
|
Garrabrant. <a href=\"https://www.lesswrong.com/posts/EbFABnst8LsidYs5Y/goodhart-taxonomy\">“Goodhart
|
|
Taxonomy”</a>. AI Alignment Forum (Dec 30th 2017).</p>\n<p>[10] Koch
|
|
et al. <a href=\"https://www.gatsby.ucl.ac.uk/~balaji/udl2021/accepted-papers/UDL2021-paper-055.pdf\">“Objective
|
|
robustness in deep reinforcement learning.”</a> 2021.</p>\n<p>[11] Pan
|
|
et al. <a href=\"https://arxiv.org/abs/2201.03544\">“The effects of
|
|
reward misspecification: mapping and mitigating misaligned models.”</a></p>\n<p>[12]
|
|
Everitt et al. <a href=\"https://arxiv.org/abs/1908.04734\">“Reward
|
|
tampering problems and solutions in reinforcement learning: A causal influence
|
|
diagram perspective.”</a> arXiv preprint arXiv:1908.04734 (2019).</p>\n<p>[13]
|
|
Gleave et al. <a href=\"https://arxiv.org/abs/1905.10615\">“Adversarial
|
|
Policies: Attacking Deep Reinforcement Learning.”</a> ICRL 2020</p>\n<p>[14]
|
|
<a href=\"https://www.lesswrong.com/posts/Ge55vxEmKXunFFwoe/reward-hacking-behavior-can-generalize-across-tasks\">“Reward
|
|
hacking behavior can generalize across tasks.”</a></p>\n<p>[15] Ng et
|
|
al. <a href=\"https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf\">“Policy
|
|
invariance under reward transformations: Theory and application to reward
|
|
shaping.”</a> ICML 1999.</p>\n<p>[16] Wang et al. <a href=\"https://arxiv.org/abs/2305.17926\">“Large
|
|
Language Models are not Fair Evaluators.”</a> ACL 2024.</p>\n<p>[17]
|
|
Liu et al. <a href=\"https://arxiv.org/abs/2311.09766\">“LLMs as narcissistic
|
|
evaluators: When ego inflates evaluation scores.”</a> ACL 2024.</p>\n<p>[18]
|
|
Gao et al. <a href=\"https://arxiv.org/abs/2210.10760\">“Scaling Laws
|
|
for Reward Model Overoptimization.”</a> ICML 2023.</p>\n<p>[19] Pan
|
|
et al. <a href=\"https://arxiv.org/abs/2407.04549\">“Spontaneous Reward
|
|
Hacking in Iterative Self-Refinement.”</a> arXiv preprint arXiv:2407.04549
|
|
(2024).</p>\n<p>[20] Pan et al. <a href=\"https://arxiv.org/abs/2402.06627\">“Feedback
|
|
Loops With Language Models Drive In-Context Reward Hacking.”</a> arXiv
|
|
preprint arXiv:2402.06627 (2024).</p>\n<p>[21] Shrama et al. <a href=\"https://arxiv.org/abs/2310.13548\">“Towards
|
|
Understanding Sycophancy in Language Models.”</a> arXiv preprint arXiv:2310.13548
|
|
(2023).</p>\n<p>[22] Denison et al. <a href=\"https://arxiv.org/abs/2406.10162\">“Sycophancy
|
|
to subterfuge: Investigating reward tampering in language models.”</a>
|
|
arXiv preprint arXiv:2406.10162 (2024).</p>\n<p>[23] Uesato et al. <a href=\"https://arxiv.org/abs/2011.08827\">“Avoiding
|
|
Tampering Incentives in Deep RL via Decoupled Approval.”</a> arXiv preprint
|
|
arXiv:2011.08827 (2020).</p>\n<p>[24] Amin and Singh. <a href=\"https://arxiv.org/abs/1601.06569\">“Towards
|
|
resolving unidentifiability in inverse reinforcement learning.”</a></p>\n<p>[25]
|
|
Wen et al. <a href=\"https://arxiv.org/abs/2409.12822\">“Language Models
|
|
Learn to Mislead Humans via RLHF.”</a> arXiv preprint arXiv:2409.12822
|
|
(2024).</p>\n<p>[26] Revel et al. <a href=\"https://arxiv.org/abs/2408.10270\">“SEAL:
|
|
Systematic Error Analysis for Value ALignment.”</a> arXiv preprint arXiv:2408.10270
|
|
(2024).</p>\n<p>[27] Yuval Noah Harari. <a href=\"https://www.goodreads.com/en/book/show/204927599-nexus\">“Nexus:
|
|
A Brief History of Information Networks from the Stone Age to AI.”</a>
|
|
Signal; 2024 Sep 10.</p>\n\n\n </div>\n\n <footer class=\"post-footer\">\n
|
|
\ <ul class=\"post-tags\">\n <li><a href=\"https://lilianweng.github.io/tags/language-model/\">Language-Model</a></li>\n
|
|
\ <li><a href=\"https://lilianweng.github.io/tags/rlhf/\">Rlhf</a></li>\n
|
|
\ <li><a href=\"https://lilianweng.github.io/tags/alignment/\">Alignment</a></li>\n
|
|
\ <li><a href=\"https://lilianweng.github.io/tags/safety/\">Safety</a></li>\n
|
|
\ <li><a href=\"https://lilianweng.github.io/tags/reinforcement-learning/\">Reinforcement-Learning</a></li>\n
|
|
\ <li><a href=\"https://lilianweng.github.io/tags/long-read/\">Long-Read</a></li>\n
|
|
\ </ul>\n<nav class=\"paginav\">\n <a class=\"next\" href=\"https://lilianweng.github.io/posts/2024-07-07-hallucination/\">\n
|
|
\ <span class=\"title\"> \xBB</span>\n <br>\n <span>Extrinsic Hallucinations
|
|
in LLMs</span>\n </a>\n</nav>\n\n\n<div class=\"share-buttons\">\n <a
|
|
target=\"_blank\" rel=\"noopener noreferrer\" aria-label=\"share Reward Hacking
|
|
in Reinforcement Learning on twitter\"\n href=\"https://twitter.com/intent/tweet/?text=Reward%20Hacking%20in%20Reinforcement%20Learning&url=https%3a%2f%2flilianweng.github.io%2fposts%2f2024-11-28-reward-hacking%2f&hashtags=language-model%2crlhf%2calignment%2csafety%2creinforcement-learning%2clong-read\">\n
|
|
\ <svg version=\"1.1\" viewBox=\"0 0 512 512\" xml:space=\"preserve\">\n
|
|
\ <path\n d=\"M449.446,0c34.525,0 62.554,28.03 62.554,62.554l0,386.892c0,34.524
|
|
-28.03,62.554 -62.554,62.554l-386.892,0c-34.524,0 -62.554,-28.03 -62.554,-62.554l0,-386.892c0,-34.524
|
|
28.029,-62.554 62.554,-62.554l386.892,0Zm-253.927,424.544c135.939,0 210.268,-112.643
|
|
210.268,-210.268c0,-3.218 0,-6.437 -0.153,-9.502c14.406,-10.421 26.973,-23.448
|
|
36.935,-38.314c-13.18,5.824 -27.433,9.809 -42.452,11.648c15.326,-9.196 26.973,-23.602
|
|
32.49,-40.92c-14.252,8.429 -30.038,14.56 -46.896,17.931c-13.487,-14.406 -32.644,-23.295
|
|
-53.946,-23.295c-40.767,0 -73.87,33.104 -73.87,73.87c0,5.824 0.613,11.494
|
|
1.992,16.858c-61.456,-3.065 -115.862,-32.49 -152.337,-77.241c-6.284,10.881
|
|
-9.962,23.601 -9.962,37.088c0,25.594 13.027,48.276 32.95,61.456c-12.107,-0.307
|
|
-23.448,-3.678 -33.41,-9.196l0,0.92c0,35.862 25.441,65.594 59.311,72.49c-6.13,1.686
|
|
-12.72,2.606 -19.464,2.606c-4.751,0 -9.348,-0.46 -13.946,-1.38c9.349,29.426
|
|
36.628,50.728 68.965,51.341c-25.287,19.771 -57.164,31.571 -91.8,31.571c-5.977,0
|
|
-11.801,-0.306 -17.625,-1.073c32.337,21.15 71.264,33.41 112.95,33.41Z\" />\n
|
|
\ </svg>\n </a>\n <a target=\"_blank\" rel=\"noopener noreferrer\"
|
|
aria-label=\"share Reward Hacking in Reinforcement Learning on linkedin\"\n
|
|
\ href=\"https://www.linkedin.com/shareArticle?mini=true&url=https%3a%2f%2flilianweng.github.io%2fposts%2f2024-11-28-reward-hacking%2f&title=Reward%20Hacking%20in%20Reinforcement%20Learning&summary=Reward%20Hacking%20in%20Reinforcement%20Learning&source=https%3a%2f%2flilianweng.github.io%2fposts%2f2024-11-28-reward-hacking%2f\">\n
|
|
\ <svg version=\"1.1\" viewBox=\"0 0 512 512\" xml:space=\"preserve\">\n
|
|
\ <path\n d=\"M449.446,0c34.525,0 62.554,28.03 62.554,62.554l0,386.892c0,34.524
|
|
-28.03,62.554 -62.554,62.554l-386.892,0c-34.524,0 -62.554,-28.03 -62.554,-62.554l0,-386.892c0,-34.524
|
|
28.029,-62.554 62.554,-62.554l386.892,0Zm-288.985,423.278l0,-225.717l-75.04,0l0,225.717l75.04,0Zm270.539,0l0,-129.439c0,-69.333
|
|
-37.018,-101.586 -86.381,-101.586c-39.804,0 -57.634,21.891 -67.617,37.266l0,-31.958l-75.021,0c0.995,21.181
|
|
0,225.717 0,225.717l75.02,0l0,-126.056c0,-6.748 0.486,-13.492 2.474,-18.315c5.414,-13.475
|
|
17.767,-27.434 38.494,-27.434c27.135,0 38.007,20.707 38.007,51.037l0,120.768l75.024,0Zm-307.552,-334.556c-25.674,0
|
|
-42.448,16.879 -42.448,39.002c0,21.658 16.264,39.002 41.455,39.002l0.484,0c26.165,0
|
|
42.452,-17.344 42.452,-39.002c-0.485,-22.092 -16.241,-38.954 -41.943,-39.002Z\"
|
|
/>\n </svg>\n </a>\n <a target=\"_blank\" rel=\"noopener noreferrer\"
|
|
aria-label=\"share Reward Hacking in Reinforcement Learning on reddit\"\n
|
|
\ href=\"https://reddit.com/submit?url=https%3a%2f%2flilianweng.github.io%2fposts%2f2024-11-28-reward-hacking%2f&title=Reward%20Hacking%20in%20Reinforcement%20Learning\">\n
|
|
\ <svg version=\"1.1\" viewBox=\"0 0 512 512\" xml:space=\"preserve\">\n
|
|
\ <path\n d=\"M449.446,0c34.525,0 62.554,28.03 62.554,62.554l0,386.892c0,34.524
|
|
-28.03,62.554 -62.554,62.554l-386.892,0c-34.524,0 -62.554,-28.03 -62.554,-62.554l0,-386.892c0,-34.524
|
|
28.029,-62.554 62.554,-62.554l386.892,0Zm-3.446,265.638c0,-22.964 -18.616,-41.58
|
|
-41.58,-41.58c-11.211,0 -21.361,4.457 -28.841,11.666c-28.424,-20.508 -67.586,-33.757
|
|
-111.204,-35.278l18.941,-89.121l61.884,13.157c0.756,15.734 13.642,28.29 29.56,28.29c16.407,0
|
|
29.706,-13.299 29.706,-29.701c0,-16.403 -13.299,-29.702 -29.706,-29.702c-11.666,0
|
|
-21.657,6.792 -26.515,16.578l-69.105,-14.69c-1.922,-0.418 -3.939,-0.042 -5.585,1.036c-1.658,1.073
|
|
-2.811,2.761 -3.224,4.686l-21.152,99.438c-44.258,1.228 -84.046,14.494 -112.837,35.232c-7.468,-7.164
|
|
-17.589,-11.591 -28.757,-11.591c-22.965,0 -41.585,18.616 -41.585,41.58c0,16.896
|
|
10.095,31.41 24.568,37.918c-0.639,4.135 -0.99,8.328 -0.99,12.576c0,63.977
|
|
74.469,115.836 166.33,115.836c91.861,0 166.334,-51.859 166.334,-115.836c0,-4.218
|
|
-0.347,-8.387 -0.977,-12.493c14.564,-6.47 24.735,-21.034 24.735,-38.001Zm-119.474,108.193c-20.27,20.241
|
|
-59.115,21.816 -70.534,21.816c-11.428,0 -50.277,-1.575 -70.522,-21.82c-3.007,-3.008
|
|
-3.007,-7.882 0,-10.889c3.003,-2.999 7.882,-3.003 10.885,0c12.777,12.781 40.11,17.317
|
|
59.637,17.317c19.522,0 46.86,-4.536 59.657,-17.321c3.016,-2.999 7.886,-2.995
|
|
10.885,0.008c3.008,3.011 3.003,7.882 -0.008,10.889Zm-5.23,-48.781c-16.373,0
|
|
-29.701,-13.324 -29.701,-29.698c0,-16.381 13.328,-29.714 29.701,-29.714c16.378,0
|
|
29.706,13.333 29.706,29.714c0,16.374 -13.328,29.698 -29.706,29.698Zm-160.386,-29.702c0,-16.381
|
|
13.328,-29.71 29.714,-29.71c16.369,0 29.689,13.329 29.689,29.71c0,16.373 -13.32,29.693
|
|
-29.689,29.693c-16.386,0 -29.714,-13.32 -29.714,-29.693Z\" />\n </svg>\n
|
|
\ </a>\n <a target=\"_blank\" rel=\"noopener noreferrer\" aria-label=\"share
|
|
Reward Hacking in Reinforcement Learning on facebook\"\n href=\"https://facebook.com/sharer/sharer.php?u=https%3a%2f%2flilianweng.github.io%2fposts%2f2024-11-28-reward-hacking%2f\">\n
|
|
\ <svg version=\"1.1\" viewBox=\"0 0 512 512\" xml:space=\"preserve\">\n
|
|
\ <path\n d=\"M449.446,0c34.525,0 62.554,28.03 62.554,62.554l0,386.892c0,34.524
|
|
-28.03,62.554 -62.554,62.554l-106.468,0l0,-192.915l66.6,0l12.672,-82.621l-79.272,0l0,-53.617c0,-22.603
|
|
11.073,-44.636 46.58,-44.636l36.042,0l0,-70.34c0,0 -32.71,-5.582 -63.982,-5.582c-65.288,0
|
|
-107.96,39.569 -107.96,111.204l0,62.971l-72.573,0l0,82.621l72.573,0l0,192.915l-191.104,0c-34.524,0
|
|
-62.554,-28.03 -62.554,-62.554l0,-386.892c0,-34.524 28.029,-62.554 62.554,-62.554l386.892,0Z\"
|
|
/>\n </svg>\n </a>\n <a target=\"_blank\" rel=\"noopener noreferrer\"
|
|
aria-label=\"share Reward Hacking in Reinforcement Learning on whatsapp\"\n
|
|
\ href=\"https://api.whatsapp.com/send?text=Reward%20Hacking%20in%20Reinforcement%20Learning%20-%20https%3a%2f%2flilianweng.github.io%2fposts%2f2024-11-28-reward-hacking%2f\">\n
|
|
\ <svg version=\"1.1\" viewBox=\"0 0 512 512\" xml:space=\"preserve\">\n
|
|
\ <path\n d=\"M449.446,0c34.525,0 62.554,28.03 62.554,62.554l0,386.892c0,34.524
|
|
-28.03,62.554 -62.554,62.554l-386.892,0c-34.524,0 -62.554,-28.03 -62.554,-62.554l0,-386.892c0,-34.524
|
|
28.029,-62.554 62.554,-62.554l386.892,0Zm-58.673,127.703c-33.842,-33.881 -78.847,-52.548
|
|
-126.798,-52.568c-98.799,0 -179.21,80.405 -179.249,179.234c-0.013,31.593 8.241,62.428
|
|
23.927,89.612l-25.429,92.884l95.021,-24.925c26.181,14.28 55.659,21.807 85.658,21.816l0.074,0c98.789,0
|
|
179.206,-80.413 179.247,-179.243c0.018,-47.895 -18.61,-92.93 -52.451,-126.81Zm-126.797,275.782l-0.06,0c-26.734,-0.01
|
|
-52.954,-7.193 -75.828,-20.767l-5.441,-3.229l-56.386,14.792l15.05,-54.977l-3.542,-5.637c-14.913,-23.72
|
|
-22.791,-51.136 -22.779,-79.287c0.033,-82.142 66.867,-148.971 149.046,-148.971c39.793,0.014
|
|
77.199,15.531 105.329,43.692c28.128,28.16 43.609,65.592 43.594,105.4c-0.034,82.149
|
|
-66.866,148.983 -148.983,148.984Zm81.721,-111.581c-4.479,-2.242 -26.499,-13.075
|
|
-30.604,-14.571c-4.105,-1.495 -7.091,-2.241 -10.077,2.241c-2.986,4.483 -11.569,14.572
|
|
-14.182,17.562c-2.612,2.988 -5.225,3.364 -9.703,1.12c-4.479,-2.241 -18.91,-6.97
|
|
-36.017,-22.23c-13.314,-11.876 -22.304,-26.542 -24.916,-31.026c-2.612,-4.484
|
|
-0.279,-6.908 1.963,-9.14c2.016,-2.007 4.48,-5.232 6.719,-7.847c2.24,-2.615
|
|
2.986,-4.484 4.479,-7.472c1.493,-2.99 0.747,-5.604 -0.374,-7.846c-1.119,-2.241
|
|
-10.077,-24.288 -13.809,-33.256c-3.635,-8.733 -7.327,-7.55 -10.077,-7.688c-2.609,-0.13
|
|
-5.598,-0.158 -8.583,-0.158c-2.986,0 -7.839,1.121 -11.944,5.604c-4.105,4.484
|
|
-15.675,15.32 -15.675,37.364c0,22.046 16.048,43.342 18.287,46.332c2.24,2.99
|
|
31.582,48.227 76.511,67.627c10.685,4.615 19.028,7.371 25.533,9.434c10.728,3.41
|
|
20.492,2.929 28.209,1.775c8.605,-1.285 26.499,-10.833 30.231,-21.295c3.732,-10.464
|
|
3.732,-19.431 2.612,-21.298c-1.119,-1.869 -4.105,-2.99 -8.583,-5.232Z\" />\n
|
|
\ </svg>\n </a>\n <a target=\"_blank\" rel=\"noopener noreferrer\"
|
|
aria-label=\"share Reward Hacking in Reinforcement Learning on telegram\"\n
|
|
\ href=\"https://telegram.me/share/url?text=Reward%20Hacking%20in%20Reinforcement%20Learning&url=https%3a%2f%2flilianweng.github.io%2fposts%2f2024-11-28-reward-hacking%2f\">\n
|
|
\ <svg version=\"1.1\" xml:space=\"preserve\" viewBox=\"2 2 28 28\">\n
|
|
\ <path\n d=\"M26.49,29.86H5.5a3.37,3.37,0,0,1-2.47-1,3.35,3.35,0,0,1-1-2.47V5.48A3.36,3.36,0,0,1,3,3,3.37,3.37,0,0,1,5.5,2h21A3.38,3.38,0,0,1,29,3a3.36,3.36,0,0,1,1,2.46V26.37a3.35,3.35,0,0,1-1,2.47A3.38,3.38,0,0,1,26.49,29.86Zm-5.38-6.71a.79.79,0,0,0,.85-.66L24.73,9.24a.55.55,0,0,0-.18-.46.62.62,0,0,0-.41-.17q-.08,0-16.53,6.11a.59.59,0,0,0-.41.59.57.57,0,0,0,.43.52l4,1.24,1.61,4.83a.62.62,0,0,0,.63.43.56.56,0,0,0,.4-.17L16.54,20l4.09,3A.9.9,0,0,0,21.11,23.15ZM13.8,20.71l-1.21-4q8.72-5.55,8.78-5.55c.15,0,.23,0,.23.16a.18.18,0,0,1,0,.06s-2.51,2.3-7.52,6.8Z\"
|
|
/>\n </svg>\n </a>\n</div>\n\n </footer>\n</article>\n </main>\n
|
|
\ \n<footer class=\"footer\">\n <span>© 2025 <a href=\"https://lilianweng.github.io/\">Lil'Log</a></span>\n
|
|
\ <span>\n Powered by\n <a href=\"https://gohugo.io/\" rel=\"noopener
|
|
noreferrer\" target=\"_blank\">Hugo</a> &\n <a href=\"https://git.io/hugopapermod\"
|
|
rel=\"noopener\" target=\"_blank\">PaperMod</a>\n </span>\n</footer>\n<a
|
|
href=\"#top\" aria-label=\"go to top\" title=\"Go to Top (Alt + G)\" class=\"top-link\"
|
|
id=\"top-link\" accesskey=\"g\">\n <svg xmlns=\"http://www.w3.org/2000/svg\"
|
|
viewBox=\"0 0 12 6\" fill=\"currentColor\">\n <path d=\"M12 6H0l6-6z\"
|
|
/>\n </svg>\n</a>\n\n<script>\n let menu = document.getElementById('menu')\n
|
|
\ if (menu) {\n menu.scrollLeft = localStorage.getItem(\"menu-scroll-position\");\n
|
|
\ menu.onscroll = function () {\n localStorage.setItem(\"menu-scroll-position\",
|
|
menu.scrollLeft);\n }\n }\n\n document.querySelectorAll('a[href^=\"#\"]').forEach(anchor
|
|
=> {\n anchor.addEventListener(\"click\", function (e) {\n e.preventDefault();\n
|
|
\ var id = this.getAttribute(\"href\").substr(1);\n if
|
|
(!window.matchMedia('(prefers-reduced-motion: reduce)').matches) {\n document.querySelector(`[id='${decodeURIComponent(id)}']`).scrollIntoView({\n
|
|
\ behavior: \"smooth\"\n });\n }
|
|
else {\n document.querySelector(`[id='${decodeURIComponent(id)}']`).scrollIntoView();\n
|
|
\ }\n if (id === \"top\") {\n history.replaceState(null,
|
|
null, \" \");\n } else {\n history.pushState(null,
|
|
null, `#${id}`);\n }\n });\n });\n\n</script>\n<script>\n
|
|
\ var mybutton = document.getElementById(\"top-link\");\n window.onscroll
|
|
= function () {\n if (document.body.scrollTop > 800 || document.documentElement.scrollTop
|
|
> 800) {\n mybutton.style.visibility = \"visible\";\n mybutton.style.opacity
|
|
= \"1\";\n } else {\n mybutton.style.visibility = \"hidden\";\n
|
|
\ mybutton.style.opacity = \"0\";\n }\n };\n\n</script>\n<script>\n
|
|
\ document.getElementById(\"theme-toggle\").addEventListener(\"click\",
|
|
() => {\n if (document.body.className.includes(\"dark\")) {\n document.body.classList.remove('dark');\n
|
|
\ localStorage.setItem(\"pref-theme\", 'light');\n } else
|
|
{\n document.body.classList.add('dark');\n localStorage.setItem(\"pref-theme\",
|
|
'dark');\n }\n })\n\n</script>\n<script>\n document.querySelectorAll('pre
|
|
> code').forEach((codeblock) => {\n const container = codeblock.parentNode.parentNode;\n\n
|
|
\ const copybutton = document.createElement('button');\n copybutton.classList.add('copy-code');\n
|
|
\ copybutton.innerText = 'copy';\n\n function copyingDone() {\n
|
|
\ copybutton.innerText = 'copied!';\n setTimeout(() =>
|
|
{\n copybutton.innerText = 'copy';\n }, 2000);\n
|
|
\ }\n\n copybutton.addEventListener('click', (cb) => {\n if
|
|
('clipboard' in navigator) {\n navigator.clipboard.writeText(codeblock.textContent);\n
|
|
\ copyingDone();\n return;\n }\n\n
|
|
\ const range = document.createRange();\n range.selectNodeContents(codeblock);\n
|
|
\ const selection = window.getSelection();\n selection.removeAllRanges();\n
|
|
\ selection.addRange(range);\n try {\n document.execCommand('copy');\n
|
|
\ copyingDone();\n } catch (e) { };\n selection.removeRange(range);\n
|
|
\ });\n\n if (container.classList.contains(\"highlight\")) {\n
|
|
\ container.appendChild(copybutton);\n } else if (container.parentNode.firstChild
|
|
== container) {\n \n } else if (codeblock.parentNode.parentNode.parentNode.parentNode.parentNode.nodeName
|
|
== \"TABLE\") {\n \n codeblock.parentNode.parentNode.parentNode.parentNode.parentNode.appendChild(copybutton);\n
|
|
\ } else {\n \n codeblock.parentNode.appendChild(copybutton);\n
|
|
\ }\n });\n</script>\n</body>\n\n</html>\n"
|
|
headers:
|
|
Accept-Ranges:
|
|
- bytes
|
|
Access-Control-Allow-Origin:
|
|
- '*'
|
|
Age:
|
|
- '0'
|
|
Cache-Control:
|
|
- max-age=600
|
|
Connection:
|
|
- keep-alive
|
|
Content-Length:
|
|
- '47949'
|
|
Content-Type:
|
|
- text/html; charset=utf-8
|
|
Date:
|
|
- Tue, 29 Apr 2025 21:28:18 GMT
|
|
ETag:
|
|
- W/"67d44639-2478e"
|
|
Last-Modified:
|
|
- Fri, 14 Mar 2025 15:07:37 GMT
|
|
Server:
|
|
- GitHub.com
|
|
Vary:
|
|
- Accept-Encoding
|
|
Via:
|
|
- 1.1 varnish
|
|
X-Cache:
|
|
- HIT
|
|
X-Cache-Hits:
|
|
- '0'
|
|
X-Fastly-Request-ID:
|
|
- 2c24a9fc77040138e0e5b93f645459d0bd342d29
|
|
X-GitHub-Request-Id:
|
|
- A63F:2DF33F:24FA2A:286BFD:68113364
|
|
X-Served-By:
|
|
- cache-gru-sbsp2090027-GRU
|
|
X-Timer:
|
|
- S1745962099.562377,VS0,VE125
|
|
expires:
|
|
- Tue, 29 Apr 2025 20:25:33 GMT
|
|
permissions-policy:
|
|
- interest-cohort=()
|
|
x-proxy-cache:
|
|
- MISS
|
|
status:
|
|
code: 200
|
|
message: OK
|
|
version: 1
|