Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-18 12:28:15 +00:00

Compare commits: devin/1768...devin/1767 (1 commit)

| Author | SHA1 | Date |
|---|---|---|
| | 83b07b9d23 | |
```diff
@@ -278,7 +278,20 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                 )

                 self._invoke_step_callback(formatted_answer)  # type: ignore[arg-type]
-                self._append_message(formatted_answer.text)  # type: ignore[union-attr,attr-defined]
+
+                # Properly attribute messages to avoid LLM hallucination of observations:
+                # - LLM's response goes as assistant message
+                # - Tool observation goes as user message (not assistant)
+                if isinstance(formatted_answer, AgentAction) and formatted_answer.llm_response:
+                    # For tool use: append LLM response as assistant, observation as user
+                    self._append_message(formatted_answer.llm_response)
+                    if formatted_answer.result:
+                        self._append_message(
+                            f"Observation: {formatted_answer.result}", role="user"
+                        )
+                else:
+                    # For final answer or other cases: append text as assistant
+                    self._append_message(formatted_answer.text)  # type: ignore[union-attr,attr-defined]

             except OutputParserError as e:
                 formatted_answer = handle_output_parser_exception(  # type: ignore[assignment]
```
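To make the intended message shape concrete, here is a minimal sketch (not crewAI source; the task and tool contents are hypothetical) of what the conversation history looks like after one tool call under this attribution scheme:

```python
# Sketch of the expected message history after one tool call.
# All string contents below are hypothetical illustrations.
messages = [
    {"role": "user", "content": "Task: find the population of Paris"},
    # The raw LLM output (formatted_answer.llm_response) is appended as an
    # assistant message, *without* the observation...
    {
        "role": "assistant",
        "content": 'Thought: I need to search\nAction: search\nAction Input: {"q": "Paris population"}',
    },
    # ...and the tool observation follows as a user message, so the model never
    # sees observations as text it previously generated itself.
    {"role": "user", "content": "Observation: 2.1 million (2023 estimate)"},
]
```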
The same change is applied at the second append site in the executor:

```diff
@@ -431,7 +444,20 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                 )

                 self._invoke_step_callback(formatted_answer)  # type: ignore[arg-type]
-                self._append_message(formatted_answer.text)  # type: ignore[union-attr,attr-defined]
+
+                # Properly attribute messages to avoid LLM hallucination of observations:
+                # - LLM's response goes as assistant message
+                # - Tool observation goes as user message (not assistant)
+                if isinstance(formatted_answer, AgentAction) and formatted_answer.llm_response:
+                    # For tool use: append LLM response as assistant, observation as user
+                    self._append_message(formatted_answer.llm_response)
+                    if formatted_answer.result:
+                        self._append_message(
+                            f"Observation: {formatted_answer.result}", role="user"
+                        )
+                else:
+                    # For final answer or other cases: append text as assistant
+                    self._append_message(formatted_answer.text)  # type: ignore[union-attr,attr-defined]

             except OutputParserError as e:
                 formatted_answer = handle_output_parser_exception(  # type: ignore[assignment]
```
```diff
@@ -481,13 +507,18 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
             Updated action or final answer.
         """
         # Special case for add_image_tool
+        # Note: Even for add_image_tool, we should not attribute tool output to assistant
+        # to avoid LLM hallucination. The LLM's action is stored as assistant message,
+        # and the tool result (image) is stored as user message.
         add_image_tool = self._i18n.tools("add_image")
         if (
             isinstance(add_image_tool, dict)
             and formatted_answer.tool.casefold().strip()
             == add_image_tool.get("name", "").casefold().strip()
         ):
             self.messages.append({"role": "assistant", "content": tool_result.result})
+            # Store original LLM response for proper message attribution
+            formatted_answer.llm_response = formatted_answer.text
             formatted_answer.result = tool_result.result
             return formatted_answer

         return handle_agent_action_core(
```
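As an aside, the `casefold().strip()` pair above makes the tool-name match case- and whitespace-insensitive; a standalone sketch of that comparison (values are hypothetical):

```python
# Standalone sketch of the tool-name matching used above (hypothetical values).
add_image_tool = {"name": "Add Image"}

def is_add_image(tool_name: str) -> bool:
    """Case- and surrounding-whitespace-insensitive match against the configured name."""
    return (
        tool_name.casefold().strip()
        == add_image_tool.get("name", "").casefold().strip()
    )

assert is_add_image("  add image ")
assert not is_add_image("search")
```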
The `AgentAction` dataclass gains a field to hold the pristine LLM output:

```diff
@@ -33,6 +33,7 @@ class AgentAction:
     tool_input: str
     text: str
     result: str | None = None
+    llm_response: str | None = None  # Original LLM response before observation appended


 @dataclass
```
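A short sketch of how the new field is meant to be used downstream (constructor arguments mirror the fields above; the values are hypothetical):

```python
from crewai.agents.parser import AgentAction

action = AgentAction(
    thought="I need to search",
    tool="search",
    tool_input='{"q": "weather"}',
    text='Thought: I need to search\nAction: search\nAction Input: {"q": "weather"}',
)

# Before any tool runs, llm_response defaults to None.
assert action.llm_response is None

# handle_agent_action_core later copies the pristine LLM output here, while
# action.text additionally gains "\nObservation: ..." for logging/traces.
```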
The flow-based executor gets the same attribution logic:

```diff
@@ -349,8 +349,18 @@ class CrewAgentExecutorFlow(Flow[AgentReActState], CrewAgentExecutorMixin):
             # Invoke step callback if configured
             self._invoke_step_callback(result)

             # Append result message to conversation state
-            if hasattr(result, "text"):
+            # Properly attribute messages to avoid LLM hallucination of observations:
+            # - LLM's response goes as assistant message
+            # - Tool observation goes as user message (not assistant)
+            if isinstance(result, AgentAction) and result.llm_response:
+                # For tool use: append LLM response as assistant, observation as user
+                self._append_message_to_state(result.llm_response)
+                if result.result:
+                    self._append_message_to_state(
+                        f"Observation: {result.result}", role="user"
+                    )
+            elif hasattr(result, "text"):
+                # For final answer or other cases: append text as assistant
                 self._append_message_to_state(result.text)

             # Check if tool result became a final answer (result_as_answer flag)
```
```diff
@@ -537,15 +547,19 @@ class CrewAgentExecutorFlow(Flow[AgentReActState], CrewAgentExecutorMixin):
         Returns:
             Updated action or final answer.
         """
         # Special case for add_image_tool
+        # Note: Even for add_image_tool, we should not attribute tool output to assistant
+        # to avoid LLM hallucination. The LLM's action is stored as assistant message,
+        # and the tool result (image) is stored as user message.
         add_image_tool = self._i18n.tools("add_image")
         if (
             isinstance(add_image_tool, dict)
             and formatted_answer.tool.casefold().strip()
             == add_image_tool.get("name", "").casefold().strip()
         ):
             self.state.messages.append(
                 {"role": "assistant", "content": tool_result.result}
             )
+            # Store original LLM response for proper message attribution
+            formatted_answer.llm_response = formatted_answer.text
             formatted_answer.result = tool_result.result
             return formatted_answer

         return handle_agent_action_core(
```
`LiteAgent` receives the equivalent fix, with explicit roles on both appends:

```diff
@@ -582,7 +582,19 @@ class LiteAgent(FlowTrackable, BaseModel):
                     show_logs=self._show_logs,
                 )

-                self._append_message(formatted_answer.text, role="assistant")
+                # Properly attribute messages to avoid LLM hallucination of observations:
+                # - LLM's response goes as assistant message
+                # - Tool observation goes as user message (not assistant)
+                if isinstance(formatted_answer, AgentAction) and formatted_answer.llm_response:
+                    # For tool use: append LLM response as assistant, observation as user
+                    self._append_message(formatted_answer.llm_response, role="assistant")
+                    if formatted_answer.result:
+                        self._append_message(
+                            f"Observation: {formatted_answer.result}", role="user"
+                        )
+                else:
+                    # For final answer or other cases: append text as assistant
+                    self._append_message(formatted_answer.text, role="assistant")
             except OutputParserError as e:  # noqa: PERF203
                 self._printer.print(
                     content="Failed to parse LLM output. Retrying...",
```
Finally, the shared helper stores the raw response before mutating the text:

```diff
@@ -382,10 +382,21 @@ def handle_agent_action_core(

     Notes:
         - TODO: Remove messages parameter and its usage.
+        - The observation is appended to formatted_answer.text for logging/trace
+          purposes, but callers should use formatted_answer.llm_response for the
+          assistant message (without observation) and append the observation
+          separately as a user message to avoid LLM hallucination of observations.
     """
     if step_callback:
         step_callback(tool_result)

+    # Store the original LLM response before appending observation
+    # This is used by executors to correctly attribute messages:
+    # - llm_response goes as assistant message
+    # - observation goes as user message (to prevent LLM hallucination)
+    formatted_answer.llm_response = formatted_answer.text
+
+    # Append observation to text for logging/trace purposes
     formatted_answer.text += f"\nObservation: {tool_result.result}"
     formatted_answer.result = tool_result.result
```
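In caller terms, the contract documented above boils down to: use `llm_response` for the assistant message and emit the observation separately as a user message. A minimal sketch of a conforming caller (the helper name is hypothetical, not crewAI source):

```python
# Hypothetical caller-side helper illustrating the documented contract.
def append_action_messages(messages: list[dict[str, str]], formatted_answer) -> None:
    """Append the LLM response as assistant and the observation as user."""
    # Assistant message: the raw LLM output, without the observation appended.
    messages.append({"role": "assistant", "content": formatted_answer.llm_response})
    # User message: the observation, re-derived from formatted_answer.result.
    if formatted_answer.result:
        messages.append(
            {"role": "user", "content": f"Observation: {formatted_answer.result}"}
        )
```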
lib/crewai/tests/agents/test_observation_message_attribution.py (new file, 320 lines)

@@ -0,0 +1,320 @@
```python
"""Tests for proper message attribution to prevent LLM observation hallucination.

This module tests that tool observations are correctly attributed to user messages
rather than assistant messages, which prevents the LLM from learning to hallucinate
fake observations during tool calls.

Related to GitHub issue #4181.
"""

from typing import Any
from unittest.mock import MagicMock, patch

import pytest

from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.agents.parser import AgentAction, AgentFinish
from crewai.tools.tool_types import ToolResult
from crewai.utilities.agent_utils import handle_agent_action_core


@pytest.fixture
def mock_llm() -> MagicMock:
    """Create a mock LLM for testing."""
    llm = MagicMock()
    llm.supports_stop_words.return_value = True
    llm.stop = []
    return llm


@pytest.fixture
def mock_agent() -> MagicMock:
    """Create a mock agent for testing."""
    agent = MagicMock()
    agent.role = "Test Agent"
    agent.key = "test_agent_key"
    agent.verbose = False
    agent.id = "test_agent_id"
    return agent


@pytest.fixture
def mock_task() -> MagicMock:
    """Create a mock task for testing."""
    task = MagicMock()
    task.description = "Test task description"
    return task


@pytest.fixture
def mock_crew() -> MagicMock:
    """Create a mock crew for testing."""
    crew = MagicMock()
    crew.verbose = False
    crew._train = False
    return crew


@pytest.fixture
def mock_tools_handler() -> MagicMock:
    """Create a mock tools handler."""
    return MagicMock()


@pytest.fixture
def executor(
    mock_llm: MagicMock,
    mock_agent: MagicMock,
    mock_task: MagicMock,
    mock_crew: MagicMock,
    mock_tools_handler: MagicMock,
) -> CrewAgentExecutor:
    """Create a CrewAgentExecutor instance for testing."""
    return CrewAgentExecutor(
        llm=mock_llm,
        task=mock_task,
        crew=mock_crew,
        agent=mock_agent,
        prompt={"prompt": "Test prompt {input} {tool_names} {tools}"},
        max_iter=5,
        tools=[],
        tools_names="",
        stop_words=["Observation:"],
        tools_description="",
        tools_handler=mock_tools_handler,
    )


class TestHandleAgentActionCore:
    """Tests for handle_agent_action_core function."""

    def test_stores_llm_response_before_observation(self) -> None:
        """Test that llm_response is stored before observation is appended."""
        original_text = "Thought: I need to search\nAction: search\nAction Input: query"
        action = AgentAction(
            thought="I need to search",
            tool="search",
            tool_input="query",
            text=original_text,
        )
        tool_result = ToolResult(result="Search result: found data", result_as_answer=False)

        result = handle_agent_action_core(
            formatted_answer=action,
            tool_result=tool_result,
        )

        assert isinstance(result, AgentAction)
        assert result.llm_response == original_text
        assert "Observation:" in result.text
        assert result.result == "Search result: found data"

    def test_text_contains_observation_for_logging(self) -> None:
        """Test that text contains observation for logging purposes."""
        action = AgentAction(
            thought="Testing",
            tool="test_tool",
            tool_input="{}",
            text="Thought: Testing\nAction: test_tool\nAction Input: {}",
        )
        tool_result = ToolResult(result="Tool output", result_as_answer=False)

        result = handle_agent_action_core(
            formatted_answer=action,
            tool_result=tool_result,
        )

        assert isinstance(result, AgentAction)
        assert "Observation: Tool output" in result.text

    def test_result_as_answer_returns_agent_finish(self) -> None:
        """Test that result_as_answer=True returns AgentFinish."""
        action = AgentAction(
            thought="Using tool",
            tool="final_tool",
            tool_input="{}",
            text="Thought: Using tool\nAction: final_tool\nAction Input: {}",
        )
        tool_result = ToolResult(result="Final answer from tool", result_as_answer=True)

        result = handle_agent_action_core(
            formatted_answer=action,
            tool_result=tool_result,
        )

        assert isinstance(result, AgentFinish)
        assert result.output == "Final answer from tool"


class TestCrewAgentExecutorMessageAttribution:
    """Tests for proper message attribution in CrewAgentExecutor."""

    def test_tool_observation_not_in_assistant_message(
        self, executor: CrewAgentExecutor
    ) -> None:
        """Test that tool observations are not attributed to assistant messages.

        This is the core fix for GitHub issue #4181 - observations should be
        in user messages, not assistant messages, to prevent LLM hallucination.
        """
        call_count = 0

        def mock_llm_response(*args: Any, **kwargs: Any) -> str:
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                return (
                    "Thought: I need to use a tool\n"
                    "Action: test_tool\n"
                    'Action Input: {"arg": "value"}'
                )
            return "Thought: I have the answer\nFinal Answer: Done"

        with patch(
            "crewai.agents.crew_agent_executor.get_llm_response",
            side_effect=mock_llm_response,
        ):
            with patch(
                "crewai.agents.crew_agent_executor.execute_tool_and_check_finality",
                return_value=ToolResult(
                    result="Tool executed successfully", result_as_answer=False
                ),
            ):
                with patch.object(executor, "_show_logs"):
                    result = executor._invoke_loop()

        assert isinstance(result, AgentFinish)

        assistant_messages = [
            msg for msg in executor.messages if msg.get("role") == "assistant"
        ]
        user_messages = [msg for msg in executor.messages if msg.get("role") == "user"]

        for msg in assistant_messages:
            content = msg.get("content", "")
            assert "Observation:" not in content, (
                f"Assistant message should not contain 'Observation:'. "
                f"Found: {content[:100]}..."
            )

        observation_in_user = any(
            "Observation:" in msg.get("content", "") for msg in user_messages
        )
        assert observation_in_user, (
            "Tool observation should be in a user message, not assistant message"
        )

    def test_llm_response_in_assistant_message(
        self, executor: CrewAgentExecutor
    ) -> None:
        """Test that the LLM's actual response is in assistant messages."""
        call_count = 0
        llm_action_text = (
            "Thought: I need to use a tool\n"
            "Action: test_tool\n"
            'Action Input: {"arg": "value"}'
        )

        def mock_llm_response(*args: Any, **kwargs: Any) -> str:
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                return llm_action_text
            return "Thought: I have the answer\nFinal Answer: Done"

        with patch(
            "crewai.agents.crew_agent_executor.get_llm_response",
            side_effect=mock_llm_response,
        ):
            with patch(
                "crewai.agents.crew_agent_executor.execute_tool_and_check_finality",
                return_value=ToolResult(
                    result="Tool executed successfully", result_as_answer=False
                ),
            ):
                with patch.object(executor, "_show_logs"):
                    executor._invoke_loop()

        assistant_messages = [
            msg for msg in executor.messages if msg.get("role") == "assistant"
        ]

        llm_response_found = any(
            "Action: test_tool" in msg.get("content", "") for msg in assistant_messages
        )
        assert llm_response_found, (
            "LLM's action response should be in an assistant message"
        )

    def test_message_order_after_tool_use(
        self, executor: CrewAgentExecutor
    ) -> None:
        """Test that messages are in correct order: assistant (action) then user (observation)."""
        call_count = 0

        def mock_llm_response(*args: Any, **kwargs: Any) -> str:
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                return (
                    "Thought: I need to use a tool\n"
                    "Action: test_tool\n"
                    'Action Input: {"arg": "value"}'
                )
            return "Thought: I have the answer\nFinal Answer: Done"

        with patch(
            "crewai.agents.crew_agent_executor.get_llm_response",
            side_effect=mock_llm_response,
        ):
            with patch(
                "crewai.agents.crew_agent_executor.execute_tool_and_check_finality",
                return_value=ToolResult(
                    result="Tool executed successfully", result_as_answer=False
                ),
            ):
                with patch.object(executor, "_show_logs"):
                    executor._invoke_loop()

        action_msg_idx = None
        observation_msg_idx = None

        for i, msg in enumerate(executor.messages):
            content = msg.get("content", "")
            if "Action: test_tool" in content and msg.get("role") == "assistant":
                action_msg_idx = i
            if "Observation:" in content and msg.get("role") == "user":
                observation_msg_idx = i

        assert action_msg_idx is not None, "Action message not found"
        assert observation_msg_idx is not None, "Observation message not found"
        assert observation_msg_idx == action_msg_idx + 1, (
            f"Observation (user) should immediately follow action (assistant). "
            f"Action at {action_msg_idx}, Observation at {observation_msg_idx}"
        )


class TestAgentActionLlmResponseField:
    """Tests for the llm_response field on AgentAction."""

    def test_agent_action_has_llm_response_field(self) -> None:
        """Test that AgentAction has llm_response field."""
        action = AgentAction(
            thought="Test",
            tool="test",
            tool_input="{}",
            text="Test text",
        )
        assert hasattr(action, "llm_response")
        assert action.llm_response is None

    def test_agent_action_llm_response_can_be_set(self) -> None:
        """Test that llm_response can be set on AgentAction."""
        action = AgentAction(
            thought="Test",
            tool="test",
            tool_input="{}",
            text="Test text",
        )
        action.llm_response = "Original LLM response"
        assert action.llm_response == "Original LLM response"
```
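Assuming a checkout with dev dependencies installed, the new suite can be run directly, e.g. `pytest lib/crewai/tests/agents/test_observation_message_attribution.py`.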