fix: prevent response_model from leaking into tool-calling loop (#5472)

When output_pydantic / response_model is set on a task, the response_model
was forwarded to the LLM on every iteration of the tool-calling loop. This
caused many non-OpenAI LLMs to skip tool calls entirely, because the tools
and response_format parameters conflict on those providers.

Fix: Don't pass response_model to the LLM during tool-calling loop
iterations when the agent has tools. Structured output conversion is
already handled as post-processing via Task._export_output().

Changes:
- crew_agent_executor.py: _invoke_loop_react, _invoke_loop_native_tools,
  _ainvoke_loop_react, _ainvoke_loop_native_tools now pass
  response_model=None when tools are present
- experimental/agent_executor.py: call_llm_and_parse, call_llm_native_tools
  similarly updated
- Added regression tests in TestResponseModelNotLeakedDuringToolLoop

Co-Authored-By: João <joao@crewai.com>
Devin AI, 2026-04-15 17:21:08 +00:00
commit 892fc4ad3b (parent 1c90d574ab)
3 changed files with 219 additions and 9 deletions
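
For orientation, the user-facing setup the issue describes looks roughly like the sketch below: an agent that carries tools plus a task with output_pydantic. It assumes crewAI's public Agent/Task/Crew API and an LLM configured via environment variables; the tool, model, and field names are illustrative, not taken from the issue.

    from pydantic import BaseModel
    from crewai import Agent, Crew, Task
    from crewai.tools import tool

    class CityFacts(BaseModel):
        city: str
        population: int

    @tool("search")
    def search(query: str) -> str:
        """Toy search tool standing in for any real tool."""
        return f"Results for: {query}"

    researcher = Agent(
        role="Researcher",
        goal="Look up facts using the search tool",
        backstory="A careful researcher.",
        tools=[search],
    )

    task = Task(
        description="Find the population of Oslo.",
        expected_output="Structured city facts.",
        output_pydantic=CityFacts,  # this is what sets response_model on the task
        agent=researcher,
    )

    result = Crew(agents=[researcher], tasks=[task]).kickoff()
    print(result.pydantic)  # structured output, produced after the tool loop

Before this fix, the response_model derived from output_pydantic was also sent on every tool-loop LLM call, which is what made some providers stop calling the tool.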

View File: crew_agent_executor.py

@@ -334,6 +334,15 @@ class CrewAgentExecutor(BaseAgentExecutor):
enforce_rpm_limit(self.request_within_rpm_limit)
# Don't pass response_model when tools are present to avoid
# conflicting with tool-calling — tools and response_format
# can't coexist on many LLM providers (see issue #5472).
# Structured output is applied as post-processing via
# Task._export_output() after the tool loop completes.
effective_response_model = (
None if self.original_tools else self.response_model
)
answer = get_llm_response(
llm=cast("BaseLLM", self.llm),
messages=self.messages,
@@ -341,11 +350,11 @@ class CrewAgentExecutor(BaseAgentExecutor):
printer=PRINTER,
from_task=self.task,
from_agent=self.agent,
response_model=self.response_model,
response_model=effective_response_model,
executor_context=self,
verbose=self.agent.verbose,
)
if self.response_model is not None:
if effective_response_model is not None:
try:
if isinstance(answer, BaseModel):
output_json = answer.model_dump_json()
@@ -477,6 +486,10 @@ class CrewAgentExecutor(BaseAgentExecutor):
enforce_rpm_limit(self.request_within_rpm_limit)
# Never pass response_model in the native tool-calling
# loop — tools and response_format can't coexist on many
# LLM providers (see issue #5472). Structured output is
# applied as post-processing via Task._export_output().
answer = get_llm_response(
llm=cast("BaseLLM", self.llm),
messages=self.messages,
@@ -486,7 +499,7 @@ class CrewAgentExecutor(BaseAgentExecutor):
available_functions=None,
from_task=self.task,
from_agent=self.agent,
response_model=self.response_model,
response_model=None,
executor_context=self,
verbose=self.agent.verbose,
)
@@ -1142,6 +1155,12 @@ class CrewAgentExecutor(BaseAgentExecutor):
enforce_rpm_limit(self.request_within_rpm_limit)
# Don't pass response_model when tools are present to avoid
# conflicting with tool-calling (see issue #5472).
effective_response_model = (
None if self.original_tools else self.response_model
)
answer = await aget_llm_response(
llm=cast("BaseLLM", self.llm),
messages=self.messages,
@@ -1149,12 +1168,12 @@ class CrewAgentExecutor(BaseAgentExecutor):
printer=PRINTER,
from_task=self.task,
from_agent=self.agent,
response_model=self.response_model,
response_model=effective_response_model,
executor_context=self,
verbose=self.agent.verbose,
)
if self.response_model is not None:
if effective_response_model is not None:
try:
if isinstance(answer, BaseModel):
output_json = answer.model_dump_json()
@@ -1286,6 +1305,8 @@ class CrewAgentExecutor(BaseAgentExecutor):
enforce_rpm_limit(self.request_within_rpm_limit)
# Never pass response_model in the native tool-calling
# loop (see issue #5472).
answer = await aget_llm_response(
llm=cast("BaseLLM", self.llm),
messages=self.messages,
@@ -1295,7 +1316,7 @@ class CrewAgentExecutor(BaseAgentExecutor):
available_functions=None,
from_task=self.task,
from_agent=self.agent,
response_model=self.response_model,
response_model=None,
executor_context=self,
verbose=self.agent.verbose,
)
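
Distilled from the four hunks above, the rule is: suppress response_model whenever the agent has tools in the ReAct loops, and never send it at all in the native tool-calling loops. A hypothetical stand-alone helper expressing the ReAct-side gate (not code from this commit) would be:

    from pydantic import BaseModel

    def gate_response_model(
        tools: list | None,
        response_model: type[BaseModel] | None,
    ) -> type[BaseModel] | None:
        # Mirrors `None if self.original_tools else self.response_model`:
        # any tools at all means the structured-output schema stays out of the call.
        return None if tools else response_model

    assert gate_response_model([object()], BaseModel) is None
    assert gate_response_model([], BaseModel) is BaseModel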

View File: experimental/agent_executor.py

@@ -1222,6 +1222,12 @@ class AgentExecutor(Flow[AgentExecutorState], BaseAgentExecutor): # type: ignor
try:
enforce_rpm_limit(self.request_within_rpm_limit)
# Don't pass response_model when tools are present to avoid
# conflicting with tool-calling (see issue #5472).
effective_response_model = (
None if self.original_tools else self.response_model
)
answer = get_llm_response(
llm=self.llm,
messages=list(self.state.messages),
@@ -1229,7 +1235,7 @@ class AgentExecutor(Flow[AgentExecutorState], BaseAgentExecutor): # type: ignor
printer=PRINTER,
from_task=self.task,
from_agent=self.agent,
response_model=self.response_model,
response_model=effective_response_model,
executor_context=self,
verbose=self.agent.verbose,
)
@@ -1307,7 +1313,8 @@ class AgentExecutor(Flow[AgentExecutorState], BaseAgentExecutor): # type: ignor
enforce_rpm_limit(self.request_within_rpm_limit)
# Call LLM with native tools
# Call LLM with native tools — never pass response_model
# in the native tool-calling loop (see issue #5472).
answer = get_llm_response(
llm=self.llm,
messages=list(self.state.messages),
@@ -1317,7 +1324,7 @@ class AgentExecutor(Flow[AgentExecutorState], BaseAgentExecutor): # type: ignor
available_functions=None,
from_task=self.task,
from_agent=self.agent,
response_model=self.response_model,
response_model=None,
executor_context=self,
verbose=self.agent.verbose,
)
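
For context on why the gate exists at all: when a single request carries both tools and response_format, several non-OpenAI backends honor the schema and return plain JSON without ever emitting a tool call. The dictionary below is a generic illustration of that conflicting request shape, not any specific provider's SDK or crewAI internals:

    conflicting_request = {
        "model": "example/non-openai-model",  # hypothetical model id
        "messages": [{"role": "user", "content": "Find the population of Oslo."}],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "search",
                    "description": "Search the web",
                    "parameters": {
                        "type": "object",
                        "properties": {"query": {"type": "string"}},
                        "required": ["query"],
                    },
                },
            }
        ],
        # With this block also present, some backends skip tool_calls entirely
        # and answer with schema-shaped JSON instead.
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "DummyOutput",
                "schema": {"type": "object", "properties": {"result": {"type": "string"}}},
            },
        },
    }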

View File

@@ -2044,3 +2044,185 @@ class TestVisionImageFormatContract:
assert hasattr(AnthropicCompletion, "_convert_image_blocks"), (
"Anthropic provider must have _convert_image_blocks for auto-conversion"
)
# =========================================================================
# response_model must not leak into the tool-calling loop (issue #5472)
# =========================================================================
class TestResponseModelNotLeakedDuringToolLoop:
"""Regression tests for issue #5472.
When output_pydantic / response_model is set on a task, the response_model
must NOT be forwarded to the LLM during tool-calling loop iterations.
Passing both `tools` and `response_format` causes many non-OpenAI LLMs to
skip tool calls entirely. Structured output conversion should happen only
as post-processing via Task._export_output().
"""
# -- helpers ----------------------------------------------------------
@pytest.fixture
def mock_deps_with_tools(self):
"""Dependencies that include original_tools (simulating an agent with tools)."""
from pydantic import BaseModel as _BaseModel
class DummyOutput(_BaseModel):
result: str
llm = Mock()
llm.stop = []
llm.supports_stop_words.return_value = True
task = Mock()
task.description = "Test task"
task.human_input = False
task.response_model = DummyOutput
crew = Mock()
crew.verbose = False
crew._train = False
agent = Mock()
agent.id = "test-agent"
agent.role = "Tester"
agent.verbose = False
agent.key = "test-key"
dummy_tool = Mock()
dummy_tool.name = "search"
dummy_tool.description = "Search tool"
return {
"llm": llm,
"task": task,
"crew": crew,
"agent": agent,
"prompt": {"prompt": "Test {input} {tool_names} {tools}"},
"max_iter": 10,
"tools": [dummy_tool],
"tools_names": "search",
"stop_words": ["Observation"],
"tools_description": "search: Search tool",
"tools_handler": Mock(spec=_ToolsHandler),
"original_tools": [dummy_tool],
"response_model": DummyOutput,
}
@pytest.fixture
def mock_deps_no_tools(self):
"""Dependencies WITHOUT tools — response_model should still be passed."""
from pydantic import BaseModel as _BaseModel
class DummyOutput(_BaseModel):
result: str
llm = Mock()
llm.stop = []
llm.supports_stop_words.return_value = True
task = Mock()
task.description = "Test task"
task.human_input = False
task.response_model = DummyOutput
crew = Mock()
crew.verbose = False
crew._train = False
agent = Mock()
agent.id = "test-agent"
agent.role = "Tester"
agent.verbose = False
agent.key = "test-key"
return {
"llm": llm,
"task": task,
"crew": crew,
"agent": agent,
"prompt": {"prompt": "Test {input} {tool_names} {tools}"},
"max_iter": 10,
"tools": [],
"tools_names": "",
"stop_words": [],
"tools_description": "",
"tools_handler": Mock(spec=_ToolsHandler),
"original_tools": [],
"response_model": DummyOutput,
}
# -- experimental executor: call_llm_and_parse (ReAct path) -----------
@patch("crewai.experimental.agent_executor.get_llm_response")
@patch("crewai.experimental.agent_executor.enforce_rpm_limit")
def test_call_llm_and_parse_omits_response_model_when_tools_present(
self, mock_enforce_rpm, mock_get_llm, mock_deps_with_tools,
):
"""call_llm_and_parse must pass response_model=None when tools exist."""
mock_enforce_rpm.return_value = None
mock_get_llm.return_value = (
"Thought: I need to search\n"
"Action: search\n"
"Action Input: {\"query\": \"test\"}\n"
)
executor = _build_executor(**mock_deps_with_tools)
executor.call_llm_and_parse()
# Verify get_llm_response was called with response_model=None
call_kwargs = mock_get_llm.call_args
assert call_kwargs is not None
assert call_kwargs.kwargs.get("response_model") is None or (
len(call_kwargs.args) > 7 and call_kwargs.args[7] is None
), "response_model must be None when tools are present"
@patch("crewai.experimental.agent_executor.get_llm_response")
@patch("crewai.experimental.agent_executor.enforce_rpm_limit")
def test_call_llm_and_parse_keeps_response_model_when_no_tools(
self, mock_enforce_rpm, mock_get_llm, mock_deps_no_tools,
):
"""call_llm_and_parse should still pass response_model when no tools."""
from pydantic import BaseModel as _BaseModel
mock_enforce_rpm.return_value = None
# Return a BaseModel-like response when response_model is passed
mock_get_llm.return_value = '{"result": "done"}'
executor = _build_executor(**mock_deps_no_tools)
executor.call_llm_and_parse()
call_kwargs = mock_get_llm.call_args
assert call_kwargs is not None
rm = call_kwargs.kwargs.get("response_model")
assert rm is not None, (
"response_model should be passed when no tools are present"
)
# -- experimental executor: call_llm_native_tools ---------------------
@patch("crewai.experimental.agent_executor.get_llm_response")
@patch("crewai.experimental.agent_executor.enforce_rpm_limit")
def test_call_llm_native_tools_omits_response_model(
self,
mock_enforce_rpm,
mock_get_llm,
mock_deps_with_tools,
):
"""call_llm_native_tools must always pass response_model=None."""
mock_enforce_rpm.return_value = None
mock_get_llm.return_value = "Final answer text"
executor = _build_executor(**mock_deps_with_tools)
# Pre-set the openai_tools attribute (normally done by _setup_native_tools)
executor._openai_tools = []
executor._available_functions = {}
executor._tool_name_mapping = {}
executor.call_llm_native_tools()
call_kwargs = mock_get_llm.call_args
assert call_kwargs is not None
assert call_kwargs.kwargs.get("response_model") is None, (
"response_model must be None in native tool-calling loop"
)
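
Since the test file's path is not shown in this view, the new regression tests can be selected by class name, e.g. pytest -k TestResponseModelNotLeakedDuringToolLoop.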