diff --git a/lib/crewai/src/crewai/experimental/agent_executor.py b/lib/crewai/src/crewai/experimental/agent_executor.py index 676780138..c7a727c30 100644 --- a/lib/crewai/src/crewai/experimental/agent_executor.py +++ b/lib/crewai/src/crewai/experimental/agent_executor.py @@ -819,15 +819,6 @@ class AgentExecutor(Flow[AgentReActState], CrewAgentExecutorMixin): self.state.is_finished = True return "tool_result_is_final" - # Add reflection prompt once after all tools in the batch - reasoning_prompt = self._i18n.slice("post_tool_reasoning") - - reasoning_message: LLMMessage = { - "role": "user", - "content": reasoning_prompt, - } - self.state.messages.append(reasoning_message) - return "native_tool_completed" def _extract_tool_name(self, tool_call: Any) -> str: diff --git a/lib/crewai/src/crewai/translations/en.json b/lib/crewai/src/crewai/translations/en.json index 5e27998ba..adce76236 100644 --- a/lib/crewai/src/crewai/translations/en.json +++ b/lib/crewai/src/crewai/translations/en.json @@ -10,9 +10,10 @@ "memory": "\n\n# Useful context: \n{memory}", "role_playing": "You are {role}. {backstory}\nYour personal goal is: {goal}", "tools": "\nYou ONLY have access to the following tools, and should NEVER make up tools that are not listed here:\n\n{tools}\n\nIMPORTANT: Use the following format in your response:\n\n```\nThought: you should always think about what to do\nAction: the action to take, only one name of [{tool_names}], just the name, exactly as it's written.\nAction Input: the input to the action, just a simple JSON object, enclosed in curly braces, using \" to wrap keys and values.\nObservation: the result of the action\n```\n\nOnce all necessary information is gathered, return the following format:\n\n```\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n```", - "no_tools": "\nTo give my best complete final answer to the task respond using the exact following format:\n\nThought: I now can give a great answer\nFinal Answer: Your final answer must be the great and the most complete as possible, it must be outcome described.\n\nI MUST use these formats, my job depends on it!", - "native_tools": "\nUse available tools to gather information and complete your task.", - "native_task": "\nCurrent Task: {input}\n\nThis is VERY important to you, your job depends on it!", + "no_tools": "", + "task_no_tools": "\nCurrent Task: {input}\n\nProvide your complete response:", + "native_tools": "", + "native_task": "\nCurrent Task: {input}", "post_tool_reasoning": "Analyze the tool result. If requirements are met, provide the Final Answer. Otherwise, call the next tool. Deliver only the answer without meta-commentary.", "format": "Decide if you need a tool or can provide the final answer. 
Use one at a time.\nTo use a tool, use:\nThought: [reasoning]\nAction: [name from {tool_names}]\nAction Input: [JSON object]\n\nTo provide the final answer, use:\nThought: [reasoning]\nFinal Answer: [complete response]", "final_answer_format": "If you don't need to use any more tools, you must give your best complete final answer, make sure it satisfies the expected criteria, use the EXACT format below:\n\n```\nThought: I now can give a great answer\nFinal Answer: my best complete final answer to the task.\n\n```", diff --git a/lib/crewai/src/crewai/utilities/prompts.py b/lib/crewai/src/crewai/utilities/prompts.py index 26c8f112b..57b54be1c 100644 --- a/lib/crewai/src/crewai/utilities/prompts.py +++ b/lib/crewai/src/crewai/utilities/prompts.py @@ -23,7 +23,13 @@ class SystemPromptResult(StandardPromptResult): COMPONENTS = Literal[ - "role_playing", "tools", "no_tools", "native_tools", "task", "native_task" + "role_playing", + "tools", + "no_tools", + "native_tools", + "task", + "native_task", + "task_no_tools", ] @@ -74,11 +80,14 @@ class Prompts(BaseModel): slices.append("no_tools") system: str = self._build_prompt(slices) - # Use native_task for native tool calling (no "Thought:" prompt) - # Use task for ReAct pattern (includes "Thought:" prompt) - task_slice: COMPONENTS = ( - "native_task" if self.use_native_tool_calling else "task" - ) + # Determine which task slice to use: + task_slice: COMPONENTS + if self.use_native_tool_calling: + task_slice = "native_task" + elif self.has_tools: + task_slice = "task" + else: + task_slice = "task_no_tools" slices.append(task_slice) if ( diff --git a/lib/crewai/tests/cassettes/utilities/TestRealLLMNoThoughtLeakage.test_agent_without_tools_no_thought_in_output.yaml b/lib/crewai/tests/cassettes/utilities/TestRealLLMNoThoughtLeakage.test_agent_without_tools_no_thought_in_output.yaml new file mode 100644 index 000000000..e09b5ac53 --- /dev/null +++ b/lib/crewai/tests/cassettes/utilities/TestRealLLMNoThoughtLeakage.test_agent_without_tools_no_thought_in_output.yaml @@ -0,0 +1,112 @@ +interactions: +- request: + body: '{"messages":[{"role":"system","content":"You are Language Detector. 
You + are an expert linguist who can identify languages.\nYour personal goal is: Detect + the language of text"},{"role":"user","content":"\nCurrent Task: What language + is this text written in: ''Hello, how are you?''\n\nThis is the expected criteria + for your final answer: The detected language (e.g., English, Spanish, etc.)\nyou + MUST return the actual complete content as the final answer, not a summary.\n\nProvide + your complete response:"}],"model":"gpt-4o-mini"}' + headers: + User-Agent: + - X-USER-AGENT-XXX + accept: + - application/json + accept-encoding: + - ACCEPT-ENCODING-XXX + authorization: + - AUTHORIZATION-XXX + connection: + - keep-alive + content-length: + - '530' + content-type: + - application/json + host: + - api.openai.com + x-stainless-arch: + - X-STAINLESS-ARCH-XXX + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - X-STAINLESS-OS-XXX + x-stainless-package-version: + - 1.83.0 + x-stainless-read-timeout: + - X-STAINLESS-READ-TIMEOUT-XXX + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.13.3 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: "{\n \"id\": \"chatcmpl-D39bkotgEapBcz1sSIXvhPhK9G7FD\",\n \"object\": + \"chat.completion\",\n \"created\": 1769644288,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n + \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\": + \"assistant\",\n \"content\": \"English\",\n \"refusal\": null,\n + \ \"annotations\": []\n },\n \"logprobs\": null,\n \"finish_reason\": + \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 101,\n \"completion_tokens\": + 1,\n \"total_tokens\": 102,\n \"prompt_tokens_details\": {\n \"cached_tokens\": + 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": + {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\": + 0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\": + \"default\",\n \"system_fingerprint\": \"fp_3683ee3deb\"\n}\n" + headers: + CF-RAY: + - CF-RAY-XXX + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Wed, 28 Jan 2026 23:51:28 GMT + Server: + - cloudflare + Set-Cookie: + - SET-COOKIE-XXX + Strict-Transport-Security: + - STS-XXX + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - X-CONTENT-TYPE-XXX + access-control-expose-headers: + - ACCESS-CONTROL-XXX + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + openai-organization: + - OPENAI-ORG-XXX + openai-processing-ms: + - '279' + openai-project: + - OPENAI-PROJECT-XXX + openai-version: + - '2020-10-01' + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - X-RATELIMIT-LIMIT-REQUESTS-XXX + x-ratelimit-limit-tokens: + - X-RATELIMIT-LIMIT-TOKENS-XXX + x-ratelimit-remaining-requests: + - X-RATELIMIT-REMAINING-REQUESTS-XXX + x-ratelimit-remaining-tokens: + - X-RATELIMIT-REMAINING-TOKENS-XXX + x-ratelimit-reset-requests: + - X-RATELIMIT-RESET-REQUESTS-XXX + x-ratelimit-reset-tokens: + - X-RATELIMIT-RESET-TOKENS-XXX + x-request-id: + - X-REQUEST-ID-XXX + status: + code: 200 + message: OK +version: 1 diff --git a/lib/crewai/tests/cassettes/utilities/TestRealLLMNoThoughtLeakage.test_simple_task_clean_output.yaml b/lib/crewai/tests/cassettes/utilities/TestRealLLMNoThoughtLeakage.test_simple_task_clean_output.yaml new file mode 100644 index 000000000..5a8d97845 --- /dev/null +++ b/lib/crewai/tests/cassettes/utilities/TestRealLLMNoThoughtLeakage.test_simple_task_clean_output.yaml @@ -0,0 +1,111 
@@ +interactions: +- request: + body: '{"messages":[{"role":"system","content":"You are Classifier. You classify + text sentiment accurately.\nYour personal goal is: Classify text sentiment"},{"role":"user","content":"\nCurrent + Task: Classify the sentiment of: ''I love this product!''\n\nThis is the expected + criteria for your final answer: One word: positive, negative, or neutral\nyou + MUST return the actual complete content as the final answer, not a summary.\n\nProvide + your complete response:"}],"model":"gpt-4o-mini"}' + headers: + User-Agent: + - X-USER-AGENT-XXX + accept: + - application/json + accept-encoding: + - ACCEPT-ENCODING-XXX + authorization: + - AUTHORIZATION-XXX + connection: + - keep-alive + content-length: + - '481' + content-type: + - application/json + host: + - api.openai.com + x-stainless-arch: + - X-STAINLESS-ARCH-XXX + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - X-STAINLESS-OS-XXX + x-stainless-package-version: + - 1.83.0 + x-stainless-read-timeout: + - X-STAINLESS-READ-TIMEOUT-XXX + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.13.3 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: "{\n \"id\": \"chatcmpl-D39bkVPelOZanWIMBoIyzsuj072sM\",\n \"object\": + \"chat.completion\",\n \"created\": 1769644288,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n + \ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\": + \"assistant\",\n \"content\": \"positive\",\n \"refusal\": null,\n + \ \"annotations\": []\n },\n \"logprobs\": null,\n \"finish_reason\": + \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 89,\n \"completion_tokens\": + 1,\n \"total_tokens\": 90,\n \"prompt_tokens_details\": {\n \"cached_tokens\": + 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": + {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\": + 0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\": + \"default\",\n \"system_fingerprint\": \"fp_3683ee3deb\"\n}\n" + headers: + CF-RAY: + - CF-RAY-XXX + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Wed, 28 Jan 2026 23:51:29 GMT + Server: + - cloudflare + Set-Cookie: + - SET-COOKIE-XXX + Strict-Transport-Security: + - STS-XXX + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - X-CONTENT-TYPE-XXX + access-control-expose-headers: + - ACCESS-CONTROL-XXX + alt-svc: + - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC + openai-organization: + - OPENAI-ORG-XXX + openai-processing-ms: + - '323' + openai-project: + - OPENAI-PROJECT-XXX + openai-version: + - '2020-10-01' + x-openai-proxy-wasm: + - v0.1 + x-ratelimit-limit-requests: + - X-RATELIMIT-LIMIT-REQUESTS-XXX + x-ratelimit-limit-tokens: + - X-RATELIMIT-LIMIT-TOKENS-XXX + x-ratelimit-remaining-requests: + - X-RATELIMIT-REMAINING-REQUESTS-XXX + x-ratelimit-remaining-tokens: + - X-RATELIMIT-REMAINING-TOKENS-XXX + x-ratelimit-reset-requests: + - X-RATELIMIT-RESET-REQUESTS-XXX + x-ratelimit-reset-tokens: + - X-RATELIMIT-RESET-TOKENS-XXX + x-request-id: + - X-REQUEST-ID-XXX + status: + code: 200 + message: OK +version: 1 diff --git a/lib/crewai/tests/utilities/test_prompts_no_thought_leakage.py b/lib/crewai/tests/utilities/test_prompts_no_thought_leakage.py new file mode 100644 index 000000000..8ece3e765 --- /dev/null +++ b/lib/crewai/tests/utilities/test_prompts_no_thought_leakage.py @@ -0,0 +1,234 @@ +"""Tests for prompt generation to prevent thought 
leakage. + +These tests verify that: +1. Agents without tools don't get ReAct format instructions +2. The generated prompts don't encourage "Thought:" prefixes that leak into output +3. Real LLM calls produce clean output without internal reasoning +""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import pytest + +from crewai import Agent, Crew, Task +from crewai.llm import LLM +from crewai.utilities.prompts import Prompts + + +class TestNoToolsPromptGeneration: + """Tests for prompt generation when agent has no tools.""" + + def test_no_tools_uses_task_no_tools_slice(self) -> None: + """Test that agents without tools use task_no_tools slice instead of task.""" + mock_agent = MagicMock() + mock_agent.role = "Test Agent" + mock_agent.goal = "Test goal" + mock_agent.backstory = "Test backstory" + + prompts = Prompts( + has_tools=False, + use_native_tool_calling=False, + use_system_prompt=True, + agent=mock_agent, + ) + + result = prompts.task_execution() + + # Verify it's a SystemPromptResult with system and user keys + assert "system" in result + assert "user" in result + assert "prompt" in result + + # The user prompt should NOT contain "Thought:" (ReAct format) + assert "Thought:" not in result["user"] + + # The user prompt should NOT mention tools + assert "use the tools available" not in result["user"] + assert "tools available" not in result["user"].lower() + + # The system prompt should NOT contain ReAct format instructions + assert "Thought:" not in result["system"] + assert "Final Answer:" not in result["system"] + + def test_no_tools_prompt_is_simple(self) -> None: + """Test that no-tools prompt is simple and direct.""" + mock_agent = MagicMock() + mock_agent.role = "Language Detector" + mock_agent.goal = "Detect language" + mock_agent.backstory = "Expert linguist" + + prompts = Prompts( + has_tools=False, + use_native_tool_calling=False, + use_system_prompt=True, + agent=mock_agent, + ) + + result = prompts.task_execution() + + # Should contain the role playing info + assert "Language Detector" in result["system"] + + # User prompt should be simple with just the task + assert "Current Task:" in result["user"] + assert "Provide your complete response:" in result["user"] + + def test_with_tools_uses_task_slice_with_react(self) -> None: + """Test that agents WITH tools use the task slice (ReAct format).""" + mock_agent = MagicMock() + mock_agent.role = "Test Agent" + mock_agent.goal = "Test goal" + mock_agent.backstory = "Test backstory" + + prompts = Prompts( + has_tools=True, + use_native_tool_calling=False, + use_system_prompt=True, + agent=mock_agent, + ) + + result = prompts.task_execution() + + # With tools and ReAct, the prompt SHOULD contain Thought: + assert "Thought:" in result["user"] + + def test_native_tools_uses_native_task_slice(self) -> None: + """Test that native tool calling uses native_task slice.""" + mock_agent = MagicMock() + mock_agent.role = "Test Agent" + mock_agent.goal = "Test goal" + mock_agent.backstory = "Test backstory" + + prompts = Prompts( + has_tools=True, + use_native_tool_calling=True, + use_system_prompt=True, + agent=mock_agent, + ) + + result = prompts.task_execution() + + # Native tool calling should NOT have Thought: in user prompt + assert "Thought:" not in result["user"] + + # Should NOT have emotional manipulation + assert "your job depends on it" not in result["user"] + + +class TestNoThoughtLeakagePatterns: + """Tests to verify prompts don't encourage thought leakage.""" + + def 
test_no_job_depends_on_it_in_no_tools(self) -> None: + """Test that 'your job depends on it' is not in no-tools prompts.""" + mock_agent = MagicMock() + mock_agent.role = "Test" + mock_agent.goal = "Test" + mock_agent.backstory = "Test" + + prompts = Prompts( + has_tools=False, + use_native_tool_calling=False, + use_system_prompt=True, + agent=mock_agent, + ) + + result = prompts.task_execution() + + full_prompt = result["prompt"] + assert "your job depends on it" not in full_prompt.lower() + assert "i must use these formats" not in full_prompt.lower() + + def test_no_job_depends_on_it_in_native_task(self) -> None: + """Test that 'your job depends on it' is not in native task prompts.""" + mock_agent = MagicMock() + mock_agent.role = "Test" + mock_agent.goal = "Test" + mock_agent.backstory = "Test" + + prompts = Prompts( + has_tools=True, + use_native_tool_calling=True, + use_system_prompt=True, + agent=mock_agent, + ) + + result = prompts.task_execution() + + full_prompt = result["prompt"] + assert "your job depends on it" not in full_prompt.lower() + + +class TestRealLLMNoThoughtLeakage: + """Integration tests with real LLM calls to verify no thought leakage.""" + + @pytest.mark.vcr() + def test_agent_without_tools_no_thought_in_output(self) -> None: + """Test that agent without tools produces clean output without 'Thought:' prefix.""" + agent = Agent( + role="Language Detector", + goal="Detect the language of text", + backstory="You are an expert linguist who can identify languages.", + tools=[], # No tools + llm=LLM(model="gpt-4o-mini"), + verbose=False, + ) + + task = Task( + description="What language is this text written in: 'Hello, how are you?'", + expected_output="The detected language (e.g., English, Spanish, etc.)", + agent=agent, + ) + + crew = Crew(agents=[agent], tasks=[task]) + result = crew.kickoff() + + assert result is not None + assert result.raw is not None + + # The output should NOT start with "Thought:" or contain ReAct artifacts + output = str(result.raw) + assert not output.strip().startswith("Thought:") + assert "Final Answer:" not in output + assert "I now can give a great answer" not in output + + # Should contain an actual answer about the language + assert any( + lang in output.lower() + for lang in ["english", "en", "language"] + ) + + @pytest.mark.vcr() + def test_simple_task_clean_output(self) -> None: + """Test that a simple task produces clean output without internal reasoning.""" + agent = Agent( + role="Classifier", + goal="Classify text sentiment", + backstory="You classify text sentiment accurately.", + tools=[], + llm=LLM(model="gpt-4o-mini"), + verbose=False, + ) + + task = Task( + description="Classify the sentiment of: 'I love this product!'", + expected_output="One word: positive, negative, or neutral", + agent=agent, + ) + + crew = Crew(agents=[agent], tasks=[task]) + result = crew.kickoff() + + assert result is not None + output = str(result.raw).strip().lower() + + # Output should be clean - just the classification + assert not output.startswith("thought:") + assert "final answer:" not in output + + # Should contain the actual classification + assert any( + sentiment in output + for sentiment in ["positive", "negative", "neutral"] + )
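
Reviewer note: the behavioral core of this patch is the three-way task-slice selection added to `Prompts.task_execution()` in `utilities/prompts.py`. Below is a minimal, self-contained sketch of that branch and of the resulting no-tools user prompt. The standalone helper names (`select_task_slice`, `TASK_NO_TOOLS`, `NATIVE_TASK`) are illustrative only and not part of the patch; the template strings are copied verbatim from the `en.json` hunk above.

```python
from typing import Literal

TaskSlice = Literal["task", "native_task", "task_no_tools"]

# Templates copied from the patched en.json slices (illustrative constants,
# not the real loading path, which goes through the i18n JSON).
NATIVE_TASK = "\nCurrent Task: {input}"
TASK_NO_TOOLS = "\nCurrent Task: {input}\n\nProvide your complete response:"


def select_task_slice(has_tools: bool, use_native_tool_calling: bool) -> TaskSlice:
    """Mirrors the patched branch in Prompts.task_execution().

    - Native tool calling: "native_task" (no ReAct scaffolding).
    - Classic ReAct with tools: "task" (keeps the Thought:/Action: format).
    - No tools at all: the new "task_no_tools" slice, which omits the ReAct
      format entirely so a literal "Thought:" prefix cannot leak into output.
    """
    if use_native_tool_calling:
        return "native_task"
    if has_tools:
        return "task"
    return "task_no_tools"


# The three paths, matching the new tests in test_prompts_no_thought_leakage.py:
assert select_task_slice(has_tools=True, use_native_tool_calling=True) == "native_task"
assert select_task_slice(has_tools=True, use_native_tool_calling=False) == "task"
assert select_task_slice(has_tools=False, use_native_tool_calling=False) == "task_no_tools"

# On the no-tools path the user message reduces to the plain task text, which
# is what the recorded cassettes show being sent to the API:
print(TASK_NO_TOOLS.format(input="Classify the sentiment of: 'I love this product!'"))
```

This is a sketch under the assumption that `has_tools` and `use_native_tool_calling` are the only inputs to the branch, as in the hunk above; the real method additionally appends the chosen slice name to `slices` and renders it through `_build_prompt`.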