feat: Add extended thinking support for Anthropic Claude

- Add thinking parameter to AnthropicCompletion.__init__ - Include thinking parameter in API calls via _prepare_completion_params - Thinking blocks are automatically preserved in tool use conversations - Add comprehensive tests for extended thinking with tool use - Fixes #3964 Co-Authored-By: João <joao@crewai.com>
2025-12-16 04:18:35 +00:00 · 2025-11-23 02:49:13 +00:00
parent bcc3e358cb
commit 9b7d475750
2 changed files with 257 additions and 0 deletions
--- a/lib/crewai/src/crewai/llms/providers/anthropic/completion.py
+++ b/lib/crewai/src/crewai/llms/providers/anthropic/completion.py
@@ -50,6 +50,7 @@ class AnthropicCompletion(BaseLLM):
        top_p: float | None = None,
        stop_sequences: list[str] | None = None,
        stream: bool = False,
+        thinking: dict[str, Any] | None = None,
        client_params: dict[str, Any] | None = None,
        interceptor: BaseInterceptor[httpx.Request, httpx.Response] | None = None,
        **kwargs: Any,
@@ -67,6 +68,7 @@ class AnthropicCompletion(BaseLLM):
            top_p: Nucleus sampling parameter
            stop_sequences: Stop sequences (Anthropic uses stop_sequences, not stop)
            stream: Enable streaming responses
+            thinking: Extended thinking configuration (e.g., {'type': 'enabled', 'budget_tokens': 5000})
            client_params: Additional parameters for the Anthropic client
            interceptor: HTTP interceptor for modifying requests/responses at transport level.
            **kwargs: Additional parameters
@@ -89,6 +91,7 @@ class AnthropicCompletion(BaseLLM):
        self.top_p = top_p
        self.stream = stream
        self.stop_sequences = stop_sequences or []
+        self.thinking = thinking

        # Model-specific settings
        self.is_claude_3 = "claude-3" in model.lower()
@@ -248,6 +251,10 @@ class AnthropicCompletion(BaseLLM):
        if self.stop_sequences:
            params["stop_sequences"] = self.stop_sequences

+        # Add extended thinking configuration if set
+        if self.thinking is not None:
+            params["thinking"] = self.thinking
+
        # Handle tools for Claude 3+
        if tools and self.supports_tools:
            params["tools"] = self._convert_tools_for_interference(tools)
--- a/lib/crewai/tests/llms/anthropic/test_anthropic.py
+++ b/lib/crewai/tests/llms/anthropic/test_anthropic.py
@@ -698,3 +698,253 @@ def test_anthropic_stop_sequences_sent_to_api():
    assert result is not None
    assert isinstance(result, str)
    assert len(result) > 0
+
+
+def test_anthropic_extended_thinking_parameter():
+    """
+    Test that AnthropicCompletion accepts and stores the thinking parameter
+    """
+    thinking_config = {"type": "enabled", "budget_tokens": 5000}
+    
+    llm = LLM(
+        model="anthropic/claude-3-5-sonnet-20241022",
+        thinking=thinking_config
+    )
+    
+    from crewai.llms.providers.anthropic.completion import AnthropicCompletion
+    assert isinstance(llm, AnthropicCompletion)
+    assert llm.thinking == thinking_config
+
+
+def test_anthropic_extended_thinking_added_to_api_call():
+    """
+    Test that the thinking parameter is added to the API call parameters
+    """
+    from crewai.llms.providers.anthropic.completion import AnthropicCompletion
+    
+    thinking_config = {"type": "enabled", "budget_tokens": 5000}
+    completion = AnthropicCompletion(
+        model="claude-3-5-sonnet-20241022",
+        thinking=thinking_config
+    )
+    
+    # Test _prepare_completion_params includes thinking
+    messages = [{"role": "user", "content": "Hello"}]
+    params = completion._prepare_completion_params(messages)
+    
+    assert "thinking" in params
+    assert params["thinking"] == thinking_config
+
+
+def test_anthropic_extended_thinking_not_added_when_none():
+    """
+    Test that the thinking parameter is not added to API call when None
+    """
+    from crewai.llms.providers.anthropic.completion import AnthropicCompletion
+    
+    completion = AnthropicCompletion(model="claude-3-5-sonnet-20241022")
+    
+    # Test _prepare_completion_params does not include thinking when None
+    messages = [{"role": "user", "content": "Hello"}]
+    params = completion._prepare_completion_params(messages)
+    
+    assert "thinking" not in params
+
+
+def test_anthropic_extended_thinking_with_tool_use_preserves_thinking_blocks():
+    """
+    Test that thinking blocks are preserved in tool use conversation flow
+    """
+    from unittest.mock import Mock, patch
+    from crewai.llms.providers.anthropic.completion import AnthropicCompletion
+    from anthropic.types.tool_use_block import ToolUseBlock
+    from anthropic.types import ThinkingBlock
+    
+    thinking_config = {"type": "enabled", "budget_tokens": 5000}
+    completion = AnthropicCompletion(
+        model="claude-3-5-sonnet-20241022",
+        thinking=thinking_config
+    )
+    
+    # Mock tool function
+    def mock_calculator(operation: str, a: int, b: int) -> str:
+        if operation == "add":
+            return str(a + b)
+        return "0"
+    
+    available_functions = {"calculator": mock_calculator}
+    
+    # Mock the Anthropic client responses
+    with patch.object(completion.client.messages, 'create') as mock_create:
+        # Mock thinking block
+        mock_thinking_block = Mock(spec=ThinkingBlock)
+        mock_thinking_block.type = "thinking"
+        mock_thinking_block.thinking = "I need to use the calculator tool to add 5 and 3"
+        
+        # Mock tool use block
+        mock_tool_use = Mock(spec=ToolUseBlock)
+        mock_tool_use.id = "tool_456"
+        mock_tool_use.name = "calculator"
+        mock_tool_use.input = {"operation": "add", "a": 5, "b": 3}
+        mock_tool_use.type = "tool_use"
+        
+        # Mock initial response with thinking block + tool use
+        mock_initial_response = Mock()
+        mock_initial_response.content = [mock_thinking_block, mock_tool_use]
+        mock_initial_response.usage = Mock()
+        mock_initial_response.usage.input_tokens = 100
+        mock_initial_response.usage.output_tokens = 50
+        
+        # Mock final response after tool result
+        mock_text_block = Mock()
+        mock_text_block.configure_mock(text="The sum of 5 and 3 is 8.")
+        
+        mock_final_response = Mock()
+        mock_final_response.content = [mock_text_block]
+        mock_final_response.usage = Mock()
+        mock_final_response.usage.input_tokens = 150
+        mock_final_response.usage.output_tokens = 75
+        
+        # Configure mock to return different responses on successive calls
+        mock_create.side_effect = [mock_initial_response, mock_final_response]
+        
+        # Test the call
+        messages = [{"role": "user", "content": "What is 5 + 3?"}]
+        result = completion.call(
+            messages=messages,
+            available_functions=available_functions
+        )
+        
+        # Verify the result contains the final response
+        assert "sum of 5 and 3 is 8" in result
+        
+        # Verify that two API calls were made (initial + follow-up)
+        assert mock_create.call_count == 2
+        
+        # Verify the first call includes thinking parameter
+        first_call_kwargs = mock_create.call_args_list[0][1]
+        assert "thinking" in first_call_kwargs
+        assert first_call_kwargs["thinking"] == thinking_config
+        
+        # Verify the second call includes thinking blocks in assistant message
+        second_call_kwargs = mock_create.call_args_list[1][1]
+        messages_in_second_call = second_call_kwargs["messages"]
+        
+        # Should have original user message + assistant (with thinking + tool_use) + user tool result
+        assert len(messages_in_second_call) == 3
+        assert messages_in_second_call[0]["role"] == "user"
+        assert messages_in_second_call[1]["role"] == "assistant"
+        assert messages_in_second_call[2]["role"] == "user"
+        
+        # Verify assistant message content includes both thinking and tool_use blocks
+        assistant_content = messages_in_second_call[1]["content"]
+        assert len(assistant_content) == 2
+        assert assistant_content[0] == mock_thinking_block  # Thinking block preserved
+        assert assistant_content[1] == mock_tool_use  # Tool use block preserved
+        
+        # Verify tool result format
+        tool_result = messages_in_second_call[2]["content"][0]
+        assert tool_result["type"] == "tool_result"
+        assert tool_result["tool_use_id"] == "tool_456"
+        assert "8" in tool_result["content"]
+        
+        # Verify the second call also includes thinking parameter
+        assert "thinking" in second_call_kwargs
+        assert second_call_kwargs["thinking"] == thinking_config
+
+
+def test_anthropic_extended_thinking_with_multiple_tool_calls():
+    """
+    Test that thinking blocks are preserved across multiple tool calls
+    """
+    from unittest.mock import Mock, patch
+    from crewai.llms.providers.anthropic.completion import AnthropicCompletion
+    from anthropic.types.tool_use_block import ToolUseBlock
+    from anthropic.types import ThinkingBlock
+    
+    thinking_config = {"type": "enabled", "budget_tokens": 10000}
+    completion = AnthropicCompletion(
+        model="claude-3-5-sonnet-20241022",
+        thinking=thinking_config
+    )
+    
+    # Mock tool functions
+    def mock_search(query: str) -> str:
+        return f"Search results for: {query}"
+    
+    def mock_summarize(text: str) -> str:
+        return f"Summary: {text[:20]}..."
+    
+    available_functions = {
+        "search": mock_search,
+        "summarize": mock_summarize
+    }
+    
+    # Mock the Anthropic client responses
+    with patch.object(completion.client.messages, 'create') as mock_create:
+        # Mock thinking block
+        mock_thinking_block = Mock(spec=ThinkingBlock)
+        mock_thinking_block.type = "thinking"
+        mock_thinking_block.thinking = "I should search first, then summarize"
+        
+        # Mock first tool use
+        mock_tool_use_1 = Mock(spec=ToolUseBlock)
+        mock_tool_use_1.id = "tool_1"
+        mock_tool_use_1.name = "search"
+        mock_tool_use_1.input = {"query": "AI agents"}
+        mock_tool_use_1.type = "tool_use"
+        
+        # Mock second tool use
+        mock_tool_use_2 = Mock(spec=ToolUseBlock)
+        mock_tool_use_2.id = "tool_2"
+        mock_tool_use_2.name = "summarize"
+        mock_tool_use_2.input = {"text": "Search results for: AI agents"}
+        mock_tool_use_2.type = "tool_use"
+        
+        # Mock initial response with thinking + multiple tool uses
+        mock_initial_response = Mock()
+        mock_initial_response.content = [mock_thinking_block, mock_tool_use_1, mock_tool_use_2]
+        mock_initial_response.usage = Mock()
+        mock_initial_response.usage.input_tokens = 100
+        mock_initial_response.usage.output_tokens = 50
+        
+        # Mock final response
+        mock_text_block = Mock()
+        mock_text_block.configure_mock(text="Here's the summary of AI agents research.")
+        
+        mock_final_response = Mock()
+        mock_final_response.content = [mock_text_block]
+        mock_final_response.usage = Mock()
+        mock_final_response.usage.input_tokens = 200
+        mock_final_response.usage.output_tokens = 100
+        
+        mock_create.side_effect = [mock_initial_response, mock_final_response]
+        
+        # Test the call
+        messages = [{"role": "user", "content": "Research AI agents and summarize"}]
+        result = completion.call(
+            messages=messages,
+            available_functions=available_functions
+        )
+        
+        # Verify result
+        assert "summary of AI agents" in result
+        
+        # Verify two API calls
+        assert mock_create.call_count == 2
+        
+        # Verify the second call preserves thinking block and all tool uses
+        second_call_kwargs = mock_create.call_args_list[1][1]
+        messages_in_second_call = second_call_kwargs["messages"]
+        
+        assistant_content = messages_in_second_call[1]["content"]
+        assert len(assistant_content) == 3  # thinking + 2 tool uses
+        assert assistant_content[0] == mock_thinking_block
+        assert assistant_content[1] == mock_tool_use_1
+        assert assistant_content[2] == mock_tool_use_2
+        
+        # Verify tool results
+        tool_results = messages_in_second_call[2]["content"]
+        assert len(tool_results) == 2
+        assert tool_results[0]["tool_use_id"] == "tool_1"
+        assert tool_results[1]["tool_use_id"] == "tool_2"