"""Integration tests for native tool calling functionality. These tests verify that agents can use native function calling when the LLM supports it, across multiple providers. """ from __future__ import annotations import os from unittest.mock import patch import pytest from pydantic import BaseModel, Field from crewai import Agent, Crew, Task from crewai.llm import LLM from crewai.tools.base_tool import BaseTool class CalculatorInput(BaseModel): """Input schema for calculator tool.""" expression: str = Field(description="Mathematical expression to evaluate") class CalculatorTool(BaseTool): """A calculator tool that performs mathematical calculations.""" name: str = "calculator" description: str = "Perform mathematical calculations. Use this for any math operations." args_schema: type[BaseModel] = CalculatorInput def _run(self, expression: str) -> str: """Execute the calculation.""" try: # Safe evaluation for basic math result = eval(expression) # noqa: S307 return f"The result of {expression} is {result}" except Exception as e: return f"Error calculating {expression}: {e}" class WeatherInput(BaseModel): """Input schema for weather tool.""" location: str = Field(description="City name to get weather for") class WeatherTool(BaseTool): """A mock weather tool for testing.""" name: str = "get_weather" description: str = "Get the current weather for a location" args_schema: type[BaseModel] = WeatherInput def _run(self, location: str) -> str: """Get weather (mock implementation).""" return f"The weather in {location} is sunny with a temperature of 72°F" class FailingTool(BaseTool): """A tool that always fails.""" name: str = "failing_tool" description: str = "This tool always fails" def _run(self) -> str: raise Exception("This tool always fails") @pytest.fixture def calculator_tool() -> CalculatorTool: """Create a calculator tool for testing.""" return CalculatorTool() @pytest.fixture def weather_tool() -> WeatherTool: """Create a weather tool for testing.""" return WeatherTool() @pytest.fixture def failing_tool() -> BaseTool: """Create a weather tool for testing.""" return FailingTool( ) # ============================================================================= # OpenAI Provider Tests # ============================================================================= class TestOpenAINativeToolCalling: """Tests for native tool calling with OpenAI models.""" @pytest.mark.vcr() def test_openai_agent_with_native_tool_calling( self, calculator_tool: CalculatorTool ) -> None: """Test OpenAI agent can use native tool calling.""" agent = Agent( role="Math Assistant", goal="Help users with mathematical calculations", backstory="You are a helpful math assistant.", tools=[calculator_tool], llm=LLM(model="gpt-4o-mini"), verbose=False, max_iter=3, ) task = Task( description="Calculate what is 15 * 8", expected_output="The result of the calculation", agent=agent, ) crew = Crew(agents=[agent], tasks=[task]) result = crew.kickoff() assert result is not None assert result.raw is not None assert "120" in str(result.raw) def test_openai_agent_kickoff_with_tools_mocked( self, calculator_tool: CalculatorTool ) -> None: """Test OpenAI agent kickoff with mocked LLM call.""" llm = LLM(model="gpt-4o-mini") with patch.object(llm, "call", return_value="The answer is 120.") as mock_call: agent = Agent( role="Math Assistant", goal="Calculate math", backstory="You calculate.", tools=[calculator_tool], llm=llm, verbose=False, ) task = Task( description="Calculate 15 * 8", expected_output="Result", agent=agent, ) crew = 

# =============================================================================
# Anthropic Provider Tests
# =============================================================================


class TestAnthropicNativeToolCalling:
    """Tests for native tool calling with Anthropic models."""

    @pytest.fixture(autouse=True)
    def mock_anthropic_api_key(self):
        """Mock ANTHROPIC_API_KEY for tests."""
        if "ANTHROPIC_API_KEY" not in os.environ:
            with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "test-key"}):
                yield
        else:
            yield

    @pytest.mark.vcr()
    def test_anthropic_agent_with_native_tool_calling(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Anthropic agent can use native tool calling."""
        agent = Agent(
            role="Math Assistant",
            goal="Help users with mathematical calculations",
            backstory="You are a helpful math assistant.",
            tools=[calculator_tool],
            llm=LLM(model="anthropic/claude-3-5-haiku-20241022"),
            verbose=False,
            max_iter=3,
        )
        task = Task(
            description="Calculate what is 15 * 8",
            expected_output="The result of the calculation",
            agent=agent,
        )
        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.raw is not None

    def test_anthropic_agent_kickoff_with_tools_mocked(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Anthropic agent kickoff with mocked LLM call."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
        with patch.object(llm, "call", return_value="The answer is 120.") as mock_call:
            agent = Agent(
                role="Math Assistant",
                goal="Calculate math",
                backstory="You calculate.",
                tools=[calculator_tool],
                llm=llm,
                verbose=False,
            )
            task = Task(
                description="Calculate 15 * 8",
                expected_output="Result",
                agent=agent,
            )
            crew = Crew(agents=[agent], tasks=[task])
            result = crew.kickoff()

            assert mock_call.called
            assert result is not None


# =============================================================================
# Google/Gemini Provider Tests
# =============================================================================


class TestGeminiNativeToolCalling:
    """Tests for native tool calling with Gemini models."""

    @pytest.fixture(autouse=True)
    def mock_google_api_key(self):
        """Mock GOOGLE_API_KEY for tests."""
        if "GOOGLE_API_KEY" not in os.environ and "GEMINI_API_KEY" not in os.environ:
            with patch.dict(os.environ, {"GOOGLE_API_KEY": "test-key"}):
                yield
        else:
            yield

    @pytest.mark.vcr()
    def test_gemini_agent_with_native_tool_calling(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Gemini agent can use native tool calling."""
        agent = Agent(
            role="Math Assistant",
            goal="Help users with mathematical calculations",
            backstory="You are a helpful math assistant.",
            tools=[calculator_tool],
            llm=LLM(model="gemini/gemini-2.0-flash-exp"),
        )
        task = Task(
            description="Calculate what is 15 * 8",
            expected_output="The result of the calculation",
            agent=agent,
        )
        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.raw is not None

    def test_gemini_agent_kickoff_with_tools_mocked(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Gemini agent kickoff with mocked LLM call."""
        llm = LLM(model="gemini/gemini-2.0-flash-001")
        with patch.object(llm, "call", return_value="The answer is 120.") as mock_call:
            agent = Agent(
                role="Math Assistant",
                goal="Calculate math",
                backstory="You calculate.",
                tools=[calculator_tool],
                llm=llm,
                verbose=False,
            )
            task = Task(
                description="Calculate 15 * 8",
                expected_output="Result",
                agent=agent,
            )
expected_output="Result", agent=agent, ) crew = Crew(agents=[agent], tasks=[task]) result = crew.kickoff() assert mock_call.called assert result is not None # ============================================================================= # Azure Provider Tests # ============================================================================= class TestAzureNativeToolCalling: """Tests for native tool calling with Azure OpenAI models.""" @pytest.fixture(autouse=True) def mock_azure_env(self): """Mock Azure environment variables for tests.""" env_vars = { "AZURE_API_KEY": "test-key", "AZURE_API_BASE": "https://test.openai.azure.com", "AZURE_API_VERSION": "2024-02-15-preview", } # Only patch if keys are not already in environment if "AZURE_API_KEY" not in os.environ: with patch.dict(os.environ, env_vars): yield else: yield @pytest.mark.vcr() def test_azure_agent_with_native_tool_calling( self, calculator_tool: CalculatorTool ) -> None: """Test Azure agent can use native tool calling.""" agent = Agent( role="Math Assistant", goal="Help users with mathematical calculations", backstory="You are a helpful math assistant.", tools=[calculator_tool], llm=LLM(model="azure/gpt-4o-mini"), verbose=False, max_iter=3, ) task = Task( description="Calculate what is 15 * 8", expected_output="The result of the calculation", agent=agent, ) crew = Crew(agents=[agent], tasks=[task]) result = crew.kickoff() assert result is not None assert result.raw is not None assert "120" in str(result.raw) def test_azure_agent_kickoff_with_tools_mocked( self, calculator_tool: CalculatorTool ) -> None: """Test Azure agent kickoff with mocked LLM call.""" llm = LLM( model="azure/gpt-4o-mini", api_key="test-key", base_url="https://test.openai.azure.com", ) with patch.object(llm, "call", return_value="The answer is 120.") as mock_call: agent = Agent( role="Math Assistant", goal="Calculate math", backstory="You calculate.", tools=[calculator_tool], llm=llm, verbose=False, ) task = Task( description="Calculate 15 * 8", expected_output="Result", agent=agent, ) crew = Crew(agents=[agent], tasks=[task]) result = crew.kickoff() assert mock_call.called assert result is not None # ============================================================================= # Bedrock Provider Tests # ============================================================================= class TestBedrockNativeToolCalling: """Tests for native tool calling with AWS Bedrock models.""" @pytest.fixture(autouse=True) def mock_aws_env(self): """Mock AWS environment variables for tests.""" env_vars = { "AWS_ACCESS_KEY_ID": "test-key", "AWS_SECRET_ACCESS_KEY": "test-secret", "AWS_REGION": "us-east-1", } if "AWS_ACCESS_KEY_ID" not in os.environ: with patch.dict(os.environ, env_vars): yield else: yield @pytest.mark.vcr() def test_bedrock_agent_kickoff_with_tools_mocked( self, calculator_tool: CalculatorTool ) -> None: """Test Bedrock agent kickoff with mocked LLM call.""" llm = LLM(model="bedrock/anthropic.claude-3-haiku-20240307-v1:0") agent = Agent( role="Math Assistant", goal="Calculate math", backstory="You calculate.", tools=[calculator_tool], llm=llm, verbose=False, max_iter=5, ) task = Task( description="Calculate 15 * 8", expected_output="Result", agent=agent, ) crew = Crew(agents=[agent], tasks=[task]) result = crew.kickoff() assert result is not None assert result.raw is not None assert "120" in str(result.raw) # ============================================================================= # Cross-Provider Native Tool Calling Behavior Tests # 

class TestNativeToolCallingBehavior:
    """Tests for native tool calling behavior across providers."""

    def test_supports_function_calling_check(self) -> None:
        """Test that supports_function_calling() is properly checked."""
        # OpenAI should support function calling
        openai_llm = LLM(model="gpt-4o-mini")
        assert hasattr(openai_llm, "supports_function_calling")
        assert openai_llm.supports_function_calling() is True

    def test_anthropic_supports_function_calling(self) -> None:
        """Test that Anthropic models support function calling."""
        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "test-key"}):
            llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
            assert hasattr(llm, "supports_function_calling")
            assert llm.supports_function_calling() is True

    def test_gemini_supports_function_calling(self) -> None:
        """Test that Gemini models support function calling."""
        llm = LLM(model="gemini/gemini-2.5-flash")
        assert hasattr(llm, "supports_function_calling")
        assert llm.supports_function_calling() is True


# =============================================================================
# Token Usage Tests
# =============================================================================


class TestNativeToolCallingTokenUsage:
    """Tests for token usage with native tool calling."""

    @pytest.mark.vcr()
    def test_openai_native_tool_calling_token_usage(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test token usage tracking with OpenAI native tool calling."""
        agent = Agent(
            role="Calculator",
            goal="Perform calculations efficiently",
            backstory="You calculate things.",
            tools=[calculator_tool],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
            max_iter=3,
        )
        task = Task(
            description="What is 100 / 4?",
            expected_output="The result",
            agent=agent,
        )
        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.token_usage is not None
        assert result.token_usage.total_tokens > 0
        assert result.token_usage.successful_requests >= 1

        print("\n[OPENAI NATIVE TOOL CALLING TOKEN USAGE]")
        print(f"  Prompt tokens: {result.token_usage.prompt_tokens}")
        print(f"  Completion tokens: {result.token_usage.completion_tokens}")
        print(f"  Total tokens: {result.token_usage.total_tokens}")


@pytest.mark.vcr()
def test_native_tool_calling_error_handling(failing_tool: FailingTool):
    """Test that native tool calling handles errors properly and emits error events."""
    import threading

    from crewai.events import crewai_event_bus
    from crewai.events.types.tool_usage_events import ToolUsageErrorEvent

    received_events = []
    event_received = threading.Event()

    @crewai_event_bus.on(ToolUsageErrorEvent)
    def handle_tool_error(source, event):
        received_events.append(event)
        event_received.set()

    agent = Agent(
        role="Calculator",
        goal="Perform calculations efficiently",
        backstory="You calculate things.",
        tools=[failing_tool],
        llm=LLM(model="gpt-4o-mini"),
        verbose=False,
        max_iter=3,
    )
    result = agent.kickoff("Use the failing_tool to do something.")

    assert result is not None

    # Verify error event was emitted
    assert event_received.wait(timeout=10), "ToolUsageErrorEvent was not emitted"
    assert len(received_events) >= 1

    # Verify event attributes
    error_event = received_events[0]
    assert error_event.tool_name == "failing_tool"
    assert error_event.agent_role == agent.role
    assert "This tool always fails" in str(error_event.error)


# =============================================================================
# Max Usage Count Tests for Native Tool Calling
# =============================================================================

class CountingInput(BaseModel):
    """Input schema for counting tool."""

    value: str = Field(description="Value to count")


class CountingTool(BaseTool):
    """A tool that counts its usage."""

    name: str = "counting_tool"
    description: str = "A tool that counts how many times it's been called"
    args_schema: type[BaseModel] = CountingInput

    def _run(self, value: str) -> str:
        """Return the value with a count prefix."""
        return f"Counted: {value}"


class TestMaxUsageCountWithNativeToolCalling:
    """Tests for max_usage_count with native tool calling."""

    @pytest.mark.vcr()
    def test_max_usage_count_tracked_in_native_tool_calling(self) -> None:
        """Test that max_usage_count is properly tracked when using native tool calling."""
        tool = CountingTool(max_usage_count=3)

        # Verify initial state
        assert tool.max_usage_count == 3
        assert tool.current_usage_count == 0

        agent = Agent(
            role="Counting Agent",
            goal="Call the counting tool multiple times",
            backstory="You are an agent that counts things.",
            tools=[tool],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
            max_iter=5,
        )
        task = Task(
            description="Call the counting_tool 3 times with values 'first', 'second', and 'third'",
            expected_output="The results of the counting operations",
            agent=agent,
        )
        crew = Crew(agents=[agent], tasks=[task])
        crew.kickoff()

        # Verify usage count was tracked
        assert tool.max_usage_count == 3
        assert tool.current_usage_count <= tool.max_usage_count

    def test_max_usage_count_limit_enforced_in_native_tool_calling(self) -> None:
        """Test that when max_usage_count is reached, tool returns error message."""
        tool = CountingTool(max_usage_count=2)
        # Manually simulate tool being at max usage
        tool.current_usage_count = 2

        agent = Agent(
            role="Counting Agent",
            goal="Try to use the counting tool",
            backstory="You are an agent that counts things.",
            tools=[tool],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
            max_iter=3,
        )

        # Verify the tool is at max usage
        assert tool.current_usage_count >= tool.max_usage_count
        # The tool should report it has reached its limit when the agent tries to use it
        # This is handled in _handle_native_tool_calls / execute_native_tool

    def test_tool_usage_increments_after_successful_execution(self) -> None:
        """Test that usage count increments after each successful native tool call."""
        tool = CountingTool(max_usage_count=10)
        assert tool.current_usage_count == 0

        # Simulate direct tool execution (which happens during native tool calling)
        result = tool.run(value="test")
        assert "Counted: test" in result
        assert tool.current_usage_count == 1

        result = tool.run(value="test2")
        assert "Counted: test2" in result
        assert tool.current_usage_count == 2