crewAI/lib/crewai/tests/agents/test_native_tool_calling.py

"""Integration tests for native tool calling functionality.

These tests verify that agents can use native function calling
when the LLM supports it, across multiple providers.
"""

from __future__ import annotations

import os
from unittest.mock import patch

import pytest
from pydantic import BaseModel, Field

from crewai import Agent, Crew, Task
from crewai.llm import LLM
from crewai.tools.base_tool import BaseTool


class CalculatorInput(BaseModel):
    """Input schema for calculator tool."""

    expression: str = Field(description="Mathematical expression to evaluate")


class CalculatorTool(BaseTool):
    """A calculator tool that performs mathematical calculations."""

    name: str = "calculator"
    description: str = "Perform mathematical calculations. Use this for any math operations."
    args_schema: type[BaseModel] = CalculatorInput

    def _run(self, expression: str) -> str:
        """Execute the calculation."""
        try:
            # Safe evaluation for basic math
            result = eval(expression)  # noqa: S307
            return f"The result of {expression} is {result}"
        except Exception as e:
            return f"Error calculating {expression}: {e}"


class WeatherInput(BaseModel):
    """Input schema for weather tool."""

    location: str = Field(description="City name to get weather for")


class WeatherTool(BaseTool):
    """A mock weather tool for testing."""

    name: str = "get_weather"
    description: str = "Get the current weather for a location"
    args_schema: type[BaseModel] = WeatherInput

    def _run(self, location: str) -> str:
        """Get weather (mock implementation)."""
        return f"The weather in {location} is sunny with a temperature of 72°F"

class FailingTool(BaseTool):
    """A tool that always fails."""
    name: str = "failing_tool"
    description: str = "This tool always fails"
    def _run(self) -> str:
        raise Exception("This tool always fails")

@pytest.fixture
def calculator_tool() -> CalculatorTool:
    """Create a calculator tool for testing."""
    return CalculatorTool()


@pytest.fixture
def weather_tool() -> WeatherTool:
    """Create a weather tool for testing."""
    return WeatherTool()

@pytest.fixture
def failing_tool() -> BaseTool:
    """Create a weather tool for testing."""
    return FailingTool(

    )

# =============================================================================
# OpenAI Provider Tests
# =============================================================================


class TestOpenAINativeToolCalling:
    """Tests for native tool calling with OpenAI models."""

    @pytest.mark.vcr()
    def test_openai_agent_with_native_tool_calling(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test OpenAI agent can use native tool calling."""
        agent = Agent(
            role="Math Assistant",
            goal="Help users with mathematical calculations",
            backstory="You are a helpful math assistant.",
            tools=[calculator_tool],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
            max_iter=3,
        )

        task = Task(
            description="Calculate what is 15 * 8",
            expected_output="The result of the calculation",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.raw is not None
        assert "120" in str(result.raw)

    def test_openai_agent_kickoff_with_tools_mocked(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test OpenAI agent kickoff with mocked LLM call."""
        llm = LLM(model="gpt-4o-mini")

        with patch.object(llm, "call", return_value="The answer is 120.") as mock_call:
            agent = Agent(
                role="Math Assistant",
                goal="Calculate math",
                backstory="You calculate.",
                tools=[calculator_tool],
                llm=llm,
                verbose=False,
            )

            task = Task(
                description="Calculate 15 * 8",
                expected_output="Result",
                agent=agent,
            )

            crew = Crew(agents=[agent], tasks=[task])
            result = crew.kickoff()

            assert mock_call.called
            assert result is not None


# =============================================================================
# Anthropic Provider Tests
# =============================================================================
class TestAnthropicNativeToolCalling:
    """Tests for native tool calling with Anthropic models."""

    @pytest.fixture(autouse=True)
    def mock_anthropic_api_key(self):
        """Mock ANTHROPIC_API_KEY for tests."""
        if "ANTHROPIC_API_KEY" not in os.environ:
            with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "test-key"}):
                yield
        else:
            yield

    @pytest.mark.vcr()
    def test_anthropic_agent_with_native_tool_calling(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Anthropic agent can use native tool calling."""
        agent = Agent(
            role="Math Assistant",
            goal="Help users with mathematical calculations",
            backstory="You are a helpful math assistant.",
            tools=[calculator_tool],
            llm=LLM(model="anthropic/claude-3-5-haiku-20241022"),
            verbose=False,
            max_iter=3,
        )

        task = Task(
            description="Calculate what is 15 * 8",
            expected_output="The result of the calculation",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.raw is not None

    def test_anthropic_agent_kickoff_with_tools_mocked(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Anthropic agent kickoff with mocked LLM call."""
        llm = LLM(model="anthropic/claude-3-5-haiku-20241022")

        with patch.object(llm, "call", return_value="The answer is 120.") as mock_call:
            agent = Agent(
                role="Math Assistant",
                goal="Calculate math",
                backstory="You calculate.",
                tools=[calculator_tool],
                llm=llm,
                verbose=False,
            )

            task = Task(
                description="Calculate 15 * 8",
                expected_output="Result",
                agent=agent,
            )

            crew = Crew(agents=[agent], tasks=[task])
            result = crew.kickoff()

            assert mock_call.called
            assert result is not None


# =============================================================================
# Google/Gemini Provider Tests
# =============================================================================


class TestGeminiNativeToolCalling:
    """Tests for native tool calling with Gemini models."""

    @pytest.fixture(autouse=True)
    def mock_google_api_key(self):
        """Mock GOOGLE_API_KEY for tests."""
        if "GOOGLE_API_KEY" not in os.environ and "GEMINI_API_KEY" not in os.environ:
            with patch.dict(os.environ, {"GOOGLE_API_KEY": "test-key"}):
                yield
        else:
            yield


    @pytest.mark.vcr()
    def test_gemini_agent_with_native_tool_calling(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Gemini agent can use native tool calling."""

        agent = Agent(
            role="Math Assistant",
            goal="Help users with mathematical calculations",
            backstory="You are a helpful math assistant.",
            tools=[calculator_tool],
            llm=LLM(model="gemini/gemini-2.0-flash-exp"),
        )

        task = Task(
            description="Calculate what is 15 * 8",
            expected_output="The result of the calculation",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.raw is not None

    def test_gemini_agent_kickoff_with_tools_mocked(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Gemini agent kickoff with mocked LLM call."""
        llm = LLM(model="gemini/gemini-2.0-flash-001")

        with patch.object(llm, "call", return_value="The answer is 120.") as mock_call:
            agent = Agent(
                role="Math Assistant",
                goal="Calculate math",
                backstory="You calculate.",
                tools=[calculator_tool],
                llm=llm,
                verbose=False,
            )

            task = Task(
                description="Calculate 15 * 8",
                expected_output="Result",
                agent=agent,
            )

            crew = Crew(agents=[agent], tasks=[task])
            result = crew.kickoff()

            assert mock_call.called
            assert result is not None


# =============================================================================
# Azure Provider Tests
# =============================================================================


class TestAzureNativeToolCalling:
    """Tests for native tool calling with Azure OpenAI models."""

    @pytest.fixture(autouse=True)
    def mock_azure_env(self):
        """Mock Azure environment variables for tests."""
        env_vars = {
            "AZURE_API_KEY": "test-key",
            "AZURE_API_BASE": "https://test.openai.azure.com",
            "AZURE_API_VERSION": "2024-02-15-preview",
        }
        # Only patch if keys are not already in environment
        if "AZURE_API_KEY" not in os.environ:
            with patch.dict(os.environ, env_vars):
                yield
        else:
            yield

    @pytest.mark.vcr()
    def test_azure_agent_with_native_tool_calling(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Azure agent can use native tool calling."""
        agent = Agent(
            role="Math Assistant",
            goal="Help users with mathematical calculations",
            backstory="You are a helpful math assistant.",
            tools=[calculator_tool],
            llm=LLM(model="azure/gpt-4o-mini"),
            verbose=False,
            max_iter=3,
        )

        task = Task(
            description="Calculate what is 15 * 8",
            expected_output="The result of the calculation",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.raw is not None
        assert "120" in str(result.raw)

    def test_azure_agent_kickoff_with_tools_mocked(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Azure agent kickoff with mocked LLM call."""
        llm = LLM(
            model="azure/gpt-4o-mini",
            api_key="test-key",
            base_url="https://test.openai.azure.com",
        )

        with patch.object(llm, "call", return_value="The answer is 120.") as mock_call:
            agent = Agent(
                role="Math Assistant",
                goal="Calculate math",
                backstory="You calculate.",
                tools=[calculator_tool],
                llm=llm,
                verbose=False,
            )

            task = Task(
                description="Calculate 15 * 8",
                expected_output="Result",
                agent=agent,
            )

            crew = Crew(agents=[agent], tasks=[task])
            result = crew.kickoff()

            assert mock_call.called
            assert result is not None


# =============================================================================
# Bedrock Provider Tests
# =============================================================================


class TestBedrockNativeToolCalling:
    """Tests for native tool calling with AWS Bedrock models."""

    @pytest.fixture(autouse=True)
    def mock_aws_env(self):
        """Mock AWS environment variables for tests."""
        env_vars = {
        "AWS_ACCESS_KEY_ID": "test-key",
        "AWS_SECRET_ACCESS_KEY": "test-secret",
        "AWS_REGION": "us-east-1",
        }
        if "AWS_ACCESS_KEY_ID" not in os.environ:
            with patch.dict(os.environ, env_vars):
                yield
        else:
            yield

    @pytest.mark.vcr()
    def test_bedrock_agent_kickoff_with_tools_mocked(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test Bedrock agent kickoff with mocked LLM call."""
        llm = LLM(model="bedrock/anthropic.claude-3-haiku-20240307-v1:0")

        agent = Agent(
            role="Math Assistant",
            goal="Calculate math",
            backstory="You calculate.",
            tools=[calculator_tool],
            llm=llm,
            verbose=False,
            max_iter=5,
        )

        task = Task(
            description="Calculate 15 * 8",
            expected_output="Result",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.raw is not None
        assert "120" in str(result.raw)


# =============================================================================
# Cross-Provider Native Tool Calling Behavior Tests
# =============================================================================


class TestNativeToolCallingBehavior:
    """Tests for native tool calling behavior across providers."""

    def test_supports_function_calling_check(self) -> None:
        """Test that supports_function_calling() is properly checked."""
        # OpenAI should support function calling
        openai_llm = LLM(model="gpt-4o-mini")
        assert hasattr(openai_llm, "supports_function_calling")
        assert openai_llm.supports_function_calling() is True

    def test_anthropic_supports_function_calling(self) -> None:
        """Test that Anthropic models support function calling."""
        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "test-key"}):
            llm = LLM(model="anthropic/claude-3-5-haiku-20241022")
            assert hasattr(llm, "supports_function_calling")
            assert llm.supports_function_calling() is True

    def test_gemini_supports_function_calling(self) -> None:
        """Test that Gemini models support function calling."""
        llm = LLM(model="gemini/gemini-2.5-flash")
        assert hasattr(llm, "supports_function_calling")
        assert llm.supports_function_calling() is True


# =============================================================================
# Token Usage Tests
# =============================================================================


class TestNativeToolCallingTokenUsage:
    """Tests for token usage with native tool calling."""

    @pytest.mark.vcr()
    def test_openai_native_tool_calling_token_usage(
        self, calculator_tool: CalculatorTool
    ) -> None:
        """Test token usage tracking with OpenAI native tool calling."""
        agent = Agent(
            role="Calculator",
            goal="Perform calculations efficiently",
            backstory="You calculate things.",
            tools=[calculator_tool],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
            max_iter=3,
        )

        task = Task(
            description="What is 100 / 4?",
            expected_output="The result",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.token_usage is not None
        assert result.token_usage.total_tokens > 0
        assert result.token_usage.successful_requests >= 1

        print(f"\n[OPENAI NATIVE TOOL CALLING TOKEN USAGE]")
        print(f"  Prompt tokens: {result.token_usage.prompt_tokens}")
        print(f"  Completion tokens: {result.token_usage.completion_tokens}")
        print(f"  Total tokens: {result.token_usage.total_tokens}")

@pytest.mark.vcr()
def test_native_tool_calling_error_handling(failing_tool: FailingTool):
    """Test that native tool calling handles errors properly and emits error events."""
    import threading
    from crewai.events import crewai_event_bus
    from crewai.events.types.tool_usage_events import ToolUsageErrorEvent

    received_events = []
    event_received = threading.Event()

    @crewai_event_bus.on(ToolUsageErrorEvent)
    def handle_tool_error(source, event):
        received_events.append(event)
        event_received.set()

    agent = Agent(
        role="Calculator",
        goal="Perform calculations efficiently",
        backstory="You calculate things.",
        tools=[failing_tool],
        llm=LLM(model="gpt-4o-mini"),
        verbose=False,
        max_iter=3,
    )

    result = agent.kickoff("Use the failing_tool to do something.")
    assert result is not None

    # Verify error event was emitted
    assert event_received.wait(timeout=10), "ToolUsageErrorEvent was not emitted"
    assert len(received_events) >= 1

    # Verify event attributes
    error_event = received_events[0]
    assert error_event.tool_name == "failing_tool"
    assert error_event.agent_role == agent.role
    assert "This tool always fails" in str(error_event.error)


# =============================================================================
# Max Usage Count Tests for Native Tool Calling
# =============================================================================


class CountingInput(BaseModel):
    """Input schema for counting tool."""

    value: str = Field(description="Value to count")


class CountingTool(BaseTool):
    """A tool that counts its usage."""

    name: str = "counting_tool"
    description: str = "A tool that counts how many times it's been called"
    args_schema: type[BaseModel] = CountingInput

    def _run(self, value: str) -> str:
        """Return the value with a count prefix."""
        return f"Counted: {value}"


class TestMaxUsageCountWithNativeToolCalling:
    """Tests for max_usage_count with native tool calling."""

    @pytest.mark.vcr()
    def test_max_usage_count_tracked_in_native_tool_calling(self) -> None:
        """Test that max_usage_count is properly tracked when using native tool calling."""
        tool = CountingTool(max_usage_count=3)

        # Verify initial state
        assert tool.max_usage_count == 3
        assert tool.current_usage_count == 0

        agent = Agent(
            role="Counting Agent",
            goal="Call the counting tool multiple times",
            backstory="You are an agent that counts things.",
            tools=[tool],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
            max_iter=5,
        )

        task = Task(
            description="Call the counting_tool 3 times with values 'first', 'second', and 'third'",
            expected_output="The results of the counting operations",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        crew.kickoff()

        # Verify usage count was tracked
        assert tool.max_usage_count == 3
        assert tool.current_usage_count <= tool.max_usage_count

    @pytest.mark.vcr()
    def test_max_usage_count_limit_enforced_in_native_tool_calling(self) -> None:
        """Test that when max_usage_count is reached, tool returns error message."""
        tool = CountingTool(max_usage_count=2)

        agent = Agent(
            role="Counting Agent",
            goal="Use the counting tool as many times as requested",
            backstory="You are an agent that counts things. You must try to use the tool for each value requested.",
            tools=[tool],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
            max_iter=5,
        )

        # Request more tool calls than the max_usage_count allows
        task = Task(
            description="Call the counting_tool 4 times with values 'one', 'two', 'three', and 'four'",
            expected_output="The results of the counting operations, noting any failures",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        # The tool should have been limited to max_usage_count (2) calls
        assert result is not None
        assert tool.current_usage_count == tool.max_usage_count
        # After hitting the limit, further calls should have been rejected

    @pytest.mark.vcr()
    def test_tool_usage_increments_after_successful_execution(self) -> None:
        """Test that usage count increments after each successful native tool call."""
        tool = CountingTool(max_usage_count=10)

        assert tool.current_usage_count == 0

        agent = Agent(
            role="Counting Agent",
            goal="Use the counting tool exactly as requested",
            backstory="You are an agent that counts things precisely.",
            tools=[tool],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
            max_iter=5,
        )

        task = Task(
            description="Call the counting_tool exactly 2 times: first with value 'alpha', then with value 'beta'",
            expected_output="The results showing both 'Counted: alpha' and 'Counted: beta'",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        # Verify usage count was incremented for each successful call
        assert tool.current_usage_count == 2