Files
crewAI/tests/llm_test.py
Lucas Gomide 3730648b9d feat: support to remove stop parameter from LLM call
Currently, we can't remove the stop parameter from models that don't support it, because setting it to None ends up as an empty list
2025-05-02 12:07:47 -03:00

672 lines
21 KiB
Python

import os
from time import sleep
from unittest.mock import MagicMock, patch
import litellm
import pytest
from pydantic import BaseModel
from crewai.agents.agent_builder.utilities.base_token_process import TokenProcess
from crewai.llm import CONTEXT_WINDOW_USAGE_RATIO, LLM
from crewai.utilities.events import (
LLMCallCompletedEvent,
LLMStreamChunkEvent,
)
from crewai.utilities.token_counter_callback import TokenCalcHandler
# TODO: This test fails without print statement, which makes me think that something is happening asynchronously that we need to eventually fix and dive deeper into at a later date
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_callback_replacement():
llm1 = LLM(model="gpt-4o-mini")
llm2 = LLM(model="gpt-4o-mini")
calc_handler_1 = TokenCalcHandler(token_cost_process=TokenProcess())
calc_handler_2 = TokenCalcHandler(token_cost_process=TokenProcess())
result1 = llm1.call(
messages=[{"role": "user", "content": "Hello, world!"}],
callbacks=[calc_handler_1],
)
print("result1:", result1)
usage_metrics_1 = calc_handler_1.token_cost_process.get_summary()
print("usage_metrics_1:", usage_metrics_1)
result2 = llm2.call(
messages=[{"role": "user", "content": "Hello, world from another agent!"}],
callbacks=[calc_handler_2],
)
sleep(5)
print("result2:", result2)
usage_metrics_2 = calc_handler_2.token_cost_process.get_summary()
print("usage_metrics_2:", usage_metrics_2)
# The first handler should not have been updated
assert usage_metrics_1.successful_requests == 1
assert usage_metrics_2.successful_requests == 1
assert usage_metrics_1 == calc_handler_1.token_cost_process.get_summary()
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_call_with_string_input():
llm = LLM(model="gpt-4o-mini")
# Test the call method with a string input
result = llm.call("Return the name of a random city in the world.")
assert isinstance(result, str)
assert len(result.strip()) > 0 # Ensure the response is not empty
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_call_with_string_input_and_callbacks():
llm = LLM(model="gpt-4o-mini")
calc_handler = TokenCalcHandler(token_cost_process=TokenProcess())
# Test the call method with a string input and callbacks
result = llm.call(
"Tell me a joke.",
callbacks=[calc_handler],
)
usage_metrics = calc_handler.token_cost_process.get_summary()
assert isinstance(result, str)
assert len(result.strip()) > 0
assert usage_metrics.successful_requests == 1
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_call_with_message_list():
llm = LLM(model="gpt-4o-mini")
messages = [{"role": "user", "content": "What is the capital of France?"}]
# Test the call method with a list of messages
result = llm.call(messages)
assert isinstance(result, str)
assert "Paris" in result
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_call_with_tool_and_string_input():
llm = LLM(model="gpt-4o-mini")
def get_current_year() -> str:
"""Returns the current year as a string."""
from datetime import datetime
return str(datetime.now().year)
# Create tool schema
tool_schema = {
"type": "function",
"function": {
"name": "get_current_year",
"description": "Returns the current year as a string.",
"parameters": {
"type": "object",
"properties": {},
"required": [],
},
},
}
# Available functions mapping
available_functions = {"get_current_year": get_current_year}
# Test the call method with a string input and tool
result = llm.call(
"What is the current year?",
tools=[tool_schema],
available_functions=available_functions,
)
assert isinstance(result, str)
assert result == get_current_year()
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_call_with_tool_and_message_list():
llm = LLM(model="gpt-4o-mini")
def square_number(number: int) -> int:
"""Returns the square of a number."""
return number * number
# Create tool schema
tool_schema = {
"type": "function",
"function": {
"name": "square_number",
"description": "Returns the square of a number.",
"parameters": {
"type": "object",
"properties": {
"number": {"type": "integer", "description": "The number to square"}
},
"required": ["number"],
},
},
}
# Available functions mapping
available_functions = {"square_number": square_number}
messages = [{"role": "user", "content": "What is the square of 5?"}]
# Test the call method with messages and tool
result = llm.call(
messages,
tools=[tool_schema],
available_functions=available_functions,
)
assert isinstance(result, int)
assert result == 25
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_passes_additional_params():
llm = LLM(
model="gpt-4o-mini",
vertex_credentials="test_credentials",
vertex_project="test_project",
)
messages = [{"role": "user", "content": "Hello, world!"}]
with patch("litellm.completion") as mocked_completion:
# Create mocks for response structure
mock_message = MagicMock()
mock_message.content = "Test response"
mock_choice = MagicMock()
mock_choice.message = mock_message
mock_response = MagicMock()
mock_response.choices = [mock_choice]
mock_response.usage = {
"prompt_tokens": 5,
"completion_tokens": 5,
"total_tokens": 10,
}
# Set up the mocked completion to return the mock response
mocked_completion.return_value = mock_response
result = llm.call(messages)
# Assert that litellm.completion was called once
mocked_completion.assert_called_once()
# Retrieve the actual arguments with which litellm.completion was called
_, kwargs = mocked_completion.call_args
# Check that the additional_params were passed to litellm.completion
assert kwargs["vertex_credentials"] == "test_credentials"
assert kwargs["vertex_project"] == "test_project"
# Also verify that other expected parameters are present
assert kwargs["model"] == "gpt-4o-mini"
assert kwargs["messages"] == messages
# Check the result from llm.call
assert result == "Test response"
def test_get_custom_llm_provider_openrouter():
llm = LLM(model="openrouter/deepseek/deepseek-chat")
assert llm._get_custom_llm_provider() == "openrouter"
def test_get_custom_llm_provider_gemini():
llm = LLM(model="gemini/gemini-1.5-pro")
assert llm._get_custom_llm_provider() == "gemini"
def test_get_custom_llm_provider_openai():
llm = LLM(model="gpt-4")
assert llm._get_custom_llm_provider() == None
def test_validate_call_params_supported():
class DummyResponse(BaseModel):
a: int
# Patch supports_response_schema to simulate a supported model.
with patch("crewai.llm.supports_response_schema", return_value=True):
llm = LLM(
model="openrouter/deepseek/deepseek-chat", response_format=DummyResponse
)
# Should not raise any error.
llm._validate_call_params()
def test_validate_call_params_not_supported():
class DummyResponse(BaseModel):
a: int
# Patch supports_response_schema to simulate an unsupported model.
with patch("crewai.llm.supports_response_schema", return_value=False):
llm = LLM(model="gemini/gemini-1.5-pro", response_format=DummyResponse)
with pytest.raises(ValueError) as excinfo:
llm._validate_call_params()
assert "does not support response_format" in str(excinfo.value)
def test_validate_call_params_no_response_format():
# When no response_format is provided, no validation error should occur.
llm = LLM(model="gemini/gemini-1.5-pro", response_format=None)
llm._validate_call_params()
@pytest.mark.vcr(filter_headers=["authorization"], filter_query_parameters=["key"])
@pytest.mark.parametrize(
"model",
[
"gemini/gemini-2.0-flash-thinking-exp-01-21",
"gemini/gemini-2.0-flash-001",
"gemini/gemini-2.0-flash-lite-001",
"gemini/gemini-2.5-flash-preview-04-17",
"gemini/gemini-2.5-pro-exp-03-25",
],
)
def test_gemini_models(model):
llm = LLM(model=model)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
@pytest.mark.vcr(filter_headers=["authorization"], filter_query_parameters=["key"])
@pytest.mark.parametrize(
"model",
[
"gemini/gemma-3-1b-it",
"gemini/gemma-3-4b-it",
"gemini/gemma-3-12b-it",
"gemini/gemma-3-27b-it",
],
)
def test_gemma3(model):
llm = LLM(model=model)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
@pytest.mark.vcr(filter_headers=["authorization"])
@pytest.mark.parametrize(
"model", ["gpt-4.1", "gpt-4.1-mini-2025-04-14", "gpt-4.1-nano-2025-04-14"]
)
def test_gpt_4_1(model):
llm = LLM(model=model)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
@pytest.mark.vcr(filter_headers=["authorization"])
def test_o3_mini_reasoning_effort_high():
llm = LLM(
model="o3-mini",
reasoning_effort="high",
)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
@pytest.mark.vcr(filter_headers=["authorization"])
def test_o3_mini_reasoning_effort_low():
llm = LLM(
model="o3-mini",
reasoning_effort="low",
)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
@pytest.mark.vcr(filter_headers=["authorization"])
def test_o3_mini_reasoning_effort_medium():
llm = LLM(
model="o3-mini",
reasoning_effort="medium",
)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
def test_context_window_validation():
"""Test that context window validation works correctly."""
# Test valid window size
llm = LLM(model="o3-mini")
assert llm.get_context_window_size() == int(200000 * CONTEXT_WINDOW_USAGE_RATIO)
# Test invalid window size
with pytest.raises(ValueError) as excinfo:
with patch.dict(
"crewai.llm.LLM_CONTEXT_WINDOW_SIZES",
{"test-model": 500}, # Below minimum
clear=True,
):
llm = LLM(model="test-model")
llm.get_context_window_size()
assert "must be between 1024 and 2097152" in str(excinfo.value)
@pytest.fixture
def get_weather_tool_schema():
return {
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
}
},
"required": ["location"],
},
},
}
def test_context_window_exceeded_error_handling():
"""Test that litellm.ContextWindowExceededError is converted to LLMContextLengthExceededException."""
from litellm.exceptions import ContextWindowExceededError
from crewai.utilities.exceptions.context_window_exceeding_exception import (
LLMContextLengthExceededException,
)
llm = LLM(model="gpt-4")
# Test non-streaming response
with patch("litellm.completion") as mock_completion:
mock_completion.side_effect = ContextWindowExceededError(
"This model's maximum context length is 8192 tokens. However, your messages resulted in 10000 tokens.",
model="gpt-4",
llm_provider="openai",
)
with pytest.raises(LLMContextLengthExceededException) as excinfo:
llm.call("This is a test message")
assert "context length exceeded" in str(excinfo.value).lower()
assert "8192 tokens" in str(excinfo.value)
# Test streaming response
llm = LLM(model="gpt-4", stream=True)
with patch("litellm.completion") as mock_completion:
mock_completion.side_effect = ContextWindowExceededError(
"This model's maximum context length is 8192 tokens. However, your messages resulted in 10000 tokens.",
model="gpt-4",
llm_provider="openai",
)
with pytest.raises(LLMContextLengthExceededException) as excinfo:
llm.call("This is a test message")
assert "context length exceeded" in str(excinfo.value).lower()
assert "8192 tokens" in str(excinfo.value)
@pytest.mark.vcr(filter_headers=["authorization"])
@pytest.fixture
def anthropic_llm():
"""Fixture providing an Anthropic LLM instance."""
return LLM(model="anthropic/claude-3-sonnet")
@pytest.fixture
def system_message():
"""Fixture providing a system message."""
return {"role": "system", "content": "test"}
@pytest.fixture
def user_message():
"""Fixture providing a user message."""
return {"role": "user", "content": "test"}
def test_anthropic_message_formatting_edge_cases(anthropic_llm):
"""Test edge cases for Anthropic message formatting."""
# Test None messages
with pytest.raises(TypeError, match="Messages cannot be None"):
anthropic_llm._format_messages_for_provider(None)
# Test empty message list
formatted = anthropic_llm._format_messages_for_provider([])
assert len(formatted) == 1
assert formatted[0]["role"] == "user"
assert formatted[0]["content"] == "."
# Test invalid message format
with pytest.raises(TypeError, match="Invalid message format"):
anthropic_llm._format_messages_for_provider([{"invalid": "message"}])
def test_anthropic_model_detection():
"""Test Anthropic model detection with various formats."""
models = [
("anthropic/claude-3", True),
("claude-instant", True),
("claude/v1", True),
("gpt-4", False),
("", False),
("anthropomorphic", False), # Should not match partial words
]
for model, expected in models:
llm = LLM(model=model)
assert llm.is_anthropic == expected, f"Failed for model: {model}"
def test_anthropic_message_formatting(anthropic_llm, system_message, user_message):
"""Test Anthropic message formatting with fixtures."""
# Test when first message is system
formatted = anthropic_llm._format_messages_for_provider([system_message])
assert len(formatted) == 2
assert formatted[0]["role"] == "user"
assert formatted[0]["content"] == "."
assert formatted[1] == system_message
# Test when first message is already user
formatted = anthropic_llm._format_messages_for_provider([user_message])
assert len(formatted) == 1
assert formatted[0] == user_message
# Test with empty message list
formatted = anthropic_llm._format_messages_for_provider([])
assert len(formatted) == 1
assert formatted[0]["role"] == "user"
assert formatted[0]["content"] == "."
# Test with non-Anthropic model (should not modify messages)
non_anthropic_llm = LLM(model="gpt-4")
formatted = non_anthropic_llm._format_messages_for_provider([system_message])
assert len(formatted) == 1
assert formatted[0] == system_message
def test_deepseek_r1_with_open_router():
if not os.getenv("OPEN_ROUTER_API_KEY"):
pytest.skip("OPEN_ROUTER_API_KEY not set; skipping test.")
llm = LLM(
model="openrouter/deepseek/deepseek-r1",
base_url="https://openrouter.ai/api/v1",
api_key=os.getenv("OPEN_ROUTER_API_KEY"),
)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
def assert_event_count(
mock_emit,
expected_completed_tool_call: int = 0,
expected_stream_chunk: int = 0,
expected_completed_llm_call: int = 0,
expected_final_chunk_result: str = "",
):
event_count = {
"completed_tool_call": 0,
"stream_chunk": 0,
"completed_llm_call": 0,
}
final_chunk_result = ""
for _call in mock_emit.call_args_list:
event = _call[1]["event"]
if (
isinstance(event, LLMCallCompletedEvent)
and event.call_type.value == "tool_call"
):
event_count["completed_tool_call"] += 1
elif isinstance(event, LLMStreamChunkEvent):
event_count["stream_chunk"] += 1
final_chunk_result += event.chunk
elif (
isinstance(event, LLMCallCompletedEvent)
and event.call_type.value == "llm_call"
):
event_count["completed_llm_call"] += 1
else:
continue
assert event_count["completed_tool_call"] == expected_completed_tool_call
assert event_count["stream_chunk"] == expected_stream_chunk
assert event_count["completed_llm_call"] == expected_completed_llm_call
assert final_chunk_result == expected_final_chunk_result
@pytest.fixture
def mock_emit() -> MagicMock:
from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus
with patch.object(CrewAIEventsBus, "emit") as mock_emit:
yield mock_emit
@pytest.mark.vcr(filter_headers=["authorization"])
def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
llm = LLM(model="openai/gpt-4o", stream=True)
response = llm.call(
messages=[
{"role": "user", "content": "What is the weather in New York?"},
],
tools=[get_weather_tool_schema],
available_functions={
"get_weather": lambda location: f"The weather in {location} is sunny"
},
)
assert response == "The weather in New York, NY is sunny"
expected_final_chunk_result = (
'{"location":"New York, NY"}The weather in New York, NY is sunny'
)
assert_event_count(
mock_emit=mock_emit,
expected_completed_tool_call=1,
expected_stream_chunk=10,
expected_completed_llm_call=1,
expected_final_chunk_result=expected_final_chunk_result,
)
@pytest.mark.vcr(filter_headers=["authorization"])
def test_handle_streaming_tool_calls_no_available_functions(
get_weather_tool_schema, mock_emit
):
llm = LLM(model="openai/gpt-4o", stream=True)
response = llm.call(
messages=[
{"role": "user", "content": "What is the weather in New York?"},
],
tools=[get_weather_tool_schema],
)
assert response == ""
assert_event_count(
mock_emit=mock_emit,
expected_stream_chunk=9,
expected_completed_llm_call=1,
expected_final_chunk_result='{"location":"New York, NY"}',
)
@pytest.mark.vcr(filter_headers=["authorization"])
def test_handle_streaming_tool_calls_no_tools(mock_emit):
llm = LLM(model="openai/gpt-4o", stream=True)
response = llm.call(
messages=[
{"role": "user", "content": "What is the weather in New York?"},
],
)
assert (
response
== "I'm unable to provide real-time information or current weather updates. For the latest weather information in New York, I recommend checking a reliable weather website or app, such as the National Weather Service, Weather.com, or a similar service."
)
assert_event_count(
mock_emit=mock_emit,
expected_stream_chunk=46,
expected_completed_llm_call=1,
expected_final_chunk_result=response,
)
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_testing_removing_stop_parameter():
from litellm.exceptions import BadRequestError
# currently o3 does not support stop
llm = LLM(model="o3", temperature=0.5)
with pytest.raises(
BadRequestError,
match="parameter: 'stop' is not supported with this model",
):
llm.call(
messages=[
{"role": "user", "content": "What is the weather in San Francisco?"},
],
)
llm = LLM(model="o3", temperature=0.5, stop=None)
response = llm.call(
messages=[
{"role": "user", "content": "What is the weather in San Francisco?"},
],
)
assert isinstance(response, str)
# testing another model that supports stop
llm = LLM(model="o3-mini", temperature=0.5)
response = llm.call(
messages=[
{"role": "user", "content": "What is the weather in San Francisco?"},
],
)
assert isinstance(response, str)
llm = LLM(model="o3-mini", temperature=0.5, stop=None)
response = llm.call(
messages=[
{"role": "user", "content": "What is the weather in San Francisco?"},
],
)
assert isinstance(response, str)
llm = LLM(model="o3-mini", temperature=0.5, stop=["\n"])
response = llm.call(
messages=[
{"role": "user", "content": "What is the weather in San Francisco?"},
],
)
assert isinstance(response, str)