chore: restructure test env, cassettes, and conftest; fix flaky tests

- Consolidates pytest config and standardizes env handling (see the conftest sketch below)
- Reorganizes the cassette layout; removes outdated per-test VCR configs
- Improves synchronization with threading.Condition; updates event-waiting logic (see the EventWaiter sketch below)
- Ensures cleanup
- Regenerates Gemini cassettes
- Reverts unintended test changes
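
The per-test filter_headers=["authorization"] and filter_query_parameters=["key"] arguments dropped throughout the diff suggest the scrubbing rules now live in a shared conftest. A minimal sketch of what that consolidation could look like, assuming the vcr_config fixture supported by pytest-recording/pytest-vcr; the environment-variable names are assumptions, and this is not the actual conftest.py from this commit:

# conftest.py (sketch only, not the file in this commit)
import pytest


@pytest.fixture(scope="module")
def vcr_config():
    # One place to scrub secrets from every recorded cassette, so individual
    # tests can use a bare @pytest.mark.vcr().
    return {
        "filter_headers": ["authorization"],
        "filter_query_parameters": ["key"],
    }


@pytest.fixture(autouse=True)
def _placeholder_api_keys(monkeypatch):
    # Standardized env handling: cassette playback should not require real
    # credentials. The variable names here are assumptions.
    for var in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"):
        monkeypatch.setenv(var, "fake-key-for-tests")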
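
For the flaky-test fixes, the message cites improved synchronization with threading.Condition and updated event-waiting logic. A minimal sketch of that pattern with a hypothetical EventWaiter helper (not the project's actual implementation): the test blocks on a condition until the expected event arrives or a timeout expires, instead of sleeping for a fixed interval.

import threading


class EventWaiter:
    """Collects events emitted on other threads and lets a test wait for them."""

    def __init__(self) -> None:
        self._condition = threading.Condition()
        self._events: list[object] = []

    def record(self, event: object) -> None:
        # Called from the emitting thread; wake any waiting test.
        with self._condition:
            self._events.append(event)
            self._condition.notify_all()

    def wait_for(self, predicate, timeout: float = 5.0) -> bool:
        # Block until some recorded event satisfies `predicate`, or time out.
        # Condition.wait_for re-evaluates the predicate on each notification.
        with self._condition:
            return self._condition.wait_for(
                lambda: any(predicate(e) for e in self._events), timeout=timeout
            )


# Hypothetical usage in a test: register waiter.record as the event handler,
# then assert on arrival with a deadline rather than a sleep.
# assert waiter.wait_for(lambda e: isinstance(e, ToolUsageStartedEvent), timeout=10)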
Greyson LaLonde
2025-11-29 16:55:24 -05:00
committed by GitHub
parent bc4e6a3127
commit c925d2d519
200 changed files with 2070 additions and 1891 deletions


@@ -12,13 +12,14 @@ from crewai.events.event_types import (
ToolUsageStartedEvent,
)
from crewai.llm import CONTEXT_WINDOW_USAGE_RATIO, LLM
+from crewai.llms.providers.anthropic.completion import AnthropicCompletion
from crewai.utilities.token_counter_callback import TokenCalcHandler
from pydantic import BaseModel
import pytest
# TODO: This test fails without the print statement, which suggests something asynchronous is happening; investigate and fix at a later date
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_callback_replacement():
llm1 = LLM(model="gpt-4o-mini", is_litellm=True)
llm2 = LLM(model="gpt-4o-mini", is_litellm=True)
@@ -45,7 +46,7 @@ def test_llm_callback_replacement():
assert usage_metrics_1 == calc_handler_1.token_cost_process.get_summary()
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_string_input():
llm = LLM(model="gpt-4o-mini")
@@ -55,7 +56,7 @@ def test_llm_call_with_string_input():
assert len(result.strip()) > 0 # Ensure the response is not empty
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_string_input_and_callbacks():
llm = LLM(model="gpt-4o-mini", is_litellm=True)
calc_handler = TokenCalcHandler(token_cost_process=TokenProcess())
@@ -72,7 +73,7 @@ def test_llm_call_with_string_input_and_callbacks():
assert usage_metrics.successful_requests == 1
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_message_list():
llm = LLM(model="gpt-4o-mini")
messages = [{"role": "user", "content": "What is the capital of France?"}]
@@ -83,7 +84,7 @@ def test_llm_call_with_message_list():
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_tool_and_string_input():
llm = LLM(model="gpt-4o-mini")
@@ -121,7 +122,7 @@ def test_llm_call_with_tool_and_string_input():
assert result == get_current_year()
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_tool_and_message_list():
llm = LLM(model="gpt-4o-mini", is_litellm=True)
@@ -161,7 +162,7 @@ def test_llm_call_with_tool_and_message_list():
assert result == 25
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_passes_additional_params():
llm = LLM(
model="gpt-4o-mini",
@@ -259,7 +260,7 @@ def test_validate_call_params_no_response_format():
llm._validate_call_params()
-@pytest.mark.vcr(filter_headers=["authorization"], filter_query_parameters=["key"])
+@pytest.mark.vcr()
@pytest.mark.parametrize(
"model",
[
@@ -267,19 +268,17 @@ def test_validate_call_params_no_response_format():
"gemini/gemini-2.0-flash-thinking-exp-01-21",
"gemini/gemini-2.0-flash-001",
"gemini/gemini-2.0-flash-lite-001",
"gemini/gemini-2.5-flash-preview-04-17",
"gemini/gemini-2.5-pro-exp-03-25",
],
)
def test_gemini_models(model):
-# Use LiteLLM for VCR compatibility (VCR can intercept HTTP calls but not native SDK calls)
-llm = LLM(model=model, is_litellm=True)
+llm = LLM(model=model, is_litellm=False)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"], filter_query_parameters=["key"])
+@pytest.mark.vcr()
@pytest.mark.parametrize(
"model",
[
@@ -294,7 +293,7 @@ def test_gemma3(model):
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
@pytest.mark.parametrize(
"model", ["gpt-4.1", "gpt-4.1-mini-2025-04-14", "gpt-4.1-nano-2025-04-14"]
)
@@ -305,7 +304,7 @@ def test_gpt_4_1(model):
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_o3_mini_reasoning_effort_high():
llm = LLM(
model="o3-mini",
@@ -316,7 +315,7 @@ def test_o3_mini_reasoning_effort_high():
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_o3_mini_reasoning_effort_low():
llm = LLM(
model="o3-mini",
@@ -327,7 +326,7 @@ def test_o3_mini_reasoning_effort_low():
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_o3_mini_reasoning_effort_medium():
llm = LLM(
model="o3-mini",
@@ -416,11 +415,10 @@ def test_context_window_exceeded_error_handling():
assert "8192 tokens" in str(excinfo.value)
-@pytest.mark.vcr(filter_headers=["authorization"])
@pytest.fixture
def anthropic_llm():
"""Fixture providing an Anthropic LLM instance."""
-return LLM(model="anthropic/claude-3-sonnet", is_litellm=True)
+return LLM(model="anthropic/claude-3-sonnet", is_litellm=False)
@pytest.fixture
@@ -438,18 +436,19 @@ def user_message():
def test_anthropic_message_formatting_edge_cases(anthropic_llm):
"""Test edge cases for Anthropic message formatting."""
# Test None messages
with pytest.raises(TypeError, match="Messages cannot be None"):
anthropic_llm._format_messages_for_provider(None)
anthropic_llm = AnthropicCompletion(model="claude-3-sonnet", is_litellm=False)
with pytest.raises(TypeError):
anthropic_llm._format_messages_for_anthropic(None)
# Test empty message list
formatted = anthropic_llm._format_messages_for_provider([])
# Test empty message list - Anthropic requires first message to be from user
formatted, system_message = anthropic_llm._format_messages_for_anthropic([])
assert len(formatted) == 1
assert formatted[0]["role"] == "user"
assert formatted[0]["content"] == "."
assert formatted[0]["content"] == "Hello"
# Test invalid message format
with pytest.raises(TypeError, match="Invalid message format"):
anthropic_llm._format_messages_for_provider([{"invalid": "message"}])
with pytest.raises(ValueError, match="must have 'role' and 'content' keys"):
anthropic_llm._format_messages_for_anthropic([{"invalid": "message"}])
def test_anthropic_model_detection():
@@ -471,13 +470,15 @@ def test_anthropic_message_formatting(anthropic_llm, system_message, user_messag
"""Test Anthropic message formatting with fixtures."""
# Test when first message is system
-formatted = anthropic_llm._format_messages_for_provider([])
+# Test empty message list - Anthropic requires first message to be from user
+formatted, extracted_system = anthropic_llm._format_messages_for_anthropic([])
assert len(formatted) == 1
assert formatted[0]["role"] == "user"
-assert formatted[0]["content"] == "."
+assert formatted[0]["content"] == "Hello"
-with pytest.raises(TypeError, match="Invalid message format"):
-anthropic_llm._format_messages_for_provider([{"invalid": "message"}])
+# Test invalid message format
+with pytest.raises(ValueError, match="must have 'role' and 'content' keys"):
+anthropic_llm._format_messages_for_anthropic([{"invalid": "message"}])
def test_deepseek_r1_with_open_router():
@@ -556,7 +557,7 @@ def mock_emit() -> MagicMock:
yield mock_emit
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
llm = LLM(model="openai/gpt-4o", stream=True, is_litellm=True)
response = llm.call(
@@ -584,7 +585,7 @@ def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
)
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_emit):
def get_weather_error(location):
raise Exception("Error")
@@ -609,7 +610,7 @@ def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_em
)
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_handle_streaming_tool_calls_no_available_functions(
get_weather_tool_schema, mock_emit
):
@@ -630,7 +631,7 @@ def test_handle_streaming_tool_calls_no_available_functions(
)
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_handle_streaming_tool_calls_no_tools(mock_emit):
llm = LLM(model="openai/gpt-4o", stream=True, is_litellm=True)
response = llm.call(
@@ -651,7 +652,7 @@ def test_handle_streaming_tool_calls_no_tools(mock_emit):
)
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
@pytest.mark.skip(reason="Highly flaky on ci")
def test_llm_call_when_stop_is_unsupported(caplog):
llm = LLM(model="o1-mini", stop=["stop"], is_litellm=True)
@@ -662,7 +663,7 @@ def test_llm_call_when_stop_is_unsupported(caplog):
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
@pytest.mark.skip(reason="Highly flaky on ci")
def test_llm_call_when_stop_is_unsupported_when_additional_drop_params_is_provided(
caplog,