chore: restructure test env, cassettes, and conftest; fix flaky tests

- Consolidates pytest config and standardizes env handling (see the conftest sketch below)
- Reorganizes the cassette layout; removes outdated per-test VCR configs
- Improves synchronization with threading.Condition; updates event-waiting logic (see the EventWaiter sketch below)
- Ensures cleanup
- Regenerates Gemini cassettes
- Reverts unintended test changes
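
The per-test filter_headers=["authorization"] and filter_query_parameters=["key"] arguments dropped throughout the diff suggest the scrubbing rules now live in a shared conftest. A minimal sketch of what that consolidation could look like, assuming the vcr_config fixture supported by pytest-recording/pytest-vcr; the environment-variable names are assumptions, and this is not the actual conftest.py from this commit:

# conftest.py (sketch only, not the file in this commit)
import pytest


@pytest.fixture(scope="module")
def vcr_config():
    # One place to scrub secrets from every recorded cassette, so individual
    # tests can use a bare @pytest.mark.vcr().
    return {
        "filter_headers": ["authorization"],
        "filter_query_parameters": ["key"],
    }


@pytest.fixture(autouse=True)
def _placeholder_api_keys(monkeypatch):
    # Standardized env handling: cassette playback should not require real
    # credentials. The variable names here are assumptions.
    for var in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"):
        monkeypatch.setenv(var, "fake-key-for-tests")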
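
For the flaky-test fixes, the message cites improved synchronization with threading.Condition and updated event-waiting logic. A minimal sketch of that pattern with a hypothetical EventWaiter helper (not the project's actual implementation): the test blocks on a condition until the expected event arrives or a timeout expires, instead of sleeping for a fixed interval.

import threading


class EventWaiter:
    """Collects events emitted on other threads and lets a test wait for them."""

    def __init__(self) -> None:
        self._condition = threading.Condition()
        self._events: list[object] = []

    def record(self, event: object) -> None:
        # Called from the emitting thread; wake any waiting test.
        with self._condition:
            self._events.append(event)
            self._condition.notify_all()

    def wait_for(self, predicate, timeout: float = 5.0) -> bool:
        # Block until some recorded event satisfies `predicate`, or time out.
        # Condition.wait_for re-evaluates the predicate on each notification.
        with self._condition:
            return self._condition.wait_for(
                lambda: any(predicate(e) for e in self._events), timeout=timeout
            )


# Hypothetical usage in a test: register waiter.record as the event handler,
# then assert on arrival with a deadline rather than a sleep.
# assert waiter.wait_for(lambda e: isinstance(e, ToolUsageStartedEvent), timeout=10)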
Greyson LaLonde
2025-11-29 16:55:24 -05:00
committed by GitHub
parent bc4e6a3127
commit c925d2d519
200 changed files with 2070 additions and 1891 deletions


@@ -12,13 +12,14 @@ from crewai.events.event_types import (
ToolUsageStartedEvent,
)
from crewai.llm import CONTEXT_WINDOW_USAGE_RATIO, LLM
+from crewai.llms.providers.anthropic.completion import AnthropicCompletion
from crewai.utilities.token_counter_callback import TokenCalcHandler
from pydantic import BaseModel
import pytest
# TODO: This test fails without the print statement, which suggests something asynchronous is happening; investigate and fix at a later date
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_callback_replacement():
llm1 = LLM(model="gpt-4o-mini", is_litellm=True)
llm2 = LLM(model="gpt-4o-mini", is_litellm=True)
@@ -45,7 +46,7 @@ def test_llm_callback_replacement():
assert usage_metrics_1 == calc_handler_1.token_cost_process.get_summary()
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_string_input():
llm = LLM(model="gpt-4o-mini")
@@ -55,7 +56,7 @@ def test_llm_call_with_string_input():
assert len(result.strip()) > 0 # Ensure the response is not empty
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_string_input_and_callbacks():
llm = LLM(model="gpt-4o-mini", is_litellm=True)
calc_handler = TokenCalcHandler(token_cost_process=TokenProcess())
@@ -72,7 +73,7 @@ def test_llm_call_with_string_input_and_callbacks():
assert usage_metrics.successful_requests == 1
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_message_list():
llm = LLM(model="gpt-4o-mini")
messages = [{"role": "user", "content": "What is the capital of France?"}]
@@ -83,7 +84,7 @@ def test_llm_call_with_message_list():
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_tool_and_string_input():
llm = LLM(model="gpt-4o-mini")
@@ -121,7 +122,7 @@ def test_llm_call_with_tool_and_string_input():
assert result == get_current_year()
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_call_with_tool_and_message_list():
llm = LLM(model="gpt-4o-mini", is_litellm=True)
@@ -161,7 +162,7 @@ def test_llm_call_with_tool_and_message_list():
assert result == 25
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_llm_passes_additional_params():
llm = LLM(
model="gpt-4o-mini",
@@ -259,7 +260,7 @@ def test_validate_call_params_no_response_format():
llm._validate_call_params()
-@pytest.mark.vcr(filter_headers=["authorization"], filter_query_parameters=["key"])
+@pytest.mark.vcr()
@pytest.mark.parametrize(
"model",
[
@@ -267,19 +268,17 @@ def test_validate_call_params_no_response_format():
"gemini/gemini-2.0-flash-thinking-exp-01-21",
"gemini/gemini-2.0-flash-001",
"gemini/gemini-2.0-flash-lite-001",
"gemini/gemini-2.5-flash-preview-04-17",
"gemini/gemini-2.5-pro-exp-03-25",
],
)
def test_gemini_models(model):
-# Use LiteLLM for VCR compatibility (VCR can intercept HTTP calls but not native SDK calls)
-llm = LLM(model=model, is_litellm=True)
+llm = LLM(model=model, is_litellm=False)
result = llm.call("What is the capital of France?")
assert isinstance(result, str)
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"], filter_query_parameters=["key"])
+@pytest.mark.vcr()
@pytest.mark.parametrize(
"model",
[
@@ -294,7 +293,7 @@ def test_gemma3(model):
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
@pytest.mark.parametrize(
"model", ["gpt-4.1", "gpt-4.1-mini-2025-04-14", "gpt-4.1-nano-2025-04-14"]
)
@@ -305,7 +304,7 @@ def test_gpt_4_1(model):
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_o3_mini_reasoning_effort_high():
llm = LLM(
model="o3-mini",
@@ -316,7 +315,7 @@ def test_o3_mini_reasoning_effort_high():
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_o3_mini_reasoning_effort_low():
llm = LLM(
model="o3-mini",
@@ -327,7 +326,7 @@ def test_o3_mini_reasoning_effort_low():
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_o3_mini_reasoning_effort_medium():
llm = LLM(
model="o3-mini",
@@ -416,11 +415,10 @@ def test_context_window_exceeded_error_handling():
assert "8192 tokens" in str(excinfo.value)
-@pytest.mark.vcr(filter_headers=["authorization"])
@pytest.fixture
def anthropic_llm():
"""Fixture providing an Anthropic LLM instance."""
-return LLM(model="anthropic/claude-3-sonnet", is_litellm=True)
+return LLM(model="anthropic/claude-3-sonnet", is_litellm=False)
@pytest.fixture
@@ -438,18 +436,19 @@ def user_message():
def test_anthropic_message_formatting_edge_cases(anthropic_llm):
"""Test edge cases for Anthropic message formatting."""
# Test None messages
with pytest.raises(TypeError, match="Messages cannot be None"):
anthropic_llm._format_messages_for_provider(None)
anthropic_llm = AnthropicCompletion(model="claude-3-sonnet", is_litellm=False)
with pytest.raises(TypeError):
anthropic_llm._format_messages_for_anthropic(None)
# Test empty message list
formatted = anthropic_llm._format_messages_for_provider([])
# Test empty message list - Anthropic requires first message to be from user
formatted, system_message = anthropic_llm._format_messages_for_anthropic([])
assert len(formatted) == 1
assert formatted[0]["role"] == "user"
assert formatted[0]["content"] == "."
assert formatted[0]["content"] == "Hello"
# Test invalid message format
with pytest.raises(TypeError, match="Invalid message format"):
anthropic_llm._format_messages_for_provider([{"invalid": "message"}])
with pytest.raises(ValueError, match="must have 'role' and 'content' keys"):
anthropic_llm._format_messages_for_anthropic([{"invalid": "message"}])
def test_anthropic_model_detection():
@@ -471,13 +470,15 @@ def test_anthropic_message_formatting(anthropic_llm, system_message, user_messag
"""Test Anthropic message formatting with fixtures."""
# Test when first message is system
-formatted = anthropic_llm._format_messages_for_provider([])
+# Test empty message list - Anthropic requires first message to be from user
+formatted, extracted_system = anthropic_llm._format_messages_for_anthropic([])
assert len(formatted) == 1
assert formatted[0]["role"] == "user"
-assert formatted[0]["content"] == "."
+assert formatted[0]["content"] == "Hello"
-with pytest.raises(TypeError, match="Invalid message format"):
-anthropic_llm._format_messages_for_provider([{"invalid": "message"}])
+# Test invalid message format
+with pytest.raises(ValueError, match="must have 'role' and 'content' keys"):
+anthropic_llm._format_messages_for_anthropic([{"invalid": "message"}])
def test_deepseek_r1_with_open_router():
@@ -556,7 +557,7 @@ def mock_emit() -> MagicMock:
yield mock_emit
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
llm = LLM(model="openai/gpt-4o", stream=True, is_litellm=True)
response = llm.call(
@@ -584,7 +585,7 @@ def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
)
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_emit):
def get_weather_error(location):
raise Exception("Error")
@@ -609,7 +610,7 @@ def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_em
)
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_handle_streaming_tool_calls_no_available_functions(
get_weather_tool_schema, mock_emit
):
@@ -630,7 +631,7 @@ def test_handle_streaming_tool_calls_no_available_functions(
)
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
def test_handle_streaming_tool_calls_no_tools(mock_emit):
llm = LLM(model="openai/gpt-4o", stream=True, is_litellm=True)
response = llm.call(
@@ -651,7 +652,7 @@ def test_handle_streaming_tool_calls_no_tools(mock_emit):
)
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
@pytest.mark.skip(reason="Highly flaky on ci")
def test_llm_call_when_stop_is_unsupported(caplog):
llm = LLM(model="o1-mini", stop=["stop"], is_litellm=True)
@@ -662,7 +663,7 @@ def test_llm_call_when_stop_is_unsupported(caplog):
assert "Paris" in result
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
@pytest.mark.skip(reason="Highly flaky on ci")
def test_llm_call_when_stop_is_unsupported_when_additional_drop_params_is_provided(
caplog,