Mirror of https://github.com/crewAIInc/crewAI.git, synced 2025-12-16 04:18:35 +00:00
chore: restructure test env, cassettes, and conftest; fix flaky tests
Some checks failed
Build uv cache / build-cache (3.10) (push) Has been cancelled
Build uv cache / build-cache (3.11) (push) Has been cancelled
Build uv cache / build-cache (3.12) (push) Has been cancelled
Build uv cache / build-cache (3.13) (push) Has been cancelled
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Notify Downstream / notify-downstream (push) Has been cancelled
Mark stale issues and pull requests / stale (push) Has been cancelled
Consolidates pytest config, standardizes env handling, reorganizes cassette layout, removes outdated VCR configs, improves sync with threading.Condition, updates event-waiting logic, ensures cleanup, regenerates Gemini cassettes, and reverts unintended test changes.
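The per-test VCR arguments stripped throughout the diff below (filter_headers=["authorization"], filter_query_parameters=["key"]) only go away safely if an equivalent shared configuration now lives in the consolidated conftest.py. A minimal sketch of that setup, assuming the suite uses a vcr_config fixture the way pytest-recording/pytest-vcr do for the @pytest.mark.vcr marker; the fixture body is illustrative, and only the filtered header and query-parameter names come from the decorators being removed:

import pytest


@pytest.fixture(scope="module")
def vcr_config():
    # Shared keyword arguments applied to every test marked with @pytest.mark.vcr,
    # so individual tests no longer repeat filter_headers=... or
    # filter_query_parameters=... on the marker (assumed consolidation, not taken
    # verbatim from the commit).
    return {
        "filter_headers": ["authorization"],
        "filter_query_parameters": ["key"],
    }

With a shared config like this in place, the bare @pytest.mark.vcr() that the hunks below switch to is sufficient.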
@@ -12,13 +12,14 @@ from crewai.events.event_types import (
     ToolUsageStartedEvent,
 )
 from crewai.llm import CONTEXT_WINDOW_USAGE_RATIO, LLM
+from crewai.llms.providers.anthropic.completion import AnthropicCompletion
 from crewai.utilities.token_counter_callback import TokenCalcHandler
 from pydantic import BaseModel
 import pytest


 # TODO: This test fails without print statement, which makes me think that something is happening asynchronously that we need to eventually fix and dive deeper into at a later date
-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_llm_callback_replacement():
     llm1 = LLM(model="gpt-4o-mini", is_litellm=True)
     llm2 = LLM(model="gpt-4o-mini", is_litellm=True)
@@ -45,7 +46,7 @@ def test_llm_callback_replacement():
     assert usage_metrics_1 == calc_handler_1.token_cost_process.get_summary()


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_llm_call_with_string_input():
     llm = LLM(model="gpt-4o-mini")

@@ -55,7 +56,7 @@ def test_llm_call_with_string_input():
     assert len(result.strip()) > 0  # Ensure the response is not empty


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_llm_call_with_string_input_and_callbacks():
     llm = LLM(model="gpt-4o-mini", is_litellm=True)
     calc_handler = TokenCalcHandler(token_cost_process=TokenProcess())
@@ -72,7 +73,7 @@ def test_llm_call_with_string_input_and_callbacks():
     assert usage_metrics.successful_requests == 1


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_llm_call_with_message_list():
     llm = LLM(model="gpt-4o-mini")
     messages = [{"role": "user", "content": "What is the capital of France?"}]
@@ -83,7 +84,7 @@ def test_llm_call_with_message_list():
     assert "Paris" in result


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_llm_call_with_tool_and_string_input():
     llm = LLM(model="gpt-4o-mini")

@@ -121,7 +122,7 @@ def test_llm_call_with_tool_and_string_input():
     assert result == get_current_year()


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_llm_call_with_tool_and_message_list():
     llm = LLM(model="gpt-4o-mini", is_litellm=True)

@@ -161,7 +162,7 @@ def test_llm_call_with_tool_and_message_list():
     assert result == 25


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_llm_passes_additional_params():
     llm = LLM(
         model="gpt-4o-mini",
@@ -259,7 +260,7 @@ def test_validate_call_params_no_response_format():
     llm._validate_call_params()


-@pytest.mark.vcr(filter_headers=["authorization"], filter_query_parameters=["key"])
+@pytest.mark.vcr()
 @pytest.mark.parametrize(
     "model",
     [
@@ -267,19 +268,17 @@ def test_validate_call_params_no_response_format():
         "gemini/gemini-2.0-flash-thinking-exp-01-21",
         "gemini/gemini-2.0-flash-001",
         "gemini/gemini-2.0-flash-lite-001",
         "gemini/gemini-2.5-flash-preview-04-17",
         "gemini/gemini-2.5-pro-exp-03-25",
     ],
 )
 def test_gemini_models(model):
-    # Use LiteLLM for VCR compatibility (VCR can intercept HTTP calls but not native SDK calls)
-    llm = LLM(model=model, is_litellm=True)
+    llm = LLM(model=model, is_litellm=False)
     result = llm.call("What is the capital of France?")
     assert isinstance(result, str)
     assert "Paris" in result


-@pytest.mark.vcr(filter_headers=["authorization"], filter_query_parameters=["key"])
+@pytest.mark.vcr()
 @pytest.mark.parametrize(
     "model",
     [
@@ -294,7 +293,7 @@ def test_gemma3(model):
     assert "Paris" in result


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 @pytest.mark.parametrize(
     "model", ["gpt-4.1", "gpt-4.1-mini-2025-04-14", "gpt-4.1-nano-2025-04-14"]
 )
@@ -305,7 +304,7 @@ def test_gpt_4_1(model):
     assert "Paris" in result


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_o3_mini_reasoning_effort_high():
     llm = LLM(
         model="o3-mini",
@@ -316,7 +315,7 @@ def test_o3_mini_reasoning_effort_high():
     assert "Paris" in result


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_o3_mini_reasoning_effort_low():
     llm = LLM(
         model="o3-mini",
@@ -327,7 +326,7 @@ def test_o3_mini_reasoning_effort_low():
     assert "Paris" in result


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_o3_mini_reasoning_effort_medium():
     llm = LLM(
         model="o3-mini",
@@ -416,11 +415,10 @@ def test_context_window_exceeded_error_handling():
     assert "8192 tokens" in str(excinfo.value)


-@pytest.mark.vcr(filter_headers=["authorization"])
 @pytest.fixture
 def anthropic_llm():
     """Fixture providing an Anthropic LLM instance."""
-    return LLM(model="anthropic/claude-3-sonnet", is_litellm=True)
+    return LLM(model="anthropic/claude-3-sonnet", is_litellm=False)


 @pytest.fixture
@@ -438,18 +436,19 @@ def user_message():
 def test_anthropic_message_formatting_edge_cases(anthropic_llm):
     """Test edge cases for Anthropic message formatting."""
-    # Test None messages
-    with pytest.raises(TypeError, match="Messages cannot be None"):
-        anthropic_llm._format_messages_for_provider(None)
+    anthropic_llm = AnthropicCompletion(model="claude-3-sonnet", is_litellm=False)
+    with pytest.raises(TypeError):
+        anthropic_llm._format_messages_for_anthropic(None)

-    # Test empty message list
-    formatted = anthropic_llm._format_messages_for_provider([])
+    # Test empty message list - Anthropic requires first message to be from user
+    formatted, system_message = anthropic_llm._format_messages_for_anthropic([])
     assert len(formatted) == 1
     assert formatted[0]["role"] == "user"
-    assert formatted[0]["content"] == "."
+    assert formatted[0]["content"] == "Hello"

     # Test invalid message format
-    with pytest.raises(TypeError, match="Invalid message format"):
-        anthropic_llm._format_messages_for_provider([{"invalid": "message"}])
+    with pytest.raises(ValueError, match="must have 'role' and 'content' keys"):
+        anthropic_llm._format_messages_for_anthropic([{"invalid": "message"}])


 def test_anthropic_model_detection():
@@ -471,13 +470,15 @@ def test_anthropic_message_formatting(anthropic_llm, system_message, user_messag
     """Test Anthropic message formatting with fixtures."""
     # Test when first message is system

-    formatted = anthropic_llm._format_messages_for_provider([])
+    # Test empty message list - Anthropic requires first message to be from user
+    formatted, extracted_system = anthropic_llm._format_messages_for_anthropic([])
     assert len(formatted) == 1
     assert formatted[0]["role"] == "user"
-    assert formatted[0]["content"] == "."
+    assert formatted[0]["content"] == "Hello"

-    with pytest.raises(TypeError, match="Invalid message format"):
-        anthropic_llm._format_messages_for_provider([{"invalid": "message"}])
+    # Test invalid message format
+    with pytest.raises(ValueError, match="must have 'role' and 'content' keys"):
+        anthropic_llm._format_messages_for_anthropic([{"invalid": "message"}])


 def test_deepseek_r1_with_open_router():
@@ -556,7 +557,7 @@ def mock_emit() -> MagicMock:
     yield mock_emit


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
     llm = LLM(model="openai/gpt-4o", stream=True, is_litellm=True)
     response = llm.call(
@@ -584,7 +585,7 @@ def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
     )


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_emit):
     def get_weather_error(location):
         raise Exception("Error")
@@ -609,7 +610,7 @@ def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_em
     )


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_handle_streaming_tool_calls_no_available_functions(
     get_weather_tool_schema, mock_emit
 ):
@@ -630,7 +631,7 @@ def test_handle_streaming_tool_calls_no_available_functions(
     )


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 def test_handle_streaming_tool_calls_no_tools(mock_emit):
     llm = LLM(model="openai/gpt-4o", stream=True, is_litellm=True)
     response = llm.call(
@@ -651,7 +652,7 @@ def test_handle_streaming_tool_calls_no_tools(mock_emit):
     )


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 @pytest.mark.skip(reason="Highly flaky on ci")
 def test_llm_call_when_stop_is_unsupported(caplog):
     llm = LLM(model="o1-mini", stop=["stop"], is_litellm=True)
@@ -662,7 +663,7 @@ def test_llm_call_when_stop_is_unsupported(caplog):
     assert "Paris" in result


-@pytest.mark.vcr(filter_headers=["authorization"])
+@pytest.mark.vcr()
 @pytest.mark.skip(reason="Highly flaky on ci")
 def test_llm_call_when_stop_is_unsupported_when_additional_drop_params_is_provided(
     caplog,
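The threading.Condition and event-waiting improvements named in the commit message touch files outside the hunks shown above. As a hedged sketch of that pattern (every name here is an assumption, not code from the repository), a sleep-and-poll wait that tends to be flaky on CI can be replaced with a condition variable the test blocks on until the expected event arrives:

import threading


class EventWaiter:
    """Collects events from a background callback and lets a test block until one matches."""

    def __init__(self) -> None:
        self._condition = threading.Condition()
        self._events: list[object] = []

    def record(self, event: object) -> None:
        # Called from the producer side (e.g. an event-bus listener on another thread).
        with self._condition:
            self._events.append(event)
            self._condition.notify_all()

    def wait_for(self, predicate, timeout: float = 5.0):
        # Called from the test thread; returns the first matching event or raises on timeout.
        with self._condition:
            if not self._condition.wait_for(
                lambda: any(predicate(e) for e in self._events), timeout=timeout
            ):
                raise TimeoutError("expected event was not emitted in time")
            return next(e for e in self._events if predicate(e))

Blocking on the condition instead of sleeping for a fixed interval is what keeps event-driven assertions deterministic on slow CI runners.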