Files
crewAI/lib/crewai/tests/test_llm_streaming_finish_reason.py
Lucas Gomide 4cb2ecd948 fix(otel): correct streaming finish_reason + bedrock response_id semantics
Two correctness fixes uncovered while landing the OTel finish_reason +
response_id plumbing:

- LiteLLM streaming (sync + async): `stream_options={"include_usage": True}`
  causes LiteLLM to emit a final usage-only chunk with `choices=[]`. The
  post-loop `_extract_finish_reason_and_response_id(last_chunk)` silently
  returned `(None, None)` because the last chunk has no choices, even though
  earlier chunks carried `finish_reason="stop"`. Track both fields
  incrementally inside the loop (mirroring how OpenAI/Gemini/Azure already
  handle their native streams) and use the tracked values for the
  LLMCallCompletedEvent emission and the partial-response error path.

- Bedrock Converse: `ResponseMetadata.RequestId` is an AWS infra trace id,
  not a model-level response id (semantically different from OpenAI's
  `chatcmpl-XXX`). Return None for `response_id` rather than mislead
  downstream telemetry consumers. The audit-fix's async propagation chain
  still works — None propagates through unchanged.

Adds `test_llm_streaming_finish_reason.py` pinning both the sync and async
LiteLLM streaming paths against the include_usage chunk shape.
2026-05-27 18:12:37 -03:00

97 lines
2.9 KiB
Python

"""Regression: LiteLLM emits a final usage-only chunk (choices=[]) when
``stream_options.include_usage`` is set. The old post-loop
``_extract_finish_reason_and_response_id(last_chunk)`` then silently returned
(None, None). These tests pin that we capture finish_reason/response_id
incrementally during the stream loop instead.
"""
from __future__ import annotations
from typing import Any
from unittest.mock import patch
import pytest
from crewai.events.event_bus import CrewAIEventsBus
from crewai.events.types.llm_events import LLMCallCompletedEvent
from crewai.llm import LLM
@pytest.fixture
def mock_emit():
    """Swap ``CrewAIEventsBus.emit`` for a mock while a test runs.

    Yields:
        The mock that stands in for ``emit``. The patch is undone when the
        fixture is torn down.
    """
    emit_patcher = patch.object(CrewAIEventsBus, "emit")
    with emit_patcher as patched_emit:
        yield patched_emit
def _completed_event(mock_emit) -> LLMCallCompletedEvent:
matches = [
call.kwargs["event"]
for call in mock_emit.call_args_list
if isinstance(call.kwargs.get("event"), LLMCallCompletedEvent)
]
assert matches, "expected an LLMCallCompletedEvent to be emitted"
assert len(matches) == 1, f"expected one completed event, got {len(matches)}"
return matches[0]
def _chunks_with_usage_tail() -> list[dict[str, Any]]:
"""Three-chunk stream mirroring LiteLLM's include_usage behavior:
two content chunks where the second carries finish_reason="stop",
then a final usage-only chunk with choices=[]."""
return [
{
"id": "chatcmpl-stream-1",
"choices": [
{"delta": {"content": "hi"}, "finish_reason": None}
],
},
{
"id": "chatcmpl-stream-1",
"choices": [
{"delta": {"content": " there"}, "finish_reason": "stop"}
],
},
{
"id": "chatcmpl-stream-1",
"choices": [],
"usage": {
"prompt_tokens": 1,
"completion_tokens": 2,
"total_tokens": 3,
},
},
]
def test_sync_stream_emits_finish_reason_and_response_id_from_loop(mock_emit):
    """Sync streaming: the completed event keeps finish_reason and response_id
    even though the last chunk of the stream is usage-only (``choices=[]``)."""
    llm = LLM(model="gpt-4o-mini", is_litellm=True, stream=True)
    fake_stream = iter(_chunks_with_usage_tail())

    with patch("crewai.llm.litellm.completion", return_value=fake_stream):
        text = llm.call("anything")

    # Content from both delta chunks is concatenated.
    assert text == "hi there"

    completed = _completed_event(mock_emit)
    # Both values come from chunks seen before the usage-only tail.
    assert completed.finish_reason == "stop"
    assert completed.response_id == "chatcmpl-stream-1"
@pytest.mark.asyncio
async def test_async_stream_emits_finish_reason_and_response_id_from_loop(mock_emit):
    """Async streaming: the completed event keeps finish_reason and response_id
    even though the last chunk of the stream is usage-only (``choices=[]``)."""
    llm = LLM(model="gpt-4o-mini", is_litellm=True, stream=True)

    async def _stream_chunks():
        # Async generator yielding the same chunks the sync test iterates.
        for piece in _chunks_with_usage_tail():
            yield piece

    async def _fake_acompletion(*_args, **_kwargs):
        # Awaiting the patched call returns the async iterator of chunks.
        return _stream_chunks()

    with patch("crewai.llm.litellm.acompletion", side_effect=_fake_acompletion):
        text = await llm.acall("anything")

    # Content from both delta chunks is concatenated.
    assert text == "hi there"

    completed = _completed_event(mock_emit)
    # Both values come from chunks seen before the usage-only tail.
    assert completed.finish_reason == "stop"
    assert completed.response_id == "chatcmpl-stream-1"