Adding usage info in llm.py (#4172)

* Adding usage info everywhere

* Changing the check

* Changing the logic

* Adding tests

* Adding casellets

* Minor change

* Fixing testcase

* remove the duplicated test case, thanks to cursor

* Adding async test cases

* Updating test case

---------

Co-authored-by: Lorenze Jay <63378463+lorenzejay@users.noreply.github.com>
Author: Vidit Ostwal
Date: 2026-01-08 00:12:27 +05:30
Committed by: GitHub
Parent commit: 09014215a9
Commit: 1c4f44af80
6 changed files with 715 additions and 10 deletions


@@ -925,11 +925,12 @@ class LLM(BaseLLM):
         except Exception as e:
             logging.debug(f"Error checking for tool calls: {e}")
+        # Track token usage and log callbacks if available in streaming mode
+        if usage_info:
+            self._track_token_usage_internal(usage_info)
+        self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
         if not tool_calls or not available_functions:
-            # Track token usage and log callbacks if available in streaming mode
-            if usage_info:
-                self._track_token_usage_internal(usage_info)
-            self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
             if response_model and self.is_litellm:
                 instructor_instance = InternalInstructor(
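
This hunk moves token-usage tracking and the streaming callbacks ahead of the tool-call branch, so they run for every streaming completion rather than only when no tools are invoked. For context, here is a minimal sketch (not code from this commit) of how usage_info is typically captured in a litellm streaming loop; the stream_options={"include_usage": True} flag and the usage-on-final-chunk behaviour are OpenAI-style assumptions that not every provider honours, which is why the diff still guards with "if usage_info:".

import litellm

def stream_and_capture_usage(params: dict) -> tuple[str, object | None]:
    # Accumulate text deltas and remember the usage payload if the provider sends one.
    full_response = ""
    usage_info = None
    for chunk in litellm.completion(
        **params,
        stream=True,
        stream_options={"include_usage": True},  # assumption: provider emits a usage chunk
    ):
        if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
            full_response += chunk.choices[0].delta.content
        # When supported, the usage payload usually arrives on a final chunk with no choices.
        if getattr(chunk, "usage", None):
            usage_info = chunk.usage
    return full_response, usage_info
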
@@ -962,12 +963,7 @@ class LLM(BaseLLM):
             if tool_result is not None:
                 return tool_result
-        # --- 10) Track token usage and log callbacks if available in streaming mode
-        if usage_info:
-            self._track_token_usage_internal(usage_info)
-        self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
-        # --- 11) Emit completion event and return response
+        # --- 10) Emit completion event and return response
         self._handle_emit_call_events(
             response=full_response,
             call_type=LLMCallType.LLM_CALL,
@@ -1148,6 +1144,10 @@ class LLM(BaseLLM):
             if response_model:
                 params["response_model"] = response_model
             response = litellm.completion(**params)
+            if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
+                usage_info = response.usage
+                self._track_token_usage_internal(usage_info)
         except ContextWindowExceededError as e:
             # Convert litellm's context window error to our own exception type
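
The guard hasattr(response, "usage") and not isinstance(response.usage, type) skips responses where usage is absent or is a class-level placeholder rather than an actual usage instance (as can happen with mocked or partially constructed responses). A standalone sketch of the same defensive extraction, with track standing in for the internal _track_token_usage_internal hook and OpenAI-style usage fields assumed:

import litellm

def call_and_track(params: dict, track) -> str:
    # track(...) stands in for the internal tracker; it is not a litellm API.
    response = litellm.completion(**params)
    usage = getattr(response, "usage", None)
    # Skip class-level placeholders, mirroring the isinstance(..., type) check in the diff.
    if usage and not isinstance(usage, type):
        track(usage)  # e.g. usage.prompt_tokens, usage.completion_tokens, usage.total_tokens
    return response.choices[0].message.content or ""
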
@@ -1273,6 +1273,10 @@ class LLM(BaseLLM):
params["response_model"] = response_model
response = await litellm.acompletion(**params)
if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
usage_info = response.usage
self._track_token_usage_internal(usage_info)
except ContextWindowExceededError as e:
raise LLMContextLengthExceededError(str(e)) from e
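
The async path applies the same guard after litellm.acompletion. A brief coroutine sketch under the same assumptions as the synchronous example above; the function and parameter names are illustrative:

import litellm

async def acall_and_track(params: dict, track) -> str:
    # Same defensive usage extraction as the synchronous sketch, awaiting acompletion instead.
    response = await litellm.acompletion(**params)
    usage = getattr(response, "usage", None)
    if usage and not isinstance(usage, type):
        track(usage)
    return response.choices[0].message.content or ""

# Example driver: asyncio.run(acall_and_track({"model": "gpt-4o-mini", "messages": [...]}, print))
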
@@ -1359,6 +1363,7 @@ class LLM(BaseLLM):
"""
full_response = ""
chunk_count = 0
usage_info = None
accumulated_tool_args: defaultdict[int, AccumulatedToolArgs] = defaultdict(
@@ -1444,6 +1449,9 @@ class LLM(BaseLLM):
                     end_time=0,
                 )
+        if usage_info:
+            self._track_token_usage_internal(usage_info)
         if accumulated_tool_args and available_functions:
             # Convert accumulated tool args to ChatCompletionDeltaToolCall objects
             tool_calls_list: list[ChatCompletionDeltaToolCall] = [
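
The last two hunks initialise usage_info in the streaming handler and record it once the chunk loop finishes, before any accumulated tool calls are dispatched. In the spirit of the "Adding tests" and "Adding async test cases" bullets in the commit message, here is a hedged, self-contained sketch of the sort of check this change invites; the helper and test names are illustrative and are not taken from crewAI's test suite:

from types import SimpleNamespace
from unittest.mock import MagicMock, patch

import litellm

def extract_usage(response):
    # The same guard the diff adds: ignore missing usage and class-level placeholders.
    usage = getattr(response, "usage", None)
    if usage and not isinstance(usage, type):
        return usage
    return None

def test_usage_is_tracked_for_non_streaming_call():
    fake_usage = SimpleNamespace(prompt_tokens=12, completion_tokens=5, total_tokens=17)
    fake_response = SimpleNamespace(usage=fake_usage)
    tracker = MagicMock()

    with patch("litellm.completion", return_value=fake_response):
        response = litellm.completion(model="gpt-4o-mini", messages=[])
        usage = extract_usage(response)
        if usage:
            tracker(usage)

    tracker.assert_called_once_with(fake_usage)
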