Adding usage info in llm.py (#4172)

* Adding usage info everywhere

* Changing the check

* Changing the logic

* Adding tests

* Adding casellets

* Minor change

* Fixing testcase

* remove the duplicated test case, thanks to cursor

* Adding async test cases

* Updating test case

---------

Co-authored-by: Lorenze Jay <63378463+lorenzejay@users.noreply.github.com>
Author: Vidit Ostwal
Date: 2026-01-08 00:12:27 +05:30
Committed by: GitHub
Parent commit: 09014215a9
Commit: 1c4f44af80
6 changed files with 715 additions and 10 deletions


@@ -925,11 +925,12 @@ class LLM(BaseLLM):
         except Exception as e:
             logging.debug(f"Error checking for tool calls: {e}")
+        # Track token usage and log callbacks if available in streaming mode
+        if usage_info:
+            self._track_token_usage_internal(usage_info)
+        self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
         if not tool_calls or not available_functions:
-            # Track token usage and log callbacks if available in streaming mode
-            if usage_info:
-                self._track_token_usage_internal(usage_info)
-            self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
             if response_model and self.is_litellm:
                 instructor_instance = InternalInstructor(
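
This hunk moves token-usage tracking and the streaming callbacks ahead of the tool-call branch, so they run for every streaming completion rather than only when no tools are invoked. For context, here is a minimal sketch (not code from this commit) of how usage_info is typically captured in a litellm streaming loop; the stream_options={"include_usage": True} flag and the usage-on-final-chunk behaviour are OpenAI-style assumptions that not every provider honours, which is why the diff still guards with "if usage_info:".

import litellm

def stream_and_capture_usage(params: dict) -> tuple[str, object | None]:
    # Accumulate text deltas and remember the usage payload if the provider sends one.
    full_response = ""
    usage_info = None
    for chunk in litellm.completion(
        **params,
        stream=True,
        stream_options={"include_usage": True},  # assumption: provider emits a usage chunk
    ):
        if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
            full_response += chunk.choices[0].delta.content
        # When supported, the usage payload usually arrives on a final chunk with no choices.
        if getattr(chunk, "usage", None):
            usage_info = chunk.usage
    return full_response, usage_info
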
@@ -962,12 +963,7 @@ class LLM(BaseLLM):
             if tool_result is not None:
                 return tool_result
-        # --- 10) Track token usage and log callbacks if available in streaming mode
-        if usage_info:
-            self._track_token_usage_internal(usage_info)
-        self._handle_streaming_callbacks(callbacks, usage_info, last_chunk)
-        # --- 11) Emit completion event and return response
+        # --- 10) Emit completion event and return response
         self._handle_emit_call_events(
             response=full_response,
             call_type=LLMCallType.LLM_CALL,
@@ -1148,6 +1144,10 @@ class LLM(BaseLLM):
             if response_model:
                 params["response_model"] = response_model
             response = litellm.completion(**params)
+            if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
+                usage_info = response.usage
+                self._track_token_usage_internal(usage_info)
         except ContextWindowExceededError as e:
             # Convert litellm's context window error to our own exception type
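
The guard hasattr(response, "usage") and not isinstance(response.usage, type) skips responses where usage is absent or is a class-level placeholder rather than an actual usage instance (as can happen with mocked or partially constructed responses). A standalone sketch of the same defensive extraction, with track standing in for the internal _track_token_usage_internal hook and OpenAI-style usage fields assumed:

import litellm

def call_and_track(params: dict, track) -> str:
    # track(...) stands in for the internal tracker; it is not a litellm API.
    response = litellm.completion(**params)
    usage = getattr(response, "usage", None)
    # Skip class-level placeholders, mirroring the isinstance(..., type) check in the diff.
    if usage and not isinstance(usage, type):
        track(usage)  # e.g. usage.prompt_tokens, usage.completion_tokens, usage.total_tokens
    return response.choices[0].message.content or ""
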
@@ -1273,6 +1273,10 @@ class LLM(BaseLLM):
params["response_model"] = response_model
response = await litellm.acompletion(**params)
if hasattr(response,"usage") and not isinstance(response.usage, type) and response.usage:
usage_info = response.usage
self._track_token_usage_internal(usage_info)
except ContextWindowExceededError as e:
raise LLMContextLengthExceededError(str(e)) from e
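
The async path applies the same guard after litellm.acompletion. A brief coroutine sketch under the same assumptions as the synchronous example above; the function and parameter names are illustrative:

import litellm

async def acall_and_track(params: dict, track) -> str:
    # Same defensive usage extraction as the synchronous sketch, awaiting acompletion instead.
    response = await litellm.acompletion(**params)
    usage = getattr(response, "usage", None)
    if usage and not isinstance(usage, type):
        track(usage)
    return response.choices[0].message.content or ""

# Example driver: asyncio.run(acall_and_track({"model": "gpt-4o-mini", "messages": [...]}, print))
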
@@ -1359,6 +1363,7 @@ class LLM(BaseLLM):
"""
full_response = ""
chunk_count = 0
usage_info = None
accumulated_tool_args: defaultdict[int, AccumulatedToolArgs] = defaultdict(
@@ -1444,6 +1449,9 @@ class LLM(BaseLLM):
                     end_time=0,
                 )
+        if usage_info:
+            self._track_token_usage_internal(usage_info)
         if accumulated_tool_args and available_functions:
             # Convert accumulated tool args to ChatCompletionDeltaToolCall objects
             tool_calls_list: list[ChatCompletionDeltaToolCall] = [
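
The last two hunks initialise usage_info in the streaming handler and record it once the chunk loop finishes, before any accumulated tool calls are dispatched. In the spirit of the "Adding tests" and "Adding async test cases" bullets in the commit message, here is a hedged, self-contained sketch of the sort of check this change invites; the helper and test names are illustrative and are not taken from crewAI's test suite:

from types import SimpleNamespace
from unittest.mock import MagicMock, patch

import litellm

def extract_usage(response):
    # The same guard the diff adds: ignore missing usage and class-level placeholders.
    usage = getattr(response, "usage", None)
    if usage and not isinstance(usage, type):
        return usage
    return None

def test_usage_is_tracked_for_non_streaming_call():
    fake_usage = SimpleNamespace(prompt_tokens=12, completion_tokens=5, total_tokens=17)
    fake_response = SimpleNamespace(usage=fake_usage)
    tracker = MagicMock()

    with patch("litellm.completion", return_value=fake_response):
        response = litellm.completion(model="gpt-4o-mini", messages=[])
        usage = extract_usage(response)
        if usage:
            tracker(usage)

    tracker.assert_called_once_with(fake_usage)
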