diff --git a/src/crewai/llm.py b/src/crewai/llm.py
index aa8599b27..eaaef3744 100644
--- a/src/crewai/llm.py
+++ b/src/crewai/llm.py
@@ -286,6 +286,7 @@ class LLM:
 
         Args:
             params: Parameters for the completion call
+            callbacks: Optional list of callback functions
            available_functions: Dict of available functions
 
         Returns:
@@ -295,10 +296,10 @@ class LLM:
         full_response = ""
         last_chunk = None
         chunk_count = 0
-        debug_info = []
 
-        # --- 2) Make sure stream is set to True
+        # --- 2) Make sure stream is set to True and include usage metrics
         params["stream"] = True
+        params["stream_options"] = {"include_usage": True}
 
         try:
             # --- 3) Process each chunk in the stream
@@ -306,28 +307,21 @@ class LLM:
                 chunk_count += 1
                 last_chunk = chunk
 
-                # Add debug info
-                debug_info.append(f"Chunk type: {type(chunk)}")
-
                 # Extract content from the chunk
                 chunk_content = None
 
                 # Handle ModelResponse objects
                 if isinstance(chunk, ModelResponse):
-                    debug_info.append("Chunk is ModelResponse")
-
-                    # Get usage information from the chunk
+                    # Get usage information from the chunk (if any)
                     usage_info = getattr(chunk, "usage", None)
 
                     choices = getattr(chunk, "choices", [])
                     if choices and len(choices) > 0:
                         choice = choices[0]
-                        debug_info.append(f"Choice type: {type(choice)}")
 
                         # Handle dictionary-style choices
                         if isinstance(choice, dict):
                             delta = choice.get("delta", {})
-                            debug_info.append(f"Delta: {delta}")
                             if (
                                 isinstance(delta, dict)
                                 and "content" in delta
@@ -338,7 +332,6 @@ class LLM:
                         # Handle object-style choices
                         else:
                             delta = getattr(choice, "delta", None)
-                            debug_info.append(f"Delta: {delta}")
 
                             if delta is not None:
                                 if (
@@ -356,8 +349,6 @@ class LLM:
                         self,
                         event=LLMStreamChunkEvent(chunk=chunk_content),
                     )
-                else:
-                    debug_info.append(f"No content found in chunk: {chunk}")
 
             # --- 4) Fallback to non-streaming if no content received
             if not full_response.strip() and chunk_count == 0:
@@ -366,15 +357,15 @@ class LLM:
                 )
                 non_streaming_params = params.copy()
                 non_streaming_params["stream"] = False
+                non_streaming_params.pop(
+                    "stream_options", None
+                )  # Remove stream_options for non-streaming call
                 return self._handle_non_streaming_response(
                     non_streaming_params, callbacks, available_functions
                 )
 
             # --- 5) Handle empty response with chunks
             if not full_response.strip() and chunk_count > 0:
-                logging.warning(
-                    f"Received {chunk_count} chunks but no content. Debug info: {debug_info}"
-                )
 
                 if last_chunk is not None and isinstance(last_chunk, ModelResponse):
                     usage_info = getattr(last_chunk, "usage", None)
@@ -398,6 +389,10 @@ class LLM:
                 logging.warning("Using default response as fallback")
                 full_response = "I apologize, but I couldn't generate a proper response. Please try again or rephrase your request."
+ print("LAST CHUNK:", last_chunk) + if hasattr(last_chunk, "usage"): + print("LAST CHUNK USAGE:", last_chunk.usage) + # --- 7) Check for tool calls in the final response if isinstance(last_chunk, ModelResponse): usage_info = getattr(last_chunk, "usage", None) @@ -414,26 +409,29 @@ class LLM: return tool_result # --- 8) Log token usage if available in streaming mode - # Use usage info from the last chunk if present - usage_info = getattr(last_chunk, "usage", None) if last_chunk else None - if usage_info and self.callbacks and len(self.callbacks) > 0: - for callback in self.callbacks: + + # Safely handle callbacks with usage info + if callbacks and len(callbacks) > 0: + for callback in callbacks: if hasattr(callback, "log_success_event"): - callback.log_success_event( - kwargs=params, - response_obj={"usage": usage_info}, - start_time=0, - end_time=0, + usage_info = ( + getattr(last_chunk, "usage", None) if last_chunk else None ) + print("USAGE INFO", usage_info) + if usage_info: + callback.log_success_event( + kwargs=params, + response_obj={"usage": usage_info}, + start_time=0, + end_time=0, + ) # --- 9) Emit completion event and return response self._handle_emit_call_events(full_response, LLMCallType.LLM_CALL) return full_response except Exception as e: - logging.error( - f"Error in streaming response: {str(e)}, Debug info: {debug_info}" - ) + logging.error(f"Error in streaming response: {str(e)}") if full_response.strip(): logging.warning(f"Returning partial response despite error: {str(e)}") self._handle_emit_call_events(full_response, LLMCallType.LLM_CALL) @@ -442,6 +440,9 @@ class LLM: logging.warning("Falling back to non-streaming after error") non_streaming_params = params.copy() non_streaming_params["stream"] = False + non_streaming_params.pop( + "stream_options", None + ) # Remove stream_options for non-streaming call return self._handle_non_streaming_response( non_streaming_params, callbacks, available_functions )