diff --git a/lib/crewai/src/crewai/llms/providers/gemini/completion.py b/lib/crewai/src/crewai/llms/providers/gemini/completion.py
index 1184c9d39..c81ab855c 100644
--- a/lib/crewai/src/crewai/llms/providers/gemini/completion.py
+++ b/lib/crewai/src/crewai/llms/providers/gemini/completion.py
@@ -34,6 +34,9 @@ except ImportError:
     ) from None


+STRUCTURED_OUTPUT_TOOL_NAME = "structured_output"
+
+
 class GeminiCompletion(BaseLLM):
     """Google Gemini native completion implementation.

@@ -447,6 +450,9 @@ class GeminiCompletion(BaseLLM):
         Structured output support varies by model version:
         - Gemini 1.5 and earlier: Uses response_schema (Pydantic model)
        - Gemini 2.0+: Uses response_json_schema (JSON Schema) with propertyOrdering
+
+        When both tools AND response_model are present, we add a structured_output
+        pseudo-tool since Gemini doesn't support tools + response_schema together.
         """
         self.tools = tools
         config_params: dict[str, Any] = {}
@@ -472,7 +478,30 @@ class GeminiCompletion(BaseLLM):
             config_params["stop_sequences"] = self.stop_sequences

         if tools and self.supports_tools:
-            config_params["tools"] = self._convert_tools_for_interference(tools)
+            gemini_tools = self._convert_tools_for_interference(tools)
+
+            if response_model:
+                schema_output = generate_model_description(response_model)
+                schema = schema_output.get("json_schema", {}).get("schema", {})
+                if self.is_gemini_2_0:
+                    schema = self._add_property_ordering(schema)
+
+                structured_output_tool = types.Tool(
+                    function_declarations=[
+                        types.FunctionDeclaration(
+                            name=STRUCTURED_OUTPUT_TOOL_NAME,
+                            description=(
+                                "Use this tool to provide your final structured response. "
+                                "Call this tool when you have gathered all necessary information "
+                                "and are ready to provide the final answer in the required format."
+                            ),
+                            parameters_json_schema=schema,
+                        )
+                    ]
+                )
+                gemini_tools.append(structured_output_tool)
+
+            config_params["tools"] = gemini_tools
         elif response_model:
             config_params["response_mime_type"] = "application/json"
             schema_output = generate_model_description(response_model)
@@ -719,6 +748,47 @@ class GeminiCompletion(BaseLLM):
                 messages_for_event, content, from_agent
             )

+    def _handle_structured_output_tool_call(
+        self,
+        structured_data: dict[str, Any],
+        response_model: type[BaseModel],
+        contents: list[types.Content],
+        from_task: Any | None = None,
+        from_agent: Any | None = None,
+    ) -> BaseModel:
+        """Validate and emit event for structured_output tool call.
+
+        Args:
+            structured_data: The arguments passed to the structured_output tool
+            response_model: Pydantic model to validate against
+            contents: Original contents for event conversion
+            from_task: Task that initiated the call
+            from_agent: Agent that initiated the call
+
+        Returns:
+            Validated Pydantic model instance
+
+        Raises:
+            ValueError: If validation fails
+        """
+        try:
+            validated_data = response_model.model_validate(structured_data)
+            self._emit_call_completed_event(
+                response=validated_data.model_dump_json(),
+                call_type=LLMCallType.LLM_CALL,
+                from_task=from_task,
+                from_agent=from_agent,
+                messages=self._convert_contents_to_dict(contents),
+            )
+            return validated_data
+        except Exception as e:
+            error_msg = (
+                f"Failed to validate {STRUCTURED_OUTPUT_TOOL_NAME} tool response "
+                f"with model {response_model.__name__}: {e}"
+            )
+            logging.error(error_msg)
+            raise ValueError(error_msg) from e
+
     def _process_response_with_tools(
         self,
         response: GenerateContentResponse,
@@ -749,17 +819,47 @@ class GeminiCompletion(BaseLLM):
                 part for part in candidate.content.parts if part.function_call
             ]

+            # Check for structured_output pseudo-tool call (used when tools + response_model)
+            if response_model and function_call_parts:
+                for part in function_call_parts:
+                    if (
+                        part.function_call
+                        and part.function_call.name == STRUCTURED_OUTPUT_TOOL_NAME
+                    ):
+                        structured_data = (
+                            dict(part.function_call.args)
+                            if part.function_call.args
+                            else {}
+                        )
+                        return self._handle_structured_output_tool_call(
+                            structured_data=structured_data,
+                            response_model=response_model,
+                            contents=contents,
+                            from_task=from_task,
+                            from_agent=from_agent,
+                        )
+
+            # Filter out structured_output from function calls returned to executor
+            non_structured_output_parts = [
+                part
+                for part in function_call_parts
+                if not (
+                    part.function_call
+                    and part.function_call.name == STRUCTURED_OUTPUT_TOOL_NAME
+                )
+            ]
+
             # If there are function calls but no available_functions,
             # return them for the executor to handle (like OpenAI/Anthropic)
-            if function_call_parts and not available_functions:
+            if non_structured_output_parts and not available_functions:
                 self._emit_call_completed_event(
-                    response=function_call_parts,
+                    response=non_structured_output_parts,
                     call_type=LLMCallType.TOOL_CALL,
                     from_task=from_task,
                     from_agent=from_agent,
                     messages=self._convert_contents_to_dict(contents),
                 )
-                return function_call_parts
+                return non_structured_output_parts

             # Otherwise execute the tools internally
             for part in candidate.content.parts:
@@ -767,6 +867,9 @@ class GeminiCompletion(BaseLLM):
                     function_name = part.function_call.name
                     if function_name is None:
                         continue
+                    # Skip structured_output - it's handled above
+                    if function_name == STRUCTURED_OUTPUT_TOOL_NAME:
+                        continue
                     function_args = (
                         dict(part.function_call.args)
                         if part.function_call.args
@@ -899,9 +1002,27 @@ class GeminiCompletion(BaseLLM):
         """
         self._track_token_usage_internal(usage_data)

+        if response_model and function_calls:
+            for call_data in function_calls.values():
+                if call_data.get("name") == STRUCTURED_OUTPUT_TOOL_NAME:
+                    structured_data = call_data.get("args", {})
+                    return self._handle_structured_output_tool_call(
+                        structured_data=structured_data,
+                        response_model=response_model,
+                        contents=contents,
+                        from_task=from_task,
+                        from_agent=from_agent,
+                    )
+
+        non_structured_output_calls = {
+            idx: call_data
+            for idx, call_data in function_calls.items()
+            if call_data.get("name") != STRUCTURED_OUTPUT_TOOL_NAME
+        }
+
         # If there are function calls but no available_functions,
         # return them for the executor to handle
-        if function_calls and not available_functions:
+        if non_structured_output_calls and not available_functions:
             formatted_function_calls = [
                 {
                     "id": call_data["id"],
@@ -911,7 +1032,7 @@ class GeminiCompletion(BaseLLM):
                     },
                     "type": "function",
                 }
-                for call_data in function_calls.values()
+                for call_data in non_structured_output_calls.values()
             ]
             self._emit_call_completed_event(
                 response=formatted_function_calls,
@@ -922,9 +1043,9 @@ class GeminiCompletion(BaseLLM):
             )
             return formatted_function_calls

-        # Handle completed function calls
-        if function_calls and available_functions:
-            for call_data in function_calls.values():
+        # Handle completed function calls (excluding structured_output)
+        if non_structured_output_calls and available_functions:
+            for call_data in non_structured_output_calls.values():
                 function_name = call_data["name"]
                 function_args = call_data["args"]

@@ -948,6 +1069,9 @@ class GeminiCompletion(BaseLLM):
                 if result is not None:
                     return result

+        # When tools are present, structured output should come via the structured_output
+        # pseudo-tool, not via direct text response. If we reach here with tools present,
+        # the LLM chose to return plain text instead of calling structured_output.
         effective_response_model = None if self.tools else response_model

         return self._finalize_completion_response(