diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py
index 79e5c028..aa2e99ba 100644
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -370,16 +370,22 @@ class GoogleVertexClient(LLMClientBase):
         # - Range: -1, 0, or 512-24576

         # TODO when using v3 agent loop, properly support the native thinking in Gemini
-        # Add thinking_config for flash
+        # Add thinking_config for all Gemini reasoning models (2.5 series)
         # If enable_reasoner is False, set thinking_budget to 0
         # Otherwise, use the value from max_reasoning_tokens
-        if "flash" in llm_config.model:
-            # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
+        if self.is_reasoning_model(llm_config) or "flash" in llm_config.model:
+            # Gemini reasoning models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
             thinking_budget = llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
             if thinking_budget <= 0:
-                logger.error(
+                logger.warning(
                     f"Thinking budget of {thinking_budget} for Gemini reasoning model {llm_config.model}, this will likely cause tool call failures"
                 )
+                # For models that require thinking mode (2.5 Pro, 3.x), override with minimum valid budget
+                if llm_config.model.startswith("gemini-2.5-pro") or llm_config.model.startswith("gemini-3"):
+                    thinking_budget = 128
+                    logger.warning(
+                        f"Overriding thinking_budget to {thinking_budget} for model {llm_config.model} which requires thinking mode"
+                    )
             thinking_config = ThinkingConfig(
                 thinking_budget=(thinking_budget),
                 include_thoughts=(thinking_budget > 1),
@@ -658,16 +664,24 @@ class GoogleVertexClient(LLMClientBase):
    # | 2.5 Pro | Dynamic thinking: Model decides when and how much to think | 128-32768 | N/A: Cannot disable | thinkingBudget = -1 |
    # | 2.5 Flash | Dynamic thinking: Model decides when and how much to think | 0-24576 | thinkingBudget = 0 | thinkingBudget = -1 |
    # | 2.5 Flash Lite | Model does not think | 512-24576 | thinkingBudget = 0 | thinkingBudget = -1 |
+    # | 3.x | Dynamic thinking: Model decides when and how much to think | 128-? | N/A: Cannot disable | thinkingBudget = -1 |
     def get_thinking_budget(self, model: str) -> bool:
         if model_settings.gemini_force_minimum_thinking_budget:
             if all(substring in model for substring in ["2.5", "flash", "lite"]):
                 return 512
             elif all(substring in model for substring in ["2.5", "flash"]):
                 return 1
+        # Gemini 3 and 2.5 Pro require thinking mode and cannot have budget 0
+        if model.startswith("gemini-3") or model.startswith("gemini-2.5-pro"):
+            return 128  # Minimum valid budget for models that require thinking
         return 0

     def is_reasoning_model(self, llm_config: LLMConfig) -> bool:
-        return llm_config.model.startswith("gemini-2.5-flash") or llm_config.model.startswith("gemini-2.5-pro")
+        return (
+            llm_config.model.startswith("gemini-2.5-flash")
+            or llm_config.model.startswith("gemini-2.5-pro")
+            or llm_config.model.startswith("gemini-3")
+        )

     def is_malformed_function_call(self, response_data: dict) -> dict:
         response = GenerateContentResponse(**response_data)
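Note on the client change above: the patched flow takes `max_reasoning_tokens` when `enable_reasoner` is set, otherwise falls back to `get_thinking_budget`, and then force-overrides any non-positive budget for models that cannot disable thinking. Below is a minimal standalone sketch of that selection, assuming the model-prefix rules from the diff; the helper name `pick_thinking_budget`, the constant, and the `gemini-3-pro-preview` model string are illustrative, and the `model_settings.gemini_force_minimum_thinking_budget` branch is deliberately omitted:

```python
# Hypothetical sketch of the post-patch budget selection; it mirrors the
# diff's logic but is not part of the patch. The
# gemini_force_minimum_thinking_budget setting is left out for brevity.
MIN_THINKING_BUDGET = 128  # minimum valid budget for models that cannot disable thinking


def pick_thinking_budget(model: str, enable_reasoner: bool, max_reasoning_tokens: int) -> int:
    budget = max_reasoning_tokens if enable_reasoner else 0
    # 2.5 Pro and 3.x cannot run with thinking disabled, so a non-positive
    # budget is replaced with the minimum valid value.
    if budget <= 0 and (model.startswith("gemini-2.5-pro") or model.startswith("gemini-3")):
        budget = MIN_THINKING_BUDGET
    return budget


assert pick_thinking_budget("gemini-2.5-flash", False, 1024) == 0       # thinking stays off
assert pick_thinking_budget("gemini-2.5-pro", False, 1024) == 128       # overridden to minimum
assert pick_thinking_budget("gemini-3-pro-preview", True, 1024) == 1024  # illustrative model name
```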
diff --git a/letta/schemas/llm_config.py b/letta/schemas/llm_config.py
index 338cb477..9fe0292b 100644
--- a/letta/schemas/llm_config.py
+++ b/letta/schemas/llm_config.py
@@ -422,8 +422,10 @@ class LLMConfig(BaseModel):

         # Anthropic 3.7/4 and Gemini: toggle honored
         is_google_reasoner_with_configurable_thinking = (
-            cls.is_google_vertex_reasoning_model(config) or cls.is_google_ai_reasoning_model(config)
-        ) and not config.model.startswith("gemini-2.5-pro")
+            (cls.is_google_vertex_reasoning_model(config) or cls.is_google_ai_reasoning_model(config))
+            and not config.model.startswith("gemini-2.5-pro")
+            and not config.model.startswith("gemini-3")
+        )
         if cls.is_anthropic_reasoning_model(config) or is_google_reasoner_with_configurable_thinking:
             config.enable_reasoner = bool(reasoning)
             config.put_inner_thoughts_in_kwargs = False
@@ -431,8 +433,8 @@
                 config.max_reasoning_tokens = 1024
             return config

-        # Google Gemini 2.5 Pro: not possible to disable
-        if config.model.startswith("gemini-2.5-pro"):
+        # Google Gemini 2.5 Pro and Gemini 3: not possible to disable
+        if config.model.startswith("gemini-2.5-pro") or config.model.startswith("gemini-3"):
             config.put_inner_thoughts_in_kwargs = False
             config.enable_reasoner = True
             if config.max_reasoning_tokens == 0:
@@ -466,8 +468,8 @@
         # Set verbosity for GPT-5 models
         if config.model.startswith("gpt-5") and config.verbosity is None:
             config.verbosity = "medium"
-        elif config.model.startswith("gemini-2.5-pro"):
-            logger.warning("Reasoning cannot be disabled for Gemini 2.5 Pro model")
+        elif config.model.startswith("gemini-2.5-pro") or config.model.startswith("gemini-3"):
+            logger.warning(f"Reasoning cannot be disabled for {config.model} model")
             # Handle as non-reasoner until we support summary
             config.put_inner_thoughts_in_kwargs = True
             config.enable_reasoner = True
diff --git a/tests/configs/llm_model_configs/gemini-2.5-pro-vertex.json b/tests/configs/llm_model_configs/gemini-2.5-pro-vertex.json
index 4231e1c7..6a0fca1f 100644
--- a/tests/configs/llm_model_configs/gemini-2.5-pro-vertex.json
+++ b/tests/configs/llm_model_configs/gemini-2.5-pro-vertex.json
@@ -3,5 +3,7 @@
   "model_endpoint_type": "google_vertex",
   "model_endpoint": "https://us-central1-aiplatform.googleapis.com/v1/projects/memgpt-428419/locations/us-central1",
   "context_window": 1048576,
-  "put_inner_thoughts_in_kwargs": true
+  "put_inner_thoughts_in_kwargs": false,
+  "enable_reasoner": true,
+  "max_reasoning_tokens": 1024
 }
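The schema change above narrows which Google reasoners honor the reasoning toggle: 2.5 Flash stays configurable, while 2.5 Pro and 3.x always reason. A compact sketch of that rule follows, with plain prefix checks standing in for `is_google_vertex_reasoning_model`/`is_google_ai_reasoning_model`; the helper name is hypothetical:

```python
def reasoning_is_configurable(model: str) -> bool:
    # Stand-in for is_google_vertex_reasoning_model / is_google_ai_reasoning_model:
    # after the patch these match the 2.5 and 3.x reasoning prefixes.
    is_google_reasoner = model.startswith(("gemini-2.5-flash", "gemini-2.5-pro", "gemini-3"))
    # The toggle is honored only for reasoners that can actually disable thinking.
    return is_google_reasoner and not model.startswith(("gemini-2.5-pro", "gemini-3"))


assert reasoning_is_configurable("gemini-2.5-flash") is True
assert reasoning_is_configurable("gemini-2.5-pro") is False
assert reasoning_is_configurable("gemini-3-pro-preview") is False  # illustrative model name
```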
diff --git a/tests/configs/llm_model_configs/gemini-2.5-pro.json b/tests/configs/llm_model_configs/gemini-2.5-pro.json
index c291917c..e8a11b2b 100644
--- a/tests/configs/llm_model_configs/gemini-2.5-pro.json
+++ b/tests/configs/llm_model_configs/gemini-2.5-pro.json
@@ -4,5 +4,7 @@
   "model_endpoint_type": "google_ai",
   "model_endpoint": "https://generativelanguage.googleapis.com",
   "model_wrapper": null,
-  "put_inner_thoughts_in_kwargs": true
+  "put_inner_thoughts_in_kwargs": false,
+  "enable_reasoner": true,
+  "max_reasoning_tokens": 1024
 }
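The two test configs now carry reasoning fields consistent with what the schema enforces for models that cannot disable thinking. A hedged sketch of how such a config might be checked, assuming the JSON files are complete `LLMConfig` payloads that the Pydantic model accepts directly; the load path comes from the diff, but the check itself is illustrative and not part of the patch:

```python
import json

from letta.schemas.llm_config import LLMConfig

# Illustrative check, not part of the patch: load an updated test config and
# confirm its reasoning fields match the values the schema enforces for
# models that cannot disable thinking.
with open("tests/configs/llm_model_configs/gemini-2.5-pro.json") as f:
    config = LLMConfig(**json.load(f))

assert config.put_inner_thoughts_in_kwargs is False
assert config.enable_reasoner is True
assert config.max_reasoning_tokens == 1024
```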