From 0334fabc960d8a573740c4098b98a87d5dcabd16 Mon Sep 17 00:00:00 2001
From: jnjpng
Date: Tue, 19 Aug 2025 14:58:35 -0700
Subject: [PATCH] fix: include `google_ai` model endpoint type when setting reasoning tokens for google reasoning models

Co-authored-by: Jin Peng
---
 letta/llm_api/google_vertex_client.py | 15 ++++++++++++---
 letta/schemas/llm_config.py           |  8 +++++++-
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py
index 49941696..63a3bcec 100644
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -255,10 +255,13 @@ class GoogleVertexClient(LLMClientBase):
         # Otherwise, use the value from max_reasoning_tokens
         if "flash" in llm_config.model:
             # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
+            thinking_budget = llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
+            if thinking_budget <= 0:
+                logger.error(
+                    f"Thinking budget of {thinking_budget} for Gemini reasoning model {llm_config.model}, this will likely cause tool call failures"
+                )
             thinking_config = ThinkingConfig(
-                thinking_budget=(
-                    llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
-                ),
+                thinking_budget=(thinking_budget),
             )
             request_data["config"]["thinking_config"] = thinking_config.model_dump()

@@ -496,6 +499,12 @@ class GoogleVertexClient(LLMClientBase):
             "required": ["name", "args"],
         }

+    # https://ai.google.dev/gemini-api/docs/thinking#set-budget
+    # | Model           | Default setting                                            | Range     | Disable thinking    | Turn on dynamic thinking|
+    # |-----------------|------------------------------------------------------------|-----------|---------------------|-------------------------|
+    # | 2.5 Pro         | Dynamic thinking: Model decides when and how much to think | 128-32768 | N/A: Cannot disable | thinkingBudget = -1     |
+    # | 2.5 Flash       | Dynamic thinking: Model decides when and how much to think | 0-24576   | thinkingBudget = 0  | thinkingBudget = -1     |
+    # | 2.5 Flash Lite  | Model does not think                                       | 512-24576 | thinkingBudget = 0  | thinkingBudget = -1     |
     def get_thinking_budget(self, model: str) -> bool:
         if model_settings.gemini_force_minimum_thinking_budget:
             if all(substring in model for substring in ["2.5", "flash", "lite"]):
diff --git a/letta/schemas/llm_config.py b/letta/schemas/llm_config.py
index c162317a..4756bda0 100644
--- a/letta/schemas/llm_config.py
+++ b/letta/schemas/llm_config.py
@@ -245,6 +245,12 @@ class LLMConfig(BaseModel):
             config.model.startswith("gemini-2.5-flash") or config.model.startswith("gemini-2.5-pro")
         )

+    @classmethod
+    def is_google_ai_reasoning_model(cls, config: "LLMConfig") -> bool:
+        return config.model_endpoint_type == "google_ai" and (
+            config.model.startswith("gemini-2.5-flash") or config.model.startswith("gemini-2.5-pro")
+        )
+
     @classmethod
     def supports_verbosity(cls, config: "LLMConfig") -> bool:
         """Check if the model supports verbosity control."""
@@ -276,7 +282,7 @@ class LLMConfig(BaseModel):
             config.put_inner_thoughts_in_kwargs = False
             if config.max_reasoning_tokens == 0:
                 config.max_reasoning_tokens = 1024
-        elif cls.is_google_vertex_reasoning_model(config):
+        elif cls.is_google_vertex_reasoning_model(config) or cls.is_google_ai_reasoning_model(config):
             # Handle as non-reasoner until we support summary
             config.put_inner_thoughts_in_kwargs = True
             if config.max_reasoning_tokens == 0: