From 243a2b65e0ee25de6e5458031cd58b475d5a9571 Mon Sep 17 00:00:00 2001 From: jnjpng Date: Fri, 8 Aug 2025 16:34:32 -0700 Subject: [PATCH] fix: gemini 2.5 thinking models fail to call functions if thinking is fully disabled Co-authored-by: Jin Peng --- letta/helpers/json_helpers.py | 2 +- letta/llm_api/google_vertex_client.py | 14 ++++++++++++-- letta/settings.py | 1 + 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/letta/helpers/json_helpers.py b/letta/helpers/json_helpers.py index 6618f274..ff6943b8 100644 --- a/letta/helpers/json_helpers.py +++ b/letta/helpers/json_helpers.py @@ -15,7 +15,7 @@ def json_dumps(data, indent=2) -> str: try: return obj.decode("utf-8") except Exception: - print(f"Error decoding bytes as utf-8: {obj}") + # TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini return base64.b64encode(obj).decode("utf-8") raise TypeError(f"Type {type(obj)} not serializable") diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py index 08fa7282..32ca5d69 100644 --- a/letta/llm_api/google_vertex_client.py +++ b/letta/llm_api/google_vertex_client.py @@ -254,8 +254,11 @@ class GoogleVertexClient(LLMClientBase): # If enable_reasoner is False, set thinking_budget to 0 # Otherwise, use the value from max_reasoning_tokens if "flash" in llm_config.model: + # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure thinking_config = ThinkingConfig( - thinking_budget=llm_config.max_reasoning_tokens if llm_config.enable_reasoner else 0, + thinking_budget=( + llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model) + ), ) request_data["config"]["thinking_config"] = thinking_config.model_dump() @@ -292,7 +295,6 @@ class GoogleVertexClient(LLMClientBase): } } """ - # print(response_data) response = 
GenerateContentResponse(**response_data) try: @@ -494,6 +496,14 @@ class GoogleVertexClient(LLMClientBase): "required": ["name", "args"], } + def get_thinking_budget(self, model: str) -> int: + if model_settings.gemini_force_minimum_thinking_budget: + if all(substring in model for substring in ["2.5", "flash", "lite"]): + return 512 + elif all(substring in model for substring in ["2.5", "flash"]): + return 1 + return 0 + @trace_method def handle_llm_error(self, e: Exception) -> Exception: # Fallback to base implementation diff --git a/letta/settings.py b/letta/settings.py index f51f3f81..1c047b47 100644 --- a/letta/settings.py +++ b/letta/settings.py @@ -144,6 +144,7 @@ class ModelSettings(BaseSettings): # google ai gemini_api_key: Optional[str] = None gemini_base_url: str = "https://generativelanguage.googleapis.com/" + gemini_force_minimum_thinking_budget: bool = False # google vertex google_cloud_project: Optional[str] = None