fix: gemini 2.5 thinking models fail to call functions if thinking is fully disabled

Co-authored-by: Jin Peng <jinjpeng@Jins-MacBook-Pro.local>
This commit is contained in:
jnjpng
2025-08-08 16:34:32 -07:00
committed by GitHub
parent c6002744e6
commit 243a2b65e0
3 changed files with 14 additions and 3 deletions

View File

@@ -15,7 +15,7 @@ def json_dumps(data, indent=2) -> str:
try:
return obj.decode("utf-8")
except Exception:
print(f"Error decoding bytes as utf-8: {obj}")
# TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini
return base64.b64encode(obj).decode("utf-8")
raise TypeError(f"Type {type(obj)} not serializable")

View File

@@ -254,8 +254,11 @@ class GoogleVertexClient(LLMClientBase):
# If enable_reasoner is False, set thinking_budget to 0
# Otherwise, use the value from max_reasoning_tokens
if "flash" in llm_config.model:
# Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
thinking_config = ThinkingConfig(
thinking_budget=llm_config.max_reasoning_tokens if llm_config.enable_reasoner else 0,
thinking_budget=(
llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
),
)
request_data["config"]["thinking_config"] = thinking_config.model_dump()
@@ -292,7 +295,6 @@ class GoogleVertexClient(LLMClientBase):
}
}
"""
# print(response_data)
response = GenerateContentResponse(**response_data)
try:
@@ -494,6 +496,14 @@ class GoogleVertexClient(LLMClientBase):
"required": ["name", "args"],
}
def get_thinking_budget(self, model: str) -> int:
    """Return the minimum thinking-token budget to apply when reasoning is disabled.

    Gemini 2.5 flash models may fail to call tools (even with
    FunctionCallingConfigMode.ANY) if thinking is fully disabled, so when the
    ``gemini_force_minimum_thinking_budget`` setting is on we force a small
    non-zero budget instead of 0.

    Args:
        model: Model identifier string (e.g. "gemini-2.5-flash-lite").

    Returns:
        512 for 2.5 flash-lite models, 1 for other 2.5 flash models, and 0
        otherwise (thinking fully disabled). NOTE: the original annotation
        said ``bool``; the method actually returns ``int`` — fixed here.
    """
    if model_settings.gemini_force_minimum_thinking_budget:
        # Check the more specific "lite" variant first: a lite model name
        # would also match the plain flash condition below.
        if all(substring in model for substring in ["2.5", "flash", "lite"]):
            return 512
        elif all(substring in model for substring in ["2.5", "flash"]):
            return 1
    return 0
@trace_method
def handle_llm_error(self, e: Exception) -> Exception:
# Fallback to base implementation

View File

@@ -144,6 +144,7 @@ class ModelSettings(BaseSettings):
# google ai
gemini_api_key: Optional[str] = None
gemini_base_url: str = "https://generativelanguage.googleapis.com/"
gemini_force_minimum_thinking_budget: bool = False
# google vertex
google_cloud_project: Optional[str] = None