From 243a2b65e0ee25de6e5458031cd58b475d5a9571 Mon Sep 17 00:00:00 2001 From: jnjpng Date: Fri, 8 Aug 2025 16:34:32 -0700 Subject: [PATCH] fix: gemini 2.5 thinking models fail to call functions if thinking is fully disabled Co-authored-by: Jin Peng --- letta/helpers/json_helpers.py | 2 +- letta/llm_api/google_vertex_client.py | 14 ++++++++++++-- letta/settings.py | 1 + 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/letta/helpers/json_helpers.py b/letta/helpers/json_helpers.py index 6618f274..ff6943b8 100644 --- a/letta/helpers/json_helpers.py +++ b/letta/helpers/json_helpers.py @@ -15,7 +15,7 @@ def json_dumps(data, indent=2) -> str: try: return obj.decode("utf-8") except Exception: - print(f"Error decoding bytes as utf-8: {obj}") + # TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini return base64.b64encode(obj).decode("utf-8") raise TypeError(f"Type {type(obj)} not serializable") diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py index 08fa7282..32ca5d69 100644 --- a/letta/llm_api/google_vertex_client.py +++ b/letta/llm_api/google_vertex_client.py @@ -254,8 +254,11 @@ class GoogleVertexClient(LLMClientBase): # If enable_reasoner is False, set thinking_budget to 0 # Otherwise, use the value from max_reasoning_tokens if "flash" in llm_config.model: + # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure thinking_config = ThinkingConfig( - thinking_budget=llm_config.max_reasoning_tokens if llm_config.enable_reasoner else 0, + thinking_budget=( + llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model) + ), ) request_data["config"]["thinking_config"] = thinking_config.model_dump() @@ -292,7 +295,6 @@ class GoogleVertexClient(LLMClientBase): } } """ - # print(response_data) response = 
GenerateContentResponse(**response_data) try: @@ -494,6 +496,14 @@ class GoogleVertexClient(LLMClientBase): "required": ["name", "args"], } + def get_thinking_budget(self, model: str) -> int: + if model_settings.gemini_force_minimum_thinking_budget: + if all(substring in model for substring in ["2.5", "flash", "lite"]): + return 512 + elif all(substring in model for substring in ["2.5", "flash"]): + return 1 + return 0 + @trace_method def handle_llm_error(self, e: Exception) -> Exception: # Fallback to base implementation diff --git a/letta/settings.py b/letta/settings.py index f51f3f81..1c047b47 100644 --- a/letta/settings.py +++ b/letta/settings.py @@ -144,6 +144,7 @@ class ModelSettings(BaseSettings): # google ai gemini_api_key: Optional[str] = None gemini_base_url: str = "https://generativelanguage.googleapis.com/" + gemini_force_minimum_thinking_budget: bool = False # google vertex google_cloud_project: Optional[str] = None