From 0334fabc960d8a573740c4098b98a87d5dcabd16 Mon Sep 17 00:00:00 2001
From: jnjpng
Date: Tue, 19 Aug 2025 14:58:35 -0700
Subject: [PATCH] fix: include `google_ai` model endpoint type when setting reasoning tokens for google reasoning models

Co-authored-by: Jin Peng
---
 letta/llm_api/google_vertex_client.py | 15 ++++++++++++---
 letta/schemas/llm_config.py           |  8 +++++++-
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py
index 49941696..63a3bcec 100644
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -255,10 +255,13 @@ class GoogleVertexClient(LLMClientBase):
         # Otherwise, use the value from max_reasoning_tokens
         if "flash" in llm_config.model:
             # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
+            thinking_budget = llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
+            if thinking_budget <= 0:
+                logger.error(
+                    f"Thinking budget of {thinking_budget} for Gemini reasoning model {llm_config.model}, this will likely cause tool call failures"
+                )
             thinking_config = ThinkingConfig(
-                thinking_budget=(
-                    llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
-                ),
+                thinking_budget=(thinking_budget),
             )
             request_data["config"]["thinking_config"] = thinking_config.model_dump()

@@ -496,6 +499,12 @@ class GoogleVertexClient(LLMClientBase):
             "required": ["name", "args"],
         }

+    # https://ai.google.dev/gemini-api/docs/thinking#set-budget
+    # | Model           | Default setting                                            | Range     | Disable thinking    | Turn on dynamic thinking|
+    # |-----------------|------------------------------------------------------------|-----------|---------------------|-------------------------|
+    # | 2.5 Pro         | Dynamic thinking: Model decides when and how much to think | 128-32768 | N/A: Cannot disable | thinkingBudget = -1     |
+    # | 2.5 Flash       | Dynamic thinking: Model decides when and how much to think | 0-24576   | thinkingBudget = 0  | thinkingBudget = -1     |
+    # | 2.5 Flash Lite  | Model does not think                                       | 512-24576 | thinkingBudget = 0  | thinkingBudget = -1     |
     def get_thinking_budget(self, model: str) -> bool:
         if model_settings.gemini_force_minimum_thinking_budget:
             if all(substring in model for substring in ["2.5", "flash", "lite"]):
diff --git a/letta/schemas/llm_config.py b/letta/schemas/llm_config.py
index c162317a..4756bda0 100644
--- a/letta/schemas/llm_config.py
+++ b/letta/schemas/llm_config.py
@@ -245,6 +245,12 @@ class LLMConfig(BaseModel):
             config.model.startswith("gemini-2.5-flash") or config.model.startswith("gemini-2.5-pro")
         )

+    @classmethod
+    def is_google_ai_reasoning_model(cls, config: "LLMConfig") -> bool:
+        return config.model_endpoint_type == "google_ai" and (
+            config.model.startswith("gemini-2.5-flash") or config.model.startswith("gemini-2.5-pro")
+        )
+
     @classmethod
     def supports_verbosity(cls, config: "LLMConfig") -> bool:
         """Check if the model supports verbosity control."""
@@ -276,7 +282,7 @@ class LLMConfig(BaseModel):
             config.put_inner_thoughts_in_kwargs = False
             if config.max_reasoning_tokens == 0:
                 config.max_reasoning_tokens = 1024
-        elif cls.is_google_vertex_reasoning_model(config):
+        elif cls.is_google_vertex_reasoning_model(config) or cls.is_google_ai_reasoning_model(config):
             # Handle as non-reasoner until we support summary
             config.put_inner_thoughts_in_kwargs = True
             if config.max_reasoning_tokens == 0: