fix: include google_ai model endpoint type when setting reasoning tokens for google reasoning models
Co-authored-by: Jin Peng <jinjpeng@Jins-MacBook-Pro.local>
@@ -255,10 +255,13 @@ class GoogleVertexClient(LLMClientBase):
             # Otherwise, use the value from max_reasoning_tokens
             if "flash" in llm_config.model:
                 # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
+                thinking_budget = llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
+                if thinking_budget <= 0:
+                    logger.error(
+                        f"Thinking budget of {thinking_budget} for Gemini reasoning model {llm_config.model}, this will likely cause tool call failures"
+                    )
                 thinking_config = ThinkingConfig(
-                    thinking_budget=(
-                        llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
-                    ),
+                    thinking_budget=(thinking_budget),
                 )
                 request_data["config"]["thinking_config"] = thinking_config.model_dump()

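The hunk above replaces the inline conditional with a precomputed `thinking_budget` so the value can be sanity-checked (and logged) before it is placed into `ThinkingConfig`. A minimal standalone sketch of that selection-and-validation flow; the `FakeLLMConfig` dataclass, `resolve_thinking_budget` name, and the 512 fallback are illustrative stand-ins, not the repository's actual types or values:

```python
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class FakeLLMConfig:
    """Illustrative stand-in for LLMConfig; only the fields the sketch needs."""

    model: str
    enable_reasoner: bool
    max_reasoning_tokens: int


def resolve_thinking_budget(config: FakeLLMConfig, model_minimum: int) -> int:
    """Use the explicit budget when the reasoner is enabled, otherwise fall back
    to a per-model minimum, and log when the result would disable thinking."""
    budget = config.max_reasoning_tokens if config.enable_reasoner else model_minimum
    if budget <= 0:
        logger.error(
            f"Thinking budget of {budget} for Gemini reasoning model {config.model}, "
            "this will likely cause tool call failures"
        )
    return budget


# Reasoner disabled, so the per-model minimum (512 here, purely illustrative) is used.
print(resolve_thinking_budget(FakeLLMConfig("gemini-2.5-flash", False, 0), 512))  # -> 512
```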
||||
@@ -496,6 +499,12 @@ class GoogleVertexClient(LLMClientBase):
             "required": ["name", "args"],
         }

+    # https://ai.google.dev/gemini-api/docs/thinking#set-budget
+    # | Model          | Default setting                                             | Range     | Disable thinking    | Turn on dynamic thinking |
+    # |----------------|-------------------------------------------------------------|-----------|---------------------|--------------------------|
+    # | 2.5 Pro        | Dynamic thinking: Model decides when and how much to think | 128-32768 | N/A: Cannot disable | thinkingBudget = -1      |
+    # | 2.5 Flash      | Dynamic thinking: Model decides when and how much to think | 0-24576   | thinkingBudget = 0  | thinkingBudget = -1      |
+    # | 2.5 Flash Lite | Model does not think                                        | 512-24576 | thinkingBudget = 0  | thinkingBudget = -1      |
     def get_thinking_budget(self, model: str) -> bool:
         if model_settings.gemini_force_minimum_thinking_budget:
             if all(substring in model for substring in ["2.5", "flash", "lite"]):

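The table added above is only a comment in the source. Encoded as data, the same documented ranges could drive a clamp helper like the sketch below; the `GEMINI_THINKING_BUDGETS` dict and `clamp_thinking_budget` function are hypothetical illustrations of the table, not the repository's `get_thinking_budget`:

```python
# Ranges and disable-values taken from the comment table above; -1 means dynamic thinking.
GEMINI_THINKING_BUDGETS = {
    "gemini-2.5-pro": {"min": 128, "max": 32768, "can_disable": False},
    "gemini-2.5-flash": {"min": 0, "max": 24576, "can_disable": True},
    "gemini-2.5-flash-lite": {"min": 512, "max": 24576, "can_disable": True},
}


def clamp_thinking_budget(model: str, requested: int) -> int:
    """Clamp a requested budget into the documented range for the model (illustrative)."""
    limits = GEMINI_THINKING_BUDGETS.get(model)
    if limits is None or requested == -1:  # unknown model, or dynamic thinking requested
        return requested
    if requested <= 0:
        # Only models that allow disabling may keep a zero budget.
        return 0 if limits["can_disable"] else limits["min"]
    return max(limits["min"], min(requested, limits["max"]))


print(clamp_thinking_budget("gemini-2.5-pro", 0))        # 128: Pro cannot disable thinking
print(clamp_thinking_budget("gemini-2.5-flash", 50000))  # 24576: clamped to the documented max
```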
||||
@@ -245,6 +245,12 @@ class LLMConfig(BaseModel):
             config.model.startswith("gemini-2.5-flash") or config.model.startswith("gemini-2.5-pro")
         )

+    @classmethod
+    def is_google_ai_reasoning_model(cls, config: "LLMConfig") -> bool:
+        return config.model_endpoint_type == "google_ai" and (
+            config.model.startswith("gemini-2.5-flash") or config.model.startswith("gemini-2.5-pro")
+        )
+
     @classmethod
     def supports_verbosity(cls, config: "LLMConfig") -> bool:
         """Check if the model supports verbosity control."""

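The new `is_google_ai_reasoning_model` classmethod mirrors the existing Vertex check but keys on the `google_ai` endpoint type. A self-contained sketch of the predicate it adds; the `Config` dataclass here is an illustrative stand-in for `LLMConfig`:

```python
from dataclasses import dataclass


@dataclass
class Config:
    """Illustrative stand-in for LLMConfig with only the two fields the check reads."""

    model: str
    model_endpoint_type: str


def is_google_ai_reasoning_model(config: Config) -> bool:
    # Same condition as the added classmethod: google_ai endpoint + a Gemini 2.5 model.
    return config.model_endpoint_type == "google_ai" and (
        config.model.startswith("gemini-2.5-flash") or config.model.startswith("gemini-2.5-pro")
    )


print(is_google_ai_reasoning_model(Config("gemini-2.5-pro", "google_ai")))      # True
print(is_google_ai_reasoning_model(Config("gemini-2.5-pro", "google_vertex")))  # False: covered by the Vertex check instead
print(is_google_ai_reasoning_model(Config("gemini-2.0-flash", "google_ai")))    # False: not a 2.5 reasoning model
```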
@@ -276,7 +282,7 @@ class LLMConfig(BaseModel):
             config.put_inner_thoughts_in_kwargs = False
             if config.max_reasoning_tokens == 0:
                 config.max_reasoning_tokens = 1024
-        elif cls.is_google_vertex_reasoning_model(config):
+        elif cls.is_google_vertex_reasoning_model(config) or cls.is_google_ai_reasoning_model(config):
             # Handle as non-reasoner until we support summary
             config.put_inner_thoughts_in_kwargs = True
             if config.max_reasoning_tokens == 0:

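The only functional change in this last hunk is the widened `elif`, so Google AI endpoint models now take the same branch as Vertex ones. A tiny sketch of the combined predicate, assuming the Vertex endpoint type string is "google_vertex" (the function name and parameters here are hypothetical):

```python
def takes_google_reasoning_branch(endpoint_type: str, model: str) -> bool:
    """True when the config would enter the Google reasoning-model branch after this fix."""
    is_gemini_25 = model.startswith("gemini-2.5-flash") or model.startswith("gemini-2.5-pro")
    is_vertex = endpoint_type == "google_vertex" and is_gemini_25
    is_google_ai = endpoint_type == "google_ai" and is_gemini_25
    return is_vertex or is_google_ai


print(takes_google_reasoning_branch("google_ai", "gemini-2.5-flash"))    # True  (new behavior)
print(takes_google_reasoning_branch("google_vertex", "gemini-2.5-pro"))  # True  (unchanged)
print(takes_google_reasoning_branch("openai", "gemini-2.5-pro"))         # False (different provider)
```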