fix: preserve agent max_tokens when caller doesn't explicitly set it (#9679)

* fix: preserve agent max_tokens when caller doesn't explicitly set it

When updating an agent with convenience fields (model, model_settings)
but without an explicit max_tokens, the server was constructing a fresh
LLMConfig via get_llm_config_from_handle_async. The Pydantic validator
on LLMConfig hardcodes max_tokens=16384 for gpt-5* models, silently
overriding the agent's existing value (e.g. 128000).

This was triggered by reasoning tab-switch in the CLI, which sends
model + model_settings (with reasoning_effort) but no max_tokens.

Now, when request.max_tokens is None and model_settings does not
explicitly set max_output_tokens, we carry forward the agent's
current max_tokens instead of accepting the provider default.

* fix: use correct 128k max_output_tokens defaults for gpt-5.2/5.3

- Update OpenAI provider fallback to return 128000 for gpt-5.2*/5.3*
  models (except -chat variants which are 16k)
- Update LLMConfig Pydantic validator to match
- Update gpt-5.2 default_config factory to use 128000
- Move server-side max_tokens preservation guard into the
  model_settings branch where llm_config is already available
This commit is contained in:
jnjpng
2026-02-25 17:18:08 -08:00
committed by Caren Thomas
parent 5d55d4ccd4
commit 46971414a4
3 changed files with 26 additions and 6 deletions

View File

@@ -1,3 +1,4 @@
import re
from typing import TYPE_CHECKING, Literal, Optional
from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):
# Set max_tokens defaults based on model (only if not explicitly provided)
if "max_tokens" not in values:
if model.startswith("gpt-5"): # Covers both gpt-5 and gpt-5.1
if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
values["max_tokens"] = 128000
elif model.startswith("gpt-5"):
values["max_tokens"] = 16384
elif model == "gpt-4.1":
values["max_tokens"] = 8192
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
context_window=272000,
reasoning_effort="none", # Default to "none" for GPT-5.2
verbosity="medium",
max_tokens=16384,
max_tokens=128000,
)
elif model_name == "letta":
return cls(

View File

@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
except Exception as e:
raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
@staticmethod
def _openai_default_max_output_tokens(model_name: str) -> int:
    """Return a sensible max-output-tokens default for OpenAI models.

    Names beginning with ``gpt-5.2`` or ``gpt-5.3`` default to 128000,
    except names containing ``-chat``, which (like every other model
    name) default to 16384.

    Args:
        model_name: The OpenAI model identifier, e.g. ``"gpt-5.2"``.

    Returns:
        128000 for non-chat gpt-5.2*/gpt-5.3* names, otherwise 16384.
    """
    # str.startswith accepts a tuple of prefixes, so the family check
    # needs neither a regular expression nor a per-call import.
    in_large_output_family = model_name.startswith(("gpt-5.2", "gpt-5.3"))
    if in_large_output_family and "-chat" not in model_name:
        return 128000
    return 16384
def get_default_max_output_tokens(self, model_name: str) -> int:
    """Get the default max output tokens for OpenAI models (sync fallback).

    Args:
        model_name: The OpenAI model identifier.

    Returns:
        The value returned by ``self._openai_default_max_output_tokens``
        for ``model_name``.
    """
    # Delegate to the shared helper so the sync fallback stays model-aware
    # rather than returning one flat value for every model.
    return self._openai_default_max_output_tokens(model_name)
async def get_default_max_output_tokens_async(self, model_name: str) -> int:
"""Get the default max output tokens for OpenAI models.
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
if max_output is not None:
return max_output
# Simple default for openai
return 16384
return self._openai_default_max_output_tokens(model_name)
async def _get_models_async(self) -> list[dict]:
from letta.llm_api.openai import openai_get_model_list_async

View File

@@ -675,6 +675,12 @@ class SyncServer(object):
# Get the current agent's llm_config if not already set
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config = agent.llm_config.model_copy()
else:
# TODO: Refactor update_agent to accept partial llm_config so we
# don't need to fetch the full agent just to preserve max_tokens.
if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
request.llm_config.max_tokens = agent.llm_config.max_tokens
update_llm_config_params = request.model_settings._to_legacy_config_params()
# Don't clobber max_tokens with the Pydantic default when the caller
# didn't explicitly provide max_output_tokens in the request.