fix: preserve agent max_tokens when caller doesn't explicitly set it (#9679)
* fix: preserve agent max_tokens when caller doesn't explicitly set it

  When updating an agent with convenience fields (model, model_settings) but
  without an explicit max_tokens, the server was constructing a fresh LLMConfig
  via get_llm_config_from_handle_async. The Pydantic validator on LLMConfig
  hardcodes max_tokens=16384 for gpt-5* models, silently overriding the agent's
  existing value (e.g. 128000). This was triggered by the reasoning tab-switch
  in the CLI, which sends model + model_settings (with reasoning_effort) but no
  max_tokens. Now, when request.max_tokens is None we carry forward the agent's
  current max_tokens instead of accepting the provider default.

* fix: use correct 128k max_output_tokens defaults for gpt-5.2/5.3

  - Update OpenAI provider fallback to return 128000 for gpt-5.2*/5.3* models
    (except -chat variants, which are capped at 16k)
  - Update LLMConfig Pydantic validator to match
  - Update gpt-5.2 default_config factory to use 128000
  - Move server-side max_tokens preservation guard into the model_settings
    branch where llm_config is already available
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import re
|
||||
from typing import TYPE_CHECKING, Literal, Optional
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):
|
||||
|
||||
# Set max_tokens defaults based on model (only if not explicitly provided)
|
||||
if "max_tokens" not in values:
|
||||
if model.startswith("gpt-5"): # Covers both gpt-5 and gpt-5.1
|
||||
if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
|
||||
values["max_tokens"] = 128000
|
||||
elif model.startswith("gpt-5"):
|
||||
values["max_tokens"] = 16384
|
||||
elif model == "gpt-4.1":
|
||||
values["max_tokens"] = 8192
|
||||
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
|
||||
context_window=272000,
|
||||
reasoning_effort="none", # Default to "none" for GPT-5.2
|
||||
verbosity="medium",
|
||||
max_tokens=16384,
|
||||
max_tokens=128000,
|
||||
)
|
||||
elif model_name == "letta":
|
||||
return cls(
|
||||
|
||||
@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
|
||||
except Exception as e:
|
||||
raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
|
||||
|
||||
@staticmethod
|
||||
def _openai_default_max_output_tokens(model_name: str) -> int:
|
||||
"""Return a sensible max-output-tokens default for OpenAI models.
|
||||
|
||||
gpt-5.2* / gpt-5.3* support 128k output tokens, except the
|
||||
`-chat` variants which are capped at 16k.
|
||||
"""
|
||||
import re
|
||||
|
||||
if re.match(r"^gpt-5\.[23]", model_name) and "-chat" not in model_name:
|
||||
return 128000
|
||||
return 16384
|
||||
|
||||
def get_default_max_output_tokens(self, model_name: str) -> int:
|
||||
"""Get the default max output tokens for OpenAI models (sync fallback)."""
|
||||
# Simple default for openai
|
||||
return 16384
|
||||
return self._openai_default_max_output_tokens(model_name)
|
||||
|
||||
async def get_default_max_output_tokens_async(self, model_name: str) -> int:
|
||||
"""Get the default max output tokens for OpenAI models.
|
||||
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
|
||||
if max_output is not None:
|
||||
return max_output
|
||||
|
||||
# Simple default for openai
|
||||
return 16384
|
||||
return self._openai_default_max_output_tokens(model_name)
|
||||
|
||||
async def _get_models_async(self) -> list[dict]:
|
||||
from letta.llm_api.openai import openai_get_model_list_async
|
||||
|
||||
@@ -675,6 +675,12 @@ class SyncServer(object):
|
||||
# Get the current agent's llm_config if not already set
|
||||
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
|
||||
request.llm_config = agent.llm_config.model_copy()
|
||||
else:
|
||||
# TODO: Refactor update_agent to accept partial llm_config so we
|
||||
# don't need to fetch the full agent just to preserve max_tokens.
|
||||
if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
|
||||
agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
|
||||
request.llm_config.max_tokens = agent.llm_config.max_tokens
|
||||
update_llm_config_params = request.model_settings._to_legacy_config_params()
|
||||
# Don't clobber max_tokens with the Pydantic default when the caller
|
||||
# didn't explicitly provide max_output_tokens in the request.
|
||||
|
||||
Reference in New Issue
Block a user