fix: preserve agent max_tokens when caller doesn't explicitly set it (#9679)

* fix: preserve agent max_tokens when caller doesn't explicitly set it

  When updating an agent with convenience fields (model, model_settings) but without an explicit max_tokens, the server was constructing a fresh LLMConfig via get_llm_config_from_handle_async. The Pydantic validator on LLMConfig hardcodes max_tokens=16384 for gpt-5* models, silently overriding the agent's existing value (e.g. 128000).

  This was triggered by the reasoning tab-switch in the CLI, which sends model + model_settings (with reasoning_effort) but no max_tokens.

  Now, when request.max_tokens is None, we carry forward the agent's current max_tokens instead of accepting the provider default.

* fix: use correct 128k max_output_tokens defaults for gpt-5.2/5.3

  - Update the OpenAI provider fallback to return 128000 for gpt-5.2*/5.3* models (except the -chat variants, which are capped at 16k)
  - Update the LLMConfig Pydantic validator to match
  - Update the gpt-5.2 default_config factory to use 128000
  - Move the server-side max_tokens preservation guard into the model_settings branch, where llm_config is already available
2026-02-25 17:18:08 -08:00
parent 5d55d4ccd4
commit 46971414a4
3 changed files with 26 additions and 6 deletions
--- a/letta/schemas/llm_config.py
+++ b/letta/schemas/llm_config.py
@@ -1,3 +1,4 @@
+import re
 from typing import TYPE_CHECKING, Literal, Optional

 from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):

        # Set max_tokens defaults based on model (only if not explicitly provided)
        if "max_tokens" not in values:
-            if model.startswith("gpt-5"):  # Covers both gpt-5 and gpt-5.1
+            if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
+                values["max_tokens"] = 128000
+            elif model.startswith("gpt-5"):
                values["max_tokens"] = 16384
            elif model == "gpt-4.1":
                values["max_tokens"] = 8192
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
                context_window=272000,
                reasoning_effort="none",  # Default to "none" for GPT-5.2
                verbosity="medium",
-                max_tokens=16384,
+                max_tokens=128000,
            )
        elif model_name == "letta":
            return cls(
--- a/letta/schemas/providers/openai.py
+++ b/letta/schemas/providers/openai.py
@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
        except Exception as e:
            raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)

+    @staticmethod
+    def _openai_default_max_output_tokens(model_name: str) -> int:
+        """Return a sensible max-output-tokens default for OpenAI models.
+
+        gpt-5.2* / gpt-5.3* support 128k output tokens, except the
+        `-chat` variants which are capped at 16k.
+        """
+        import re
+
+        if re.match(r"^gpt-5\.[23]", model_name) and "-chat" not in model_name:
+            return 128000
+        return 16384
+
    def get_default_max_output_tokens(self, model_name: str) -> int:
        """Get the default max output tokens for OpenAI models (sync fallback)."""
-        # Simple default for openai
-        return 16384
+        return self._openai_default_max_output_tokens(model_name)

    async def get_default_max_output_tokens_async(self, model_name: str) -> int:
        """Get the default max output tokens for OpenAI models.
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
        if max_output is not None:
            return max_output

-        # Simple default for openai
-        return 16384
+        return self._openai_default_max_output_tokens(model_name)

    async def _get_models_async(self) -> list[dict]:
        from letta.llm_api.openai import openai_get_model_list_async
--- a/letta/server/server.py
+++ b/letta/server/server.py
@@ -675,6 +675,12 @@ class SyncServer(object):
                # Get the current agent's llm_config if not already set
                agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
                request.llm_config = agent.llm_config.model_copy()
+            else:
+                # TODO: Refactor update_agent to accept partial llm_config so we
+                # don't need to fetch the full agent just to preserve max_tokens.
+                if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
+                    agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
+                    request.llm_config.max_tokens = agent.llm_config.max_tokens
            update_llm_config_params = request.model_settings._to_legacy_config_params()
            # Don't clobber max_tokens with the Pydantic default when the caller
            # didn't explicitly provide max_output_tokens in the request.