diff --git a/letta/schemas/llm_config.py b/letta/schemas/llm_config.py
index 94f294b5..7d3e32a1 100644
--- a/letta/schemas/llm_config.py
+++ b/letta/schemas/llm_config.py
@@ -1,3 +1,4 @@
+import re
 from typing import TYPE_CHECKING, Literal, Optional
 
 from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -139,7 +140,9 @@ class LLMConfig(BaseModel):
 
         # Set max_tokens defaults based on model (only if not explicitly provided)
         if "max_tokens" not in values:
-            if model.startswith("gpt-5"):  # Covers both gpt-5 and gpt-5.1
+            if re.match(r"^gpt-5\.[23]", model) and "-chat" not in model:
+                values["max_tokens"] = 128000
+            elif model.startswith("gpt-5"):
                 values["max_tokens"] = 16384
             elif model == "gpt-4.1":
                 values["max_tokens"] = 8192
@@ -299,7 +302,7 @@ class LLMConfig(BaseModel):
                 context_window=272000,
                 reasoning_effort="none",  # Default to "none" for GPT-5.2
                 verbosity="medium",
-                max_tokens=16384,
+                max_tokens=128000,
             )
         elif model_name == "letta":
             return cls(
diff --git a/letta/schemas/providers/openai.py b/letta/schemas/providers/openai.py
index c4c979e4..c5431596 100644
--- a/letta/schemas/providers/openai.py
+++ b/letta/schemas/providers/openai.py
@@ -50,10 +50,22 @@ class OpenAIProvider(Provider):
         except Exception as e:
             raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
 
+    @staticmethod
+    def _openai_default_max_output_tokens(model_name: str) -> int:
+        """Return a sensible max-output-tokens default for OpenAI models.
+
+        gpt-5.2* / gpt-5.3* support 128k output tokens, except the
+        `-chat` variants which are capped at 16k.
+        """
+        import re
+
+        if re.match(r"^gpt-5\.[23]", model_name) and "-chat" not in model_name:
+            return 128000
+        return 16384
+
     def get_default_max_output_tokens(self, model_name: str) -> int:
         """Get the default max output tokens for OpenAI models (sync fallback)."""
-        # Simple default for openai
-        return 16384
+        return self._openai_default_max_output_tokens(model_name)
 
     async def get_default_max_output_tokens_async(self, model_name: str) -> int:
         """Get the default max output tokens for OpenAI models.
@@ -67,8 +79,7 @@ class OpenAIProvider(Provider):
         if max_output is not None:
             return max_output
 
-        # Simple default for openai
-        return 16384
+        return self._openai_default_max_output_tokens(model_name)
 
     async def _get_models_async(self) -> list[dict]:
         from letta.llm_api.openai import openai_get_model_list_async
diff --git a/letta/server/server.py b/letta/server/server.py
index 33c98482..06ca99e6 100644
--- a/letta/server/server.py
+++ b/letta/server/server.py
@@ -675,6 +675,12 @@ class SyncServer(object):
             # Get the current agent's llm_config if not already set
             agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
             request.llm_config = agent.llm_config.model_copy()
+        else:
+            # TODO: Refactor update_agent to accept partial llm_config so we
+            # don't need to fetch the full agent just to preserve max_tokens.
+            if request.max_tokens is None and "max_output_tokens" not in request.model_settings.model_fields_set:
+                agent = await self.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor)
+                request.llm_config.max_tokens = agent.llm_config.max_tokens
         update_llm_config_params = request.model_settings._to_legacy_config_params()
         # Don't clobber max_tokens with the Pydantic default when the caller
        # didn't explicitly provide max_output_tokens in the request.