From db9e0f42afe5c97d13f5f5836147193606d9411e Mon Sep 17 00:00:00 2001
From: jnjpng <jin@letta.com>
Date: Tue, 3 Mar 2026 11:48:19 -0800
Subject: [PATCH] fix(core): prevent ModelSettings default max_output_tokens
 from overriding agent config (#9739)

* fix(core): prevent ModelSettings default max_output_tokens from overriding agent config

When a conversation's model_settings were saved, the Pydantic default
of max_output_tokens=4096 was always persisted to the DB even when the
client never specified it. On subsequent messages, this default would
overwrite the agent's max_tokens (typically None) with 4096, silently
capping output.

Two changes:
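1. Use model_dump(exclude_unset=True) when persisting model_settings
   to the DB so Pydantic defaults are not saved. (Superseded by the
   last change in this message: exclude_unset=True also dropped
   provider_type, so only max_output_tokens is stripped instead.)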
2. Add model_fields_set guards at all callsites that apply
   _to_legacy_config_params() to skip max_tokens when it was not
   explicitly provided by the caller.

Also set max_output_tokens in the OpenAI Responses API request builder
only when llm_config.max_tokens is not None, so that an unset value is
omitted from the request rather than sent as null (which some models
treat as a hard 4096 cap).

* nit

* Fix model_settings serialization to preserve provider_type discriminator

Replace blanket exclude_unset=True with targeted removal of only
max_output_tokens when not explicitly set. The previous approach
stripped the provider_type field (a Literal with a default), which
broke discriminated union deserialization when reading back from DB.
---
 letta/llm_api/openai_client.py                |  6 +-
 .../rest_api/routers/v1/conversations.py      |  4 ++
 letta/server/server.py                        |  4 ++
 letta/services/conversation_manager.py        | 66 ++++++++++---------
 letta/services/streaming_service.py           |  4 ++
 letta/services/summarizer/compact.py          |  4 ++
 6 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py
index 3ef03792..e615315b 100644
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -389,7 +389,6 @@ class OpenAIClient(LLMClientBase):
             input=openai_messages_list,
             tools=responses_tools,
             tool_choice=tool_choice,
-            max_output_tokens=llm_config.max_tokens,
             temperature=llm_config.temperature if supports_temperature_param(model) else None,
             parallel_tool_calls=llm_config.parallel_tool_calls if tools and supports_parallel_tool_calling(model) else False,
         )
@@ -397,6 +396,10 @@ class OpenAIClient(LLMClientBase):
         # Handle text configuration (verbosity and response format)
         text_config_kwargs = {}
 
+        # Only set max_output_tokens if explicitly configured
+        if llm_config.max_tokens is not None:
+            data.max_output_tokens = llm_config.max_tokens
+
         # Add verbosity control for GPT-5 models
         if supports_verbosity_control(model) and llm_config.verbosity:
             text_config_kwargs["verbosity"] = llm_config.verbosity
@@ -451,7 +454,6 @@ class OpenAIClient(LLMClientBase):
         )
 
         request_data = data.model_dump(exclude_unset=True)
-        # print("responses request data", request_data)
         return request_data
 
     @trace_method
diff --git a/letta/server/rest_api/routers/v1/conversations.py b/letta/server/rest_api/routers/v1/conversations.py
index d7444d87..65af1d19 100644
--- a/letta/server/rest_api/routers/v1/conversations.py
+++ b/letta/server/rest_api/routers/v1/conversations.py
@@ -401,6 +401,10 @@ async def send_conversation_message(
         )
         if conversation.model_settings is not None:
             update_params = conversation.model_settings._to_legacy_config_params()
+            # Don't clobber max_tokens with the Pydantic default when the caller
+            # didn't explicitly provide max_output_tokens.
+            if "max_output_tokens" not in conversation.model_settings.model_fields_set:
+                update_params.pop("max_tokens", None)
             conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
         agent = agent.model_copy(update={"llm_config": conversation_llm_config})
 
diff --git a/letta/server/server.py b/letta/server/server.py
index 06ca99e6..ea6eef31 100644
--- a/letta/server/server.py
+++ b/letta/server/server.py
@@ -562,6 +562,10 @@ class SyncServer(object):
         # update with model_settings
         if request.model_settings is not None:
             update_llm_config_params = request.model_settings._to_legacy_config_params()
+            # Don't clobber max_tokens with the Pydantic default when the caller
+            # didn't explicitly provide max_output_tokens in the request.
+            if "max_output_tokens" not in request.model_settings.model_fields_set:
+                update_llm_config_params.pop("max_tokens", None)
             request.llm_config = request.llm_config.model_copy(update=update_llm_config_params)
 
         # Copy parallel_tool_calls from request to llm_config if provided
diff --git a/letta/services/conversation_manager.py b/letta/services/conversation_manager.py
index f499c587..814ffabf 100644
--- a/letta/services/conversation_manager.py
+++ b/letta/services/conversation_manager.py
@@ -30,6 +30,21 @@ from letta.utils import enforce_types
 class ConversationManager:
     """Manager class to handle business logic related to Conversations."""
 
+    @staticmethod
+    def _serialize_model_settings(model_settings) -> Optional[dict]:
+        """Serialize model settings for DB storage, stripping max_output_tokens if not explicitly set.
+
+        Uses model_dump() to preserve all fields (including the provider_type discriminator),
+        but removes max_output_tokens when it wasn't explicitly provided by the caller so we
+        don't persist the Pydantic default (4096) and later overwrite the agent's own value.
+        """
+        if model_settings is None:
+            return None
+        data = model_settings.model_dump()
+        if "max_output_tokens" not in model_settings.model_fields_set:
+            data.pop("max_output_tokens", None)
+        return data
+
     @enforce_types
     @trace_method
     async def create_conversation(
@@ -57,7 +72,7 @@ class ConversationManager:
                 summary=conversation_create.summary,
                 organization_id=actor.organization_id,
                 model=conversation_create.model,
-                model_settings=conversation_create.model_settings.model_dump() if conversation_create.model_settings else None,
+                model_settings=self._serialize_model_settings(conversation_create.model_settings),
             )
             await conversation.create_async(session, actor=actor)
 
@@ -228,22 +243,15 @@ class ConversationManager:
             if sort_by == "last_run_completion":
                 # Subquery to get the latest completed_at for each conversation
                 latest_run_subquery = (
-                    select(
-                        RunModel.conversation_id,
-                        func.max(RunModel.completed_at).label("last_run_completion")
-                    )
+                    select(RunModel.conversation_id, func.max(RunModel.completed_at).label("last_run_completion"))
                     .where(RunModel.conversation_id.isnot(None))
                     .group_by(RunModel.conversation_id)
                     .subquery()
                 )
 
                 # Join conversations with the subquery
-                stmt = (
-                    select(ConversationModel)
-                    .outerjoin(
-                        latest_run_subquery,
-                        ConversationModel.id == latest_run_subquery.c.conversation_id
-                    )
+                stmt = select(ConversationModel).outerjoin(
+                    latest_run_subquery, ConversationModel.id == latest_run_subquery.c.conversation_id
                 )
                 sort_column = latest_run_subquery.c.last_run_completion
                 sort_nulls_last = True
@@ -265,10 +273,12 @@ class ConversationManager:
 
             # Add summary search filter if provided
             if summary_search:
-                conditions.extend([
-                    ConversationModel.summary.isnot(None),
-                    ConversationModel.summary.contains(summary_search),
-                ])
+                conditions.extend(
+                    [
+                        ConversationModel.summary.isnot(None),
+                        ConversationModel.summary.contains(summary_search),
+                    ]
+                )
 
             stmt = stmt.where(and_(*conditions))
 
@@ -277,10 +287,7 @@ class ConversationManager:
                 # Get the sort value for the cursor conversation
                 if sort_by == "last_run_completion":
                     cursor_query = (
-                        select(
-                            ConversationModel.id,
-                            func.max(RunModel.completed_at).label("last_run_completion")
-                        )
+                        select(ConversationModel.id, func.max(RunModel.completed_at).label("last_run_completion"))
                         .outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id)
                         .where(ConversationModel.id == after)
                         .group_by(ConversationModel.id)
@@ -293,16 +300,11 @@ class ConversationManager:
                             # Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID
                             if ascending:
                                 stmt = stmt.where(
-                                    or_(
-                                        and_(sort_column.is_(None), ConversationModel.id > after_id),
-                                        sort_column.isnot(None)
-                                    )
+                                    or_(and_(sort_column.is_(None), ConversationModel.id > after_id), sort_column.isnot(None))
                                 )
                             else:
                                 # If descending, get NULLs with smaller ID
-                                stmt = stmt.where(
-                                    and_(sort_column.is_(None), ConversationModel.id < after_id)
-                                )
+                                stmt = stmt.where(and_(sort_column.is_(None), ConversationModel.id < after_id))
                         else:
                             # Cursor is at non-NULL
                             if ascending:
@@ -312,8 +314,8 @@ class ConversationManager:
                                         sort_column.isnot(None),
                                         or_(
                                             sort_column > after_sort_value,
-                                            and_(sort_column == after_sort_value, ConversationModel.id > after_id)
-                                        )
+                                            and_(sort_column == after_sort_value, ConversationModel.id > after_id),
+                                        ),
                                     )
                                 )
                             else:
@@ -322,7 +324,7 @@ class ConversationManager:
                                     or_(
                                         sort_column.is_(None),
                                         sort_column < after_sort_value,
-                                        and_(sort_column == after_sort_value, ConversationModel.id < after_id)
+                                        and_(sort_column == after_sort_value, ConversationModel.id < after_id),
                                     )
                                 )
                 else:
@@ -372,7 +374,11 @@ class ConversationManager:
             for key, value in update_data.items():
                 # model_settings needs to be serialized to dict for the JSON column
                 if key == "model_settings" and value is not None:
-                    setattr(conversation, key, conversation_update.model_settings.model_dump() if conversation_update.model_settings else value)
+                    setattr(
+                        conversation,
+                        key,
+                        self._serialize_model_settings(conversation_update.model_settings) if conversation_update.model_settings else value,
+                    )
                 else:
                     setattr(conversation, key, value)
 
diff --git a/letta/services/streaming_service.py b/letta/services/streaming_service.py
index 64fdd346..9bb9901e 100644
--- a/letta/services/streaming_service.py
+++ b/letta/services/streaming_service.py
@@ -119,6 +119,10 @@ class StreamingService:
                 )
                 if conversation.model_settings is not None:
                     update_params = conversation.model_settings._to_legacy_config_params()
+                    # Don't clobber max_tokens with the Pydantic default when the caller
+                    # didn't explicitly provide max_output_tokens.
+                    if "max_output_tokens" not in conversation.model_settings.model_fields_set:
+                        update_params.pop("max_tokens", None)
                     conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
                 agent = agent.model_copy(update={"llm_config": conversation_llm_config})
 
diff --git a/letta/services/summarizer/compact.py b/letta/services/summarizer/compact.py
index 6b581628..b87d18bb 100644
--- a/letta/services/summarizer/compact.py
+++ b/letta/services/summarizer/compact.py
@@ -96,6 +96,10 @@ async def build_summarizer_llm_config(
         # them just like server.create_agent_async does for agents.
         if summarizer_config.model_settings is not None:
             update_params = summarizer_config.model_settings._to_legacy_config_params()
+            # Don't clobber max_tokens with the Pydantic default when the caller
+            # didn't explicitly provide max_output_tokens.
+            if "max_output_tokens" not in summarizer_config.model_settings.model_fields_set:
+                update_params.pop("max_tokens", None)
             return base.model_copy(update=update_params)
 
         return base