fix(core): prevent ModelSettings default max_output_tokens from overriding agent config (#9739)

* fix(core): prevent ModelSettings default max_output_tokens from overriding agent config. When a conversation's model_settings were saved, the Pydantic default of max_output_tokens=4096 was always persisted to the DB even when the client never specified it. On subsequent messages, this default would overwrite the agent's max_tokens (typically None) with 4096, silently capping output. Two changes: 1. Use model_dump(exclude_unset=True) when persisting model_settings to the DB so Pydantic defaults are not saved. 2. Add model_fields_set guards at all call sites that apply _to_legacy_config_params() to skip max_tokens when it was not explicitly provided by the caller. Also conditionally set max_output_tokens in the OpenAI Responses API request builder so None is not sent as null (which some models treat as a hard 4096 cap). * nit * Fix model_settings serialization to preserve the provider_type discriminator: replace the blanket exclude_unset=True with targeted removal of only max_output_tokens when it is not explicitly set. The previous approach stripped the provider_type field (a Literal with a default), which broke discriminated-union deserialization when reading back from the DB.
2026-03-03 11:48:19 -08:00
parent 8335aa0fa0
commit db9e0f42af
6 changed files with 56 additions and 32 deletions
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -389,7 +389,6 @@ class OpenAIClient(LLMClientBase):
            input=openai_messages_list,
            tools=responses_tools,
            tool_choice=tool_choice,
-            max_output_tokens=llm_config.max_tokens,
            temperature=llm_config.temperature if supports_temperature_param(model) else None,
            parallel_tool_calls=llm_config.parallel_tool_calls if tools and supports_parallel_tool_calling(model) else False,
        )
@@ -397,6 +396,10 @@ class OpenAIClient(LLMClientBase):
        # Handle text configuration (verbosity and response format)
        text_config_kwargs = {}

+        # Only set max_output_tokens if explicitly configured
+        if llm_config.max_tokens is not None:
+            data.max_output_tokens = llm_config.max_tokens
+
        # Add verbosity control for GPT-5 models
        if supports_verbosity_control(model) and llm_config.verbosity:
            text_config_kwargs["verbosity"] = llm_config.verbosity
@@ -451,7 +454,6 @@ class OpenAIClient(LLMClientBase):
        )

        request_data = data.model_dump(exclude_unset=True)
-        # print("responses request data", request_data)
        return request_data

    @trace_method
--- a/letta/server/rest_api/routers/v1/conversations.py
+++ b/letta/server/rest_api/routers/v1/conversations.py
@@ -401,6 +401,10 @@ async def send_conversation_message(
        )
        if conversation.model_settings is not None:
            update_params = conversation.model_settings._to_legacy_config_params()
+            # Don't clobber max_tokens with the Pydantic default when the caller
+            # didn't explicitly provide max_output_tokens.
+            if "max_output_tokens" not in conversation.model_settings.model_fields_set:
+                update_params.pop("max_tokens", None)
            conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
        agent = agent.model_copy(update={"llm_config": conversation_llm_config})

--- a/letta/server/server.py
+++ b/letta/server/server.py
@@ -562,6 +562,10 @@ class SyncServer(object):
        # update with model_settings
        if request.model_settings is not None:
            update_llm_config_params = request.model_settings._to_legacy_config_params()
+            # Don't clobber max_tokens with the Pydantic default when the caller
+            # didn't explicitly provide max_output_tokens in the request.
+            if "max_output_tokens" not in request.model_settings.model_fields_set:
+                update_llm_config_params.pop("max_tokens", None)
            request.llm_config = request.llm_config.model_copy(update=update_llm_config_params)

        # Copy parallel_tool_calls from request to llm_config if provided
--- a/letta/services/conversation_manager.py
+++ b/letta/services/conversation_manager.py
@@ -30,6 +30,21 @@ from letta.utils import enforce_types
 class ConversationManager:
    """Manager class to handle business logic related to Conversations."""

+    @staticmethod
+    def _serialize_model_settings(model_settings) -> Optional[dict]:
+        """Serialize model settings for DB storage, stripping max_output_tokens if not explicitly set.
+
+        Returns None when model_settings is None. Otherwise dumps every field via model_dump()
+        (keeping the provider_type discriminator) and drops max_output_tokens when it is absent
+        from model_fields_set, so the Pydantic default (4096) is never persisted as an override.
+        """
+        if model_settings is None:
+            return None
+        data = model_settings.model_dump()
+        if "max_output_tokens" not in model_settings.model_fields_set:
+            data.pop("max_output_tokens", None)
+        return data
+
    @enforce_types
    @trace_method
    async def create_conversation(
@@ -57,7 +72,7 @@ class ConversationManager:
                summary=conversation_create.summary,
                organization_id=actor.organization_id,
                model=conversation_create.model,
-                model_settings=conversation_create.model_settings.model_dump() if conversation_create.model_settings else None,
+                model_settings=self._serialize_model_settings(conversation_create.model_settings),
            )
            await conversation.create_async(session, actor=actor)

@@ -228,22 +243,15 @@ class ConversationManager:
            if sort_by == "last_run_completion":
                # Subquery to get the latest completed_at for each conversation
                latest_run_subquery = (
-                    select(
-                        RunModel.conversation_id,
-                        func.max(RunModel.completed_at).label("last_run_completion")
-                    )
+                    select(RunModel.conversation_id, func.max(RunModel.completed_at).label("last_run_completion"))
                    .where(RunModel.conversation_id.isnot(None))
                    .group_by(RunModel.conversation_id)
                    .subquery()
                )

                # Join conversations with the subquery
-                stmt = (
-                    select(ConversationModel)
-                    .outerjoin(
-                        latest_run_subquery,
-                        ConversationModel.id == latest_run_subquery.c.conversation_id
-                    )
+                stmt = select(ConversationModel).outerjoin(
+                    latest_run_subquery, ConversationModel.id == latest_run_subquery.c.conversation_id
                )
                sort_column = latest_run_subquery.c.last_run_completion
                sort_nulls_last = True
@@ -265,10 +273,12 @@ class ConversationManager:

            # Add summary search filter if provided
            if summary_search:
-                conditions.extend([
+                conditions.extend(
+                    [
                        ConversationModel.summary.isnot(None),
                        ConversationModel.summary.contains(summary_search),
-                ])
+                    ]
+                )

            stmt = stmt.where(and_(*conditions))

@@ -277,10 +287,7 @@ class ConversationManager:
                # Get the sort value for the cursor conversation
                if sort_by == "last_run_completion":
                    cursor_query = (
-                        select(
-                            ConversationModel.id,
-                            func.max(RunModel.completed_at).label("last_run_completion")
-                        )
+                        select(ConversationModel.id, func.max(RunModel.completed_at).label("last_run_completion"))
                        .outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id)
                        .where(ConversationModel.id == after)
                        .group_by(ConversationModel.id)
@@ -293,16 +300,11 @@ class ConversationManager:
                            # Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID
                            if ascending:
                                stmt = stmt.where(
-                                    or_(
-                                        and_(sort_column.is_(None), ConversationModel.id > after_id),
-                                        sort_column.isnot(None)
-                                    )
+                                    or_(and_(sort_column.is_(None), ConversationModel.id > after_id), sort_column.isnot(None))
                                )
                            else:
                                # If descending, get NULLs with smaller ID
-                                stmt = stmt.where(
-                                    and_(sort_column.is_(None), ConversationModel.id < after_id)
-                                )
+                                stmt = stmt.where(and_(sort_column.is_(None), ConversationModel.id < after_id))
                        else:
                            # Cursor is at non-NULL
                            if ascending:
@@ -312,8 +314,8 @@ class ConversationManager:
                                        sort_column.isnot(None),
                                        or_(
                                            sort_column > after_sort_value,
-                                            and_(sort_column == after_sort_value, ConversationModel.id > after_id)
-                                        )
+                                            and_(sort_column == after_sort_value, ConversationModel.id > after_id),
+                                        ),
                                    )
                                )
                            else:
@@ -322,7 +324,7 @@ class ConversationManager:
                                    or_(
                                        sort_column.is_(None),
                                        sort_column < after_sort_value,
-                                        and_(sort_column == after_sort_value, ConversationModel.id < after_id)
+                                        and_(sort_column == after_sort_value, ConversationModel.id < after_id),
                                    )
                                )
                else:
@@ -372,7 +374,11 @@ class ConversationManager:
            for key, value in update_data.items():
                # model_settings needs to be serialized to dict for the JSON column
                if key == "model_settings" and value is not None:
-                    setattr(conversation, key, conversation_update.model_settings.model_dump() if conversation_update.model_settings else value)
+                    setattr(
+                        conversation,
+                        key,
+                        self._serialize_model_settings(conversation_update.model_settings) if conversation_update.model_settings else value,
+                    )
                else:
                    setattr(conversation, key, value)

--- a/letta/services/streaming_service.py
+++ b/letta/services/streaming_service.py
@@ -119,6 +119,10 @@ class StreamingService:
                )
                if conversation.model_settings is not None:
                    update_params = conversation.model_settings._to_legacy_config_params()
+                    # Don't clobber max_tokens with the Pydantic default when the caller
+                    # didn't explicitly provide max_output_tokens.
+                    if "max_output_tokens" not in conversation.model_settings.model_fields_set:
+                        update_params.pop("max_tokens", None)
                    conversation_llm_config = conversation_llm_config.model_copy(update=update_params)
                agent = agent.model_copy(update={"llm_config": conversation_llm_config})

--- a/letta/services/summarizer/compact.py
+++ b/letta/services/summarizer/compact.py
@@ -96,6 +96,10 @@ async def build_summarizer_llm_config(
        # them just like server.create_agent_async does for agents.
        if summarizer_config.model_settings is not None:
            update_params = summarizer_config.model_settings._to_legacy_config_params()
+            # Don't clobber max_tokens with the Pydantic default when the caller
+            # didn't explicitly provide max_output_tokens.
+            if "max_output_tokens" not in summarizer_config.model_settings.model_fields_set:
+                update_params.pop("max_tokens", None)
            return base.model_copy(update=update_params)

        return base