From db9e0f42afe5c97d13f5f5836147193606d9411e Mon Sep 17 00:00:00 2001
From: jnjpng
Date: Tue, 3 Mar 2026 11:48:19 -0800
Subject: [PATCH] fix(core): prevent ModelSettings default max_output_tokens from overriding agent config (#9739)

* fix(core): prevent ModelSettings default max_output_tokens from overriding agent config

When a conversation's model_settings were saved, the Pydantic default of
max_output_tokens=4096 was always persisted to the DB even when the client
never specified it. On subsequent messages, this default would overwrite the
agent's max_tokens (typically None) with 4096, silently capping output.

Two changes:
1. Use model_dump(exclude_unset=True) when persisting model_settings to the DB
   so Pydantic defaults are not saved.
2. Add model_fields_set guards at all callsites that apply
   _to_legacy_config_params() to skip max_tokens when it was not explicitly
   provided by the caller.

Also conditionally set max_output_tokens in the OpenAI Responses API request
builder so None is not sent as null (which some models treat as a hard 4096
cap).

* nit

* Fix model_settings serialization to preserve provider_type discriminator

Replace blanket exclude_unset=True with targeted removal of only
max_output_tokens when not explicitly set. The previous approach stripped the
provider_type field (a Literal with a default), which broke discriminated
union deserialization when reading back from DB.
--- letta/llm_api/openai_client.py | 6 +- .../rest_api/routers/v1/conversations.py | 4 ++ letta/server/server.py | 4 ++ letta/services/conversation_manager.py | 66 ++++++++++--------- letta/services/streaming_service.py | 4 ++ letta/services/summarizer/compact.py | 4 ++ 6 files changed, 56 insertions(+), 32 deletions(-) diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py index 3ef03792..e615315b 100644 --- a/letta/llm_api/openai_client.py +++ b/letta/llm_api/openai_client.py @@ -389,7 +389,6 @@ class OpenAIClient(LLMClientBase): input=openai_messages_list, tools=responses_tools, tool_choice=tool_choice, - max_output_tokens=llm_config.max_tokens, temperature=llm_config.temperature if supports_temperature_param(model) else None, parallel_tool_calls=llm_config.parallel_tool_calls if tools and supports_parallel_tool_calling(model) else False, ) @@ -397,6 +396,10 @@ class OpenAIClient(LLMClientBase): # Handle text configuration (verbosity and response format) text_config_kwargs = {} + # Only set max_output_tokens if explicitly configured + if llm_config.max_tokens is not None: + data.max_output_tokens = llm_config.max_tokens + # Add verbosity control for GPT-5 models if supports_verbosity_control(model) and llm_config.verbosity: text_config_kwargs["verbosity"] = llm_config.verbosity @@ -451,7 +454,6 @@ class OpenAIClient(LLMClientBase): ) request_data = data.model_dump(exclude_unset=True) - # print("responses request data", request_data) return request_data @trace_method diff --git a/letta/server/rest_api/routers/v1/conversations.py b/letta/server/rest_api/routers/v1/conversations.py index d7444d87..65af1d19 100644 --- a/letta/server/rest_api/routers/v1/conversations.py +++ b/letta/server/rest_api/routers/v1/conversations.py @@ -401,6 +401,10 @@ async def send_conversation_message( ) if conversation.model_settings is not None: update_params = conversation.model_settings._to_legacy_config_params() + # Don't clobber max_tokens with the Pydantic 
default when the caller + # didn't explicitly provide max_output_tokens. + if "max_output_tokens" not in conversation.model_settings.model_fields_set: + update_params.pop("max_tokens", None) conversation_llm_config = conversation_llm_config.model_copy(update=update_params) agent = agent.model_copy(update={"llm_config": conversation_llm_config}) diff --git a/letta/server/server.py b/letta/server/server.py index 06ca99e6..ea6eef31 100644 --- a/letta/server/server.py +++ b/letta/server/server.py @@ -562,6 +562,10 @@ class SyncServer(object): # update with model_settings if request.model_settings is not None: update_llm_config_params = request.model_settings._to_legacy_config_params() + # Don't clobber max_tokens with the Pydantic default when the caller + # didn't explicitly provide max_output_tokens in the request. + if "max_output_tokens" not in request.model_settings.model_fields_set: + update_llm_config_params.pop("max_tokens", None) request.llm_config = request.llm_config.model_copy(update=update_llm_config_params) # Copy parallel_tool_calls from request to llm_config if provided diff --git a/letta/services/conversation_manager.py b/letta/services/conversation_manager.py index f499c587..814ffabf 100644 --- a/letta/services/conversation_manager.py +++ b/letta/services/conversation_manager.py @@ -30,6 +30,21 @@ from letta.utils import enforce_types class ConversationManager: """Manager class to handle business logic related to Conversations.""" + @staticmethod + def _serialize_model_settings(model_settings) -> Optional[dict]: + """Serialize model settings for DB storage, stripping max_output_tokens if not explicitly set. + + Uses model_dump() to preserve all fields (including the provider_type discriminator), + but removes max_output_tokens when it wasn't explicitly provided by the caller so we + don't persist the Pydantic default (4096) and later overwrite the agent's own value. 
+ """ + if model_settings is None: + return None + data = model_settings.model_dump() + if "max_output_tokens" not in model_settings.model_fields_set: + data.pop("max_output_tokens", None) + return data + @enforce_types @trace_method async def create_conversation( @@ -57,7 +72,7 @@ class ConversationManager: summary=conversation_create.summary, organization_id=actor.organization_id, model=conversation_create.model, - model_settings=conversation_create.model_settings.model_dump() if conversation_create.model_settings else None, + model_settings=self._serialize_model_settings(conversation_create.model_settings), ) await conversation.create_async(session, actor=actor) @@ -228,22 +243,15 @@ class ConversationManager: if sort_by == "last_run_completion": # Subquery to get the latest completed_at for each conversation latest_run_subquery = ( - select( - RunModel.conversation_id, - func.max(RunModel.completed_at).label("last_run_completion") - ) + select(RunModel.conversation_id, func.max(RunModel.completed_at).label("last_run_completion")) .where(RunModel.conversation_id.isnot(None)) .group_by(RunModel.conversation_id) .subquery() ) # Join conversations with the subquery - stmt = ( - select(ConversationModel) - .outerjoin( - latest_run_subquery, - ConversationModel.id == latest_run_subquery.c.conversation_id - ) + stmt = select(ConversationModel).outerjoin( + latest_run_subquery, ConversationModel.id == latest_run_subquery.c.conversation_id ) sort_column = latest_run_subquery.c.last_run_completion sort_nulls_last = True @@ -265,10 +273,12 @@ class ConversationManager: # Add summary search filter if provided if summary_search: - conditions.extend([ - ConversationModel.summary.isnot(None), - ConversationModel.summary.contains(summary_search), - ]) + conditions.extend( + [ + ConversationModel.summary.isnot(None), + ConversationModel.summary.contains(summary_search), + ] + ) stmt = stmt.where(and_(*conditions)) @@ -277,10 +287,7 @@ class ConversationManager: # Get the sort 
value for the cursor conversation if sort_by == "last_run_completion": cursor_query = ( - select( - ConversationModel.id, - func.max(RunModel.completed_at).label("last_run_completion") - ) + select(ConversationModel.id, func.max(RunModel.completed_at).label("last_run_completion")) .outerjoin(RunModel, ConversationModel.id == RunModel.conversation_id) .where(ConversationModel.id == after) .group_by(ConversationModel.id) @@ -293,16 +300,11 @@ class ConversationManager: # Cursor is at NULL - if ascending, get non-NULLs or NULLs with greater ID if ascending: stmt = stmt.where( - or_( - and_(sort_column.is_(None), ConversationModel.id > after_id), - sort_column.isnot(None) - ) + or_(and_(sort_column.is_(None), ConversationModel.id > after_id), sort_column.isnot(None)) ) else: # If descending, get NULLs with smaller ID - stmt = stmt.where( - and_(sort_column.is_(None), ConversationModel.id < after_id) - ) + stmt = stmt.where(and_(sort_column.is_(None), ConversationModel.id < after_id)) else: # Cursor is at non-NULL if ascending: @@ -312,8 +314,8 @@ class ConversationManager: sort_column.isnot(None), or_( sort_column > after_sort_value, - and_(sort_column == after_sort_value, ConversationModel.id > after_id) - ) + and_(sort_column == after_sort_value, ConversationModel.id > after_id), + ), ) ) else: @@ -322,7 +324,7 @@ class ConversationManager: or_( sort_column.is_(None), sort_column < after_sort_value, - and_(sort_column == after_sort_value, ConversationModel.id < after_id) + and_(sort_column == after_sort_value, ConversationModel.id < after_id), ) ) else: @@ -372,7 +374,11 @@ class ConversationManager: for key, value in update_data.items(): # model_settings needs to be serialized to dict for the JSON column if key == "model_settings" and value is not None: - setattr(conversation, key, conversation_update.model_settings.model_dump() if conversation_update.model_settings else value) + setattr( + conversation, + key, + 
self._serialize_model_settings(conversation_update.model_settings) if conversation_update.model_settings else value, + ) else: setattr(conversation, key, value) diff --git a/letta/services/streaming_service.py b/letta/services/streaming_service.py index 64fdd346..9bb9901e 100644 --- a/letta/services/streaming_service.py +++ b/letta/services/streaming_service.py @@ -119,6 +119,10 @@ class StreamingService: ) if conversation.model_settings is not None: update_params = conversation.model_settings._to_legacy_config_params() + # Don't clobber max_tokens with the Pydantic default when the caller + # didn't explicitly provide max_output_tokens. + if "max_output_tokens" not in conversation.model_settings.model_fields_set: + update_params.pop("max_tokens", None) conversation_llm_config = conversation_llm_config.model_copy(update=update_params) agent = agent.model_copy(update={"llm_config": conversation_llm_config}) diff --git a/letta/services/summarizer/compact.py b/letta/services/summarizer/compact.py index 6b581628..b87d18bb 100644 --- a/letta/services/summarizer/compact.py +++ b/letta/services/summarizer/compact.py @@ -96,6 +96,10 @@ async def build_summarizer_llm_config( # them just like server.create_agent_async does for agents. if summarizer_config.model_settings is not None: update_params = summarizer_config.model_settings._to_legacy_config_params() + # Don't clobber max_tokens with the Pydantic default when the caller + # didn't explicitly provide max_output_tokens. + if "max_output_tokens" not in summarizer_config.model_settings.model_fields_set: + update_params.pop("max_tokens", None) return base.model_copy(update=update_params) return base