From 612a2ae98bb6990d3330453c5c8500a96f2cb3d1 Mon Sep 17 00:00:00 2001 From: amysguan <64990783+amysguan@users.noreply.github.com> Date: Fri, 27 Feb 2026 17:10:46 -0800 Subject: [PATCH] Fix: Change Z.ai context window to account for max_token subtraction (#9710) fix zai context window (functionally [advertised context window] - [max output tokens]) and properly pass in max tokens so Z.ai doesn't default to 65k for GLM-5 --- letta/constants.py | 8 ++++---- letta/llm_api/zai_client.py | 6 ++++++ letta/schemas/providers/zai.py | 9 +++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/letta/constants.py b/letta/constants.py index 845190c7..2e1748e9 100644 --- a/letta/constants.py +++ b/letta/constants.py @@ -253,10 +253,10 @@ LLM_MAX_CONTEXT_WINDOW = { "deepseek-reasoner": 64000, # glm (Z.AI) "glm-4.5": 128000, - "glm-4.6": 200000, - "glm-4.7": 200000, - "glm-5": 200000, - "glm-5-code": 200000, + "glm-4.6": 180000, + "glm-4.7": 180000, + "glm-5": 180000, + "glm-5-code": 180000, ## OpenAI models: https://platform.openai.com/docs/models/overview # gpt-5 "gpt-5": 272000, diff --git a/letta/llm_api/zai_client.py b/letta/llm_api/zai_client.py index 87d577ef..19e5eeae 100644 --- a/letta/llm_api/zai_client.py +++ b/letta/llm_api/zai_client.py @@ -68,6 +68,12 @@ class ZAIClient(OpenAIClient): } } + # Z.ai's API uses max_tokens, not max_completion_tokens. + # If max_completion_tokens is sent, Z.ai ignores it and falls back to its + # default of 65536, silently truncating input to ~137K of the 200K context window. 
+ if "max_completion_tokens" in data: + data["max_tokens"] = data.pop("max_completion_tokens") + # Sanitize empty text content — ZAI rejects empty text blocks if "messages" in data: for msg in data["messages"]: diff --git a/letta/schemas/providers/zai.py b/letta/schemas/providers/zai.py index 8682e4b1..1ff30872 100644 --- a/letta/schemas/providers/zai.py +++ b/letta/schemas/providers/zai.py @@ -12,12 +12,13 @@ from letta.schemas.providers.openai import OpenAIProvider # Z.ai model context windows # Reference: https://docs.z.ai/ +# GLM-5 max context window is 200K tokens but max_output_tokens (default 16k, rounded up to a 20K reserve for headroom) counts against that --> 180k MODEL_CONTEXT_WINDOWS = { "glm-4.5": 128000, - "glm-4.6": 200000, - "glm-4.7": 200000, - "glm-5": 200000, - "glm-5-code": 200000, + "glm-4.6": 180000, + "glm-4.7": 180000, + "glm-5": 180000, + "glm-5-code": 180000, }