Fix: Change Z.ai context window to account for max_token subtraction (#9710)
Fix the Z.ai context window (effectively [advertised context window] − [max output tokens]) and properly pass max_tokens so Z.ai doesn't fall back to its 65k default output limit for GLM-5.
This commit is contained in:
@@ -253,10 +253,10 @@ LLM_MAX_CONTEXT_WINDOW = {
|
||||
"deepseek-reasoner": 64000,
|
||||
# glm (Z.AI)
|
||||
"glm-4.5": 128000,
|
||||
"glm-4.6": 200000,
|
||||
"glm-4.7": 200000,
|
||||
"glm-5": 200000,
|
||||
"glm-5-code": 200000,
|
||||
"glm-4.6": 180000,
|
||||
"glm-4.7": 180000,
|
||||
"glm-5": 180000,
|
||||
"glm-5-code": 180000,
|
||||
## OpenAI models: https://platform.openai.com/docs/models/overview
|
||||
# gpt-5
|
||||
"gpt-5": 272000,
|
||||
|
||||
@@ -68,6 +68,12 @@ class ZAIClient(OpenAIClient):
|
||||
}
|
||||
}
|
||||
|
||||
# Z.ai's API uses max_tokens, not max_completion_tokens.
|
||||
# If max_completion_tokens is sent, Z.ai ignores it and falls back to its
|
||||
# default of 65536, silently truncating input to ~134K of the 200K context window.
|
||||
if "max_completion_tokens" in data:
|
||||
data["max_tokens"] = data.pop("max_completion_tokens")
|
||||
|
||||
# Sanitize empty text content — ZAI rejects empty text blocks
|
||||
if "messages" in data:
|
||||
for msg in data["messages"]:
|
||||
|
||||
@@ -12,12 +12,13 @@ from letta.schemas.providers.openai import OpenAIProvider
|
||||
|
||||
# Z.ai model context windows
|
||||
# Reference: https://docs.z.ai/
|
||||
# GLM-5's max context window is 200K tokens, but max_output_tokens (default 16k) counts against that, so we advertise 180k to leave headroom.
|
||||
MODEL_CONTEXT_WINDOWS = {
|
||||
"glm-4.5": 128000,
|
||||
"glm-4.6": 200000,
|
||||
"glm-4.7": 200000,
|
||||
"glm-5": 200000,
|
||||
"glm-5-code": 200000,
|
||||
"glm-4.6": 180000,
|
||||
"glm-4.7": 180000,
|
||||
"glm-5": 180000,
|
||||
"glm-5-code": 180000,
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user