From 612a2ae98bb6990d3330453c5c8500a96f2cb3d1 Mon Sep 17 00:00:00 2001 From: amysguan <64990783+amysguan@users.noreply.github.com> Date: Fri, 27 Feb 2026 17:10:46 -0800 Subject: [PATCH] Fix: Change Z.ai context window to account for max_token subtraction (#9710) fix zai context window (functionally [advertised context window] - [max output tokens]) and properly pass in max tokens so Z.ai doesn't default to 65k for GLM-5 --- letta/constants.py | 8 ++++---- letta/llm_api/zai_client.py | 6 ++++++ letta/schemas/providers/zai.py | 9 +++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/letta/constants.py b/letta/constants.py index 845190c7..2e1748e9 100644 --- a/letta/constants.py +++ b/letta/constants.py @@ -253,10 +253,10 @@ LLM_MAX_CONTEXT_WINDOW = { "deepseek-reasoner": 64000, # glm (Z.AI) "glm-4.5": 128000, - "glm-4.6": 200000, - "glm-4.7": 200000, - "glm-5": 200000, - "glm-5-code": 200000, + "glm-4.6": 180000, + "glm-4.7": 180000, + "glm-5": 180000, + "glm-5-code": 180000, ## OpenAI models: https://platform.openai.com/docs/models/overview # gpt-5 "gpt-5": 272000, diff --git a/letta/llm_api/zai_client.py b/letta/llm_api/zai_client.py index 87d577ef..19e5eeae 100644 --- a/letta/llm_api/zai_client.py +++ b/letta/llm_api/zai_client.py @@ -68,6 +68,12 @@ class ZAIClient(OpenAIClient): } } + # Z.ai's API uses max_tokens, not max_completion_tokens. + # If max_completion_tokens is sent, Z.ai ignores it and falls back to its + # default of 65536, silently truncating input to ~137K of the 200K context window. 
+ if "max_completion_tokens" in data: + data["max_tokens"] = data.pop("max_completion_tokens") + # Sanitize empty text content — ZAI rejects empty text blocks if "messages" in data: for msg in data["messages"]: diff --git a/letta/schemas/providers/zai.py b/letta/schemas/providers/zai.py index 8682e4b1..1ff30872 100644 --- a/letta/schemas/providers/zai.py +++ b/letta/schemas/providers/zai.py @@ -12,12 +12,13 @@ from letta.schemas.providers.openai import OpenAIProvider # Z.ai model context windows # Reference: https://docs.z.ai/ +# GLM-5 max context window is 200K tokens but max_output_tokens (default 16k, rounded up to a 20K reserve for headroom) counts against that --> 180k MODEL_CONTEXT_WINDOWS = { "glm-4.5": 128000, - "glm-4.6": 200000, - "glm-4.7": 200000, - "glm-5": 200000, - "glm-5-code": 200000, + "glm-4.6": 180000, + "glm-4.7": 180000, + "glm-5": 180000, + "glm-5-code": 180000, }