diff --git a/fern/openapi.json b/fern/openapi.json index dc495cc5..88c0fe28 100644 --- a/fern/openapi.json +++ b/fern/openapi.json @@ -46358,12 +46358,40 @@ ], "title": "Response Format", "description": "The response format for the model." + }, + "thinking": { + "$ref": "#/components/schemas/ZAIThinking", + "description": "The thinking configuration for GLM-4.5+ models.", + "default": { + "type": "enabled", + "clear_thinking": false + } } }, "type": "object", "title": "ZAIModelSettings", "description": "Z.ai (ZhipuAI) model configuration (OpenAI-compatible)." }, + "ZAIThinking": { + "properties": { + "type": { + "type": "string", + "enum": ["enabled", "disabled"], + "title": "Type", + "description": "Whether thinking is enabled or disabled.", + "default": "enabled" + }, + "clear_thinking": { + "type": "boolean", + "title": "Clear Thinking", + "description": "If False, preserved thinking is used (recommended for agents).", + "default": false + } + }, + "type": "object", + "title": "ZAIThinking", + "description": "Thinking configuration for ZAI GLM-4.5+ models." + }, "letta__schemas__agent_file__AgentSchema": { "properties": { "name": { diff --git a/letta/interfaces/openai_streaming_interface.py b/letta/interfaces/openai_streaming_interface.py index ca3602df..bf0a2938 100644 --- a/letta/interfaces/openai_streaming_interface.py +++ b/letta/interfaces/openai_streaming_interface.py @@ -887,14 +887,10 @@ class SimpleOpenAIStreamingInterface: prev_message_type = assistant_msg.message_type yield assistant_msg - if ( - hasattr(chunk, "choices") - and len(chunk.choices) > 0 - and hasattr(chunk.choices[0], "delta") - and hasattr(chunk.choices[0].delta, "reasoning_content") - ): + if hasattr(chunk, "choices") and len(chunk.choices) > 0 and hasattr(chunk.choices[0], "delta"): delta = chunk.choices[0].delta - reasoning_content = getattr(delta, "reasoning_content", None) + # Check for reasoning_content (standard) or reasoning (OpenRouter) + reasoning_content = getattr(delta, "reasoning_content", None) or getattr(delta, "reasoning", None) if reasoning_content is not None and reasoning_content != "": if prev_message_type and prev_message_type != "reasoning_message": message_index += 1 diff --git a/letta/llm_api/openai_client.py b/letta/llm_api/openai_client.py index ca26c768..8560d86b 100644 --- a/letta/llm_api/openai_client.py +++ b/letta/llm_api/openai_client.py @@ -564,6 +564,17 @@ class OpenAIClient(LLMClientBase): # If set, then in the backend "medium" thinking is turned on # request_data["reasoning_effort"] = "medium" + # Add OpenRouter reasoning configuration via extra_body + if is_openrouter and llm_config.enable_reasoner: + reasoning_config = {} + if llm_config.reasoning_effort: + reasoning_config["effort"] = llm_config.reasoning_effort + if llm_config.max_reasoning_tokens and llm_config.max_reasoning_tokens > 0: + reasoning_config["max_tokens"] = llm_config.max_reasoning_tokens + if not reasoning_config: + reasoning_config = {"enabled": True} + request_data["extra_body"] = {"reasoning": reasoning_config} + return request_data @trace_method @@ -765,12 +776,12 @@ class OpenAIClient(LLMClientBase): ): if "choices" in response_data and len(response_data["choices"]) > 0: choice_data = response_data["choices"][0] - if "message" in choice_data and "reasoning_content" in choice_data["message"]: - reasoning_content = choice_data["message"]["reasoning_content"] - if reasoning_content: - chat_completion_response.choices[0].message.reasoning_content = reasoning_content - - chat_completion_response.choices[0].message.reasoning_content_signature = None + message_data = choice_data.get("message", {}) + # Check for reasoning_content (standard) or reasoning (OpenRouter) + reasoning_content = message_data.get("reasoning_content") or message_data.get("reasoning") + if reasoning_content: + chat_completion_response.choices[0].message.reasoning_content = reasoning_content + chat_completion_response.choices[0].message.reasoning_content_signature = None # Unpack inner thoughts if they were embedded in function arguments if llm_config.put_inner_thoughts_in_kwargs: diff --git a/letta/llm_api/zai_client.py b/letta/llm_api/zai_client.py index 9eec79c2..c7e3d059 100644 --- a/letta/llm_api/zai_client.py +++ b/letta/llm_api/zai_client.py @@ -1,4 +1,3 @@ -import os from typing import List, Optional from openai import AsyncOpenAI, AsyncStream, OpenAI @@ -11,9 +10,15 @@ from letta.schemas.embedding_config import EmbeddingConfig from letta.schemas.enums import AgentType from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as PydanticMessage +from letta.schemas.openai.chat_completion_response import ChatCompletionResponse from letta.settings import model_settings +def is_zai_reasoning_model(model_name: str) -> bool: + """Check if the model is a ZAI reasoning model (GLM-4.5+).""" + return model_name.startswith("glm-4.5") or model_name.startswith("glm-4.6") or model_name.startswith("glm-4.7") + + class ZAIClient(OpenAIClient): """Z.ai (ZhipuAI) client - uses OpenAI-compatible API.""" @@ -23,6 +28,10 @@ class ZAIClient(OpenAIClient): def supports_structured_output(self, llm_config: LLMConfig) -> bool: return False + def is_reasoning_model(self, llm_config: LLMConfig) -> bool: + """Returns True if the model is a ZAI reasoning model (GLM-4.5+).""" + return is_zai_reasoning_model(llm_config.model) + @trace_method def build_request_data( self, @@ -35,6 +44,24 @@ class ZAIClient(OpenAIClient): tool_return_truncation_chars: Optional[int] = None, ) -> dict: data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call) + + # Add thinking configuration for ZAI GLM-4.5+ models + # Must explicitly send type: "disabled" when reasoning is off, as GLM-4.7 has thinking on by default + if self.is_reasoning_model(llm_config): + if llm_config.enable_reasoner: + data["extra_body"] = { + "thinking": { + "type": "enabled", + "clear_thinking": False, # Preserved thinking for agents + } + } + else: + data["extra_body"] = { + "thinking": { + "type": "disabled", + } + } + return data @trace_method @@ -79,3 +106,39 @@ class ZAIClient(OpenAIClient): response = await client.embeddings.create(model=embedding_config.embedding_model, input=inputs) return [r.embedding for r in response.data] + + @trace_method + async def convert_response_to_chat_completion( + self, + response_data: dict, + input_messages: List[PydanticMessage], + llm_config: LLMConfig, + ) -> ChatCompletionResponse: + """ + Converts raw ZAI response dict into the ChatCompletionResponse Pydantic model. + Handles extraction of reasoning_content from ZAI GLM-4.5+ responses. + """ + # Use parent class conversion first + chat_completion_response = await super().convert_response_to_chat_completion(response_data, input_messages, llm_config) + + # Parse reasoning_content from ZAI responses (similar to OpenAI pattern) + # ZAI returns reasoning_content in delta.reasoning_content (streaming) or message.reasoning_content + if ( + chat_completion_response.choices + and len(chat_completion_response.choices) > 0 + and chat_completion_response.choices[0].message + and not chat_completion_response.choices[0].message.reasoning_content + ): + if "choices" in response_data and len(response_data["choices"]) > 0: + choice_data = response_data["choices"][0] + if "message" in choice_data and "reasoning_content" in choice_data["message"]: + reasoning_content = choice_data["message"]["reasoning_content"] + if reasoning_content: + chat_completion_response.choices[0].message.reasoning_content = reasoning_content + chat_completion_response.choices[0].message.reasoning_content_signature = None + + # If we used a reasoning model, mark that reasoning content was used + if self.is_reasoning_model(llm_config) and llm_config.enable_reasoner: + chat_completion_response.choices[0].message.omitted_reasoning_content = True + + return chat_completion_response diff --git a/letta/schemas/llm_config.py b/letta/schemas/llm_config.py index 4a62b2c1..4c9ad3fd 100644 --- a/letta/schemas/llm_config.py +++ b/letta/schemas/llm_config.py @@ -374,9 +374,13 @@ class LLMConfig(BaseModel): temperature=self.temperature, ) elif self.model_endpoint_type == "zai": + from letta.schemas.model import ZAIThinking + + thinking_type = "enabled" if self.enable_reasoner else "disabled" return ZAIModelSettings( max_output_tokens=self.max_tokens or 4096, temperature=self.temperature, + thinking=ZAIThinking(type=thinking_type, clear_thinking=False), ) elif self.model_endpoint_type == "groq": return GroqModelSettings( @@ -451,6 +455,45 @@ class LLMConfig(BaseModel): config.model.startswith("gemini-2.5-flash") or config.model.startswith("gemini-2.5-pro") ) + @classmethod + def is_zai_reasoning_model(cls, config: "LLMConfig") -> bool: + return config.model_endpoint_type == "zai" and ( + config.model.startswith("glm-4.5") or config.model.startswith("glm-4.6") or config.model.startswith("glm-4.7") + ) + + @classmethod + def is_openrouter_reasoning_model(cls, config: "LLMConfig") -> bool: + """Check if this is an OpenRouter model that supports reasoning. + + OpenRouter model names include provider prefix, e.g.: + - anthropic/claude-sonnet-4 + - openai/o3-mini + - moonshotai/kimi-k2-thinking + - deepseek/deepseek-r1 + """ + if config.model_endpoint_type != "openrouter": + return False + model = config.model.lower() + # OpenAI reasoning models + if "/o1" in model or "/o3" in model or "/o4" in model or "/gpt-5" in model: + return True + # Anthropic Claude reasoning models + if "claude-3-7-sonnet" in model or "claude-sonnet-4" in model or "claude-opus-4" in model or "claude-haiku-4" in model: + return True + # Google Gemini reasoning models + if "gemini" in model: + return True + # ZAI GLM reasoning models + if "glm-4.5" in model or "glm-4.6" in model or "glm-4.7" in model: + return True + # DeepSeek reasoning models + if "deepseek-r1" in model or "deepseek-reasoner" in model: + return True + # Moonshot Kimi reasoning models + if "kimi" in model: + return True + return False + @classmethod def supports_verbosity(cls, config: "LLMConfig") -> bool: """Check if the model supports verbosity control.""" @@ -505,6 +548,18 @@ class LLMConfig(BaseModel): config.effort = "medium" return config + # ZAI GLM-4.5+ models: toggle honored (similar to Anthropic) + if cls.is_zai_reasoning_model(config): + config.enable_reasoner = bool(reasoning) + config.put_inner_thoughts_in_kwargs = False + return config + + # OpenRouter reasoning models: toggle honored + if cls.is_openrouter_reasoning_model(config): + config.enable_reasoner = bool(reasoning) + config.put_inner_thoughts_in_kwargs = False + return config + # Google Gemini 2.5 Pro and Gemini 3: not possible to disable if config.model.startswith("gemini-2.5-pro") or config.model.startswith("gemini-3"): config.put_inner_thoughts_in_kwargs = False @@ -565,6 +620,10 @@ class LLMConfig(BaseModel): config.put_inner_thoughts_in_kwargs = True if config.max_reasoning_tokens == 0: config.max_reasoning_tokens = 1024 + elif cls.is_zai_reasoning_model(config): + config.put_inner_thoughts_in_kwargs = False + elif cls.is_openrouter_reasoning_model(config): + config.put_inner_thoughts_in_kwargs = False elif cls.is_openai_reasoning_model(config): config.put_inner_thoughts_in_kwargs = False if config.reasoning_effort is None: diff --git a/letta/schemas/model.py b/letta/schemas/model.py index f5d5fdac..ea855206 100644 --- a/letta/schemas/model.py +++ b/letta/schemas/model.py @@ -374,12 +374,22 @@ class XAIModelSettings(ModelSettings): } +class ZAIThinking(BaseModel): + """Thinking configuration for ZAI GLM-4.5+ models.""" + + type: Literal["enabled", "disabled"] = Field("enabled", description="Whether thinking is enabled or disabled.") + clear_thinking: bool = Field(False, description="If False, preserved thinking is used (recommended for agents).") + + class ZAIModelSettings(ModelSettings): """Z.ai (ZhipuAI) model configuration (OpenAI-compatible).""" provider_type: Literal[ProviderType.zai] = Field(ProviderType.zai, description="The type of the provider.") temperature: float = Field(0.7, description="The temperature of the model.") response_format: Optional[ResponseFormatUnion] = Field(None, description="The response format for the model.") + thinking: ZAIThinking = Field( + ZAIThinking(type="enabled", clear_thinking=False), description="The thinking configuration for GLM-4.5+ models." + ) def _to_legacy_config_params(self) -> dict: return { @@ -388,6 +398,7 @@ class ZAIModelSettings(ModelSettings): "response_format": self.response_format, "parallel_tool_calls": self.parallel_tool_calls, "strict": False, # ZAI does not support strict mode + "extended_thinking": self.thinking.type == "enabled", } diff --git a/letta/services/agent_manager.py b/letta/services/agent_manager.py index 56b6a62f..8a9192dc 100644 --- a/letta/services/agent_manager.py +++ b/letta/services/agent_manager.py @@ -351,9 +351,11 @@ class AgentManager: # For v1 agents, enforce sane defaults even when reasoning is omitted if agent_create.agent_type == AgentType.letta_v1_agent: - # Claude 3.7/4 or OpenAI o1/o3/o4/gpt-5 - default_reasoning = LLMConfig.is_anthropic_reasoning_model(agent_create.llm_config) or LLMConfig.is_openai_reasoning_model( - agent_create.llm_config + # Claude 3.7/4 or OpenAI o1/o3/o4/gpt-5 or ZAI GLM-4.5+ + default_reasoning = ( + LLMConfig.is_anthropic_reasoning_model(agent_create.llm_config) + or LLMConfig.is_openai_reasoning_model(agent_create.llm_config) + or LLMConfig.is_zai_reasoning_model(agent_create.llm_config) ) agent_create.llm_config = LLMConfig.apply_reasoning_setting_to_config( agent_create.llm_config, diff --git a/tests/model_settings/zai-glm-4.6.json b/tests/model_settings/zai-glm-4.6.json index 00ca14c6..e0a2e2f2 100644 --- a/tests/model_settings/zai-glm-4.6.json +++ b/tests/model_settings/zai-glm-4.6.json @@ -4,6 +4,10 @@ "provider_type": "zai", "temperature": 1.0, "max_output_tokens": 4096, - "parallel_tool_calls": false + "parallel_tool_calls": false, + "thinking": { + "type": "enabled", + "clear_thinking": false + } } }