Multiple OpenAI-compatible LLM clients (Azure, Deepseek, Groq, Together, XAI, ZAI) and Anthropic-compatible clients (Anthropic, MiniMax, Google Vertex) were overriding request_async/stream_async without calling sanitize_unicode_surrogates, causing UnicodeEncodeError when message content contained lone UTF-16 surrogates.

Root cause: child classes override the parent request methods but omit the sanitization step that the base OpenAIClient includes. This allows corrupted Unicode (unpaired surrogates from malformed emoji) to reach the httpx layer, which rejects it during UTF-8 encoding.

Fix: import and call sanitize_unicode_surrogates in all overridden request methods. Also remove the duplicate sanitize_unicode_surrogates definition from openai_client.py that shadowed the canonical implementation in letta.helpers.json_helpers.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>
Issue-ID: 10c0f2e4-f87b-11f0-b91c-da7ad0900000
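For context on what the sanitization step does: Python strings may contain lone UTF-16 surrogates (code points in U+D800..U+DFFF without their pair), which are representable in memory but cannot be encoded to UTF-8. Below is a minimal sketch of a sanitizer with that contract. It is an illustrative assumption, not the canonical implementation in letta.helpers.json_helpers, which may differ in detail:

```python
# Hypothetical sketch, not the canonical letta.helpers.json_helpers code.
# Recursively walks a JSON-like payload; re-joins valid surrogate pairs and
# replaces unpaired surrogates with U+FFFD so UTF-8 encoding cannot fail.
def sanitize_unicode_surrogates(obj):
    if isinstance(obj, str):
        # "surrogatepass" lets lone surrogates through the encoder; "replace"
        # on decode maps any code units that remain unpaired to U+FFFD.
        return obj.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
    if isinstance(obj, dict):
        return {k: sanitize_unicode_surrogates(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [sanitize_unicode_surrogates(v) for v in obj]
    return obj
```

A convenient property of the UTF-16 round-trip is that a well-formed surrogate pair that was merely split across two Python code points is repaired into the intended character (e.g. a malformed emoji), while a truly lone surrogate becomes U+FFFD.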
from typing import List, Optional

from openai import AsyncOpenAI, AsyncStream, OpenAI
from openai.types.chat.chat_completion import ChatCompletion
from openai.types.chat.chat_completion_chunk import ChatCompletionChunk

from letta.helpers.json_helpers import sanitize_unicode_surrogates
from letta.llm_api.openai_client import OpenAIClient
from letta.otel.tracing import trace_method
from letta.schemas.embedding_config import EmbeddingConfig
from letta.schemas.enums import AgentType
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
from letta.settings import model_settings


def is_zai_reasoning_model(model_name: str) -> bool:
    """Check if the model is a ZAI reasoning model (GLM-4.5+)."""
    return model_name.startswith("glm-4.5") or model_name.startswith("glm-4.6") or model_name.startswith("glm-4.7")


class ZAIClient(OpenAIClient):
    """Z.ai (ZhipuAI) client - uses OpenAI-compatible API."""

    def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
        return False

    def supports_structured_output(self, llm_config: LLMConfig) -> bool:
        return False

    def is_reasoning_model(self, llm_config: LLMConfig) -> bool:
        """Returns True if the model is a ZAI reasoning model (GLM-4.5+)."""
        return is_zai_reasoning_model(llm_config.model)

    @trace_method
    def build_request_data(
        self,
        agent_type: AgentType,
        messages: List[PydanticMessage],
        llm_config: LLMConfig,
        tools: Optional[List[dict]] = None,
        force_tool_call: Optional[str] = None,
        requires_subsequent_tool_call: bool = False,
        tool_return_truncation_chars: Optional[int] = None,
    ) -> dict:
        data = super().build_request_data(agent_type, messages, llm_config, tools, force_tool_call, requires_subsequent_tool_call)

        # Add thinking configuration for ZAI GLM-4.5+ models.
        # Must explicitly send type: "disabled" when reasoning is off, as GLM-4.7 has thinking on by default.
        if self.is_reasoning_model(llm_config):
            if llm_config.enable_reasoner:
                data["extra_body"] = {
                    "thinking": {
                        "type": "enabled",
                        "clear_thinking": False,  # Preserve thinking content for agents
                    }
                }
            else:
                data["extra_body"] = {
                    "thinking": {
                        "type": "disabled",
                    }
                }

        return data

    @trace_method
    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
        """
        Performs underlying synchronous request to Z.ai API and returns raw response dict.
        """
        # Mirror the async paths: strip lone UTF-16 surrogates before httpx
        # serializes the body (the fix applies to all overridden request methods).
        request_data = sanitize_unicode_surrogates(request_data)

        api_key = model_settings.zai_api_key
        client = OpenAI(api_key=api_key, base_url=llm_config.model_endpoint)

        response: ChatCompletion = client.chat.completions.create(**request_data)
        return response.model_dump()

    @trace_method
    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
        """
        Performs underlying asynchronous request to Z.ai API and returns raw response dict.
        """
        request_data = sanitize_unicode_surrogates(request_data)

        api_key = model_settings.zai_api_key
        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)

        response: ChatCompletion = await client.chat.completions.create(**request_data)
        return response.model_dump()

    @trace_method
    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
        """
        Performs underlying asynchronous streaming request to Z.ai and returns the async stream iterator.
        """
        request_data = sanitize_unicode_surrogates(request_data)

        api_key = model_settings.zai_api_key
        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
        response_stream: AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
            **request_data, stream=True, stream_options={"include_usage": True}
        )
        return response_stream

    @trace_method
    async def request_embeddings(self, inputs: List[str], embedding_config: EmbeddingConfig) -> List[List[float]]:
        """Request embeddings given texts and embedding config."""
        api_key = model_settings.zai_api_key
        client = AsyncOpenAI(api_key=api_key, base_url=embedding_config.embedding_endpoint)
        response = await client.embeddings.create(model=embedding_config.embedding_model, input=inputs)

        return [r.embedding for r in response.data]

    @trace_method
    async def convert_response_to_chat_completion(
        self,
        response_data: dict,
        input_messages: List[PydanticMessage],
        llm_config: LLMConfig,
    ) -> ChatCompletionResponse:
        """
        Converts raw ZAI response dict into the ChatCompletionResponse Pydantic model.
        Handles extraction of reasoning_content from ZAI GLM-4.5+ responses.
        """
        # Use parent class conversion first
        chat_completion_response = await super().convert_response_to_chat_completion(response_data, input_messages, llm_config)

        # Parse reasoning_content from ZAI responses (similar to OpenAI pattern).
        # ZAI returns reasoning_content in delta.reasoning_content (streaming) or message.reasoning_content.
        if (
            chat_completion_response.choices
            and len(chat_completion_response.choices) > 0
            and chat_completion_response.choices[0].message
            and not chat_completion_response.choices[0].message.reasoning_content
        ):
            if "choices" in response_data and len(response_data["choices"]) > 0:
                choice_data = response_data["choices"][0]
                if "message" in choice_data and "reasoning_content" in choice_data["message"]:
                    reasoning_content = choice_data["message"]["reasoning_content"]
                    if reasoning_content:
                        chat_completion_response.choices[0].message.reasoning_content = reasoning_content
                        chat_completion_response.choices[0].message.reasoning_content_signature = None

        # If we used a reasoning model, mark that reasoning content was used
        if self.is_reasoning_model(llm_config) and llm_config.enable_reasoner:
            chat_completion_response.choices[0].message.omitted_reasoning_content = True

        return chat_completion_response
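As a quick illustration of the failure mode this module guards against (an illustrative snippet, not part of the module above):

```python
# Illustrative only: shows the UnicodeEncodeError the sanitization prevents.
from letta.helpers.json_helpers import sanitize_unicode_surrogates

payload = {"content": "broken emoji: \ud83d"}  # lone high surrogate

try:
    payload["content"].encode("utf-8")
except UnicodeEncodeError as err:
    # httpx raises this same error while serializing the request body
    print(f"rejected: {err}")

# After sanitization, UTF-8 encoding succeeds:
cleaned = sanitize_unicode_surrogates(payload)
cleaned["content"].encode("utf-8")  # no exception
```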