fix(core): sanitize Unicode surrogates in all LLM client requests (#9323)

Multiple OpenAI-compatible LLM clients (Azure, Deepseek, Groq, Together, XAI, ZAI) and Anthropic-compatible clients (Anthropic, MiniMax, Google Vertex) were overriding request_async/stream_async without calling sanitize_unicode_surrogates, causing UnicodeEncodeError when message content contained lone UTF-16 surrogates.

Root cause: Child classes override parent methods but omit the sanitization step that the base OpenAIClient includes. This allows corrupted Unicode (unpaired surrogates from malformed emoji) to reach the httpx layer, which rejects it during UTF-8 encoding.

Fix: Import and call sanitize_unicode_surrogates in all overridden request methods. Also removed the duplicate sanitize_unicode_surrogates definition from openai_client.py that shadowed the canonical implementation in letta.helpers.json_helpers.

🐾 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>
Issue-ID: 10c0f2e4-f87b-11f0-b91c-da7ad0900000
2026-02-05 20:20:34 -08:00
parent 69fc934135
commit d48932bdb6
11 changed files with 44 additions and 28 deletions
--- a/letta/llm_api/openai_client.py
+++ b/letta/llm_api/openai_client.py
@@ -67,33 +67,6 @@ from letta.settings import model_settings
 logger = get_logger(__name__)


-def sanitize_unicode_surrogates(obj: Any) -> Any:
-    """Recursively sanitize invalid Unicode surrogates in strings within nested data structures.
-
-    This fixes UnicodeEncodeError when the OpenAI SDK tries to encode requests containing
-    unpaired UTF-16 surrogates (e.g., '\ud83c' without its pair) which can occur in corrupted
-    emoji data or malformed Unicode sequences.
-
-    Args:
-        obj: The object to sanitize (dict, list, str, or other types)
-
-    Returns:
-        The sanitized object with invalid surrogates replaced by the Unicode replacement character
-    """
-    if isinstance(obj, dict):
-        return {k: sanitize_unicode_surrogates(v) for k, v in obj.items()}
-    elif isinstance(obj, list):
-        return [sanitize_unicode_surrogates(item) for item in obj]
-    elif isinstance(obj, str):
-        try:
-            obj.encode("utf-8")
-            return obj
-        except UnicodeEncodeError:
-            return obj.encode("utf-8", errors="replace").decode("utf-8")
-    else:
-        return obj
-
-
 def is_openai_reasoning_model(model: str) -> bool:
    """Utility function to check if the model is a 'reasoner'"""