fix(core): handle UTF-8 surrogate characters in API responses (#9422)

* fix(core): handle UTF-8 surrogate characters in API responses LLM responses or user input can contain surrogate characters (U+D800-U+DFFF) which are valid Python strings but illegal in UTF-8. ORJSONResponse rejects them with "str is not valid UTF-8: surrogates not allowed". Add SafeORJSONResponse that catches the TypeError and strips surrogates before retrying serialization. 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> * refactor: reuse sanitize_unicode_surrogates from json_helpers Replace the inline _sanitize_surrogates function with the existing sanitize_unicode_surrogates helper from letta.helpers.json_helpers, which is already used across all LLM clients. Co-authored-by: Kian Jones <kianjones9@users.noreply.github.com> 🤖 Generated with [Letta Code](https://letta.com) Co-Authored-By: Letta <noreply@letta.com> --------- Co-authored-by: Letta <noreply@letta.com> Co-authored-by: letta-code <248085862+letta-code@users.noreply.github.com>
2026-02-10 17:25:57 -08:00
parent 39b25a0e3c
commit a00270d199
1 changed files with 29 additions and 1 deletions
--- a/letta/server/rest_api/app.py
+++ b/letta/server/rest_api/app.py
@@ -16,9 +16,12 @@ import uvicorn
 # Enable Python fault handler to get stack traces on segfaults
 faulthandler.enable()

+import orjson
 from fastapi import FastAPI, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse, ORJSONResponse
+
+from letta.helpers.json_helpers import sanitize_unicode_surrogates
 from marshmallow import ValidationError
 from sqlalchemy.exc import DBAPIError, IntegrityError, OperationalError
 from starlette.middleware.cors import CORSMiddleware
@@ -76,6 +79,31 @@ from letta.schemas.letta_message_content import (
    create_letta_user_message_content_union_schema,
 )
 from letta.server.constants import REST_DEFAULT_PORT
+
+
+class SafeORJSONResponse(ORJSONResponse):
+    """ORJSONResponse that handles Python strings containing UTF-8 surrogates.
+
+    LLM responses or user input can occasionally contain surrogate characters
+    (U+D800–U+DFFF) which are valid in Python str but illegal in UTF-8.
+    Standard orjson serialisation rejects them with:
+        TypeError: str is not valid UTF-8: surrogates not allowed
+    This subclass catches that error, strips the surrogates, and retries.
+    """
+
+    def render(self, content) -> bytes:
+        try:
+            return super().render(content)
+        except TypeError as exc:
+            if "surrogates" not in str(exc):
+                raise
+            sanitized = sanitize_unicode_surrogates(content)
+            return orjson.dumps(
+                sanitized,
+                option=orjson.OPT_NON_STR_KEYS | orjson.OPT_SERIALIZE_NUMPY,
+            )
+
+
 from letta.server.db import db_registry
 from letta.server.global_exception_handler import setup_global_exception_handlers

@@ -376,7 +404,7 @@ def create_application() -> "FastAPI":
        version=letta_version,
        debug=debug_mode,  # if True, the stack trace will be printed in the response
        lifespan=lifespan,
-        default_response_class=ORJSONResponse,  # Use orjson for 10x faster JSON serialization
+        default_response_class=SafeORJSONResponse,  # Use orjson for 10x faster JSON serialization, with surrogate safety
    )

    # === Global Exception Handlers ===