fix(core): handle UTF-8 surrogate characters in API responses (#9422)

* fix(core): handle UTF-8 surrogate characters in API responses

LLM responses or user input can contain lone surrogate code points
(U+D800-U+DFFF), which are valid in a Python str but cannot be encoded as
UTF-8. ORJSONResponse rejects them with "str is not valid UTF-8: surrogates
not allowed". Add SafeORJSONResponse, which catches that TypeError and strips
the surrogates before retrying serialization.

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* refactor: reuse sanitize_unicode_surrogates from json_helpers

Replace the inline _sanitize_surrogates function with the existing
sanitize_unicode_surrogates helper from letta.helpers.json_helpers,
which is already used across all LLM clients.

Co-authored-by: Kian Jones <kianjones9@users.noreply.github.com>

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
Co-authored-by: letta-code <248085862+letta-code@users.noreply.github.com>
This commit is contained in:
Kian Jones
2026-02-10 17:25:57 -08:00
committed by Caren Thomas
parent 39b25a0e3c
commit a00270d199

View File

@@ -16,9 +16,12 @@ import uvicorn
# Enable Python fault handler to get stack traces on segfaults
faulthandler.enable()
import orjson
from fastapi import FastAPI, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, ORJSONResponse
from letta.helpers.json_helpers import sanitize_unicode_surrogates
from marshmallow import ValidationError
from sqlalchemy.exc import DBAPIError, IntegrityError, OperationalError
from starlette.middleware.cors import CORSMiddleware
@@ -76,6 +79,31 @@ from letta.schemas.letta_message_content import (
create_letta_user_message_content_union_schema,
)
from letta.server.constants import REST_DEFAULT_PORT
class SafeORJSONResponse(ORJSONResponse):
    """ORJSONResponse variant that tolerates surrogate code points in strings.

    Text coming from an LLM or from user input may occasionally carry
    surrogate code points (U+D800-U+DFFF). Python's ``str`` accepts them, but
    they cannot be encoded as UTF-8, so orjson refuses to serialise them:

        TypeError: str is not valid UTF-8: surrogates not allowed

    When that specific error occurs, the content is passed through
    ``sanitize_unicode_surrogates`` and serialised a second time.
    """

    def render(self, content) -> bytes:
        """Serialise *content* to JSON bytes, retrying once after sanitising.

        Any ``TypeError`` whose message does not mention "surrogates" is
        re-raised unchanged.
        """
        try:
            payload = super().render(content)
        except TypeError as err:
            if "surrogates" in str(err):
                cleaned = sanitize_unicode_surrogates(content)
                # NOTE(review): these flags are presumably meant to mirror the
                # parent ORJSONResponse.render options - confirm against the
                # installed FastAPI version.
                dump_flags = orjson.OPT_NON_STR_KEYS | orjson.OPT_SERIALIZE_NUMPY
                return orjson.dumps(cleaned, option=dump_flags)
            # Unrelated serialisation failure: let it propagate.
            raise
        return payload
from letta.server.db import db_registry
from letta.server.global_exception_handler import setup_global_exception_handlers
@@ -376,7 +404,7 @@ def create_application() -> "FastAPI":
version=letta_version,
debug=debug_mode, # if True, the stack trace will be printed in the response
lifespan=lifespan,
default_response_class=ORJSONResponse, # Use orjson for 10x faster JSON serialization
default_response_class=SafeORJSONResponse, # Use orjson for 10x faster JSON serialization, with surrogate safety
)
# === Global Exception Handlers ===