Files
letta-server/letta/helpers/json_helpers.py
Ani Tunturi 08d3c26732 fix: sanitize control characters before sending to inference backends
Fireworks (via Synthetic Direct) chokes on raw ASCII control chars
(0x00-0x1F) in JSON payloads with "Unterminated string" errors.
The existing sanitize_unicode_surrogates only handles U+D800-DFFF.
Now we also strip control chars (preserving tab/newline/CR) at all
4 request paths — sync, async, and both streaming variants.
2026-03-21 20:23:56 -04:00

155 lines
6.1 KiB
Python
Raw Blame History

import base64
import json
from datetime import datetime
from typing import Any
def sanitize_unicode_surrogates(value: Any) -> Any:
    """Recursively remove Unicode surrogate code points (U+D800-U+DFFF) from strings.

    Surrogate pairs are used internally by UTF-16 encoding but are invalid as
    standalone code points in UTF-8. When present in Python strings, they raise
    UnicodeEncodeError on UTF-8 encoding, breaking API requests that need to
    serialize data to JSON.

    This function sanitizes:
    - Strings: removes surrogate code points that can't be encoded to UTF-8
    - Dicts: recursively sanitizes keys and values
    - Lists / tuples: recursively sanitizes all elements
    - Other types: returned as-is

    Args:
        value: The value to sanitize

    Returns:
        The sanitized value with surrogate characters removed from all strings
    """
    if isinstance(value, str):
        # Filter out any code point in the surrogate range. ord() is total over
        # str elements (Python strings may legally hold lone surrogates), so
        # this expression cannot raise; the previous encode/decode fallback was
        # unreachable dead code and has been removed.
        return "".join(char for char in value if not (0xD800 <= ord(char) <= 0xDFFF))
    elif isinstance(value, dict):
        # Recursively sanitize dictionary keys and values
        return {sanitize_unicode_surrogates(k): sanitize_unicode_surrogates(v) for k, v in value.items()}
    elif isinstance(value, list):
        # Recursively sanitize list elements
        return [sanitize_unicode_surrogates(item) for item in value]
    elif isinstance(value, tuple):
        # Recursively sanitize tuple elements (return as tuple)
        return tuple(sanitize_unicode_surrogates(item) for item in value)
    else:
        # Return other types as-is (int, float, bool, None, etc.)
        return value
def sanitize_control_characters(value: Any) -> Any:
    """Recursively strip ASCII control characters (0x00-0x1F) from strings,
    keeping tab (0x09), newline (0x0A), and carriage return (0x0D).

    Some inference backends (e.g. Fireworks AI) perform strict JSON parsing on
    the request body and reject payloads containing unescaped control characters.
    Python's json.dumps will escape these, but certain proxy layers may
    double-parse or re-serialize in ways that expose the raw bytes.

    Containers (dict, list, tuple) are walked recursively — dict keys included;
    any non-string, non-container value passes through unchanged.
    """
    if isinstance(value, dict):
        return {sanitize_control_characters(k): sanitize_control_characters(v) for k, v in value.items()}
    if isinstance(value, list):
        return [sanitize_control_characters(item) for item in value]
    if isinstance(value, tuple):
        return tuple(sanitize_control_characters(item) for item in value)
    if not isinstance(value, str):
        return value
    # Delete every code point below 0x20 except the allowed whitespace chars;
    # a translate table mapping ordinals to None removes them in one C pass.
    deletions = {code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)}
    return value.translate(deletions)
def sanitize_null_bytes(value: Any) -> Any:
    """Recursively remove null bytes (0x00) from all strings in *value*.

    PostgreSQL TEXT columns don't accept null bytes in UTF-8 encoding, which
    causes asyncpg.exceptions.CharacterNotInRepertoireError when data with
    null bytes is inserted; stripping them up front keeps writes safe.

    Strings are cleaned; dicts, lists, and tuples are walked recursively
    (dict keys included); everything else passes through untouched.

    Args:
        value: The value to sanitize

    Returns:
        The sanitized value with null bytes removed from all strings
    """
    if isinstance(value, str):
        return value.replace("\x00", "")
    if isinstance(value, dict):
        # Clean both sides of every mapping entry.
        cleaned = {}
        for key, val in value.items():
            cleaned[sanitize_null_bytes(key)] = sanitize_null_bytes(val)
        return cleaned
    if isinstance(value, (list, tuple)):
        # Sequences share one recursion; rebuild with the original type.
        items = [sanitize_null_bytes(item) for item in value]
        return items if isinstance(value, list) else tuple(items)
    # int, float, bool, None, bytes, etc. — returned as-is
    return value
def json_loads(data):
    """Deserialize JSON, tolerating raw control characters inside strings.

    strict=False relaxes the parser so literal control characters (e.g. an
    unescaped newline) inside string values do not raise JSONDecodeError.
    """
    return json.loads(data, strict=False)
def json_dumps(data, indent=2) -> str:
    """Serialize *data* to a JSON string, stripping null bytes first.

    PostgreSQL TEXT columns reject null bytes (0x00) in UTF-8 encoding, so
    every string in the structure is sanitized via sanitize_null_bytes before
    serialization to prevent asyncpg.exceptions.CharacterNotInRepertoireError.

    Args:
        data: The data to serialize
        indent: JSON indentation level (default: 2)

    Returns:
        JSON string with null bytes removed from all string values
    """

    def _default(obj):
        # datetimes become ISO-8601 strings
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            try:
                # Decoded bytes get the same null-byte scrub as regular strings.
                return obj.decode("utf-8").replace("\x00", "")
            except Exception:
                # TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini
                return base64.b64encode(obj).decode("utf-8")
        raise TypeError(f"Type {type(obj)} not serializable")

    return json.dumps(sanitize_null_bytes(data), indent=indent, default=_default, ensure_ascii=False)