Fireworks (via Synthetic Direct) chokes on raw ASCII control chars (0x00-0x1F) in JSON payloads with "Unterminated string" errors. The existing sanitize_unicode_surrogates only handles U+D800-DFFF. Now we also strip control chars (preserving tab/newline/CR) at all 4 request paths — sync, async, and both streaming variants.
155 lines · 6.1 KiB · Python
import base64
|
||
import json
|
||
from datetime import datetime
|
||
from typing import Any
|
||
|
||
|
||
def sanitize_unicode_surrogates(value: Any) -> Any:
|
||
"""Recursively remove invalid Unicode surrogate characters from strings.
|
||
|
||
Unicode surrogate pairs (U+D800 to U+DFFF) are used internally by UTF-16 encoding
|
||
but are invalid as standalone characters in UTF-8. When present, they cause
|
||
UnicodeEncodeError when encoding to UTF-8, breaking API requests that need to
|
||
serialize data to JSON.
|
||
|
||
This function sanitizes:
|
||
- Strings: removes unpaired surrogates that can't be encoded to UTF-8
|
||
- Dicts: recursively sanitizes all string values
|
||
- Lists: recursively sanitizes all elements
|
||
- Other types: returned as-is
|
||
|
||
Args:
|
||
value: The value to sanitize
|
||
|
||
Returns:
|
||
The sanitized value with surrogate characters removed from all strings
|
||
"""
|
||
if isinstance(value, str):
|
||
# Remove lone surrogate characters (U+D800 to U+DFFF) which are invalid in UTF-8
|
||
# Using character filtering is more reliable than encode/decode for edge cases
|
||
try:
|
||
# Filter out any character in the surrogate range
|
||
return "".join(char for char in value if not (0xD800 <= ord(char) <= 0xDFFF))
|
||
except Exception:
|
||
# Fallback: try encode with errors="replace" which replaces surrogates with <20>
|
||
try:
|
||
return value.encode("utf-8", errors="replace").decode("utf-8")
|
||
except Exception:
|
||
# Last resort: return original (should never reach here)
|
||
return value
|
||
elif isinstance(value, dict):
|
||
# Recursively sanitize dictionary keys and values
|
||
return {sanitize_unicode_surrogates(k): sanitize_unicode_surrogates(v) for k, v in value.items()}
|
||
elif isinstance(value, list):
|
||
# Recursively sanitize list elements
|
||
return [sanitize_unicode_surrogates(item) for item in value]
|
||
elif isinstance(value, tuple):
|
||
# Recursively sanitize tuple elements (return as tuple)
|
||
return tuple(sanitize_unicode_surrogates(item) for item in value)
|
||
else:
|
||
# Return other types as-is (int, float, bool, None, etc.)
|
||
return value
|
||
|
||
|
||
def sanitize_control_characters(value: Any) -> Any:
|
||
"""Recursively remove ASCII control characters (0x00-0x1F) from strings,
|
||
preserving tab (0x09), newline (0x0A), and carriage return (0x0D).
|
||
|
||
Some inference backends (e.g. Fireworks AI) perform strict JSON parsing on
|
||
the request body and reject payloads containing unescaped control characters.
|
||
Python's json.dumps will escape these, but certain proxy layers may
|
||
double-parse or re-serialize in ways that expose the raw bytes.
|
||
|
||
This function sanitizes:
|
||
- Strings: strips control characters except whitespace (tab, newline, CR)
|
||
- Dicts: recursively sanitizes all string values
|
||
- Lists: recursively sanitizes all elements
|
||
- Other types: returned as-is
|
||
"""
|
||
if isinstance(value, str):
|
||
return "".join(
|
||
char for char in value
|
||
if ord(char) >= 0x20 # printable
|
||
or char in ("\t", "\n", "\r") # allowed whitespace
|
||
)
|
||
elif isinstance(value, dict):
|
||
return {sanitize_control_characters(k): sanitize_control_characters(v) for k, v in value.items()}
|
||
elif isinstance(value, list):
|
||
return [sanitize_control_characters(item) for item in value]
|
||
elif isinstance(value, tuple):
|
||
return tuple(sanitize_control_characters(item) for item in value)
|
||
else:
|
||
return value
|
||
|
||
|
||
def sanitize_null_bytes(value: Any) -> Any:
    """Recursively strip null bytes (0x00) out of every string in *value*.

    PostgreSQL TEXT columns don't accept null bytes in UTF-8 encoding, which
    causes asyncpg.exceptions.CharacterNotInRepertoireError when data with
    null bytes is inserted.

    Handles:
    - str: all null bytes removed
    - dict: keys and values sanitized recursively
    - list / tuple: elements sanitized recursively (container type preserved)
    - anything else: returned untouched

    Args:
        value: The value to sanitize

    Returns:
        The sanitized value with null bytes removed from all strings
    """
    if isinstance(value, str):
        return value.replace("\x00", "")
    if isinstance(value, dict):
        # Both keys and values may carry null bytes; sanitize each side.
        sanitized = {}
        for key, val in value.items():
            sanitized[sanitize_null_bytes(key)] = sanitize_null_bytes(val)
        return sanitized
    if isinstance(value, list):
        return [sanitize_null_bytes(element) for element in value]
    if isinstance(value, tuple):
        return tuple(sanitize_null_bytes(element) for element in value)
    # Scalars (int, float, bool, None, bytes, ...) pass through unchanged.
    return value
|
||
|
||
|
||
def json_loads(data):
    """Deserialize JSON with lenient parsing (``strict=False``).

    ``strict=False`` permits raw control characters (e.g. literal tabs or
    newlines) inside JSON strings instead of raising JSONDecodeError,
    matching the tolerant text handling used elsewhere in this module.

    Args:
        data: A JSON document (str, bytes, or bytearray).

    Returns:
        The deserialized Python object.
    """
    return json.loads(data, strict=False)
|
||
|
||
|
||
def json_dumps(data, indent=2) -> str:
    """Serialize *data* to a JSON string, stripping null bytes first.

    PostgreSQL TEXT columns reject null bytes (0x00) in UTF-8 encoding, so
    every string in the structure is sanitized before serialization to
    prevent asyncpg.exceptions.CharacterNotInRepertoireError.

    Args:
        data: The data to serialize
        indent: JSON indentation level (default: 2)

    Returns:
        JSON string with null bytes removed from all string values
    """

    def _fallback(obj):
        # json.dumps invokes this only for types it cannot serialize natively.
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            try:
                # UTF-8 text smuggled in as bytes: decode, then strip null
                # bytes (sanitize_null_bytes does not descend into bytes).
                return obj.decode("utf-8").replace("\x00", "")
            except Exception:
                # TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini
                return base64.b64encode(obj).decode("utf-8")
        raise TypeError(f"Type {type(obj)} not serializable")

    # Null bytes must be removed before serialization, or PostgreSQL will
    # reject the stored result.
    cleaned = sanitize_null_bytes(data)
    return json.dumps(cleaned, indent=indent, default=_fallback, ensure_ascii=False)