Files
letta-server/letta/helpers/json_helpers.py
Ani Tunturi 1d1adb261a
Some checks are pending
Test Package Installation / test-install (3.11) (push) Waiting to run
Test Package Installation / test-install (3.12) (push) Waiting to run
Test Package Installation / test-install (3.13) (push) Waiting to run
fix: orphaned approvals, token inflation, reasoning fields, memfs redis dep
[IN TESTING — self-hosted 0.16.6, Kimi-K2.5 via Synthetic Direct]

Four independent fixes that landed together on this stack:

helpers.py — skip PendingApprovalError when the associated run is already
cancelled or failed. Stale approvals from interrupted runs were blocking all
subsequent messages on that conversation. Now checks run status before raising;
falls back to raising on lookup failure (conservative).

letta_agent_v3.py — use prompt_tokens not total_tokens for context window
estimate. total_tokens inflated the estimate by including completion tokens,
triggering premature compaction. This was causing context window resets mid-
conversation and is the root of the token inflation bug (see #3242).

openai_client.py (both build_request_data paths) — strip reasoning_content,
reasoning_content_signature, redacted_reasoning_content, omitted_reasoning_content
from message history before sending to inference backends. Fireworks and Synthetic
Direct reject these fields with 422/400 errors. exclude_none handles None values
but not actual text content from previous assistant turns.

block_manager_git.py — skip DB write when block value is unchanged. Reduces
unnecessary write amplification on every memfs sync cycle.

memfs_client_base.py — remove redis_client= kwarg from GitOperations init.
Dependency was removed upstream but the call site wasn't updated.

Dockerfile / compose files — context window and config updates for 220k limit.
2026-03-26 23:24:32 -04:00

176 lines
7.0 KiB
Python
Raw Permalink Blame History

import base64
import json
from datetime import datetime
from typing import Any
def sanitize_unicode_surrogates(value: Any) -> Any:
"""Recursively remove invalid Unicode surrogate characters from strings.
Unicode surrogate pairs (U+D800 to U+DFFF) are used internally by UTF-16 encoding
but are invalid as standalone characters in UTF-8. When present, they cause
UnicodeEncodeError when encoding to UTF-8, breaking API requests that need to
serialize data to JSON.
This function sanitizes:
- Strings: removes unpaired surrogates that can't be encoded to UTF-8
- Dicts: recursively sanitizes all string values
- Lists: recursively sanitizes all elements
- Other types: returned as-is
Args:
value: The value to sanitize
Returns:
The sanitized value with surrogate characters removed from all strings
"""
if isinstance(value, str):
# Remove lone surrogate characters (U+D800 to U+DFFF) which are invalid in UTF-8
# Using character filtering is more reliable than encode/decode for edge cases
try:
# Filter out any character in the surrogate range
return "".join(char for char in value if not (0xD800 <= ord(char) <= 0xDFFF))
except Exception:
# Fallback: try encode with errors="replace" which replaces surrogates with <20>
try:
return value.encode("utf-8", errors="replace").decode("utf-8")
except Exception:
# Last resort: return original (should never reach here)
return value
elif isinstance(value, dict):
# Recursively sanitize dictionary keys and values
return {sanitize_unicode_surrogates(k): sanitize_unicode_surrogates(v) for k, v in value.items()}
elif isinstance(value, list):
# Recursively sanitize list elements
return [sanitize_unicode_surrogates(item) for item in value]
elif isinstance(value, tuple):
# Recursively sanitize tuple elements (return as tuple)
return tuple(sanitize_unicode_surrogates(item) for item in value)
else:
# Return other types as-is (int, float, bool, None, etc.)
return value
_UNICODE_TO_ASCII = {
"\u2014": "--", # em-dash
"\u2013": "-", # en-dash
"\u2012": "-", # figure dash
"\u2010": "-", # hyphen
"\u2011": "-", # non-breaking hyphen
"\u201c": '"', # left double quotation mark
"\u201d": '"', # right double quotation mark
"\u2018": "'", # left single quotation mark
"\u2019": "'", # right single quotation mark
"\u201a": ",", # single low-9 quotation mark
"\u201e": '"', # double low-9 quotation mark
"\u2026": "...", # horizontal ellipsis
"\u00a0": " ", # non-breaking space
"\u00ad": "", # soft hyphen (invisible, strip)
}
def sanitize_control_characters(value: Any) -> Any:
"""Recursively sanitize strings for strict ASCII-only JSON backends (e.g. Synthetic).
Removes ASCII control characters (0x00-0x1F) except tab/newline/CR.
Replaces common non-ASCII typography (em-dash, curly quotes, ellipsis, etc.)
with ASCII equivalents. Strips remaining non-ASCII chars (> 0x7E) that would
appear as raw multi-byte UTF-8 sequences in the request body and cause parse
failures on backends that expect ASCII-safe JSON.
This function sanitizes:
- Strings: replaces/strips non-ASCII; strips control chars except whitespace
- Dicts: recursively sanitizes all string values
- Lists: recursively sanitizes all elements
- Other types: returned as-is
"""
if isinstance(value, str):
# Replace known typographic Unicode with ASCII equivalents first
for uni, asc in _UNICODE_TO_ASCII.items():
value = value.replace(uni, asc)
return "".join(
char for char in value
if ord(char) <= 0x7E # printable ASCII only
or char in ("\t", "\n", "\r") # allowed whitespace
)
elif isinstance(value, dict):
return {sanitize_control_characters(k): sanitize_control_characters(v) for k, v in value.items()}
elif isinstance(value, list):
return [sanitize_control_characters(item) for item in value]
elif isinstance(value, tuple):
return tuple(sanitize_control_characters(item) for item in value)
else:
return value
def sanitize_null_bytes(value: Any) -> Any:
"""Recursively remove null bytes (0x00) from strings.
PostgreSQL TEXT columns don't accept null bytes in UTF-8 encoding, which causes
asyncpg.exceptions.CharacterNotInRepertoireError when data with null bytes is inserted.
This function sanitizes:
- Strings: removes all null bytes
- Dicts: recursively sanitizes all string values
- Lists: recursively sanitizes all elements
- Other types: returned as-is
Args:
value: The value to sanitize
Returns:
The sanitized value with null bytes removed from all strings
"""
if isinstance(value, str):
# Remove null bytes from strings
return value.replace("\x00", "")
elif isinstance(value, dict):
# Recursively sanitize dictionary keys and values
return {sanitize_null_bytes(k): sanitize_null_bytes(v) for k, v in value.items()}
elif isinstance(value, list):
# Recursively sanitize list elements
return [sanitize_null_bytes(item) for item in value]
elif isinstance(value, tuple):
# Recursively sanitize tuple elements (return as tuple)
return tuple(sanitize_null_bytes(item) for item in value)
else:
# Return other types as-is (int, float, bool, None, etc.)
return value
def json_loads(data):
    """Deserialize a JSON document in non-strict mode.

    ``strict=False`` lets ``json.loads`` accept raw control characters (e.g.
    literal tabs or newlines) inside string values instead of raising.
    """
    parsed = json.loads(data, strict=False)
    return parsed
def json_dumps(data, indent=2) -> str:
    """Serialize data to JSON string, sanitizing null bytes to prevent PostgreSQL errors.

    PostgreSQL TEXT columns reject null bytes (0x00) in UTF-8 encoding, so every
    string in the structure is scrubbed via ``sanitize_null_bytes`` before
    serialization, preventing asyncpg.exceptions.CharacterNotInRepertoireError.
    Datetimes serialize as ISO-8601 strings; bytes decode to UTF-8 when possible
    and otherwise fall back to base64.

    Args:
        data: The data to serialize
        indent: JSON indentation level (default: 2)

    Returns:
        JSON string with null bytes removed from all string values
    """
    # Scrub null bytes up front so they never reach the database layer.
    cleaned = sanitize_null_bytes(data)

    def _default(obj):
        # datetime -> ISO-8601 string
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            try:
                text = obj.decode("utf-8")
            except Exception:
                # TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini
                return base64.b64encode(obj).decode("utf-8")
            # Decoded bytes get the same null-byte scrub as plain strings.
            return text.replace("\x00", "")
        raise TypeError(f"Type {type(obj)} not serializable")

    return json.dumps(cleaned, indent=indent, default=_default, ensure_ascii=False)