fix: sanitize null bytes to prevent PostgreSQL CharacterNotInRepertoireError (#8015)

This fixes the asyncpg.exceptions.CharacterNotInRepertoireError that occurs
when tool returns contain null bytes (0x00), which PostgreSQL TEXT columns
reject in UTF-8 encoding.

Changes:
- Add sanitize_null_bytes() function to recursively remove null bytes from strings
- Update json_dumps() to sanitize data before serialization
- Apply sanitization in converters.py for tool_calls, tool_returns, approvals, and message_content
- Add comprehensive unit tests

Fixes #8014

🤖 Generated with [Letta Code](https://letta.com)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Letta <noreply@letta.com>
Co-authored-by: Kian Jones <11655409+kianjones9@users.noreply.github.com>
This commit is contained in:
github-actions[bot]
2025-12-28 15:01:21 -05:00
committed by Caren Thomas
parent d5decc2a27
commit dbdd1a40e4
3 changed files with 196 additions and 15 deletions

View File

@@ -6,6 +6,10 @@ from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMe
from sqlalchemy import Dialect
from letta.functions.mcp_client.types import StdioServerConfig
from letta.helpers.json_helpers import sanitize_null_bytes
from letta.log import get_logger
logger = get_logger(__name__)
from letta.schemas.embedding_config import EmbeddingConfig
from letta.schemas.enums import ProviderType, ToolRuleType
from letta.schemas.letta_message import ApprovalReturn, MessageReturnType
@@ -184,16 +188,22 @@ def deserialize_tool_rule(
def serialize_tool_calls(tool_calls: Optional[List[Union[OpenAIToolCall, dict]]]) -> List[Dict]:
"""Convert a list of OpenAI ToolCall objects into JSON-serializable format."""
"""Convert a list of OpenAI ToolCall objects into JSON-serializable format.
Note: Tool call arguments may contain null bytes from various sources.
These are sanitized to prevent PostgreSQL errors.
"""
if not tool_calls:
return []
serialized_calls = []
for call in tool_calls:
if isinstance(call, OpenAIToolCall):
serialized_calls.append(call.model_dump(mode="json"))
# Sanitize null bytes from tool call data to prevent PostgreSQL errors
serialized_calls.append(sanitize_null_bytes(call.model_dump(mode="json")))
elif isinstance(call, dict):
serialized_calls.append(call) # Already a dictionary, leave it as-is
# Sanitize null bytes from dictionary data
serialized_calls.append(sanitize_null_bytes(call))
else:
raise TypeError(f"Unexpected tool call type: {type(call)}")
@@ -221,16 +231,22 @@ def deserialize_tool_calls(data: Optional[List[Dict]]) -> List[OpenAIToolCall]:
def serialize_tool_returns(tool_returns: Optional[List[Union[ToolReturn, dict]]]) -> List[Dict]:
"""Convert a list of ToolReturn objects into JSON-serializable format."""
"""Convert a list of ToolReturn objects into JSON-serializable format.
Note: Tool returns may contain null bytes from sandbox execution or binary data.
These are sanitized to prevent PostgreSQL errors.
"""
if not tool_returns:
return []
serialized_tool_returns = []
for tool_return in tool_returns:
if isinstance(tool_return, ToolReturn):
serialized_tool_returns.append(tool_return.model_dump(mode="json"))
# Sanitize null bytes from tool return data to prevent PostgreSQL errors
serialized_tool_returns.append(sanitize_null_bytes(tool_return.model_dump(mode="json")))
elif isinstance(tool_return, dict):
serialized_tool_returns.append(tool_return) # Already a dictionary, leave it as-is
# Sanitize null bytes from dictionary data
serialized_tool_returns.append(sanitize_null_bytes(tool_return))
else:
raise TypeError(f"Unexpected tool return type: {type(tool_return)}")
@@ -256,18 +272,24 @@ def deserialize_tool_returns(data: Optional[List[Dict]]) -> List[ToolReturn]:
def serialize_approvals(approvals: Optional[List[Union[ApprovalReturn, ToolReturn, dict]]]) -> List[Dict]:
"""Convert a list of ToolReturn objects into JSON-serializable format."""
"""Convert a list of ToolReturn objects into JSON-serializable format.
Note: Approval data may contain null bytes from various sources.
These are sanitized to prevent PostgreSQL errors.
"""
if not approvals:
return []
serialized_approvals = []
for approval in approvals:
if isinstance(approval, ApprovalReturn):
serialized_approvals.append(approval.model_dump(mode="json"))
# Sanitize null bytes from approval data to prevent PostgreSQL errors
serialized_approvals.append(sanitize_null_bytes(approval.model_dump(mode="json")))
elif isinstance(approval, ToolReturn):
serialized_approvals.append(approval.model_dump(mode="json"))
serialized_approvals.append(sanitize_null_bytes(approval.model_dump(mode="json")))
elif isinstance(approval, dict):
serialized_approvals.append(approval) # Already a dictionary, leave it as-is
# Sanitize null bytes from dictionary data
serialized_approvals.append(sanitize_null_bytes(approval))
else:
raise TypeError(f"Unexpected approval type: {type(approval)}")
@@ -318,7 +340,11 @@ def deserialize_approvals(data: Optional[List[Dict]]) -> List[Union[ApprovalRetu
def serialize_message_content(message_content: Optional[List[Union[MessageContent, dict]]]) -> List[Dict]:
"""Convert a list of MessageContent objects into JSON-serializable format."""
"""Convert a list of MessageContent objects into JSON-serializable format.
Note: Message content may contain null bytes from various sources.
These are sanitized to prevent PostgreSQL errors.
"""
if not message_content:
return []
@@ -327,9 +353,11 @@ def serialize_message_content(message_content: Optional[List[Union[MessageConten
if isinstance(content, MessageContent):
if content.type == MessageContentType.image:
assert content.source.type == ImageSourceType.letta, f"Invalid image source type: {content.source.type}"
serialized_message_content.append(content.model_dump(mode="json"))
# Sanitize null bytes from message content to prevent PostgreSQL errors
serialized_message_content.append(sanitize_null_bytes(content.model_dump(mode="json")))
elif isinstance(content, dict):
serialized_message_content.append(content) # Already a dictionary, leave it as-is
# Sanitize null bytes from dictionary data
serialized_message_content.append(sanitize_null_bytes(content))
else:
raise TypeError(f"Unexpected message content type: {type(content)}")
return serialized_message_content

View File

@@ -1,6 +1,42 @@
import base64
import json
from datetime import datetime
from typing import Any
def sanitize_null_bytes(value: Any) -> Any:
    """Recursively remove null bytes (0x00) from strings.

    PostgreSQL TEXT columns don't accept null bytes in UTF-8 encoding, which causes
    asyncpg.exceptions.CharacterNotInRepertoireError when data with null bytes is inserted.

    This function sanitizes:
    - Strings: removes all null bytes
    - Dicts: recursively sanitizes all values, and strips null bytes from string keys
    - Lists: recursively sanitizes all elements
    - Tuples: recursively sanitizes all elements (returned as a plain tuple)
    - Other types: returned as-is (including bytes)

    Containers are rebuilt rather than mutated, so the caller's input is left untouched.

    Args:
        value: The value to sanitize

    Returns:
        The sanitized value with null bytes removed from all strings
    """
    if isinstance(value, str):
        return value.replace("\x00", "")
    if isinstance(value, dict):
        # Keys end up in the stored payload just like values do, so string keys
        # must be cleaned as well. Non-string keys are kept unchanged. If two keys
        # differ only by null bytes they collapse into one and the later entry wins.
        return {
            (key.replace("\x00", "") if isinstance(key, str) else key): sanitize_null_bytes(item)
            for key, item in value.items()
        }
    if isinstance(value, list):
        return [sanitize_null_bytes(item) for item in value]
    if isinstance(value, tuple):
        return tuple(sanitize_null_bytes(item) for item in value)
    # int, float, bool, None, bytes, and any other type pass through unchanged.
    return value
def json_loads(data):
@@ -8,15 +44,33 @@ def json_loads(data):
def json_dumps(data, indent=2) -> str:
    """Serialize data to JSON string, sanitizing null bytes to prevent PostgreSQL errors.

    PostgreSQL TEXT columns reject null bytes (0x00) in UTF-8 encoding. This function
    passes the data through sanitize_null_bytes() before JSON serialization to prevent
    asyncpg.exceptions.CharacterNotInRepertoireError.

    Values that json cannot encode natively are handled as follows:
    - datetime: emitted as an ISO 8601 string
    - bytes: decoded as UTF-8 with null bytes removed; if the bytes are not valid
      UTF-8 they are emitted as a base64 string instead

    Args:
        data: The data to serialize
        indent: JSON indentation level (default: 2)

    Returns:
        JSON string built from the sanitized data

    Raises:
        TypeError: If the data contains an object of an unsupported type
    """
    # Sanitize null bytes before serialization to prevent PostgreSQL errors
    sanitized_data = sanitize_null_bytes(data)

    def safe_serializer(obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            try:
                decoded = obj.decode("utf-8")
            except UnicodeDecodeError:
                # TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini
                return base64.b64encode(obj).decode("utf-8")
            # bytes are not touched by the up-front sanitization pass, so the
            # decoded text has to be cleaned here.
            return decoded.replace("\x00", "")
        raise TypeError(f"Type {type(obj)} not serializable")

    return json.dumps(sanitized_data, indent=indent, default=safe_serializer, ensure_ascii=False)