fix: sanitize null bytes to prevent PostgreSQL CharacterNotInRepertoireError (#8015)
This fixes the asyncpg.exceptions.CharacterNotInRepertoireError raised when tool returns contain null bytes (0x00), which PostgreSQL rejects in UTF-8 encoded TEXT columns.

Changes:
- Add sanitize_null_bytes() function to recursively remove null bytes from strings
- Update json_dumps() to sanitize data before serialization
- Apply sanitization in converters.py for tool_calls, tool_returns, approvals, and message_content
- Add comprehensive unit tests

Fixes #8014

🤖 Generated with [Letta Code](https://letta.com)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Letta <noreply@letta.com>
Co-authored-by: Kian Jones <11655409+kianjones9@users.noreply.github.com>
This commit is contained in:
committed by
Caren Thomas
parent
d5decc2a27
commit
dbdd1a40e4
@@ -6,6 +6,10 @@ from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMe
|
||||
from sqlalchemy import Dialect
|
||||
|
||||
from letta.functions.mcp_client.types import StdioServerConfig
|
||||
from letta.helpers.json_helpers import sanitize_null_bytes
|
||||
from letta.log import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
from letta.schemas.embedding_config import EmbeddingConfig
|
||||
from letta.schemas.enums import ProviderType, ToolRuleType
|
||||
from letta.schemas.letta_message import ApprovalReturn, MessageReturnType
|
||||
@@ -184,16 +188,22 @@ def deserialize_tool_rule(
|
||||
|
||||
|
||||
def serialize_tool_calls(tool_calls: Optional[List[Union[OpenAIToolCall, dict]]]) -> List[Dict]:
|
||||
"""Convert a list of OpenAI ToolCall objects into JSON-serializable format."""
|
||||
"""Convert a list of OpenAI ToolCall objects into JSON-serializable format.
|
||||
|
||||
Note: Tool call arguments may contain null bytes from various sources.
|
||||
These are sanitized to prevent PostgreSQL errors.
|
||||
"""
|
||||
if not tool_calls:
|
||||
return []
|
||||
|
||||
serialized_calls = []
|
||||
for call in tool_calls:
|
||||
if isinstance(call, OpenAIToolCall):
|
||||
serialized_calls.append(call.model_dump(mode="json"))
|
||||
# Sanitize null bytes from tool call data to prevent PostgreSQL errors
|
||||
serialized_calls.append(sanitize_null_bytes(call.model_dump(mode="json")))
|
||||
elif isinstance(call, dict):
|
||||
serialized_calls.append(call) # Already a dictionary, leave it as-is
|
||||
# Sanitize null bytes from dictionary data
|
||||
serialized_calls.append(sanitize_null_bytes(call))
|
||||
else:
|
||||
raise TypeError(f"Unexpected tool call type: {type(call)}")
|
||||
|
||||
@@ -221,16 +231,22 @@ def deserialize_tool_calls(data: Optional[List[Dict]]) -> List[OpenAIToolCall]:
|
||||
|
||||
|
||||
def serialize_tool_returns(tool_returns: Optional[List[Union[ToolReturn, dict]]]) -> List[Dict]:
|
||||
"""Convert a list of ToolReturn objects into JSON-serializable format."""
|
||||
"""Convert a list of ToolReturn objects into JSON-serializable format.
|
||||
|
||||
Note: Tool returns may contain null bytes from sandbox execution or binary data.
|
||||
These are sanitized to prevent PostgreSQL errors.
|
||||
"""
|
||||
if not tool_returns:
|
||||
return []
|
||||
|
||||
serialized_tool_returns = []
|
||||
for tool_return in tool_returns:
|
||||
if isinstance(tool_return, ToolReturn):
|
||||
serialized_tool_returns.append(tool_return.model_dump(mode="json"))
|
||||
# Sanitize null bytes from tool return data to prevent PostgreSQL errors
|
||||
serialized_tool_returns.append(sanitize_null_bytes(tool_return.model_dump(mode="json")))
|
||||
elif isinstance(tool_return, dict):
|
||||
serialized_tool_returns.append(tool_return) # Already a dictionary, leave it as-is
|
||||
# Sanitize null bytes from dictionary data
|
||||
serialized_tool_returns.append(sanitize_null_bytes(tool_return))
|
||||
else:
|
||||
raise TypeError(f"Unexpected tool return type: {type(tool_return)}")
|
||||
|
||||
@@ -256,18 +272,24 @@ def deserialize_tool_returns(data: Optional[List[Dict]]) -> List[ToolReturn]:
|
||||
|
||||
|
||||
def serialize_approvals(approvals: Optional[List[Union[ApprovalReturn, ToolReturn, dict]]]) -> List[Dict]:
|
||||
"""Convert a list of ToolReturn objects into JSON-serializable format."""
|
||||
"""Convert a list of ToolReturn objects into JSON-serializable format.
|
||||
|
||||
Note: Approval data may contain null bytes from various sources.
|
||||
These are sanitized to prevent PostgreSQL errors.
|
||||
"""
|
||||
if not approvals:
|
||||
return []
|
||||
|
||||
serialized_approvals = []
|
||||
for approval in approvals:
|
||||
if isinstance(approval, ApprovalReturn):
|
||||
serialized_approvals.append(approval.model_dump(mode="json"))
|
||||
# Sanitize null bytes from approval data to prevent PostgreSQL errors
|
||||
serialized_approvals.append(sanitize_null_bytes(approval.model_dump(mode="json")))
|
||||
elif isinstance(approval, ToolReturn):
|
||||
serialized_approvals.append(approval.model_dump(mode="json"))
|
||||
serialized_approvals.append(sanitize_null_bytes(approval.model_dump(mode="json")))
|
||||
elif isinstance(approval, dict):
|
||||
serialized_approvals.append(approval) # Already a dictionary, leave it as-is
|
||||
# Sanitize null bytes from dictionary data
|
||||
serialized_approvals.append(sanitize_null_bytes(approval))
|
||||
else:
|
||||
raise TypeError(f"Unexpected approval type: {type(approval)}")
|
||||
|
||||
@@ -318,7 +340,11 @@ def deserialize_approvals(data: Optional[List[Dict]]) -> List[Union[ApprovalRetu
|
||||
|
||||
|
||||
def serialize_message_content(message_content: Optional[List[Union[MessageContent, dict]]]) -> List[Dict]:
|
||||
"""Convert a list of MessageContent objects into JSON-serializable format."""
|
||||
"""Convert a list of MessageContent objects into JSON-serializable format.
|
||||
|
||||
Note: Message content may contain null bytes from various sources.
|
||||
These are sanitized to prevent PostgreSQL errors.
|
||||
"""
|
||||
if not message_content:
|
||||
return []
|
||||
|
||||
@@ -327,9 +353,11 @@ def serialize_message_content(message_content: Optional[List[Union[MessageConten
|
||||
if isinstance(content, MessageContent):
|
||||
if content.type == MessageContentType.image:
|
||||
assert content.source.type == ImageSourceType.letta, f"Invalid image source type: {content.source.type}"
|
||||
serialized_message_content.append(content.model_dump(mode="json"))
|
||||
# Sanitize null bytes from message content to prevent PostgreSQL errors
|
||||
serialized_message_content.append(sanitize_null_bytes(content.model_dump(mode="json")))
|
||||
elif isinstance(content, dict):
|
||||
serialized_message_content.append(content) # Already a dictionary, leave it as-is
|
||||
# Sanitize null bytes from dictionary data
|
||||
serialized_message_content.append(sanitize_null_bytes(content))
|
||||
else:
|
||||
raise TypeError(f"Unexpected message content type: {type(content)}")
|
||||
return serialized_message_content
|
||||
|
||||
@@ -1,6 +1,42 @@
|
||||
import base64
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
|
||||
def sanitize_null_bytes(value: Any) -> Any:
    """Recursively remove null bytes (0x00) from strings.

    PostgreSQL TEXT columns don't accept null bytes in UTF-8 encoding, which causes
    asyncpg.exceptions.CharacterNotInRepertoireError when data with null bytes is inserted.

    This function sanitizes:
    - Strings: removes all null bytes
    - Dicts: recursively sanitizes all values, and strips null bytes from string keys
      (keys that differ only by null bytes collapse into one entry; the last one wins)
    - Lists: recursively sanitizes all elements
    - Tuples: recursively sanitizes all elements and returns a new tuple
    - Other types: returned as-is

    Containers are rebuilt rather than modified in place, so the input is never mutated.

    Args:
        value: The value to sanitize

    Returns:
        The sanitized value with null bytes removed from all strings
    """
    if isinstance(value, str):
        return value.replace("\x00", "")

    if isinstance(value, dict):
        # Keys end up in the stored payload just like values do, so a null byte
        # in a string key would trigger the same database error.
        return {
            (key.replace("\x00", "") if isinstance(key, str) else key): sanitize_null_bytes(item)
            for key, item in value.items()
        }

    if isinstance(value, list):
        return [sanitize_null_bytes(item) for item in value]

    if isinstance(value, tuple):
        return tuple(sanitize_null_bytes(item) for item in value)

    # Return other types as-is (int, float, bool, None, bytes, etc.)
    return value
|
||||
|
||||
|
||||
def json_loads(data):
|
||||
@@ -8,15 +44,33 @@ def json_loads(data):
|
||||
|
||||
|
||||
def json_dumps(data, indent=2) -> str:
    """Serialize data to JSON string, sanitizing null bytes to prevent PostgreSQL errors.

    PostgreSQL TEXT columns reject null bytes (0x00) in UTF-8 encoding. This function
    sanitizes all strings in the data structure before JSON serialization to prevent
    asyncpg.exceptions.CharacterNotInRepertoireError.

    Values that ``json`` cannot encode natively are handled as follows:
    - ``datetime``: emitted as an ISO-8601 string
    - ``bytes``: decoded as UTF-8 with null bytes removed; if the bytes are not
      valid UTF-8 they are emitted as a base64 string instead

    Args:
        data: The data to serialize
        indent: JSON indentation level (default: 2)

    Returns:
        JSON string with null bytes removed from all string values

    Raises:
        TypeError: If the data contains an object of an unsupported type
    """
    # Sanitize null bytes before serialization to prevent PostgreSQL errors
    sanitized_data = sanitize_null_bytes(data)

    def safe_serializer(obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            try:
                decoded = obj.decode("utf-8")
            except UnicodeDecodeError:
                # TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini
                return base64.b64encode(obj).decode("utf-8")
            # bytes are not visited by sanitize_null_bytes, so strip null bytes here
            return decoded.replace("\x00", "")
        raise TypeError(f"Type {type(obj)} not serializable")

    # Serialize the sanitized copy, not the raw input
    return json.dumps(sanitized_data, indent=indent, default=safe_serializer, ensure_ascii=False)
|
||||
|
||||
Reference in New Issue
Block a user