From dbdd1a40e45bbeda9132d4b80be1663541590d66 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 28 Dec 2025 15:01:21 -0500 Subject: [PATCH] fix: sanitize null bytes to prevent PostgreSQL CharacterNotInRepertoireError (#8015) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the asyncpg.exceptions.CharacterNotInRepertoireError that occurs when tool returns contain null bytes (0x00), which PostgreSQL TEXT columns reject in UTF-8 encoding. Changes: - Add sanitize_null_bytes() function to recursively remove null bytes from strings - Update json_dumps() to sanitize data before serialization - Apply sanitization in converters.py for tool_calls, tool_returns, approvals, and message_content - Add comprehensive unit tests Fixes #8014 🤖 Generated with [Letta Code](https://letta.com) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Letta Co-authored-by: Kian Jones <11655409+kianjones9@users.noreply.github.com> --- letta/helpers/converters.py | 54 ++++++++++++++----- letta/helpers/json_helpers.py | 58 +++++++++++++++++++- tests/test_utils.py | 99 +++++++++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+), 15 deletions(-) diff --git a/letta/helpers/converters.py b/letta/helpers/converters.py index f9befaa9..424dad81 100644 --- a/letta/helpers/converters.py +++ b/letta/helpers/converters.py @@ -6,6 +6,10 @@ from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMe from sqlalchemy import Dialect from letta.functions.mcp_client.types import StdioServerConfig +from letta.helpers.json_helpers import sanitize_null_bytes +from letta.log import get_logger + +logger = get_logger(__name__) from letta.schemas.embedding_config import EmbeddingConfig from letta.schemas.enums import ProviderType, ToolRuleType from letta.schemas.letta_message import ApprovalReturn, 
MessageReturnType @@ -184,16 +188,22 @@ def deserialize_tool_rule( def serialize_tool_calls(tool_calls: Optional[List[Union[OpenAIToolCall, dict]]]) -> List[Dict]: - """Convert a list of OpenAI ToolCall objects into JSON-serializable format.""" + """Convert a list of OpenAI ToolCall objects into JSON-serializable format. + + Note: Tool call arguments may contain null bytes from various sources. + These are sanitized to prevent PostgreSQL errors. + """ if not tool_calls: return [] serialized_calls = [] for call in tool_calls: if isinstance(call, OpenAIToolCall): - serialized_calls.append(call.model_dump(mode="json")) + # Sanitize null bytes from tool call data to prevent PostgreSQL errors + serialized_calls.append(sanitize_null_bytes(call.model_dump(mode="json"))) elif isinstance(call, dict): - serialized_calls.append(call) # Already a dictionary, leave it as-is + # Sanitize null bytes from dictionary data + serialized_calls.append(sanitize_null_bytes(call)) else: raise TypeError(f"Unexpected tool call type: {type(call)}") @@ -221,16 +231,22 @@ def deserialize_tool_calls(data: Optional[List[Dict]]) -> List[OpenAIToolCall]: def serialize_tool_returns(tool_returns: Optional[List[Union[ToolReturn, dict]]]) -> List[Dict]: - """Convert a list of ToolReturn objects into JSON-serializable format.""" + """Convert a list of ToolReturn objects into JSON-serializable format. + + Note: Tool returns may contain null bytes from sandbox execution or binary data. + These are sanitized to prevent PostgreSQL errors. 
+ """ if not tool_returns: return [] serialized_tool_returns = [] for tool_return in tool_returns: if isinstance(tool_return, ToolReturn): - serialized_tool_returns.append(tool_return.model_dump(mode="json")) + # Sanitize null bytes from tool return data to prevent PostgreSQL errors + serialized_tool_returns.append(sanitize_null_bytes(tool_return.model_dump(mode="json"))) elif isinstance(tool_return, dict): - serialized_tool_returns.append(tool_return) # Already a dictionary, leave it as-is + # Sanitize null bytes from dictionary data + serialized_tool_returns.append(sanitize_null_bytes(tool_return)) else: raise TypeError(f"Unexpected tool return type: {type(tool_return)}") @@ -256,18 +272,24 @@ def deserialize_tool_returns(data: Optional[List[Dict]]) -> List[ToolReturn]: def serialize_approvals(approvals: Optional[List[Union[ApprovalReturn, ToolReturn, dict]]]) -> List[Dict]: - """Convert a list of ToolReturn objects into JSON-serializable format.""" + """Convert a list of ApprovalReturn or ToolReturn objects into JSON-serializable format. + + Note: Approval data may contain null bytes from various sources. + These are sanitized to prevent PostgreSQL errors. 
+ """ if not approvals: return [] serialized_approvals = [] for approval in approvals: if isinstance(approval, ApprovalReturn): - serialized_approvals.append(approval.model_dump(mode="json")) + # Sanitize null bytes from approval data to prevent PostgreSQL errors + serialized_approvals.append(sanitize_null_bytes(approval.model_dump(mode="json"))) elif isinstance(approval, ToolReturn): - serialized_approvals.append(approval.model_dump(mode="json")) + serialized_approvals.append(sanitize_null_bytes(approval.model_dump(mode="json"))) elif isinstance(approval, dict): - serialized_approvals.append(approval) # Already a dictionary, leave it as-is + # Sanitize null bytes from dictionary data + serialized_approvals.append(sanitize_null_bytes(approval)) else: raise TypeError(f"Unexpected approval type: {type(approval)}") @@ -318,7 +340,11 @@ def deserialize_approvals(data: Optional[List[Dict]]) -> List[Union[ApprovalRetu def serialize_message_content(message_content: Optional[List[Union[MessageContent, dict]]]) -> List[Dict]: - """Convert a list of MessageContent objects into JSON-serializable format.""" + """Convert a list of MessageContent objects into JSON-serializable format. + + Note: Message content may contain null bytes from various sources. + These are sanitized to prevent PostgreSQL errors. 
+ """ if not message_content: return [] @@ -327,9 +353,11 @@ def serialize_message_content(message_content: Optional[List[Union[MessageConten if isinstance(content, MessageContent): if content.type == MessageContentType.image: assert content.source.type == ImageSourceType.letta, f"Invalid image source type: {content.source.type}" - serialized_message_content.append(content.model_dump(mode="json")) + # Sanitize null bytes from message content to prevent PostgreSQL errors + serialized_message_content.append(sanitize_null_bytes(content.model_dump(mode="json"))) elif isinstance(content, dict): - serialized_message_content.append(content) # Already a dictionary, leave it as-is + # Sanitize null bytes from dictionary data + serialized_message_content.append(sanitize_null_bytes(content)) else: raise TypeError(f"Unexpected message content type: {type(content)}") return serialized_message_content diff --git a/letta/helpers/json_helpers.py b/letta/helpers/json_helpers.py index ff6943b8..aba17d05 100644 --- a/letta/helpers/json_helpers.py +++ b/letta/helpers/json_helpers.py @@ -1,6 +1,42 @@ import base64 import json from datetime import datetime +from typing import Any + + +def sanitize_null_bytes(value: Any) -> Any: + """Recursively remove null bytes (0x00) from strings. + + PostgreSQL TEXT columns don't accept null bytes in UTF-8 encoding, which causes + asyncpg.exceptions.CharacterNotInRepertoireError when data with null bytes is inserted. 
+ + This function sanitizes: + - Strings: removes all null bytes + - Dicts: recursively sanitizes all string keys and values + - Lists and tuples: recursively sanitizes all elements + - Other types: returned as-is + + Args: + value: The value to sanitize + + Returns: + The sanitized value with null bytes removed from all strings + """ + if isinstance(value, str): + # Remove null bytes from strings + return value.replace("\x00", "") + elif isinstance(value, dict): + # Sanitize keys as well as values: a null byte in a key is rejected by PostgreSQL just like one in a value + return {sanitize_null_bytes(k): sanitize_null_bytes(v) for k, v in value.items()} + elif isinstance(value, list): + # Recursively sanitize list elements + return [sanitize_null_bytes(item) for item in value] + elif isinstance(value, tuple): + # Recursively sanitize tuple elements (return as tuple) + return tuple(sanitize_null_bytes(item) for item in value) + else: + # Return other types as-is (int, float, bool, None, etc.) + return value def json_loads(data): @@ -8,15 +44,33 @@ def json_loads(data): def json_dumps(data, indent=2) -> str: + """Serialize data to JSON string, sanitizing null bytes to prevent PostgreSQL errors. + + PostgreSQL TEXT columns reject null bytes (0x00) in UTF-8 encoding. This function + sanitizes all strings in the data structure before JSON serialization to prevent + asyncpg.exceptions.CharacterNotInRepertoireError. 
+ + Args: + data: The data to serialize + indent: JSON indentation level (default: 2) + + Returns: + JSON string with null bytes removed from all string values + """ + # Sanitize null bytes before serialization to prevent PostgreSQL errors + sanitized_data = sanitize_null_bytes(data) + def safe_serializer(obj): if isinstance(obj, datetime): return obj.isoformat() if isinstance(obj, bytes): try: - return obj.decode("utf-8") + decoded = obj.decode("utf-8") + # Also sanitize decoded bytes + return decoded.replace("\x00", "") except Exception: # TODO: this is to handle Gemini thought signatures, b64 decode this back to bytes when sending back to Gemini return base64.b64encode(obj).decode("utf-8") raise TypeError(f"Type {type(obj)} not serializable") - return json.dumps(data, indent=indent, default=safe_serializer, ensure_ascii=False) + return json.dumps(sanitized_data, indent=indent, default=safe_serializer, ensure_ascii=False) diff --git a/tests/test_utils.py b/tests/test_utils.py index 3d16039e..37114aa7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -678,3 +678,102 @@ def test_sdk_version_check(): assert is_1_0_sdk_version(HeaderParams(sdk_version="v1.0.0-alpha.7")) assert is_1_0_sdk_version(HeaderParams(sdk_version="v1.0.0a7")) assert is_1_0_sdk_version(HeaderParams(sdk_version="v2.0.0")) + + +# ---------------------- sanitize_null_bytes TESTS ---------------------- # + + +def test_sanitize_null_bytes_string(): + """Test that null bytes are removed from strings""" + from letta.helpers.json_helpers import sanitize_null_bytes + + # Test basic null byte removal + assert sanitize_null_bytes("hello\x00world") == "helloworld" + + # Test multiple null bytes + assert sanitize_null_bytes("a\x00b\x00c") == "abc" + + # Test null byte at beginning + assert sanitize_null_bytes("\x00hello") == "hello" + + # Test null byte at end + assert sanitize_null_bytes("hello\x00") == "hello" + + # Test string without null bytes + assert sanitize_null_bytes("hello world") == 
"hello world" + + # Test empty string + assert sanitize_null_bytes("") == "" + + +def test_sanitize_null_bytes_dict(): + """Test that null bytes are removed from dictionary values""" + from letta.helpers.json_helpers import sanitize_null_bytes + + # Test nested dict with null bytes + result = sanitize_null_bytes({ + "key1": "value\x00with\x00nulls", + "key2": {"nested": "also\x00null"}, + "key3": 123, # non-string should be unchanged + }) + assert result == { + "key1": "valuewithnulls", + "key2": {"nested": "alsonull"}, + "key3": 123, + } + + +def test_sanitize_null_bytes_list(): + """Test that null bytes are removed from list elements""" + from letta.helpers.json_helpers import sanitize_null_bytes + + result = sanitize_null_bytes(["hello\x00world", "no nulls", {"nested\x00key": "value\x00"}]) + assert result == ["helloworld", "no nulls", {"nestedkey": "value"}] + + +def test_sanitize_null_bytes_tuple(): + """Test that null bytes are removed from tuple elements""" + from letta.helpers.json_helpers import sanitize_null_bytes + + result = sanitize_null_bytes(("hello\x00world", "no nulls")) + assert result == ("helloworld", "no nulls") + + +def test_sanitize_null_bytes_preserves_other_types(): + """Test that non-string types are preserved unchanged""" + from letta.helpers.json_helpers import sanitize_null_bytes + + assert sanitize_null_bytes(123) == 123 + assert sanitize_null_bytes(3.14) == 3.14 + assert sanitize_null_bytes(True) is True + assert sanitize_null_bytes(False) is False + assert sanitize_null_bytes(None) is None + + +def test_json_dumps_sanitizes_null_bytes(): + """Test that json_dumps sanitizes null bytes before serialization""" + from letta.helpers.json_helpers import json_dumps + + # Test that null bytes are removed from the output + result = json_dumps({"message": "hello\x00world"}) + assert "\x00" not in result + assert "helloworld" in result + + +def test_json_dumps_with_complex_nested_null_bytes(): + """Test that json_dumps handles complex nested 
structures with null bytes""" + from letta.helpers.json_helpers import json_dumps + + data = { + "tool_return": { + "status": "success", + "func_response": "Binary\x00data\x00here", + }, + "content": [ + {"type": "text", "text": "Message\x00with\x00nulls"}, + ], + } + result = json_dumps(data) + assert "\x00" not in result + assert "Binarydatahere" in result + assert "Messagewithnulls" in result