fix: resolve crouton telemetry failures (#9269)

Two issues were causing telemetry failures: 1. Startup race - memgpt-server sending telemetry before crouton created socket 2. Oversized payloads - large context windows (1M+ tokens) exceeding buffer Changes: - Increase crouton buffer to 128MB max with lazy allocation (64KB initial) - Bump crouton resources (512Mi limit, 128Mi request) - Add retry with exponential backoff in socket backend - Move crouton to initContainers with restartPolicy: Always for deterministic startup 🐙 Generated with [Letta Code](https://letta.com) Co-authored-by: Letta <noreply@letta.com>
2026-02-03 17:08:20 -08:00
parent eaf64fb510
commit 00b36bc591
2 changed files with 30 additions and 16 deletions
--- a/letta/services/provider_trace_backends/socket.py
+++ b/letta/services/provider_trace_backends/socket.py
@@ -4,6 +4,7 @@ import json
 import os
 import socket as socket_module
 import threading
+import time
 from datetime import datetime, timezone
 from typing import Any

@@ -18,7 +19,8 @@ logger = get_logger(__name__)
 # Bump this when making breaking changes to the record schema.
 # Must match ProtocolVersion in apps/crouton/main.go.
 # v2: Added user_id, compaction_settings (summarization), llm_config (non-summarization)
-PROTOCOL_VERSION = 2
+# v3: Increased buffer to 128MB, native sidecar for deterministic startup
+PROTOCOL_VERSION = 3


 class SocketProviderTraceBackend(ProviderTraceBackendClient):
@@ -106,17 +108,29 @@ class SocketProviderTraceBackend(ProviderTraceBackendClient):
        thread = threading.Thread(target=self._send_async, args=(record,), daemon=True)
        thread.start()

-    def _send_async(self, record: dict[str, Any]) -> None:
+    def _send_async(self, record: dict[str, Any], max_retries: int = 3) -> None:
        """Send record to Unix socket (runs in background thread)."""
-        try:
-            if not os.path.exists(self.socket_path):
-                logger.warning(f"Crouton socket not found at {self.socket_path}")
-                return
+        base_delay = 0.5
+        for attempt in range(max_retries):
+            try:
+                if not os.path.exists(self.socket_path):
+                    if attempt < max_retries - 1:
+                        time.sleep(base_delay * (2**attempt))
+                        continue
+                    logger.warning(f"Crouton socket not found at {self.socket_path} after {max_retries} attempts")
+                    return

-            with socket_module.socket(socket_module.AF_UNIX, socket_module.SOCK_STREAM) as sock:
-                sock.settimeout(5.0)
-                sock.connect(self.socket_path)
-                payload = json.dumps(record, default=str) + "\n"
-                sock.sendall(payload.encode())
-        except Exception as e:
-            logger.warning(f"Failed to send telemetry to Crouton: {e}")
+                with socket_module.socket(socket_module.AF_UNIX, socket_module.SOCK_STREAM) as sock:
+                    sock.settimeout(5.0)
+                    sock.connect(self.socket_path)
+                    payload = json.dumps(record, default=str) + "\n"
+                    sock.sendall(payload.encode())
+                return
+            except BrokenPipeError:
+                if attempt < max_retries - 1:
+                    time.sleep(base_delay * (2**attempt))
+                    continue
+                logger.warning(f"Failed to send telemetry to Crouton: broken pipe after {max_retries} attempts")
+            except Exception as e:
+                logger.warning(f"Failed to send telemetry to Crouton: {e}")
+                return
--- a/tests/test_provider_trace_backends.py
+++ b/tests/test_provider_trace_backends.py
@@ -288,8 +288,8 @@ class TestSocketProviderTraceBackend:
        assert captured_records[0]["error"] == "Rate limit exceeded"
        assert captured_records[0]["response"] is None

-    def test_record_includes_v2_protocol_fields(self):
-        """Test that v2 protocol fields are included in the socket record."""
+    def test_record_includes_v3_protocol_fields(self):
+        """Test that v3 protocol fields are included in the socket record."""
        trace = ProviderTrace(
            request_json={"model": "gpt-4"},
            response_json={"id": "test"},
@@ -312,7 +312,7 @@ class TestSocketProviderTraceBackend:

        assert len(captured_records) == 1
        record = captured_records[0]
-        assert record["protocol_version"] == 2
+        assert record["protocol_version"] == 3
        assert record["org_id"] == "org-456"
        assert record["user_id"] == "user-456"
        assert record["compaction_settings"] == {"mode": "sliding_window"}