feat: out of band diagnostics on local health check (#6264)

* move to not ready immediately if there's a failure. * another script in container
2025-11-19 13:50:53 -08:00
parent 3de712fce0
commit ccafc6bef4
3 changed files with 97 additions and 3 deletions
--- a/10
+++ b/10
@@ -36,7 +36,9 @@ COPY pyproject.toml uv.lock ./
 # Then copy the rest of the application code
 COPY . .

-RUN uv sync --frozen --no-dev --all-extras --python 3.11
+RUN uv sync --frozen --no-dev --all-extras --python 3.11 && \
+    # Install py-spy for Python stack trace diagnostics
+    uv pip install py-spy

 # Runtime stage
 FROM ankane/pgvector:v0.5.1 AS runtime
@@ -47,6 +49,8 @@ ARG NODE_VERSION=22
 RUN apt-get update && \
    # Install curl, Python, and PostgreSQL client libraries
    apt-get install -y curl python3 python3-venv libpq-dev && \
+    # Install diagnostic tools (gdb for stack traces, procps for ps/top, net-tools for netstat)
+    apt-get install -y gdb procps net-tools && \
    # Install Node.js
    curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - && \
    apt-get install -y nodejs && \
@@ -82,6 +86,10 @@ COPY --from=builder /app .
 # Copy initialization SQL if it exists
 COPY init.sql /docker-entrypoint-initdb.d/

+# Copy diagnostic monitor script
+COPY diagnostic_monitor.sh /app/diagnostic_monitor.sh
+RUN chmod +x /app/diagnostic_monitor.sh
+
 EXPOSE 8283 5432 4317 4318

 ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
--- a/diagnostic_monitor.sh
+++ b/diagnostic_monitor.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# Diagnostic monitor: polls the health endpoint and, after 2 consecutive failed
+# probes, dumps process/stack/network/memory state to stdout (once per outage).
+HEALTH_URL="http://localhost:8083/v1/health/"  # NOTE(review): Dockerfile EXPOSEs 8283, not 8083 - confirm port
+CHECK_INTERVAL=5
+CONSECUTIVE_FAILURES=0
+DIAGNOSTIC_TRIGGERED=false
+
+echo "[DIAGNOSTIC_MONITOR] Started monitoring $HEALTH_URL every ${CHECK_INTERVAL}s"
+
+while true; do
+    sleep "$CHECK_INTERVAL"
+
+    # Try health check with 10s timeout (same as K8s probe). curl reports the
+    # code as 000 when no HTTP response was received (refused, timed out, ...).
+    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$HEALTH_URL" 2>/dev/null)
+    if [ "$HTTP_CODE" = "200" ]; then
+        # Health check passed: reset the counter and re-arm the one-shot dump
+        if [ "$CONSECUTIVE_FAILURES" -gt 0 ]; then
+            echo "[DIAGNOSTIC_MONITOR] Health check recovered after $CONSECUTIVE_FAILURES failures"
+        fi
+        CONSECUTIVE_FAILURES=0
+        DIAGNOSTIC_TRIGGERED=false
+    else
+        # Health check failed
+        CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
+        echo "[DIAGNOSTIC_MONITOR] Health check FAILED (attempt $CONSECUTIVE_FAILURES, HTTP code: $HTTP_CODE)"
+
+        # Trigger diagnostics after 2 consecutive failures (before K8s kills us)
+        if [ "$CONSECUTIVE_FAILURES" -ge 2 ] && [ "$DIAGNOSTIC_TRIGGERED" = false ]; then
+            DIAGNOSTIC_TRIGGERED=true
+            TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S")
+            echo "[DIAGNOSTIC_MONITOR] ====== TRIGGERING DIAGNOSTICS at $TIMESTAMP ======"
+
+            # Dump process and system state
+            echo "[DIAGNOSTIC_MONITOR] === Process Tree ==="
+            ps auxf 2>&1 || echo "ps failed"
+
+            echo "[DIAGNOSTIC_MONITOR] === Python Process Details ==="
+            PYTHON_PID=$(pgrep -f "python.*server" | head -1)
+            if [ -n "$PYTHON_PID" ]; then
+                echo "Python PID: $PYTHON_PID"
+                echo "--- /proc/$PYTHON_PID/status ---"
+                cat "/proc/$PYTHON_PID/status" 2>&1 || echo "Cannot read status"
+                # pipefail (scoped to the subshell) lets a failed ls reach the || fallback
+                echo "--- Thread count ---"
+                (set -o pipefail; ls "/proc/$PYTHON_PID/task" 2>/dev/null | wc -l) || echo "Cannot count threads"
+                # Plain ls: `ls -l` would add a "total" line to the count
+                echo "--- Open files ---"
+                (set -o pipefail; ls "/proc/$PYTHON_PID/fd" 2>/dev/null | wc -l) || echo "Cannot count fds"
+
+                echo "--- Thread stack traces (via py-spy if available) ---"
+                if command -v py-spy &> /dev/null; then
+                    timeout 5 py-spy dump --pid "$PYTHON_PID" 2>&1 || echo "py-spy failed"
+                else
+                    echo "py-spy not installed"
+                fi
+
+                echo "--- System stack trace (via pstack/gdb if available) ---"
+                if command -v gdb &> /dev/null; then
+                    timeout 5 gdb -batch -ex "thread apply all bt" -p "$PYTHON_PID" 2>&1 || echo "gdb failed"
+                else
+                    echo "gdb not available"
+                fi
+            else
+                echo "Could not find Python process"
+            fi
+
+            echo "[DIAGNOSTIC_MONITOR] === Network/Port Status ==="
+            netstat -tlnp 2>&1 | grep -E ':8083[[:space:]]' || echo "No listener found on port 8083 (or netstat failed)"
+
+            echo "[DIAGNOSTIC_MONITOR] === Memory Info ==="
+            free -h 2>&1 || echo "free failed"
+
+            echo "[DIAGNOSTIC_MONITOR] === Recent Logs (last 100 lines) ==="
+            # Regular files only: on a pipe/tty tail would block and consume the real reader's data
+            { [ -f "/proc/$PYTHON_PID/fd/1" ] && tail -n 100 "/proc/$PYTHON_PID/fd/1" 2>&1; } || echo "Cannot read stdout"
+            echo "[DIAGNOSTIC_MONITOR] ====== DIAGNOSTICS COMPLETE ======"
+        fi
+    fi
+done
--- a/letta/server/startup.sh
+++ b/letta/server/startup.sh
@@ -68,11 +68,16 @@ fi
 /usr/local/bin/otelcol-contrib --config "$CONFIG_FILE" &
 OTEL_PID=$!

+# Start diagnostic monitor in background, prefixing each of its output lines with [DIAG]
+echo "Starting diagnostic monitor..."
+/app/diagnostic_monitor.sh 2>&1 | while IFS= read -r line; do echo "[DIAG] $line"; done &
+DIAG_PID=$!  # NOTE(review): $! is the last pipeline stage (the prefixing loop), not the monitor script - confirm intended
+
 # Function to cleanup processes on exit
 cleanup() {
    echo "Shutting down..."
-    kill $OTEL_PID
-    wait $OTEL_PID
+    kill $OTEL_PID $DIAG_PID 2>/dev/null  # signal both background jobs; error output is discarded
+    wait $OTEL_PID $DIAG_PID 2>/dev/null  # block until both PIDs have been reaped
 }
 trap cleanup EXIT