From ccafc6bef429d51a58a412dcacc0f1b1109f0b60 Mon Sep 17 00:00:00 2001
From: Kian Jones <11655409+kianjones9@users.noreply.github.com>
Date: Wed, 19 Nov 2025 13:50:53 -0800
Subject: [PATCH] feat: out of band diagnostics on local health check (#6264)

* move to not ready immediately if there's a failure.

* another script in container
---
 Dockerfile              | 10 ++++-
 diagnostic_monitor.sh   | 81 +++++++++++++++++++++++++++++++++++++++++
 letta/server/startup.sh |  9 ++++-
 3 files changed, 97 insertions(+), 3 deletions(-)
 create mode 100644 diagnostic_monitor.sh

diff --git a/Dockerfile b/Dockerfile
index 7ac54191..652c0489 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -36,7 +36,9 @@ COPY pyproject.toml uv.lock ./
 # Then copy the rest of the application code
 COPY . .
 
-RUN uv sync --frozen --no-dev --all-extras --python 3.11
+RUN uv sync --frozen --no-dev --all-extras --python 3.11 && \
+    # Install py-spy for Python stack trace diagnostics
+    uv pip install py-spy
 
 # Runtime stage
 FROM ankane/pgvector:v0.5.1 AS runtime
@@ -47,6 +49,8 @@ ARG NODE_VERSION=22
 RUN apt-get update && \
     # Install curl, Python, and PostgreSQL client libraries
     apt-get install -y curl python3 python3-venv libpq-dev && \
+    # Install diagnostic tools (gdb for stack traces, procps for ps/top, net-tools for netstat)
+    apt-get install -y gdb procps net-tools && \
     # Install Node.js
     curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - && \
     apt-get install -y nodejs && \
@@ -82,6 +86,10 @@ COPY --from=builder /app .
# Copy initialization SQL if it exists COPY init.sql /docker-entrypoint-initdb.d/ +# Copy diagnostic monitor script +COPY diagnostic_monitor.sh /app/diagnostic_monitor.sh +RUN chmod +x /app/diagnostic_monitor.sh + EXPOSE 8283 5432 4317 4318 ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] diff --git a/diagnostic_monitor.sh b/diagnostic_monitor.sh new file mode 100644 index 00000000..353428a5 --- /dev/null +++ b/diagnostic_monitor.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Diagnostic monitor that runs in background and dumps state when health checks fail + +HEALTH_URL="http://localhost:8083/v1/health/" +CHECK_INTERVAL=5 +CONSECUTIVE_FAILURES=0 +DIAGNOSTIC_TRIGGERED=false + +echo "[DIAGNOSTIC_MONITOR] Started monitoring $HEALTH_URL every ${CHECK_INTERVAL}s" + +while true; do + sleep $CHECK_INTERVAL + + # Try health check with 10s timeout (same as K8s probe) + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$HEALTH_URL" 2>/dev/null) + + if [ "$HTTP_CODE" = "200" ]; then + # Health check passed + if [ $CONSECUTIVE_FAILURES -gt 0 ]; then + echo "[DIAGNOSTIC_MONITOR] Health check recovered after $CONSECUTIVE_FAILURES failures" + fi + CONSECUTIVE_FAILURES=0 + DIAGNOSTIC_TRIGGERED=false + else + # Health check failed + CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1)) + echo "[DIAGNOSTIC_MONITOR] Health check FAILED (attempt $CONSECUTIVE_FAILURES, HTTP code: $HTTP_CODE)" + + # Trigger diagnostics after 2 consecutive failures (before K8s kills us) + if [ $CONSECUTIVE_FAILURES -ge 2 ] && [ "$DIAGNOSTIC_TRIGGERED" = false ]; then + DIAGNOSTIC_TRIGGERED=true + TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S") + echo "[DIAGNOSTIC_MONITOR] ====== TRIGGERING DIAGNOSTICS at $TIMESTAMP ======" + + # Dump Python process info + echo "[DIAGNOSTIC_MONITOR] === Process Tree ===" + ps auxf 2>&1 || echo "ps failed" + + echo "[DIAGNOSTIC_MONITOR] === Python Process Details ===" + PYTHON_PID=$(pgrep -f "python.*server" | head -1) + if [ -n "$PYTHON_PID" ]; then + echo "Python PID: 
$PYTHON_PID" + echo "--- /proc/$PYTHON_PID/status ---" + cat /proc/$PYTHON_PID/status 2>&1 || echo "Cannot read status" + + echo "--- Thread count ---" + ls /proc/$PYTHON_PID/task 2>&1 | wc -l || echo "Cannot count threads" + + echo "--- Open files ---" + ls -l /proc/$PYTHON_PID/fd 2>&1 | wc -l || echo "Cannot count fds" + + echo "--- Thread stack traces (via py-spy if available) ---" + if command -v py-spy &> /dev/null; then + timeout 5 py-spy dump --pid $PYTHON_PID 2>&1 || echo "py-spy failed" + else + echo "py-spy not installed" + fi + + echo "--- System stack trace (via pstack/gdb if available) ---" + if command -v gdb &> /dev/null; then + timeout 5 gdb -batch -ex "thread apply all bt" -p $PYTHON_PID 2>&1 || echo "gdb failed" + else + echo "gdb not available" + fi + else + echo "Could not find Python process" + fi + + echo "[DIAGNOSTIC_MONITOR] === Network/Port Status ===" + netstat -tlnp 2>&1 | grep 8083 || echo "netstat failed" + + echo "[DIAGNOSTIC_MONITOR] === Memory Info ===" + free -h 2>&1 || echo "free failed" + + echo "[DIAGNOSTIC_MONITOR] === Recent Logs (last 100 lines) ===" + tail -100 /proc/$PYTHON_PID/fd/1 2>&1 || echo "Cannot read stdout" + + echo "[DIAGNOSTIC_MONITOR] ====== DIAGNOSTICS COMPLETE ======" + fi + fi +done diff --git a/letta/server/startup.sh b/letta/server/startup.sh index 5d8d736a..66509798 100755 --- a/letta/server/startup.sh +++ b/letta/server/startup.sh @@ -68,11 +68,16 @@ fi /usr/local/bin/otelcol-contrib --config "$CONFIG_FILE" & OTEL_PID=$! +# Start diagnostic monitor in background +echo "Starting diagnostic monitor..." +/app/diagnostic_monitor.sh 2>&1 | while IFS= read -r line; do echo "[DIAG] $line"; done & +DIAG_PID=$! + # Function to cleanup processes on exit cleanup() { echo "Shutting down..." - kill $OTEL_PID - wait $OTEL_PID + kill $OTEL_PID $DIAG_PID 2>/dev/null + wait $OTEL_PID $DIAG_PID 2>/dev/null } trap cleanup EXIT