feat: out of band diagnostics on local health check (#6264)

* move to not ready immediately if there's a failure.

* another script in container
This commit is contained in:
Kian Jones
2025-11-19 13:50:53 -08:00
committed by Caren Thomas
parent 3de712fce0
commit ccafc6bef4
3 changed files with 97 additions and 3 deletions

View File

@@ -36,7 +36,9 @@ COPY pyproject.toml uv.lock ./
# Then copy the rest of the application code
COPY . .
RUN uv sync --frozen --no-dev --all-extras --python 3.11
RUN uv sync --frozen --no-dev --all-extras --python 3.11 && \
# Install py-spy for Python stack trace diagnostics
# (diagnostic_monitor.sh runs `py-spy dump --pid <pid>` when the health check keeps failing).
# NOTE(review): this install happens before the runtime stage begins - confirm py-spy
# ends up on PATH in the runtime image, otherwise the monitor only logs "py-spy not installed".
uv pip install py-spy
# Runtime stage
FROM ankane/pgvector:v0.5.1 AS runtime
@@ -47,6 +49,8 @@ ARG NODE_VERSION=22
RUN apt-get update && \
# Install curl, Python, and PostgreSQL client libraries
apt-get install -y curl python3 python3-venv libpq-dev && \
# Install diagnostic tools (gdb for stack traces, procps for ps/top, net-tools for netstat)
apt-get install -y gdb procps net-tools && \
# Install Node.js
curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - && \
apt-get install -y nodejs && \
@@ -82,6 +86,10 @@ COPY --from=builder /app .
# Copy initialization SQL if it exists
COPY init.sql /docker-entrypoint-initdb.d/
# Copy diagnostic monitor script
# (the startup script launches it in the background from /app/diagnostic_monitor.sh)
COPY diagnostic_monitor.sh /app/diagnostic_monitor.sh
RUN chmod +x /app/diagnostic_monitor.sh
# NOTE(review): 8283 is exposed here while diagnostic_monitor.sh polls
# http://localhost:8083/v1/health/ - confirm which port the server actually listens on.
EXPOSE 8283 5432 4317 4318
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]

81
diagnostic_monitor.sh Normal file
View File

@@ -0,0 +1,81 @@
#!/bin/bash
# Diagnostic monitor that runs in background and dumps state when health checks fail.
#
# Polls HEALTH_URL every CHECK_INTERVAL seconds. After FAILURE_THRESHOLD
# consecutive non-200 responses it prints a one-shot diagnostic dump (process
# tree, Python process details, stack traces, listening sockets, memory, recent
# logs) to stdout. The dump is re-armed once a health check succeeds again.
#
# Takes no arguments. Runs until killed.
#
# Deliberately NOT using `set -e`: every diagnostic step is best-effort and a
# failing tool must never terminate the monitor loop.

# NOTE(review): the image's Dockerfile EXPOSEs 8283 while this script polls
# 8083 - confirm this is the port the server really listens on.
readonly HEALTH_PORT=8083
readonly HEALTH_URL="http://localhost:${HEALTH_PORT}/v1/health/"
readonly CHECK_INTERVAL=5
# Number of consecutive failed probes before diagnostics are dumped.
readonly FAILURE_THRESHOLD=2
# Upper bound (seconds) for any single diagnostic tool so a hung tool cannot
# stall the monitor.
readonly TOOL_TIMEOUT=5

CONSECUTIVE_FAILURES=0
DIAGNOSTIC_TRIGGERED=false

#######################################
# Print the number of entries in a directory.
# Arguments: $1 - directory path
# Outputs:   entry count on stdout
# Returns:   1 if the directory is missing or unreadable
#######################################
count_dir_entries() {
  local dir=$1
  [[ -d "$dir" && -r "$dir" ]] || return 1
  (
    # Subshell keeps the glob options from leaking into the rest of the script.
    shopt -s nullglob dotglob
    entries=("$dir"/*)
    echo "${#entries[@]}"
  )
}

#######################################
# Dump /proc details and stack traces for one process.
# Arguments: $1 - PID of the Python server process
# Outputs:   diagnostics on stdout
#######################################
dump_python_process() {
  local pid=$1

  echo "Python PID: $pid"
  echo "--- /proc/$pid/status ---"
  cat "/proc/$pid/status" 2>&1 || echo "Cannot read status"

  echo "--- Thread count ---"
  count_dir_entries "/proc/$pid/task" || echo "Cannot count threads"

  echo "--- Open files ---"
  # Counting directory entries directly avoids the extra "total N" line that
  # `ls -l | wc -l` would include.
  count_dir_entries "/proc/$pid/fd" || echo "Cannot count fds"

  echo "--- Thread stack traces (via py-spy if available) ---"
  if command -v py-spy &> /dev/null; then
    timeout "$TOOL_TIMEOUT" py-spy dump --pid "$pid" 2>&1 || echo "py-spy failed"
  else
    echo "py-spy not installed"
  fi

  echo "--- System stack trace (via pstack/gdb if available) ---"
  if command -v gdb &> /dev/null; then
    timeout "$TOOL_TIMEOUT" gdb -batch -ex "thread apply all bt" -p "$pid" 2>&1 || echo "gdb failed"
  else
    echo "gdb not available"
  fi
}

#######################################
# Print the last 100 lines of the process's stdout, when that is safe.
# Arguments: $1 - PID of the Python server process (may be empty)
# Outputs:   log tail or an explanatory message on stdout
#######################################
dump_recent_logs() {
  local pid=$1
  local stdout_path="/proc/$pid/fd/1"

  if [[ -z "$pid" ]]; then
    echo "Cannot read stdout"
    return
  fi
  # Only tail regular files: opening a pipe or terminal through /proc would
  # block until the writer exits and would steal the server's own output.
  if [[ ! -f "$stdout_path" ]]; then
    echo "Cannot read stdout (not a regular file)"
    return
  fi
  timeout "$TOOL_TIMEOUT" tail -n 100 "$stdout_path" 2>&1 || echo "Cannot read stdout"
}

#######################################
# Print the full diagnostic dump.
# Globals:   HEALTH_PORT, TOOL_TIMEOUT (read)
# Outputs:   diagnostics on stdout
#######################################
dump_diagnostics() {
  local timestamp python_pid listeners
  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%S")
  echo "[DIAGNOSTIC_MONITOR] ====== TRIGGERING DIAGNOSTICS at $timestamp ======"

  # Dump Python process info
  echo "[DIAGNOSTIC_MONITOR] === Process Tree ==="
  ps auxf 2>&1 || echo "ps failed"

  echo "[DIAGNOSTIC_MONITOR] === Python Process Details ==="
  python_pid=$(pgrep -f "python.*server" | head -1)
  if [[ -n "$python_pid" ]]; then
    dump_python_process "$python_pid"
  else
    echo "Could not find Python process"
  fi

  echo "[DIAGNOSTIC_MONITOR] === Network/Port Status ==="
  # Distinguish "netstat itself failed" from "nothing is listening on the port".
  if listeners=$(netstat -tlnp 2>&1); then
    grep -- ":${HEALTH_PORT}" <<<"$listeners" \
      || echo "Nothing listening on port ${HEALTH_PORT}"
  else
    printf '%s\n' "$listeners"
    echo "netstat failed"
  fi

  echo "[DIAGNOSTIC_MONITOR] === Memory Info ==="
  free -h 2>&1 || echo "free failed"

  echo "[DIAGNOSTIC_MONITOR] === Recent Logs (last 100 lines) ==="
  dump_recent_logs "$python_pid"

  echo "[DIAGNOSTIC_MONITOR] ====== DIAGNOSTICS COMPLETE ======"
}

echo "[DIAGNOSTIC_MONITOR] Started monitoring $HEALTH_URL every ${CHECK_INTERVAL}s"
while true; do
  sleep "$CHECK_INTERVAL"
  # Try health check with 10s timeout (same as K8s probe)
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$HEALTH_URL" 2>/dev/null)
  if [[ "$HTTP_CODE" == "200" ]]; then
    # Health check passed
    if (( CONSECUTIVE_FAILURES > 0 )); then
      echo "[DIAGNOSTIC_MONITOR] Health check recovered after $CONSECUTIVE_FAILURES failures"
    fi
    CONSECUTIVE_FAILURES=0
    # Re-arm so the next outage produces a fresh dump.
    DIAGNOSTIC_TRIGGERED=false
  else
    # Health check failed
    CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
    echo "[DIAGNOSTIC_MONITOR] Health check FAILED (attempt $CONSECUTIVE_FAILURES, HTTP code: $HTTP_CODE)"
    # Trigger diagnostics after 2 consecutive failures (before K8s kills us),
    # and only once per outage.
    if (( CONSECUTIVE_FAILURES >= FAILURE_THRESHOLD )) && [[ "$DIAGNOSTIC_TRIGGERED" == false ]]; then
      DIAGNOSTIC_TRIGGERED=true
      dump_diagnostics
    fi
  fi
done

View File

@@ -68,11 +68,16 @@ fi
# Launch the OpenTelemetry collector in the background and remember its PID for cleanup
/usr/local/bin/otelcol-contrib --config "$CONFIG_FILE" &
OTEL_PID=$!
# Start diagnostic monitor in background; every line it prints (stdout and stderr)
# is re-emitted with a "[DIAG] " prefix
echo "Starting diagnostic monitor..."
/app/diagnostic_monitor.sh 2>&1 | while IFS= read -r line; do echo "[DIAG] $line"; done &
# NOTE(review): for a backgrounded pipeline $! is the PID of its last command (the
# prefixing while-loop), not of diagnostic_monitor.sh itself - confirm that killing
# this PID in cleanup is enough to stop the monitor.
DIAG_PID=$!
# Function to cleanup processes on exit:
# signals the collector and the diagnostic pipeline, then waits for them to finish.
cleanup() {
echo "Shutting down..."
# NOTE(review): this kill/wait pair overlaps with the pair below, which also covers
# OTEL_PID - confirm whether both are intended.
kill $OTEL_PID
wait $OTEL_PID
# Errors are discarded, so an already-exited process does not produce noise here
kill $OTEL_PID $DIAG_PID 2>/dev/null
wait $OTEL_PID $DIAG_PID 2>/dev/null
}
# Run cleanup whenever this script exits
trap cleanup EXIT