feat: out of band diagnostics on local health check (#6264)
* move to not ready immediately if there's a failure. * another script in container
This commit is contained in:
10
Dockerfile
10
Dockerfile
@@ -36,7 +36,9 @@ COPY pyproject.toml uv.lock ./
|
||||
# Then copy the rest of the application code
|
||||
COPY . .
|
||||
|
||||
RUN uv sync --frozen --no-dev --all-extras --python 3.11
|
||||
RUN uv sync --frozen --no-dev --all-extras --python 3.11 && \
|
||||
# Install py-spy for Python stack trace diagnostics
|
||||
uv pip install py-spy
|
||||
|
||||
# Runtime stage
|
||||
FROM ankane/pgvector:v0.5.1 AS runtime
|
||||
@@ -47,6 +49,8 @@ ARG NODE_VERSION=22
|
||||
RUN apt-get update && \
|
||||
# Install curl, Python, and PostgreSQL client libraries
|
||||
apt-get install -y curl python3 python3-venv libpq-dev && \
|
||||
# Install diagnostic tools (gdb for stack traces, procps for ps/top, net-tools for netstat)
|
||||
apt-get install -y gdb procps net-tools && \
|
||||
# Install Node.js
|
||||
curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - && \
|
||||
apt-get install -y nodejs && \
|
||||
@@ -82,6 +86,10 @@ COPY --from=builder /app .
|
||||
# Copy initialization SQL if it exists
|
||||
COPY init.sql /docker-entrypoint-initdb.d/
|
||||
|
||||
# Copy diagnostic monitor script
|
||||
COPY diagnostic_monitor.sh /app/diagnostic_monitor.sh
|
||||
RUN chmod +x /app/diagnostic_monitor.sh
|
||||
|
||||
EXPOSE 8283 5432 4317 4318
|
||||
|
||||
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
|
||||
|
||||
81
diagnostic_monitor.sh
Normal file
81
diagnostic_monitor.sh
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/bin/bash
|
||||
# Diagnostic monitor that runs in background and dumps state when health checks fail
|
||||
|
||||
HEALTH_URL="http://localhost:8083/v1/health/"
|
||||
CHECK_INTERVAL=5
|
||||
CONSECUTIVE_FAILURES=0
|
||||
DIAGNOSTIC_TRIGGERED=false
|
||||
|
||||
echo "[DIAGNOSTIC_MONITOR] Started monitoring $HEALTH_URL every ${CHECK_INTERVAL}s"
|
||||
|
||||
while true; do
|
||||
sleep $CHECK_INTERVAL
|
||||
|
||||
# Try health check with 10s timeout (same as K8s probe)
|
||||
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$HEALTH_URL" 2>/dev/null)
|
||||
|
||||
if [ "$HTTP_CODE" = "200" ]; then
|
||||
# Health check passed
|
||||
if [ $CONSECUTIVE_FAILURES -gt 0 ]; then
|
||||
echo "[DIAGNOSTIC_MONITOR] Health check recovered after $CONSECUTIVE_FAILURES failures"
|
||||
fi
|
||||
CONSECUTIVE_FAILURES=0
|
||||
DIAGNOSTIC_TRIGGERED=false
|
||||
else
|
||||
# Health check failed
|
||||
CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
|
||||
echo "[DIAGNOSTIC_MONITOR] Health check FAILED (attempt $CONSECUTIVE_FAILURES, HTTP code: $HTTP_CODE)"
|
||||
|
||||
# Trigger diagnostics after 2 consecutive failures (before K8s kills us)
|
||||
if [ $CONSECUTIVE_FAILURES -ge 2 ] && [ "$DIAGNOSTIC_TRIGGERED" = false ]; then
|
||||
DIAGNOSTIC_TRIGGERED=true
|
||||
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S")
|
||||
echo "[DIAGNOSTIC_MONITOR] ====== TRIGGERING DIAGNOSTICS at $TIMESTAMP ======"
|
||||
|
||||
# Dump Python process info
|
||||
echo "[DIAGNOSTIC_MONITOR] === Process Tree ==="
|
||||
ps auxf 2>&1 || echo "ps failed"
|
||||
|
||||
echo "[DIAGNOSTIC_MONITOR] === Python Process Details ==="
|
||||
PYTHON_PID=$(pgrep -f "python.*server" | head -1)
|
||||
if [ -n "$PYTHON_PID" ]; then
|
||||
echo "Python PID: $PYTHON_PID"
|
||||
echo "--- /proc/$PYTHON_PID/status ---"
|
||||
cat /proc/$PYTHON_PID/status 2>&1 || echo "Cannot read status"
|
||||
|
||||
echo "--- Thread count ---"
|
||||
ls /proc/$PYTHON_PID/task 2>&1 | wc -l || echo "Cannot count threads"
|
||||
|
||||
echo "--- Open files ---"
|
||||
# Count open descriptors with plain `ls` (not `ls -l`) so the "total" header line is not counted,
# and keep stderr out of the pipe so a failed listing reports an error instead of a bogus count.
FD_LIST=$(ls "/proc/$PYTHON_PID/fd" 2>/dev/null) && echo "$FD_LIST" | wc -l || echo "Cannot count fds"
|
||||
|
||||
echo "--- Thread stack traces (via py-spy if available) ---"
|
||||
if command -v py-spy &> /dev/null; then
|
||||
timeout 5 py-spy dump --pid $PYTHON_PID 2>&1 || echo "py-spy failed"
|
||||
else
|
||||
echo "py-spy not installed"
|
||||
fi
|
||||
|
||||
echo "--- System stack trace (via pstack/gdb if available) ---"
|
||||
if command -v gdb &> /dev/null; then
|
||||
timeout 5 gdb -batch -ex "thread apply all bt" -p $PYTHON_PID 2>&1 || echo "gdb failed"
|
||||
else
|
||||
echo "gdb not available"
|
||||
fi
|
||||
else
|
||||
echo "Could not find Python process"
|
||||
fi
|
||||
|
||||
echo "[DIAGNOSTIC_MONITOR] === Network/Port Status ==="
|
||||
# The fallback fires on grep's exit status, i.e. also when netstat ran fine but nothing
# is listening on the port, so the message must cover both cases.
netstat -tlnp 2>&1 | grep 8083 || echo "No listener found on port 8083 (or netstat failed)"
|
||||
|
||||
echo "[DIAGNOSTIC_MONITOR] === Memory Info ==="
|
||||
free -h 2>&1 || echo "free failed"
|
||||
|
||||
echo "[DIAGNOSTIC_MONITOR] === Recent Logs (last 100 lines) ==="
|
||||
# This runs outside the PYTHON_PID guard above, so skip the read when no process was found.
# Bound it with a timeout as well: fd 1 may be a pipe or TTY, on which tail can block
# indefinitely and stall the monitor loop.
[ -n "$PYTHON_PID" ] && timeout 5 tail -100 "/proc/$PYTHON_PID/fd/1" 2>&1 || echo "Cannot read stdout"
|
||||
|
||||
echo "[DIAGNOSTIC_MONITOR] ====== DIAGNOSTICS COMPLETE ======"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
@@ -68,11 +68,16 @@ fi
|
||||
/usr/local/bin/otelcol-contrib --config "$CONFIG_FILE" &
|
||||
OTEL_PID=$!
|
||||
|
||||
# Start diagnostic monitor in background
|
||||
echo "Starting diagnostic monitor..."
|
||||
# Run the monitor in the background, prefixing every output line with "[DIAG] ".
# NOTE(review): for a backgrounded pipeline, $! is the PID of the LAST command (the
# while-read loop), not of diagnostic_monitor.sh, so the DIAG_PID captured on the next
# line does not identify the monitor script itself. Confirm the cleanup path actually
# stops the script.
/app/diagnostic_monitor.sh 2>&1 | while IFS= read -r line; do echo "[DIAG] $line"; done &
|
||||
DIAG_PID=$!
|
||||
|
||||
# Function to cleanup processes on exit
|
||||
cleanup() {
|
||||
echo "Shutting down..."
|
||||
kill $OTEL_PID
|
||||
wait $OTEL_PID
|
||||
kill $OTEL_PID $DIAG_PID 2>/dev/null
|
||||
wait $OTEL_PID $DIAG_PID 2>/dev/null
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
|
||||
Reference in New Issue
Block a user