Files
letta-server/diagnostic_monitor.sh
Kian Jones ccafc6bef4 feat: out of band diagnostics on local health check (#6264)
* move to not ready immediately if there's a failure.

* another script in container
2025-11-24 19:10:11 -08:00

82 lines
3.3 KiB
Bash

#!/bin/bash
# Diagnostic monitor that runs in background and dumps state when health
# checks fail.
#
# Polls HEALTH_URL every CHECK_INTERVAL seconds. After FAILURE_THRESHOLD
# consecutive failed checks it prints one diagnostic dump (process tree,
# Python process details, listening port, memory, recent logs) to stdout and
# does not dump again until a health check succeeds.
#
# Strict mode (set -e) is intentionally not enabled: every probe below is
# best-effort and the monitor must keep looping when a tool is missing or fails.

readonly HEALTH_PORT=8083
readonly HEALTH_URL="http://localhost:${HEALTH_PORT}/v1/health/"
readonly CHECK_INTERVAL=5
readonly FAILURE_THRESHOLD=2
# Upper bound (seconds) for any single diagnostic tool, so a stuck tool cannot
# stall the monitor loop.
readonly TOOL_TIMEOUT=5

CONSECUTIVE_FAILURES=0
DIAGNOSTIC_TRIGGERED=false

#######################################
# Prints the number of entries in a directory.
# Arguments: $1 - directory to list
# Outputs:   entry count on stdout
# Returns:   non-zero if the directory cannot be listed
#######################################
count_entries() {
  local dir=$1
  local listing
  listing=$(ls -- "$dir" 2>/dev/null) || return 1
  if [[ -z "$listing" ]]; then
    echo 0
  else
    # The here-string supplies the trailing newline, so wc -l equals the
    # number of names (plain `ls`, unlike `ls -l`, prints no "total" line).
    wc -l <<<"$listing"
  fi
}

#######################################
# Dumps /proc details and stack traces for the given process.
# Arguments: $1 - PID of the Python server process
# Outputs:   diagnostics on stdout
#######################################
dump_python_details() {
  local pid=$1

  echo "Python PID: $pid"
  echo "--- /proc/$pid/status ---"
  cat "/proc/$pid/status" 2>&1 || echo "Cannot read status"

  echo "--- Thread count ---"
  count_entries "/proc/$pid/task" || echo "Cannot count threads"

  echo "--- Open files ---"
  count_entries "/proc/$pid/fd" || echo "Cannot count fds"

  echo "--- Thread stack traces (via py-spy if available) ---"
  if command -v py-spy &>/dev/null; then
    timeout "$TOOL_TIMEOUT" py-spy dump --pid "$pid" 2>&1 || echo "py-spy failed"
  else
    echo "py-spy not installed"
  fi

  echo "--- System stack trace (via pstack/gdb if available) ---"
  if command -v gdb &>/dev/null; then
    timeout "$TOOL_TIMEOUT" gdb -batch -ex "thread apply all bt" -p "$pid" 2>&1 \
      || echo "gdb failed"
  else
    echo "gdb not available"
  fi
}

#######################################
# Shows listening TCP sockets that mention HEALTH_PORT, using netstat or,
# when netstat is not installed, ss.
# Globals:   HEALTH_PORT (read)
# Outputs:   matching socket lines, or a message saying why there are none
#######################################
dump_port_status() {
  local tool listing

  if command -v netstat &>/dev/null; then
    tool=netstat
  elif command -v ss &>/dev/null; then
    tool=ss
  else
    echo "netstat/ss not available"
    return
  fi

  if ! listing=$("$tool" -tlnp 2>&1); then
    printf '%s\n' "$listing"
    echo "$tool failed"
    return
  fi

  # Keep "tool failed" separate from "nothing is listening": the latter is
  # itself a useful finding when the health check is failing.
  grep -w -- "$HEALTH_PORT" <<<"$listing" \
    || echo "No listener found on port $HEALTH_PORT"
}

#######################################
# Prints the last 100 lines of the process's stdout when it is a regular file.
# Arguments: $1 - PID of the Python server process (may be empty)
# Outputs:   log tail, or a message saying why it is unavailable
#######################################
dump_recent_logs() {
  local pid=$1
  local stdout_path

  if [[ -z "$pid" ]]; then
    echo "Cannot read stdout (no Python process)"
    return
  fi

  stdout_path="/proc/$pid/fd/1"
  # Only tail regular files: reading a pipe or terminal through /proc could
  # block indefinitely, which would hang the monitor.
  if [[ -f "$stdout_path" ]]; then
    timeout "$TOOL_TIMEOUT" tail -n 100 -- "$stdout_path" 2>&1 \
      || echo "Cannot read stdout"
  else
    echo "Cannot read stdout (not a regular file)"
  fi
}

#######################################
# Runs the full diagnostic dump once.
# Outputs:   all diagnostic sections on stdout
#######################################
dump_diagnostics() {
  local timestamp python_pid

  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%S")
  echo "[DIAGNOSTIC_MONITOR] ====== TRIGGERING DIAGNOSTICS at $timestamp ======"

  # Dump Python process info
  echo "[DIAGNOSTIC_MONITOR] === Process Tree ==="
  ps auxf 2>&1 || echo "ps failed"

  echo "[DIAGNOSTIC_MONITOR] === Python Process Details ==="
  # First process whose command line matches; empty when none is running.
  python_pid=$(pgrep -f "python.*server" | head -n 1)
  if [[ -n "$python_pid" ]]; then
    dump_python_details "$python_pid"
  else
    echo "Could not find Python process"
  fi

  echo "[DIAGNOSTIC_MONITOR] === Network/Port Status ==="
  dump_port_status

  echo "[DIAGNOSTIC_MONITOR] === Memory Info ==="
  free -h 2>&1 || echo "free failed"

  echo "[DIAGNOSTIC_MONITOR] === Recent Logs (last 100 lines) ==="
  dump_recent_logs "$python_pid"

  echo "[DIAGNOSTIC_MONITOR] ====== DIAGNOSTICS COMPLETE ======"
}

echo "[DIAGNOSTIC_MONITOR] Started monitoring $HEALTH_URL every ${CHECK_INTERVAL}s"

while true; do
  sleep "$CHECK_INTERVAL"

  # Try health check with 10s timeout (same as K8s probe). Anything other than
  # a 200, including whatever curl prints when the request itself fails, is
  # treated as a failed check.
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$HEALTH_URL" 2>/dev/null)

  if [[ "$HTTP_CODE" == "200" ]]; then
    # Health check passed
    if (( CONSECUTIVE_FAILURES > 0 )); then
      echo "[DIAGNOSTIC_MONITOR] Health check recovered after $CONSECUTIVE_FAILURES failures"
    fi
    CONSECUTIVE_FAILURES=0
    # Re-arm so the next outage produces a fresh dump.
    DIAGNOSTIC_TRIGGERED=false
    continue
  fi

  # Health check failed
  CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
  echo "[DIAGNOSTIC_MONITOR] Health check FAILED (attempt $CONSECUTIVE_FAILURES, HTTP code: $HTTP_CODE)"

  # Trigger diagnostics after 2 consecutive failures (before K8s kills us),
  # and only once per outage.
  if (( CONSECUTIVE_FAILURES >= FAILURE_THRESHOLD )) && [[ "$DIAGNOSTIC_TRIGGERED" == false ]]; then
    DIAGNOSTIC_TRIGGERED=true
    dump_diagnostics
  fi
done