feat: improved watchdog thread (#6252)

Log on every watchdog check, even when the event loop is not blocked, to clarify whether the event loop is actually blocked during a 'freeze', and attempt to estimate the main event loop's load by reporting the number of unfinished tasks.
This commit is contained in:
Kian Jones
2025-11-18 16:57:02 -08:00
committed by Caren Thomas
parent 0459088ff3
commit ea845d379d

View File

@@ -85,20 +85,37 @@ class EventLoopWatchdog:
time_since_heartbeat = time.time() - last_beat time_since_heartbeat = time.time() - last_beat
# Try to estimate event loop load (safe from separate thread)
task_count = -1
try:
if self._loop and not self._loop.is_closed():
# all_tasks returns only unfinished tasks
all_tasks = asyncio.all_tasks(self._loop)
task_count = len(all_tasks)
except Exception:
# Accessing loop from thread can be fragile, don't fail
pass
# ALWAYS log every check to prove watchdog is alive
logger.info(
f"WATCHDOG_CHECK: heartbeat_age={time_since_heartbeat:.1f}s, consecutive_hangs={consecutive_hangs}, tasks={task_count}"
)
if time_since_heartbeat > self.timeout_threshold: if time_since_heartbeat > self.timeout_threshold:
consecutive_hangs += 1 consecutive_hangs += 1
logger.error( logger.error(
f"EVENT LOOP HANG DETECTED! No heartbeat for {time_since_heartbeat:.1f}s (threshold: {self.timeout_threshold}s)" f"EVENT LOOP HANG DETECTED! No heartbeat for {time_since_heartbeat:.1f}s (threshold: {self.timeout_threshold}s), "
f"tasks={task_count}"
) )
# Dump basic state # Dump basic state
self._dump_state() self._dump_state()
if consecutive_hangs >= 2: if consecutive_hangs >= 2:
logger.critical(f"Event loop appears frozen ({consecutive_hangs} consecutive hangs)") logger.critical(f"Event loop appears frozen ({consecutive_hangs} consecutive hangs), tasks={task_count}")
else: else:
if consecutive_hangs > 0: if consecutive_hangs > 0:
logger.info("Event loop recovered") logger.info(f"Event loop recovered (was {consecutive_hangs} hangs, tasks now: {task_count})")
consecutive_hangs = 0 consecutive_hangs = 0
except Exception as e: except Exception as e: