Files
letta-server/letta/monitoring/event_loop_watchdog.py
Kian Jones 71bce718f7 Add lightweight event loop watchdog monitoring (#6209)
* Add lightweight event loop watchdog monitoring

- Thread-based watchdog detects event loop hangs >15s
- Runs independently, won't interfere with normal operation
- Disabled in test environments
- Minimal overhead, just heartbeat checks every 5s

* actually test it

* Add test script to validate watchdog detects hangs

Run with: uv run python test_watchdog_hang.py

Tests:
- Normal operation (no false positives)
- Short blocks under threshold (no alerts)
- Long blocks over threshold (correctly alerts)
2025-11-24 19:09:33 -08:00

142 lines
4.6 KiB
Python

"""
Lightweight thread-based watchdog to detect event loop hangs.
Runs independently and won't interfere with tests or normal operation.
"""
import asyncio
import threading
import time
import traceback
from typing import Optional
from letta.log import get_logger
logger = get_logger(__name__)
class EventLoopWatchdog:
    """
    Minimal watchdog that monitors event loop health from a separate thread.
    Detects complete event loop freezes that would cause health check failures.

    The event loop refreshes a heartbeat timestamp about once per second via a
    self-rescheduling callback. A daemon thread wakes every ``check_interval``
    seconds and logs an error whenever the heartbeat is older than
    ``timeout_threshold`` seconds.
    """

    def __init__(self, check_interval: float = 5.0, timeout_threshold: float = 15.0):
        """
        Args:
            check_interval: How often the watchdog thread checks the heartbeat (seconds)
            timeout_threshold: Heartbeat age above which a hang is reported (seconds)
        """
        self.check_interval = check_interval
        self.timeout_threshold = timeout_threshold
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        # Monotonic clock: heartbeat ages must not jump when the wall clock is
        # adjusted (NTP step, manual change), which would fake or hide a hang.
        self._last_heartbeat = time.monotonic()
        self._heartbeat_lock = threading.Lock()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._monitoring = False

    def start(self, loop: asyncio.AbstractEventLoop):
        """
        Start the watchdog thread and begin heartbeating on ``loop``.

        Does nothing if the watchdog is already monitoring.

        Args:
            loop: Event loop whose responsiveness is monitored
        """
        if self._monitoring:
            return
        self._loop = loop
        self._monitoring = True
        self._stop_event.clear()
        # Reset the heartbeat so time spent stopped is not counted as a hang.
        self._last_heartbeat = time.monotonic()
        self._thread = threading.Thread(target=self._watch_loop, daemon=True, name="EventLoopWatchdog")
        self._thread.start()
        # Schedule periodic heartbeats on the event loop. The thread-safe
        # variant is used because start() may be called from a thread other
        # than the one running the loop.
        loop.call_soon_threadsafe(self._schedule_heartbeats)
        logger.info(f"Watchdog started (timeout: {self.timeout_threshold}s)")

    def stop(self):
        """Stop the watchdog thread, waiting up to 2 seconds for it to exit."""
        self._monitoring = False
        # Setting the event also wakes the watchdog thread from its wait.
        self._stop_event.set()
        if self._thread:
            self._thread.join(timeout=2)
        logger.info("Watchdog stopped")

    def _schedule_heartbeats(self):
        """Record a heartbeat and re-schedule itself on the event loop in 1 second."""
        if not self._monitoring:
            return
        with self._heartbeat_lock:
            self._last_heartbeat = time.monotonic()
        # Re-check _monitoring: stop() may have been called since the check above.
        if self._loop is not None and self._monitoring:
            self._loop.call_later(1.0, self._schedule_heartbeats)

    def _watch_loop(self):
        """Main watchdog loop running in separate thread."""
        consecutive_hangs = 0
        while not self._stop_event.is_set():
            try:
                # Event.wait() is an interruptible sleep: it returns True as
                # soon as stop() sets the event, so shutdown is not delayed by
                # a full check interval.
                if self._stop_event.wait(self.check_interval):
                    break
                with self._heartbeat_lock:
                    last_beat = self._last_heartbeat
                time_since_heartbeat = time.monotonic() - last_beat
                if time_since_heartbeat > self.timeout_threshold:
                    consecutive_hangs += 1
                    logger.error(
                        f"EVENT LOOP HANG DETECTED! No heartbeat for {time_since_heartbeat:.1f}s (threshold: {self.timeout_threshold}s)"
                    )
                    # Dump basic state
                    self._dump_state()
                    if consecutive_hangs >= 2:
                        logger.critical(f"Event loop appears frozen ({consecutive_hangs} consecutive hangs)")
                else:
                    if consecutive_hangs > 0:
                        logger.info("Event loop recovered")
                    consecutive_hangs = 0
            except Exception as e:
                # Keep the watchdog alive: a failed check must not end monitoring.
                logger.error(f"Watchdog error: {e}")

    def _dump_state(self):
        """Dump minimal state when hang detected."""
        try:
            # Get all threads
            logger.error(f"Active threads: {threading.active_count()}")
            for thread in threading.enumerate():
                logger.error(f" {thread.name} (daemon={thread.daemon})")
        except Exception as e:
            logger.error(f"Failed to dump state: {e}")
# Module-level watchdog instance; stays None until start_watchdog() creates one.
_global_watchdog: Optional[EventLoopWatchdog] = None


def get_watchdog() -> Optional[EventLoopWatchdog]:
    """Return the module-level watchdog instance, or None when there is none."""
    return _global_watchdog
def start_watchdog(loop: asyncio.AbstractEventLoop, check_interval: float = 5.0, timeout_threshold: float = 15.0):
    """
    Create and start the global watchdog if there is none yet.

    When a global watchdog already exists it is returned unchanged and the
    arguments are ignored.

    Args:
        loop: Event loop passed to the new watchdog's start()
        check_interval: Passed to the EventLoopWatchdog constructor (seconds)
        timeout_threshold: Passed to the EventLoopWatchdog constructor (seconds)

    Returns:
        The global watchdog instance.
    """
    global _global_watchdog
    if _global_watchdog is not None:
        return _global_watchdog

    watchdog = EventLoopWatchdog(
        check_interval=check_interval,
        timeout_threshold=timeout_threshold,
    )
    # Publish the instance before starting it, then start it on the given loop.
    _global_watchdog = watchdog
    watchdog.start(loop)
    return watchdog
def stop_watchdog():
    """Stop the global watchdog, if any, and clear the module-level reference."""
    global _global_watchdog
    watchdog = _global_watchdog
    if not watchdog:
        return
    watchdog.stop()
    _global_watchdog = None