"""
Lightweight thread-based watchdog to detect event loop hangs.
Runs independently and won't interfere with tests or normal operation.
"""

import asyncio
import threading
import time
import traceback
from typing import Optional

from letta.log import get_logger

logger = get_logger(__name__)


class EventLoopWatchdog:
    """
    Minimal watchdog that monitors event loop health from a separate thread.
    Detects complete event loop freezes that would cause health check failures.

    How it works: a callback running on the event loop refreshes a heartbeat
    timestamp once per second, while a daemon thread wakes up every
    ``check_interval`` seconds and logs an error whenever the heartbeat is
    older than ``timeout_threshold`` seconds.
    """

    def __init__(self, check_interval: float = 5.0, timeout_threshold: float = 15.0):
        """
        Args:
            check_interval: How often to check (seconds)
            timeout_threshold: Threshold for hang detection (seconds)
        """
        self.check_interval = check_interval
        self.timeout_threshold = timeout_threshold
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        # Heartbeats are time.monotonic() readings: only elapsed time matters,
        # and the monotonic clock is unaffected by wall-clock adjustments.
        self._last_heartbeat = time.monotonic()
        self._heartbeat_lock = threading.Lock()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._monitoring = False

    def start(self, loop: asyncio.AbstractEventLoop) -> None:
        """Start the watchdog thread.

        Does nothing if the watchdog is already monitoring.

        Args:
            loop: Event loop to monitor; heartbeat callbacks are scheduled on it.
        """
        if self._monitoring:
            return

        self._loop = loop
        self._monitoring = True
        self._stop_event.clear()
        with self._heartbeat_lock:
            self._last_heartbeat = time.monotonic()

        self._thread = threading.Thread(target=self._watch_loop, daemon=True, name="EventLoopWatchdog")
        self._thread.start()

        # Schedule periodic heartbeats on the event loop. The thread-safe
        # variant is used so start() may be called from any thread, not only
        # the one running the loop.
        loop.call_soon_threadsafe(self._schedule_heartbeats)

        logger.info(f"Watchdog started (timeout: {self.timeout_threshold}s)")

    def stop(self) -> None:
        """Stop the watchdog thread.

        Signals the watcher thread to exit and waits up to 2 seconds for it.
        The heartbeat callback stops rescheduling itself once monitoring is off.
        """
        self._monitoring = False
        self._stop_event.set()
        watcher = self._thread
        if watcher is not None:
            watcher.join(timeout=2)
            if not watcher.is_alive():
                self._thread = None
        logger.info("Watchdog stopped")

    def _schedule_heartbeats(self) -> None:
        """Schedule periodic heartbeat updates on the event loop.

        Records a heartbeat, then re-arms itself to run again in one second
        for as long as monitoring is active.
        """
        if not self._monitoring:
            return

        with self._heartbeat_lock:
            self._last_heartbeat = time.monotonic()

        loop = self._loop
        if loop is not None and self._monitoring:
            loop.call_later(1.0, self._schedule_heartbeats)

    def _watch_loop(self) -> None:
        """Main watchdog loop running in separate thread.

        Every ``check_interval`` seconds, compares the age of the last
        heartbeat with ``timeout_threshold`` and logs hangs and recoveries.
        """
        consecutive_hangs = 0

        # Event.wait() doubles as an interruptible sleep: it returns True the
        # moment stop() sets the event, so the thread exits promptly instead
        # of sleeping out the rest of the interval.
        while not self._stop_event.wait(self.check_interval):
            try:
                with self._heartbeat_lock:
                    last_beat = self._last_heartbeat

                time_since_heartbeat = time.monotonic() - last_beat

                if time_since_heartbeat > self.timeout_threshold:
                    consecutive_hangs += 1
                    logger.error(
                        f"EVENT LOOP HANG DETECTED! No heartbeat for {time_since_heartbeat:.1f}s (threshold: {self.timeout_threshold}s)"
                    )

                    # Dump basic state
                    self._dump_state()

                    if consecutive_hangs >= 2:
                        logger.critical(f"Event loop appears frozen ({consecutive_hangs} consecutive hangs)")
                else:
                    if consecutive_hangs > 0:
                        logger.info("Event loop recovered")
                    consecutive_hangs = 0

            except Exception as e:
                # Keep the watchdog alive: a failed check must not end monitoring.
                logger.error(f"Watchdog error: {e}")

    def _dump_state(self) -> None:
        """Dump minimal state when hang detected.

        Logs the active thread count and each thread's name and daemon flag.
        """
        try:
            # Get all threads
            logger.error(f"Active threads: {threading.active_count()}")
            for thread in threading.enumerate():
                logger.error(f"  {thread.name} (daemon={thread.daemon})")
        except Exception as e:
            logger.error(f"Failed to dump state: {e}")


# Process-wide watchdog instance managed by start_watchdog() / stop_watchdog().
_global_watchdog: Optional[EventLoopWatchdog] = None


def get_watchdog() -> Optional[EventLoopWatchdog]:
    """Get the global watchdog instance."""
    return _global_watchdog


def start_watchdog(
    loop: asyncio.AbstractEventLoop,
    check_interval: float = 5.0,
    timeout_threshold: float = 15.0,
) -> EventLoopWatchdog:
    """Start the global watchdog.

    If a global watchdog already exists it is returned as-is and the
    arguments are ignored.

    Args:
        loop: Event loop to monitor.
        check_interval: How often to check (seconds)
        timeout_threshold: Threshold for hang detection (seconds)

    Returns:
        The global watchdog instance.
    """
    global _global_watchdog
    if _global_watchdog is None:
        _global_watchdog = EventLoopWatchdog(check_interval=check_interval, timeout_threshold=timeout_threshold)
        _global_watchdog.start(loop)
    return _global_watchdog


def stop_watchdog() -> None:
    """Stop the global watchdog and clear the global instance, if one exists."""
    global _global_watchdog
    if _global_watchdog is not None:
        _global_watchdog.stop()
        _global_watchdog = None