Add lightweight event loop watchdog monitoring (#6209)

* Add lightweight event loop watchdog monitoring - Thread-based watchdog detects event loop hangs >15s - Runs independently, won't interfere with normal operation - Disabled in test environments - Minimal overhead, just heartbeat checks every 5s * actually test it * Add test script to validate watchdog detects hangs Run with: uv run python test_watchdog_hang.py Tests: - Normal operation (no false positives) - Short blocks under threshold (no alerts) - Long blocks over threshold (correctly alerts)
2025-11-17 12:01:50 -08:00
parent c18af2bc81
commit 71bce718f7
3 changed files with 251 additions and 0 deletions
--- a/letta/monitoring/event_loop_watchdog.py
+++ b/letta/monitoring/event_loop_watchdog.py
@@ -0,0 +1,141 @@
 """
 Lightweight thread-based watchdog to detect event loop hangs.
 Runs independently and won't interfere with tests or normal operation.
 """
 import asyncio
 import threading
 import time
 import traceback
 from typing import Optional
 from letta.log import get_logger
 logger = get_logger(__name__)
 class EventLoopWatchdog:
    """
    Minimal watchdog that monitors event loop health from a separate thread.
    Detects complete event loop freezes that would cause health check failures.
    """
    def __init__(self, check_interval: float = 5.0, timeout_threshold: float = 15.0):
        """
        Args:
            check_interval: How often to check (seconds)
            timeout_threshold: Threshold for hang detection (seconds)
        """
        self.check_interval = check_interval
        self.timeout_threshold = timeout_threshold
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        self._last_heartbeat = time.time()
        self._heartbeat_lock = threading.Lock()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._monitoring = False
    def start(self, loop: asyncio.AbstractEventLoop):
        """Start the watchdog thread."""
        if self._monitoring:
            return
        self._loop = loop
        self._monitoring = True
        self._stop_event.clear()
        self._last_heartbeat = time.time()
        self._thread = threading.Thread(target=self._watch_loop, daemon=True, name="EventLoopWatchdog")
        self._thread.start()
        # Schedule periodic heartbeats on the event loop
        loop.call_soon(self._schedule_heartbeats)
        logger.info(f"Watchdog started (timeout: {self.timeout_threshold}s)")
    def stop(self):
        """Stop the watchdog thread."""
        self._monitoring = False
        self._stop_event.set()
        if self._thread:
            self._thread.join(timeout=2)
        logger.info("Watchdog stopped")
    def _schedule_heartbeats(self):
        """Schedule periodic heartbeat updates on the event loop."""
        if not self._monitoring:
            return
        with self._heartbeat_lock:
            self._last_heartbeat = time.time()
        if self._loop and self._monitoring:
            self._loop.call_later(1.0, self._schedule_heartbeats)
    def _watch_loop(self):
        """Main watchdog loop running in separate thread."""
        consecutive_hangs = 0
        while not self._stop_event.is_set():
            try:
                time.sleep(self.check_interval)
                with self._heartbeat_lock:
                    last_beat = self._last_heartbeat
                time_since_heartbeat = time.time() - last_beat
                if time_since_heartbeat > self.timeout_threshold:
                    consecutive_hangs += 1
                    logger.error(
                        f"EVENT LOOP HANG DETECTED! No heartbeat for {time_since_heartbeat:.1f}s (threshold: {self.timeout_threshold}s)"
                    )
                    # Dump basic state
                    self._dump_state()
                    if consecutive_hangs >= 2:
                        logger.critical(f"Event loop appears frozen ({consecutive_hangs} consecutive hangs)")
                else:
                    if consecutive_hangs > 0:
                        logger.info("Event loop recovered")
                    consecutive_hangs = 0
            except Exception as e:
                logger.error(f"Watchdog error: {e}")
    def _dump_state(self):
        """Dump minimal state when hang detected."""
        try:
            # Get all threads
            logger.error(f"Active threads: {threading.active_count()}")
            for thread in threading.enumerate():
                logger.error(f"  {thread.name} (daemon={thread.daemon})")
        except Exception as e:
            logger.error(f"Failed to dump state: {e}")
 _global_watchdog: Optional[EventLoopWatchdog] = None
 def get_watchdog() -> Optional[EventLoopWatchdog]:
    """Get the global watchdog instance."""
    return _global_watchdog
 def start_watchdog(loop: asyncio.AbstractEventLoop, check_interval: float = 5.0, timeout_threshold: float = 15.0):
    """Start the global watchdog."""
    global _global_watchdog
    if _global_watchdog is None:
        _global_watchdog = EventLoopWatchdog(check_interval=check_interval, timeout_threshold=timeout_threshold)
        _global_watchdog.start(loop)
    return _global_watchdog
 def stop_watchdog():
    """Stop the global watchdog."""
    global _global_watchdog
    if _global_watchdog:
        _global_watchdog.stop()
        _global_watchdog = None
--- a/letta/server/rest_api/app.py
+++ b/letta/server/rest_api/app.py
@@ -5,6 +5,7 @@ import logging
 import os
 import platform
 import sys
 import threading
 from contextlib import asynccontextmanager
 from functools import partial
 from pathlib import Path
@@ -150,6 +151,18 @@ async def lifespan(app_: FastAPI):
        await tracker.start_background_monitor()
        logger.info(f"[Worker {worker_id}] Memory tracking enabled - monitoring every 5s with proactive alerts")
    # Initialize event loop watchdog
    try:
        import asyncio
        from letta.monitoring.event_loop_watchdog import start_watchdog
        loop = asyncio.get_running_loop()
        start_watchdog(loop, check_interval=5.0, timeout_threshold=15.0)
        logger.info(f"[Worker {worker_id}] Event loop watchdog started")
    except Exception as e:
        logger.warning(f"[Worker {worker_id}] Failed to start watchdog: {e}")
    if telemetry_settings.profiler:
        try:
            import googlecloudprofiler
--- a/test_watchdog_hang.py
+++ b/test_watchdog_hang.py
@@ -0,0 +1,97 @@
 #!/usr/bin/env python3
 """
 Test script to verify the watchdog detects event loop hangs.
 Run this to validate the watchdog works before deploying.
 """
 import asyncio
 import logging
 import sys
 import time
 from pathlib import Path
 # Setup logging to see watchdog output
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 # Add letta to path
 sys.path.insert(0, str(Path(__file__).parent))
 from letta.monitoring.event_loop_watchdog import start_watchdog
 def blocking_operation(seconds: float):
    """Simulate a blocking operation that hangs the event loop."""
    print(f"\n⚠️  BLOCKING event loop for {seconds}s (simulating hang)...")
    time.sleep(seconds)
    print("✓ Blocking operation completed\n")
 async def test_watchdog_detection():
    """Test that watchdog detects event loop hangs."""
    print("\n" + "=" * 70)
    print("EVENT LOOP WATCHDOG TEST")
    print("=" * 70)
    # Start the watchdog with aggressive settings for testing
    loop = asyncio.get_running_loop()
    watchdog = start_watchdog(loop, check_interval=2.0, timeout_threshold=5.0)
    print("\n✓ Watchdog started (will alert if no heartbeat for >5s)")
    print("  Checking every 2 seconds...\n")
    # Test 1: Normal operation (should NOT trigger)
    print("TEST 1: Normal async operation (no hang expected)")
    print("-" * 70)
    for i in range(3):
        await asyncio.sleep(1)
        print(f"  Heartbeat {i + 1}/3 - event loop running normally")
    print("✓ Test 1 passed: No false alarms\n")
    await asyncio.sleep(3)
    # Test 2: Short block (should NOT trigger - under 5s threshold)
    print("TEST 2: Short blocking operation (4s - should NOT trigger)")
    print("-" * 70)
    blocking_operation(4.0)
    await asyncio.sleep(3)
    print("✓ Test 2 passed: Short blocks don't trigger false alarms\n")
    await asyncio.sleep(2)
    # Test 3: Long block (SHOULD trigger - exceeds 5s threshold)
    print("TEST 3: Long blocking operation (8s - SHOULD trigger watchdog)")
    print("-" * 70)
    print("🔍 Watch for ERROR logs from the watchdog...")
    blocking_operation(8.0)
    # Give watchdog time to detect and log
    await asyncio.sleep(3)
    print("\n" + "=" * 70)
    print("TEST COMPLETE")
    print("=" * 70)
    print("\nExpected results:")
    print("  ✓ Test 1: No watchdog alerts (normal operation)")
    print("  ✓ Test 2: No watchdog alerts (4s < 5s threshold)")
    print("  ✓ Test 3: WATCHDOG ALERT logged (8s > 5s threshold)")
    print("\nIf you saw 'EVENT LOOP HANG DETECTED' in Test 3, watchdog works! ✓")
    print("\n")
    # Stop watchdog
    watchdog.stop()
 async def main():
    """Run the test."""
    try:
        await test_watchdog_detection()
    except Exception as e:
        print(f"\n❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
 if __name__ == "__main__":
    asyncio.run(main())