* Add lightweight event loop watchdog monitoring - Thread-based watchdog detects event loop hangs >15s - Runs independently, won't interfere with normal operation - Disabled in test environments - Minimal overhead, just heartbeat checks every 5s * actually test it * Add test script to validate watchdog detects hangs Run with: uv run python test_watchdog_hang.py Tests: - Normal operation (no false positives) - Short blocks under threshold (no alerts) - Long blocks over threshold (correctly alerts)
98 lines
2.9 KiB
Python
98 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script to verify the watchdog detects event loop hangs.
|
|
Run this to validate the watchdog works before deploying.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Configure root logging at INFO with timestamps so watchdog log records are
# visible on the console while the script runs.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Put this script's own directory first on the import path so the `letta`
# package is looked up next to this file.
_SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(_SCRIPT_DIR))
|
|
|
|
from letta.monitoring.event_loop_watchdog import start_watchdog
|
|
|
|
|
|
def blocking_operation(seconds: float):
    """Block the calling thread for ``seconds`` using a synchronous sleep.

    When called from inside a coroutine this stalls the event loop for the
    whole duration, which is how the script simulates a hang. A message is
    printed before and after the sleep.

    Args:
        seconds: How long to sleep, in seconds.
    """
    start_message = f"\n⚠️ BLOCKING event loop for {seconds}s (simulating hang)..."
    done_message = "✓ Blocking operation completed\n"

    print(start_message)
    # Deliberately time.sleep, not asyncio.sleep: the point is to NOT yield.
    time.sleep(seconds)
    print(done_message)
|
|
|
|
|
|
async def test_watchdog_detection():
    """Run three timed phases against the event loop watchdog.

    Phases, in order:
      1. Three one-second ``asyncio.sleep`` awaits (the loop stays responsive).
      2. A 4-second synchronous block via ``blocking_operation``.
      3. An 8-second synchronous block via ``blocking_operation``.

    The watchdog is started on the running loop with ``check_interval=2.0``
    and ``timeout_threshold=5.0``. This coroutine does not assert anything
    itself: the "passed" lines are printed unconditionally, and the result is
    judged by reading the log output for a hang report during phase 3.

    ``watchdog.stop()`` is called from a ``finally`` clause so the watchdog is
    stopped even when a phase raises or the coroutine is cancelled.
    """
    banner = "=" * 70
    rule = "-" * 70

    print("\n" + banner)
    print("EVENT LOOP WATCHDOG TEST")
    print(banner)

    # Start the watchdog with aggressive settings for testing.
    loop = asyncio.get_running_loop()
    watchdog = start_watchdog(loop, check_interval=2.0, timeout_threshold=5.0)

    try:
        print("\n✓ Watchdog started (will alert if no heartbeat for >5s)")
        print(" Checking every 2 seconds...\n")

        # Test 1: Normal operation (should NOT trigger).
        print("TEST 1: Normal async operation (no hang expected)")
        print(rule)
        for i in range(3):
            await asyncio.sleep(1)
            print(f" Heartbeat {i + 1}/3 - event loop running normally")
        print("✓ Test 1 passed: No false alarms\n")

        # Idle gap between phases so the loop runs freely before the next block.
        await asyncio.sleep(3)

        # Test 2: Short block (should NOT trigger - under 5s threshold).
        # NOTE(review): whether a 4s block can still cross the 5s threshold
        # depends on how often the watchdog records heartbeats; confirm against
        # letta.monitoring.event_loop_watchdog if this phase ever alerts.
        print("TEST 2: Short blocking operation (4s - should NOT trigger)")
        print(rule)
        blocking_operation(4.0)
        await asyncio.sleep(3)
        print("✓ Test 2 passed: Short blocks don't trigger false alarms\n")

        await asyncio.sleep(2)

        # Test 3: Long block (SHOULD trigger - exceeds 5s threshold).
        print("TEST 3: Long blocking operation (8s - SHOULD trigger watchdog)")
        print(rule)
        print("🔍 Watch for ERROR logs from the watchdog...")
        blocking_operation(8.0)

        # Give watchdog time to detect and log.
        await asyncio.sleep(3)

        print("\n" + banner)
        print("TEST COMPLETE")
        print(banner)
        print("\nExpected results:")
        print(" ✓ Test 1: No watchdog alerts (normal operation)")
        print(" ✓ Test 2: No watchdog alerts (4s < 5s threshold)")
        print(" ✓ Test 3: WATCHDOG ALERT logged (8s > 5s threshold)")
        print("\nIf you saw 'EVENT LOOP HANG DETECTED' in Test 3, watchdog works! ✓")
        print("\n")
    finally:
        # Always stop the watchdog, including on error or cancellation, so it
        # does not keep running after this coroutine exits.
        watchdog.stop()
|
|
|
|
|
|
async def main():
    """Run the watchdog scenario; on any exception, report it and exit with status 1."""
    try:
        await test_watchdog_detection()
    except Exception as exc:
        # Imported lazily: only needed on the failure path.
        from traceback import print_exc

        print(f"\n❌ Test failed with error: {exc}")
        print_exc()
        sys.exit(1)
|
|
|
|
|
|
# Script entry point: only start the event loop when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())
|