fix: register memory monitor at startup (#6195)

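Explicitly start the background memory monitor from the FastAPI lifespan handler at startup instead of waiting for the first tracked operation, and add logging around monitor task creation and the monitor loop.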
2025-11-14 16:54:37 -08:00
parent 1e5d6e39c4
commit a5e435c56f
2 changed files with 33 additions and 7 deletions
--- a/letta/monitoring/memory_tracker.py
+++ b/letta/monitoring/memory_tracker.py
@@ -87,11 +87,12 @@ class MemoryTracker:
                # Check if there's a running event loop
                loop = asyncio.get_running_loop()
                # Create the monitor task
-                asyncio.create_task(self.start_background_monitor())
+                task = asyncio.create_task(self.start_background_monitor())
                self._monitor_started = True
-            except RuntimeError:
+                logger.debug("Monitor start task created from _ensure_monitor_started")
+            except RuntimeError as e:
                # No event loop running yet, will try again later
-                pass
+                logger.debug(f"No event loop available yet for monitor start: {e}")

    def get_memory_info(self) -> Dict[str, Any]:
        """Get current memory information."""
@@ -303,12 +304,20 @@ class MemoryTracker:
    async def start_background_monitor(self):
        """Start the background memory monitoring task."""
        if self._monitoring:
+            logger.info("Background monitor already running, skipping start")
            return

        self._monitoring = True
        self._monitor_started = True
-        self._monitor_task = asyncio.create_task(self._monitor_loop())
-        logger.info(f"Background memory monitor started (interval: {self.monitor_interval}s)")
+
+        try:
+            self._monitor_task = asyncio.create_task(self._monitor_loop())
+            logger.info(f"Background memory monitor task created successfully (interval: {self.monitor_interval}s)")
+        except Exception as e:
+            logger.error(f"Failed to create monitor task: {e}", exc_info=True)
+            self._monitoring = False
+            self._monitor_started = False
+            raise

    async def stop_background_monitor(self):
        """Stop the background memory monitoring task."""
@@ -326,9 +335,13 @@ class MemoryTracker:
        """Background monitoring loop that runs continuously using asyncio."""
        consecutive_high_memory = 0
        last_gc_time = time.time()
+        iteration_count = 0
+
+        logger.info(f"Memory monitor loop started (PID: {os.getpid()})")

        while self._monitoring:
            try:
+                iteration_count += 1
                mem_info = self.get_memory_info()
                current_mb = mem_info.get("rss_mb", 0)
                system_percent = mem_info.get("system_percent", 0)
@@ -339,6 +352,14 @@ class MemoryTracker:
                    if len(self.memory_history) > self.max_history_size:
                        self.memory_history.pop(0)

+                # Log periodic memory status
+                # Using INFO since production logging is at INFO level
+                percent = (current_mb / self.CRITICAL_THRESHOLD_MB) * 100
+                logger.info(
+                    f"Memory Status: RSS: {current_mb:.2f} MB ({percent:.1f}% of {self.CRITICAL_THRESHOLD_MB} MB limit), "
+                    f"System: {system_percent:.1f}%"
+                )
+
                # Check memory levels
                self._check_memory_thresholds(mem_info)

@@ -364,11 +385,14 @@ class MemoryTracker:
                await asyncio.sleep(self.monitor_interval)

            except asyncio.CancelledError:
+                logger.info(f"Memory monitor loop cancelled after {iteration_count} iterations")
                break
            except Exception as e:
-                logger.error(f"Error in memory monitor loop: {e}")
+                logger.error(f"Error in memory monitor loop (iteration {iteration_count}): {e}", exc_info=True)
                await asyncio.sleep(self.monitor_interval)

+        logger.info(f"Memory monitor loop exited after {iteration_count} iterations")
+
    def _check_memory_thresholds(self, mem_info: Dict[str, Any]):
        """Check memory against thresholds and log appropriately."""
        current_mb = mem_info.get("rss_mb", 0)
--- a/letta/server/rest_api/app.py
+++ b/letta/server/rest_api/app.py
@@ -144,8 +144,10 @@ async def lifespan(app_: FastAPI):
    # Initialize memory tracking
    if MEMORY_TRACKING_ENABLED:
        logger.info(f"[Worker {worker_id}] Initializing memory tracking")
-        # Get the global tracker instance (will start background monitor automatically)
+        # Get the global tracker instance
        tracker = get_memory_tracker(enable_background_monitor=True, monitor_interval=5)
+        # Explicitly start the background monitor (won't wait for first tracked operation)
+        await tracker.start_background_monitor()
        logger.info(f"[Worker {worker_id}] Memory tracking enabled - monitoring every 5s with proactive alerts")

    if telemetry_settings.profiler: