diff --git a/letta/monitoring/memory_tracker.py b/letta/monitoring/memory_tracker.py index a0670eca..10fe1953 100644 --- a/letta/monitoring/memory_tracker.py +++ b/letta/monitoring/memory_tracker.py @@ -87,11 +87,12 @@ class MemoryTracker: # Check if there's a running event loop loop = asyncio.get_running_loop() # Create the monitor task - asyncio.create_task(self.start_background_monitor()) + task = asyncio.create_task(self.start_background_monitor()) self._monitor_started = True - except RuntimeError: + logger.debug("Monitor start task created from _ensure_monitor_started") + except RuntimeError as e: # No event loop running yet, will try again later - pass + logger.debug(f"No event loop available yet for monitor start: {e}") def get_memory_info(self) -> Dict[str, Any]: """Get current memory information.""" @@ -303,12 +304,20 @@ class MemoryTracker: async def start_background_monitor(self): """Start the background memory monitoring task.""" if self._monitoring: + logger.info("Background monitor already running, skipping start") return self._monitoring = True self._monitor_started = True - self._monitor_task = asyncio.create_task(self._monitor_loop()) - logger.info(f"Background memory monitor started (interval: {self.monitor_interval}s)") + + try: + self._monitor_task = asyncio.create_task(self._monitor_loop()) + logger.info(f"Background memory monitor task created successfully (interval: {self.monitor_interval}s)") + except Exception as e: + logger.error(f"Failed to create monitor task: {e}", exc_info=True) + self._monitoring = False + self._monitor_started = False + raise async def stop_background_monitor(self): """Stop the background memory monitoring task.""" @@ -326,9 +335,13 @@ class MemoryTracker: """Background monitoring loop that runs continuously using asyncio.""" consecutive_high_memory = 0 last_gc_time = time.time() + iteration_count = 0 + + logger.info(f"Memory monitor loop started (PID: {os.getpid()})") while self._monitoring: try: + iteration_count += 1 mem_info = self.get_memory_info() current_mb = mem_info.get("rss_mb", 0) system_percent = mem_info.get("system_percent", 0) @@ -339,6 +352,14 @@ class MemoryTracker: if len(self.memory_history) > self.max_history_size: self.memory_history.pop(0) + # Log periodic memory status + # Using INFO since production logging is at INFO level + percent = (current_mb / self.CRITICAL_THRESHOLD_MB) * 100 + logger.info( + f"Memory Status: RSS: {current_mb:.2f} MB ({percent:.1f}% of {self.CRITICAL_THRESHOLD_MB} MB limit), " + f"System: {system_percent:.1f}%" + ) + # Check memory levels self._check_memory_thresholds(mem_info) @@ -364,11 +385,14 @@ class MemoryTracker: await asyncio.sleep(self.monitor_interval) except asyncio.CancelledError: + logger.info(f"Memory monitor loop cancelled after {iteration_count} iterations") break except Exception as e: - logger.error(f"Error in memory monitor loop: {e}") + logger.error(f"Error in memory monitor loop (iteration {iteration_count}): {e}", exc_info=True) await asyncio.sleep(self.monitor_interval) + logger.info(f"Memory monitor loop exited after {iteration_count} iterations") + def _check_memory_thresholds(self, mem_info: Dict[str, Any]): """Check memory against thresholds and log appropriately.""" current_mb = mem_info.get("rss_mb", 0) diff --git a/letta/server/rest_api/app.py b/letta/server/rest_api/app.py index e97f564b..d944a5ee 100644 --- a/letta/server/rest_api/app.py +++ b/letta/server/rest_api/app.py @@ -144,8 +144,10 @@ async def lifespan(app_: FastAPI): # Initialize memory tracking if MEMORY_TRACKING_ENABLED: logger.info(f"[Worker {worker_id}] Initializing memory tracking") - # Get the global tracker instance (will start background monitor automatically) + # Get the global tracker instance tracker = get_memory_tracker(enable_background_monitor=True, monitor_interval=5) + # Explicitly start the background monitor (won't wait for first tracked operation) + await tracker.start_background_monitor() logger.info(f"[Worker {worker_id}] Memory tracking enabled - monitoring every 5s with proactive alerts") if telemetry_settings.profiler: