fix: Remove duplicate scan logging so storage/system scans no longer appear on the Updates page

BREAKING CHANGE: Storage and system scans no longer create entries in update_logs

**Problem**
- Storage scans were appearing on the Updates page (mixed with package updates)
- System scans were appearing on the Updates page (mixed with package updates)
- Duplicate "Scan All" entries from collective + individual logging

**Root Cause**
Scan handlers were calling both ReportLog() and their dedicated endpoints:
- reportLogWithAck → POST /api/v1/agents/:id/logs → update_logs table
As a result, storage/system metrics appeared alongside package updates (pre-fix shape sketched below).
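
A minimal sketch of the pre-fix pattern (the Reporter interface, payload struct, and log message are illustrative assumptions, not the project's actual client API):

```go
package agent

// StorageMetrics is a stand-in payload; the real struct lives in the agent's client code.
type StorageMetrics struct {
	DiskUsagePercent float64
}

// Reporter is a hypothetical slice of the API client used by the scan handlers.
type Reporter interface {
	ReportStorageMetrics(agentID string, m StorageMetrics) error // dedicated endpoint → storage_metrics table
	ReportLog(agentID, message string) error                     // POST /api/v1/agents/:id/logs → update_logs table
}

// handleScanStorage, pre-fix shape: reports to the dedicated endpoint AND to update_logs.
// The second call is the redundant one that surfaced storage scans on the Updates page.
func handleScanStorage(r Reporter, agentID string, m StorageMetrics) error {
	if err := r.ReportStorageMetrics(agentID, m); err != nil {
		return err
	}
	return r.ReportLog(agentID, "storage scan completed")
}
```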

**Fix**
Removed ALL ReportLog() calls from scan handlers (post-fix shape sketched after this list):
1. handleScanUpdatesV2 (lines 44-46): Removed collective logging
2. handleScanStorage (lines 103-105): Use only ReportStorageMetrics
3. handleScanSystem (lines 189-191): Use only ReportMetrics
4. handleScanDocker (lines 269-271): Use only ReportDockerImages
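
The post-fix shape, under the same assumptions as the sketch above: each handler calls only its dedicated report method, so nothing is written to update_logs.

```go
// handleScanStorage, post-fix shape: only the dedicated endpoint is used, keeping
// storage results on the Storage page and off the Updates page.
func handleScanStorage(r Reporter, agentID string, m StorageMetrics) error {
	return r.ReportStorageMetrics(agentID, m)
}
```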

**Verification**
- All 4 handlers have working dedicated endpoints (verified via subagent)
- Routes already registered: POST /storage-metrics, POST /metrics, etc. (see the route sketch after this list)
- Frontend queries correct endpoints (verified)
- No data loss: dedicated endpoints store in proper tables
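
A sketch of how the separate routes could be laid out, assuming a Gin-style router; the group path follows the log endpoint above, and handler names are illustrative rather than the project's actual code:

```go
package api

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

// registerAgentRoutes sketches the endpoint separation described above.
func registerAgentRoutes(r *gin.Engine) {
	agents := r.Group("/api/v1/agents/:id")
	agents.POST("/logs", receiveAgentLogs)                 // package updates → update_logs → Updates page
	agents.POST("/storage-metrics", receiveStorageMetrics) // storage scans → storage_metrics → Storage page
	agents.POST("/metrics", receiveSystemMetrics)          // system scans → system reporting → System page
}

func receiveAgentLogs(c *gin.Context)      { c.Status(http.StatusNoContent) }
func receiveStorageMetrics(c *gin.Context) { c.Status(http.StatusNoContent) }
func receiveSystemMetrics(c *gin.Context)  { c.Status(http.StatusNoContent) }
```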

**Result**
- Storage scans → storage_metrics table → Storage page only 
- System scans → system reporting → System page only 
- Package updates → update_logs table → Updates page only 
- No duplicate "Scan All" entries 

**Files Changed**
- aggregator-agent/cmd/agent/subsystem_handlers.go: Removed 20 lines of ReportLog calls
- internal/api/handlers/agents.go: Command recovery enhancements
- internal/api/handlers/updates.go: Subsystem extraction logic
- internal/database/queries/commands.go: GetStuckCommands query
commit 6b3ab6d6fc
parent a90692f1d8
Author: Fimeg
Date: 2025-12-19 15:02:12 -05:00
20 changed files with 1001 additions and 153 deletions


@@ -23,9 +23,11 @@ import (
"github.com/Fimeg/RedFlag/aggregator-agent/internal/installer"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/migration"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/orchestrator"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/guardian"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/scanner"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/service"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/system"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/validator"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/version"
"github.com/google/uuid"
)
@@ -524,87 +526,143 @@ func getCurrentSubsystemEnabled(cfg *config.Config, subsystemName string) bool {
}
}
// syncServerConfig checks for and applies server configuration updates
func syncServerConfig(apiClient *client.Client, cfg *config.Config) error {
// Get current config from server
// syncServerConfigProper checks for and applies server configuration updates with validation and protection
func syncServerConfigProper(apiClient *client.Client, cfg *config.Config) error {
serverConfig, err := apiClient.GetConfig(cfg.AgentID)
if err != nil {
log.Printf("[HISTORY] [agent] [config] sync_failed error=\"%v\" timestamp=%s",
err, time.Now().Format(time.RFC3339))
return fmt.Errorf("failed to get server config: %w", err)
}
// Check if config version is newer
if serverConfig.Version <= lastConfigVersion {
return nil // No update needed
}
log.Printf("📡 Server config update detected (version: %d)", serverConfig.Version)
log.Printf("[INFO] [agent] [config] server config update detected (version: %d)", serverConfig.Version)
changes := false
// Track potential check-in interval changes separately to avoid inflation
newCheckInInterval := cfg.CheckInInterval
// Create validator for interval bounds checking
intervalValidator := validator.NewIntervalValidator()
// Apply subsystem configuration from server
// Create guardian to protect against check-in interval override attempts
intervalGuardian := guardian.NewIntervalGuardian()
intervalGuardian.SetBaseline(cfg.CheckInInterval)
// Process subsystem configurations
for subsystemName, subsystemConfig := range serverConfig.Subsystems {
if configMap, ok := subsystemConfig.(map[string]interface{}); ok {
enabled := false
intervalMinutes := 0
autoRun := false
if e, exists := configMap["enabled"]; exists {
if eVal, ok := e.(bool); ok {
enabled = eVal
}
// Parse interval from server config
intervalFloat := 0.0
if rawInterval, ok := configMap["interval_minutes"].(float64); ok {
intervalFloat = rawInterval
}
intervalMinutes := int(intervalFloat)
if i, exists := configMap["interval_minutes"]; exists {
if iVal, ok := i.(float64); ok {
intervalMinutes = int(iVal)
// Validate scanner interval
if intervalMinutes > 0 {
if err := intervalValidator.ValidateScannerInterval(intervalMinutes); err != nil {
log.Printf("[ERROR] [agent] [config] [%s] scanner interval validation failed: %v",
subsystemName, err)
log.Printf("[HISTORY] [agent] [config] [%s] interval_rejected interval=%d reason=\"%v\" timestamp=%s",
subsystemName, intervalMinutes, err, time.Now().Format(time.RFC3339))
continue // Skip invalid interval but don't fail entire sync
}
}
if a, exists := configMap["auto_run"]; exists {
if aVal, ok := a.(bool); ok {
autoRun = aVal
}
}
// Get current subsystem enabled state dynamically
currentEnabled := getCurrentSubsystemEnabled(cfg, subsystemName)
if enabled != currentEnabled {
log.Printf(" → %s: enabled=%v (changed)", subsystemName, enabled)
log.Printf("[INFO] [agent] [config] [%s] interval=%d minutes", subsystemName, intervalMinutes)
changes = true
}
// Check if interval actually changed, but don't modify cfg.CheckInInterval yet
if intervalMinutes > 0 && intervalMinutes != newCheckInInterval {
log.Printf(" → %s: interval=%d minutes (changed)", subsystemName, intervalMinutes)
changes = true
newCheckInInterval = intervalMinutes // Update temp variable, not the config
}
// Apply validated interval to the appropriate subsystem
switch subsystemName {
case "system":
cfg.Subsystems.System.IntervalMinutes = intervalMinutes
case "apt":
cfg.Subsystems.APT.IntervalMinutes = intervalMinutes
case "dnf":
cfg.Subsystems.DNF.IntervalMinutes = intervalMinutes
case "storage":
cfg.Subsystems.Storage.IntervalMinutes = intervalMinutes
case "winget":
cfg.Subsystems.Winget.IntervalMinutes = intervalMinutes
default:
log.Printf("[WARNING] [agent] [config] unknown subsystem: %s", subsystemName)
}
if autoRun {
log.Printf(" → %s: auto_run=%v (server-side scheduling)", subsystemName, autoRun)
// Log to history table
log.Printf("[HISTORY] [agent] [config] [%s] interval_updated minutes=%d timestamp=%s",
subsystemName, intervalMinutes, time.Now().Format(time.RFC3339))
}
}
}
// Apply the check-in interval change only once after all subsystems processed
if newCheckInInterval != cfg.CheckInInterval {
cfg.CheckInInterval = newCheckInInterval
// Verification: Ensure no scanner interval is interfering with check-in frequency
// This guards against regressions where scanner settings might affect agent polling
if intervalGuardian.GetViolationCount() > 0 {
log.Printf("[WARNING] [agent] [config] guardian detected %d previous interval violations",
intervalGuardian.GetViolationCount())
}
if err := cfg.Save(constants.GetAgentConfigPath()); err != nil {
log.Printf("[HISTORY] [agent] [config] save_failed error=\"%v\" timestamp=%s",
err, time.Now().Format(time.RFC3339))
return fmt.Errorf("failed to save config: %w", err)
}
if changes {
log.Printf("✅ Server configuration applied successfully")
} else {
log.Printf(" Server config received but no changes detected")
log.Printf("[INFO] [agent] [config] scanner interval updates applied")
}
// Update last config version
lastConfigVersion = serverConfig.Version
log.Printf("[SUCCESS] [agent] [config] config saved successfully")
return nil
}
// syncServerConfigWithRetry wraps syncServerConfigProper with retry logic
func syncServerConfigWithRetry(apiClient *client.Client, cfg *config.Config, maxRetries int) error {
var lastErr error
for attempt := 1; attempt <= maxRetries; attempt++ {
if err := syncServerConfigProper(apiClient, cfg); err != nil {
lastErr = err
log.Printf("[ERROR] [agent] [config] sync attempt %d/%d failed: %v",
attempt, maxRetries, err)
// Log to history table
log.Printf("[HISTORY] [agent] [config] sync_failed attempt=%d/%d error=\"%v\" timestamp=%s",
attempt, maxRetries, err, time.Now().Format(time.RFC3339))
if attempt < maxRetries {
// Exponential backoff: 1s, 2s, 4s, 8s...
backoff := time.Duration(1<<uint(attempt-1)) * time.Second
log.Printf("[INFO] [agent] [config] retrying in %v...", backoff)
time.Sleep(backoff)
}
continue
}
log.Printf("[SUCCESS] [agent] [config] synced after %d attempts", attempt)
return nil
}
// After maxRetries, degrade gracefully
if err := cfg.SetDegradedMode(true); err != nil {
log.Printf("[ERROR] [agent] [config] failed to enter degraded mode: %v", err)
log.Printf("[HISTORY] [agent] [config] degraded_mode_failed error=\"%v\" timestamp=%s",
err, time.Now().Format(time.RFC3339))
} else {
log.Printf("[WARNING] [agent] [config] entering degraded mode after %d failed attempts", maxRetries)
}
// Log degraded mode entry to history
log.Printf("[HISTORY] [agent] [config] degraded_mode_entered failures=%d timestamp=%s",
maxRetries, time.Now().Format(time.RFC3339))
return lastErr
}
func runAgent(cfg *config.Config) error {
log.Printf("🚩 RedFlag Agent v%s starting...\n", version.Version)
log.Printf("==================================================================")
@@ -656,17 +714,42 @@ func runAgent(cfg *config.Config) error {
// Initialize scanner orchestrator for parallel execution and granular subsystem management
scanOrchestrator := orchestrator.NewOrchestrator()
// Register update scanners ONLY - package management systems
// Initialize scanners for storage, system, and docker (used by individual subsystem handlers)
storageScanner := orchestrator.NewStorageScanner(version.Version)
systemScanner := orchestrator.NewSystemScanner(version.Version)
dockerScanner, _ := scanner.NewDockerScanner()
// Initialize circuit breakers for all subsystems
storageCB := circuitbreaker.New("Storage", circuitbreaker.Config{
FailureThreshold: cfg.Subsystems.Storage.CircuitBreaker.FailureThreshold,
FailureWindow: cfg.Subsystems.Storage.CircuitBreaker.FailureWindow,
OpenDuration: cfg.Subsystems.Storage.CircuitBreaker.OpenDuration,
HalfOpenAttempts: cfg.Subsystems.Storage.CircuitBreaker.HalfOpenAttempts,
})
systemCB := circuitbreaker.New("System", circuitbreaker.Config{
FailureThreshold: cfg.Subsystems.System.CircuitBreaker.FailureThreshold,
FailureWindow: cfg.Subsystems.System.CircuitBreaker.FailureWindow,
OpenDuration: cfg.Subsystems.System.CircuitBreaker.OpenDuration,
HalfOpenAttempts: cfg.Subsystems.System.CircuitBreaker.HalfOpenAttempts,
})
dockerCB := circuitbreaker.New("Docker", circuitbreaker.Config{
FailureThreshold: cfg.Subsystems.Docker.CircuitBreaker.FailureThreshold,
FailureWindow: cfg.Subsystems.Docker.CircuitBreaker.FailureWindow,
OpenDuration: cfg.Subsystems.Docker.CircuitBreaker.OpenDuration,
HalfOpenAttempts: cfg.Subsystems.Docker.CircuitBreaker.HalfOpenAttempts,
})
// Register ALL scanners with the orchestrator
// Update scanners (package management)
scanOrchestrator.RegisterScanner("apt", orchestrator.NewAPTScannerWrapper(aptScanner), aptCB, cfg.Subsystems.APT.Timeout, cfg.Subsystems.APT.Enabled)
scanOrchestrator.RegisterScanner("dnf", orchestrator.NewDNFScannerWrapper(dnfScanner), dnfCB, cfg.Subsystems.DNF.Timeout, cfg.Subsystems.DNF.Enabled)
scanOrchestrator.RegisterScanner("windows", orchestrator.NewWindowsUpdateScannerWrapper(windowsUpdateScanner), windowsCB, cfg.Subsystems.Windows.Timeout, cfg.Subsystems.Windows.Enabled)
scanOrchestrator.RegisterScanner("winget", orchestrator.NewWingetScannerWrapper(wingetScanner), wingetCB, cfg.Subsystems.Winget.Timeout, cfg.Subsystems.Winget.Enabled)
// NOTE: Docker, Storage, and System scanners are NOT registered with the update orchestrator
// They have their own dedicated handlers and endpoints:
// - Docker: handleScanDocker → ReportDockerImages()
// - Storage: handleScanStorage → ReportStorageMetrics()
// - System: handleScanSystem → ReportMetrics()
// System scanners (metrics and monitoring)
scanOrchestrator.RegisterScanner("storage", orchestrator.NewStorageScannerWrapper(storageScanner), storageCB, cfg.Subsystems.Storage.Timeout, cfg.Subsystems.Storage.Enabled)
scanOrchestrator.RegisterScanner("system", orchestrator.NewSystemScannerWrapper(systemScanner), systemCB, cfg.Subsystems.System.Timeout, cfg.Subsystems.System.Enabled)
scanOrchestrator.RegisterScanner("docker", orchestrator.NewDockerScannerWrapper(dockerScanner), dockerCB, cfg.Subsystems.Docker.Timeout, cfg.Subsystems.Docker.Enabled)
// Initialize acknowledgment tracker for command result reliability
ackTracker := acknowledgment.NewTracker(constants.GetAgentStateDir())
@@ -804,10 +887,10 @@ func runAgent(cfg *config.Config) error {
}
}
// Sync configuration from server (non-blocking)
// Sync configuration from server (non-blocking) with retry logic
go func() {
if err := syncServerConfig(apiClient, cfg); err != nil {
log.Printf("Warning: Failed to sync server config: %v", err)
if err := syncServerConfigWithRetry(apiClient, cfg, 5); err != nil {
log.Printf("Warning: Failed to sync server config after retries: %v", err)
}
}()