fix: Remove duplicate scan logging so storage/system scans no longer appear on the Updates page

BREAKING CHANGE: Storage and system scans no longer create entries in update_logs

**Problem**
- Storage scans were appearing on the Updates page (mixed with package updates)
- System scans were appearing on the Updates page (mixed with package updates)
- Duplicate "Scan All" entries from collective + individual logging

**Root Cause**
Scan handlers were calling both ReportLog() and their dedicated endpoints:
- reportLogWithAck → POST /api/v1/agents/:id/logs → update_logs table
As a result, storage/system metrics appeared alongside package updates (pre-fix shape sketched below).
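
A minimal sketch of the pre-fix pattern (the Reporter interface, payload struct, and log message are illustrative assumptions, not the project's actual client API):

```go
package agent

// StorageMetrics is a stand-in payload; the real struct lives in the agent's client code.
type StorageMetrics struct {
	DiskUsagePercent float64
}

// Reporter is a hypothetical slice of the API client used by the scan handlers.
type Reporter interface {
	ReportStorageMetrics(agentID string, m StorageMetrics) error // dedicated endpoint → storage_metrics table
	ReportLog(agentID, message string) error                     // POST /api/v1/agents/:id/logs → update_logs table
}

// handleScanStorage, pre-fix shape: reports to the dedicated endpoint AND to update_logs.
// The second call is the redundant one that surfaced storage scans on the Updates page.
func handleScanStorage(r Reporter, agentID string, m StorageMetrics) error {
	if err := r.ReportStorageMetrics(agentID, m); err != nil {
		return err
	}
	return r.ReportLog(agentID, "storage scan completed")
}
```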

**Fix**
Removed ALL ReportLog() calls from scan handlers (post-fix shape sketched after this list):
1. handleScanUpdatesV2 (lines 44-46): Removed collective logging
2. handleScanStorage (lines 103-105): Use only ReportStorageMetrics
3. handleScanSystem (lines 189-191): Use only ReportMetrics
4. handleScanDocker (lines 269-271): Use only ReportDockerImages
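
The post-fix shape, under the same assumptions as the sketch above: each handler calls only its dedicated report method, so nothing is written to update_logs.

```go
// handleScanStorage, post-fix shape: only the dedicated endpoint is used, keeping
// storage results on the Storage page and off the Updates page.
func handleScanStorage(r Reporter, agentID string, m StorageMetrics) error {
	return r.ReportStorageMetrics(agentID, m)
}
```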

**Verification**
- All 4 handlers have working dedicated endpoints (verified via subagent)
- Routes already registered: POST /storage-metrics, POST /metrics, etc. (see the route sketch after this list)
- Frontend queries correct endpoints (verified)
- No data loss: dedicated endpoints store in proper tables
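
A sketch of how the separate routes could be laid out, assuming a Gin-style router; the group path follows the log endpoint above, and handler names are illustrative rather than the project's actual code:

```go
package api

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

// registerAgentRoutes sketches the endpoint separation described above.
func registerAgentRoutes(r *gin.Engine) {
	agents := r.Group("/api/v1/agents/:id")
	agents.POST("/logs", receiveAgentLogs)                 // package updates → update_logs → Updates page
	agents.POST("/storage-metrics", receiveStorageMetrics) // storage scans → storage_metrics → Storage page
	agents.POST("/metrics", receiveSystemMetrics)          // system scans → system reporting → System page
}

func receiveAgentLogs(c *gin.Context)      { c.Status(http.StatusNoContent) }
func receiveStorageMetrics(c *gin.Context) { c.Status(http.StatusNoContent) }
func receiveSystemMetrics(c *gin.Context)  { c.Status(http.StatusNoContent) }
```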

**Result**
- Storage scans → storage_metrics table → Storage page only 
- System scans → system reporting → System page only 
- Package updates → update_logs table → Updates page only 
- No duplicate "Scan All" entries 

**Files Changed**
- aggregator-agent/cmd/agent/subsystem_handlers.go: Removed 20 lines of ReportLog calls
- internal/api/handlers/agents.go: Command recovery enhancements
- internal/api/handlers/updates.go: Subsystem extraction logic
- internal/database/queries/commands.go: GetStuckCommands query
commit 6b3ab6d6fc
parent a90692f1d8
Author: Fimeg
Date: 2025-12-19 15:02:12 -05:00
20 changed files with 1001 additions and 153 deletions


@@ -23,9 +23,11 @@ import (
"github.com/Fimeg/RedFlag/aggregator-agent/internal/installer"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/migration"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/orchestrator"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/guardian"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/scanner"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/service"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/system"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/validator"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/version"
"github.com/google/uuid"
)
@@ -524,87 +526,143 @@ func getCurrentSubsystemEnabled(cfg *config.Config, subsystemName string) bool {
}
}
// syncServerConfig checks for and applies server configuration updates
func syncServerConfig(apiClient *client.Client, cfg *config.Config) error {
// Get current config from server
// syncServerConfigProper checks for and applies server configuration updates with validation and protection
func syncServerConfigProper(apiClient *client.Client, cfg *config.Config) error {
serverConfig, err := apiClient.GetConfig(cfg.AgentID)
if err != nil {
log.Printf("[HISTORY] [agent] [config] sync_failed error=\"%v\" timestamp=%s",
err, time.Now().Format(time.RFC3339))
return fmt.Errorf("failed to get server config: %w", err)
}
// Check if config version is newer
if serverConfig.Version <= lastConfigVersion {
return nil // No update needed
}
log.Printf("📡 Server config update detected (version: %d)", serverConfig.Version)
log.Printf("[INFO] [agent] [config] server config update detected (version: %d)", serverConfig.Version)
changes := false
// Track potential check-in interval changes separately to avoid inflation
newCheckInInterval := cfg.CheckInInterval
// Create validator for interval bounds checking
intervalValidator := validator.NewIntervalValidator()
// Apply subsystem configuration from server
// Create guardian to protect against check-in interval override attempts
intervalGuardian := guardian.NewIntervalGuardian()
intervalGuardian.SetBaseline(cfg.CheckInInterval)
// Process subsystem configurations
for subsystemName, subsystemConfig := range serverConfig.Subsystems {
if configMap, ok := subsystemConfig.(map[string]interface{}); ok {
enabled := false
intervalMinutes := 0
autoRun := false
if e, exists := configMap["enabled"]; exists {
if eVal, ok := e.(bool); ok {
enabled = eVal
}
// Parse interval from server config
intervalFloat := 0.0
if rawInterval, ok := configMap["interval_minutes"].(float64); ok {
intervalFloat = rawInterval
}
intervalMinutes := int(intervalFloat)
if i, exists := configMap["interval_minutes"]; exists {
if iVal, ok := i.(float64); ok {
intervalMinutes = int(iVal)
// Validate scanner interval
if intervalMinutes > 0 {
if err := intervalValidator.ValidateScannerInterval(intervalMinutes); err != nil {
log.Printf("[ERROR] [agent] [config] [%s] scanner interval validation failed: %v",
subsystemName, err)
log.Printf("[HISTORY] [agent] [config] [%s] interval_rejected interval=%d reason=\"%v\" timestamp=%s",
subsystemName, intervalMinutes, err, time.Now().Format(time.RFC3339))
continue // Skip invalid interval but don't fail entire sync
}
}
if a, exists := configMap["auto_run"]; exists {
if aVal, ok := a.(bool); ok {
autoRun = aVal
}
}
// Get current subsystem enabled state dynamically
currentEnabled := getCurrentSubsystemEnabled(cfg, subsystemName)
if enabled != currentEnabled {
log.Printf(" → %s: enabled=%v (changed)", subsystemName, enabled)
log.Printf("[INFO] [agent] [config] [%s] interval=%d minutes", subsystemName, intervalMinutes)
changes = true
}
// Check if interval actually changed, but don't modify cfg.CheckInInterval yet
if intervalMinutes > 0 && intervalMinutes != newCheckInInterval {
log.Printf(" → %s: interval=%d minutes (changed)", subsystemName, intervalMinutes)
changes = true
newCheckInInterval = intervalMinutes // Update temp variable, not the config
}
// Apply validated interval to the appropriate subsystem
switch subsystemName {
case "system":
cfg.Subsystems.System.IntervalMinutes = intervalMinutes
case "apt":
cfg.Subsystems.APT.IntervalMinutes = intervalMinutes
case "dnf":
cfg.Subsystems.DNF.IntervalMinutes = intervalMinutes
case "storage":
cfg.Subsystems.Storage.IntervalMinutes = intervalMinutes
case "winget":
cfg.Subsystems.Winget.IntervalMinutes = intervalMinutes
default:
log.Printf("[WARNING] [agent] [config] unknown subsystem: %s", subsystemName)
}
if autoRun {
log.Printf(" → %s: auto_run=%v (server-side scheduling)", subsystemName, autoRun)
// Log to history table
log.Printf("[HISTORY] [agent] [config] [%s] interval_updated minutes=%d timestamp=%s",
subsystemName, intervalMinutes, time.Now().Format(time.RFC3339))
}
}
}
// Apply the check-in interval change only once after all subsystems processed
if newCheckInInterval != cfg.CheckInInterval {
cfg.CheckInInterval = newCheckInInterval
// Verification: Ensure no scanner interval is interfering with check-in frequency
// This guards against regressions where scanner settings might affect agent polling
if intervalGuardian.GetViolationCount() > 0 {
log.Printf("[WARNING] [agent] [config] guardian detected %d previous interval violations",
intervalGuardian.GetViolationCount())
}
if err := cfg.Save(constants.GetAgentConfigPath()); err != nil {
log.Printf("[HISTORY] [agent] [config] save_failed error=\"%v\" timestamp=%s",
err, time.Now().Format(time.RFC3339))
return fmt.Errorf("failed to save config: %w", err)
}
if changes {
log.Printf("✅ Server configuration applied successfully")
} else {
log.Printf(" Server config received but no changes detected")
log.Printf("[INFO] [agent] [config] scanner interval updates applied")
}
// Update last config version
lastConfigVersion = serverConfig.Version
log.Printf("[SUCCESS] [agent] [config] config saved successfully")
return nil
}
// syncServerConfigWithRetry wraps syncServerConfigProper with retry logic
func syncServerConfigWithRetry(apiClient *client.Client, cfg *config.Config, maxRetries int) error {
var lastErr error
for attempt := 1; attempt <= maxRetries; attempt++ {
if err := syncServerConfigProper(apiClient, cfg); err != nil {
lastErr = err
log.Printf("[ERROR] [agent] [config] sync attempt %d/%d failed: %v",
attempt, maxRetries, err)
// Log to history table
log.Printf("[HISTORY] [agent] [config] sync_failed attempt=%d/%d error=\"%v\" timestamp=%s",
attempt, maxRetries, err, time.Now().Format(time.RFC3339))
if attempt < maxRetries {
// Exponential backoff: 1s, 2s, 4s, 8s...
backoff := time.Duration(1<<uint(attempt-1)) * time.Second
log.Printf("[INFO] [agent] [config] retrying in %v...", backoff)
time.Sleep(backoff)
}
continue
}
log.Printf("[SUCCESS] [agent] [config] synced after %d attempts", attempt)
return nil
}
// After maxRetries, degrade gracefully
if err := cfg.SetDegradedMode(true); err != nil {
log.Printf("[ERROR] [agent] [config] failed to enter degraded mode: %v", err)
log.Printf("[HISTORY] [agent] [config] degraded_mode_failed error=\"%v\" timestamp=%s",
err, time.Now().Format(time.RFC3339))
} else {
log.Printf("[WARNING] [agent] [config] entering degraded mode after %d failed attempts", maxRetries)
}
// Log degraded mode entry to history
log.Printf("[HISTORY] [agent] [config] degraded_mode_entered failures=%d timestamp=%s",
maxRetries, time.Now().Format(time.RFC3339))
return lastErr
}
func runAgent(cfg *config.Config) error {
log.Printf("🚩 RedFlag Agent v%s starting...\n", version.Version)
log.Printf("==================================================================")
@@ -656,17 +714,42 @@ func runAgent(cfg *config.Config) error {
// Initialize scanner orchestrator for parallel execution and granular subsystem management
scanOrchestrator := orchestrator.NewOrchestrator()
// Register update scanners ONLY - package management systems
// Initialize scanners for storage, system, and docker (used by individual subsystem handlers)
storageScanner := orchestrator.NewStorageScanner(version.Version)
systemScanner := orchestrator.NewSystemScanner(version.Version)
dockerScanner, _ := scanner.NewDockerScanner()
// Initialize circuit breakers for all subsystems
storageCB := circuitbreaker.New("Storage", circuitbreaker.Config{
FailureThreshold: cfg.Subsystems.Storage.CircuitBreaker.FailureThreshold,
FailureWindow: cfg.Subsystems.Storage.CircuitBreaker.FailureWindow,
OpenDuration: cfg.Subsystems.Storage.CircuitBreaker.OpenDuration,
HalfOpenAttempts: cfg.Subsystems.Storage.CircuitBreaker.HalfOpenAttempts,
})
systemCB := circuitbreaker.New("System", circuitbreaker.Config{
FailureThreshold: cfg.Subsystems.System.CircuitBreaker.FailureThreshold,
FailureWindow: cfg.Subsystems.System.CircuitBreaker.FailureWindow,
OpenDuration: cfg.Subsystems.System.CircuitBreaker.OpenDuration,
HalfOpenAttempts: cfg.Subsystems.System.CircuitBreaker.HalfOpenAttempts,
})
dockerCB := circuitbreaker.New("Docker", circuitbreaker.Config{
FailureThreshold: cfg.Subsystems.Docker.CircuitBreaker.FailureThreshold,
FailureWindow: cfg.Subsystems.Docker.CircuitBreaker.FailureWindow,
OpenDuration: cfg.Subsystems.Docker.CircuitBreaker.OpenDuration,
HalfOpenAttempts: cfg.Subsystems.Docker.CircuitBreaker.HalfOpenAttempts,
})
// Register ALL scanners with the orchestrator
// Update scanners (package management)
scanOrchestrator.RegisterScanner("apt", orchestrator.NewAPTScannerWrapper(aptScanner), aptCB, cfg.Subsystems.APT.Timeout, cfg.Subsystems.APT.Enabled)
scanOrchestrator.RegisterScanner("dnf", orchestrator.NewDNFScannerWrapper(dnfScanner), dnfCB, cfg.Subsystems.DNF.Timeout, cfg.Subsystems.DNF.Enabled)
scanOrchestrator.RegisterScanner("windows", orchestrator.NewWindowsUpdateScannerWrapper(windowsUpdateScanner), windowsCB, cfg.Subsystems.Windows.Timeout, cfg.Subsystems.Windows.Enabled)
scanOrchestrator.RegisterScanner("winget", orchestrator.NewWingetScannerWrapper(wingetScanner), wingetCB, cfg.Subsystems.Winget.Timeout, cfg.Subsystems.Winget.Enabled)
// NOTE: Docker, Storage, and System scanners are NOT registered with the update orchestrator
// They have their own dedicated handlers and endpoints:
// - Docker: handleScanDocker → ReportDockerImages()
// - Storage: handleScanStorage → ReportStorageMetrics()
// - System: handleScanSystem → ReportMetrics()
// System scanners (metrics and monitoring)
scanOrchestrator.RegisterScanner("storage", orchestrator.NewStorageScannerWrapper(storageScanner), storageCB, cfg.Subsystems.Storage.Timeout, cfg.Subsystems.Storage.Enabled)
scanOrchestrator.RegisterScanner("system", orchestrator.NewSystemScannerWrapper(systemScanner), systemCB, cfg.Subsystems.System.Timeout, cfg.Subsystems.System.Enabled)
scanOrchestrator.RegisterScanner("docker", orchestrator.NewDockerScannerWrapper(dockerScanner), dockerCB, cfg.Subsystems.Docker.Timeout, cfg.Subsystems.Docker.Enabled)
// Initialize acknowledgment tracker for command result reliability
ackTracker := acknowledgment.NewTracker(constants.GetAgentStateDir())
@@ -804,10 +887,10 @@ func runAgent(cfg *config.Config) error {
}
}
// Sync configuration from server (non-blocking)
// Sync configuration from server (non-blocking) with retry logic
go func() {
if err := syncServerConfig(apiClient, cfg); err != nil {
log.Printf("Warning: Failed to sync server config: %v", err)
if err := syncServerConfigWithRetry(apiClient, cfg, 5); err != nil {
log.Printf("Warning: Failed to sync server config after retries: %v", err)
}
}()