fix: Remove duplicate scan logging to prevent storage/system scans on Updates page
BREAKING CHANGE: Storage and system scans no longer create entries in update_logs **Problem** - Storage scans were appearing on Updates page (mixed with package updates) - System scans were appearing on Updates page (mixed with package updates) - Duplicate "Scan All" entries from collective + individual logging **Root Cause** Scan handlers were calling both ReportLog() and dedicated endpoints: - reportLogWithAck → POST /api/v1/agents/:id/logs → update_logs table - This caused storage/system metrics to appear alongside package updates **Fix** Removed ALL ReportLog() calls from scan handlers: 1. handleScanUpdatesV2 (lines 44-46): Removed collective logging 2. handleScanStorage (lines 103-105): Use only ReportStorageMetrics 3. handleScanSystem (lines 189-191): Use only ReportMetrics 4. handleScanDocker (lines 269-271): Use only ReportDockerImages **Verification** - All 4 handlers have working dedicated endpoints (verified via subagent) - Routes already registered: POST /storage-metrics, POST /metrics, etc. - Frontend queries correct endpoints (verified) - No data loss: dedicated endpoints store in proper tables **Result** - Storage scans → storage_metrics table → Storage page only ✅ - System scans → system reporting → System page only ✅ - Package updates → update_logs table → Updates page only ✅ - No duplicate "Scan All" entries ✅ **Files Changed** - aggregator-agent/cmd/agent/subsystem_handlers.go: Removed 20 lines of ReportLog calls - internal/api/handlers/agents.go: Command recovery enhancements - internal/api/handlers/updates.go: Subsystem extraction logic - internal/database/queries/commands.go: GetStuckCommands query
This commit is contained in:
@@ -23,9 +23,11 @@ import (
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/installer"
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/migration"
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/orchestrator"
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/guardian"
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/scanner"
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/service"
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/system"
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/validator"
|
||||
"github.com/Fimeg/RedFlag/aggregator-agent/internal/version"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
@@ -524,87 +526,143 @@ func getCurrentSubsystemEnabled(cfg *config.Config, subsystemName string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
// syncServerConfig checks for and applies server configuration updates
|
||||
func syncServerConfig(apiClient *client.Client, cfg *config.Config) error {
|
||||
// Get current config from server
|
||||
// syncServerConfigProper checks for and applies server configuration updates with validation and protection
|
||||
func syncServerConfigProper(apiClient *client.Client, cfg *config.Config) error {
|
||||
serverConfig, err := apiClient.GetConfig(cfg.AgentID)
|
||||
if err != nil {
|
||||
log.Printf("[HISTORY] [agent] [config] sync_failed error=\"%v\" timestamp=%s",
|
||||
err, time.Now().Format(time.RFC3339))
|
||||
return fmt.Errorf("failed to get server config: %w", err)
|
||||
}
|
||||
|
||||
// Check if config version is newer
|
||||
if serverConfig.Version <= lastConfigVersion {
|
||||
return nil // No update needed
|
||||
}
|
||||
|
||||
log.Printf("📡 Server config update detected (version: %d)", serverConfig.Version)
|
||||
log.Printf("[INFO] [agent] [config] server config update detected (version: %d)", serverConfig.Version)
|
||||
changes := false
|
||||
|
||||
// Track potential check-in interval changes separately to avoid inflation
|
||||
newCheckInInterval := cfg.CheckInInterval
|
||||
// Create validator for interval bounds checking
|
||||
intervalValidator := validator.NewIntervalValidator()
|
||||
|
||||
// Apply subsystem configuration from server
|
||||
// Create guardian to protect against check-in interval override attempts
|
||||
intervalGuardian := guardian.NewIntervalGuardian()
|
||||
intervalGuardian.SetBaseline(cfg.CheckInInterval)
|
||||
|
||||
// Process subsystem configurations
|
||||
for subsystemName, subsystemConfig := range serverConfig.Subsystems {
|
||||
if configMap, ok := subsystemConfig.(map[string]interface{}); ok {
|
||||
enabled := false
|
||||
intervalMinutes := 0
|
||||
autoRun := false
|
||||
|
||||
if e, exists := configMap["enabled"]; exists {
|
||||
if eVal, ok := e.(bool); ok {
|
||||
enabled = eVal
|
||||
}
|
||||
// Parse interval from server config
|
||||
intervalFloat := 0.0
|
||||
if rawInterval, ok := configMap["interval_minutes"].(float64); ok {
|
||||
intervalFloat = rawInterval
|
||||
}
|
||||
intervalMinutes := int(intervalFloat)
|
||||
|
||||
if i, exists := configMap["interval_minutes"]; exists {
|
||||
if iVal, ok := i.(float64); ok {
|
||||
intervalMinutes = int(iVal)
|
||||
// Validate scanner interval
|
||||
if intervalMinutes > 0 {
|
||||
if err := intervalValidator.ValidateScannerInterval(intervalMinutes); err != nil {
|
||||
log.Printf("[ERROR] [agent] [config] [%s] scanner interval validation failed: %v",
|
||||
subsystemName, err)
|
||||
log.Printf("[HISTORY] [agent] [config] [%s] interval_rejected interval=%d reason=\"%v\" timestamp=%s",
|
||||
subsystemName, intervalMinutes, err, time.Now().Format(time.RFC3339))
|
||||
continue // Skip invalid interval but don't fail entire sync
|
||||
}
|
||||
}
|
||||
|
||||
if a, exists := configMap["auto_run"]; exists {
|
||||
if aVal, ok := a.(bool); ok {
|
||||
autoRun = aVal
|
||||
}
|
||||
}
|
||||
|
||||
// Get current subsystem enabled state dynamically
|
||||
currentEnabled := getCurrentSubsystemEnabled(cfg, subsystemName)
|
||||
if enabled != currentEnabled {
|
||||
log.Printf(" → %s: enabled=%v (changed)", subsystemName, enabled)
|
||||
log.Printf("[INFO] [agent] [config] [%s] interval=%d minutes", subsystemName, intervalMinutes)
|
||||
changes = true
|
||||
}
|
||||
|
||||
// Check if interval actually changed, but don't modify cfg.CheckInInterval yet
|
||||
if intervalMinutes > 0 && intervalMinutes != newCheckInInterval {
|
||||
log.Printf(" → %s: interval=%d minutes (changed)", subsystemName, intervalMinutes)
|
||||
changes = true
|
||||
newCheckInInterval = intervalMinutes // Update temp variable, not the config
|
||||
}
|
||||
// Apply validated interval to the appropriate subsystem
|
||||
switch subsystemName {
|
||||
case "system":
|
||||
cfg.Subsystems.System.IntervalMinutes = intervalMinutes
|
||||
case "apt":
|
||||
cfg.Subsystems.APT.IntervalMinutes = intervalMinutes
|
||||
case "dnf":
|
||||
cfg.Subsystems.DNF.IntervalMinutes = intervalMinutes
|
||||
case "storage":
|
||||
cfg.Subsystems.Storage.IntervalMinutes = intervalMinutes
|
||||
case "winget":
|
||||
cfg.Subsystems.Winget.IntervalMinutes = intervalMinutes
|
||||
default:
|
||||
log.Printf("[WARNING] [agent] [config] unknown subsystem: %s", subsystemName)
|
||||
}
|
||||
|
||||
if autoRun {
|
||||
log.Printf(" → %s: auto_run=%v (server-side scheduling)", subsystemName, autoRun)
|
||||
// Log to history table
|
||||
log.Printf("[HISTORY] [agent] [config] [%s] interval_updated minutes=%d timestamp=%s",
|
||||
subsystemName, intervalMinutes, time.Now().Format(time.RFC3339))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply the check-in interval change only once after all subsystems processed
|
||||
if newCheckInInterval != cfg.CheckInInterval {
|
||||
cfg.CheckInInterval = newCheckInInterval
|
||||
// Verification: Ensure no scanner interval is interfering with check-in frequency
|
||||
// This guards against regressions where scanner settings might affect agent polling
|
||||
if intervalGuardian.GetViolationCount() > 0 {
|
||||
log.Printf("[WARNING] [agent] [config] guardian detected %d previous interval violations",
|
||||
intervalGuardian.GetViolationCount())
|
||||
}
|
||||
|
||||
if err := cfg.Save(constants.GetAgentConfigPath()); err != nil {
|
||||
log.Printf("[HISTORY] [agent] [config] save_failed error=\"%v\" timestamp=%s",
|
||||
err, time.Now().Format(time.RFC3339))
|
||||
return fmt.Errorf("failed to save config: %w", err)
|
||||
}
|
||||
|
||||
if changes {
|
||||
log.Printf("✅ Server configuration applied successfully")
|
||||
} else {
|
||||
log.Printf("ℹ️ Server config received but no changes detected")
|
||||
log.Printf("[INFO] [agent] [config] scanner interval updates applied")
|
||||
}
|
||||
|
||||
// Update last config version
|
||||
lastConfigVersion = serverConfig.Version
|
||||
log.Printf("[SUCCESS] [agent] [config] config saved successfully")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// syncServerConfigWithRetry wraps syncServerConfigProper with retry logic
|
||||
func syncServerConfigWithRetry(apiClient *client.Client, cfg *config.Config, maxRetries int) error {
|
||||
var lastErr error
|
||||
|
||||
for attempt := 1; attempt <= maxRetries; attempt++ {
|
||||
if err := syncServerConfigProper(apiClient, cfg); err != nil {
|
||||
lastErr = err
|
||||
|
||||
log.Printf("[ERROR] [agent] [config] sync attempt %d/%d failed: %v",
|
||||
attempt, maxRetries, err)
|
||||
|
||||
// Log to history table
|
||||
log.Printf("[HISTORY] [agent] [config] sync_failed attempt=%d/%d error=\"%v\" timestamp=%s",
|
||||
attempt, maxRetries, err, time.Now().Format(time.RFC3339))
|
||||
|
||||
if attempt < maxRetries {
|
||||
// Exponential backoff: 1s, 2s, 4s, 8s...
|
||||
backoff := time.Duration(1<<uint(attempt-1)) * time.Second
|
||||
log.Printf("[INFO] [agent] [config] retrying in %v...", backoff)
|
||||
time.Sleep(backoff)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
log.Printf("[SUCCESS] [agent] [config] synced after %d attempts", attempt)
|
||||
return nil
|
||||
}
|
||||
|
||||
// After maxRetries, degrade gracefully
|
||||
if err := cfg.SetDegradedMode(true); err != nil {
|
||||
log.Printf("[ERROR] [agent] [config] failed to enter degraded mode: %v", err)
|
||||
log.Printf("[HISTORY] [agent] [config] degraded_mode_failed error=\"%v\" timestamp=%s",
|
||||
err, time.Now().Format(time.RFC3339))
|
||||
} else {
|
||||
log.Printf("[WARNING] [agent] [config] entering degraded mode after %d failed attempts", maxRetries)
|
||||
}
|
||||
|
||||
// Log degraded mode entry to history
|
||||
log.Printf("[HISTORY] [agent] [config] degraded_mode_entered failures=%d timestamp=%s",
|
||||
maxRetries, time.Now().Format(time.RFC3339))
|
||||
|
||||
return lastErr
|
||||
}
|
||||
|
||||
func runAgent(cfg *config.Config) error {
|
||||
log.Printf("🚩 RedFlag Agent v%s starting...\n", version.Version)
|
||||
log.Printf("==================================================================")
|
||||
@@ -656,17 +714,42 @@ func runAgent(cfg *config.Config) error {
|
||||
// Initialize scanner orchestrator for parallel execution and granular subsystem management
|
||||
scanOrchestrator := orchestrator.NewOrchestrator()
|
||||
|
||||
// Register update scanners ONLY - package management systems
|
||||
// Initialize scanners for storage, system, and docker (used by individual subsystem handlers)
|
||||
storageScanner := orchestrator.NewStorageScanner(version.Version)
|
||||
systemScanner := orchestrator.NewSystemScanner(version.Version)
|
||||
dockerScanner, _ := scanner.NewDockerScanner()
|
||||
|
||||
// Initialize circuit breakers for all subsystems
|
||||
storageCB := circuitbreaker.New("Storage", circuitbreaker.Config{
|
||||
FailureThreshold: cfg.Subsystems.Storage.CircuitBreaker.FailureThreshold,
|
||||
FailureWindow: cfg.Subsystems.Storage.CircuitBreaker.FailureWindow,
|
||||
OpenDuration: cfg.Subsystems.Storage.CircuitBreaker.OpenDuration,
|
||||
HalfOpenAttempts: cfg.Subsystems.Storage.CircuitBreaker.HalfOpenAttempts,
|
||||
})
|
||||
systemCB := circuitbreaker.New("System", circuitbreaker.Config{
|
||||
FailureThreshold: cfg.Subsystems.System.CircuitBreaker.FailureThreshold,
|
||||
FailureWindow: cfg.Subsystems.System.CircuitBreaker.FailureWindow,
|
||||
OpenDuration: cfg.Subsystems.System.CircuitBreaker.OpenDuration,
|
||||
HalfOpenAttempts: cfg.Subsystems.System.CircuitBreaker.HalfOpenAttempts,
|
||||
})
|
||||
dockerCB := circuitbreaker.New("Docker", circuitbreaker.Config{
|
||||
FailureThreshold: cfg.Subsystems.Docker.CircuitBreaker.FailureThreshold,
|
||||
FailureWindow: cfg.Subsystems.Docker.CircuitBreaker.FailureWindow,
|
||||
OpenDuration: cfg.Subsystems.Docker.CircuitBreaker.OpenDuration,
|
||||
HalfOpenAttempts: cfg.Subsystems.Docker.CircuitBreaker.HalfOpenAttempts,
|
||||
})
|
||||
|
||||
// Register ALL scanners with the orchestrator
|
||||
// Update scanners (package management)
|
||||
scanOrchestrator.RegisterScanner("apt", orchestrator.NewAPTScannerWrapper(aptScanner), aptCB, cfg.Subsystems.APT.Timeout, cfg.Subsystems.APT.Enabled)
|
||||
scanOrchestrator.RegisterScanner("dnf", orchestrator.NewDNFScannerWrapper(dnfScanner), dnfCB, cfg.Subsystems.DNF.Timeout, cfg.Subsystems.DNF.Enabled)
|
||||
scanOrchestrator.RegisterScanner("windows", orchestrator.NewWindowsUpdateScannerWrapper(windowsUpdateScanner), windowsCB, cfg.Subsystems.Windows.Timeout, cfg.Subsystems.Windows.Enabled)
|
||||
scanOrchestrator.RegisterScanner("winget", orchestrator.NewWingetScannerWrapper(wingetScanner), wingetCB, cfg.Subsystems.Winget.Timeout, cfg.Subsystems.Winget.Enabled)
|
||||
|
||||
// NOTE: Docker, Storage, and System scanners are NOT registered with the update orchestrator
|
||||
// They have their own dedicated handlers and endpoints:
|
||||
// - Docker: handleScanDocker → ReportDockerImages()
|
||||
// - Storage: handleScanStorage → ReportMetrics()
|
||||
// - System: handleScanSystem → ReportMetrics()
|
||||
// System scanners (metrics and monitoring)
|
||||
scanOrchestrator.RegisterScanner("storage", orchestrator.NewStorageScannerWrapper(storageScanner), storageCB, cfg.Subsystems.Storage.Timeout, cfg.Subsystems.Storage.Enabled)
|
||||
scanOrchestrator.RegisterScanner("system", orchestrator.NewSystemScannerWrapper(systemScanner), systemCB, cfg.Subsystems.System.Timeout, cfg.Subsystems.System.Enabled)
|
||||
scanOrchestrator.RegisterScanner("docker", orchestrator.NewDockerScannerWrapper(dockerScanner), dockerCB, cfg.Subsystems.Docker.Timeout, cfg.Subsystems.Docker.Enabled)
|
||||
|
||||
// Initialize acknowledgment tracker for command result reliability
|
||||
ackTracker := acknowledgment.NewTracker(constants.GetAgentStateDir())
|
||||
@@ -804,10 +887,10 @@ func runAgent(cfg *config.Config) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Sync configuration from server (non-blocking)
|
||||
// Sync configuration from server (non-blocking) with retry logic
|
||||
go func() {
|
||||
if err := syncServerConfig(apiClient, cfg); err != nil {
|
||||
log.Printf("Warning: Failed to sync server config: %v", err)
|
||||
if err := syncServerConfigWithRetry(apiClient, cfg, 5); err != nil {
|
||||
log.Printf("Warning: Failed to sync server config after retries: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user