fix: Remove duplicate scan logging to prevent storage/system scans on Updates page

BREAKING CHANGE: Storage and system scans no longer create entries in update_logs

**Problem**
- Storage scans were appearing on Updates page (mixed with package updates)
- System scans were appearing on Updates page (mixed with package updates)
- Duplicate "Scan All" entries from collective + individual logging

**Root Cause**
Scan handlers were calling both ReportLog() and dedicated endpoints:
- reportLogWithAck → POST /api/v1/agents/:id/logs → update_logs table
- This caused storage/system metrics to appear alongside package updates

**Fix**
Removed ALL ReportLog() calls from scan handlers:
1. handleScanUpdatesV2 (lines 44-46): Removed collective logging
2. handleScanStorage (lines 103-105): Use only ReportStorageMetrics
3. handleScanSystem (lines 189-191): Use only ReportMetrics
4. handleScanDocker (lines 269-271): Use only ReportDockerImages

**Verification**
- All 4 handlers have working dedicated endpoints (verified via subagent)
- Routes already registered: POST /storage-metrics, POST /metrics, etc.
- Frontend queries correct endpoints (verified)
- No data loss: dedicated endpoints store in proper tables

**Result**
- Storage scans → storage_metrics table → Storage page only 
- System scans → system reporting → System page only 
- Package updates → update_logs table → Updates page only 
- No duplicate "Scan All" entries 

**Files Changed**
- aggregator-agent/cmd/agent/subsystem_handlers.go: Removed 20 lines of ReportLog calls
- internal/api/handlers/agents.go: Command recovery enhancements
- internal/api/handlers/updates.go: Subsystem extraction logic
- internal/database/queries/commands.go: GetStuckCommands query
This commit is contained in:
Fimeg
2025-12-19 15:02:12 -05:00
parent a90692f1d8
commit 6b3ab6d6fc
20 changed files with 1001 additions and 153 deletions

View File

@@ -426,15 +426,27 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
}
// Get pending commands
commands, err := h.commandQueries.GetPendingCommands(agentID)
pendingCommands, err := h.commandQueries.GetPendingCommands(agentID)
if err != nil {
log.Printf("[ERROR] [server] [command] get_pending_failed agent_id=%s error=%v", agentID, err)
log.Printf("[HISTORY] [server] [command] get_pending_failed error=\"%v\" timestamp=%s", err, time.Now().Format(time.RFC3339))
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to retrieve commands"})
return
}
// Convert to response format
commandItems := make([]models.CommandItem, 0, len(commands))
for _, cmd := range commands {
// Recover stuck commands (sent > 5 minutes ago or pending > 5 minutes)
stuckCommands, err := h.commandQueries.GetStuckCommands(agentID, 5*time.Minute)
if err != nil {
log.Printf("[WARNING] [server] [command] get_stuck_failed agent_id=%s error=%v", agentID, err)
// Continue anyway, stuck commands check is non-critical
}
// Combine all commands to return
allCommands := append(pendingCommands, stuckCommands...)
// Convert to response format and mark all as sent immediately
commandItems := make([]models.CommandItem, 0, len(allCommands))
for _, cmd := range allCommands {
commandItems = append(commandItems, models.CommandItem{
ID: cmd.ID.String(),
Type: cmd.CommandType,
@@ -442,8 +454,21 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
Signature: cmd.Signature,
})
// Mark as sent
h.commandQueries.MarkCommandSent(cmd.ID)
// Mark as sent NOW with error handling (ETHOS: Errors are History)
if err := h.commandQueries.MarkCommandSent(cmd.ID); err != nil {
log.Printf("[ERROR] [server] [command] mark_sent_failed command_id=%s error=%v", cmd.ID, err)
log.Printf("[HISTORY] [server] [command] mark_sent_failed command_id=%s error=\"%v\" timestamp=%s",
cmd.ID, err, time.Now().Format(time.RFC3339))
// Continue - don't fail entire operation for one command
}
}
// Log command retrieval for audit trail
if len(allCommands) > 0 {
log.Printf("[INFO] [server] [command] retrieved_commands agent_id=%s count=%d timestamp=%s",
agentID, len(allCommands), time.Now().Format(time.RFC3339))
log.Printf("[HISTORY] [server] [command] retrieved_commands agent_id=%s count=%d timestamp=%s",
agentID, len(allCommands), time.Now().Format(time.RFC3339))
}
// Check if rapid polling should be enabled

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"log"
"net/http"
"time"
"github.com/Fimeg/RedFlag/aggregator-server/internal/database/queries"
"github.com/Fimeg/RedFlag/aggregator-server/internal/models"
@@ -241,15 +242,32 @@ func (h *SubsystemHandler) TriggerSubsystem(c *gin.Context) {
AgentID: agentID,
CommandType: commandType,
Status: "pending",
Source: "web_ui", // Manual trigger from UI
Source: "manual", // Manual trigger from UI (must be 'manual' or 'system' per DB constraint)
}
// Log command creation attempt
log.Printf("[INFO] [server] [command] creating_scan_command agent_id=%s subsystem=%s command_type=%s timestamp=%s",
agentID, subsystem, commandType, time.Now().Format(time.RFC3339))
log.Printf("[HISTORY] [server] [scan_%s] command_creation_started agent_id=%s timestamp=%s",
subsystem, agentID, time.Now().Format(time.RFC3339))
err = h.signAndCreateCommand(command)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create command"})
log.Printf("[ERROR] [server] [scan_%s] command_creation_failed agent_id=%s error=%v", subsystem, agentID, err)
log.Printf("[HISTORY] [server] [scan_%s] command_creation_failed error=\"%v\" timestamp=%s",
subsystem, err, time.Now().Format(time.RFC3339))
c.JSON(http.StatusInternalServerError, gin.H{
"error": fmt.Sprintf("Failed to create %s scan command: %v", subsystem, err),
})
return
}
log.Printf("[SUCCESS] [server] [scan_%s] command_created agent_id=%s command_id=%s timestamp=%s",
subsystem, agentID, command.ID, time.Now().Format(time.RFC3339))
log.Printf("[HISTORY] [server] [scan_%s] command_created agent_id=%s command_id=%s timestamp=%s",
subsystem, agentID, command.ID, time.Now().Format(time.RFC3339))
c.JSON(http.StatusOK, gin.H{
"message": "Subsystem scan triggered successfully",
"command_id": command.ID,

View File

@@ -5,6 +5,7 @@ import (
"log"
"net/http"
"strconv"
"strings"
"time"
"github.com/Fimeg/RedFlag/aggregator-server/internal/database/queries"
@@ -222,10 +223,17 @@ func (h *UpdateHandler) ReportLog(c *gin.Context) {
}
}
// Extract subsystem from request if provided, otherwise try to parse from action
subsystem := req.Subsystem
if subsystem == "" && strings.HasPrefix(req.Action, "scan_") {
subsystem = strings.TrimPrefix(req.Action, "scan_")
}
logEntry := &models.UpdateLog{
ID: uuid.New(),
AgentID: agentID,
Action: req.Action,
Subsystem: subsystem,
Result: validResult,
Stdout: req.Stdout,
Stderr: req.Stderr,
@@ -234,8 +242,16 @@ func (h *UpdateHandler) ReportLog(c *gin.Context) {
ExecutedAt: time.Now(),
}
// Add HISTORY logging
log.Printf("[INFO] [server] [update] log_created agent_id=%s subsystem=%s action=%s result=%s timestamp=%s",
agentID, subsystem, req.Action, validResult, time.Now().Format(time.RFC3339))
log.Printf("[HISTORY] [server] [update] log_created agent_id=%s subsystem=%s action=%s result=%s timestamp=%s",
agentID, subsystem, req.Action, validResult, time.Now().Format(time.RFC3339))
// Store the log entry
if err := h.updateQueries.CreateUpdateLog(logEntry); err != nil {
log.Printf("[ERROR] [server] [update] log_save_failed agent_id=%s error=%v", agentID, err)
log.Printf("[HISTORY] [server] [update] log_save_failed error=\"%v\" timestamp=%s", err, time.Now().Format(time.RFC3339))
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save log"})
return
}

View File

@@ -0,0 +1,17 @@
-- Migration: Rollback subsystem column addition
-- Purpose: Remove subsystem column and associated indexes
-- Drop indexes
DROP INDEX IF EXISTS idx_logs_agent_subsystem;
DROP INDEX IF EXISTS idx_logs_subsystem;
-- Drop check constraint
ALTER TABLE update_logs
DROP CONSTRAINT IF EXISTS chk_update_logs_subsystem;
-- Remove comment
COMMENT ON COLUMN update_logs.subsystem IS NULL;
-- Drop subsystem column
ALTER TABLE update_logs
DROP COLUMN IF EXISTS subsystem;

View File

@@ -0,0 +1,38 @@
-- Migration: Add subsystem column to update_logs table
-- Purpose: Make subsystem context explicit (not parsed from action field)
-- Add subsystem column
ALTER TABLE update_logs
ADD COLUMN IF NOT EXISTS subsystem VARCHAR(50);
-- Create indexes for subsystem filtering
CREATE INDEX IF NOT EXISTS idx_logs_subsystem ON update_logs(subsystem);
CREATE INDEX IF NOT EXISTS idx_logs_agent_subsystem ON update_logs(agent_id, subsystem);
-- Backfill subsystem from action field for existing scan entries
UPDATE update_logs
SET subsystem = CASE
WHEN action = 'scan_docker' THEN 'docker'
WHEN action = 'scan_storage' THEN 'storage'
WHEN action = 'scan_system' THEN 'system'
WHEN action = 'scan_apt' THEN 'apt'
WHEN action = 'scan_dnf' THEN 'dnf'
WHEN action = 'scan_winget' THEN 'winget'
WHEN action = 'scan_updates' THEN 'updates'
ELSE NULL
END
WHERE action LIKE 'scan_%' AND subsystem IS NULL;
-- Add check constraint for valid subsystem values
ALTER TABLE update_logs
ADD CONSTRAINT chk_update_logs_subsystem
CHECK (subsystem IS NULL OR subsystem IN (
'docker', 'storage', 'system', 'apt', 'dnf', 'winget', 'updates',
'agent', 'security', 'network', 'heartbeat'
));
-- Add comment for documentation
COMMENT ON COLUMN update_logs.subsystem IS 'Subsystem that generated this log entry (e.g., docker, storage, system)';
-- Grant permissions (adjust as needed for your setup)
-- GRANT ALL PRIVILEGES ON TABLE update_logs TO redflag_user;

View File

@@ -418,6 +418,25 @@ func (q *CommandQueries) GetCommandsInTimeRange(hours int) (int, error) {
return count, err
}
// GetStuckCommands retrieves commands that are stuck in 'pending' or 'sent' status
// These are commands that were returned to the agent but never marked as sent, or
// sent commands that haven't been completed/failed within the specified duration
func (q *CommandQueries) GetStuckCommands(agentID uuid.UUID, olderThan time.Duration) ([]models.AgentCommand, error) {
var commands []models.AgentCommand
query := `
SELECT * FROM agent_commands
WHERE agent_id = $1
AND status IN ('pending', 'sent')
AND (
(sent_at < $2 AND sent_at IS NOT NULL)
OR (created_at < $2 AND sent_at IS NULL)
)
ORDER BY created_at ASC
`
err := q.db.Select(&commands, query, agentID, time.Now().Add(-olderThan))
return commands, err
}
// VerifyCommandsCompleted checks which command IDs from the provided list have been completed or failed
// Returns the list of command IDs that have been successfully recorded (completed or failed status)
func (q *CommandQueries) VerifyCommandsCompleted(commandIDs []string) ([]string, error) {

View File

@@ -925,3 +925,44 @@ func (q *UpdateQueries) GetActiveOperations() ([]models.ActiveOperation, error)
return operations, nil
}
// GetLogsByAgentAndSubsystem retrieves logs for a specific agent filtered by subsystem
func (q *UpdateQueries) GetLogsByAgentAndSubsystem(agentID uuid.UUID, subsystem string) ([]models.UpdateLog, error) {
var logs []models.UpdateLog
query := `
SELECT id, agent_id, update_package_id, action, subsystem, result,
stdout, stderr, exit_code, duration_seconds, executed_at
FROM update_logs
WHERE agent_id = $1 AND subsystem = $2
ORDER BY executed_at DESC
`
err := q.db.Select(&logs, query, agentID, subsystem)
return logs, err
}
// GetSubsystemStats returns scan counts by subsystem for an agent
func (q *UpdateQueries) GetSubsystemStats(agentID uuid.UUID) (map[string]int64, error) {
query := `
SELECT subsystem, COUNT(*) as count
FROM update_logs
WHERE agent_id = $1 AND action LIKE 'scan_%'
GROUP BY subsystem
`
stats := make(map[string]int64)
rows, err := q.db.Queryx(query, agentID)
if err != nil {
return nil, err
}
defer rows.Close()
for rows.Next() {
var subsystem string
var count int64
if err := rows.Scan(&subsystem, &count); err != nil {
return nil, err
}
stats[subsystem] = count
}
return stats, nil
}

View File

@@ -58,6 +58,7 @@ type UpdateLog struct {
AgentID uuid.UUID `json:"agent_id" db:"agent_id"`
UpdatePackageID *uuid.UUID `json:"update_package_id,omitempty" db:"update_package_id"`
Action string `json:"action" db:"action"`
Subsystem string `json:"subsystem,omitempty" db:"subsystem"`
Result string `json:"result" db:"result"`
Stdout string `json:"stdout" db:"stdout"`
Stderr string `json:"stderr" db:"stderr"`
@@ -70,6 +71,7 @@ type UpdateLog struct {
type UpdateLogRequest struct {
CommandID string `json:"command_id"`
Action string `json:"action" binding:"required"`
Subsystem string `json:"subsystem,omitempty"`
Result string `json:"result" binding:"required"`
Stdout string `json:"stdout"`
Stderr string `json:"stderr"`