feat: add resilience and reliability features for agent subsystems

Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale beyond 1000 agents, if your homelab is that big. A command acknowledgment system ensures results aren't lost on network failures or restarts. The agent tracks pending acknowledgments with persistent state and automatic retry. - Circuit breakers: 3 failures within 1 min open the circuit, 30s cooldown - Per-subsystem timeouts: 30s-10min depending on the scanner - Priority queue scheduler: O(log n), worker pool, jitter, backpressure - Acknowledgments: at-least-once delivery, max 10 retries over 24h - All tests passing (26/26)
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions
--- a/aggregator-server/internal/database/queries/commands.go
+++ b/aggregator-server/internal/database/queries/commands.go
@@ -337,3 +337,61 @@ func (q *CommandQueries) ClearAllFailedCommands(days int) (int64, error) {

 	return result.RowsAffected()
 }
+
+// CountPendingCommandsForAgent reports how many agent_commands rows for agentID
+// currently have status 'pending'. The scheduler uses it to detect backpressure.
+func (q *CommandQueries) CountPendingCommandsForAgent(agentID uuid.UUID) (int, error) {
+	const stmt = `
+		SELECT COUNT(*)
+		FROM agent_commands
+		WHERE agent_id = $1 AND status = 'pending'
+	`
+	var pending int
+	err := q.db.Get(&pending, stmt, agentID)
+	return pending, err
+}
+
+// VerifyCommandsCompleted filters commandIDs down to those whose agent_commands
+// row has status 'completed' or 'failed', returning the matches as UUID strings.
+//
+// Entries that do not parse as UUIDs are dropped silently rather than reported.
+// An empty (non-nil) slice comes back when nothing parseable was supplied or no
+// row matched; a nil slice plus a wrapped error comes back if the query fails.
+//
+// NOTE(review): the parsed IDs are bound to $1 as a plain []uuid.UUID. Whether
+// the configured driver can encode that as an array is not visible here - confirm.
+func (q *CommandQueries) VerifyCommandsCompleted(commandIDs []string) ([]string, error) {
+	// Keep only the inputs that are well-formed UUIDs.
+	parsed := make([]uuid.UUID, 0, len(commandIDs))
+	for _, raw := range commandIDs {
+		if id, err := uuid.Parse(raw); err == nil {
+			parsed = append(parsed, id)
+		}
+	}
+
+	// Nothing usable to look up, so skip the round trip to the database.
+	if len(parsed) == 0 {
+		return []string{}, nil
+	}
+
+	// Select the subset of those IDs whose status is 'completed' or 'failed'.
+	const stmt = `
+		SELECT id
+		FROM agent_commands
+		WHERE id = ANY($1)
+		AND status IN ('completed', 'failed')
+	`
+
+	var recorded []uuid.UUID
+	if err := q.db.Select(&recorded, stmt, parsed); err != nil {
+		return nil, fmt.Errorf("failed to verify command completion: %w", err)
+	}
+
+	// Convert the matching UUIDs back to strings for the caller.
+	out := make([]string, 0, len(recorded))
+	for _, id := range recorded {
+		out = append(out, id.String())
+	}
+
+	return out, nil
+}