feat: add resilience and reliability features for agent subsystems

Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale beyond 1,000 agents, if your homelab is that big. The command acknowledgment system ensures results aren't lost on network failures or restarts: the agent tracks pending acknowledgments with persistent state and automatic retry.

- Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions
--- a/aggregator-server/internal/api/handlers/agents.go
+++ b/aggregator-server/internal/api/handlers/agents.go
@@ -269,6 +269,21 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
 		return
 	}

+	// Process command acknowledgments if agent sent any
+	var acknowledgedIDs []string
+	if metrics != nil && len(metrics.PendingAcknowledgments) > 0 {
+		// Verify which commands from the agent's pending list have been recorded
+		verified, err := h.commandQueries.VerifyCommandsCompleted(metrics.PendingAcknowledgments)
+		if err != nil {
+			log.Printf("Warning: Failed to verify command acknowledgments for agent %s: %v", agentID, err)
+		} else {
+			acknowledgedIDs = verified
+			if len(acknowledgedIDs) > 0 {
+				log.Printf("Acknowledged %d command results for agent %s", len(acknowledgedIDs), agentID)
+			}
+		}
+	}
+
 	// Process heartbeat metadata from agent check-ins
 	if metrics.Metadata != nil {
 		agent, err := h.agentQueries.GetAgentByID(agentID)
@@ -437,8 +452,9 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
 	}

 	response := models.CommandsResponse{
-		Commands:     commandItems,
-		RapidPolling: rapidPolling,
+		Commands:        commandItems,
+		RapidPolling:    rapidPolling,
+		AcknowledgedIDs: acknowledgedIDs,
 	}

 	c.JSON(http.StatusOK, response)