feat: add resilience and reliability features for agent subsystems

Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale beyond 1,000 agents, if your homelab is that big.

The command acknowledgment system ensures results aren't lost on network failures or restarts. The agent tracks pending acknowledgments with persistent state and automatic retry.

- Circuit breakers: 3 failures within 1 min open the circuit; 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
Fimeg
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions

View File

@@ -269,6 +269,21 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
return
}
// Process command acknowledgments if agent sent any
var acknowledgedIDs []string
if metrics != nil && len(metrics.PendingAcknowledgments) > 0 {
// Verify which commands from the agent's pending list have been recorded
verified, err := h.commandQueries.VerifyCommandsCompleted(metrics.PendingAcknowledgments)
if err != nil {
log.Printf("Warning: Failed to verify command acknowledgments for agent %s: %v", agentID, err)
} else {
acknowledgedIDs = verified
if len(acknowledgedIDs) > 0 {
log.Printf("Acknowledged %d command results for agent %s", len(acknowledgedIDs), agentID)
}
}
}
// Process heartbeat metadata from agent check-ins
if metrics.Metadata != nil {
agent, err := h.agentQueries.GetAgentByID(agentID)
@@ -437,8 +452,9 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
}
response := models.CommandsResponse{
Commands: commandItems,
RapidPolling: rapidPolling,
Commands: commandItems,
RapidPolling: rapidPolling,
AcknowledgedIDs: acknowledgedIDs,
}
c.JSON(http.StatusOK, response)