feat: add resilience and reliability features for agent subsystems
Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaces cron-based scheduler with priority queue that should scale beyond 1000+ agents if your homelab is that big. Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry. - Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown - Per-subsystem timeouts: 30s-10min depending on scanner - Priority queue scheduler: O(log n), worker pool, jitter, backpressure - Acknowledgments: at-least-once delivery, max 10 retries over 24h - All tests passing (26/26)
This commit is contained in:
@@ -269,6 +269,21 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Process command acknowledgments if agent sent any
|
||||
var acknowledgedIDs []string
|
||||
if metrics != nil && len(metrics.PendingAcknowledgments) > 0 {
|
||||
// Verify which commands from the agent's pending list have been recorded
|
||||
verified, err := h.commandQueries.VerifyCommandsCompleted(metrics.PendingAcknowledgments)
|
||||
if err != nil {
|
||||
log.Printf("Warning: Failed to verify command acknowledgments for agent %s: %v", agentID, err)
|
||||
} else {
|
||||
acknowledgedIDs = verified
|
||||
if len(acknowledgedIDs) > 0 {
|
||||
log.Printf("Acknowledged %d command results for agent %s", len(acknowledgedIDs), agentID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process heartbeat metadata from agent check-ins
|
||||
if metrics.Metadata != nil {
|
||||
agent, err := h.agentQueries.GetAgentByID(agentID)
|
||||
@@ -437,8 +452,9 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
|
||||
}
|
||||
|
||||
response := models.CommandsResponse{
|
||||
Commands: commandItems,
|
||||
RapidPolling: rapidPolling,
|
||||
Commands: commandItems,
|
||||
RapidPolling: rapidPolling,
|
||||
AcknowledgedIDs: acknowledgedIDs,
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, response)
|
||||
|
||||
Reference in New Issue
Block a user