feat: add resilience and reliability features for agent subsystems

Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaces cron-based scheduler with priority queue that should scale beyond 1000+ agents if your homelab is that big.

Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry.

- Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
Fimeg
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions

View File

@@ -174,8 +174,9 @@ type Command struct {
// CommandsResponse contains pending commands
type CommandsResponse struct {
Commands []Command `json:"commands"`
RapidPolling *RapidPollingConfig `json:"rapid_polling,omitempty"`
Commands []Command `json:"commands"`
RapidPolling *RapidPollingConfig `json:"rapid_polling,omitempty"`
AcknowledgedIDs []string `json:"acknowledged_ids,omitempty"` // IDs server has received
}
// RapidPollingConfig contains rapid polling configuration from server
@@ -196,11 +197,15 @@ type SystemMetrics struct {
Uptime string `json:"uptime,omitempty"`
Version string `json:"version,omitempty"` // Agent version
Metadata map[string]interface{} `json:"metadata,omitempty"` // Additional metadata
// Command acknowledgment tracking
PendingAcknowledgments []string `json:"pending_acknowledgments,omitempty"` // Command IDs awaiting ACK
}
// GetCommands retrieves pending commands from the server
// Optionally sends lightweight system metrics in the request
func (c *Client) GetCommands(agentID uuid.UUID, metrics *SystemMetrics) ([]Command, error) {
// Returns the full response including commands and acknowledged IDs
func (c *Client) GetCommands(agentID uuid.UUID, metrics *SystemMetrics) (*CommandsResponse, error) {
url := fmt.Sprintf("%s/api/v1/agents/%s/commands", c.baseURL, agentID)
var req *http.Request
@@ -252,7 +257,7 @@ func (c *Client) GetCommands(agentID uuid.UUID, metrics *SystemMetrics) ([]Comma
}
}
return result.Commands, nil
return &result, nil
}
// UpdateReport represents discovered updates