feat: add resilience and reliability features for agent subsystems
Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaces cron-based scheduler with priority queue that should scale beyond 1000+ agents if your homelab is that big. Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry. - Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown - Per-subsystem timeouts: 30s-10min depending on scanner - Priority queue scheduler: O(log n), worker pool, jitter, backpressure - Acknowledgments: at-least-once delivery, max 10 retries over 24h - All tests passing (26/26)
This commit is contained in:
@@ -174,8 +174,9 @@ type Command struct {
|
||||
|
||||
// CommandsResponse contains pending commands
|
||||
type CommandsResponse struct {
|
||||
Commands []Command `json:"commands"`
|
||||
RapidPolling *RapidPollingConfig `json:"rapid_polling,omitempty"`
|
||||
Commands []Command `json:"commands"`
|
||||
RapidPolling *RapidPollingConfig `json:"rapid_polling,omitempty"`
|
||||
AcknowledgedIDs []string `json:"acknowledged_ids,omitempty"` // IDs server has received
|
||||
}
|
||||
|
||||
// RapidPollingConfig contains rapid polling configuration from server
|
||||
@@ -196,11 +197,15 @@ type SystemMetrics struct {
|
||||
Uptime string `json:"uptime,omitempty"`
|
||||
Version string `json:"version,omitempty"` // Agent version
|
||||
Metadata map[string]interface{} `json:"metadata,omitempty"` // Additional metadata
|
||||
|
||||
// Command acknowledgment tracking
|
||||
PendingAcknowledgments []string `json:"pending_acknowledgments,omitempty"` // Command IDs awaiting ACK
|
||||
}
|
||||
|
||||
// GetCommands retrieves pending commands from the server
|
||||
// Optionally sends lightweight system metrics in the request
|
||||
func (c *Client) GetCommands(agentID uuid.UUID, metrics *SystemMetrics) ([]Command, error) {
|
||||
// Returns the full response including commands and acknowledged IDs
|
||||
func (c *Client) GetCommands(agentID uuid.UUID, metrics *SystemMetrics) (*CommandsResponse, error) {
|
||||
url := fmt.Sprintf("%s/api/v1/agents/%s/commands", c.baseURL, agentID)
|
||||
|
||||
var req *http.Request
|
||||
@@ -252,7 +257,7 @@ func (c *Client) GetCommands(agentID uuid.UUID, metrics *SystemMetrics) ([]Comma
|
||||
}
|
||||
}
|
||||
|
||||
return result.Commands, nil
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
// UpdateReport represents discovered updates
|
||||
|
||||
Reference in New Issue
Block a user