feat: add resilience and reliability features for agent subsystems
Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale to 1000+ agents, if your homelab is that big. The command acknowledgment system ensures results aren't lost on network failures or restarts: the agent tracks pending acknowledgments with persistent state and retries them automatically.
- Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
@@ -337,3 +337,61 @@ func (q *CommandQueries) ClearAllFailedCommands(days int) (int64, error) {
|
||||
|
||||
return result.RowsAffected()
|
||||
}
|
||||
|
||||
// CountPendingCommandsForAgent returns the number of pending commands for a specific agent
|
||||
// Used by scheduler for backpressure detection
|
||||
func (q *CommandQueries) CountPendingCommandsForAgent(agentID uuid.UUID) (int, error) {
|
||||
var count int
|
||||
query := `
|
||||
SELECT COUNT(*)
|
||||
FROM agent_commands
|
||||
WHERE agent_id = $1 AND status = 'pending'
|
||||
`
|
||||
err := q.db.Get(&count, query, agentID)
|
||||
return count, err
|
||||
}
|
||||
|
||||
// VerifyCommandsCompleted checks which command IDs from the provided list have been completed or failed
|
||||
// Returns the list of command IDs that have been successfully recorded (completed or failed status)
|
||||
func (q *CommandQueries) VerifyCommandsCompleted(commandIDs []string) ([]string, error) {
|
||||
if len(commandIDs) == 0 {
|
||||
return []string{}, nil
|
||||
}
|
||||
|
||||
// Convert string IDs to UUIDs
|
||||
uuidIDs := make([]uuid.UUID, 0, len(commandIDs))
|
||||
for _, idStr := range commandIDs {
|
||||
id, err := uuid.Parse(idStr)
|
||||
if err != nil {
|
||||
// Skip invalid UUIDs
|
||||
continue
|
||||
}
|
||||
uuidIDs = append(uuidIDs, id)
|
||||
}
|
||||
|
||||
if len(uuidIDs) == 0 {
|
||||
return []string{}, nil
|
||||
}
|
||||
|
||||
// Query for commands that are completed or failed
|
||||
query := `
|
||||
SELECT id
|
||||
FROM agent_commands
|
||||
WHERE id = ANY($1)
|
||||
AND status IN ('completed', 'failed')
|
||||
`
|
||||
|
||||
var completedUUIDs []uuid.UUID
|
||||
err := q.db.Select(&completedUUIDs, query, uuidIDs)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to verify command completion: %w", err)
|
||||
}
|
||||
|
||||
// Convert back to strings
|
||||
completedIDs := make([]string, len(completedUUIDs))
|
||||
for i, id := range completedUUIDs {
|
||||
completedIDs[i] = id.String()
|
||||
}
|
||||
|
||||
return completedIDs, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user