feat: add resilience and reliability features for agent subsystems

Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale to 1000+ agents, if your homelab is that big.

Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry.

- Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
Fimeg
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions

View File

@@ -337,3 +337,61 @@ func (q *CommandQueries) ClearAllFailedCommands(days int) (int64, error) {
return result.RowsAffected()
}
// CountPendingCommandsForAgent reports how many rows in agent_commands belong
// to agentID and still have status 'pending'.
// The scheduler uses this figure for backpressure detection.
//
// On a query error the count is returned as left by the database helper,
// together with the unwrapped error.
func (q *CommandQueries) CountPendingCommandsForAgent(agentID uuid.UUID) (int, error) {
	const pendingCountSQL = `
SELECT COUNT(*)
FROM agent_commands
WHERE agent_id = $1 AND status = 'pending'
`
	var pending int
	if err := q.db.Get(&pending, pendingCountSQL, agentID); err != nil {
		return pending, err
	}
	return pending, nil
}
// VerifyCommandsCompleted filters commandIDs down to those whose row in
// agent_commands has status 'completed' or 'failed', i.e. commands whose
// outcome has already been recorded.
//
// Entries that do not parse as UUIDs are silently dropped. When nothing
// parseable remains (including an empty input) an empty, non-nil slice is
// returned without touching the database. A query failure yields a nil slice
// and a wrapped error.
func (q *CommandQueries) VerifyCommandsCompleted(commandIDs []string) ([]string, error) {
	// Keep only the IDs that are well-formed UUIDs.
	parsed := make([]uuid.UUID, 0, len(commandIDs))
	for i := range commandIDs {
		if id, err := uuid.Parse(commandIDs[i]); err == nil {
			parsed = append(parsed, id)
		}
	}
	if len(parsed) == 0 {
		return []string{}, nil
	}

	// NOTE(review): binding a Go slice to ANY($1) relies on the underlying
	// driver accepting slice arguments — confirm against the driver in use.
	const finishedSQL = `
SELECT id
FROM agent_commands
WHERE id = ANY($1)
AND status IN ('completed', 'failed')
`
	var finished []uuid.UUID
	if err := q.db.Select(&finished, finishedSQL, parsed); err != nil {
		return nil, fmt.Errorf("failed to verify command completion: %w", err)
	}

	// Render the matching IDs back into their string form for the caller.
	verified := make([]string, 0, len(finished))
	for _, id := range finished {
		verified = append(verified, id.String())
	}
	return verified, nil
}