feat: add resilience and reliability features for agent subsystems
Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale beyond 1000 agents, if your homelab is that big. A command acknowledgment system ensures results aren't lost on network failures or restarts: the agent tracks pending acknowledgments with persistent state and automatic retry. - Circuit breakers: 3 failures in 1min opens the circuit, 30s cooldown - Per-subsystem timeouts: 30s-10min depending on scanner - Priority queue scheduler: O(log n), worker pool, jitter, backpressure - Acknowledgments: at-least-once delivery, max 10 retries over 24h - All tests passing (26/26)
This commit is contained in:
233
aggregator-agent/internal/circuitbreaker/circuitbreaker.go
Normal file
233
aggregator-agent/internal/circuitbreaker/circuitbreaker.go
Normal file
@@ -0,0 +1,233 @@
|
||||
package circuitbreaker
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// State identifies the phase a circuit breaker is currently in.
type State int

const (
	// StateClosed is normal operation: every call is let through.
	StateClosed State = iota
	// StateOpen means the circuit has tripped and calls fail fast.
	StateOpen
	// StateHalfOpen means calls are let through again to test whether the
	// protected service has recovered.
	StateHalfOpen
)

// String returns the lowercase name of the state ("closed", "open" or
// "half-open"). Any value outside the declared constants yields "unknown".
func (s State) String() string {
	names := [...]string{
		StateClosed:   "closed",
		StateOpen:     "open",
		StateHalfOpen: "half-open",
	}
	if s >= StateClosed && int(s) < len(names) {
		return names[s]
	}
	return "unknown"
}
|
||||
|
||||
// Config carries the tunable thresholds of a CircuitBreaker.
type Config struct {
	// FailureThreshold is the number of failures inside FailureWindow at
	// which the circuit opens.
	FailureThreshold int

	// FailureWindow is how far back in time failures are still counted.
	FailureWindow time.Duration

	// OpenDuration is how long the circuit stays open before calls are
	// allowed through again.
	OpenDuration time.Duration

	// HalfOpenAttempts is the number of consecutive successes needed to
	// close the circuit from the half-open state.
	HalfOpenAttempts int
}
|
||||
|
||||
// CircuitBreaker implements the circuit breaker pattern for subsystems
|
||||
type CircuitBreaker struct {
|
||||
name string
|
||||
config Config
|
||||
|
||||
mu sync.RWMutex
|
||||
state State
|
||||
failures []time.Time // Timestamps of recent failures
|
||||
consecutiveSuccess int // Consecutive successes in half-open state
|
||||
openedAt time.Time // When circuit was opened
|
||||
}
|
||||
|
||||
// New creates a new circuit breaker
|
||||
func New(name string, config Config) *CircuitBreaker {
|
||||
return &CircuitBreaker{
|
||||
name: name,
|
||||
config: config,
|
||||
state: StateClosed,
|
||||
failures: make([]time.Time, 0),
|
||||
}
|
||||
}
|
||||
|
||||
// Call executes the given function with circuit breaker protection
|
||||
func (cb *CircuitBreaker) Call(fn func() error) error {
|
||||
// Check if we can execute
|
||||
if err := cb.beforeCall(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Execute the function
|
||||
err := fn()
|
||||
|
||||
// Record the result
|
||||
cb.afterCall(err)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// beforeCall checks if the call should be allowed
|
||||
func (cb *CircuitBreaker) beforeCall() error {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
switch cb.state {
|
||||
case StateClosed:
|
||||
// Normal operation, allow call
|
||||
return nil
|
||||
|
||||
case StateOpen:
|
||||
// Check if enough time has passed to try half-open
|
||||
if time.Since(cb.openedAt) >= cb.config.OpenDuration {
|
||||
cb.state = StateHalfOpen
|
||||
cb.consecutiveSuccess = 0
|
||||
return nil
|
||||
}
|
||||
// Circuit is still open, fail fast
|
||||
return fmt.Errorf("circuit breaker [%s] is OPEN (will retry at %s)",
|
||||
cb.name, cb.openedAt.Add(cb.config.OpenDuration).Format("15:04:05"))
|
||||
|
||||
case StateHalfOpen:
|
||||
// In half-open state, allow limited attempts
|
||||
return nil
|
||||
|
||||
default:
|
||||
return fmt.Errorf("unknown circuit breaker state")
|
||||
}
|
||||
}
|
||||
|
||||
// afterCall records the result and updates state
|
||||
func (cb *CircuitBreaker) afterCall(err error) {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
|
||||
if err != nil {
|
||||
// Record failure
|
||||
cb.recordFailure(now)
|
||||
|
||||
// If in half-open, go back to open on any failure
|
||||
if cb.state == StateHalfOpen {
|
||||
cb.state = StateOpen
|
||||
cb.openedAt = now
|
||||
cb.consecutiveSuccess = 0
|
||||
return
|
||||
}
|
||||
|
||||
// Check if we should open the circuit
|
||||
if cb.shouldOpen(now) {
|
||||
cb.state = StateOpen
|
||||
cb.openedAt = now
|
||||
cb.consecutiveSuccess = 0
|
||||
}
|
||||
} else {
|
||||
// Success
|
||||
switch cb.state {
|
||||
case StateHalfOpen:
|
||||
// Count consecutive successes
|
||||
cb.consecutiveSuccess++
|
||||
if cb.consecutiveSuccess >= cb.config.HalfOpenAttempts {
|
||||
// Enough successes, close the circuit
|
||||
cb.state = StateClosed
|
||||
cb.failures = make([]time.Time, 0)
|
||||
cb.consecutiveSuccess = 0
|
||||
}
|
||||
|
||||
case StateClosed:
|
||||
// Clean up old failures on success
|
||||
cb.cleanupOldFailures(now)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// recordFailure adds a failure timestamp
|
||||
func (cb *CircuitBreaker) recordFailure(now time.Time) {
|
||||
cb.failures = append(cb.failures, now)
|
||||
cb.cleanupOldFailures(now)
|
||||
}
|
||||
|
||||
// cleanupOldFailures removes failures outside the window
|
||||
func (cb *CircuitBreaker) cleanupOldFailures(now time.Time) {
|
||||
cutoff := now.Add(-cb.config.FailureWindow)
|
||||
validFailures := make([]time.Time, 0)
|
||||
|
||||
for _, failTime := range cb.failures {
|
||||
if failTime.After(cutoff) {
|
||||
validFailures = append(validFailures, failTime)
|
||||
}
|
||||
}
|
||||
|
||||
cb.failures = validFailures
|
||||
}
|
||||
|
||||
// shouldOpen determines if circuit should open based on failures
|
||||
func (cb *CircuitBreaker) shouldOpen(now time.Time) bool {
|
||||
cb.cleanupOldFailures(now)
|
||||
return len(cb.failures) >= cb.config.FailureThreshold
|
||||
}
|
||||
|
||||
// State returns the current circuit breaker state (thread-safe)
|
||||
func (cb *CircuitBreaker) State() State {
|
||||
cb.mu.RLock()
|
||||
defer cb.mu.RUnlock()
|
||||
return cb.state
|
||||
}
|
||||
|
||||
// GetStats returns current circuit breaker statistics
|
||||
func (cb *CircuitBreaker) GetStats() Stats {
|
||||
cb.mu.RLock()
|
||||
defer cb.mu.RUnlock()
|
||||
|
||||
stats := Stats{
|
||||
Name: cb.name,
|
||||
State: cb.state.String(),
|
||||
RecentFailures: len(cb.failures),
|
||||
ConsecutiveSuccess: cb.consecutiveSuccess,
|
||||
}
|
||||
|
||||
if cb.state == StateOpen && !cb.openedAt.IsZero() {
|
||||
nextAttempt := cb.openedAt.Add(cb.config.OpenDuration)
|
||||
stats.NextAttempt = &nextAttempt
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// Reset manually resets the circuit breaker to closed state
|
||||
func (cb *CircuitBreaker) Reset() {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
cb.state = StateClosed
|
||||
cb.failures = make([]time.Time, 0)
|
||||
cb.consecutiveSuccess = 0
|
||||
cb.openedAt = time.Time{}
|
||||
}
|
||||
|
||||
// Stats is a snapshot of a circuit breaker's condition.
type Stats struct {
	// Name is the breaker's label.
	Name string

	// State is the textual form of the breaker's state.
	State string

	// RecentFailures is the number of tracked failures.
	RecentFailures int

	// ConsecutiveSuccess is the half-open success counter.
	ConsecutiveSuccess int

	// NextAttempt, when non-nil, is the time at which the next attempt is
	// due.
	NextAttempt *time.Time
}

// String renders the stats on one line. When NextAttempt is set, its
// time of day (HH:MM:SS) is shown in place of the success counter.
func (s Stats) String() string {
	if s.NextAttempt == nil {
		return fmt.Sprintf("[%s] state=%s, failures=%d, success=%d",
			s.Name, s.State, s.RecentFailures, s.ConsecutiveSuccess)
	}

	retryAt := s.NextAttempt.Format("15:04:05")
	return fmt.Sprintf("[%s] state=%s, failures=%d, next_attempt=%s",
		s.Name, s.State, s.RecentFailures, retryAt)
}
|
||||
Reference in New Issue
Block a user