Files
Redflag/aggregator-agent/internal/config/subsystems.go
Fimeg bf4d46529f feat: add resilience and reliability features for agent subsystems
Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale past 1000 agents, if your homelab is that big.

Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry.

- Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
2025-11-01 18:42:41 -04:00

96 lines
3.1 KiB
Go

package config
import "time"
// SubsystemConfig is the per-subsystem configuration: whether the subsystem
// runs at all, how long it is allowed to run, and its circuit breaker settings.
type SubsystemConfig struct {
	// Enabled switches the subsystem on or off.
	Enabled bool `json:"enabled"`

	// Timeout is the time limit for this subsystem. With encoding/json a
	// time.Duration is encoded as an integer number of nanoseconds.
	Timeout time.Duration `json:"timeout"`

	// CircuitBreaker holds the circuit breaker settings for this subsystem.
	CircuitBreaker CircuitBreakerConfig `json:"circuit_breaker"`
}
// CircuitBreakerConfig describes how a subsystem's circuit breaker behaves.
//
// NOTE(review): the breaker itself is implemented outside this file; confirm
// there whether failures must be consecutive or merely fall inside
// FailureWindow, since the field descriptions mention both.
type CircuitBreakerConfig struct {
	// Enabled turns the circuit breaker on or off.
	Enabled bool `json:"enabled"`

	// FailureThreshold is how many consecutive failures open the circuit.
	FailureThreshold int `json:"failure_threshold"`

	// FailureWindow is the period over which failures are tracked
	// (for example, 3 failures in 10 minutes).
	FailureWindow time.Duration `json:"failure_window"`

	// OpenDuration is how long the circuit remains open before recovery
	// is attempted.
	OpenDuration time.Duration `json:"open_duration"`

	// HalfOpenAttempts is the number of test attempts made in the half-open
	// state before the circuit is fully closed.
	HalfOpenAttempts int `json:"half_open_attempts"`
}
// SubsystemsConfig groups the configuration of every subsystem, one
// SubsystemConfig entry each for APT, DNF, Docker, Windows, Winget and Storage.
type SubsystemsConfig struct {
	APT     SubsystemConfig `json:"apt"`
	DNF     SubsystemConfig `json:"dnf"`
	Docker  SubsystemConfig `json:"docker"`
	Windows SubsystemConfig `json:"windows"`
	Winget  SubsystemConfig `json:"winget"`
	Storage SubsystemConfig `json:"storage"`
}
// GetDefaultSubsystemsConfig builds the default configuration for all six
// subsystems. Every subsystem is enabled; each gets its own timeout, and all
// but Windows share the same circuit breaker settings.
func GetDefaultSubsystemsConfig() SubsystemsConfig {
	// Breaker used by every subsystem except Windows: 3 failures within
	// 10 minutes open the circuit for 30 minutes, and 2 successful half-open
	// attempts close it again.
	standard := CircuitBreakerConfig{
		Enabled:          true,
		FailureThreshold: 3,
		FailureWindow:    10 * time.Minute,
		OpenDuration:     30 * time.Minute,
		HalfOpenAttempts: 2,
	}

	// More aggressive breaker for Windows Update, which is known to be slow
	// and problematic: only 2 failures are tolerated and the circuit stays
	// open for a full hour.
	strict := CircuitBreakerConfig{
		Enabled:          true,
		FailureThreshold: 2,
		FailureWindow:    15 * time.Minute,
		OpenDuration:     60 * time.Minute,
		HalfOpenAttempts: 3,
	}

	// enabled returns an enabled subsystem entry with the given timeout and
	// circuit breaker settings.
	enabled := func(timeout time.Duration, breaker CircuitBreakerConfig) SubsystemConfig {
		return SubsystemConfig{
			Enabled:        true,
			Timeout:        timeout,
			CircuitBreaker: breaker,
		}
	}

	var cfg SubsystemsConfig

	cfg.APT = enabled(30*time.Second, standard)

	// DNF can be slower.
	cfg.DNF = enabled(45*time.Second, standard)

	// Registry queries can be slow.
	cfg.Docker = enabled(60*time.Second, standard)

	// Windows Update can be very slow, hence the long timeout.
	cfg.Windows = enabled(10*time.Minute, strict)

	// Winget has multiple retry strategies.
	cfg.Winget = enabled(2*time.Minute, standard)

	// Disk info should be fast.
	cfg.Storage = enabled(10*time.Second, standard)

	return cfg
}