Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaces cron-based scheduler with priority queue that should scale beyond 1000+ agents if your homelab is that big. Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry. - Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown - Per-subsystem timeouts: 30s-10min depending on scanner - Priority queue scheduler: O(log n), worker pool, jitter, backpressure - Acknowledgments: at-least-once delivery, max 10 retries over 24h - All tests passing (26/26)
96 lines
3.1 KiB
Go
96 lines
3.1 KiB
Go
package config
|
|
|
|
import "time"
|
|
|
|
// SubsystemConfig holds configuration for individual subsystems
|
|
type SubsystemConfig struct {
|
|
// Execution settings
|
|
Enabled bool `json:"enabled"`
|
|
Timeout time.Duration `json:"timeout"` // Timeout for this subsystem
|
|
|
|
// Circuit breaker settings
|
|
CircuitBreaker CircuitBreakerConfig `json:"circuit_breaker"`
|
|
}
|
|
|
|
// CircuitBreakerConfig holds circuit breaker settings for subsystems
|
|
type CircuitBreakerConfig struct {
|
|
// Enabled controls whether circuit breaker is active
|
|
Enabled bool `json:"enabled"`
|
|
|
|
// FailureThreshold is the number of consecutive failures before opening the circuit
|
|
FailureThreshold int `json:"failure_threshold"`
|
|
|
|
// FailureWindow is the time window to track failures (e.g., 3 failures in 10 minutes)
|
|
FailureWindow time.Duration `json:"failure_window"`
|
|
|
|
// OpenDuration is how long the circuit stays open before attempting recovery
|
|
OpenDuration time.Duration `json:"open_duration"`
|
|
|
|
// HalfOpenAttempts is the number of test attempts in half-open state before fully closing
|
|
HalfOpenAttempts int `json:"half_open_attempts"`
|
|
}
|
|
|
|
// SubsystemsConfig holds all subsystem configurations
|
|
type SubsystemsConfig struct {
|
|
APT SubsystemConfig `json:"apt"`
|
|
DNF SubsystemConfig `json:"dnf"`
|
|
Docker SubsystemConfig `json:"docker"`
|
|
Windows SubsystemConfig `json:"windows"`
|
|
Winget SubsystemConfig `json:"winget"`
|
|
Storage SubsystemConfig `json:"storage"`
|
|
}
|
|
|
|
// GetDefaultSubsystemsConfig returns default subsystem configurations
|
|
func GetDefaultSubsystemsConfig() SubsystemsConfig {
|
|
// Default circuit breaker config
|
|
defaultCB := CircuitBreakerConfig{
|
|
Enabled: true,
|
|
FailureThreshold: 3, // 3 consecutive failures
|
|
FailureWindow: 10 * time.Minute, // within 10 minutes
|
|
OpenDuration: 30 * time.Minute, // circuit open for 30 min
|
|
HalfOpenAttempts: 2, // 2 successful attempts to close circuit
|
|
}
|
|
|
|
// Aggressive circuit breaker for Windows Update (known to be slow/problematic)
|
|
windowsCB := CircuitBreakerConfig{
|
|
Enabled: true,
|
|
FailureThreshold: 2, // Only 2 failures
|
|
FailureWindow: 15 * time.Minute,
|
|
OpenDuration: 60 * time.Minute, // Open for 1 hour
|
|
HalfOpenAttempts: 3,
|
|
}
|
|
|
|
return SubsystemsConfig{
|
|
APT: SubsystemConfig{
|
|
Enabled: true,
|
|
Timeout: 30 * time.Second,
|
|
CircuitBreaker: defaultCB,
|
|
},
|
|
DNF: SubsystemConfig{
|
|
Enabled: true,
|
|
Timeout: 45 * time.Second, // DNF can be slower
|
|
CircuitBreaker: defaultCB,
|
|
},
|
|
Docker: SubsystemConfig{
|
|
Enabled: true,
|
|
Timeout: 60 * time.Second, // Registry queries can be slow
|
|
CircuitBreaker: defaultCB,
|
|
},
|
|
Windows: SubsystemConfig{
|
|
Enabled: true,
|
|
Timeout: 10 * time.Minute, // Windows Update can be VERY slow
|
|
CircuitBreaker: windowsCB,
|
|
},
|
|
Winget: SubsystemConfig{
|
|
Enabled: true,
|
|
Timeout: 2 * time.Minute, // Winget has multiple retry strategies
|
|
CircuitBreaker: defaultCB,
|
|
},
|
|
Storage: SubsystemConfig{
|
|
Enabled: true,
|
|
Timeout: 10 * time.Second, // Disk info should be fast
|
|
CircuitBreaker: defaultCB,
|
|
},
|
|
}
|
|
}
|