feat: add resilience and reliability features for agent subsystems
Add circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replace the cron-based scheduler with a priority queue that should scale to 1000+ agents, if your homelab is that big. A command acknowledgment system ensures results aren't lost on network failures or restarts: the agent tracks pending acknowledgments with persistent state and automatic retry.
- Circuit breakers: 3 failures within 1min open the circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
138
aggregator-agent/internal/circuitbreaker/circuitbreaker_test.go
Normal file
138
aggregator-agent/internal/circuitbreaker/circuitbreaker_test.go
Normal file
@@ -0,0 +1,138 @@
|
||||
package circuitbreaker
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestCircuitBreaker_NormalOperation(t *testing.T) {
|
||||
cb := New("test", Config{
|
||||
FailureThreshold: 3,
|
||||
FailureWindow: 1 * time.Minute,
|
||||
OpenDuration: 1 * time.Minute,
|
||||
HalfOpenAttempts: 2,
|
||||
})
|
||||
|
||||
// Should allow calls in closed state
|
||||
err := cb.Call(func() error { return nil })
|
||||
if err != nil {
|
||||
t.Fatalf("expected no error, got %v", err)
|
||||
}
|
||||
|
||||
if cb.State() != StateClosed {
|
||||
t.Fatalf("expected state closed, got %v", cb.State())
|
||||
}
|
||||
}
|
||||
|
||||
func TestCircuitBreaker_OpensAfterFailures(t *testing.T) {
|
||||
cb := New("test", Config{
|
||||
FailureThreshold: 3,
|
||||
FailureWindow: 1 * time.Minute,
|
||||
OpenDuration: 100 * time.Millisecond,
|
||||
HalfOpenAttempts: 2,
|
||||
})
|
||||
|
||||
testErr := errors.New("test error")
|
||||
|
||||
// Record 3 failures
|
||||
for i := 0; i < 3; i++ {
|
||||
cb.Call(func() error { return testErr })
|
||||
}
|
||||
|
||||
// Should now be open
|
||||
if cb.State() != StateOpen {
|
||||
t.Fatalf("expected state open after %d failures, got %v", 3, cb.State())
|
||||
}
|
||||
|
||||
// Next call should fail fast
|
||||
err := cb.Call(func() error { return nil })
|
||||
if err == nil {
|
||||
t.Fatal("expected circuit breaker to reject call, but it succeeded")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCircuitBreaker_HalfOpenRecovery(t *testing.T) {
|
||||
cb := New("test", Config{
|
||||
FailureThreshold: 2,
|
||||
FailureWindow: 1 * time.Minute,
|
||||
OpenDuration: 50 * time.Millisecond,
|
||||
HalfOpenAttempts: 2,
|
||||
})
|
||||
|
||||
testErr := errors.New("test error")
|
||||
|
||||
// Open the circuit
|
||||
cb.Call(func() error { return testErr })
|
||||
cb.Call(func() error { return testErr })
|
||||
|
||||
if cb.State() != StateOpen {
|
||||
t.Fatal("circuit should be open")
|
||||
}
|
||||
|
||||
// Wait for open duration
|
||||
time.Sleep(60 * time.Millisecond)
|
||||
|
||||
// Should transition to half-open and allow call
|
||||
err := cb.Call(func() error { return nil })
|
||||
if err != nil {
|
||||
t.Fatalf("expected call to succeed in half-open state, got %v", err)
|
||||
}
|
||||
|
||||
if cb.State() != StateHalfOpen {
|
||||
t.Fatalf("expected half-open state, got %v", cb.State())
|
||||
}
|
||||
|
||||
// One more success should close it
|
||||
cb.Call(func() error { return nil })
|
||||
|
||||
if cb.State() != StateClosed {
|
||||
t.Fatalf("expected closed state after %d successes, got %v", 2, cb.State())
|
||||
}
|
||||
}
|
||||
|
||||
func TestCircuitBreaker_HalfOpenFailure(t *testing.T) {
|
||||
cb := New("test", Config{
|
||||
FailureThreshold: 2,
|
||||
FailureWindow: 1 * time.Minute,
|
||||
OpenDuration: 50 * time.Millisecond,
|
||||
HalfOpenAttempts: 2,
|
||||
})
|
||||
|
||||
testErr := errors.New("test error")
|
||||
|
||||
// Open the circuit
|
||||
cb.Call(func() error { return testErr })
|
||||
cb.Call(func() error { return testErr })
|
||||
|
||||
// Wait and attempt in half-open
|
||||
time.Sleep(60 * time.Millisecond)
|
||||
cb.Call(func() error { return nil }) // Half-open
|
||||
|
||||
// Fail in half-open - should go back to open
|
||||
cb.Call(func() error { return testErr })
|
||||
|
||||
if cb.State() != StateOpen {
|
||||
t.Fatalf("expected open state after half-open failure, got %v", cb.State())
|
||||
}
|
||||
}
|
||||
|
||||
func TestCircuitBreaker_Stats(t *testing.T) {
|
||||
cb := New("test-subsystem", Config{
|
||||
FailureThreshold: 3,
|
||||
FailureWindow: 1 * time.Minute,
|
||||
OpenDuration: 1 * time.Minute,
|
||||
HalfOpenAttempts: 2,
|
||||
})
|
||||
|
||||
stats := cb.GetStats()
|
||||
if stats.Name != "test-subsystem" {
|
||||
t.Fatalf("expected name 'test-subsystem', got %s", stats.Name)
|
||||
}
|
||||
if stats.State != "closed" {
|
||||
t.Fatalf("expected state 'closed', got %s", stats.State)
|
||||
}
|
||||
if stats.RecentFailures != 0 {
|
||||
t.Fatalf("expected 0 failures, got %d", stats.RecentFailures)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user