feat: add resilience and reliability features for agent subsystems
Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaces cron-based scheduler with priority queue that should scale beyond 1000+ agents if your homelab is that big. Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry. - Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown - Per-subsystem timeouts: 30s-10min depending on scanner - Priority queue scheduler: O(log n), worker pool, jitter, backpressure - Acknowledgments: at-least-once delivery, max 10 retries over 24h - All tests passing (26/26)
This commit is contained in:
323
aggregator-server/internal/scheduler/scheduler_test.go
Normal file
323
aggregator-server/internal/scheduler/scheduler_test.go
Normal file
@@ -0,0 +1,323 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
func TestScheduler_NewScheduler(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
if s == nil {
|
||||
t.Fatal("NewScheduler returned nil")
|
||||
}
|
||||
|
||||
if s.config.NumWorkers != 10 {
|
||||
t.Fatalf("expected 10 workers, got %d", s.config.NumWorkers)
|
||||
}
|
||||
|
||||
if s.queue == nil {
|
||||
t.Fatal("queue not initialized")
|
||||
}
|
||||
|
||||
if len(s.workers) != config.NumWorkers {
|
||||
t.Fatalf("expected %d workers, got %d", config.NumWorkers, len(s.workers))
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_DefaultConfig(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
|
||||
if config.CheckInterval != 10*time.Second {
|
||||
t.Fatalf("expected check interval 10s, got %v", config.CheckInterval)
|
||||
}
|
||||
|
||||
if config.LookaheadWindow != 60*time.Second {
|
||||
t.Fatalf("expected lookahead 60s, got %v", config.LookaheadWindow)
|
||||
}
|
||||
|
||||
if config.MaxJitter != 30*time.Second {
|
||||
t.Fatalf("expected max jitter 30s, got %v", config.MaxJitter)
|
||||
}
|
||||
|
||||
if config.NumWorkers != 10 {
|
||||
t.Fatalf("expected 10 workers, got %d", config.NumWorkers)
|
||||
}
|
||||
|
||||
if config.BackpressureThreshold != 5 {
|
||||
t.Fatalf("expected backpressure threshold 5, got %d", config.BackpressureThreshold)
|
||||
}
|
||||
|
||||
if config.RateLimitPerSecond != 100 {
|
||||
t.Fatalf("expected rate limit 100/s, got %d", config.RateLimitPerSecond)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_QueueIntegration(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Add jobs to queue
|
||||
agent1 := uuid.New()
|
||||
agent2 := uuid.New()
|
||||
|
||||
job1 := &SubsystemJob{
|
||||
AgentID: agent1,
|
||||
AgentHostname: "agent-01",
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
|
||||
job2 := &SubsystemJob{
|
||||
AgentID: agent2,
|
||||
AgentHostname: "agent-02",
|
||||
Subsystem: "storage",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now().Add(10 * time.Minute),
|
||||
}
|
||||
|
||||
s.queue.Push(job1)
|
||||
s.queue.Push(job2)
|
||||
|
||||
if s.queue.Len() != 2 {
|
||||
t.Fatalf("expected queue len 2, got %d", s.queue.Len())
|
||||
}
|
||||
|
||||
// Get stats
|
||||
stats := s.GetQueueStats()
|
||||
if stats.Size != 2 {
|
||||
t.Fatalf("expected stats size 2, got %d", stats.Size)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_GetStats(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Initial stats should be zero
|
||||
stats := s.GetStats()
|
||||
|
||||
if stats.JobsProcessed != 0 {
|
||||
t.Fatalf("expected 0 jobs processed, got %d", stats.JobsProcessed)
|
||||
}
|
||||
|
||||
if stats.CommandsCreated != 0 {
|
||||
t.Fatalf("expected 0 commands created, got %d", stats.CommandsCreated)
|
||||
}
|
||||
|
||||
if stats.BackpressureSkips != 0 {
|
||||
t.Fatalf("expected 0 backpressure skips, got %d", stats.BackpressureSkips)
|
||||
}
|
||||
|
||||
// Manually update stats (simulating processing)
|
||||
s.mu.Lock()
|
||||
s.stats.JobsProcessed = 100
|
||||
s.stats.CommandsCreated = 95
|
||||
s.stats.BackpressureSkips = 5
|
||||
s.mu.Unlock()
|
||||
|
||||
stats = s.GetStats()
|
||||
|
||||
if stats.JobsProcessed != 100 {
|
||||
t.Fatalf("expected 100 jobs processed, got %d", stats.JobsProcessed)
|
||||
}
|
||||
|
||||
if stats.CommandsCreated != 95 {
|
||||
t.Fatalf("expected 95 commands created, got %d", stats.CommandsCreated)
|
||||
}
|
||||
|
||||
if stats.BackpressureSkips != 5 {
|
||||
t.Fatalf("expected 5 backpressure skips, got %d", stats.BackpressureSkips)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_StartStop(t *testing.T) {
|
||||
config := Config{
|
||||
CheckInterval: 100 * time.Millisecond, // Fast for testing
|
||||
LookaheadWindow: 60 * time.Second,
|
||||
MaxJitter: 1 * time.Second,
|
||||
NumWorkers: 2,
|
||||
BackpressureThreshold: 5,
|
||||
RateLimitPerSecond: 0, // Disable rate limiting for test
|
||||
}
|
||||
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Start scheduler
|
||||
err := s.Start()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to start scheduler: %v", err)
|
||||
}
|
||||
|
||||
// Let it run for a bit
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
|
||||
// Stop scheduler
|
||||
err = s.Stop()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to stop scheduler: %v", err)
|
||||
}
|
||||
|
||||
// Should stop cleanly
|
||||
}
|
||||
|
||||
func TestScheduler_ProcessQueueEmpty(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Process empty queue should not panic
|
||||
s.processQueue()
|
||||
|
||||
stats := s.GetStats()
|
||||
if stats.JobsProcessed != 0 {
|
||||
t.Fatalf("expected 0 jobs processed on empty queue, got %d", stats.JobsProcessed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_ProcessQueueWithJobs(t *testing.T) {
|
||||
config := Config{
|
||||
CheckInterval: 1 * time.Second,
|
||||
LookaheadWindow: 60 * time.Second,
|
||||
MaxJitter: 5 * time.Second,
|
||||
NumWorkers: 2,
|
||||
BackpressureThreshold: 5,
|
||||
RateLimitPerSecond: 0, // Disable for test
|
||||
}
|
||||
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Add jobs that are due now
|
||||
for i := 0; i < 5; i++ {
|
||||
job := &SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
AgentHostname: "test-agent",
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now(), // Due now
|
||||
}
|
||||
s.queue.Push(job)
|
||||
}
|
||||
|
||||
if s.queue.Len() != 5 {
|
||||
t.Fatalf("expected 5 jobs in queue, got %d", s.queue.Len())
|
||||
}
|
||||
|
||||
// Process the queue
|
||||
s.processQueue()
|
||||
|
||||
// Jobs should be dispatched to job channel
|
||||
// Note: Without database, workers can't actually process them
|
||||
// But we can verify they were dispatched
|
||||
|
||||
stats := s.GetStats()
|
||||
if stats.JobsProcessed == 0 {
|
||||
t.Fatal("expected some jobs to be processed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_RateLimiterRefill(t *testing.T) {
|
||||
config := Config{
|
||||
CheckInterval: 1 * time.Second,
|
||||
LookaheadWindow: 60 * time.Second,
|
||||
MaxJitter: 1 * time.Second,
|
||||
NumWorkers: 2,
|
||||
BackpressureThreshold: 5,
|
||||
RateLimitPerSecond: 10, // 10 tokens per second
|
||||
}
|
||||
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
if s.rateLimiter == nil {
|
||||
t.Fatal("rate limiter not initialized")
|
||||
}
|
||||
|
||||
// Start refill goroutine
|
||||
go s.refillRateLimiter()
|
||||
|
||||
// Wait for some tokens to be added
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
|
||||
// Should have some tokens available
|
||||
tokensAvailable := 0
|
||||
for i := 0; i < 15; i++ {
|
||||
select {
|
||||
case <-s.rateLimiter:
|
||||
tokensAvailable++
|
||||
default:
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if tokensAvailable == 0 {
|
||||
t.Fatal("expected some tokens to be available after refill")
|
||||
}
|
||||
|
||||
// Should not exceed buffer size (10)
|
||||
if tokensAvailable > 10 {
|
||||
t.Fatalf("token bucket overflowed: got %d tokens, max is 10", tokensAvailable)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_ConcurrentQueueAccess(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
done := make(chan bool)
|
||||
|
||||
// Concurrent pushes
|
||||
go func() {
|
||||
for i := 0; i < 100; i++ {
|
||||
job := &SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now(),
|
||||
}
|
||||
s.queue.Push(job)
|
||||
}
|
||||
done <- true
|
||||
}()
|
||||
|
||||
// Concurrent stats reads
|
||||
go func() {
|
||||
for i := 0; i < 100; i++ {
|
||||
s.GetStats()
|
||||
s.GetQueueStats()
|
||||
}
|
||||
done <- true
|
||||
}()
|
||||
|
||||
// Wait for both
|
||||
<-done
|
||||
<-done
|
||||
|
||||
// Should not panic and should have queued jobs
|
||||
if s.queue.Len() <= 0 {
|
||||
t.Fatal("expected jobs in queue after concurrent pushes")
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkScheduler_ProcessQueue(b *testing.B) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Pre-fill queue with jobs
|
||||
for i := 0; i < 1000; i++ {
|
||||
job := &SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now(),
|
||||
}
|
||||
s.queue.Push(job)
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
s.processQueue()
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user