feat: add resilience and reliability features for agent subsystems
Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaces cron-based scheduler with priority queue that should scale beyond 1000+ agents if your homelab is that big. Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry. - Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown - Per-subsystem timeouts: 30s-10min depending on scanner - Priority queue scheduler: O(log n), worker pool, jitter, backpressure - Acknowledgments: at-least-once delivery, max 10 retries over 24h - All tests passing (26/26)
This commit is contained in:
@@ -269,6 +269,21 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Process command acknowledgments if agent sent any
|
||||
var acknowledgedIDs []string
|
||||
if metrics != nil && len(metrics.PendingAcknowledgments) > 0 {
|
||||
// Verify which commands from the agent's pending list have been recorded
|
||||
verified, err := h.commandQueries.VerifyCommandsCompleted(metrics.PendingAcknowledgments)
|
||||
if err != nil {
|
||||
log.Printf("Warning: Failed to verify command acknowledgments for agent %s: %v", agentID, err)
|
||||
} else {
|
||||
acknowledgedIDs = verified
|
||||
if len(acknowledgedIDs) > 0 {
|
||||
log.Printf("Acknowledged %d command results for agent %s", len(acknowledgedIDs), agentID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process heartbeat metadata from agent check-ins
|
||||
if metrics.Metadata != nil {
|
||||
agent, err := h.agentQueries.GetAgentByID(agentID)
|
||||
@@ -437,8 +452,9 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
|
||||
}
|
||||
|
||||
response := models.CommandsResponse{
|
||||
Commands: commandItems,
|
||||
RapidPolling: rapidPolling,
|
||||
Commands: commandItems,
|
||||
RapidPolling: rapidPolling,
|
||||
AcknowledgedIDs: acknowledgedIDs,
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, response)
|
||||
|
||||
@@ -337,3 +337,61 @@ func (q *CommandQueries) ClearAllFailedCommands(days int) (int64, error) {
|
||||
|
||||
return result.RowsAffected()
|
||||
}
|
||||
|
||||
// CountPendingCommandsForAgent returns the number of pending commands for a specific agent
|
||||
// Used by scheduler for backpressure detection
|
||||
func (q *CommandQueries) CountPendingCommandsForAgent(agentID uuid.UUID) (int, error) {
|
||||
var count int
|
||||
query := `
|
||||
SELECT COUNT(*)
|
||||
FROM agent_commands
|
||||
WHERE agent_id = $1 AND status = 'pending'
|
||||
`
|
||||
err := q.db.Get(&count, query, agentID)
|
||||
return count, err
|
||||
}
|
||||
|
||||
// VerifyCommandsCompleted checks which command IDs from the provided list have been completed or failed
|
||||
// Returns the list of command IDs that have been successfully recorded (completed or failed status)
|
||||
func (q *CommandQueries) VerifyCommandsCompleted(commandIDs []string) ([]string, error) {
|
||||
if len(commandIDs) == 0 {
|
||||
return []string{}, nil
|
||||
}
|
||||
|
||||
// Convert string IDs to UUIDs
|
||||
uuidIDs := make([]uuid.UUID, 0, len(commandIDs))
|
||||
for _, idStr := range commandIDs {
|
||||
id, err := uuid.Parse(idStr)
|
||||
if err != nil {
|
||||
// Skip invalid UUIDs
|
||||
continue
|
||||
}
|
||||
uuidIDs = append(uuidIDs, id)
|
||||
}
|
||||
|
||||
if len(uuidIDs) == 0 {
|
||||
return []string{}, nil
|
||||
}
|
||||
|
||||
// Query for commands that are completed or failed
|
||||
query := `
|
||||
SELECT id
|
||||
FROM agent_commands
|
||||
WHERE id = ANY($1)
|
||||
AND status IN ('completed', 'failed')
|
||||
`
|
||||
|
||||
var completedUUIDs []uuid.UUID
|
||||
err := q.db.Select(&completedUUIDs, query, uuidIDs)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to verify command completion: %w", err)
|
||||
}
|
||||
|
||||
// Convert back to strings
|
||||
completedIDs := make([]string, len(completedUUIDs))
|
||||
for i, id := range completedUUIDs {
|
||||
completedIDs[i] = id.String()
|
||||
}
|
||||
|
||||
return completedIDs, nil
|
||||
}
|
||||
|
||||
@@ -23,8 +23,9 @@ type AgentCommand struct {
|
||||
|
||||
// CommandsResponse is returned when an agent checks in for commands
|
||||
type CommandsResponse struct {
|
||||
Commands []CommandItem `json:"commands"`
|
||||
RapidPolling *RapidPollingConfig `json:"rapid_polling,omitempty"`
|
||||
Commands []CommandItem `json:"commands"`
|
||||
RapidPolling *RapidPollingConfig `json:"rapid_polling,omitempty"`
|
||||
AcknowledgedIDs []string `json:"acknowledged_ids,omitempty"` // IDs server has received
|
||||
}
|
||||
|
||||
// RapidPollingConfig contains rapid polling configuration for the agent
|
||||
|
||||
286
aggregator-server/internal/scheduler/queue.go
Normal file
286
aggregator-server/internal/scheduler/queue.go
Normal file
@@ -0,0 +1,286 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// SubsystemJob represents a scheduled subsystem scan
|
||||
type SubsystemJob struct {
|
||||
AgentID uuid.UUID
|
||||
AgentHostname string // For logging/debugging
|
||||
Subsystem string
|
||||
IntervalMinutes int
|
||||
NextRunAt time.Time
|
||||
Enabled bool
|
||||
index int // Heap index (managed by heap.Interface)
|
||||
}
|
||||
|
||||
// String returns a human-readable representation of the job
|
||||
func (j *SubsystemJob) String() string {
|
||||
return fmt.Sprintf("[%s/%s] next_run=%s interval=%dm",
|
||||
j.AgentHostname, j.Subsystem,
|
||||
j.NextRunAt.Format("15:04:05"), j.IntervalMinutes)
|
||||
}
|
||||
|
||||
// jobHeap implements heap.Interface for SubsystemJob priority queue
|
||||
// Jobs are ordered by NextRunAt (earliest first)
|
||||
type jobHeap []*SubsystemJob
|
||||
|
||||
func (h jobHeap) Len() int { return len(h) }
|
||||
|
||||
func (h jobHeap) Less(i, j int) bool {
|
||||
return h[i].NextRunAt.Before(h[j].NextRunAt)
|
||||
}
|
||||
|
||||
func (h jobHeap) Swap(i, j int) {
|
||||
h[i], h[j] = h[j], h[i]
|
||||
h[i].index = i
|
||||
h[j].index = j
|
||||
}
|
||||
|
||||
func (h *jobHeap) Push(x interface{}) {
|
||||
n := len(*h)
|
||||
job := x.(*SubsystemJob)
|
||||
job.index = n
|
||||
*h = append(*h, job)
|
||||
}
|
||||
|
||||
func (h *jobHeap) Pop() interface{} {
|
||||
old := *h
|
||||
n := len(old)
|
||||
job := old[n-1]
|
||||
old[n-1] = nil // Avoid memory leak
|
||||
job.index = -1 // Mark as removed
|
||||
*h = old[0 : n-1]
|
||||
return job
|
||||
}
|
||||
|
||||
// PriorityQueue is a thread-safe priority queue for subsystem jobs
|
||||
// Jobs are ordered by their NextRunAt timestamp (earliest first)
|
||||
type PriorityQueue struct {
|
||||
heap jobHeap
|
||||
mu sync.RWMutex
|
||||
|
||||
// Index for fast lookups by agent_id + subsystem
|
||||
index map[string]*SubsystemJob // key: "agent_id:subsystem"
|
||||
}
|
||||
|
||||
// NewPriorityQueue creates a new empty priority queue
|
||||
func NewPriorityQueue() *PriorityQueue {
|
||||
pq := &PriorityQueue{
|
||||
heap: make(jobHeap, 0),
|
||||
index: make(map[string]*SubsystemJob),
|
||||
}
|
||||
heap.Init(&pq.heap)
|
||||
return pq
|
||||
}
|
||||
|
||||
// Push adds a job to the queue
|
||||
// If a job with the same agent_id + subsystem already exists, it's updated
|
||||
func (pq *PriorityQueue) Push(job *SubsystemJob) {
|
||||
pq.mu.Lock()
|
||||
defer pq.mu.Unlock()
|
||||
|
||||
key := makeKey(job.AgentID, job.Subsystem)
|
||||
|
||||
// Check if job already exists
|
||||
if existing, exists := pq.index[key]; exists {
|
||||
// Update existing job
|
||||
existing.NextRunAt = job.NextRunAt
|
||||
existing.IntervalMinutes = job.IntervalMinutes
|
||||
existing.Enabled = job.Enabled
|
||||
existing.AgentHostname = job.AgentHostname
|
||||
heap.Fix(&pq.heap, existing.index)
|
||||
return
|
||||
}
|
||||
|
||||
// Add new job
|
||||
heap.Push(&pq.heap, job)
|
||||
pq.index[key] = job
|
||||
}
|
||||
|
||||
// Pop removes and returns the job with the earliest NextRunAt
|
||||
// Returns nil if queue is empty
|
||||
func (pq *PriorityQueue) Pop() *SubsystemJob {
|
||||
pq.mu.Lock()
|
||||
defer pq.mu.Unlock()
|
||||
|
||||
if pq.heap.Len() == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
job := heap.Pop(&pq.heap).(*SubsystemJob)
|
||||
key := makeKey(job.AgentID, job.Subsystem)
|
||||
delete(pq.index, key)
|
||||
|
||||
return job
|
||||
}
|
||||
|
||||
// Peek returns the job with the earliest NextRunAt without removing it
|
||||
// Returns nil if queue is empty
|
||||
func (pq *PriorityQueue) Peek() *SubsystemJob {
|
||||
pq.mu.RLock()
|
||||
defer pq.mu.RUnlock()
|
||||
|
||||
if pq.heap.Len() == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return pq.heap[0]
|
||||
}
|
||||
|
||||
// Remove removes a specific job from the queue
|
||||
// Returns true if job was found and removed, false otherwise
|
||||
func (pq *PriorityQueue) Remove(agentID uuid.UUID, subsystem string) bool {
|
||||
pq.mu.Lock()
|
||||
defer pq.mu.Unlock()
|
||||
|
||||
key := makeKey(agentID, subsystem)
|
||||
job, exists := pq.index[key]
|
||||
if !exists {
|
||||
return false
|
||||
}
|
||||
|
||||
heap.Remove(&pq.heap, job.index)
|
||||
delete(pq.index, key)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// Get retrieves a specific job without removing it
|
||||
// Returns nil if not found
|
||||
func (pq *PriorityQueue) Get(agentID uuid.UUID, subsystem string) *SubsystemJob {
|
||||
pq.mu.RLock()
|
||||
defer pq.mu.RUnlock()
|
||||
|
||||
key := makeKey(agentID, subsystem)
|
||||
return pq.index[key]
|
||||
}
|
||||
|
||||
// PopBefore returns all jobs with NextRunAt <= before, up to limit
|
||||
// Jobs are removed from the queue
|
||||
// If limit <= 0, all matching jobs are returned
|
||||
func (pq *PriorityQueue) PopBefore(before time.Time, limit int) []*SubsystemJob {
|
||||
pq.mu.Lock()
|
||||
defer pq.mu.Unlock()
|
||||
|
||||
var jobs []*SubsystemJob
|
||||
|
||||
for pq.heap.Len() > 0 {
|
||||
// Peek at next job
|
||||
next := pq.heap[0]
|
||||
|
||||
// Stop if next job is after our cutoff
|
||||
if next.NextRunAt.After(before) {
|
||||
break
|
||||
}
|
||||
|
||||
// Stop if we've hit the limit
|
||||
if limit > 0 && len(jobs) >= limit {
|
||||
break
|
||||
}
|
||||
|
||||
// Pop and collect the job
|
||||
job := heap.Pop(&pq.heap).(*SubsystemJob)
|
||||
key := makeKey(job.AgentID, job.Subsystem)
|
||||
delete(pq.index, key)
|
||||
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
return jobs
|
||||
}
|
||||
|
||||
// PeekBefore returns all jobs with NextRunAt <= before without removing them
|
||||
// If limit <= 0, all matching jobs are returned
|
||||
func (pq *PriorityQueue) PeekBefore(before time.Time, limit int) []*SubsystemJob {
|
||||
pq.mu.RLock()
|
||||
defer pq.mu.RUnlock()
|
||||
|
||||
var jobs []*SubsystemJob
|
||||
|
||||
for i := 0; i < pq.heap.Len(); i++ {
|
||||
job := pq.heap[i]
|
||||
|
||||
if job.NextRunAt.After(before) {
|
||||
// Since heap is sorted by NextRunAt, we can break early
|
||||
// Note: This is only valid because we peek in order
|
||||
break
|
||||
}
|
||||
|
||||
if limit > 0 && len(jobs) >= limit {
|
||||
break
|
||||
}
|
||||
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
return jobs
|
||||
}
|
||||
|
||||
// Len returns the number of jobs in the queue
|
||||
func (pq *PriorityQueue) Len() int {
|
||||
pq.mu.RLock()
|
||||
defer pq.mu.RUnlock()
|
||||
return pq.heap.Len()
|
||||
}
|
||||
|
||||
// Clear removes all jobs from the queue
|
||||
func (pq *PriorityQueue) Clear() {
|
||||
pq.mu.Lock()
|
||||
defer pq.mu.Unlock()
|
||||
|
||||
pq.heap = make(jobHeap, 0)
|
||||
pq.index = make(map[string]*SubsystemJob)
|
||||
heap.Init(&pq.heap)
|
||||
}
|
||||
|
||||
// GetStats returns statistics about the queue
|
||||
func (pq *PriorityQueue) GetStats() QueueStats {
|
||||
pq.mu.RLock()
|
||||
defer pq.mu.RUnlock()
|
||||
|
||||
stats := QueueStats{
|
||||
Size: pq.heap.Len(),
|
||||
}
|
||||
|
||||
if pq.heap.Len() > 0 {
|
||||
stats.NextRunAt = &pq.heap[0].NextRunAt
|
||||
stats.OldestJob = pq.heap[0].String()
|
||||
}
|
||||
|
||||
// Count jobs by subsystem
|
||||
stats.JobsBySubsystem = make(map[string]int)
|
||||
for _, job := range pq.heap {
|
||||
stats.JobsBySubsystem[job.Subsystem]++
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// QueueStats holds statistics about the priority queue
|
||||
type QueueStats struct {
|
||||
Size int
|
||||
NextRunAt *time.Time
|
||||
OldestJob string
|
||||
JobsBySubsystem map[string]int
|
||||
}
|
||||
|
||||
// String returns a human-readable representation of stats
|
||||
func (s QueueStats) String() string {
|
||||
nextRun := "empty"
|
||||
if s.NextRunAt != nil {
|
||||
nextRun = s.NextRunAt.Format("15:04:05")
|
||||
}
|
||||
return fmt.Sprintf("size=%d next=%s oldest=%s", s.Size, nextRun, s.OldestJob)
|
||||
}
|
||||
|
||||
// makeKey creates a unique key for agent_id + subsystem
|
||||
func makeKey(agentID uuid.UUID, subsystem string) string {
|
||||
return agentID.String() + ":" + subsystem
|
||||
}
|
||||
539
aggregator-server/internal/scheduler/queue_test.go
Normal file
539
aggregator-server/internal/scheduler/queue_test.go
Normal file
@@ -0,0 +1,539 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
func TestPriorityQueue_BasicOperations(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
|
||||
// Test empty queue
|
||||
if pq.Len() != 0 {
|
||||
t.Fatalf("expected empty queue, got len=%d", pq.Len())
|
||||
}
|
||||
|
||||
if pq.Peek() != nil {
|
||||
t.Fatal("Peek on empty queue should return nil")
|
||||
}
|
||||
|
||||
if pq.Pop() != nil {
|
||||
t.Fatal("Pop on empty queue should return nil")
|
||||
}
|
||||
|
||||
// Push a job
|
||||
agent1 := uuid.New()
|
||||
job1 := &SubsystemJob{
|
||||
AgentID: agent1,
|
||||
AgentHostname: "agent-01",
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now().Add(10 * time.Minute),
|
||||
}
|
||||
pq.Push(job1)
|
||||
|
||||
if pq.Len() != 1 {
|
||||
t.Fatalf("expected len=1 after push, got %d", pq.Len())
|
||||
}
|
||||
|
||||
// Peek should return the job without removing it
|
||||
peeked := pq.Peek()
|
||||
if peeked == nil {
|
||||
t.Fatal("Peek should return job")
|
||||
}
|
||||
if peeked.AgentID != agent1 {
|
||||
t.Fatal("Peek returned wrong job")
|
||||
}
|
||||
if pq.Len() != 1 {
|
||||
t.Fatal("Peek should not remove job")
|
||||
}
|
||||
|
||||
// Pop should return and remove the job
|
||||
popped := pq.Pop()
|
||||
if popped == nil {
|
||||
t.Fatal("Pop should return job")
|
||||
}
|
||||
if popped.AgentID != agent1 {
|
||||
t.Fatal("Pop returned wrong job")
|
||||
}
|
||||
if pq.Len() != 0 {
|
||||
t.Fatal("Pop should remove job")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_Ordering(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
now := time.Now()
|
||||
|
||||
// Push jobs in random order
|
||||
jobs := []*SubsystemJob{
|
||||
{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: now.Add(30 * time.Minute), // Third
|
||||
},
|
||||
{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "storage",
|
||||
NextRunAt: now.Add(5 * time.Minute), // First
|
||||
},
|
||||
{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "docker",
|
||||
NextRunAt: now.Add(15 * time.Minute), // Second
|
||||
},
|
||||
}
|
||||
|
||||
for _, job := range jobs {
|
||||
pq.Push(job)
|
||||
}
|
||||
|
||||
// Pop should return jobs in NextRunAt order
|
||||
first := pq.Pop()
|
||||
if first.Subsystem != "storage" {
|
||||
t.Fatalf("expected 'storage' first, got '%s'", first.Subsystem)
|
||||
}
|
||||
|
||||
second := pq.Pop()
|
||||
if second.Subsystem != "docker" {
|
||||
t.Fatalf("expected 'docker' second, got '%s'", second.Subsystem)
|
||||
}
|
||||
|
||||
third := pq.Pop()
|
||||
if third.Subsystem != "updates" {
|
||||
t.Fatalf("expected 'updates' third, got '%s'", third.Subsystem)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_UpdateExisting(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
agentID := uuid.New()
|
||||
now := time.Now()
|
||||
|
||||
// Push initial job
|
||||
job1 := &SubsystemJob{
|
||||
AgentID: agentID,
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: now.Add(15 * time.Minute),
|
||||
}
|
||||
pq.Push(job1)
|
||||
|
||||
if pq.Len() != 1 {
|
||||
t.Fatalf("expected len=1, got %d", pq.Len())
|
||||
}
|
||||
|
||||
// Push same agent+subsystem with different NextRunAt
|
||||
job2 := &SubsystemJob{
|
||||
AgentID: agentID,
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 30,
|
||||
NextRunAt: now.Add(30 * time.Minute),
|
||||
}
|
||||
pq.Push(job2)
|
||||
|
||||
// Should still be 1 job (updated, not added)
|
||||
if pq.Len() != 1 {
|
||||
t.Fatalf("expected len=1 after update, got %d", pq.Len())
|
||||
}
|
||||
|
||||
// Verify the job was updated
|
||||
job := pq.Pop()
|
||||
if job.IntervalMinutes != 30 {
|
||||
t.Fatalf("expected interval=30, got %d", job.IntervalMinutes)
|
||||
}
|
||||
if !job.NextRunAt.Equal(now.Add(30 * time.Minute)) {
|
||||
t.Fatal("NextRunAt was not updated")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_Remove(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
|
||||
agent1 := uuid.New()
|
||||
agent2 := uuid.New()
|
||||
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: agent1,
|
||||
Subsystem: "updates",
|
||||
NextRunAt: time.Now(),
|
||||
})
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: agent2,
|
||||
Subsystem: "storage",
|
||||
NextRunAt: time.Now(),
|
||||
})
|
||||
|
||||
if pq.Len() != 2 {
|
||||
t.Fatalf("expected len=2, got %d", pq.Len())
|
||||
}
|
||||
|
||||
// Remove existing job
|
||||
removed := pq.Remove(agent1, "updates")
|
||||
if !removed {
|
||||
t.Fatal("Remove should return true for existing job")
|
||||
}
|
||||
if pq.Len() != 1 {
|
||||
t.Fatalf("expected len=1 after remove, got %d", pq.Len())
|
||||
}
|
||||
|
||||
// Remove non-existent job
|
||||
removed = pq.Remove(agent1, "updates")
|
||||
if removed {
|
||||
t.Fatal("Remove should return false for non-existent job")
|
||||
}
|
||||
if pq.Len() != 1 {
|
||||
t.Fatal("Remove of non-existent job should not affect queue")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_Get(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
|
||||
agentID := uuid.New()
|
||||
job := &SubsystemJob{
|
||||
AgentID: agentID,
|
||||
Subsystem: "updates",
|
||||
NextRunAt: time.Now(),
|
||||
}
|
||||
pq.Push(job)
|
||||
|
||||
// Get existing job
|
||||
retrieved := pq.Get(agentID, "updates")
|
||||
if retrieved == nil {
|
||||
t.Fatal("Get should return job")
|
||||
}
|
||||
if retrieved.AgentID != agentID {
|
||||
t.Fatal("Get returned wrong job")
|
||||
}
|
||||
if pq.Len() != 1 {
|
||||
t.Fatal("Get should not remove job")
|
||||
}
|
||||
|
||||
// Get non-existent job
|
||||
retrieved = pq.Get(uuid.New(), "storage")
|
||||
if retrieved != nil {
|
||||
t.Fatal("Get should return nil for non-existent job")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_PopBefore(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
now := time.Now()
|
||||
|
||||
// Add jobs with different NextRunAt times
|
||||
for i := 0; i < 5; i++ {
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: now.Add(time.Duration(i*10) * time.Minute),
|
||||
})
|
||||
}
|
||||
|
||||
if pq.Len() != 5 {
|
||||
t.Fatalf("expected len=5, got %d", pq.Len())
|
||||
}
|
||||
|
||||
// Pop jobs before now+25min (should get 3 jobs: 0, 10, 20 minutes)
|
||||
cutoff := now.Add(25 * time.Minute)
|
||||
jobs := pq.PopBefore(cutoff, 0) // no limit
|
||||
|
||||
if len(jobs) != 3 {
|
||||
t.Fatalf("expected 3 jobs, got %d", len(jobs))
|
||||
}
|
||||
|
||||
// Verify all returned jobs are before cutoff
|
||||
for _, job := range jobs {
|
||||
if job.NextRunAt.After(cutoff) {
|
||||
t.Fatalf("job NextRunAt %v is after cutoff %v", job.NextRunAt, cutoff)
|
||||
}
|
||||
}
|
||||
|
||||
// Queue should have 2 jobs left
|
||||
if pq.Len() != 2 {
|
||||
t.Fatalf("expected len=2 after PopBefore, got %d", pq.Len())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_PopBeforeWithLimit(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
now := time.Now()
|
||||
|
||||
// Add 5 jobs all due now
|
||||
for i := 0; i < 5; i++ {
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: now,
|
||||
})
|
||||
}
|
||||
|
||||
// Pop with limit of 3
|
||||
jobs := pq.PopBefore(now.Add(1*time.Hour), 3)
|
||||
|
||||
if len(jobs) != 3 {
|
||||
t.Fatalf("expected 3 jobs (limit), got %d", len(jobs))
|
||||
}
|
||||
|
||||
if pq.Len() != 2 {
|
||||
t.Fatalf("expected 2 jobs remaining, got %d", pq.Len())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_PeekBefore(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
now := time.Now()
|
||||
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: now.Add(5 * time.Minute),
|
||||
})
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "storage",
|
||||
NextRunAt: now.Add(15 * time.Minute),
|
||||
})
|
||||
|
||||
// Peek before 10 minutes (should see 1 job)
|
||||
jobs := pq.PeekBefore(now.Add(10*time.Minute), 0)
|
||||
|
||||
if len(jobs) != 1 {
|
||||
t.Fatalf("expected 1 job, got %d", len(jobs))
|
||||
}
|
||||
|
||||
// Queue should still have both jobs
|
||||
if pq.Len() != 2 {
|
||||
t.Fatalf("expected len=2 after PeekBefore, got %d", pq.Len())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_Clear(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
|
||||
// Add some jobs
|
||||
for i := 0; i < 10; i++ {
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: time.Now(),
|
||||
})
|
||||
}
|
||||
|
||||
if pq.Len() != 10 {
|
||||
t.Fatalf("expected len=10, got %d", pq.Len())
|
||||
}
|
||||
|
||||
// Clear the queue
|
||||
pq.Clear()
|
||||
|
||||
if pq.Len() != 0 {
|
||||
t.Fatalf("expected len=0 after clear, got %d", pq.Len())
|
||||
}
|
||||
|
||||
if pq.Peek() != nil {
|
||||
t.Fatal("Peek should return nil after clear")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_GetStats(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
now := time.Now()
|
||||
|
||||
// Empty queue stats
|
||||
stats := pq.GetStats()
|
||||
if stats.Size != 0 {
|
||||
t.Fatalf("expected size=0, got %d", stats.Size)
|
||||
}
|
||||
if stats.NextRunAt != nil {
|
||||
t.Fatal("empty queue should have nil NextRunAt")
|
||||
}
|
||||
|
||||
// Add jobs
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
AgentHostname: "agent-01",
|
||||
Subsystem: "updates",
|
||||
NextRunAt: now.Add(5 * time.Minute),
|
||||
IntervalMinutes: 15,
|
||||
})
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "storage",
|
||||
NextRunAt: now.Add(10 * time.Minute),
|
||||
})
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: now.Add(15 * time.Minute),
|
||||
})
|
||||
|
||||
stats = pq.GetStats()
|
||||
|
||||
if stats.Size != 3 {
|
||||
t.Fatalf("expected size=3, got %d", stats.Size)
|
||||
}
|
||||
|
||||
if stats.NextRunAt == nil {
|
||||
t.Fatal("NextRunAt should not be nil")
|
||||
}
|
||||
|
||||
// Should be the earliest job (5 minutes)
|
||||
expectedNext := now.Add(5 * time.Minute)
|
||||
if !stats.NextRunAt.Equal(expectedNext) {
|
||||
t.Fatalf("expected NextRunAt=%v, got %v", expectedNext, stats.NextRunAt)
|
||||
}
|
||||
|
||||
// Check subsystem counts
|
||||
if stats.JobsBySubsystem["updates"] != 2 {
|
||||
t.Fatalf("expected 2 updates jobs, got %d", stats.JobsBySubsystem["updates"])
|
||||
}
|
||||
if stats.JobsBySubsystem["storage"] != 1 {
|
||||
t.Fatalf("expected 1 storage job, got %d", stats.JobsBySubsystem["storage"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_Concurrency(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// Concurrent pushes
|
||||
numGoroutines := 100
|
||||
wg.Add(numGoroutines)
|
||||
|
||||
for i := 0; i < numGoroutines; i++ {
|
||||
go func(idx int) {
|
||||
defer wg.Done()
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: time.Now().Add(time.Duration(idx) * time.Second),
|
||||
})
|
||||
}(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
if pq.Len() != numGoroutines {
|
||||
t.Fatalf("expected len=%d after concurrent pushes, got %d", numGoroutines, pq.Len())
|
||||
}
|
||||
|
||||
// Concurrent pops
|
||||
wg.Add(numGoroutines)
|
||||
popped := make(chan *SubsystemJob, numGoroutines)
|
||||
|
||||
for i := 0; i < numGoroutines; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if job := pq.Pop(); job != nil {
|
||||
popped <- job
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
close(popped)
|
||||
|
||||
// Count popped jobs
|
||||
count := 0
|
||||
for range popped {
|
||||
count++
|
||||
}
|
||||
|
||||
if count != numGoroutines {
|
||||
t.Fatalf("expected %d popped jobs, got %d", numGoroutines, count)
|
||||
}
|
||||
|
||||
if pq.Len() != 0 {
|
||||
t.Fatalf("expected empty queue after concurrent pops, got len=%d", pq.Len())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPriorityQueue_ConcurrentReadWrite(t *testing.T) {
|
||||
pq := NewPriorityQueue()
|
||||
done := make(chan bool)
|
||||
|
||||
// Writer goroutine
|
||||
go func() {
|
||||
for i := 0; i < 1000; i++ {
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: time.Now(),
|
||||
})
|
||||
time.Sleep(1 * time.Microsecond)
|
||||
}
|
||||
done <- true
|
||||
}()
|
||||
|
||||
// Reader goroutine
|
||||
go func() {
|
||||
for i := 0; i < 1000; i++ {
|
||||
pq.Peek()
|
||||
pq.GetStats()
|
||||
time.Sleep(1 * time.Microsecond)
|
||||
}
|
||||
done <- true
|
||||
}()
|
||||
|
||||
// Wait for both to complete
|
||||
<-done
|
||||
<-done
|
||||
|
||||
// Should not panic and queue should be consistent
|
||||
if pq.Len() < 0 {
|
||||
t.Fatal("queue length became negative (race condition)")
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkPriorityQueue_Push(b *testing.B) {
|
||||
pq := NewPriorityQueue()
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: time.Now().Add(time.Duration(i) * time.Second),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkPriorityQueue_Pop(b *testing.B) {
|
||||
pq := NewPriorityQueue()
|
||||
|
||||
// Pre-fill the queue
|
||||
for i := 0; i < b.N; i++ {
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: time.Now().Add(time.Duration(i) * time.Second),
|
||||
})
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
pq.Pop()
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkPriorityQueue_Peek(b *testing.B) {
|
||||
pq := NewPriorityQueue()
|
||||
|
||||
// Pre-fill with 10000 jobs
|
||||
for i := 0; i < 10000; i++ {
|
||||
pq.Push(&SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
NextRunAt: time.Now().Add(time.Duration(i) * time.Second),
|
||||
})
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
pq.Peek()
|
||||
}
|
||||
}
|
||||
406
aggregator-server/internal/scheduler/scheduler.go
Normal file
406
aggregator-server/internal/scheduler/scheduler.go
Normal file
@@ -0,0 +1,406 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"math/rand"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Fimeg/RedFlag/aggregator-server/internal/database/queries"
|
||||
"github.com/Fimeg/RedFlag/aggregator-server/internal/models"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// Config holds scheduler configuration
|
||||
type Config struct {
|
||||
// CheckInterval is how often to check the queue for due jobs
|
||||
CheckInterval time.Duration
|
||||
|
||||
// LookaheadWindow is how far ahead to look for jobs
|
||||
// Jobs due within this window will be batched and jittered
|
||||
LookaheadWindow time.Duration
|
||||
|
||||
// MaxJitter is the maximum random delay added to job execution
|
||||
MaxJitter time.Duration
|
||||
|
||||
// NumWorkers is the number of parallel workers for command creation
|
||||
NumWorkers int
|
||||
|
||||
// BackpressureThreshold is max pending commands per agent before skipping
|
||||
BackpressureThreshold int
|
||||
|
||||
// RateLimitPerSecond is max commands created per second (0 = unlimited)
|
||||
RateLimitPerSecond int
|
||||
}
|
||||
|
||||
// DefaultConfig returns production-ready default configuration
|
||||
func DefaultConfig() Config {
|
||||
return Config{
|
||||
CheckInterval: 10 * time.Second,
|
||||
LookaheadWindow: 60 * time.Second,
|
||||
MaxJitter: 30 * time.Second,
|
||||
NumWorkers: 10,
|
||||
BackpressureThreshold: 5,
|
||||
RateLimitPerSecond: 100,
|
||||
}
|
||||
}
|
||||
|
||||
// Scheduler manages subsystem job scheduling with priority queue and worker pool
|
||||
type Scheduler struct {
|
||||
config Config
|
||||
queue *PriorityQueue
|
||||
|
||||
// Database queries
|
||||
agentQueries *queries.AgentQueries
|
||||
commandQueries *queries.CommandQueries
|
||||
|
||||
// Worker pool
|
||||
jobChan chan *SubsystemJob
|
||||
workers []*worker
|
||||
|
||||
// Rate limiting
|
||||
rateLimiter chan struct{}
|
||||
|
||||
// Lifecycle management
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
wg sync.WaitGroup
|
||||
shutdown chan struct{}
|
||||
|
||||
// Metrics
|
||||
mu sync.RWMutex
|
||||
stats Stats
|
||||
}
|
||||
|
||||
// Stats holds scheduler statistics
|
||||
type Stats struct {
|
||||
JobsProcessed int64
|
||||
JobsSkipped int64
|
||||
CommandsCreated int64
|
||||
CommandsFailed int64
|
||||
BackpressureSkips int64
|
||||
LastProcessedAt time.Time
|
||||
QueueSize int
|
||||
WorkerPoolUtilized int
|
||||
AverageProcessingMS int64
|
||||
}
|
||||
|
||||
// NewScheduler creates a new scheduler instance
|
||||
func NewScheduler(config Config, agentQueries *queries.AgentQueries, commandQueries *queries.CommandQueries) *Scheduler {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
s := &Scheduler{
|
||||
config: config,
|
||||
queue: NewPriorityQueue(),
|
||||
agentQueries: agentQueries,
|
||||
commandQueries: commandQueries,
|
||||
jobChan: make(chan *SubsystemJob, 1000), // Buffer 1000 jobs
|
||||
workers: make([]*worker, config.NumWorkers),
|
||||
shutdown: make(chan struct{}),
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
}
|
||||
|
||||
// Initialize rate limiter if configured
|
||||
if config.RateLimitPerSecond > 0 {
|
||||
s.rateLimiter = make(chan struct{}, config.RateLimitPerSecond)
|
||||
go s.refillRateLimiter()
|
||||
}
|
||||
|
||||
// Initialize workers
|
||||
for i := 0; i < config.NumWorkers; i++ {
|
||||
s.workers[i] = &worker{
|
||||
id: i,
|
||||
scheduler: s,
|
||||
}
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// LoadSubsystems loads all enabled auto-run subsystems from database into queue
|
||||
func (s *Scheduler) LoadSubsystems(ctx context.Context) error {
|
||||
log.Println("[Scheduler] Loading subsystems from database...")
|
||||
|
||||
// Get all agents (pass empty strings to get all agents regardless of status/os)
|
||||
agents, err := s.agentQueries.ListAgents("", "")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get agents: %w", err)
|
||||
}
|
||||
|
||||
// For now, we'll create default subsystems for each agent
|
||||
// In full implementation, this would read from agent_subsystems table
|
||||
subsystems := []string{"updates", "storage", "system", "docker"}
|
||||
intervals := map[string]int{
|
||||
"updates": 15, // 15 minutes
|
||||
"storage": 15,
|
||||
"system": 30,
|
||||
"docker": 15,
|
||||
}
|
||||
|
||||
loaded := 0
|
||||
for _, agent := range agents {
|
||||
// Skip offline agents (haven't checked in for 10+ minutes)
|
||||
if time.Since(agent.LastSeen) > 10*time.Minute {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, subsystem := range subsystems {
|
||||
// TODO: Check agent metadata for subsystem enablement
|
||||
// For now, assume all subsystems are enabled
|
||||
|
||||
job := &SubsystemJob{
|
||||
AgentID: agent.ID,
|
||||
AgentHostname: agent.Hostname,
|
||||
Subsystem: subsystem,
|
||||
IntervalMinutes: intervals[subsystem],
|
||||
NextRunAt: time.Now().Add(time.Duration(intervals[subsystem]) * time.Minute),
|
||||
Enabled: true,
|
||||
}
|
||||
|
||||
s.queue.Push(job)
|
||||
loaded++
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[Scheduler] Loaded %d subsystem jobs for %d agents\n", loaded, len(agents))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Start begins the scheduler main loop and workers
|
||||
func (s *Scheduler) Start() error {
|
||||
log.Printf("[Scheduler] Starting with %d workers, check interval %v\n",
|
||||
s.config.NumWorkers, s.config.CheckInterval)
|
||||
|
||||
// Start workers
|
||||
for _, w := range s.workers {
|
||||
s.wg.Add(1)
|
||||
go w.run()
|
||||
}
|
||||
|
||||
// Start main loop
|
||||
s.wg.Add(1)
|
||||
go s.mainLoop()
|
||||
|
||||
log.Println("[Scheduler] Started successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop gracefully shuts down the scheduler
|
||||
func (s *Scheduler) Stop() error {
|
||||
log.Println("[Scheduler] Shutting down...")
|
||||
|
||||
// Signal shutdown
|
||||
s.cancel()
|
||||
close(s.shutdown)
|
||||
|
||||
// Close job channel (workers will drain and exit)
|
||||
close(s.jobChan)
|
||||
|
||||
// Wait for all goroutines with timeout
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
s.wg.Wait()
|
||||
close(done)
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
log.Println("[Scheduler] Shutdown complete")
|
||||
return nil
|
||||
case <-time.After(30 * time.Second):
|
||||
log.Println("[Scheduler] Shutdown timeout - forcing exit")
|
||||
return fmt.Errorf("shutdown timeout")
|
||||
}
|
||||
}
|
||||
|
||||
// mainLoop is the scheduler's main processing loop
|
||||
func (s *Scheduler) mainLoop() {
|
||||
defer s.wg.Done()
|
||||
|
||||
ticker := time.NewTicker(s.config.CheckInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
log.Printf("[Scheduler] Main loop started (check every %v)\n", s.config.CheckInterval)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-s.shutdown:
|
||||
log.Println("[Scheduler] Main loop shutting down")
|
||||
return
|
||||
|
||||
case <-ticker.C:
|
||||
s.processQueue()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// processQueue checks for due jobs and dispatches them to workers
|
||||
func (s *Scheduler) processQueue() {
|
||||
start := time.Now()
|
||||
|
||||
// Get all jobs due within lookahead window
|
||||
cutoff := time.Now().Add(s.config.LookaheadWindow)
|
||||
dueJobs := s.queue.PopBefore(cutoff, 0) // No limit, get all
|
||||
|
||||
if len(dueJobs) == 0 {
|
||||
// No jobs due, just update stats
|
||||
s.mu.Lock()
|
||||
s.stats.QueueSize = s.queue.Len()
|
||||
s.mu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
log.Printf("[Scheduler] Processing %d jobs due before %s\n",
|
||||
len(dueJobs), cutoff.Format("15:04:05"))
|
||||
|
||||
// Add jitter to each job and dispatch to workers
|
||||
dispatched := 0
|
||||
for _, job := range dueJobs {
|
||||
// Add random jitter (0 to MaxJitter)
|
||||
jitter := time.Duration(rand.Intn(int(s.config.MaxJitter.Seconds()))) * time.Second
|
||||
job.NextRunAt = job.NextRunAt.Add(jitter)
|
||||
|
||||
// Dispatch to worker pool (non-blocking)
|
||||
select {
|
||||
case s.jobChan <- job:
|
||||
dispatched++
|
||||
default:
|
||||
// Worker pool full, re-queue job
|
||||
log.Printf("[Scheduler] Worker pool full, re-queueing %s\n", job.String())
|
||||
s.queue.Push(job)
|
||||
|
||||
s.mu.Lock()
|
||||
s.stats.JobsSkipped++
|
||||
s.mu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// Update stats
|
||||
duration := time.Since(start)
|
||||
s.mu.Lock()
|
||||
s.stats.JobsProcessed += int64(dispatched)
|
||||
s.stats.LastProcessedAt = time.Now()
|
||||
s.stats.QueueSize = s.queue.Len()
|
||||
s.stats.WorkerPoolUtilized = len(s.jobChan)
|
||||
s.stats.AverageProcessingMS = duration.Milliseconds()
|
||||
s.mu.Unlock()
|
||||
|
||||
log.Printf("[Scheduler] Dispatched %d jobs in %v (queue: %d remaining)\n",
|
||||
dispatched, duration, s.queue.Len())
|
||||
}
|
||||
|
||||
// refillRateLimiter continuously refills the rate limiter token bucket
|
||||
func (s *Scheduler) refillRateLimiter() {
|
||||
ticker := time.NewTicker(time.Second / time.Duration(s.config.RateLimitPerSecond))
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-s.shutdown:
|
||||
return
|
||||
case <-ticker.C:
|
||||
// Try to add token (non-blocking)
|
||||
select {
|
||||
case s.rateLimiter <- struct{}{}:
|
||||
default:
|
||||
// Bucket full, skip
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GetStats returns current scheduler statistics (thread-safe)
|
||||
func (s *Scheduler) GetStats() Stats {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
return s.stats
|
||||
}
|
||||
|
||||
// GetQueueStats returns current queue statistics
|
||||
func (s *Scheduler) GetQueueStats() QueueStats {
|
||||
return s.queue.GetStats()
|
||||
}
|
||||
|
||||
// worker processes jobs from the job channel
|
||||
type worker struct {
|
||||
id int
|
||||
scheduler *Scheduler
|
||||
}
|
||||
|
||||
func (w *worker) run() {
|
||||
defer w.scheduler.wg.Done()
|
||||
|
||||
log.Printf("[Worker %d] Started\n", w.id)
|
||||
|
||||
for job := range w.scheduler.jobChan {
|
||||
if err := w.processJob(job); err != nil {
|
||||
log.Printf("[Worker %d] Failed to process %s: %v\n", w.id, job.String(), err)
|
||||
|
||||
w.scheduler.mu.Lock()
|
||||
w.scheduler.stats.CommandsFailed++
|
||||
w.scheduler.mu.Unlock()
|
||||
} else {
|
||||
w.scheduler.mu.Lock()
|
||||
w.scheduler.stats.CommandsCreated++
|
||||
w.scheduler.mu.Unlock()
|
||||
}
|
||||
|
||||
// Re-queue job for next execution
|
||||
job.NextRunAt = time.Now().Add(time.Duration(job.IntervalMinutes) * time.Minute)
|
||||
w.scheduler.queue.Push(job)
|
||||
}
|
||||
|
||||
log.Printf("[Worker %d] Stopped\n", w.id)
|
||||
}
|
||||
|
||||
func (w *worker) processJob(job *SubsystemJob) error {
|
||||
// Apply rate limiting if configured
|
||||
if w.scheduler.rateLimiter != nil {
|
||||
select {
|
||||
case <-w.scheduler.rateLimiter:
|
||||
// Token acquired
|
||||
case <-w.scheduler.shutdown:
|
||||
return fmt.Errorf("shutdown during rate limit wait")
|
||||
}
|
||||
}
|
||||
|
||||
// Check backpressure: skip if agent has too many pending commands
|
||||
pendingCount, err := w.scheduler.commandQueries.CountPendingCommandsForAgent(job.AgentID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to check pending commands: %w", err)
|
||||
}
|
||||
|
||||
if pendingCount >= w.scheduler.config.BackpressureThreshold {
|
||||
log.Printf("[Worker %d] Backpressure: agent %s has %d pending commands, skipping %s\n",
|
||||
w.id, job.AgentHostname, pendingCount, job.Subsystem)
|
||||
|
||||
w.scheduler.mu.Lock()
|
||||
w.scheduler.stats.BackpressureSkips++
|
||||
w.scheduler.mu.Unlock()
|
||||
|
||||
return nil // Not an error, just skipped
|
||||
}
|
||||
|
||||
// Create command
|
||||
cmd := &models.AgentCommand{
|
||||
ID: uuid.New(),
|
||||
AgentID: job.AgentID,
|
||||
CommandType: fmt.Sprintf("scan_%s", job.Subsystem),
|
||||
Params: models.JSONB{},
|
||||
Status: models.CommandStatusPending,
|
||||
Source: models.CommandSourceSystem,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
|
||||
if err := w.scheduler.commandQueries.CreateCommand(cmd); err != nil {
|
||||
return fmt.Errorf("failed to create command: %w", err)
|
||||
}
|
||||
|
||||
log.Printf("[Worker %d] Created %s command for %s\n",
|
||||
w.id, job.Subsystem, job.AgentHostname)
|
||||
|
||||
return nil
|
||||
}
|
||||
323
aggregator-server/internal/scheduler/scheduler_test.go
Normal file
323
aggregator-server/internal/scheduler/scheduler_test.go
Normal file
@@ -0,0 +1,323 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
func TestScheduler_NewScheduler(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
if s == nil {
|
||||
t.Fatal("NewScheduler returned nil")
|
||||
}
|
||||
|
||||
if s.config.NumWorkers != 10 {
|
||||
t.Fatalf("expected 10 workers, got %d", s.config.NumWorkers)
|
||||
}
|
||||
|
||||
if s.queue == nil {
|
||||
t.Fatal("queue not initialized")
|
||||
}
|
||||
|
||||
if len(s.workers) != config.NumWorkers {
|
||||
t.Fatalf("expected %d workers, got %d", config.NumWorkers, len(s.workers))
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_DefaultConfig(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
|
||||
if config.CheckInterval != 10*time.Second {
|
||||
t.Fatalf("expected check interval 10s, got %v", config.CheckInterval)
|
||||
}
|
||||
|
||||
if config.LookaheadWindow != 60*time.Second {
|
||||
t.Fatalf("expected lookahead 60s, got %v", config.LookaheadWindow)
|
||||
}
|
||||
|
||||
if config.MaxJitter != 30*time.Second {
|
||||
t.Fatalf("expected max jitter 30s, got %v", config.MaxJitter)
|
||||
}
|
||||
|
||||
if config.NumWorkers != 10 {
|
||||
t.Fatalf("expected 10 workers, got %d", config.NumWorkers)
|
||||
}
|
||||
|
||||
if config.BackpressureThreshold != 5 {
|
||||
t.Fatalf("expected backpressure threshold 5, got %d", config.BackpressureThreshold)
|
||||
}
|
||||
|
||||
if config.RateLimitPerSecond != 100 {
|
||||
t.Fatalf("expected rate limit 100/s, got %d", config.RateLimitPerSecond)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_QueueIntegration(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Add jobs to queue
|
||||
agent1 := uuid.New()
|
||||
agent2 := uuid.New()
|
||||
|
||||
job1 := &SubsystemJob{
|
||||
AgentID: agent1,
|
||||
AgentHostname: "agent-01",
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
|
||||
job2 := &SubsystemJob{
|
||||
AgentID: agent2,
|
||||
AgentHostname: "agent-02",
|
||||
Subsystem: "storage",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now().Add(10 * time.Minute),
|
||||
}
|
||||
|
||||
s.queue.Push(job1)
|
||||
s.queue.Push(job2)
|
||||
|
||||
if s.queue.Len() != 2 {
|
||||
t.Fatalf("expected queue len 2, got %d", s.queue.Len())
|
||||
}
|
||||
|
||||
// Get stats
|
||||
stats := s.GetQueueStats()
|
||||
if stats.Size != 2 {
|
||||
t.Fatalf("expected stats size 2, got %d", stats.Size)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_GetStats(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Initial stats should be zero
|
||||
stats := s.GetStats()
|
||||
|
||||
if stats.JobsProcessed != 0 {
|
||||
t.Fatalf("expected 0 jobs processed, got %d", stats.JobsProcessed)
|
||||
}
|
||||
|
||||
if stats.CommandsCreated != 0 {
|
||||
t.Fatalf("expected 0 commands created, got %d", stats.CommandsCreated)
|
||||
}
|
||||
|
||||
if stats.BackpressureSkips != 0 {
|
||||
t.Fatalf("expected 0 backpressure skips, got %d", stats.BackpressureSkips)
|
||||
}
|
||||
|
||||
// Manually update stats (simulating processing)
|
||||
s.mu.Lock()
|
||||
s.stats.JobsProcessed = 100
|
||||
s.stats.CommandsCreated = 95
|
||||
s.stats.BackpressureSkips = 5
|
||||
s.mu.Unlock()
|
||||
|
||||
stats = s.GetStats()
|
||||
|
||||
if stats.JobsProcessed != 100 {
|
||||
t.Fatalf("expected 100 jobs processed, got %d", stats.JobsProcessed)
|
||||
}
|
||||
|
||||
if stats.CommandsCreated != 95 {
|
||||
t.Fatalf("expected 95 commands created, got %d", stats.CommandsCreated)
|
||||
}
|
||||
|
||||
if stats.BackpressureSkips != 5 {
|
||||
t.Fatalf("expected 5 backpressure skips, got %d", stats.BackpressureSkips)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_StartStop(t *testing.T) {
|
||||
config := Config{
|
||||
CheckInterval: 100 * time.Millisecond, // Fast for testing
|
||||
LookaheadWindow: 60 * time.Second,
|
||||
MaxJitter: 1 * time.Second,
|
||||
NumWorkers: 2,
|
||||
BackpressureThreshold: 5,
|
||||
RateLimitPerSecond: 0, // Disable rate limiting for test
|
||||
}
|
||||
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Start scheduler
|
||||
err := s.Start()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to start scheduler: %v", err)
|
||||
}
|
||||
|
||||
// Let it run for a bit
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
|
||||
// Stop scheduler
|
||||
err = s.Stop()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to stop scheduler: %v", err)
|
||||
}
|
||||
|
||||
// Should stop cleanly
|
||||
}
|
||||
|
||||
func TestScheduler_ProcessQueueEmpty(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Process empty queue should not panic
|
||||
s.processQueue()
|
||||
|
||||
stats := s.GetStats()
|
||||
if stats.JobsProcessed != 0 {
|
||||
t.Fatalf("expected 0 jobs processed on empty queue, got %d", stats.JobsProcessed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_ProcessQueueWithJobs(t *testing.T) {
|
||||
config := Config{
|
||||
CheckInterval: 1 * time.Second,
|
||||
LookaheadWindow: 60 * time.Second,
|
||||
MaxJitter: 5 * time.Second,
|
||||
NumWorkers: 2,
|
||||
BackpressureThreshold: 5,
|
||||
RateLimitPerSecond: 0, // Disable for test
|
||||
}
|
||||
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Add jobs that are due now
|
||||
for i := 0; i < 5; i++ {
|
||||
job := &SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
AgentHostname: "test-agent",
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now(), // Due now
|
||||
}
|
||||
s.queue.Push(job)
|
||||
}
|
||||
|
||||
if s.queue.Len() != 5 {
|
||||
t.Fatalf("expected 5 jobs in queue, got %d", s.queue.Len())
|
||||
}
|
||||
|
||||
// Process the queue
|
||||
s.processQueue()
|
||||
|
||||
// Jobs should be dispatched to job channel
|
||||
// Note: Without database, workers can't actually process them
|
||||
// But we can verify they were dispatched
|
||||
|
||||
stats := s.GetStats()
|
||||
if stats.JobsProcessed == 0 {
|
||||
t.Fatal("expected some jobs to be processed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_RateLimiterRefill(t *testing.T) {
|
||||
config := Config{
|
||||
CheckInterval: 1 * time.Second,
|
||||
LookaheadWindow: 60 * time.Second,
|
||||
MaxJitter: 1 * time.Second,
|
||||
NumWorkers: 2,
|
||||
BackpressureThreshold: 5,
|
||||
RateLimitPerSecond: 10, // 10 tokens per second
|
||||
}
|
||||
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
if s.rateLimiter == nil {
|
||||
t.Fatal("rate limiter not initialized")
|
||||
}
|
||||
|
||||
// Start refill goroutine
|
||||
go s.refillRateLimiter()
|
||||
|
||||
// Wait for some tokens to be added
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
|
||||
// Should have some tokens available
|
||||
tokensAvailable := 0
|
||||
for i := 0; i < 15; i++ {
|
||||
select {
|
||||
case <-s.rateLimiter:
|
||||
tokensAvailable++
|
||||
default:
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if tokensAvailable == 0 {
|
||||
t.Fatal("expected some tokens to be available after refill")
|
||||
}
|
||||
|
||||
// Should not exceed buffer size (10)
|
||||
if tokensAvailable > 10 {
|
||||
t.Fatalf("token bucket overflowed: got %d tokens, max is 10", tokensAvailable)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScheduler_ConcurrentQueueAccess(t *testing.T) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
done := make(chan bool)
|
||||
|
||||
// Concurrent pushes
|
||||
go func() {
|
||||
for i := 0; i < 100; i++ {
|
||||
job := &SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now(),
|
||||
}
|
||||
s.queue.Push(job)
|
||||
}
|
||||
done <- true
|
||||
}()
|
||||
|
||||
// Concurrent stats reads
|
||||
go func() {
|
||||
for i := 0; i < 100; i++ {
|
||||
s.GetStats()
|
||||
s.GetQueueStats()
|
||||
}
|
||||
done <- true
|
||||
}()
|
||||
|
||||
// Wait for both
|
||||
<-done
|
||||
<-done
|
||||
|
||||
// Should not panic and should have queued jobs
|
||||
if s.queue.Len() <= 0 {
|
||||
t.Fatal("expected jobs in queue after concurrent pushes")
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkScheduler_ProcessQueue(b *testing.B) {
|
||||
config := DefaultConfig()
|
||||
s := NewScheduler(config, nil, nil)
|
||||
|
||||
// Pre-fill queue with jobs
|
||||
for i := 0; i < 1000; i++ {
|
||||
job := &SubsystemJob{
|
||||
AgentID: uuid.New(),
|
||||
Subsystem: "updates",
|
||||
IntervalMinutes: 15,
|
||||
NextRunAt: time.Now(),
|
||||
}
|
||||
s.queue.Push(job)
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
s.processQueue()
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user