feat: add resilience and reliability features for agent subsystems

Adds circuit breakers with configurable timeouts for all scanner subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaces the cron-based scheduler with a priority-queue scheduler that should scale to 1000+ agents, if your homelab is that big.
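The circuit-breaker code itself lives on the agent side and is not part of this diff; as a rough illustration of the policy described here (3 failures within 1 minute open the circuit, 30s cooldown), a minimal sketch might look like the following. All names are illustrative, not the actual agent API.

package main

import (
	"errors"
	"fmt"
	"sync"
	"time"
)

// circuitBreaker tracks recent failures for one subsystem scanner.
type circuitBreaker struct {
	mu        sync.Mutex
	failures  []time.Time // timestamps of failures within the last minute
	openUntil time.Time   // zero value means the circuit is closed
}

var errCircuitOpen = errors.New("circuit open: scanner temporarily disabled")

// Call runs fn unless the circuit is open. Three failures within one minute
// open the circuit for a 30-second cooldown.
func (cb *circuitBreaker) Call(fn func() error) error {
	cb.mu.Lock()
	if time.Now().Before(cb.openUntil) {
		cb.mu.Unlock()
		return errCircuitOpen
	}
	cb.mu.Unlock()

	err := fn()
	if err == nil {
		return nil
	}

	cb.mu.Lock()
	defer cb.mu.Unlock()
	now := time.Now()
	recent := cb.failures[:0] // drop failures older than one minute
	for _, t := range cb.failures {
		if now.Sub(t) < time.Minute {
			recent = append(recent, t)
		}
	}
	cb.failures = append(recent, now)
	if len(cb.failures) >= 3 {
		cb.openUntil = now.Add(30 * time.Second)
		cb.failures = nil
	}
	return err
}

func main() {
	cb := &circuitBreaker{}
	for i := 0; i < 4; i++ {
		// The fourth call is rejected without running the scanner.
		fmt.Println(cb.Call(func() error { return errors.New("scan failed") }))
	}
}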

A command acknowledgment system ensures results aren't lost to network failures or restarts: the agent tracks pending acknowledgments in persistent state and retries them automatically.
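The agent-side bookkeeping is not part of this diff; the server half is in the handler and query changes below (PendingAcknowledgments in, AcknowledgedIDs out). A hedged sketch of what the agent does with the server's response, using hypothetical names:

package main

import "fmt"

type pendingResult struct {
	CommandID string
	Retries   int
}

// reconcile drops results the server has confirmed and counts a retry for the
// rest; results are abandoned after 10 retries (roughly 24h of check-ins).
func reconcile(pending []pendingResult, acknowledgedIDs []string) []pendingResult {
	acked := make(map[string]bool, len(acknowledgedIDs))
	for _, id := range acknowledgedIDs {
		acked[id] = true
	}
	var remaining []pendingResult
	for _, p := range pending {
		if acked[p.CommandID] {
			continue // server recorded this result; stop re-sending it
		}
		p.Retries++
		if p.Retries > 10 {
			continue // give up after max retries
		}
		remaining = append(remaining, p)
	}
	return remaining
}

func main() {
	pending := []pendingResult{{CommandID: "cmd-1"}, {CommandID: "cmd-2"}}
	// Server's response to a check-in confirms cmd-1 only.
	pending = reconcile(pending, []string{"cmd-1"})
	fmt.Println(pending) // cmd-2 stays queued for the next check-in
}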

- Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
Fimeg
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions

View File

@@ -1,6 +1,7 @@
package main
import (
"context"
"flag"
"fmt"
"log"
@@ -12,6 +13,7 @@ import (
"github.com/Fimeg/RedFlag/aggregator-server/internal/config"
"github.com/Fimeg/RedFlag/aggregator-server/internal/database"
"github.com/Fimeg/RedFlag/aggregator-server/internal/database/queries"
"github.com/Fimeg/RedFlag/aggregator-server/internal/scheduler"
"github.com/Fimeg/RedFlag/aggregator-server/internal/services"
"github.com/gin-gonic/gin"
)
@@ -285,10 +287,45 @@ func main() {
timeoutService.Start()
log.Println("Timeout service started")
// Add graceful shutdown for timeout service
// Initialize and start scheduler
schedulerConfig := scheduler.DefaultConfig()
subsystemScheduler := scheduler.NewScheduler(schedulerConfig, agentQueries, commandQueries)
// Load subsystems into queue
ctx := context.Background()
if err := subsystemScheduler.LoadSubsystems(ctx); err != nil {
log.Printf("Warning: Failed to load subsystems: %v", err)
} else {
log.Println("Subsystems loaded into scheduler")
}
// Start scheduler
if err := subsystemScheduler.Start(); err != nil {
log.Printf("Warning: Failed to start scheduler: %v", err)
}
// Add scheduler stats endpoint (after scheduler is initialized)
router.GET("/api/v1/scheduler/stats", middleware.AuthMiddleware(), func(c *gin.Context) {
stats := subsystemScheduler.GetStats()
queueStats := subsystemScheduler.GetQueueStats()
c.JSON(200, gin.H{
"scheduler": stats,
"queue": queueStats,
})
})
// Add graceful shutdown for services
defer func() {
log.Println("Shutting down services...")
// Stop scheduler first
if err := subsystemScheduler.Stop(); err != nil {
log.Printf("Error stopping scheduler: %v", err)
}
// Stop timeout service
timeoutService.Stop()
log.Println("Timeout service stopped")
log.Println("Services stopped")
}()
// Start server

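The new stats endpoint can be exercised with any authenticated HTTP client; a minimal sketch, assuming a placeholder base URL and whatever bearer token AuthMiddleware expects, with the response following the gin.H shape returned above:

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	req, err := http.NewRequest("GET", "https://aggregator.example/api/v1/scheduler/stats", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer <api-token>") // placeholder credential
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status)
	fmt.Println(string(body)) // {"scheduler":{...},"queue":{...}}
}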
View File

@@ -269,6 +269,21 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
return
}
// Process command acknowledgments if agent sent any
var acknowledgedIDs []string
if metrics != nil && len(metrics.PendingAcknowledgments) > 0 {
// Verify which commands from the agent's pending list have been recorded
verified, err := h.commandQueries.VerifyCommandsCompleted(metrics.PendingAcknowledgments)
if err != nil {
log.Printf("Warning: Failed to verify command acknowledgments for agent %s: %v", agentID, err)
} else {
acknowledgedIDs = verified
if len(acknowledgedIDs) > 0 {
log.Printf("Acknowledged %d command results for agent %s", len(acknowledgedIDs), agentID)
}
}
}
// Process heartbeat metadata from agent check-ins
if metrics.Metadata != nil {
agent, err := h.agentQueries.GetAgentByID(agentID)
@@ -437,8 +452,9 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
}
response := models.CommandsResponse{
Commands: commandItems,
RapidPolling: rapidPolling,
AcknowledgedIDs: acknowledgedIDs,
}
c.JSON(http.StatusOK, response)

View File

@@ -337,3 +337,61 @@ func (q *CommandQueries) ClearAllFailedCommands(days int) (int64, error) {
return result.RowsAffected()
}
// CountPendingCommandsForAgent returns the number of pending commands for a specific agent
// Used by scheduler for backpressure detection
func (q *CommandQueries) CountPendingCommandsForAgent(agentID uuid.UUID) (int, error) {
var count int
query := `
SELECT COUNT(*)
FROM agent_commands
WHERE agent_id = $1 AND status = 'pending'
`
err := q.db.Get(&count, query, agentID)
return count, err
}
// VerifyCommandsCompleted checks which command IDs from the provided list have been completed or failed
// Returns the list of command IDs that have been successfully recorded (completed or failed status)
func (q *CommandQueries) VerifyCommandsCompleted(commandIDs []string) ([]string, error) {
if len(commandIDs) == 0 {
return []string{}, nil
}
// Convert string IDs to UUIDs
uuidIDs := make([]uuid.UUID, 0, len(commandIDs))
for _, idStr := range commandIDs {
id, err := uuid.Parse(idStr)
if err != nil {
// Skip invalid UUIDs
continue
}
uuidIDs = append(uuidIDs, id)
}
if len(uuidIDs) == 0 {
return []string{}, nil
}
// Query for commands that are completed or failed
query := `
SELECT id
FROM agent_commands
WHERE id = ANY($1)
AND status IN ('completed', 'failed')
`
var completedUUIDs []uuid.UUID
err := q.db.Select(&completedUUIDs, query, uuidIDs)
if err != nil {
return nil, fmt.Errorf("failed to verify command completion: %w", err)
}
// Convert back to strings
completedIDs := make([]string, len(completedUUIDs))
for i, id := range completedUUIDs {
completedIDs[i] = id.String()
}
return completedIDs, nil
}

View File

@@ -23,8 +23,9 @@ type AgentCommand struct {
// CommandsResponse is returned when an agent checks in for commands
type CommandsResponse struct {
Commands []CommandItem `json:"commands"`
RapidPolling *RapidPollingConfig `json:"rapid_polling,omitempty"`
AcknowledgedIDs []string `json:"acknowledged_ids,omitempty"` // IDs server has received
}
// RapidPollingConfig contains rapid polling configuration for the agent

View File

@@ -0,0 +1,286 @@
package scheduler
import (
"container/heap"
"fmt"
"sync"
"time"
"github.com/google/uuid"
)
// SubsystemJob represents a scheduled subsystem scan
type SubsystemJob struct {
AgentID uuid.UUID
AgentHostname string // For logging/debugging
Subsystem string
IntervalMinutes int
NextRunAt time.Time
Enabled bool
index int // Heap index (managed by heap.Interface)
}
// String returns a human-readable representation of the job
func (j *SubsystemJob) String() string {
return fmt.Sprintf("[%s/%s] next_run=%s interval=%dm",
j.AgentHostname, j.Subsystem,
j.NextRunAt.Format("15:04:05"), j.IntervalMinutes)
}
// jobHeap implements heap.Interface for SubsystemJob priority queue
// Jobs are ordered by NextRunAt (earliest first)
type jobHeap []*SubsystemJob
func (h jobHeap) Len() int { return len(h) }
func (h jobHeap) Less(i, j int) bool {
return h[i].NextRunAt.Before(h[j].NextRunAt)
}
func (h jobHeap) Swap(i, j int) {
h[i], h[j] = h[j], h[i]
h[i].index = i
h[j].index = j
}
func (h *jobHeap) Push(x interface{}) {
n := len(*h)
job := x.(*SubsystemJob)
job.index = n
*h = append(*h, job)
}
func (h *jobHeap) Pop() interface{} {
old := *h
n := len(old)
job := old[n-1]
old[n-1] = nil // Avoid memory leak
job.index = -1 // Mark as removed
*h = old[0 : n-1]
return job
}
// PriorityQueue is a thread-safe priority queue for subsystem jobs
// Jobs are ordered by their NextRunAt timestamp (earliest first)
type PriorityQueue struct {
heap jobHeap
mu sync.RWMutex
// Index for fast lookups by agent_id + subsystem
index map[string]*SubsystemJob // key: "agent_id:subsystem"
}
// NewPriorityQueue creates a new empty priority queue
func NewPriorityQueue() *PriorityQueue {
pq := &PriorityQueue{
heap: make(jobHeap, 0),
index: make(map[string]*SubsystemJob),
}
heap.Init(&pq.heap)
return pq
}
// Push adds a job to the queue
// If a job with the same agent_id + subsystem already exists, it's updated
func (pq *PriorityQueue) Push(job *SubsystemJob) {
pq.mu.Lock()
defer pq.mu.Unlock()
key := makeKey(job.AgentID, job.Subsystem)
// Check if job already exists
if existing, exists := pq.index[key]; exists {
// Update existing job
existing.NextRunAt = job.NextRunAt
existing.IntervalMinutes = job.IntervalMinutes
existing.Enabled = job.Enabled
existing.AgentHostname = job.AgentHostname
heap.Fix(&pq.heap, existing.index)
return
}
// Add new job
heap.Push(&pq.heap, job)
pq.index[key] = job
}
// Pop removes and returns the job with the earliest NextRunAt
// Returns nil if queue is empty
func (pq *PriorityQueue) Pop() *SubsystemJob {
pq.mu.Lock()
defer pq.mu.Unlock()
if pq.heap.Len() == 0 {
return nil
}
job := heap.Pop(&pq.heap).(*SubsystemJob)
key := makeKey(job.AgentID, job.Subsystem)
delete(pq.index, key)
return job
}
// Peek returns the job with the earliest NextRunAt without removing it
// Returns nil if queue is empty
func (pq *PriorityQueue) Peek() *SubsystemJob {
pq.mu.RLock()
defer pq.mu.RUnlock()
if pq.heap.Len() == 0 {
return nil
}
return pq.heap[0]
}
// Remove removes a specific job from the queue
// Returns true if job was found and removed, false otherwise
func (pq *PriorityQueue) Remove(agentID uuid.UUID, subsystem string) bool {
pq.mu.Lock()
defer pq.mu.Unlock()
key := makeKey(agentID, subsystem)
job, exists := pq.index[key]
if !exists {
return false
}
heap.Remove(&pq.heap, job.index)
delete(pq.index, key)
return true
}
// Get retrieves a specific job without removing it
// Returns nil if not found
func (pq *PriorityQueue) Get(agentID uuid.UUID, subsystem string) *SubsystemJob {
pq.mu.RLock()
defer pq.mu.RUnlock()
key := makeKey(agentID, subsystem)
return pq.index[key]
}
// PopBefore returns all jobs with NextRunAt <= before, up to limit
// Jobs are removed from the queue
// If limit <= 0, all matching jobs are returned
func (pq *PriorityQueue) PopBefore(before time.Time, limit int) []*SubsystemJob {
pq.mu.Lock()
defer pq.mu.Unlock()
var jobs []*SubsystemJob
for pq.heap.Len() > 0 {
// Peek at next job
next := pq.heap[0]
// Stop if next job is after our cutoff
if next.NextRunAt.After(before) {
break
}
// Stop if we've hit the limit
if limit > 0 && len(jobs) >= limit {
break
}
// Pop and collect the job
job := heap.Pop(&pq.heap).(*SubsystemJob)
key := makeKey(job.AgentID, job.Subsystem)
delete(pq.index, key)
jobs = append(jobs, job)
}
return jobs
}
// PeekBefore returns all jobs with NextRunAt <= before without removing them
// If limit <= 0, all matching jobs are returned
func (pq *PriorityQueue) PeekBefore(before time.Time, limit int) []*SubsystemJob {
pq.mu.RLock()
defer pq.mu.RUnlock()
var jobs []*SubsystemJob
for i := 0; i < pq.heap.Len(); i++ {
job := pq.heap[i]
if job.NextRunAt.After(before) {
// The heap array is only partially ordered (only heap[0] is guaranteed
// earliest), so later entries may still be due; skip and keep scanning
continue
}
if limit > 0 && len(jobs) >= limit {
break
}
jobs = append(jobs, job)
}
return jobs
}
// Len returns the number of jobs in the queue
func (pq *PriorityQueue) Len() int {
pq.mu.RLock()
defer pq.mu.RUnlock()
return pq.heap.Len()
}
// Clear removes all jobs from the queue
func (pq *PriorityQueue) Clear() {
pq.mu.Lock()
defer pq.mu.Unlock()
pq.heap = make(jobHeap, 0)
pq.index = make(map[string]*SubsystemJob)
heap.Init(&pq.heap)
}
// GetStats returns statistics about the queue
func (pq *PriorityQueue) GetStats() QueueStats {
pq.mu.RLock()
defer pq.mu.RUnlock()
stats := QueueStats{
Size: pq.heap.Len(),
}
if pq.heap.Len() > 0 {
stats.NextRunAt = &pq.heap[0].NextRunAt
stats.OldestJob = pq.heap[0].String()
}
// Count jobs by subsystem
stats.JobsBySubsystem = make(map[string]int)
for _, job := range pq.heap {
stats.JobsBySubsystem[job.Subsystem]++
}
return stats
}
// QueueStats holds statistics about the priority queue
type QueueStats struct {
Size int
NextRunAt *time.Time
OldestJob string
JobsBySubsystem map[string]int
}
// String returns a human-readable representation of stats
func (s QueueStats) String() string {
nextRun := "empty"
if s.NextRunAt != nil {
nextRun = s.NextRunAt.Format("15:04:05")
}
return fmt.Sprintf("size=%d next=%s oldest=%s", s.Size, nextRun, s.OldestJob)
}
// makeKey creates a unique key for agent_id + subsystem
func makeKey(agentID uuid.UUID, subsystem string) string {
return agentID.String() + ":" + subsystem
}

View File

@@ -0,0 +1,539 @@
package scheduler
import (
"sync"
"testing"
"time"
"github.com/google/uuid"
)
func TestPriorityQueue_BasicOperations(t *testing.T) {
pq := NewPriorityQueue()
// Test empty queue
if pq.Len() != 0 {
t.Fatalf("expected empty queue, got len=%d", pq.Len())
}
if pq.Peek() != nil {
t.Fatal("Peek on empty queue should return nil")
}
if pq.Pop() != nil {
t.Fatal("Pop on empty queue should return nil")
}
// Push a job
agent1 := uuid.New()
job1 := &SubsystemJob{
AgentID: agent1,
AgentHostname: "agent-01",
Subsystem: "updates",
IntervalMinutes: 15,
NextRunAt: time.Now().Add(10 * time.Minute),
}
pq.Push(job1)
if pq.Len() != 1 {
t.Fatalf("expected len=1 after push, got %d", pq.Len())
}
// Peek should return the job without removing it
peeked := pq.Peek()
if peeked == nil {
t.Fatal("Peek should return job")
}
if peeked.AgentID != agent1 {
t.Fatal("Peek returned wrong job")
}
if pq.Len() != 1 {
t.Fatal("Peek should not remove job")
}
// Pop should return and remove the job
popped := pq.Pop()
if popped == nil {
t.Fatal("Pop should return job")
}
if popped.AgentID != agent1 {
t.Fatal("Pop returned wrong job")
}
if pq.Len() != 0 {
t.Fatal("Pop should remove job")
}
}
func TestPriorityQueue_Ordering(t *testing.T) {
pq := NewPriorityQueue()
now := time.Now()
// Push jobs in random order
jobs := []*SubsystemJob{
{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: now.Add(30 * time.Minute), // Third
},
{
AgentID: uuid.New(),
Subsystem: "storage",
NextRunAt: now.Add(5 * time.Minute), // First
},
{
AgentID: uuid.New(),
Subsystem: "docker",
NextRunAt: now.Add(15 * time.Minute), // Second
},
}
for _, job := range jobs {
pq.Push(job)
}
// Pop should return jobs in NextRunAt order
first := pq.Pop()
if first.Subsystem != "storage" {
t.Fatalf("expected 'storage' first, got '%s'", first.Subsystem)
}
second := pq.Pop()
if second.Subsystem != "docker" {
t.Fatalf("expected 'docker' second, got '%s'", second.Subsystem)
}
third := pq.Pop()
if third.Subsystem != "updates" {
t.Fatalf("expected 'updates' third, got '%s'", third.Subsystem)
}
}
func TestPriorityQueue_UpdateExisting(t *testing.T) {
pq := NewPriorityQueue()
agentID := uuid.New()
now := time.Now()
// Push initial job
job1 := &SubsystemJob{
AgentID: agentID,
Subsystem: "updates",
IntervalMinutes: 15,
NextRunAt: now.Add(15 * time.Minute),
}
pq.Push(job1)
if pq.Len() != 1 {
t.Fatalf("expected len=1, got %d", pq.Len())
}
// Push same agent+subsystem with different NextRunAt
job2 := &SubsystemJob{
AgentID: agentID,
Subsystem: "updates",
IntervalMinutes: 30,
NextRunAt: now.Add(30 * time.Minute),
}
pq.Push(job2)
// Should still be 1 job (updated, not added)
if pq.Len() != 1 {
t.Fatalf("expected len=1 after update, got %d", pq.Len())
}
// Verify the job was updated
job := pq.Pop()
if job.IntervalMinutes != 30 {
t.Fatalf("expected interval=30, got %d", job.IntervalMinutes)
}
if !job.NextRunAt.Equal(now.Add(30 * time.Minute)) {
t.Fatal("NextRunAt was not updated")
}
}
func TestPriorityQueue_Remove(t *testing.T) {
pq := NewPriorityQueue()
agent1 := uuid.New()
agent2 := uuid.New()
pq.Push(&SubsystemJob{
AgentID: agent1,
Subsystem: "updates",
NextRunAt: time.Now(),
})
pq.Push(&SubsystemJob{
AgentID: agent2,
Subsystem: "storage",
NextRunAt: time.Now(),
})
if pq.Len() != 2 {
t.Fatalf("expected len=2, got %d", pq.Len())
}
// Remove existing job
removed := pq.Remove(agent1, "updates")
if !removed {
t.Fatal("Remove should return true for existing job")
}
if pq.Len() != 1 {
t.Fatalf("expected len=1 after remove, got %d", pq.Len())
}
// Remove non-existent job
removed = pq.Remove(agent1, "updates")
if removed {
t.Fatal("Remove should return false for non-existent job")
}
if pq.Len() != 1 {
t.Fatal("Remove of non-existent job should not affect queue")
}
}
func TestPriorityQueue_Get(t *testing.T) {
pq := NewPriorityQueue()
agentID := uuid.New()
job := &SubsystemJob{
AgentID: agentID,
Subsystem: "updates",
NextRunAt: time.Now(),
}
pq.Push(job)
// Get existing job
retrieved := pq.Get(agentID, "updates")
if retrieved == nil {
t.Fatal("Get should return job")
}
if retrieved.AgentID != agentID {
t.Fatal("Get returned wrong job")
}
if pq.Len() != 1 {
t.Fatal("Get should not remove job")
}
// Get non-existent job
retrieved = pq.Get(uuid.New(), "storage")
if retrieved != nil {
t.Fatal("Get should return nil for non-existent job")
}
}
func TestPriorityQueue_PopBefore(t *testing.T) {
pq := NewPriorityQueue()
now := time.Now()
// Add jobs with different NextRunAt times
for i := 0; i < 5; i++ {
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: now.Add(time.Duration(i*10) * time.Minute),
})
}
if pq.Len() != 5 {
t.Fatalf("expected len=5, got %d", pq.Len())
}
// Pop jobs before now+25min (should get 3 jobs: 0, 10, 20 minutes)
cutoff := now.Add(25 * time.Minute)
jobs := pq.PopBefore(cutoff, 0) // no limit
if len(jobs) != 3 {
t.Fatalf("expected 3 jobs, got %d", len(jobs))
}
// Verify all returned jobs are before cutoff
for _, job := range jobs {
if job.NextRunAt.After(cutoff) {
t.Fatalf("job NextRunAt %v is after cutoff %v", job.NextRunAt, cutoff)
}
}
// Queue should have 2 jobs left
if pq.Len() != 2 {
t.Fatalf("expected len=2 after PopBefore, got %d", pq.Len())
}
}
func TestPriorityQueue_PopBeforeWithLimit(t *testing.T) {
pq := NewPriorityQueue()
now := time.Now()
// Add 5 jobs all due now
for i := 0; i < 5; i++ {
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: now,
})
}
// Pop with limit of 3
jobs := pq.PopBefore(now.Add(1*time.Hour), 3)
if len(jobs) != 3 {
t.Fatalf("expected 3 jobs (limit), got %d", len(jobs))
}
if pq.Len() != 2 {
t.Fatalf("expected 2 jobs remaining, got %d", pq.Len())
}
}
func TestPriorityQueue_PeekBefore(t *testing.T) {
pq := NewPriorityQueue()
now := time.Now()
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: now.Add(5 * time.Minute),
})
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "storage",
NextRunAt: now.Add(15 * time.Minute),
})
// Peek before 10 minutes (should see 1 job)
jobs := pq.PeekBefore(now.Add(10*time.Minute), 0)
if len(jobs) != 1 {
t.Fatalf("expected 1 job, got %d", len(jobs))
}
// Queue should still have both jobs
if pq.Len() != 2 {
t.Fatalf("expected len=2 after PeekBefore, got %d", pq.Len())
}
}
func TestPriorityQueue_Clear(t *testing.T) {
pq := NewPriorityQueue()
// Add some jobs
for i := 0; i < 10; i++ {
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: time.Now(),
})
}
if pq.Len() != 10 {
t.Fatalf("expected len=10, got %d", pq.Len())
}
// Clear the queue
pq.Clear()
if pq.Len() != 0 {
t.Fatalf("expected len=0 after clear, got %d", pq.Len())
}
if pq.Peek() != nil {
t.Fatal("Peek should return nil after clear")
}
}
func TestPriorityQueue_GetStats(t *testing.T) {
pq := NewPriorityQueue()
now := time.Now()
// Empty queue stats
stats := pq.GetStats()
if stats.Size != 0 {
t.Fatalf("expected size=0, got %d", stats.Size)
}
if stats.NextRunAt != nil {
t.Fatal("empty queue should have nil NextRunAt")
}
// Add jobs
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
AgentHostname: "agent-01",
Subsystem: "updates",
NextRunAt: now.Add(5 * time.Minute),
IntervalMinutes: 15,
})
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "storage",
NextRunAt: now.Add(10 * time.Minute),
})
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: now.Add(15 * time.Minute),
})
stats = pq.GetStats()
if stats.Size != 3 {
t.Fatalf("expected size=3, got %d", stats.Size)
}
if stats.NextRunAt == nil {
t.Fatal("NextRunAt should not be nil")
}
// Should be the earliest job (5 minutes)
expectedNext := now.Add(5 * time.Minute)
if !stats.NextRunAt.Equal(expectedNext) {
t.Fatalf("expected NextRunAt=%v, got %v", expectedNext, stats.NextRunAt)
}
// Check subsystem counts
if stats.JobsBySubsystem["updates"] != 2 {
t.Fatalf("expected 2 updates jobs, got %d", stats.JobsBySubsystem["updates"])
}
if stats.JobsBySubsystem["storage"] != 1 {
t.Fatalf("expected 1 storage job, got %d", stats.JobsBySubsystem["storage"])
}
}
func TestPriorityQueue_Concurrency(t *testing.T) {
pq := NewPriorityQueue()
var wg sync.WaitGroup
// Concurrent pushes
numGoroutines := 100
wg.Add(numGoroutines)
for i := 0; i < numGoroutines; i++ {
go func(idx int) {
defer wg.Done()
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: time.Now().Add(time.Duration(idx) * time.Second),
})
}(i)
}
wg.Wait()
if pq.Len() != numGoroutines {
t.Fatalf("expected len=%d after concurrent pushes, got %d", numGoroutines, pq.Len())
}
// Concurrent pops
wg.Add(numGoroutines)
popped := make(chan *SubsystemJob, numGoroutines)
for i := 0; i < numGoroutines; i++ {
go func() {
defer wg.Done()
if job := pq.Pop(); job != nil {
popped <- job
}
}()
}
wg.Wait()
close(popped)
// Count popped jobs
count := 0
for range popped {
count++
}
if count != numGoroutines {
t.Fatalf("expected %d popped jobs, got %d", numGoroutines, count)
}
if pq.Len() != 0 {
t.Fatalf("expected empty queue after concurrent pops, got len=%d", pq.Len())
}
}
func TestPriorityQueue_ConcurrentReadWrite(t *testing.T) {
pq := NewPriorityQueue()
done := make(chan bool)
// Writer goroutine
go func() {
for i := 0; i < 1000; i++ {
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: time.Now(),
})
time.Sleep(1 * time.Microsecond)
}
done <- true
}()
// Reader goroutine
go func() {
for i := 0; i < 1000; i++ {
pq.Peek()
pq.GetStats()
time.Sleep(1 * time.Microsecond)
}
done <- true
}()
// Wait for both to complete
<-done
<-done
// Should not panic and queue should be consistent
if pq.Len() < 0 {
t.Fatal("queue length became negative (race condition)")
}
}
func BenchmarkPriorityQueue_Push(b *testing.B) {
pq := NewPriorityQueue()
b.ResetTimer()
for i := 0; i < b.N; i++ {
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: time.Now().Add(time.Duration(i) * time.Second),
})
}
}
func BenchmarkPriorityQueue_Pop(b *testing.B) {
pq := NewPriorityQueue()
// Pre-fill the queue
for i := 0; i < b.N; i++ {
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: time.Now().Add(time.Duration(i) * time.Second),
})
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
pq.Pop()
}
}
func BenchmarkPriorityQueue_Peek(b *testing.B) {
pq := NewPriorityQueue()
// Pre-fill with 10000 jobs
for i := 0; i < 10000; i++ {
pq.Push(&SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
NextRunAt: time.Now().Add(time.Duration(i) * time.Second),
})
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
pq.Peek()
}
}

View File

@@ -0,0 +1,406 @@
package scheduler
import (
"context"
"fmt"
"log"
"math/rand"
"sync"
"time"
"github.com/Fimeg/RedFlag/aggregator-server/internal/database/queries"
"github.com/Fimeg/RedFlag/aggregator-server/internal/models"
"github.com/google/uuid"
)
// Config holds scheduler configuration
type Config struct {
// CheckInterval is how often to check the queue for due jobs
CheckInterval time.Duration
// LookaheadWindow is how far ahead to look for jobs
// Jobs due within this window will be batched and jittered
LookaheadWindow time.Duration
// MaxJitter is the maximum random delay added to job execution
MaxJitter time.Duration
// NumWorkers is the number of parallel workers for command creation
NumWorkers int
// BackpressureThreshold is max pending commands per agent before skipping
BackpressureThreshold int
// RateLimitPerSecond is max commands created per second (0 = unlimited)
RateLimitPerSecond int
}
// DefaultConfig returns production-ready default configuration
func DefaultConfig() Config {
return Config{
CheckInterval: 10 * time.Second,
LookaheadWindow: 60 * time.Second,
MaxJitter: 30 * time.Second,
NumWorkers: 10,
BackpressureThreshold: 5,
RateLimitPerSecond: 100,
}
}
// Scheduler manages subsystem job scheduling with priority queue and worker pool
type Scheduler struct {
config Config
queue *PriorityQueue
// Database queries
agentQueries *queries.AgentQueries
commandQueries *queries.CommandQueries
// Worker pool
jobChan chan *SubsystemJob
workers []*worker
// Rate limiting
rateLimiter chan struct{}
// Lifecycle management
ctx context.Context
cancel context.CancelFunc
wg sync.WaitGroup
shutdown chan struct{}
// Metrics
mu sync.RWMutex
stats Stats
}
// Stats holds scheduler statistics
type Stats struct {
JobsProcessed int64
JobsSkipped int64
CommandsCreated int64
CommandsFailed int64
BackpressureSkips int64
LastProcessedAt time.Time
QueueSize int
WorkerPoolUtilized int
AverageProcessingMS int64
}
// NewScheduler creates a new scheduler instance
func NewScheduler(config Config, agentQueries *queries.AgentQueries, commandQueries *queries.CommandQueries) *Scheduler {
ctx, cancel := context.WithCancel(context.Background())
s := &Scheduler{
config: config,
queue: NewPriorityQueue(),
agentQueries: agentQueries,
commandQueries: commandQueries,
jobChan: make(chan *SubsystemJob, 1000), // Buffer 1000 jobs
workers: make([]*worker, config.NumWorkers),
shutdown: make(chan struct{}),
ctx: ctx,
cancel: cancel,
}
// Initialize rate limiter if configured
if config.RateLimitPerSecond > 0 {
s.rateLimiter = make(chan struct{}, config.RateLimitPerSecond)
go s.refillRateLimiter()
}
// Initialize workers
for i := 0; i < config.NumWorkers; i++ {
s.workers[i] = &worker{
id: i,
scheduler: s,
}
}
return s
}
// LoadSubsystems loads all enabled auto-run subsystems from database into queue
func (s *Scheduler) LoadSubsystems(ctx context.Context) error {
log.Println("[Scheduler] Loading subsystems from database...")
// Get all agents (pass empty strings to get all agents regardless of status/os)
agents, err := s.agentQueries.ListAgents("", "")
if err != nil {
return fmt.Errorf("failed to get agents: %w", err)
}
// For now, we'll create default subsystems for each agent
// In full implementation, this would read from agent_subsystems table
subsystems := []string{"updates", "storage", "system", "docker"}
intervals := map[string]int{
"updates": 15, // 15 minutes
"storage": 15,
"system": 30,
"docker": 15,
}
loaded := 0
for _, agent := range agents {
// Skip offline agents (haven't checked in for 10+ minutes)
if time.Since(agent.LastSeen) > 10*time.Minute {
continue
}
for _, subsystem := range subsystems {
// TODO: Check agent metadata for subsystem enablement
// For now, assume all subsystems are enabled
job := &SubsystemJob{
AgentID: agent.ID,
AgentHostname: agent.Hostname,
Subsystem: subsystem,
IntervalMinutes: intervals[subsystem],
NextRunAt: time.Now().Add(time.Duration(intervals[subsystem]) * time.Minute),
Enabled: true,
}
s.queue.Push(job)
loaded++
}
}
log.Printf("[Scheduler] Loaded %d subsystem jobs for %d agents\n", loaded, len(agents))
return nil
}
// Start begins the scheduler main loop and workers
func (s *Scheduler) Start() error {
log.Printf("[Scheduler] Starting with %d workers, check interval %v\n",
s.config.NumWorkers, s.config.CheckInterval)
// Start workers
for _, w := range s.workers {
s.wg.Add(1)
go w.run()
}
// Start main loop
s.wg.Add(1)
go s.mainLoop()
log.Println("[Scheduler] Started successfully")
return nil
}
// Stop gracefully shuts down the scheduler
func (s *Scheduler) Stop() error {
log.Println("[Scheduler] Shutting down...")
// Signal shutdown
s.cancel()
close(s.shutdown)
// Close job channel (workers will drain and exit)
close(s.jobChan)
// Wait for all goroutines with timeout
done := make(chan struct{})
go func() {
s.wg.Wait()
close(done)
}()
select {
case <-done:
log.Println("[Scheduler] Shutdown complete")
return nil
case <-time.After(30 * time.Second):
log.Println("[Scheduler] Shutdown timeout - forcing exit")
return fmt.Errorf("shutdown timeout")
}
}
// mainLoop is the scheduler's main processing loop
func (s *Scheduler) mainLoop() {
defer s.wg.Done()
ticker := time.NewTicker(s.config.CheckInterval)
defer ticker.Stop()
log.Printf("[Scheduler] Main loop started (check every %v)\n", s.config.CheckInterval)
for {
select {
case <-s.shutdown:
log.Println("[Scheduler] Main loop shutting down")
return
case <-ticker.C:
s.processQueue()
}
}
}
// processQueue checks for due jobs and dispatches them to workers
func (s *Scheduler) processQueue() {
start := time.Now()
// Get all jobs due within lookahead window
cutoff := time.Now().Add(s.config.LookaheadWindow)
dueJobs := s.queue.PopBefore(cutoff, 0) // No limit, get all
if len(dueJobs) == 0 {
// No jobs due, just update stats
s.mu.Lock()
s.stats.QueueSize = s.queue.Len()
s.mu.Unlock()
return
}
log.Printf("[Scheduler] Processing %d jobs due before %s\n",
len(dueJobs), cutoff.Format("15:04:05"))
// Add jitter to each job and dispatch to workers
dispatched := 0
for _, job := range dueJobs {
// Add random jitter (0 to MaxJitter)
jitter := time.Duration(rand.Intn(int(s.config.MaxJitter.Seconds()))) * time.Second
job.NextRunAt = job.NextRunAt.Add(jitter)
// Dispatch to worker pool (non-blocking)
select {
case s.jobChan <- job:
dispatched++
default:
// Worker pool full, re-queue job
log.Printf("[Scheduler] Worker pool full, re-queueing %s\n", job.String())
s.queue.Push(job)
s.mu.Lock()
s.stats.JobsSkipped++
s.mu.Unlock()
}
}
// Update stats
duration := time.Since(start)
s.mu.Lock()
s.stats.JobsProcessed += int64(dispatched)
s.stats.LastProcessedAt = time.Now()
s.stats.QueueSize = s.queue.Len()
s.stats.WorkerPoolUtilized = len(s.jobChan)
s.stats.AverageProcessingMS = duration.Milliseconds()
s.mu.Unlock()
log.Printf("[Scheduler] Dispatched %d jobs in %v (queue: %d remaining)\n",
dispatched, duration, s.queue.Len())
}
// refillRateLimiter continuously refills the rate limiter token bucket
func (s *Scheduler) refillRateLimiter() {
ticker := time.NewTicker(time.Second / time.Duration(s.config.RateLimitPerSecond))
defer ticker.Stop()
for {
select {
case <-s.shutdown:
return
case <-ticker.C:
// Try to add token (non-blocking)
select {
case s.rateLimiter <- struct{}{}:
default:
// Bucket full, skip
}
}
}
}
// GetStats returns current scheduler statistics (thread-safe)
func (s *Scheduler) GetStats() Stats {
s.mu.RLock()
defer s.mu.RUnlock()
return s.stats
}
// GetQueueStats returns current queue statistics
func (s *Scheduler) GetQueueStats() QueueStats {
return s.queue.GetStats()
}
// worker processes jobs from the job channel
type worker struct {
id int
scheduler *Scheduler
}
func (w *worker) run() {
defer w.scheduler.wg.Done()
log.Printf("[Worker %d] Started\n", w.id)
for job := range w.scheduler.jobChan {
if err := w.processJob(job); err != nil {
log.Printf("[Worker %d] Failed to process %s: %v\n", w.id, job.String(), err)
w.scheduler.mu.Lock()
w.scheduler.stats.CommandsFailed++
w.scheduler.mu.Unlock()
} else {
w.scheduler.mu.Lock()
w.scheduler.stats.CommandsCreated++
w.scheduler.mu.Unlock()
}
// Re-queue job for next execution
job.NextRunAt = time.Now().Add(time.Duration(job.IntervalMinutes) * time.Minute)
w.scheduler.queue.Push(job)
}
log.Printf("[Worker %d] Stopped\n", w.id)
}
func (w *worker) processJob(job *SubsystemJob) error {
// Apply rate limiting if configured
if w.scheduler.rateLimiter != nil {
select {
case <-w.scheduler.rateLimiter:
// Token acquired
case <-w.scheduler.shutdown:
return fmt.Errorf("shutdown during rate limit wait")
}
}
// Check backpressure: skip if agent has too many pending commands
pendingCount, err := w.scheduler.commandQueries.CountPendingCommandsForAgent(job.AgentID)
if err != nil {
return fmt.Errorf("failed to check pending commands: %w", err)
}
if pendingCount >= w.scheduler.config.BackpressureThreshold {
log.Printf("[Worker %d] Backpressure: agent %s has %d pending commands, skipping %s\n",
w.id, job.AgentHostname, pendingCount, job.Subsystem)
w.scheduler.mu.Lock()
w.scheduler.stats.BackpressureSkips++
w.scheduler.mu.Unlock()
return nil // Not an error, just skipped
}
// Create command
cmd := &models.AgentCommand{
ID: uuid.New(),
AgentID: job.AgentID,
CommandType: fmt.Sprintf("scan_%s", job.Subsystem),
Params: models.JSONB{},
Status: models.CommandStatusPending,
Source: models.CommandSourceSystem,
CreatedAt: time.Now(),
}
if err := w.scheduler.commandQueries.CreateCommand(cmd); err != nil {
return fmt.Errorf("failed to create command: %w", err)
}
log.Printf("[Worker %d] Created %s command for %s\n",
w.id, job.Subsystem, job.AgentHostname)
return nil
}

View File

@@ -0,0 +1,323 @@
package scheduler
import (
"testing"
"time"
"github.com/google/uuid"
)
func TestScheduler_NewScheduler(t *testing.T) {
config := DefaultConfig()
s := NewScheduler(config, nil, nil)
if s == nil {
t.Fatal("NewScheduler returned nil")
}
if s.config.NumWorkers != 10 {
t.Fatalf("expected 10 workers, got %d", s.config.NumWorkers)
}
if s.queue == nil {
t.Fatal("queue not initialized")
}
if len(s.workers) != config.NumWorkers {
t.Fatalf("expected %d workers, got %d", config.NumWorkers, len(s.workers))
}
}
func TestScheduler_DefaultConfig(t *testing.T) {
config := DefaultConfig()
if config.CheckInterval != 10*time.Second {
t.Fatalf("expected check interval 10s, got %v", config.CheckInterval)
}
if config.LookaheadWindow != 60*time.Second {
t.Fatalf("expected lookahead 60s, got %v", config.LookaheadWindow)
}
if config.MaxJitter != 30*time.Second {
t.Fatalf("expected max jitter 30s, got %v", config.MaxJitter)
}
if config.NumWorkers != 10 {
t.Fatalf("expected 10 workers, got %d", config.NumWorkers)
}
if config.BackpressureThreshold != 5 {
t.Fatalf("expected backpressure threshold 5, got %d", config.BackpressureThreshold)
}
if config.RateLimitPerSecond != 100 {
t.Fatalf("expected rate limit 100/s, got %d", config.RateLimitPerSecond)
}
}
func TestScheduler_QueueIntegration(t *testing.T) {
config := DefaultConfig()
s := NewScheduler(config, nil, nil)
// Add jobs to queue
agent1 := uuid.New()
agent2 := uuid.New()
job1 := &SubsystemJob{
AgentID: agent1,
AgentHostname: "agent-01",
Subsystem: "updates",
IntervalMinutes: 15,
NextRunAt: time.Now().Add(5 * time.Minute),
}
job2 := &SubsystemJob{
AgentID: agent2,
AgentHostname: "agent-02",
Subsystem: "storage",
IntervalMinutes: 15,
NextRunAt: time.Now().Add(10 * time.Minute),
}
s.queue.Push(job1)
s.queue.Push(job2)
if s.queue.Len() != 2 {
t.Fatalf("expected queue len 2, got %d", s.queue.Len())
}
// Get stats
stats := s.GetQueueStats()
if stats.Size != 2 {
t.Fatalf("expected stats size 2, got %d", stats.Size)
}
}
func TestScheduler_GetStats(t *testing.T) {
config := DefaultConfig()
s := NewScheduler(config, nil, nil)
// Initial stats should be zero
stats := s.GetStats()
if stats.JobsProcessed != 0 {
t.Fatalf("expected 0 jobs processed, got %d", stats.JobsProcessed)
}
if stats.CommandsCreated != 0 {
t.Fatalf("expected 0 commands created, got %d", stats.CommandsCreated)
}
if stats.BackpressureSkips != 0 {
t.Fatalf("expected 0 backpressure skips, got %d", stats.BackpressureSkips)
}
// Manually update stats (simulating processing)
s.mu.Lock()
s.stats.JobsProcessed = 100
s.stats.CommandsCreated = 95
s.stats.BackpressureSkips = 5
s.mu.Unlock()
stats = s.GetStats()
if stats.JobsProcessed != 100 {
t.Fatalf("expected 100 jobs processed, got %d", stats.JobsProcessed)
}
if stats.CommandsCreated != 95 {
t.Fatalf("expected 95 commands created, got %d", stats.CommandsCreated)
}
if stats.BackpressureSkips != 5 {
t.Fatalf("expected 5 backpressure skips, got %d", stats.BackpressureSkips)
}
}
func TestScheduler_StartStop(t *testing.T) {
config := Config{
CheckInterval: 100 * time.Millisecond, // Fast for testing
LookaheadWindow: 60 * time.Second,
MaxJitter: 1 * time.Second,
NumWorkers: 2,
BackpressureThreshold: 5,
RateLimitPerSecond: 0, // Disable rate limiting for test
}
s := NewScheduler(config, nil, nil)
// Start scheduler
err := s.Start()
if err != nil {
t.Fatalf("failed to start scheduler: %v", err)
}
// Let it run for a bit
time.Sleep(500 * time.Millisecond)
// Stop scheduler
err = s.Stop()
if err != nil {
t.Fatalf("failed to stop scheduler: %v", err)
}
// Should stop cleanly
}
func TestScheduler_ProcessQueueEmpty(t *testing.T) {
config := DefaultConfig()
s := NewScheduler(config, nil, nil)
// Process empty queue should not panic
s.processQueue()
stats := s.GetStats()
if stats.JobsProcessed != 0 {
t.Fatalf("expected 0 jobs processed on empty queue, got %d", stats.JobsProcessed)
}
}
func TestScheduler_ProcessQueueWithJobs(t *testing.T) {
config := Config{
CheckInterval: 1 * time.Second,
LookaheadWindow: 60 * time.Second,
MaxJitter: 5 * time.Second,
NumWorkers: 2,
BackpressureThreshold: 5,
RateLimitPerSecond: 0, // Disable for test
}
s := NewScheduler(config, nil, nil)
// Add jobs that are due now
for i := 0; i < 5; i++ {
job := &SubsystemJob{
AgentID: uuid.New(),
AgentHostname: "test-agent",
Subsystem: "updates",
IntervalMinutes: 15,
NextRunAt: time.Now(), // Due now
}
s.queue.Push(job)
}
if s.queue.Len() != 5 {
t.Fatalf("expected 5 jobs in queue, got %d", s.queue.Len())
}
// Process the queue
s.processQueue()
// Jobs should be dispatched to job channel
// Note: Without database, workers can't actually process them
// But we can verify they were dispatched
stats := s.GetStats()
if stats.JobsProcessed == 0 {
t.Fatal("expected some jobs to be processed")
}
}
func TestScheduler_RateLimiterRefill(t *testing.T) {
config := Config{
CheckInterval: 1 * time.Second,
LookaheadWindow: 60 * time.Second,
MaxJitter: 1 * time.Second,
NumWorkers: 2,
BackpressureThreshold: 5,
RateLimitPerSecond: 10, // 10 tokens per second
}
s := NewScheduler(config, nil, nil)
if s.rateLimiter == nil {
t.Fatal("rate limiter not initialized")
}
// Start refill goroutine
go s.refillRateLimiter()
// Wait for some tokens to be added
time.Sleep(200 * time.Millisecond)
// Should have some tokens available
tokensAvailable := 0
for i := 0; i < 15; i++ {
select {
case <-s.rateLimiter:
tokensAvailable++
default:
break
}
}
if tokensAvailable == 0 {
t.Fatal("expected some tokens to be available after refill")
}
// Should not exceed buffer size (10)
if tokensAvailable > 10 {
t.Fatalf("token bucket overflowed: got %d tokens, max is 10", tokensAvailable)
}
}
func TestScheduler_ConcurrentQueueAccess(t *testing.T) {
config := DefaultConfig()
s := NewScheduler(config, nil, nil)
done := make(chan bool)
// Concurrent pushes
go func() {
for i := 0; i < 100; i++ {
job := &SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
IntervalMinutes: 15,
NextRunAt: time.Now(),
}
s.queue.Push(job)
}
done <- true
}()
// Concurrent stats reads
go func() {
for i := 0; i < 100; i++ {
s.GetStats()
s.GetQueueStats()
}
done <- true
}()
// Wait for both
<-done
<-done
// Should not panic and should have queued jobs
if s.queue.Len() <= 0 {
t.Fatal("expected jobs in queue after concurrent pushes")
}
}
func BenchmarkScheduler_ProcessQueue(b *testing.B) {
config := DefaultConfig()
s := NewScheduler(config, nil, nil)
// Pre-fill queue with jobs
for i := 0; i < 1000; i++ {
job := &SubsystemJob{
AgentID: uuid.New(),
Subsystem: "updates",
IntervalMinutes: 15,
NextRunAt: time.Now(),
}
s.queue.Push(job)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.processQueue()
}
}