feat: add resilience and reliability features for agent subsystems

Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale beyond 1000 agents, if your homelab is that big.

Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry.

- Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
Fimeg
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions

View File

@@ -1,6 +1,7 @@
package main
import (
"context"
"flag"
"fmt"
"log"
@@ -12,6 +13,7 @@ import (
"github.com/Fimeg/RedFlag/aggregator-server/internal/config"
"github.com/Fimeg/RedFlag/aggregator-server/internal/database"
"github.com/Fimeg/RedFlag/aggregator-server/internal/database/queries"
"github.com/Fimeg/RedFlag/aggregator-server/internal/scheduler"
"github.com/Fimeg/RedFlag/aggregator-server/internal/services"
"github.com/gin-gonic/gin"
)
@@ -285,10 +287,45 @@ func main() {
timeoutService.Start()
log.Println("Timeout service started")
// Add graceful shutdown for timeout service
// Initialize and start scheduler
schedulerConfig := scheduler.DefaultConfig()
subsystemScheduler := scheduler.NewScheduler(schedulerConfig, agentQueries, commandQueries)
// Load subsystems into queue
ctx := context.Background()
if err := subsystemScheduler.LoadSubsystems(ctx); err != nil {
log.Printf("Warning: Failed to load subsystems: %v", err)
} else {
log.Println("Subsystems loaded into scheduler")
}
// Start scheduler
if err := subsystemScheduler.Start(); err != nil {
log.Printf("Warning: Failed to start scheduler: %v", err)
}
// Add scheduler stats endpoint (after scheduler is initialized)
router.GET("/api/v1/scheduler/stats", middleware.AuthMiddleware(), func(c *gin.Context) {
stats := subsystemScheduler.GetStats()
queueStats := subsystemScheduler.GetQueueStats()
c.JSON(200, gin.H{
"scheduler": stats,
"queue": queueStats,
})
})
// Add graceful shutdown for services
defer func() {
log.Println("Shutting down services...")
// Stop scheduler first
if err := subsystemScheduler.Stop(); err != nil {
log.Printf("Error stopping scheduler: %v", err)
}
// Stop timeout service
timeoutService.Stop()
log.Println("Timeout service stopped")
log.Println("Services stopped")
}()
// Start server