feat: add resilience and reliability features for agent subsystems
Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale beyond 1000 agents, if your homelab is that big. Added a command acknowledgment system that ensures results aren't lost on network failures or restarts: the agent tracks pending acknowledgments in persistent state and retries them automatically.

- Circuit breakers: 3 failures within 1 min open the circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
@@ -12,6 +13,7 @@ import (
|
||||
"github.com/Fimeg/RedFlag/aggregator-server/internal/config"
|
||||
"github.com/Fimeg/RedFlag/aggregator-server/internal/database"
|
||||
"github.com/Fimeg/RedFlag/aggregator-server/internal/database/queries"
|
||||
"github.com/Fimeg/RedFlag/aggregator-server/internal/scheduler"
|
||||
"github.com/Fimeg/RedFlag/aggregator-server/internal/services"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
@@ -285,10 +287,45 @@ func main() {
|
||||
timeoutService.Start()
|
||||
log.Println("Timeout service started")
|
||||
|
||||
// Add graceful shutdown for timeout service
|
||||
// Initialize and start scheduler
|
||||
schedulerConfig := scheduler.DefaultConfig()
|
||||
subsystemScheduler := scheduler.NewScheduler(schedulerConfig, agentQueries, commandQueries)
|
||||
|
||||
// Load subsystems into queue
|
||||
ctx := context.Background()
|
||||
if err := subsystemScheduler.LoadSubsystems(ctx); err != nil {
|
||||
log.Printf("Warning: Failed to load subsystems: %v", err)
|
||||
} else {
|
||||
log.Println("Subsystems loaded into scheduler")
|
||||
}
|
||||
|
||||
// Start scheduler
|
||||
if err := subsystemScheduler.Start(); err != nil {
|
||||
log.Printf("Warning: Failed to start scheduler: %v", err)
|
||||
}
|
||||
|
||||
// Add scheduler stats endpoint (after scheduler is initialized)
|
||||
router.GET("/api/v1/scheduler/stats", middleware.AuthMiddleware(), func(c *gin.Context) {
|
||||
stats := subsystemScheduler.GetStats()
|
||||
queueStats := subsystemScheduler.GetQueueStats()
|
||||
c.JSON(200, gin.H{
|
||||
"scheduler": stats,
|
||||
"queue": queueStats,
|
||||
})
|
||||
})
|
||||
|
||||
// Add graceful shutdown for services
|
||||
defer func() {
|
||||
log.Println("Shutting down services...")
|
||||
|
||||
// Stop scheduler first
|
||||
if err := subsystemScheduler.Stop(); err != nil {
|
||||
log.Printf("Error stopping scheduler: %v", err)
|
||||
}
|
||||
|
||||
// Stop timeout service
|
||||
timeoutService.Stop()
|
||||
log.Println("Timeout service stopped")
|
||||
log.Println("Services stopped")
|
||||
}()
|
||||
|
||||
// Start server
|
||||
|
||||
Reference in New Issue
Block a user