feat: add resilience and reliability features for agent subsystems

Added circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaced the cron-based scheduler with a priority queue that should scale beyond 1000 agents, if your homelab is that big.

Command acknowledgment system ensures results aren't lost on network failures or restarts. Agent tracks pending acknowledgments with persistent state and automatic retry.

- Circuit breakers: 3 failures in 1min opens circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
Fimeg
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions

62
test_disk_detection.go Normal file
View File

@@ -0,0 +1,62 @@
package main
import (
"encoding/json"
"fmt"
"log"
"path/filepath"
"runtime"
"github.com/redflag-aggregator/aggregator-agent/internal/system"
)
// main is a manual smoke test for the agent's disk detection. It prints the
// lightweight metrics returned by system.GetLightweightMetrics, followed by
// the per-disk inventory returned by system.GetSystemInfo. A failure in one
// section is logged and does not prevent the other section from running.
func main() {
	// Number of bytes in one gibibyte; used to scale raw byte counts for display.
	const bytesPerGB = 1024 * 1024 * 1024

	// Resolve the parent of this source file's directory and report it.
	// The path is informational only: nothing below depends on the working
	// directory, so the process does not chdir.
	if _, filename, _, ok := runtime.Caller(0); ok {
		projectRoot := filepath.Dir(filepath.Dir(filename))
		log.Printf("Project root: %s", projectRoot)
	} else {
		log.Printf("Could not determine source file location")
	}

	// Test lightweight metrics (most common use case)
	fmt.Println("=== Enhanced Lightweight Metrics Test ===")
	metrics, err := system.GetLightweightMetrics()
	if err != nil {
		log.Printf("Error getting lightweight metrics: %v", err)
	} else {
		// Pretty print the JSON; a marshalling failure is reported but does
		// not suppress the summary lines that follow.
		jsonData, jsonErr := json.MarshalIndent(metrics, "", " ")
		if jsonErr != nil {
			log.Printf("Error encoding lightweight metrics as JSON: %v", jsonErr)
		} else {
			fmt.Printf("LightweightMetrics:\n%s\n\n", jsonData)
		}

		// Show key findings
		fmt.Printf("Root Disk: %.1fGB used / %.1fGB total (%.1f%%)\n",
			metrics.DiskUsedGB, metrics.DiskTotalGB, metrics.DiskPercent)
		// The largest-disk line is printed only when a non-zero total was reported.
		if metrics.LargestDiskTotalGB > 0 {
			fmt.Printf("Largest Disk (%s): %.1fGB used / %.1fGB total (%.1f%%)\n",
				metrics.LargestDiskMount, metrics.LargestDiskUsedGB, metrics.LargestDiskTotalGB, metrics.LargestDiskPercent)
		}
	}

	// Test full system info (detailed disk inventory)
	fmt.Println("\n=== Enhanced System Info Test ===")
	sysInfo, err := system.GetSystemInfo("test-v0.1.5")
	if err != nil {
		log.Printf("Error getting system info: %v", err)
		return
	}

	fmt.Printf("Found %d disks:\n", len(sysInfo.DiskInfo))
	for i, disk := range sysInfo.DiskInfo {
		fmt.Printf(" Disk %d: %s (%s) - %s, %.1fGB used / %.1fGB total (%.1f%%)",
			i+1, disk.Mountpoint, disk.Filesystem, disk.DiskType,
			float64(disk.Used)/bytesPerGB, float64(disk.Total)/bytesPerGB, disk.UsedPercent)
		// Append the flags on the same line before terminating it.
		if disk.IsRoot {
			fmt.Print(" [ROOT]")
		}
		if disk.IsLargest {
			fmt.Print(" [LARGEST]")
		}
		fmt.Println()
	}
}