feat: add resilience and reliability features for agent subsystems

Adds circuit breakers with configurable timeouts for all subsystems (APT, DNF, Docker, Windows, Winget, Storage). Replaces the cron-based scheduler with a priority queue that should scale beyond 1,000 agents, if your homelab is that big.

The command acknowledgment system ensures results aren't lost on network failures or restarts. The agent tracks pending acknowledgments with persistent state and automatic retry.

- Circuit breakers: 3 failures within 1 min open the circuit, 30s cooldown
- Per-subsystem timeouts: 30s-10min depending on scanner
- Priority queue scheduler: O(log n), worker pool, jitter, backpressure
- Acknowledgments: at-least-once delivery, max 10 retries over 24h
- All tests passing (26/26)
This commit is contained in:
Fimeg
2025-11-01 18:42:41 -04:00
parent 528848f476
commit bf4d46529f
26 changed files with 2733 additions and 152 deletions

View File

@@ -12,6 +12,7 @@ import {
Info,
TrendingUp,
Server,
MemoryStick,
} from 'lucide-react';
import { formatBytes, formatRelativeTime } from '@/lib/utils';
import { agentApi } from '@/lib/api';
@@ -160,120 +161,85 @@ export function AgentStorage({ agentId }: AgentStorageProps) {
</button>
</div>
{/* Simple list - no boxes, just clean rows */}
<div className="space-y-6">
{/* Memory */}
{/* Memory & Disk - matching Overview styling */}
<div className="space-y-4">
{/* Memory - GREEN to differentiate from disks */}
{storageMetrics && storageMetrics.memory_total_gb > 0 && (
<div className="space-y-2">
<div className="flex items-center justify-between text-sm">
<span className="text-gray-600">Memory</span>
<span className="text-gray-900 font-mono">
{storageMetrics.memory_used_gb.toFixed(1)} / {storageMetrics.memory_total_gb.toFixed(1)} GB
<span className="text-gray-500 ml-2">({storageMetrics.memory_percent.toFixed(0)}%)</span>
</span>
<div>
<div className="flex items-center justify-between">
<p className="text-sm text-gray-600 flex items-center">
<MemoryStick className="h-4 w-4 mr-1" />
Memory
</p>
<p className="text-sm font-medium text-gray-900">
{storageMetrics.memory_used_gb.toFixed(1)} GB / {storageMetrics.memory_total_gb.toFixed(1)} GB
</p>
</div>
<div className="w-full h-1 bg-gray-100 rounded-full overflow-hidden">
<div className="w-full bg-gray-200 rounded-full h-2 mt-1">
<div
className="h-full bg-gray-900 transition-all"
className="bg-green-600 h-2 rounded-full transition-all"
style={{ width: `${Math.min(storageMetrics.memory_percent, 100)}%` }}
/>
</div>
<p className="text-xs text-gray-500 mt-1">
{storageMetrics.memory_percent.toFixed(0)}% used
</p>
</div>
)}
{/* Root Disk */}
{storageMetrics && storageMetrics.disk_total_gb > 0 && (
<div className="space-y-2">
<div className="flex items-center justify-between text-sm">
<span className="text-gray-600">Root filesystem</span>
<span className="text-gray-900 font-mono">
{storageMetrics.disk_used_gb.toFixed(1)} / {storageMetrics.disk_total_gb.toFixed(1)} GB
<span className="text-gray-500 ml-2">({storageMetrics.disk_percent.toFixed(0)}%)</span>
</span>
{/* All Disks from system_info.disk_info - BLUE matching Overview */}
{disks.length > 0 && disks.map((disk, index) => (
<div key={index}>
<div className="flex items-center justify-between">
<p className="text-sm text-gray-600 flex items-center">
<HardDrive className="h-4 w-4 mr-1" />
Disk ({disk.mountpoint})
</p>
<p className="text-sm font-medium text-gray-900">
{formatBytes(disk.used)} / {formatBytes(disk.total)}
</p>
</div>
<div className="w-full h-1 bg-gray-100 rounded-full overflow-hidden">
<div className="w-full bg-gray-200 rounded-full h-2 mt-1">
<div
className="h-full bg-gray-900 transition-all"
className="bg-blue-600 h-2 rounded-full transition-all"
style={{ width: `${Math.min(disk.used_percent, 100)}%` }}
/>
</div>
<p className="text-xs text-gray-500 mt-1">
{disk.used_percent.toFixed(0)}% used
</p>
</div>
))}
{/* Fallback if no disk array but we have metadata */}
{disks.length === 0 && storageMetrics && storageMetrics.disk_total_gb > 0 && (
<div>
<div className="flex items-center justify-between">
<p className="text-sm text-gray-600 flex items-center">
<HardDrive className="h-4 w-4 mr-1" />
Disk (/)
</p>
<p className="text-sm font-medium text-gray-900">
{storageMetrics.disk_used_gb.toFixed(1)} GB / {storageMetrics.disk_total_gb.toFixed(1)} GB
</p>
</div>
<div className="w-full bg-gray-200 rounded-full h-2 mt-1">
<div
className="bg-blue-600 h-2 rounded-full transition-all"
style={{ width: `${Math.min(storageMetrics.disk_percent, 100)}%` }}
/>
</div>
</div>
)}
{/* Largest disk if different */}
{storageMetrics && storageMetrics.largest_disk_total_gb > 0 && storageMetrics.largest_disk_mount !== '/' && (
<div className="space-y-2">
<div className="flex items-center justify-between text-sm">
<span className="text-gray-600">{storageMetrics.largest_disk_mount}</span>
<span className="text-gray-900 font-mono">
{storageMetrics.largest_disk_used_gb.toFixed(1)} / {storageMetrics.largest_disk_total_gb.toFixed(1)} GB
<span className="text-gray-500 ml-2">({storageMetrics.largest_disk_percent.toFixed(0)}%)</span>
</span>
</div>
<div className="w-full h-1 bg-gray-100 rounded-full overflow-hidden">
<div
className="h-full bg-gray-900 transition-all"
style={{ width: `${Math.min(storageMetrics.largest_disk_percent, 100)}%` }}
/>
</div>
<p className="text-xs text-gray-500 mt-1">
{storageMetrics.disk_percent.toFixed(0)}% used
</p>
</div>
)}
</div>
{/* All partitions - minimal table */}
{disks.length > 0 && (
<div className="space-y-3">
<h3 className="text-sm font-medium text-gray-600">All partitions</h3>
<div className="border border-gray-200 rounded-lg overflow-hidden">
<table className="min-w-full text-sm divide-y divide-gray-200">
<thead className="bg-gray-50">
<tr>
<th className="text-left px-4 py-2 text-xs font-medium text-gray-500">Mount</th>
<th className="text-left px-4 py-2 text-xs font-medium text-gray-500">Device</th>
<th className="text-left px-4 py-2 text-xs font-medium text-gray-500">Type</th>
<th className="text-right px-4 py-2 text-xs font-medium text-gray-500">Used</th>
<th className="text-right px-4 py-2 text-xs font-medium text-gray-500">Total</th>
<th className="text-right px-4 py-2 text-xs font-medium text-gray-500">Usage</th>
</tr>
</thead>
<tbody className="divide-y divide-gray-100 bg-white">
{disks.map((disk, index) => (
<tr key={index} className="hover:bg-gray-50 transition-colors">
<td className="px-4 py-3 text-sm text-gray-900">
<div className="flex items-center space-x-2">
<span className="font-mono">{disk.mountpoint}</span>
{disk.is_root && <span className="text-xs text-gray-500">root</span>}
</div>
</td>
<td className="px-4 py-3 text-xs text-gray-500 font-mono">{disk.device}</td>
<td className="px-4 py-3 text-xs text-gray-500">{disk.disk_type}</td>
<td className="px-4 py-3 text-sm text-right text-gray-900">{formatBytes(disk.used)}</td>
<td className="px-4 py-3 text-sm text-right text-gray-500">{formatBytes(disk.total)}</td>
<td className="px-4 py-3 text-right">
<div className="flex items-center justify-end space-x-2">
<span className="text-sm text-gray-900">{disk.used_percent.toFixed(0)}%</span>
<div className="w-16 h-1 bg-gray-100 rounded-full overflow-hidden">
<div
className="h-full bg-gray-900"
style={{ width: `${Math.min(disk.used_percent, 100)}%` }}
/>
</div>
</div>
</td>
</tr>
))}
</tbody>
</table>
</div>
</div>
)}
{/* Last updated - minimal */}
{agentData && (
<div className="text-xs text-gray-400">
Last updated {agentData.last_seen ? formatRelativeTime(agentData.last_seen) : 'unknown'}
</div>
)}
{/* Refresh info */}
<div className="text-xs text-gray-400 border-t border-gray-200 pt-4">
Auto-refreshes every 30 seconds Last updated {agentData?.last_seen ? formatRelativeTime(agentData.last_seen) : 'unknown'}
</div>
</div>
);
}