Add docs and project files - force for Culurien

2026-03-28 20:46:24 -04:00
parent dc61797423
commit 484a7f77ce
343 changed files with 119530 additions and 0 deletions
--- a/docs/3_BACKLOG/P3-003_Update-Metrics-Dashboard.md
+++ b/docs/3_BACKLOG/P3-003_Update-Metrics-Dashboard.md
@@ -0,0 +1,325 @@
+# Update Metrics Dashboard
+
+**Priority**: P3 (Enhancement)
+**Source Reference**: From todos.md line 60
+**Status**: Ready for Implementation
+
+## Problem Statement
+
+Administrators lack visibility into update operations across their agent fleet. There is no centralized dashboard showing update success/failure rates, agent update readiness, or performance analytics for update operations.
+
+## Feature Description
+
+Create a comprehensive Update Metrics Dashboard that provides real-time visibility into update operations, including success/failure rates, agent readiness tracking, performance analytics, and historical trend analysis for update management.
+
+## Acceptance Criteria
+
+1. Dashboard showing real-time update metrics across all agents
+2. Update success/failure rates with trend analysis
+3. Agent update readiness status and categorization
+4. Performance analytics for update operations
+5. Historical update operation tracking
+6. Filterable views by agent groups, time ranges, and update types
+7. Export capabilities for reporting
+8. Alert thresholds for update failure rates
+
+## Technical Approach
+
+### 1. Backend Metrics Collection
+
+**Update Metrics Service** (`aggregator-server/internal/services/update_metrics.go`):
+```go
+type UpdateMetrics struct {
+    TotalUpdates        int64     `json:"total_updates"`
+    SuccessfulUpdates   int64     `json:"successful_updates"`
+    FailedUpdates       int64     `json:"failed_updates"`
+    PendingUpdates      int64     `json:"pending_updates"`
+    AverageUpdateTime   float64   `json:"average_update_time"`
+    UpdateSuccessRate   float64   `json:"update_success_rate"`
+    ReadyForUpdate      int64     `json:"ready_for_update"`
+    NotReadyForUpdate   int64     `json:"not_ready_for_update"`
+    LastUpdated         time.Time `json:"last_updated"`
+}
+
+type UpdateMetricsTimeSeries struct {
+    Timestamp   time.Time `json:"timestamp"`
+    SuccessRate float64   `json:"success_rate"`
+    UpdateCount int64     `json:"update_count"`
+    FailureRate float64   `json:"failure_rate"`
+}
+```
+
+**Metrics Calculation**:
+```go
+func (s *UpdateMetricsService) CalculateUpdateMetrics(timeRange time.Duration) (*UpdateMetrics, error) {
+    metrics := &UpdateMetrics{}
+
+    // Get update statistics from database
+    stats, err := s.queries.GetUpdateStats(time.Now().Add(-timeRange))
+    if err != nil {
+        return nil, err
+    }
+
+    metrics.TotalUpdates = stats.TotalUpdates
+    metrics.SuccessfulUpdates = stats.SuccessfulUpdates
+    metrics.FailedUpdates = stats.FailedUpdates
+    metrics.PendingUpdates = stats.PendingUpdates
+
+    if metrics.TotalUpdates > 0 {
+        metrics.UpdateSuccessRate = float64(metrics.SuccessfulUpdates) / float64(metrics.TotalUpdates) * 100
+    }
+
+    // Calculate agent readiness
+    readiness, err := s.queries.GetAgentReadinessStats()
+    if err == nil {
+        metrics.ReadyForUpdate = readiness.ReadyCount
+        metrics.NotReadyForUpdate = readiness.NotReadyCount
+    }
+
+    return metrics, nil
+}
+```
+
+### 2. Database Queries
+
+**Update Statistics** (`aggregator-server/internal/database/queries/updates.go`):
+```sql
+-- Update success/failure statistics
+SELECT
+    COUNT(*) as total_updates,
+    COUNT(CASE WHEN status = 'completed' THEN 1 END) as successful_updates,
+    COUNT(CASE WHEN status = 'failed' THEN 1 END) as failed_updates,
+    COUNT(CASE WHEN status IN ('pending', 'sent') THEN 1 END) as pending_updates,
+    AVG(EXTRACT(EPOCH FROM (completed_at - created_at))) as avg_update_time
+FROM update_events
+WHERE created_at > NOW() - $1::INTERVAL;
+
+-- Agent readiness statistics
+SELECT
+    COUNT(CASE WHEN has_available_updates = true AND last_seen > NOW() - INTERVAL '1 hour' THEN 1 END) as ready_count,
+    COUNT(CASE WHEN has_available_updates = false OR last_seen <= NOW() - INTERVAL '1 hour' THEN 1 END) as not_ready_count
+FROM agents;
+```
+
+### 3. API Endpoints
+
+**Metrics API** (`aggregator-server/internal/api/handlers/metrics.go`):
+```go
+// GET /api/v1/metrics/updates
+func (h *MetricsHandler) GetUpdateMetrics(c *gin.Context) {
+    timeRange := c.DefaultQuery("timeRange", "24h")
+    duration, err := time.ParseDuration(timeRange)
+    if err != nil {
+        c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid time range"})
+        return
+    }
+
+    metrics, err := h.metricsService.CalculateUpdateMetrics(duration)
+    if err != nil {
+        c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+        return
+    }
+
+    c.JSON(http.StatusOK, metrics)
+}
+
+// GET /api/v1/metrics/updates/timeseries
+func (h *MetricsHandler) GetUpdateTimeSeries(c *gin.Context) {
+    // Return time series data for charts
+}
+```
+
+### 4. Frontend Dashboard Components
+
+**Update Metrics Dashboard** (`aggregator-web/src/pages/UpdateMetrics.tsx`):
+```typescript
+interface UpdateMetrics {
+    totalUpdates: number;
+    successfulUpdates: number;
+    failedUpdates: number;
+    pendingUpdates: number;
+    updateSuccessRate: number;
+    readyForUpdate: number;
+    notReadyForUpdate: number;
+}
+
+const UpdateMetricsDashboard: React.FC = () => {
+    const [metrics, setMetrics] = useState<UpdateMetrics | null>(null);
+    const [timeRange, setTimeRange] = useState<string>("24h");
+
+    return (
+        <div className="update-metrics-dashboard">
+            <div className="metrics-header">
+                <h2>Update Operations Dashboard</h2>
+                <TimeRangeSelector value={timeRange} onChange={setTimeRange} />
+            </div>
+
+            <div className="metrics-grid">
+                <MetricCard
+                    title="Success Rate"
+                    value={metrics?.updateSuccessRate || 0}
+                    unit="%"
+                    trend={getSuccessRateTrend()}
+                />
+                <MetricCard
+                    title="Ready for Updates"
+                    value={metrics?.readyForUpdate || 0}
+                    unit="agents"
+                />
+                <MetricCard
+                    title="Failed Updates"
+                    value={metrics?.failedUpdates || 0}
+                    trend={getFailureTrend()}
+                />
+                <MetricCard
+                    title="Pending Updates"
+                    value={metrics?.pendingUpdates || 0}
+                />
+            </div>
+
+            <div className="charts-section">
+                <UpdateSuccessRateChart timeRange={timeRange} />
+                <UpdateVolumeChart timeRange={timeRange} />
+                <AgentReadinessChart timeRange={timeRange} />
+            </div>
+        </div>
+    );
+};
+```
+
+**Chart Components**:
+- `UpdateSuccessRateChart`: Line chart showing success rate over time
+- `UpdateVolumeChart`: Bar chart showing update volume trends
+- `AgentReadinessChart`: Pie chart showing ready vs not-ready agents
+- `FailureReasonChart`: Breakdown of update failure reasons
+
+### 5. Real-time Updates
+
+**WebSocket Integration** (optional):
+```typescript
+// Real-time metrics updates
+useEffect(() => {
+    const ws = new WebSocket(`${API_BASE}/ws/metrics/updates`);
+
+    ws.onmessage = (event) => {
+        const updatedMetrics = JSON.parse(event.data);
+        setMetrics(updatedMetrics);
+    };
+
+    return () => ws.close();
+}, [timeRange]);
+```
+
+## Definition of Done
+
+- ✅ Update metrics calculation service implemented
+- ✅ RESTful API endpoints for metrics data
+- ✅ Comprehensive dashboard with key metrics
+- ✅ Interactive charts showing trends and analytics
+- ✅ Real-time or near real-time updates
+- ✅ Filtering by time range, agent groups, update types
+- ✅ Export functionality for reports
+- ✅ Mobile-responsive design
+- ✅ Performance optimization for large datasets
+
+## Test Plan
+
+1. **Unit Tests**
+   - Metrics calculation accuracy
+   - Time series data generation
+   - API response formatting
+
+2. **Integration Tests**
+   - End-to-end metrics flow
+   - Database query performance
+   - Real-time update functionality
+
+3. **Performance Tests**
+   - Dashboard load times with large datasets
+   - API response times under load
+   - Chart rendering performance
+
+4. **User Acceptance Tests**
+   - Administrators can easily identify update issues
+   - Dashboard provides actionable insights
+   - Interface is intuitive and responsive
+
+## Files to Modify
+
+- `aggregator-server/internal/services/update_metrics.go` - New service
+- `aggregator-server/internal/database/queries/metrics.go` - New queries
+- `aggregator-server/internal/api/handlers/metrics.go` - New handlers
+- `aggregator-web/src/pages/UpdateMetrics.tsx` - New dashboard page
+- `aggregator-web/src/components/MetricCard.tsx` - Metric display component
+- `aggregator-web/src/components/charts/` - Chart components
+- `aggregator-web/src/lib/api.ts` - API integration
+
+## Metrics Categories
+
+### 1. Success Metrics
+- Update success rate percentage
+- Successful update count
+- Average update completion time
+- Agent readiness percentage
+
+### 2. Failure Metrics
+- Failed update count
+- Failure rate percentage
+- Common failure reasons
+- Rollback frequency
+
+### 3. Performance Metrics
+- Update queue length
+- Average processing time
+- Agent response time
+- Server load during updates
+
+### 4. Agent Metrics
+- Agents ready for updates
+- Agents with available updates
+- Agents requiring manual intervention
+- Update distribution by agent version
+
+## Estimated Effort
+
+- **Development**: 20-24 hours
+- **Testing**: 12-16 hours
+- **Review**: 6-8 hours
+- **Design/UX**: 8-10 hours
+
+## Dependencies
+
+- Existing update events database
+- Agent status tracking system
+- Chart library (Chart.js, D3.js, etc.)
+- WebSocket infrastructure (for real-time updates)
+
+## Risk Assessment
+
+**Low-Medium Risk** - Enhancement that creates new functionality without affecting existing systems. Performance considerations for large datasets need attention.
+
+## Implementation Phases
+
+### Phase 1: Core Metrics API
+1. Implement metrics calculation service
+2. Create database queries for statistics
+3. Build REST API endpoints
+
+### Phase 2: Dashboard UI
+1. Create basic dashboard layout
+2. Implement metric cards and charts
+3. Add time range filtering
+
+### Phase 3: Advanced Features
+1. Real-time updates
+2. Export functionality
+3. Alert thresholds
+4. Advanced filtering and search
+
+## Future Enhancements
+
+1. **Predictive Analytics**: Predict update success based on agent patterns
+2. **Automated Recommendations**: Suggest optimal update timing
+3. **Integration with APM**: Correlate update performance with system metrics
+4. **Custom Dashboards**: User-configurable metric views
+5. **SLA Monitoring**: Track update performance against service level agreements