Add docs and project files - force for Culurien
This commit is contained in:
325
docs/3_BACKLOG/P3-003_Update-Metrics-Dashboard.md
Normal file
325
docs/3_BACKLOG/P3-003_Update-Metrics-Dashboard.md
Normal file
@@ -0,0 +1,325 @@
|
||||
# Update Metrics Dashboard
|
||||
|
||||
**Priority**: P3 (Enhancement)
|
||||
**Source Reference**: From todos.md line 60
|
||||
**Status**: Ready for Implementation
|
||||
|
||||
## Problem Statement
|
||||
|
||||
Administrators lack visibility into update operations across their agent fleet. There is no centralized dashboard showing update success/failure rates, agent update readiness, or performance analytics for update operations.
|
||||
|
||||
## Feature Description
|
||||
|
||||
Create a comprehensive Update Metrics Dashboard that provides real-time visibility into update operations, including success/failure rates, agent readiness tracking, performance analytics, and historical trend analysis for update management.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
1. Dashboard showing real-time update metrics across all agents
|
||||
2. Update success/failure rates with trend analysis
|
||||
3. Agent update readiness status and categorization
|
||||
4. Performance analytics for update operations
|
||||
5. Historical update operation tracking
|
||||
6. Filterable views by agent groups, time ranges, and update types
|
||||
7. Export capabilities for reporting
|
||||
8. Alert thresholds for update failure rates
|
||||
|
||||
## Technical Approach
|
||||
|
||||
### 1. Backend Metrics Collection
|
||||
|
||||
**Update Metrics Service** (`aggregator-server/internal/services/update_metrics.go`):
|
||||
```go
|
||||
type UpdateMetrics struct {
|
||||
TotalUpdates int64 `json:"total_updates"`
|
||||
SuccessfulUpdates int64 `json:"successful_updates"`
|
||||
FailedUpdates int64 `json:"failed_updates"`
|
||||
PendingUpdates int64 `json:"pending_updates"`
|
||||
AverageUpdateTime float64 `json:"average_update_time"`
|
||||
UpdateSuccessRate float64 `json:"update_success_rate"`
|
||||
ReadyForUpdate int64 `json:"ready_for_update"`
|
||||
NotReadyForUpdate int64 `json:"not_ready_for_update"`
|
||||
LastUpdated time.Time `json:"last_updated"`
|
||||
}
|
||||
|
||||
type UpdateMetricsTimeSeries struct {
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
SuccessRate float64 `json:"success_rate"`
|
||||
UpdateCount int64 `json:"update_count"`
|
||||
FailureRate float64 `json:"failure_rate"`
|
||||
}
|
||||
```
|
||||
|
||||
**Metrics Calculation**:
|
||||
```go
|
||||
func (s *UpdateMetricsService) CalculateUpdateMetrics(timeRange time.Duration) (*UpdateMetrics, error) {
|
||||
metrics := &UpdateMetrics{}
|
||||
|
||||
// Get update statistics from database
|
||||
stats, err := s.queries.GetUpdateStats(time.Now().Add(-timeRange))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
metrics.TotalUpdates = stats.TotalUpdates
|
||||
metrics.SuccessfulUpdates = stats.SuccessfulUpdates
|
||||
metrics.FailedUpdates = stats.FailedUpdates
|
||||
metrics.PendingUpdates = stats.PendingUpdates
|
||||
|
||||
if metrics.TotalUpdates > 0 {
|
||||
metrics.UpdateSuccessRate = float64(metrics.SuccessfulUpdates) / float64(metrics.TotalUpdates) * 100
|
||||
}
|
||||
|
||||
// Calculate agent readiness
|
||||
readiness, err := s.queries.GetAgentReadinessStats()
|
||||
if err == nil {
|
||||
metrics.ReadyForUpdate = readiness.ReadyCount
|
||||
metrics.NotReadyForUpdate = readiness.NotReadyCount
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Database Queries
|
||||
|
||||
**Update Statistics** (`aggregator-server/internal/database/queries/metrics.go`):
|
||||
```sql
|
||||
-- Update success/failure statistics
|
||||
SELECT
|
||||
COUNT(*) as total_updates,
|
||||
COUNT(CASE WHEN status = 'completed' THEN 1 END) as successful_updates,
|
||||
COUNT(CASE WHEN status = 'failed' THEN 1 END) as failed_updates,
|
||||
COUNT(CASE WHEN status IN ('pending', 'sent') THEN 1 END) as pending_updates,
|
||||
AVG(EXTRACT(EPOCH FROM (completed_at - created_at))) as avg_update_time
|
||||
FROM update_events
|
||||
WHERE created_at > $1; -- $1 is the cutoff timestamp passed by the service (now minus the requested time range)
|
||||
|
||||
-- Agent readiness statistics
|
||||
SELECT
|
||||
COUNT(CASE WHEN has_available_updates = true AND last_seen > NOW() - INTERVAL '1 hour' THEN 1 END) as ready_count,
|
||||
COUNT(CASE WHEN has_available_updates = false OR last_seen <= NOW() - INTERVAL '1 hour' THEN 1 END) as not_ready_count
|
||||
FROM agents;
|
||||
```
|
||||
|
||||
### 3. API Endpoints
|
||||
|
||||
**Metrics API** (`aggregator-server/internal/api/handlers/metrics.go`):
|
||||
```go
|
||||
// GET /api/v1/metrics/updates?timeRange=24h (Go duration syntax; the largest unit is "h", so 7 days is "168h")
|
||||
func (h *MetricsHandler) GetUpdateMetrics(c *gin.Context) {
|
||||
timeRange := c.DefaultQuery("timeRange", "24h")
|
||||
duration, err := time.ParseDuration(timeRange)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid time range"})
|
||||
return
|
||||
}
|
||||
|
||||
metrics, err := h.metricsService.CalculateUpdateMetrics(duration)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, metrics)
|
||||
}
|
||||
|
||||
// GET /api/v1/metrics/updates/timeseries
|
||||
func (h *MetricsHandler) GetUpdateTimeSeries(c *gin.Context) {
|
||||
// Return time series data for charts
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Frontend Dashboard Components
|
||||
|
||||
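**Update Metrics Dashboard** (`aggregator-web/src/pages/UpdateMetrics.tsx`). The metrics API returns snake_case keys (e.g. `update_success_rate`), so map them to the camelCase fields below in `aggregator-web/src/lib/api.ts`: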
|
||||
```typescript
|
||||
interface UpdateMetrics {
|
||||
totalUpdates: number;
|
||||
successfulUpdates: number;
|
||||
failedUpdates: number;
|
||||
pendingUpdates: number;
|
||||
updateSuccessRate: number;
|
||||
readyForUpdate: number;
|
||||
notReadyForUpdate: number;
|
||||
}
|
||||
|
||||
const UpdateMetricsDashboard: React.FC = () => {
|
||||
const [metrics, setMetrics] = useState<UpdateMetrics | null>(null);
|
||||
const [timeRange, setTimeRange] = useState<string>("24h");
|
||||
|
||||
return (
|
||||
<div className="update-metrics-dashboard">
|
||||
<div className="metrics-header">
|
||||
<h2>Update Operations Dashboard</h2>
|
||||
<TimeRangeSelector value={timeRange} onChange={setTimeRange} />
|
||||
</div>
|
||||
|
||||
<div className="metrics-grid">
|
||||
<MetricCard
|
||||
title="Success Rate"
|
||||
value={metrics?.updateSuccessRate || 0}
|
||||
unit="%"
|
||||
trend={getSuccessRateTrend()}
|
||||
/>
|
||||
<MetricCard
|
||||
title="Ready for Updates"
|
||||
value={metrics?.readyForUpdate || 0}
|
||||
unit="agents"
|
||||
/>
|
||||
<MetricCard
|
||||
title="Failed Updates"
|
||||
value={metrics?.failedUpdates || 0}
|
||||
trend={getFailureTrend()}
|
||||
/>
|
||||
<MetricCard
|
||||
title="Pending Updates"
|
||||
value={metrics?.pendingUpdates || 0}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="charts-section">
|
||||
<UpdateSuccessRateChart timeRange={timeRange} />
|
||||
<UpdateVolumeChart timeRange={timeRange} />
|
||||
<AgentReadinessChart timeRange={timeRange} />
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
```
|
||||
|
||||
**Chart Components**:
|
||||
- `UpdateSuccessRateChart`: Line chart showing success rate over time
|
||||
- `UpdateVolumeChart`: Bar chart showing update volume trends
|
||||
- `AgentReadinessChart`: Pie chart showing ready vs not-ready agents
|
||||
- `FailureReasonChart`: Breakdown of update failure reasons
|
||||
|
||||
### 5. Real-time Updates
|
||||
|
||||
**WebSocket Integration** (optional):
|
||||
```typescript
|
||||
// Real-time metrics updates
|
||||
useEffect(() => {
|
||||
const ws = new WebSocket(`${API_BASE}/ws/metrics/updates`);
|
||||
|
||||
ws.onmessage = (event) => {
|
||||
const updatedMetrics = JSON.parse(event.data);
|
||||
setMetrics(updatedMetrics);
|
||||
};
|
||||
|
||||
return () => ws.close();
|
||||
}, [timeRange]);
|
||||
```
|
||||
|
||||
## Definition of Done
|
||||
|
||||
- ✅ Update metrics calculation service implemented
|
||||
- ✅ RESTful API endpoints for metrics data
|
||||
- ✅ Comprehensive dashboard with key metrics
|
||||
- ✅ Interactive charts showing trends and analytics
|
||||
- ✅ Real-time or near real-time updates
|
||||
- ✅ Filtering by time range, agent groups, update types
|
||||
- ✅ Export functionality for reports
|
||||
- ✅ Mobile-responsive design
|
||||
- ✅ Performance optimization for large datasets
|
||||
|
||||
## Test Plan
|
||||
|
||||
1. **Unit Tests**
|
||||
- Metrics calculation accuracy
|
||||
- Time series data generation
|
||||
- API response formatting
|
||||
|
||||
2. **Integration Tests**
|
||||
- End-to-end metrics flow
|
||||
- Database query performance
|
||||
- Real-time update functionality
|
||||
|
||||
3. **Performance Tests**
|
||||
- Dashboard load times with large datasets
|
||||
- API response times under load
|
||||
- Chart rendering performance
|
||||
|
||||
4. **User Acceptance Tests**
|
||||
- Administrators can easily identify update issues
|
||||
- Dashboard provides actionable insights
|
||||
- Interface is intuitive and responsive
|
||||
|
||||
## Files to Modify
|
||||
|
||||
- `aggregator-server/internal/services/update_metrics.go` - New service
|
||||
- `aggregator-server/internal/database/queries/metrics.go` - New queries
|
||||
- `aggregator-server/internal/api/handlers/metrics.go` - New handlers
|
||||
- `aggregator-web/src/pages/UpdateMetrics.tsx` - New dashboard page
|
||||
- `aggregator-web/src/components/MetricCard.tsx` - Metric display component
|
||||
- `aggregator-web/src/components/charts/` - Chart components
|
||||
- `aggregator-web/src/lib/api.ts` - API integration
|
||||
|
||||
## Metrics Categories
|
||||
|
||||
### 1. Success Metrics
|
||||
- Update success rate percentage
|
||||
- Successful update count
|
||||
- Average update completion time
|
||||
- Agent readiness percentage
|
||||
|
||||
### 2. Failure Metrics
|
||||
- Failed update count
|
||||
- Failure rate percentage
|
||||
- Common failure reasons
|
||||
- Rollback frequency
|
||||
|
||||
### 3. Performance Metrics
|
||||
- Update queue length
|
||||
- Average processing time
|
||||
- Agent response time
|
||||
- Server load during updates
|
||||
|
||||
### 4. Agent Metrics
|
||||
- Agents ready for updates
|
||||
- Agents with available updates
|
||||
- Agents requiring manual intervention
|
||||
- Update distribution by agent version
|
||||
|
||||
## Estimated Effort
|
||||
|
||||
- **Development**: 20-24 hours
|
||||
- **Testing**: 12-16 hours
|
||||
- **Review**: 6-8 hours
|
||||
- **Design/UX**: 8-10 hours
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Existing update events database
|
||||
- Agent status tracking system
|
||||
- Chart library (Chart.js, D3.js, etc.)
|
||||
- WebSocket infrastructure (for real-time updates)
|
||||
|
||||
## Risk Assessment
|
||||
|
||||
**Low-Medium Risk** - Enhancement that creates new functionality without affecting existing systems. Performance considerations for large datasets need attention.
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Core Metrics API
|
||||
1. Implement metrics calculation service
|
||||
2. Create database queries for statistics
|
||||
3. Build REST API endpoints
|
||||
|
||||
### Phase 2: Dashboard UI
|
||||
1. Create basic dashboard layout
|
||||
2. Implement metric cards and charts
|
||||
3. Add time range filtering
|
||||
|
||||
### Phase 3: Advanced Features
|
||||
1. Real-time updates
|
||||
2. Export functionality
|
||||
3. Alert thresholds
|
||||
4. Advanced filtering and search
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
1. **Predictive Analytics**: Predict update success based on agent patterns
|
||||
2. **Automated Recommendations**: Suggest optimal update timing
|
||||
3. **Integration with APM**: Correlate update performance with system metrics
|
||||
4. **Custom Dashboards**: User-configurable metric views
|
||||
5. **SLA Monitoring**: Track update performance against service level agreements
|
||||
Reference in New Issue
Block a user