Add docs and project files - force for Culurien
This commit is contained in:
432
docs/3_BACKLOG/P3-005_Server-Health-Dashboard.md
Normal file
432
docs/3_BACKLOG/P3-005_Server-Health-Dashboard.md
Normal file
@@ -0,0 +1,432 @@
|
||||
# Server Health Dashboard Component
|
||||
|
||||
**Priority**: P3 (Enhancement)
|
||||
**Source Reference**: From todos.md line 6
|
||||
**Status**: Ready for Implementation
|
||||
|
||||
## Problem Statement
|
||||
|
||||
Administrators lack visibility into server health status, coordination components, and overall system performance. There is no centralized dashboard showing server agent/coordinator selection mechanisms, version verification, config validation, or health check integration.
|
||||
|
||||
## Feature Description
|
||||
|
||||
Create a Server Health Dashboard that provides real-time monitoring of server status, health indicators, coordination components, and performance metrics to help administrators understand system health and troubleshoot issues.
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
1. Real-time server status monitoring dashboard
|
||||
2. Health check integration with settings page
|
||||
3. Server agent/coordinator selection mechanism visibility
|
||||
4. Version verification and config validation status
|
||||
5. Performance metrics display (CPU, memory, database connections)
|
||||
6. Alert thresholds for critical server health issues
|
||||
7. Historical health data tracking
|
||||
8. System status indicators (database, API, scheduler)
|
||||
|
||||
## Technical Approach
|
||||
|
||||
### 1. Server Health Service
|
||||
|
||||
**Health Monitoring Service** (`aggregator-server/internal/services/health_service.go`):
|
||||
```go
|
||||
type ServerHealth struct {
|
||||
ServerID string `json:"server_id"`
|
||||
Status string `json:"status"` // "healthy", "degraded", "unhealthy"
|
||||
Uptime time.Duration `json:"uptime"`
|
||||
Version string `json:"version"`
|
||||
DatabaseStatus DatabaseHealth `json:"database_status"`
|
||||
SchedulerStatus SchedulerHealth `json:"scheduler_status"`
|
||||
APIServerStatus APIServerHealth `json:"api_server_status"`
|
||||
SystemMetrics SystemMetrics `json:"system_metrics"`
|
||||
LastHealthCheck time.Time `json:"last_health_check"`
|
||||
HealthIssues []HealthIssue `json:"health_issues"`
|
||||
}
|
||||
|
||||
// DatabaseHealth reports the outcome of the database connectivity probe.
type DatabaseHealth struct {
	// Status is the probe outcome, e.g. "healthy".
	Status string `json:"status"`
	// ConnectionPool is a connection count taken from the pool statistics.
	ConnectionPool int `json:"connection_pool"`
	// ResponseTime is the measured latency of the probe query. The dashboard
	// renders this value with an "ms" suffix, so producers should report
	// milliseconds.
	ResponseTime float64 `json:"response_time"`
	// LastChecked is when the probe last ran.
	LastChecked time.Time `json:"last_checked"`
}
|
||||
|
||||
// SchedulerHealth reports the state of the background job scheduler.
type SchedulerHealth struct {
	// Status is the scheduler state, e.g. "healthy".
	Status string `json:"status"`
	// RunningJobs is the number of jobs currently executing.
	RunningJobs int `json:"running_jobs"`
	// QueueLength is the number of jobs waiting to run.
	QueueLength int `json:"queue_length"`
	// LastJobExecution is when a job last ran.
	LastJobExecution time.Time `json:"last_job_execution"`
}
|
||||
```
|
||||
|
||||
**Health Check Implementation**:
|
||||
```go
|
||||
func (s *HealthService) CheckServerHealth() (*ServerHealth, error) {
|
||||
health := &ServerHealth{
|
||||
ServerID: s.serverID,
|
||||
Status: "healthy",
|
||||
LastHealthCheck: time.Now(),
|
||||
}
|
||||
|
||||
// Database health check
|
||||
dbHealth, err := s.checkDatabaseHealth()
|
||||
if err != nil {
|
||||
health.HealthIssues = append(health.HealthIssues, HealthIssue{
|
||||
Type: "database",
|
||||
Message: fmt.Sprintf("Database health check failed: %v", err),
|
||||
Severity: "critical",
|
||||
})
|
||||
health.Status = "unhealthy"
|
||||
}
|
||||
health.DatabaseStatus = *dbHealth
|
||||
|
||||
// Scheduler health check
|
||||
schedulerHealth := s.checkSchedulerHealth()
|
||||
health.SchedulerStatus = *schedulerHealth
|
||||
|
||||
// System metrics
|
||||
systemMetrics := s.getSystemMetrics()
|
||||
health.SystemMetrics = *systemMetrics
|
||||
|
||||
// Overall status determination
|
||||
health.determineOverallStatus()
|
||||
|
||||
return health, nil
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Database Health Monitoring
|
||||
|
||||
**Database Connection Health**:
|
||||
```go
|
||||
func (s *HealthService) checkDatabaseHealth() (*DatabaseHealth, error) {
|
||||
start := time.Now()
|
||||
|
||||
// Test database connection
|
||||
var result int
|
||||
err := s.db.QueryRow("SELECT 1").Scan(&result)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("database connection failed: %w", err)
|
||||
}
|
||||
|
||||
responseTime := time.Since(start).Seconds()
|
||||
|
||||
// Get connection pool stats
|
||||
stats := s.db.Stats()
|
||||
|
||||
return &DatabaseHealth{
|
||||
Status: "healthy",
|
||||
ConnectionPool: stats.OpenConnections,
|
||||
ResponseTime: responseTime,
|
||||
LastChecked: time.Now(),
|
||||
}, nil
|
||||
}
|
||||
```
|
||||
|
||||
### 3. API Endpoint
|
||||
|
||||
**Health API Handler** (`aggregator-server/internal/api/handlers/health.go`):
|
||||
```go
|
||||
// GET /api/v1/health
|
||||
func (h *HealthHandler) GetServerHealth(c *gin.Context) {
|
||||
health, err := h.healthService.CheckServerHealth()
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{
|
||||
"status": "unhealthy",
|
||||
"error": err.Error(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, health)
|
||||
}
|
||||
|
||||
// GET /api/v1/health/history
|
||||
func (h *HealthHandler) GetHealthHistory(c *gin.Context) {
|
||||
// Return historical health data for charts
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Frontend Dashboard Component
|
||||
|
||||
**Server Health Dashboard** (`aggregator-web/src/pages/ServerHealth.tsx`):
|
||||
```typescript
|
||||
/**
 * JSON payload returned by GET /api/v1/health.
 *
 * Field names mirror the `json` tags of the Go `ServerHealth` struct.
 * NOTE(review): the server also sends `api_server_status`; its shape is not
 * defined in this document, so it is not modelled here yet.
 */
interface ServerHealth {
  server_id: string;
  status: 'healthy' | 'degraded' | 'unhealthy';
  /**
   * Server uptime. The Go field is a `time.Duration`, which encodes as an
   * integer number of nanoseconds.
   * NOTE(review): confirm `formatDuration` expects nanoseconds.
   */
  uptime: number;
  version: string;
  /** When the report was produced (Go `time.Time`, an RFC 3339 string). */
  last_health_check: string;
  database_status: {
    status: string;
    connection_pool: number;
    /** Probe latency; rendered with an "ms" suffix by the dashboard. */
    response_time: number;
    /** When the database probe last ran (RFC 3339 string). */
    last_checked: string;
  };
  scheduler_status: {
    status: string;
    running_jobs: number;
    queue_length: number;
    /** When a job last ran (RFC 3339 string). */
    last_job_execution: string;
  };
  system_metrics: {
    /** Rendered with a "%" suffix by the dashboard. */
    cpu_usage: number;
    /** Rendered with a "%" suffix by the dashboard. */
    memory_usage: number;
    /** Rendered with a "%" suffix by the dashboard. */
    disk_usage: number;
  };
  health_issues: Array<{
    type: string;
    message: string;
    severity: 'info' | 'warning' | 'critical';
  }>;
}
|
||||
|
||||
/**
 * Server health dashboard page.
 *
 * Loads the current health report and the historical series, optionally
 * re-polls them on a timer, and renders the overall status, per-subsystem
 * cards, outstanding issues and history charts.
 */
const ServerHealthDashboard: React.FC = () => {
  // Polling period used while auto-refresh is enabled.
  const REFRESH_INTERVAL_MS = 30000;
  // Shown in place of a metric that has not been loaded yet, so the cards
  // never render "undefined%" or "undefinedms".
  const PLACEHOLDER = '—';

  const [health, setHealth] = useState<ServerHealth | null>(null);
  const [autoRefresh, setAutoRefresh] = useState(true);
  // NOTE(review): the element type is whatever HealthChart's `data` prop
  // accepts, and the `responseTime` / `systemLoad` keys are assumed to match
  // the GET /api/v1/health/history response — confirm once that endpoint is
  // specified.
  const [historicalData, setHistoricalData] = useState<{
    responseTime: any[];
    systemLoad: any[];
  }>({ responseTime: [], systemLoad: [] });

  // Fetches the current report and the history independently, so a failure
  // of one does not discard the other.
  // NOTE(review): uses fetch directly; switch to the shared client in
  // src/lib/api.ts if requests need auth headers or a base URL.
  const fetchHealthData = React.useCallback(async () => {
    try {
      const response = await fetch('/api/v1/health');
      // A non-2xx body is an error object, not a ServerHealth report.
      setHealth(response.ok ? await response.json() : null);
    } catch {
      setHealth(null);
    }

    try {
      const response = await fetch('/api/v1/health/history');
      if (response.ok) {
        const body = await response.json();
        setHistoricalData({
          responseTime: body?.responseTime ?? [],
          systemLoad: body?.systemLoad ?? [],
        });
      }
    } catch {
      // Keep the previously loaded history; the charts are non-critical.
    }
  }, []);

  // Initial load.
  React.useEffect(() => {
    void fetchHealthData();
  }, [fetchHealthData]);

  // Poll only while the auto-refresh toggle is on.
  React.useEffect(() => {
    if (!autoRefresh) {
      return;
    }
    const timer = window.setInterval(() => {
      void fetchHealthData();
    }, REFRESH_INTERVAL_MS);
    return () => window.clearInterval(timer);
  }, [autoRefresh, fetchHealthData]);

  const database = health?.database_status;
  const scheduler = health?.scheduler_status;
  const system = health?.system_metrics;

  const asPercent = (value: number | undefined) =>
    value === undefined ? PLACEHOLDER : `${value}%`;

  return (
    <div className="server-health-dashboard">
      <div className="health-header">
        <h2>Server Health</h2>
        <div className="health-controls">
          <RefreshToggle enabled={autoRefresh} onChange={setAutoRefresh} />
          <RefreshButton onClick={() => fetchHealthData()} />
        </div>
      </div>

      {/* Overall Status */}
      <div className="overall-status">
        <StatusIndicator
          status={health?.status || 'unknown'}
          message={`Server ${health?.status || 'unknown'}`}
        />
        <div className="uptime">
          Uptime: {formatDuration(health?.uptime || 0)}
        </div>
      </div>

      {/* Health Metrics Grid */}
      <div className="health-metrics-grid">
        <HealthCard
          title="Database"
          status={database?.status ?? 'unknown'}
          metrics={[
            { label: "Connections", value: database?.connection_pool ?? PLACEHOLDER },
            {
              label: "Response Time",
              value:
                database?.response_time != null
                  ? `${database.response_time.toFixed(2)}ms`
                  : PLACEHOLDER,
            },
          ]}
        />
        <HealthCard
          title="Scheduler"
          status={scheduler?.status ?? 'unknown'}
          metrics={[
            { label: "Running Jobs", value: scheduler?.running_jobs ?? PLACEHOLDER },
            { label: "Queue Length", value: scheduler?.queue_length ?? PLACEHOLDER },
          ]}
        />
        <HealthCard
          title="System Resources"
          // The payload carries no status for system resources; do not claim
          // "healthy" before any data has been loaded.
          status={system ? 'healthy' : 'unknown'}
          metrics={[
            { label: "CPU", value: asPercent(system?.cpu_usage) },
            { label: "Memory", value: asPercent(system?.memory_usage) },
            { label: "Disk", value: asPercent(system?.disk_usage) },
          ]}
        />
      </div>

      {/* Health Issues */}
      {health?.health_issues && health.health_issues.length > 0 && (
        <div className="health-issues">
          <h3>Health Issues</h3>
          {health.health_issues.map((issue, index) => (
            <HealthIssueAlert key={index} issue={issue} />
          ))}
        </div>
      )}

      {/* Historical Charts */}
      <div className="health-charts">
        <h3>Historical Health Data</h3>
        <div className="charts-grid">
          <HealthChart
            title="Response Time"
            data={historicalData.responseTime}
            unit="ms"
          />
          <HealthChart
            title="System Load"
            data={historicalData.systemLoad}
            unit="%"
          />
        </div>
      </div>
    </div>
  );
};
|
||||
```
|
||||
|
||||
### 5. Health Monitoring Components
|
||||
|
||||
**Status Indicator Component**:
|
||||
```typescript
|
||||
/**
 * Coloured dot plus message for a health status.
 *
 * "healthy" maps to green, "degraded" to yellow and "unhealthy" to red; any
 * other status falls back to gray. The colour name is appended to the
 * `status-indicator` class.
 */
const StatusIndicator: React.FC<{ status: string; message: string }> = ({ status, message }) => {
  const colorByStatus: Record<string, string> = {
    healthy: 'green',
    degraded: 'yellow',
    unhealthy: 'red',
  };
  // Own-property check so inherited keys such as "constructor" still map to gray.
  const color = Object.prototype.hasOwnProperty.call(colorByStatus, status)
    ? colorByStatus[status]
    : 'gray';

  return (
    <div className={`status-indicator ${color}`}>
      <div className="status-dot"></div>
      <span className="status-message">{message}</span>
    </div>
  );
};
|
||||
```
|
||||
|
||||
**Health Card Component**:
|
||||
```typescript
|
||||
/** Props accepted by the HealthCard component. */
interface HealthCardProps {
  /** Heading shown at the top of the card. */
  title: string;
  /** Status string; used in the card's `status-…` class and its badge. */
  status: string;
  /** Label/value rows rendered in the card body, in order. */
  metrics: Array<{
    label: string;
    value: string | number;
  }>;
}
|
||||
|
||||
/**
 * Card summarising one subsystem: a title with a status badge, followed by
 * one row per metric.
 */
const HealthCard: React.FC<HealthCardProps> = ({ title, status, metrics }) => (
  <div className={`health-card status-${status}`}>
    <div className="card-header">
      <h3>{title}</h3>
      <StatusBadge status={status} />
    </div>
    <div className="card-metrics">
      {metrics.map(({ label, value }, position) => (
        <div key={position} className="metric">
          <span className="metric-label">{label}:</span>
          <span className="metric-value">{value}</span>
        </div>
      ))}
    </div>
  </div>
);
|
||||
```
|
||||
|
||||
## Definition of Done
|
||||
|
||||
- ✅ Server health monitoring service implemented
|
||||
- ✅ Database, scheduler, and system resource health checks
|
||||
- ✅ Real-time health dashboard with status indicators
|
||||
- ✅ Historical health data tracking and visualization
|
||||
- ✅ Alert system for critical health issues
|
||||
- ✅ Auto-refresh functionality
|
||||
- ✅ Mobile-responsive design
|
||||
- ✅ Integration with existing settings page
|
||||
|
||||
## Test Plan
|
||||
|
||||
1. **Unit Tests**
|
||||
- Health check calculations
|
||||
- Status determination logic
|
||||
- Error handling scenarios
|
||||
|
||||
2. **Integration Tests**
|
||||
- Database health check under load
|
||||
- Scheduler monitoring accuracy
|
||||
- System metrics collection
|
||||
|
||||
3. **Stress Tests**
|
||||
- Dashboard performance under high load
|
||||
- Health check impact on system resources
|
||||
- Concurrent health monitoring
|
||||
|
||||
4. **Scenario Tests**
|
||||
- Database connection failures
|
||||
- High system load conditions
|
||||
- Scheduler queue overflow scenarios
|
||||
|
||||
## Files to Create or Modify
|
||||
|
||||
- `aggregator-server/internal/services/health_service.go` - New service
|
||||
- `aggregator-server/internal/api/handlers/health.go` - New handlers
|
||||
- `aggregator-web/src/pages/ServerHealth.tsx` - New dashboard
|
||||
- `aggregator-web/src/components/StatusIndicator.tsx` - Status components
|
||||
- `aggregator-web/src/components/HealthCard.tsx` - Health card component
|
||||
- `aggregator-web/src/lib/api.ts` - API integration
|
||||
|
||||
## Health Check Categories
|
||||
|
||||
### 1. System Health
|
||||
- CPU usage percentage
|
||||
- Memory usage percentage
|
||||
- Disk space availability
|
||||
- Network connectivity
|
||||
|
||||
### 2. Application Health
|
||||
- Database connectivity and response time
|
||||
- API server responsiveness
|
||||
- Scheduler operation status
|
||||
- Background service status
|
||||
|
||||
### 3. Business Logic Health
|
||||
- Agent registration flow
|
||||
- Command queue processing
|
||||
- Update distribution
|
||||
- Token management
|
||||
|
||||
## Alert Thresholds
|
||||
|
||||
### Critical Alerts
|
||||
- Database connection failures
|
||||
- CPU usage > 90% for > 5 minutes
|
||||
- Memory usage > 95%
|
||||
- Scheduler queue length > 1000
|
||||
|
||||
### Warning Alerts
|
||||
- Database response time > 1 second
|
||||
- CPU usage > 80% for > 10 minutes
|
||||
- Memory usage > 85%
|
||||
- Queue length > 500
|
||||
|
||||
## Estimated Effort
|
||||
|
||||
- **Development**: 16-20 hours
|
||||
- **Testing**: 8-12 hours
|
||||
- **Review**: 4-6 hours
|
||||
- **Design/UX**: 6-8 hours
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Existing monitoring infrastructure
|
||||
- System metrics collection
|
||||
- Database connection pooling
|
||||
- Background job processing
|
||||
|
||||
## Risk Assessment
|
||||
|
||||
**Low Risk** - Enhancement that adds monitoring capabilities without affecting core functionality. Health checks are read-only operations with minimal system impact.
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Core Health Service
|
||||
1. Implement health check service
|
||||
2. Create health monitoring endpoints
|
||||
3. Basic status determination logic
|
||||
|
||||
### Phase 2: Dashboard UI
|
||||
1. Create health dashboard layout
|
||||
2. Implement status indicators and metrics
|
||||
3. Add real-time updates
|
||||
|
||||
### Phase 3: Advanced Features
|
||||
1. Historical data tracking
|
||||
2. Alert system integration
|
||||
3. Performance optimization
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
1. **Multi-Server Monitoring**: Support for clustered deployments
|
||||
2. **Predictive Health**: ML-based health prediction
|
||||
3. **Automated Remediation**: Self-healing capabilities
|
||||
4. **Integration with External Monitoring**: Prometheus, Grafana
|
||||
5. **Custom Health Checks**: Pluggable health check system
|
||||
Reference in New Issue
Block a user