432 lines
13 KiB
Markdown
432 lines
13 KiB
Markdown
# Server Health Dashboard Component
|
|
|
|
**Priority**: P3 (Enhancement)
|
|
**Source Reference**: From todos.md line 6
|
|
**Status**: Ready for Implementation
|
|
|
|
## Problem Statement
|
|
|
|
Administrators lack visibility into server health status, coordination components, and overall system performance. There is no centralized dashboard showing server agent/coordinator selection mechanisms, version verification, config validation, or health check integration.
|
|
|
|
## Feature Description
|
|
|
|
Create a Server Health Dashboard that provides real-time monitoring of server status, health indicators, coordination components, and performance metrics to help administrators understand system health and troubleshoot issues.
|
|
|
|
## Acceptance Criteria
|
|
|
|
1. Real-time server status monitoring dashboard
|
|
2. Health check integration with settings page
|
|
3. Server agent/coordinator selection mechanism visibility
|
|
4. Version verification and config validation status
|
|
5. Performance metrics display (CPU, memory, database connections)
|
|
6. Alert thresholds for critical server health issues
|
|
7. Historical health data tracking
|
|
8. System status indicators (database, API, scheduler)
|
|
|
|
## Technical Approach
|
|
|
|
### 1. Server Health Service
|
|
|
|
**Health Monitoring Service** (`aggregator-server/internal/services/health_service.go`):
|
|
```go
|
|
type ServerHealth struct {
|
|
ServerID string `json:"server_id"`
|
|
Status string `json:"status"` // "healthy", "degraded", "unhealthy"
|
|
Uptime time.Duration `json:"uptime"`
|
|
Version string `json:"version"`
|
|
DatabaseStatus DatabaseHealth `json:"database_status"`
|
|
SchedulerStatus SchedulerHealth `json:"scheduler_status"`
|
|
APIServerStatus APIServerHealth `json:"api_server_status"`
|
|
SystemMetrics SystemMetrics `json:"system_metrics"`
|
|
LastHealthCheck time.Time `json:"last_health_check"`
|
|
HealthIssues []HealthIssue `json:"health_issues"`
|
|
}
|
|
|
|
// DatabaseHealth is the result of a single database health probe.
type DatabaseHealth struct {
	// Status is the probe verdict, e.g. "healthy".
	Status string `json:"status"`
	// ConnectionPool is the number of open connections reported by the pool.
	ConnectionPool int `json:"connection_pool"`
	// ResponseTime is the measured latency of the probe query.
	ResponseTime float64 `json:"response_time"`
	// LastChecked is when the probe completed.
	LastChecked time.Time `json:"last_checked"`
}
|
|
|
|
// SchedulerHealth reports the scheduler's status together with its job and
// queue counters.
type SchedulerHealth struct {
	// Status is the scheduler verdict, e.g. "healthy".
	Status string `json:"status"`
	// RunningJobs and QueueLength are the scheduler's job counters.
	RunningJobs int `json:"running_jobs"`
	QueueLength int `json:"queue_length"`
	// LastJobExecution is the time of the most recent job run.
	LastJobExecution time.Time `json:"last_job_execution"`
}
|
|
```
|
|
|
|
**Health Check Implementation**:
|
|
```go
|
|
func (s *HealthService) CheckServerHealth() (*ServerHealth, error) {
|
|
health := &ServerHealth{
|
|
ServerID: s.serverID,
|
|
Status: "healthy",
|
|
LastHealthCheck: time.Now(),
|
|
}
|
|
|
|
// Database health check
|
|
dbHealth, err := s.checkDatabaseHealth()
|
|
if err != nil {
|
|
health.HealthIssues = append(health.HealthIssues, HealthIssue{
|
|
Type: "database",
|
|
Message: fmt.Sprintf("Database health check failed: %v", err),
|
|
Severity: "critical",
|
|
})
|
|
health.Status = "unhealthy"
|
|
}
|
|
health.DatabaseStatus = *dbHealth
|
|
|
|
// Scheduler health check
|
|
schedulerHealth := s.checkSchedulerHealth()
|
|
health.SchedulerStatus = *schedulerHealth
|
|
|
|
// System metrics
|
|
systemMetrics := s.getSystemMetrics()
|
|
health.SystemMetrics = *systemMetrics
|
|
|
|
// Overall status determination
|
|
health.determineOverallStatus()
|
|
|
|
return health, nil
|
|
}
|
|
```
|
|
|
|
### 2. Database Health Monitoring
|
|
|
|
**Database Connection Health**:
|
|
```go
|
|
func (s *HealthService) checkDatabaseHealth() (*DatabaseHealth, error) {
|
|
start := time.Now()
|
|
|
|
// Test database connection
|
|
var result int
|
|
err := s.db.QueryRow("SELECT 1").Scan(&result)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("database connection failed: %w", err)
|
|
}
|
|
|
|
responseTime := time.Since(start).Seconds()
|
|
|
|
// Get connection pool stats
|
|
stats := s.db.Stats()
|
|
|
|
return &DatabaseHealth{
|
|
Status: "healthy",
|
|
ConnectionPool: stats.OpenConnections,
|
|
ResponseTime: responseTime,
|
|
LastChecked: time.Now(),
|
|
}, nil
|
|
}
|
|
```
|
|
|
|
### 3. API Endpoint
|
|
|
|
**Health API Handler** (`aggregator-server/internal/api/handlers/health.go`):
|
|
```go
|
|
// GET /api/v1/health
|
|
func (h *HealthHandler) GetServerHealth(c *gin.Context) {
|
|
health, err := h.healthService.CheckServerHealth()
|
|
if err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{
|
|
"status": "unhealthy",
|
|
"error": err.Error(),
|
|
})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, health)
|
|
}
|
|
|
|
// GET /api/v1/health/history
|
|
func (h *HealthHandler) GetHealthHistory(c *gin.Context) {
|
|
// Return historical health data for charts
|
|
}
|
|
```
|
|
|
|
### 4. Frontend Dashboard Component
|
|
|
|
**Server Health Dashboard** (`aggregator-web/src/pages/ServerHealth.tsx`):
|
|
```typescript
|
|
/**
 * JSON payload returned by GET /api/v1/health.
 * Field names follow the json tags of the Go ServerHealth struct.
 */
interface ServerHealth {
  server_id: string;
  status: 'healthy' | 'degraded' | 'unhealthy';
  /** Go time.Duration, which is JSON-encoded as an integer number of nanoseconds. */
  uptime: number;
  version: string;
  database_status: {
    status: string;
    connection_pool: number;
    response_time: number;
    /** Go time.Time, JSON-encoded as an RFC 3339 string. */
    last_checked: string;
  };
  scheduler_status: {
    status: string;
    running_jobs: number;
    queue_length: number;
    /** Go time.Time, JSON-encoded as an RFC 3339 string. */
    last_job_execution: string;
  };
  // NOTE(review): the Go SystemMetrics struct is not shown in this spec;
  // confirm these keys against its json tags.
  system_metrics: {
    cpu_usage: number;
    memory_usage: number;
    disk_usage: number;
  };
  /** Go time.Time, JSON-encoded as an RFC 3339 string. */
  last_health_check: string;
  health_issues: Array<{
    type: string;
    message: string;
    severity: 'info' | 'warning' | 'critical';
  }>;
}
|
|
|
|
/** How often the dashboard re-polls the health endpoint while auto-refresh is on. */
const HEALTH_REFRESH_INTERVAL_MS = 30_000;

/** Placeholder rendered for a metric that has not been loaded yet. */
const NO_DATA = '—';

/** Data type accepted by HealthChart, derived from the component so the two cannot drift. */
type HealthChartSeries = React.ComponentProps<typeof HealthChart>['data'];

/**
 * NOTE(review): assumed shape of the GET /api/v1/health/history payload. The
 * endpoint is still a stub, so confirm this once it is implemented.
 */
interface HealthHistory {
  responseTime: HealthChartSeries;
  systemLoad: HealthChartSeries;
}

/**
 * Server health page: overall status, per-subsystem cards, the list of
 * reported issues and the historical charts.
 *
 * Data is loaded on mount, on demand through the refresh button, and
 * periodically while the auto-refresh toggle is enabled.
 */
const ServerHealthDashboard: React.FC = () => {
  const [health, setHealth] = useState<ServerHealth | null>(null);
  const [historicalData, setHistoricalData] = useState<HealthHistory | null>(null);
  const [loadError, setLoadError] = useState<string | null>(null);
  const [autoRefresh, setAutoRefresh] = useState(true);

  // NOTE(review): uses fetch directly; switch to the shared client in
  // src/lib/api.ts once it exposes the health endpoints.
  const fetchHealthData = React.useCallback(async () => {
    try {
      const response = await fetch('/api/v1/health');
      // Error responses carry a JSON body too, but not a ServerHealth, so
      // only successful responses are stored.
      if (!response.ok) {
        throw new Error(`Health request failed (HTTP ${response.status})`);
      }
      setHealth((await response.json()) as ServerHealth);
      setLoadError(null);
    } catch (err) {
      setLoadError(err instanceof Error ? err.message : String(err));
    }

    // History is optional: a failure here must not hide the live status.
    try {
      const response = await fetch('/api/v1/health/history');
      if (response.ok) {
        setHistoricalData((await response.json()) as HealthHistory);
      }
    } catch {
      // Keep whatever history was loaded previously.
    }
  }, []);

  // Initial load.
  React.useEffect(() => {
    void fetchHealthData();
  }, [fetchHealthData]);

  // Periodic refresh, active only while the toggle is on.
  React.useEffect(() => {
    if (!autoRefresh) {
      return undefined;
    }
    const timer = window.setInterval(() => {
      void fetchHealthData();
    }, HEALTH_REFRESH_INTERVAL_MS);
    return () => window.clearInterval(timer);
  }, [autoRefresh, fetchHealthData]);

  // Formatters that show a placeholder instead of "undefined%" / "undefinedms".
  const percent = (value?: number) => (value == null ? NO_DATA : `${value}%`);
  const millis = (value?: number) => (value == null ? NO_DATA : `${value.toFixed(2)}ms`);

  return (
    <div className="server-health-dashboard">
      <div className="health-header">
        <h2>Server Health</h2>
        <div className="health-controls">
          <RefreshToggle enabled={autoRefresh} onChange={setAutoRefresh} />
          <RefreshButton onClick={() => fetchHealthData()} />
        </div>
      </div>

      {loadError && (
        <div className="health-error" role="alert">
          {loadError}
        </div>
      )}

      {/* Overall Status */}
      <div className="overall-status">
        <StatusIndicator
          status={health?.status || 'unknown'}
          message={`Server ${health?.status || 'unknown'}`}
        />
        <div className="uptime">
          Uptime: {formatDuration(health?.uptime || 0)}
        </div>
      </div>

      {/* Health Metrics Grid */}
      <div className="health-metrics-grid">
        <HealthCard
          title="Database"
          status={health?.database_status.status ?? 'unknown'}
          metrics={[
            { label: "Connections", value: health?.database_status.connection_pool ?? NO_DATA },
            { label: "Response Time", value: millis(health?.database_status.response_time) }
          ]}
        />
        <HealthCard
          title="Scheduler"
          status={health?.scheduler_status.status ?? 'unknown'}
          metrics={[
            { label: "Running Jobs", value: health?.scheduler_status.running_jobs ?? NO_DATA },
            { label: "Queue Length", value: health?.scheduler_status.queue_length ?? NO_DATA }
          ]}
        />
        {/* NOTE(review): the payload carries no status for system metrics, so
            the card reports "healthy" whenever data is present; derive it
            from the alert thresholds once those are implemented. */}
        <HealthCard
          title="System Resources"
          status={health ? 'healthy' : 'unknown'}
          metrics={[
            { label: "CPU", value: percent(health?.system_metrics.cpu_usage) },
            { label: "Memory", value: percent(health?.system_metrics.memory_usage) },
            { label: "Disk", value: percent(health?.system_metrics.disk_usage) }
          ]}
        />
      </div>

      {/* Health Issues */}
      {health?.health_issues && health.health_issues.length > 0 && (
        <div className="health-issues">
          <h3>Health Issues</h3>
          {health.health_issues.map((issue, index) => (
            <HealthIssueAlert key={index} issue={issue} />
          ))}
        </div>
      )}

      {/* Historical Charts */}
      <div className="health-charts">
        <h3>Historical Health Data</h3>
        {historicalData ? (
          <div className="charts-grid">
            <HealthChart
              title="Response Time"
              data={historicalData.responseTime}
              unit="ms"
            />
            <HealthChart
              title="System Load"
              data={historicalData.systemLoad}
              unit="%"
            />
          </div>
        ) : (
          <p className="charts-empty">No historical data available yet.</p>
        )}
      </div>
    </div>
  );
};
|
|
```
|
|
|
|
### 5. Health Monitoring Components
|
|
|
|
**Status Indicator Component**:
|
|
```typescript
|
|
/** Colour class for each known status; anything else falls back to gray. */
const STATUS_COLORS = new Map<string, string>([
  ['healthy', 'green'],
  ['degraded', 'yellow'],
  ['unhealthy', 'red'],
]);

/**
 * Coloured dot plus message summarising a status.
 * The colour name is appended to the wrapper's class list.
 */
const StatusIndicator: React.FC<{ status: string; message: string }> = ({ status, message }) => {
  const colorClass = STATUS_COLORS.get(status) ?? 'gray';

  return (
    <div className={`status-indicator ${colorClass}`}>
      <div className="status-dot"></div>
      <span className="status-message">{message}</span>
    </div>
  );
};
|
|
```
|
|
|
|
**Health Card Component**:
|
|
```typescript
|
|
/** Props for HealthCard. */
interface HealthCardProps {
  title: string;
  /** Subsystem status; may be omitted while the health payload is still loading. */
  status?: string;
  /** Label/value pairs listed in the card body; a missing value renders as a dash. */
  metrics: Array<{ label: string; value?: string | number | null }>;
}

/**
 * Card showing one subsystem: title, status badge and a list of metrics.
 *
 * The dashboard reads its values through optional chaining, so both the
 * status and individual metric values can be undefined before the first
 * response arrives; they fall back to "unknown" and a dash instead of
 * rendering "status-undefined" or an empty value.
 */
const HealthCard: React.FC<HealthCardProps> = ({ title, status = 'unknown', metrics }) => {
  return (
    <div className={`health-card status-${status}`}>
      <div className="card-header">
        <h3>{title}</h3>
        <StatusBadge status={status} />
      </div>
      <div className="card-metrics">
        {metrics.map((metric, index) => (
          <div key={index} className="metric">
            <span className="metric-label">{metric.label}:</span>
            <span className="metric-value">{metric.value ?? '—'}</span>
          </div>
        ))}
      </div>
    </div>
  );
};
|
|
```
|
|
|
|
## Definition of Done
|
|
|
|
- [ ] Server health monitoring service implemented
- [ ] Database, scheduler, and system resource health checks
- [ ] Real-time health dashboard with status indicators
- [ ] Historical health data tracking and visualization
- [ ] Alert system for critical health issues
- [ ] Auto-refresh functionality
- [ ] Mobile-responsive design
- [ ] Integration with existing settings page
|
|
|
|
## Test Plan
|
|
|
|
1. **Unit Tests**
   - Health check calculations
   - Status determination logic
   - Error handling scenarios

2. **Integration Tests**
   - Database health check under load
   - Scheduler monitoring accuracy
   - System metrics collection

3. **Stress Tests**
   - Dashboard performance under high load
   - Health check impact on system resources
   - Concurrent health monitoring

4. **Scenario Tests**
   - Database connection failures
   - High system load conditions
   - Scheduler queue overflow scenarios
|
|
|
|
## Files to Modify
|
|
|
|
- `aggregator-server/internal/services/health_service.go` - New service
|
|
- `aggregator-server/internal/api/handlers/health.go` - New handlers
|
|
- `aggregator-web/src/pages/ServerHealth.tsx` - New dashboard
|
|
- `aggregator-web/src/components/StatusIndicator.tsx` - Status components
|
|
- `aggregator-web/src/components/HealthCard.tsx` - Health card component
|
|
- `aggregator-web/src/lib/api.ts` - API integration
|
|
|
|
## Health Check Categories
|
|
|
|
### 1. System Health
|
|
- CPU usage percentage
|
|
- Memory usage percentage
|
|
- Disk space availability
|
|
- Network connectivity
|
|
|
|
### 2. Application Health
|
|
- Database connectivity and response time
|
|
- API server responsiveness
|
|
- Scheduler operation status
|
|
- Background service status
|
|
|
|
### 3. Business Logic Health
|
|
- Agent registration flow
|
|
- Command queue processing
|
|
- Update distribution
|
|
- Token management
|
|
|
|
## Alert Thresholds
|
|
|
|
### Critical Alerts
|
|
- Database connection failures
|
|
- CPU usage > 90% for > 5 minutes
|
|
- Memory usage > 95%
|
|
- Scheduler queue length > 1000
|
|
|
|
### Warning Alerts
|
|
- Database response time > 1 second
|
|
- CPU usage > 80% for > 10 minutes
|
|
- Memory usage > 85%
|
|
- Queue length > 500
|
|
|
|
## Estimated Effort
|
|
|
|
- **Development**: 16-20 hours
|
|
- **Testing**: 8-12 hours
|
|
- **Review**: 4-6 hours
|
|
- **Design/UX**: 6-8 hours
|
|
|
|
## Dependencies
|
|
|
|
- Existing monitoring infrastructure
|
|
- System metrics collection
|
|
- Database connection pooling
|
|
- Background job processing
|
|
|
|
## Risk Assessment
|
|
|
|
**Low Risk** - Enhancement that adds monitoring capabilities without affecting core functionality. Health checks are read-only operations with minimal system impact.
|
|
|
|
## Implementation Phases
|
|
|
|
### Phase 1: Core Health Service
|
|
1. Implement health check service
|
|
2. Create health monitoring endpoints
|
|
3. Basic status determination logic
|
|
|
|
### Phase 2: Dashboard UI
|
|
1. Create health dashboard layout
|
|
2. Implement status indicators and metrics
|
|
3. Add real-time updates
|
|
|
|
### Phase 3: Advanced Features
|
|
1. Historical data tracking
|
|
2. Alert system integration
|
|
3. Performance optimization
|
|
|
|
## Future Enhancements
|
|
|
|
1. **Multi-Server Monitoring**: Support for clustered deployments
|
|
2. **Predictive Health**: ML-based health prediction
|
|
3. **Automated Remediation**: Self-healing capabilities
|
|
4. **Integration with External Monitoring**: Prometheus, Grafana
|
|
5. **Custom Health Checks**: Pluggable health check system |