Add docs and project files - force for Culurien

2026-03-28 20:46:24 -04:00
parent dc61797423
commit 484a7f77ce
343 changed files with 119530 additions and 0 deletions
--- a/docs/3_BACKLOG/P3-005_Server-Health-Dashboard.md
+++ b/docs/3_BACKLOG/P3-005_Server-Health-Dashboard.md
@@ -0,0 +1,432 @@
+# Server Health Dashboard Component
+
+**Priority**: P3 (Enhancement)
+**Source Reference**: From todos.md line 6
+**Status**: Ready for Implementation
+
+## Problem Statement
+
+Administrators lack visibility into server health status, coordination components, and overall system performance. There is no centralized dashboard showing server agent/coordinator selection mechanisms, version verification, config validation, or health check integration.
+
+## Feature Description
+
+Create a Server Health Dashboard that provides real-time monitoring of server status, health indicators, coordination components, and performance metrics to help administrators understand system health and troubleshoot issues.
+
+## Acceptance Criteria
+
+1. Real-time server status monitoring dashboard
+2. Health check integration with settings page
+3. Server agent/coordinator selection mechanism visibility
+4. Version verification and config validation status
+5. Performance metrics display (CPU, memory, disk, database connections)
+6. Alert thresholds for critical server health issues
+7. Historical health data tracking
+8. System status indicators (database, API, scheduler)
+
+## Technical Approach
+
+### 1. Server Health Service
+
+**Health Monitoring Service** (`aggregator-server/internal/services/health_service.go`):
+```go
+// ServerHealth is the snapshot returned by GET /api/v1/health and rendered by
+// the dashboard. Its JSON field names must stay in sync with the ServerHealth
+// interface in aggregator-web/src/pages/ServerHealth.tsx.
+type ServerHealth struct {
+    ServerID         string            `json:"server_id"`
+    Status           string            `json:"status"` // "healthy", "degraded", "unhealthy"
+    // Uptime is a time.Duration, which encoding/json emits as an integer
+    // number of nanoseconds; clients must convert it before display.
+    Uptime           time.Duration     `json:"uptime"`
+    Version          string            `json:"version"`
+    DatabaseStatus   DatabaseHealth    `json:"database_status"`
+    SchedulerStatus  SchedulerHealth   `json:"scheduler_status"`
+    // NOTE(review): APIServerHealth is not defined anywhere in this document;
+    // its shape still has to be specified before implementation.
+    APIServerStatus  APIServerHealth   `json:"api_server_status"`
+    SystemMetrics    SystemMetrics     `json:"system_metrics"`
+    LastHealthCheck  time.Time         `json:"last_health_check"`
+    // HealthIssues is encoded as JSON null (not []) while the slice is nil,
+    // so clients must treat health_issues as nullable.
+    HealthIssues     []HealthIssue     `json:"health_issues"`
+}
+
+// HealthIssue describes a single problem found during a health check.
+type HealthIssue struct {
+    Type     string `json:"type"`     // subsystem that raised the issue, e.g. "database"
+    Message  string `json:"message"`  // human-readable description
+    Severity string `json:"severity"` // "info", "warning", "critical"
+}
+
+// SystemMetrics holds host resource usage. The dashboard renders each value
+// with a "%" suffix, so the values are expected to be percentages.
+type SystemMetrics struct {
+    CPUUsage    float64 `json:"cpu_usage"`
+    MemoryUsage float64 `json:"memory_usage"`
+    DiskUsage   float64 `json:"disk_usage"`
+}
+
+// DatabaseHealth is the result of a single database probe.
+type DatabaseHealth struct {
+    // Status is the probe verdict, e.g. "healthy".
+    Status         string    `json:"status"`
+    // ConnectionPool is the number of open connections reported by the pool.
+    ConnectionPool int       `json:"connection_pool"`
+    // ResponseTime is the round-trip time of the probe query.
+    ResponseTime   float64   `json:"response_time"`
+    // LastChecked is when the probe completed.
+    LastChecked    time.Time `json:"last_checked"`
+}
+
+// SchedulerHealth summarizes the scheduler's state. Status, RunningJobs and
+// QueueLength are the values shown on the dashboard's Scheduler card.
+type SchedulerHealth struct {
+    // Status is the scheduler verdict displayed as the card badge.
+    Status           string    `json:"status"`
+    // RunningJobs is displayed as "Running Jobs".
+    RunningJobs      int       `json:"running_jobs"`
+    // QueueLength is displayed as "Queue Length".
+    QueueLength      int       `json:"queue_length"`
+    // LastJobExecution is serialized but not yet shown by the dashboard.
+    LastJobExecution time.Time `json:"last_job_execution"`
+}
+```
+
+**Health Check Implementation**:
+```go
+// CheckServerHealth runs every component probe and assembles the results into
+// a single ServerHealth snapshot.
+//
+// Probe failures are reported inside the snapshot (HealthIssues plus Status)
+// rather than through the error return, which is currently always nil.
+func (s *HealthService) CheckServerHealth() (*ServerHealth, error) {
+    health := &ServerHealth{
+        ServerID:        s.serverID,
+        Status:          "healthy",
+        LastHealthCheck: time.Now(),
+    }
+
+    // Database health check
+    dbHealth, err := s.checkDatabaseHealth()
+    switch {
+    case err != nil:
+        // checkDatabaseHealth returns a nil pointer alongside the error, so
+        // record the failure explicitly instead of dereferencing dbHealth.
+        health.HealthIssues = append(health.HealthIssues, HealthIssue{
+            Type:     "database",
+            Message:  fmt.Sprintf("Database health check failed: %v", err),
+            Severity: "critical",
+        })
+        health.Status = "unhealthy"
+        health.DatabaseStatus = DatabaseHealth{
+            Status:      "unhealthy",
+            LastChecked: time.Now(),
+        }
+    case dbHealth != nil:
+        health.DatabaseStatus = *dbHealth
+    }
+
+    // Scheduler health check; a nil result leaves the zero value in place.
+    if schedulerHealth := s.checkSchedulerHealth(); schedulerHealth != nil {
+        health.SchedulerStatus = *schedulerHealth
+    }
+
+    // System metrics; a nil result leaves the zero value in place.
+    if systemMetrics := s.getSystemMetrics(); systemMetrics != nil {
+        health.SystemMetrics = *systemMetrics
+    }
+
+    // Overall status determination.
+    // NOTE(review): determineOverallStatus is not shown in this document; it
+    // must not upgrade an "unhealthy" status set above — confirm.
+    health.determineOverallStatus()
+
+    return health, nil
+}
+```
+
+### 2. Database Health Monitoring
+
+**Database Connection Health**:
+```go
+// checkDatabaseHealth probes the database with a trivial query and reports the
+// round-trip time together with connection pool statistics.
+//
+// ResponseTime is expressed in milliseconds, matching the "ms" unit the
+// dashboard appends to this value. On failure it returns a nil *DatabaseHealth
+// and a wrapped error.
+func (s *HealthService) checkDatabaseHealth() (*DatabaseHealth, error) {
+    start := time.Now()
+
+    // Test database connection
+    var result int
+    if err := s.db.QueryRow("SELECT 1").Scan(&result); err != nil {
+        return nil, fmt.Errorf("database connection failed: %w", err)
+    }
+    finished := time.Now()
+
+    // A Duration is an integer nanosecond count; dividing in floating point
+    // keeps sub-millisecond precision.
+    responseTimeMs := float64(finished.Sub(start)) / float64(time.Millisecond)
+
+    // Get connection pool stats
+    stats := s.db.Stats()
+
+    return &DatabaseHealth{
+        Status:         "healthy",
+        ConnectionPool: stats.OpenConnections,
+        ResponseTime:   responseTimeMs,
+        LastChecked:    finished,
+    }, nil
+}
+```
+
+### 3. API Endpoint
+
+**Health API Handler** (`aggregator-server/internal/api/handlers/health.go`):
+```go
+// GetServerHealth handles GET /api/v1/health.
+//
+// It responds 200 with the full ServerHealth snapshot, or 500 with a small
+// {"status": "unhealthy", "error": ...} object when the health service itself
+// returns an error.
+func (h *HealthHandler) GetServerHealth(c *gin.Context) {
+    snapshot, checkErr := h.healthService.CheckServerHealth()
+    if checkErr == nil {
+        c.JSON(http.StatusOK, snapshot)
+        return
+    }
+
+    failure := gin.H{
+        "status": "unhealthy",
+        "error":  checkErr.Error(),
+    }
+    c.JSON(http.StatusInternalServerError, failure)
+}
+
+// GetHealthHistory handles GET /api/v1/health/history.
+func (h *HealthHandler) GetHealthHistory(c *gin.Context) {
+    // TODO: not implemented yet. Intended to return historical health data for the dashboard charts; the response shape is still to be defined.
+}
+```
+
+### 4. Frontend Dashboard Component
+
+**Server Health Dashboard** (`aggregator-web/src/pages/ServerHealth.tsx`):
+```typescript
+/**
+ * JSON payload of `GET /api/v1/health`, mirroring the Go `ServerHealth` struct.
+ *
+ * The server also emits `api_server_status`; it is omitted here because its
+ * shape is not specified yet.
+ */
+interface ServerHealth {
+    server_id: string;
+    status: 'healthy' | 'degraded' | 'unhealthy';
+    /** Go `time.Duration`, serialized as an integer number of nanoseconds. */
+    uptime: number;
+    version: string;
+    database_status: {
+        status: string;
+        connection_pool: number;
+        response_time: number;
+        /** Go `time.Time`, serialized as an RFC 3339 string. */
+        last_checked: string;
+    };
+    scheduler_status: {
+        status: string;
+        running_jobs: number;
+        queue_length: number;
+        /** Go `time.Time`, serialized as an RFC 3339 string. */
+        last_job_execution: string;
+    };
+    system_metrics: {
+        cpu_usage: number;
+        memory_usage: number;
+        disk_usage: number;
+    };
+    /** Go `time.Time`, serialized as an RFC 3339 string. */
+    last_health_check: string;
+    /** May be `null`: Go encodes a nil slice as JSON `null`, not `[]`. */
+    health_issues: Array<{
+        type: string;
+        message: string;
+        severity: 'info' | 'warning' | 'critical';
+    }> | null;
+}
+
+/** How often the dashboard re-polls the health endpoint while auto-refresh is on. */
+const HEALTH_REFRESH_INTERVAL_MS = 30000;
+
+/** Placeholder rendered for metrics that have not been loaded yet. */
+const NO_DATA = '—';
+
+/** Series backing the historical charts. */
+interface HealthHistory {
+    responseTime: number[];
+    systemLoad: number[];
+}
+
+/**
+ * Formats a possibly-missing numeric metric for display.
+ *
+ * @param value - The metric, or `undefined` while no health data is loaded.
+ * @param unit - Suffix appended to the number (e.g. `'%'`, `'ms'`).
+ * @param fractionDigits - When given, the value is rendered with `toFixed`.
+ * @returns The formatted text, or a dash placeholder when `value` is missing.
+ */
+const formatMetric = (value: number | undefined, unit = '', fractionDigits?: number): string => {
+    if (value === undefined) {
+        return NO_DATA;
+    }
+    const text = fractionDigits === undefined ? String(value) : value.toFixed(fractionDigits);
+    return `${text}${unit}`;
+};
+
+/**
+ * Dashboard page showing overall server status, per-component health cards,
+ * outstanding health issues, and historical charts.
+ *
+ * Health data is loaded on mount, on demand via the refresh button, and
+ * periodically while auto-refresh is enabled.
+ */
+const ServerHealthDashboard: React.FC = () => {
+    const [health, setHealth] = useState<ServerHealth | null>(null);
+    const [autoRefresh, setAutoRefresh] = useState(true);
+    const [loadError, setLoadError] = useState<string | null>(null);
+    // NOTE(review): stays empty until GET /api/v1/health/history is implemented
+    // and its response shape is defined.
+    const [historicalData] = useState<HealthHistory>({ responseTime: [], systemLoad: [] });
+
+    const fetchHealthData = React.useCallback(async (): Promise<void> => {
+        try {
+            const response = await fetch('/api/v1/health');
+            if (!response.ok) {
+                throw new Error(`Health request failed with status ${response.status}`);
+            }
+            // NOTE(review): the payload is trusted to match ServerHealth.
+            setHealth((await response.json()) as ServerHealth);
+            setLoadError(null);
+        } catch (e: unknown) {
+            // Keep the last known snapshot on screen and surface the failure.
+            setLoadError(e instanceof Error ? e.message : String(e));
+        }
+    }, []);
+
+    // Initial load.
+    React.useEffect(() => {
+        void fetchHealthData();
+    }, [fetchHealthData]);
+
+    // Periodic polling, active only while auto-refresh is switched on.
+    React.useEffect(() => {
+        if (!autoRefresh) {
+            return undefined;
+        }
+        const timer = setInterval(() => void fetchHealthData(), HEALTH_REFRESH_INTERVAL_MS);
+        return () => clearInterval(timer);
+    }, [autoRefresh, fetchHealthData]);
+
+    // The server may send null instead of an empty list.
+    const issues = health?.health_issues ?? [];
+
+    return (
+        <div className="server-health-dashboard">
+            <div className="health-header">
+                <h2>Server Health</h2>
+                <div className="health-controls">
+                    <RefreshToggle enabled={autoRefresh} onChange={setAutoRefresh} />
+                    <RefreshButton onClick={() => void fetchHealthData()} />
+                </div>
+            </div>
+
+            {loadError !== null && (
+                <div className="health-error" role="alert">
+                    Failed to load server health: {loadError}
+                </div>
+            )}
+
+            {/* Overall Status */}
+            <div className="overall-status">
+                <StatusIndicator
+                    status={health?.status ?? 'unknown'}
+                    message={`Server ${health?.status ?? 'unknown'}`}
+                />
+                <div className="uptime">
+                    Uptime: {formatDuration(health?.uptime ?? 0)}
+                </div>
+            </div>
+
+            {/* Health Metrics Grid */}
+            <div className="health-metrics-grid">
+                <HealthCard
+                    title="Database"
+                    status={health?.database_status.status ?? 'unknown'}
+                    metrics={[
+                        { label: "Connections", value: formatMetric(health?.database_status.connection_pool) },
+                        { label: "Response Time", value: formatMetric(health?.database_status.response_time, 'ms', 2) }
+                    ]}
+                />
+                <HealthCard
+                    title="Scheduler"
+                    status={health?.scheduler_status.status ?? 'unknown'}
+                    metrics={[
+                        { label: "Running Jobs", value: formatMetric(health?.scheduler_status.running_jobs) },
+                        { label: "Queue Length", value: formatMetric(health?.scheduler_status.queue_length) }
+                    ]}
+                />
+                {/* NOTE(review): the API exposes no per-resource status, so this card cannot reflect threshold breaches yet. */}
+                <HealthCard
+                    title="System Resources"
+                    status={health ? 'healthy' : 'unknown'}
+                    metrics={[
+                        { label: "CPU", value: formatMetric(health?.system_metrics.cpu_usage, '%') },
+                        { label: "Memory", value: formatMetric(health?.system_metrics.memory_usage, '%') },
+                        { label: "Disk", value: formatMetric(health?.system_metrics.disk_usage, '%') }
+                    ]}
+                />
+            </div>
+
+            {/* Health Issues */}
+            {issues.length > 0 && (
+                <div className="health-issues">
+                    <h3>Health Issues</h3>
+                    {issues.map((issue, index) => (
+                        <HealthIssueAlert key={index} issue={issue} />
+                    ))}
+                </div>
+            )}
+
+            {/* Historical Charts */}
+            <div className="health-charts">
+                <h3>Historical Health Data</h3>
+                <div className="charts-grid">
+                    <HealthChart
+                        title="Response Time"
+                        data={historicalData.responseTime}
+                        unit="ms"
+                    />
+                    <HealthChart
+                        title="System Load"
+                        data={historicalData.systemLoad}
+                        unit="%"
+                    />
+                </div>
+            </div>
+        </div>
+    );
+};
+```
+
+### 5. Health Monitoring Components
+
+**Status Indicator Component**:
+```typescript
+/** CSS color class for each known server status; anything else renders gray. */
+const STATUS_COLORS: ReadonlyMap<string, string> = new Map([
+    ['healthy', 'green'],
+    ['degraded', 'yellow'],
+    ['unhealthy', 'red'],
+]);
+
+/**
+ * Colored dot plus a message describing a status.
+ *
+ * The color class is green, yellow or red for "healthy", "degraded" and
+ * "unhealthy" respectively, and gray for any other status string.
+ */
+const StatusIndicator: React.FC<{ status: string; message: string }> = (props) => {
+    const colorClass = STATUS_COLORS.get(props.status) ?? 'gray';
+
+    return (
+        <div className={`status-indicator ${colorClass}`}>
+            <div className="status-dot"></div>
+            <span className="status-message">{props.message}</span>
+        </div>
+    );
+};
+```
+
+**Health Card Component**:
+```typescript
+/** Props accepted by {@link HealthCard}. */
+interface HealthCardProps {
+    /** Heading shown at the top of the card. */
+    title: string;
+    /** Status string; used for the `status-*` CSS class and the badge. */
+    status: string;
+    /** Label/value rows rendered in the order given. */
+    metrics: Array<{ label: string; value: string | number }>;
+}
+
+/** Card showing one component's status badge and a list of labelled metrics. */
+const HealthCard: React.FC<HealthCardProps> = (props) => {
+    const { title, status, metrics } = props;
+
+    // Rows are keyed by position because labels are not guaranteed unique.
+    const metricRows = metrics.map(({ label, value }, position) => (
+        <div key={position} className="metric">
+            <span className="metric-label">{label}:</span>
+            <span className="metric-value">{value}</span>
+        </div>
+    ));
+
+    return (
+        <div className={`health-card status-${status}`}>
+            <div className="card-header">
+                <h3>{title}</h3>
+                <StatusBadge status={status} />
+            </div>
+            <div className="card-metrics">
+                {metricRows}
+            </div>
+        </div>
+    );
+};
+```
+
+## Definition of Done
+
+- ✅ Server health monitoring service implemented
+- ✅ Database, scheduler, and system resource health checks
+- ✅ Real-time health dashboard with status indicators
+- ✅ Historical health data tracking and visualization
+- ✅ Alert system for critical health issues
+- ✅ Auto-refresh functionality
+- ✅ Mobile-responsive design
+- ✅ Integration with existing settings page
+
+## Test Plan
+
+1. **Unit Tests**
+   - Health check calculations
+   - Status determination logic
+   - Error handling scenarios
+
+2. **Integration Tests**
+   - Database health check under load
+   - Scheduler monitoring accuracy
+   - System metrics collection
+
+3. **Stress Tests**
+   - Dashboard performance under high load
+   - Health check impact on system resources
+   - Concurrent health monitoring
+
+4. **Scenario Tests**
+   - Database connection failures
+   - High system load conditions
+   - Scheduler queue overflow scenarios
+
+## Files to Create or Modify
+
+- `aggregator-server/internal/services/health_service.go` - New service
+- `aggregator-server/internal/api/handlers/health.go` - New handlers
+- `aggregator-web/src/pages/ServerHealth.tsx` - New dashboard
+- `aggregator-web/src/components/StatusIndicator.tsx` - Status components
+- `aggregator-web/src/components/HealthCard.tsx` - Health card component
+- `aggregator-web/src/lib/api.ts` - API integration
+
+## Health Check Categories
+
+### 1. System Health
+- CPU usage percentage
+- Memory usage percentage
+- Disk space availability
+- Network connectivity
+
+### 2. Application Health
+- Database connectivity and response time
+- API server responsiveness
+- Scheduler operation status
+- Background service status
+
+### 3. Business Logic Health
+- Agent registration flow
+- Command queue processing
+- Update distribution
+- Token management
+
+## Alert Thresholds
+
+### Critical Alerts
+- Database connection failures
+- CPU usage > 90% for > 5 minutes
+- Memory usage > 95%
+- Scheduler queue length > 1000
+
+### Warning Alerts
+- Database response time > 1 second
+- CPU usage > 80% for > 10 minutes
+- Memory usage > 85%
+- Scheduler queue length > 500
+
+## Estimated Effort
+
+- **Development**: 16-20 hours
+- **Testing**: 8-12 hours
+- **Review**: 4-6 hours
+- **Design/UX**: 6-8 hours
+
+## Dependencies
+
+- Existing monitoring infrastructure
+- System metrics collection
+- Database connection pooling
+- Background job processing
+
+## Risk Assessment
+
+**Low Risk** - Enhancement that adds monitoring capabilities without affecting core functionality. Health checks are read-only operations with minimal system impact.
+
+## Implementation Phases
+
+### Phase 1: Core Health Service
+1. Implement health check service
+2. Create health monitoring endpoints
+3. Basic status determination logic
+
+### Phase 2: Dashboard UI
+1. Create health dashboard layout
+2. Implement status indicators and metrics
+3. Add real-time updates
+
+### Phase 3: Advanced Features
+1. Historical data tracking
+2. Alert system integration
+3. Performance optimization
+
+## Future Enhancements
+
+1. **Multi-Server Monitoring**: Support for clustered deployments
+2. **Predictive Health**: ML-based health prediction
+3. **Automated Remediation**: Self-healing capabilities
+4. **Integration with External Monitoring**: Prometheus, Grafana
+5. **Custom Health Checks**: Pluggable health check system