diff --git a/aggregator-agent/cmd/agent/main.go b/aggregator-agent/cmd/agent/main.go index c49349a..c7403f4 100644 --- a/aggregator-agent/cmd/agent/main.go +++ b/aggregator-agent/cmd/agent/main.go @@ -57,8 +57,8 @@ func reportLogWithAck(apiClient *client.Client, cfg *config.Config, ackTracker * log.Printf("Warning: Failed to save acknowledgment for command %s: %v", logReport.CommandID, err) } - // Report the log to the server - if err := reportLogWithAck(apiClient, cfg, ackTracker, logReport); err != nil { + // Report the log to the server (FIX: was calling itself recursively!) + if err := apiClient.ReportLog(cfg.AgentID, logReport); err != nil { // If reporting failed, increment retry count but don't remove from pending ackTracker.IncrementRetry(logReport.CommandID) return err @@ -623,7 +623,12 @@ func runAgent(cfg *config.Config) error { pendingAcks := ackTracker.GetPending() if len(pendingAcks) > 0 { metrics.PendingAcknowledgments = pendingAcks + log.Printf("Including %d pending acknowledgments in check-in: %v", len(pendingAcks), pendingAcks) + } else { + log.Printf("No pending acknowledgments to send") } + } else { + log.Printf("Metrics is nil - not sending system information or acknowledgments") } // Get commands from server (with optional metrics) diff --git a/aggregator-agent/cmd/agent/subsystem_handlers.go b/aggregator-agent/cmd/agent/subsystem_handlers.go index d52eb11..9c51172 100644 --- a/aggregator-agent/cmd/agent/subsystem_handlers.go +++ b/aggregator-agent/cmd/agent/subsystem_handlers.go @@ -307,10 +307,12 @@ func handleUpdateAgent(apiClient *client.Client, cfg *config.Config, ackTracker // Validate nonce for replay protection log.Printf("[tunturi_ed25519] Validating nonce...") + log.Printf("[SECURITY] Nonce validation - UUID: %s, Timestamp: %s", nonceUUIDStr, nonceTimestampStr) if err := validateNonce(nonceUUIDStr, nonceTimestampStr, nonceSignature); err != nil { + log.Printf("[SECURITY] ✗ Nonce validation FAILED: %v", err) return fmt.Errorf("[tunturi_ed25519] nonce validation failed: %w", err) } - log.Printf("[tunturi_ed25519] ✓ Nonce validated") + log.Printf("[SECURITY] ✓ Nonce validated successfully") // Record start time for duration calculation updateStartTime := time.Now() diff --git a/aggregator-agent/install.sh b/aggregator-agent/install.sh deleted file mode 100755 index 59b479f..0000000 --- a/aggregator-agent/install.sh +++ /dev/null @@ -1,257 +0,0 @@ -#!/bin/bash -set -e - -# RedFlag Agent Installation Script -# This script installs the RedFlag agent as a systemd service with proper permissions - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -AGENT_USER="redflag-agent" -AGENT_HOME="/var/lib/redflag-agent" -AGENT_BINARY="/usr/local/bin/redflag-agent" -SUDOERS_FILE="/etc/sudoers.d/redflag-agent" -SERVICE_FILE="/etc/systemd/system/redflag-agent.service" - -echo "=== RedFlag Agent Installation ===" -echo "" - -# Check if running as root -if [ "$EUID" -ne 0 ]; then - echo "ERROR: This script must be run as root (use sudo)" - exit 1 -fi - -# Function to create user if doesn't exist -create_user() { - if id "$AGENT_USER" &>/dev/null; then - echo "✓ User $AGENT_USER already exists" - else - echo "Creating system user $AGENT_USER..." - useradd -r -s /bin/false -d "$AGENT_HOME" -m "$AGENT_USER" - echo "✓ User $AGENT_USER created" - fi - - # Add user to docker group for Docker update scanning - if getent group docker &>/dev/null; then - echo "Adding $AGENT_USER to docker group..." - usermod -aG docker "$AGENT_USER" - echo "✓ User $AGENT_USER added to docker group" - else - echo "⚠ Docker group not found - Docker updates will not be available" - echo " (Install Docker first, then reinstall the agent to enable Docker support)" - fi -} - -# Function to build agent binary -build_agent() { - echo "Building agent binary..." - cd "$SCRIPT_DIR" - go build -o redflag-agent ./cmd/agent - echo "✓ Agent binary built" -} - -# Function to install agent binary -install_binary() { - echo "Installing agent binary to $AGENT_BINARY..." - cp "$SCRIPT_DIR/redflag-agent" "$AGENT_BINARY" - chmod 755 "$AGENT_BINARY" - chown root:root "$AGENT_BINARY" - echo "✓ Agent binary installed" - - # Set SELinux context for binary if SELinux is enabled - if command -v getenforce >/dev/null 2>&1 && [ "$(getenforce)" != "Disabled" ]; then - echo "SELinux detected, setting file context for binary..." - restorecon -v "$AGENT_BINARY" 2>/dev/null || true - echo "✓ SELinux context set for binary" - fi -} - -# Function to install sudoers configuration -install_sudoers() { - echo "Installing sudoers configuration..." - cat > "$SUDOERS_FILE" <<'EOF' -# RedFlag Agent minimal sudo permissions -# This file is generated automatically during RedFlag agent installation - -# APT package management commands -redflag-agent ALL=(root) NOPASSWD: /usr/bin/apt-get update -redflag-agent ALL=(root) NOPASSWD: /usr/bin/apt-get install -y * -redflag-agent ALL=(root) NOPASSWD: /usr/bin/apt-get upgrade -y * -redflag-agent ALL=(root) NOPASSWD: /usr/bin/apt-get install --dry-run --yes * - -# DNF package management commands -redflag-agent ALL=(root) NOPASSWD: /usr/bin/dnf makecache -redflag-agent ALL=(root) NOPASSWD: /usr/bin/dnf install -y * -redflag-agent ALL=(root) NOPASSWD: /usr/bin/dnf upgrade -y * -redflag-agent ALL=(root) NOPASSWD: /usr/bin/dnf install --assumeno --downloadonly * - -# Docker operations -redflag-agent ALL=(root) NOPASSWD: /usr/bin/docker pull * -redflag-agent ALL=(root) NOPASSWD: /usr/bin/docker image inspect * -redflag-agent ALL=(root) NOPASSWD: /usr/bin/docker manifest inspect * -EOF - - chmod 440 "$SUDOERS_FILE" - - # Validate sudoers file - if visudo -c -f "$SUDOERS_FILE"; then - echo "✓ Sudoers configuration installed and validated" - else - echo "ERROR: Sudoers configuration is invalid" - rm -f "$SUDOERS_FILE" - exit 1 - fi -} - -# Function to install systemd service -install_service() { - echo "Installing systemd service..." - cat > "$SERVICE_FILE" </dev/null 2>&1 && [ "$(getenforce)" != "Disabled" ]; then - echo "Setting SELinux context for config directory..." - restorecon -Rv /etc/aggregator 2>/dev/null || true - echo "✓ SELinux context set for config directory" - fi - - # Register agent (run as regular binary, not as service) - if "$AGENT_BINARY" -register -server "$server_url"; then - echo "✓ Agent registered successfully" - else - echo "ERROR: Agent registration failed" - echo "Please ensure the RedFlag server is running at $server_url" - exit 1 - fi -} - -# Main installation flow -SERVER_URL="${1:-http://localhost:8080}" - -echo "Step 1: Creating system user..." -create_user - -echo "" -echo "Step 2: Building agent binary..." -build_agent - -echo "" -echo "Step 3: Installing agent binary..." -install_binary - -echo "" -echo "Step 4: Registering agent with server..." -register_agent "$SERVER_URL" - -echo "" -echo "Step 5: Setting config file permissions..." -chown redflag-agent:redflag-agent /etc/aggregator/config.json -chmod 600 /etc/aggregator/config.json - -echo "" -echo "Step 6: Installing sudoers configuration..." -install_sudoers - -echo "" -echo "Step 7: Installing systemd service..." -install_service - -echo "" -echo "Step 8: Starting service..." -start_service - -echo "" -echo "=== Installation Complete ===" -echo "" -echo "The RedFlag agent is now installed and running as a systemd service." -echo "Server URL: $SERVER_URL" -echo "" -echo "Useful commands:" -echo " - Check status: sudo systemctl status redflag-agent" -echo " - View logs: sudo journalctl -u redflag-agent -f" -echo " - Restart: sudo systemctl restart redflag-agent" -echo " - Stop: sudo systemctl stop redflag-agent" -echo " - Disable: sudo systemctl disable redflag-agent" -echo "" -echo "Note: To re-register with a different server, edit /etc/aggregator/config.json" -echo "" - -show_status diff --git a/aggregator-agent/internal/config/subsystems.go b/aggregator-agent/internal/config/subsystems.go index 4ee06f5..a1a6765 100644 --- a/aggregator-agent/internal/config/subsystems.go +++ b/aggregator-agent/internal/config/subsystems.go @@ -68,7 +68,7 @@ func GetDefaultSubsystemsConfig() SubsystemsConfig { }, DNF: SubsystemConfig{ Enabled: true, - Timeout: 45 * time.Second, // DNF can be slower + Timeout: 15 * time.Minute, // TODO: Make scanner timeouts user-adjustable via settings. DNF operations can take a long time on large systems CircuitBreaker: defaultCB, }, Docker: SubsystemConfig{ diff --git a/aggregator-server/cmd/server/main.go b/aggregator-server/cmd/server/main.go index 3d2a32f..1bc9802 100644 --- a/aggregator-server/cmd/server/main.go +++ b/aggregator-server/cmd/server/main.go @@ -188,6 +188,9 @@ func main() { // Initialize system handler systemHandler := handlers.NewSystemHandler(signingService) + // Initialize security handler + securityHandler := handlers.NewSecurityHandler(signingService, agentQueries, commandQueries) + // Setup router router := gin.Default() @@ -242,17 +245,6 @@ func main() { verificationHandler.VerifySignature(c) }) agents.DELETE("/:id", agentHandler.UnregisterAgent) - - // Subsystem routes - agents.GET("/:id/subsystems", subsystemHandler.GetSubsystems) - agents.GET("/:id/subsystems/:subsystem", subsystemHandler.GetSubsystem) - agents.PATCH("/:id/subsystems/:subsystem", subsystemHandler.UpdateSubsystem) - agents.POST("/:id/subsystems/:subsystem/enable", subsystemHandler.EnableSubsystem) - agents.POST("/:id/subsystems/:subsystem/disable", subsystemHandler.DisableSubsystem) - agents.POST("/:id/subsystems/:subsystem/trigger", subsystemHandler.TriggerSubsystem) - agents.GET("/:id/subsystems/:subsystem/stats", subsystemHandler.GetSubsystemStats) - agents.POST("/:id/subsystems/:subsystem/auto-run", subsystemHandler.SetAutoRun) - agents.POST("/:id/subsystems/:subsystem/interval", subsystemHandler.SetInterval) } // Dashboard/Web routes (protected by web auth) @@ -263,10 +255,21 @@ func main() { dashboard.GET("/agents", agentHandler.ListAgents) dashboard.GET("/agents/:id", agentHandler.GetAgent) dashboard.POST("/agents/:id/scan", agentHandler.TriggerScan) - dashboard.POST("/agents/:id/update", agentHandler.TriggerUpdate) dashboard.POST("/agents/:id/heartbeat", agentHandler.TriggerHeartbeat) dashboard.GET("/agents/:id/heartbeat", agentHandler.GetHeartbeatStatus) dashboard.POST("/agents/:id/reboot", agentHandler.TriggerReboot) + + // Subsystem routes for web dashboard + dashboard.GET("/agents/:id/subsystems", subsystemHandler.GetSubsystems) + dashboard.GET("/agents/:id/subsystems/:subsystem", subsystemHandler.GetSubsystem) + dashboard.PATCH("/agents/:id/subsystems/:subsystem", subsystemHandler.UpdateSubsystem) + dashboard.POST("/agents/:id/subsystems/:subsystem/enable", subsystemHandler.EnableSubsystem) + dashboard.POST("/agents/:id/subsystems/:subsystem/disable", subsystemHandler.DisableSubsystem) + dashboard.POST("/agents/:id/subsystems/:subsystem/trigger", subsystemHandler.TriggerSubsystem) + dashboard.GET("/agents/:id/subsystems/:subsystem/stats", subsystemHandler.GetSubsystemStats) + dashboard.POST("/agents/:id/subsystems/:subsystem/auto-run", subsystemHandler.SetAutoRun) + dashboard.POST("/agents/:id/subsystems/:subsystem/interval", subsystemHandler.SetInterval) + dashboard.GET("/updates", updateHandler.ListUpdates) dashboard.GET("/updates/:id", updateHandler.GetUpdate) dashboard.GET("/updates/:id/logs", updateHandler.GetUpdateLogs) @@ -326,6 +329,14 @@ func main() { admin.GET("/rate-limits/stats", rateLimiter.RateLimit("admin_operations", middleware.KeyByUserID), rateLimitHandler.GetRateLimitStats) admin.POST("/rate-limits/cleanup", rateLimiter.RateLimit("admin_operations", middleware.KeyByUserID), rateLimitHandler.CleanupRateLimitEntries) } + + // Security Health Check endpoints + dashboard.GET("/security/overview", securityHandler.SecurityOverview) + dashboard.GET("/security/signing", securityHandler.SigningStatus) + dashboard.GET("/security/nonce", securityHandler.NonceValidationStatus) + dashboard.GET("/security/commands", securityHandler.CommandValidationStatus) + dashboard.GET("/security/machine-binding", securityHandler.MachineBindingStatus) + dashboard.GET("/security/metrics", securityHandler.SecurityMetrics) } } @@ -355,7 +366,7 @@ func main() { // Initialize and start scheduler schedulerConfig := scheduler.DefaultConfig() - subsystemScheduler := scheduler.NewScheduler(schedulerConfig, agentQueries, commandQueries) + subsystemScheduler := scheduler.NewScheduler(schedulerConfig, agentQueries, commandQueries, subsystemQueries) // Load subsystems into queue ctx := context.Background() diff --git a/aggregator-server/internal/api/handlers/agents.go b/aggregator-server/internal/api/handlers/agents.go index 0edde0e..f774e6a 100644 --- a/aggregator-server/internal/api/handlers/agents.go +++ b/aggregator-server/internal/api/handlers/agents.go @@ -106,7 +106,8 @@ func (h *AgentHandler) RegisterAgent(c *gin.Context) { // Save to database if err := h.agentQueries.CreateAgent(agent); err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to register agent"}) + log.Printf("ERROR: Failed to create agent in database: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to register agent - database error"}) return } @@ -163,16 +164,17 @@ func (h *AgentHandler) GetCommands(c *gin.Context) { // Try to parse optional system metrics from request body var metrics struct { - CPUPercent float64 `json:"cpu_percent,omitempty"` - MemoryPercent float64 `json:"memory_percent,omitempty"` - MemoryUsedGB float64 `json:"memory_used_gb,omitempty"` - MemoryTotalGB float64 `json:"memory_total_gb,omitempty"` - DiskUsedGB float64 `json:"disk_used_gb,omitempty"` - DiskTotalGB float64 `json:"disk_total_gb,omitempty"` - DiskPercent float64 `json:"disk_percent,omitempty"` - Uptime string `json:"uptime,omitempty"` - Version string `json:"version,omitempty"` - Metadata map[string]interface{} `json:"metadata,omitempty"` + CPUPercent float64 `json:"cpu_percent,omitempty"` + MemoryPercent float64 `json:"memory_percent,omitempty"` + MemoryUsedGB float64 `json:"memory_used_gb,omitempty"` + MemoryTotalGB float64 `json:"memory_total_gb,omitempty"` + DiskUsedGB float64 `json:"disk_used_gb,omitempty"` + DiskTotalGB float64 `json:"disk_total_gb,omitempty"` + DiskPercent float64 `json:"disk_percent,omitempty"` + Uptime string `json:"uptime,omitempty"` + Version string `json:"version,omitempty"` + Metadata map[string]interface{} `json:"metadata,omitempty"` + PendingAcknowledgments []string `json:"pending_acknowledgments,omitempty"` } // Parse metrics if provided (optional, won't fail if empty) @@ -449,10 +451,27 @@ func (h *AgentHandler) GetCommands(c *gin.Context) { } } + // Process command acknowledgments from agent + var acknowledgedIDs []string + if len(metrics.PendingAcknowledgments) > 0 { + log.Printf("DEBUG: Processing %d pending acknowledgments for agent %s: %v", len(metrics.PendingAcknowledgments), agentID, metrics.PendingAcknowledgments) + // Verify which commands from agent's pending list have been recorded + verified, err := h.commandQueries.VerifyCommandsCompleted(metrics.PendingAcknowledgments) + if err != nil { + log.Printf("Warning: Failed to verify command acknowledgments for agent %s: %v", agentID, err) + } else { + acknowledgedIDs = verified + log.Printf("DEBUG: Verified %d completed commands out of %d pending for agent %s", len(acknowledgedIDs), len(metrics.PendingAcknowledgments), agentID) + if len(acknowledgedIDs) > 0 { + log.Printf("Acknowledged %d command results for agent %s", len(acknowledgedIDs), agentID) + } + } + } + response := models.CommandsResponse{ Commands: commandItems, RapidPolling: rapidPolling, - AcknowledgedIDs: []string{}, // No acknowledgments in current implementation + AcknowledgedIDs: acknowledgedIDs, } c.JSON(http.StatusOK, response) @@ -465,7 +484,8 @@ func (h *AgentHandler) ListAgents(c *gin.Context) { agents, err := h.agentQueries.ListAgentsWithLastScan(status, osType) if err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to list agents"}) + log.Printf("ERROR: Failed to list agents: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to list agents - database error"}) return } diff --git a/aggregator-server/internal/api/handlers/downloads.go b/aggregator-server/internal/api/handlers/downloads.go index 03f638e..e58ec59 100644 --- a/aggregator-server/internal/api/handlers/downloads.go +++ b/aggregator-server/internal/api/handlers/downloads.go @@ -31,28 +31,24 @@ func (h *DownloadHandler) getServerURL(c *gin.Context) string { return h.config.Server.PublicURL } - // Priority 2: Detect from request with TLS/proxy awareness + // Priority 2: Construct API server URL from configuration scheme := "http" + host := h.config.Server.Host + port := h.config.Server.Port - // Check if TLS is enabled in config + // Use HTTPS if TLS is enabled in config if h.config.Server.TLS.Enabled { scheme = "https" } - // Check if request came through HTTPS (direct or via proxy) - if c.Request.TLS != nil { - scheme = "https" + // For default host (0.0.0.0), use localhost for client connections + if host == "0.0.0.0" { + host = "localhost" } - // Check X-Forwarded-Proto for reverse proxy setups - if forwardedProto := c.GetHeader("X-Forwarded-Proto"); forwardedProto == "https" { - scheme = "https" - } - - // Use the Host header exactly as received (includes port if present) - host := c.GetHeader("X-Forwarded-Host") - if host == "" { - host = c.Request.Host + // Only include port if it's not the default for the protocol + if (scheme == "http" && port != 80) || (scheme == "https" && port != 443) { + return fmt.Sprintf("%s://%s:%d", scheme, host, port) } return fmt.Sprintf("%s://%s", scheme, host) @@ -155,6 +151,7 @@ AGENT_BINARY="/usr/local/bin/redflag-agent" SUDOERS_FILE="/etc/sudoers.d/redflag-agent" SERVICE_FILE="/etc/systemd/system/redflag-agent.service" CONFIG_DIR="/etc/aggregator" +STATE_DIR="/var/lib/aggregator" echo "=== RedFlag Agent Installation ===" echo "" @@ -301,19 +298,24 @@ else exit 1 fi -# Step 4: Create configuration directory +# Step 4: Create configuration and state directories echo "" -echo "Step 4: Creating configuration directory..." +echo "Step 4: Creating configuration and state directories..." mkdir -p "$CONFIG_DIR" chown "$AGENT_USER:$AGENT_USER" "$CONFIG_DIR" chmod 755 "$CONFIG_DIR" -echo "✓ Configuration directory created" -# Set SELinux context for config directory if SELinux is enabled +# Create state directory for acknowledgment tracking (v0.1.19+) +mkdir -p "$STATE_DIR" +chown "$AGENT_USER:$AGENT_USER" "$STATE_DIR" +chmod 755 "$STATE_DIR" +echo "✓ Configuration and state directories created" + +# Set SELinux context for directories if SELinux is enabled if command -v getenforce >/dev/null 2>&1 && [ "$(getenforce)" != "Disabled" ]; then - echo "Setting SELinux context for config directory..." - restorecon -Rv "$CONFIG_DIR" 2>/dev/null || true - echo "✓ SELinux context set for config directory" + echo "Setting SELinux context for directories..." + restorecon -Rv "$CONFIG_DIR" "$STATE_DIR" 2>/dev/null || true + echo "✓ SELinux context set for directories" fi # Step 5: Install systemd service @@ -338,7 +340,7 @@ RestartSec=30 # NoNewPrivileges=true - DISABLED: Prevents sudo from working, which agent needs for package management ProtectSystem=strict ProtectHome=true -ReadWritePaths=$AGENT_HOME /var/log $CONFIG_DIR +ReadWritePaths=$AGENT_HOME /var/log $CONFIG_DIR $STATE_DIR PrivateTmp=true # Logging diff --git a/aggregator-server/internal/api/handlers/setup.go b/aggregator-server/internal/api/handlers/setup.go index 2982e6d..62d09f6 100644 --- a/aggregator-server/internal/api/handlers/setup.go +++ b/aggregator-server/internal/api/handlers/setup.go @@ -387,9 +387,15 @@ func (h *SetupHandler) ConfigureServer(c *gin.Context) { fmt.Println("Updating PostgreSQL password from bootstrap to user-provided password...") bootstrapPassword := "redflag_bootstrap" // This matches our bootstrap .env if err := updatePostgresPassword(req.DBHost, req.DBPort, req.DBUser, bootstrapPassword, req.DBPassword); err != nil { - fmt.Printf("Warning: Failed to update PostgreSQL password: %v\n", err) - fmt.Println("Will proceed with configuration anyway...") + fmt.Printf("CRITICAL ERROR: Failed to update PostgreSQL password: %v\n", err) + c.JSON(http.StatusInternalServerError, gin.H{ + "error": "Failed to update database password. Setup cannot continue.", + "details": err.Error(), + "help": "Ensure PostgreSQL is accessible and the bootstrap password is correct. Check Docker logs for details.", + }) + return } + fmt.Println("PostgreSQL password successfully updated from bootstrap to user-provided password") // Step 2: Generate configuration content for manual update fmt.Println("Generating configuration content for manual .env file update...") @@ -414,6 +420,11 @@ func (h *SetupHandler) ConfigureServer(c *gin.Context) { // GenerateSigningKeys generates Ed25519 keypair for agent update signing func (h *SetupHandler) GenerateSigningKeys(c *gin.Context) { + // Prevent caching of generated keys (security critical) + c.Header("Cache-Control", "no-store, no-cache, must-revalidate, private") + c.Header("Pragma", "no-cache") + c.Header("Expires", "0") + // Generate Ed25519 keypair publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader) if err != nil { @@ -428,6 +439,9 @@ func (h *SetupHandler) GenerateSigningKeys(c *gin.Context) { // Generate fingerprint (first 16 chars) fingerprint := publicKeyHex[:16] + // Log key generation for security audit trail (only fingerprint, not full key) + fmt.Printf("Generated new Ed25519 keypair - Fingerprint: %s\n", fingerprint) + c.JSON(http.StatusOK, gin.H{ "public_key": publicKeyHex, "private_key": privateKeyHex, diff --git a/aggregator-server/internal/api/handlers/updates.go b/aggregator-server/internal/api/handlers/updates.go index 610705e..8fdd321 100644 --- a/aggregator-server/internal/api/handlers/updates.go +++ b/aggregator-server/internal/api/handlers/updates.go @@ -13,6 +13,16 @@ import ( "github.com/google/uuid" ) +// isValidResult checks if the result value complies with the database constraint +func isValidResult(result string) bool { + validResults := map[string]bool{ + "success": true, + "failed": true, + "partial": true, + } + return validResults[result] +} + type UpdateHandler struct { updateQueries *queries.UpdateQueries agentQueries *queries.AgentQueries @@ -199,11 +209,22 @@ func (h *UpdateHandler) ReportLog(c *gin.Context) { return } + // Validate and map result to comply with database constraint + validResult := req.Result + if !isValidResult(validResult) { + // Map invalid results to valid ones (e.g., "timed_out" -> "failed") + if validResult == "timed_out" || validResult == "timeout" || validResult == "cancelled" { + validResult = "failed" + } else { + validResult = "failed" // Default to failed for any unknown status + } + } + logEntry := &models.UpdateLog{ ID: uuid.New(), AgentID: agentID, Action: req.Action, - Result: req.Result, + Result: validResult, Stdout: req.Stdout, Stderr: req.Stderr, ExitCode: req.ExitCode, @@ -831,8 +852,8 @@ func (h *UpdateHandler) ClearFailedCommands(c *gin.Context) { // Build the appropriate cleanup query based on parameters if allFailed { - // Clear ALL failed commands (most aggressive) - count, err = h.commandQueries.ClearAllFailedCommands(olderThanDays) + // Clear ALL failed commands regardless of age (most aggressive) + count, err = h.commandQueries.ClearAllFailedCommandsRegardlessOfAge() } else if onlyRetried { // Clear only failed commands that have been retried count, err = h.commandQueries.ClearRetriedFailedCommands(olderThanDays) diff --git a/aggregator-server/internal/config/config.go b/aggregator-server/internal/config/config.go index cc84572..86ebd20 100644 --- a/aggregator-server/internal/config/config.go +++ b/aggregator-server/internal/config/config.go @@ -88,6 +88,14 @@ func Load() (*Config, error) { cfg.Timezone = getEnv("TIMEZONE", "UTC") cfg.LatestAgentVersion = getEnv("LATEST_AGENT_VERSION", "0.1.22") cfg.MinAgentVersion = getEnv("MIN_AGENT_VERSION", "0.1.22") + cfg.SigningPrivateKey = getEnv("REDFLAG_SIGNING_PRIVATE_KEY", "") + + // Debug: Log signing key status + if cfg.SigningPrivateKey != "" { + fmt.Printf("[CONFIG] ✅ Ed25519 signing private key configured (%d characters)\n", len(cfg.SigningPrivateKey)) + } else { + fmt.Printf("[CONFIG] ❌ No Ed25519 signing private key found in REDFLAG_SIGNING_PRIVATE_KEY\n") + } // Handle missing secrets if cfg.Admin.Password == "" || cfg.Admin.JWTSecret == "" || cfg.Database.Password == "" { diff --git a/aggregator-server/internal/database/queries/agents.go b/aggregator-server/internal/database/queries/agents.go index 183925e..9280f50 100644 --- a/aggregator-server/internal/database/queries/agents.go +++ b/aggregator-server/internal/database/queries/agents.go @@ -23,14 +23,19 @@ func (q *AgentQueries) CreateAgent(agent *models.Agent) error { query := ` INSERT INTO agents ( id, hostname, os_type, os_version, os_architecture, - agent_version, last_seen, status, metadata + agent_version, current_version, machine_id, public_key_fingerprint, + last_seen, status, metadata ) VALUES ( :id, :hostname, :os_type, :os_version, :os_architecture, - :agent_version, :last_seen, :status, :metadata + :agent_version, :current_version, :machine_id, :public_key_fingerprint, + :last_seen, :status, :metadata ) ` _, err := q.db.NamedExec(query, agent) - return err + if err != nil { + return fmt.Errorf("failed to create agent %s (version %s): %w", agent.Hostname, agent.CurrentVersion, err) + } + return nil } // GetAgentByID retrieves an agent by ID diff --git a/aggregator-server/internal/database/queries/commands.go b/aggregator-server/internal/database/queries/commands.go index 9877281..d4514ee 100644 --- a/aggregator-server/internal/database/queries/commands.go +++ b/aggregator-server/internal/database/queries/commands.go @@ -2,6 +2,7 @@ package queries import ( "fmt" + "strings" "time" "github.com/Fimeg/RedFlag/aggregator-server/internal/models" @@ -31,13 +32,14 @@ func (q *CommandQueries) CreateCommand(cmd *models.AgentCommand) error { } // GetPendingCommands retrieves pending commands for an agent +// Only returns 'pending' status - 'sent' commands are handled by timeout service func (q *CommandQueries) GetPendingCommands(agentID uuid.UUID) ([]models.AgentCommand, error) { var commands []models.AgentCommand query := ` SELECT * FROM agent_commands WHERE agent_id = $1 AND status = 'pending' ORDER BY created_at ASC - LIMIT 10 + LIMIT 100 ` err := q.db.Select(&commands, query, agentID) return commands, err @@ -338,6 +340,23 @@ func (q *CommandQueries) ClearAllFailedCommands(days int) (int64, error) { return result.RowsAffected() } +// ClearAllFailedCommandsRegardlessOfAge archives ALL failed/timed_out commands regardless of age +// This is used when all_failed=true is passed to truly clear all failed commands +func (q *CommandQueries) ClearAllFailedCommandsRegardlessOfAge() (int64, error) { + query := ` + UPDATE agent_commands + SET status = 'archived_failed' + WHERE status IN ('failed', 'timed_out') + ` + + result, err := q.db.Exec(query) + if err != nil { + return 0, fmt.Errorf("failed to archive all failed commands regardless of age: %w", err) + } + + return result.RowsAffected() +} + // CountPendingCommandsForAgent returns the number of pending commands for a specific agent // Used by scheduler for backpressure detection func (q *CommandQueries) CountPendingCommandsForAgent(agentID uuid.UUID) (int, error) { @@ -373,16 +392,30 @@ func (q *CommandQueries) VerifyCommandsCompleted(commandIDs []string) ([]string, return []string{}, nil } + // Convert UUIDs back to strings for SQL query + uuidStrs := make([]string, len(uuidIDs)) + for i, id := range uuidIDs { + uuidStrs[i] = id.String() + } + // Query for commands that are completed or failed - query := ` + // Use ANY with proper array literal for PostgreSQL + placeholders := make([]string, len(uuidStrs)) + args := make([]interface{}, len(uuidStrs)) + for i, id := range uuidStrs { + placeholders[i] = fmt.Sprintf("$%d", i+1) + args[i] = id + } + + query := fmt.Sprintf(` SELECT id FROM agent_commands - WHERE id = ANY($1) - AND status IN ('completed', 'failed') - ` + WHERE id::text = ANY(%s) + AND status IN ('completed', 'failed', 'timed_out') + `, fmt.Sprintf("ARRAY[%s]", strings.Join(placeholders, ","))) var completedUUIDs []uuid.UUID - err := q.db.Select(&completedUUIDs, query, uuidIDs) + err := q.db.Select(&completedUUIDs, query, args...) if err != nil { return nil, fmt.Errorf("failed to verify command completion: %w", err) } diff --git a/aggregator-server/internal/models/agent.go b/aggregator-server/internal/models/agent.go index 073c0f0..55d632a 100644 --- a/aggregator-server/internal/models/agent.go +++ b/aggregator-server/internal/models/agent.go @@ -45,6 +45,11 @@ type AgentWithLastScan struct { CurrentVersion string `json:"current_version" db:"current_version"` // Current running version UpdateAvailable bool `json:"update_available" db:"update_available"` // Whether update is available LastVersionCheck time.Time `json:"last_version_check" db:"last_version_check"` // Last time version was checked + MachineID *string `json:"machine_id,omitempty" db:"machine_id"` // Unique machine identifier + PublicKeyFingerprint *string `json:"public_key_fingerprint,omitempty" db:"public_key_fingerprint"` // Public key fingerprint + IsUpdating bool `json:"is_updating" db:"is_updating"` // Whether agent is currently updating + UpdatingToVersion *string `json:"updating_to_version,omitempty" db:"updating_to_version"` // Target version for ongoing update + UpdateInitiatedAt *time.Time `json:"update_initiated_at,omitempty" db:"update_initiated_at"` // When update process started LastSeen time.Time `json:"last_seen" db:"last_seen"` Status string `json:"status" db:"status"` Metadata JSONB `json:"metadata" db:"metadata"` diff --git a/aggregator-server/internal/scheduler/scheduler.go b/aggregator-server/internal/scheduler/scheduler.go index 56bf962..b1143d5 100644 --- a/aggregator-server/internal/scheduler/scheduler.go +++ b/aggregator-server/internal/scheduler/scheduler.go @@ -53,8 +53,9 @@ type Scheduler struct { queue *PriorityQueue // Database queries - agentQueries *queries.AgentQueries - commandQueries *queries.CommandQueries + agentQueries *queries.AgentQueries + commandQueries *queries.CommandQueries + subsystemQueries *queries.SubsystemQueries // Worker pool jobChan chan *SubsystemJob @@ -88,19 +89,20 @@ type Stats struct { } // NewScheduler creates a new scheduler instance -func NewScheduler(config Config, agentQueries *queries.AgentQueries, commandQueries *queries.CommandQueries) *Scheduler { +func NewScheduler(config Config, agentQueries *queries.AgentQueries, commandQueries *queries.CommandQueries, subsystemQueries *queries.SubsystemQueries) *Scheduler { ctx, cancel := context.WithCancel(context.Background()) s := &Scheduler{ - config: config, - queue: NewPriorityQueue(), - agentQueries: agentQueries, - commandQueries: commandQueries, - jobChan: make(chan *SubsystemJob, 1000), // Buffer 1000 jobs - workers: make([]*worker, config.NumWorkers), - shutdown: make(chan struct{}), - ctx: ctx, - cancel: cancel, + config: config, + queue: NewPriorityQueue(), + agentQueries: agentQueries, + commandQueries: commandQueries, + subsystemQueries: subsystemQueries, + jobChan: make(chan *SubsystemJob, 1000), // Buffer 1000 jobs + workers: make([]*worker, config.NumWorkers), + shutdown: make(chan struct{}), + ctx: ctx, + cancel: cancel, } // Initialize rate limiter if configured @@ -130,16 +132,6 @@ func (s *Scheduler) LoadSubsystems(ctx context.Context) error { return fmt.Errorf("failed to get agents: %w", err) } - // For now, we'll create default subsystems for each agent - // In full implementation, this would read from agent_subsystems table - subsystems := []string{"updates", "storage", "system", "docker"} - intervals := map[string]int{ - "updates": 15, // 15 minutes - "storage": 15, - "system": 30, - "docker": 15, - } - loaded := 0 for _, agent := range agents { // Skip offline agents (haven't checked in for 10+ minutes) @@ -147,28 +139,70 @@ func (s *Scheduler) LoadSubsystems(ctx context.Context) error { continue } - for _, subsystem := range subsystems { - // TODO: Check agent metadata for subsystem enablement - // For now, assume all subsystems are enabled + // Get subsystems from database (respect user settings) + dbSubsystems, err := s.subsystemQueries.GetSubsystems(agent.ID) + if err != nil { + log.Printf("[Scheduler] Failed to get subsystems for agent %s: %v", agent.Hostname, err) + continue + } - job := &SubsystemJob{ - AgentID: agent.ID, - AgentHostname: agent.Hostname, - Subsystem: subsystem, - IntervalMinutes: intervals[subsystem], - NextRunAt: time.Now().Add(time.Duration(intervals[subsystem]) * time.Minute), - Enabled: true, + // Create jobs only for enabled subsystems with auto_run=true + for _, dbSub := range dbSubsystems { + if dbSub.Enabled && dbSub.AutoRun { + // Use database interval, fallback to default + intervalMinutes := dbSub.IntervalMinutes + if intervalMinutes <= 0 { + intervalMinutes = s.getDefaultInterval(dbSub.Subsystem) + } + + var nextRun time.Time + if dbSub.NextRunAt != nil { + nextRun = *dbSub.NextRunAt + } else { + // If no next run is set, schedule it with default interval + nextRun = time.Now().Add(time.Duration(intervalMinutes) * time.Minute) + } + + job := &SubsystemJob{ + AgentID: agent.ID, + AgentHostname: agent.Hostname, + Subsystem: dbSub.Subsystem, + IntervalMinutes: intervalMinutes, + NextRunAt: nextRun, + Enabled: dbSub.Enabled, + } + + s.queue.Push(job) + loaded++ } - - s.queue.Push(job) - loaded++ } } - log.Printf("[Scheduler] Loaded %d subsystem jobs for %d agents\n", loaded, len(agents)) + log.Printf("[Scheduler] Loaded %d subsystem jobs for %d agents (respecting database settings)\n", loaded, len(agents)) return nil } +// getDefaultInterval returns default interval minutes for a subsystem +// TODO: These intervals need to correlate with agent health scanning settings +// Each subsystem should be variable based on user-configurable agent health policies +func (s *Scheduler) getDefaultInterval(subsystem string) int { + defaults := map[string]int{ + "apt": 30, // 30 minutes + "dnf": 240, // 4 hours + "docker": 120, // 2 hours + "storage": 360, // 6 hours + "windows": 480, // 8 hours + "winget": 360, // 6 hours + "updates": 15, // 15 minutes + "system": 30, // 30 minutes + } + + if interval, exists := defaults[subsystem]; exists { + return interval + } + return 30 // Default fallback +} + // Start begins the scheduler main loop and workers func (s *Scheduler) Start() error { log.Printf("[Scheduler] Starting with %d workers, check interval %v\n", diff --git a/aggregator-server/internal/scheduler/scheduler_test.go b/aggregator-server/internal/scheduler/scheduler_test.go index 7bfc149..81e68fc 100644 --- a/aggregator-server/internal/scheduler/scheduler_test.go +++ b/aggregator-server/internal/scheduler/scheduler_test.go @@ -9,7 +9,7 @@ import ( func TestScheduler_NewScheduler(t *testing.T) { config := DefaultConfig() - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) if s == nil { t.Fatal("NewScheduler returned nil") @@ -58,7 +58,7 @@ func TestScheduler_DefaultConfig(t *testing.T) { func TestScheduler_QueueIntegration(t *testing.T) { config := DefaultConfig() - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) // Add jobs to queue agent1 := uuid.New() @@ -96,7 +96,7 @@ func TestScheduler_QueueIntegration(t *testing.T) { func TestScheduler_GetStats(t *testing.T) { config := DefaultConfig() - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) // Initial stats should be zero stats := s.GetStats() @@ -145,7 +145,7 @@ func TestScheduler_StartStop(t *testing.T) { RateLimitPerSecond: 0, // Disable rate limiting for test } - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) // Start scheduler err := s.Start() @@ -167,7 +167,7 @@ func TestScheduler_StartStop(t *testing.T) { func TestScheduler_ProcessQueueEmpty(t *testing.T) { config := DefaultConfig() - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) // Process empty queue should not panic s.processQueue() @@ -188,7 +188,7 @@ func TestScheduler_ProcessQueueWithJobs(t *testing.T) { RateLimitPerSecond: 0, // Disable for test } - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) // Add jobs that are due now for i := 0; i < 5; i++ { @@ -229,7 +229,7 @@ func TestScheduler_RateLimiterRefill(t *testing.T) { RateLimitPerSecond: 10, // 10 tokens per second } - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) if s.rateLimiter == nil { t.Fatal("rate limiter not initialized") @@ -264,7 +264,7 @@ func TestScheduler_RateLimiterRefill(t *testing.T) { func TestScheduler_ConcurrentQueueAccess(t *testing.T) { config := DefaultConfig() - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) done := make(chan bool) @@ -303,7 +303,7 @@ func TestScheduler_ConcurrentQueueAccess(t *testing.T) { func BenchmarkScheduler_ProcessQueue(b *testing.B) { config := DefaultConfig() - s := NewScheduler(config, nil, nil) + s := NewScheduler(config, nil, nil, nil) // Pre-fill queue with jobs for i := 0; i < 1000; i++ { diff --git a/aggregator-server/internal/services/timeout.go b/aggregator-server/internal/services/timeout.go index 6da33e8..dc6d52b 100644 --- a/aggregator-server/internal/services/timeout.go +++ b/aggregator-server/internal/services/timeout.go @@ -12,11 +12,12 @@ import ( // TimeoutService handles timeout management for long-running operations type TimeoutService struct { - commandQueries *queries.CommandQueries - updateQueries *queries.UpdateQueries - ticker *time.Ticker - stopChan chan bool - timeoutDuration time.Duration + commandQueries *queries.CommandQueries + updateQueries *queries.UpdateQueries + ticker *time.Ticker + stopChan chan bool + sentTimeout time.Duration // For commands already sent to agents + pendingTimeout time.Duration // For commands stuck in queue } // NewTimeoutService creates a new timeout service @@ -24,14 +25,16 @@ func NewTimeoutService(cq *queries.CommandQueries, uq *queries.UpdateQueries) *T return &TimeoutService{ commandQueries: cq, updateQueries: uq, - timeoutDuration: 2 * time.Hour, // 2 hours timeout - allows for system upgrades and large operations + sentTimeout: 2 * time.Hour, // 2 hours for commands already sent to agents + pendingTimeout: 30 * time.Minute, // 30 minutes for commands stuck in queue + // TODO: Make these timeout durations user-adjustable in settings stopChan: make(chan bool), } } // Start begins the timeout monitoring service func (ts *TimeoutService) Start() { - log.Printf("Starting timeout service with %v timeout duration", ts.timeoutDuration) + log.Printf("Starting timeout service with %v sent timeout, %v pending timeout", ts.sentTimeout, ts.pendingTimeout) // Create a ticker that runs every 5 minutes ts.ticker = time.NewTicker(5 * time.Minute) @@ -59,25 +62,41 @@ func (ts *TimeoutService) Stop() { func (ts *TimeoutService) checkForTimeouts() { log.Println("Checking for timed out operations...") - // Get all commands that are in 'sent' status - commands, err := ts.commandQueries.GetCommandsByStatus(models.CommandStatusSent) - if err != nil { - log.Printf("Error getting sent commands: %v", err) - return - } - - timeoutThreshold := time.Now().Add(-ts.timeoutDuration) + sentTimeoutThreshold := time.Now().Add(-ts.sentTimeout) + pendingTimeoutThreshold := time.Now().Add(-ts.pendingTimeout) timedOutCommands := make([]models.AgentCommand, 0) - for _, command := range commands { - // Check if command has been sent and is older than timeout threshold - if command.SentAt != nil && command.SentAt.Before(timeoutThreshold) { - timedOutCommands = append(timedOutCommands, command) + // Check 'sent' commands (traditional timeout - 2 hours) + sentCommands, err := ts.commandQueries.GetCommandsByStatus(models.CommandStatusSent) + if err != nil { + log.Printf("Error getting sent commands: %v", err) + } else { + for _, command := range sentCommands { + // Check if command has been sent and is older than sent timeout threshold + if command.SentAt != nil && command.SentAt.Before(sentTimeoutThreshold) { + timedOutCommands = append(timedOutCommands, command) + } + } + } + + // Check 'pending' commands (stuck in queue timeout - 30 minutes) + pendingCommands, err := ts.commandQueries.GetCommandsByStatus(models.CommandStatusPending) + if err != nil { + log.Printf("Error getting pending commands: %v", err) + } else { + for _, command := range pendingCommands { + // Check if command has been pending longer than pending timeout threshold + if command.CreatedAt.Before(pendingTimeoutThreshold) { + timedOutCommands = append(timedOutCommands, command) + log.Printf("Found stuck pending command %s (type: %s, created: %s, age: %v)", + command.ID, command.CommandType, command.CreatedAt.Format(time.RFC3339), time.Since(command.CreatedAt)) + } } } if len(timedOutCommands) > 0 { - log.Printf("Found %d timed out commands", len(timedOutCommands)) + log.Printf("Found %d timed out commands (%d sent >2h, %d stuck pending >30m)", + len(timedOutCommands), len(sentCommands), len(pendingCommands)) for _, command := range timedOutCommands { if err := ts.timeoutCommand(&command); err != nil { @@ -91,6 +110,14 @@ func (ts *TimeoutService) checkForTimeouts() { // timeoutCommand marks a specific command as timed out and updates related entities func (ts *TimeoutService) timeoutCommand(command *models.AgentCommand) error { + // Determine which timeout duration was applied + var appliedTimeout time.Duration + if command.Status == models.CommandStatusSent { + appliedTimeout = ts.sentTimeout + } else { + appliedTimeout = ts.pendingTimeout + } + log.Printf("Timing out command %s (type: %s, agent: %s)", command.ID, command.CommandType, command.AgentID) @@ -103,7 +130,7 @@ func (ts *TimeoutService) timeoutCommand(command *models.AgentCommand) error { result := models.JSONB{ "error": "operation timed out", "timeout_at": time.Now(), - "duration": ts.timeoutDuration.String(), + "duration": appliedTimeout.String(), "command_id": command.ID.String(), } @@ -112,7 +139,7 @@ func (ts *TimeoutService) timeoutCommand(command *models.AgentCommand) error { } // Update related update package status if applicable - if err := ts.updateRelatedPackageStatus(command); err != nil { + if err := ts.updateRelatedPackageStatus(command, appliedTimeout); err != nil { log.Printf("Warning: failed to update related package status: %v", err) // Don't return error here as the main timeout operation succeeded } @@ -123,11 +150,11 @@ func (ts *TimeoutService) timeoutCommand(command *models.AgentCommand) error { AgentID: command.AgentID, UpdatePackageID: ts.extractUpdatePackageID(command), Action: command.CommandType, - Result: "timed_out", + Result: "failed", // Use 'failed' to comply with database constraint Stdout: "", - Stderr: fmt.Sprintf("Command %s timed out after %v", command.CommandType, ts.timeoutDuration), + Stderr: fmt.Sprintf("Command %s timed out after %v (timeout_id: %s)", command.CommandType, appliedTimeout, command.ID), ExitCode: 124, // Standard timeout exit code - DurationSeconds: int(ts.timeoutDuration.Seconds()), + DurationSeconds: int(appliedTimeout.Seconds()), ExecutedAt: time.Now(), } @@ -141,7 +168,7 @@ func (ts *TimeoutService) timeoutCommand(command *models.AgentCommand) error { } // updateRelatedPackageStatus updates the status of related update packages when a command times out -func (ts *TimeoutService) updateRelatedPackageStatus(command *models.AgentCommand) error { +func (ts *TimeoutService) updateRelatedPackageStatus(command *models.AgentCommand, appliedTimeout time.Duration) error { // Extract update_id from command params if it exists _, ok := command.Params["update_id"].(string) if !ok { @@ -153,7 +180,7 @@ func (ts *TimeoutService) updateRelatedPackageStatus(command *models.AgentComman metadata := models.JSONB{ "timeout": true, "timeout_at": time.Now(), - "timeout_duration": ts.timeoutDuration.String(), + "timeout_duration": appliedTimeout.String(), "command_id": command.ID.String(), "failure_reason": "operation timed out", } @@ -196,7 +223,7 @@ func (ts *TimeoutService) GetTimeoutStatus() (map[string]interface{}, error) { } // Count commands approaching timeout (within 5 minutes of timeout) - timeoutThreshold := time.Now().Add(-ts.timeoutDuration + 5*time.Minute) + timeoutThreshold := time.Now().Add(-ts.sentTimeout + 5*time.Minute) approachingTimeout := 0 for _, command := range activeCommands { if command.SentAt != nil && command.SentAt.Before(timeoutThreshold) { @@ -205,16 +232,30 @@ func (ts *TimeoutService) GetTimeoutStatus() (map[string]interface{}, error) { } return map[string]interface{}{ - "total_timed_out": len(timedOutCommands), - "total_active": len(activeCommands), - "approaching_timeout": approachingTimeout, - "timeout_duration": ts.timeoutDuration.String(), - "last_check": time.Now(), + "total_timed_out": len(timedOutCommands), + "total_active": len(activeCommands), + "approaching_timeout": approachingTimeout, + "sent_timeout_duration": ts.sentTimeout.String(), + "pending_timeout_duration": ts.pendingTimeout.String(), + "last_check": time.Now(), }, nil } -// SetTimeoutDuration allows changing the timeout duration +// SetTimeoutDuration allows changing the timeout duration for sent commands +// TODO: This should be deprecated in favor of SetSentTimeout and SetPendingTimeout func (ts *TimeoutService) SetTimeoutDuration(duration time.Duration) { - ts.timeoutDuration = duration - log.Printf("Timeout duration updated to %v", duration) + ts.sentTimeout = duration + log.Printf("Sent timeout duration updated to %v", duration) +} + +// SetSentTimeout allows changing the timeout duration for sent commands +func (ts *TimeoutService) SetSentTimeout(duration time.Duration) { + ts.sentTimeout = duration + log.Printf("Sent timeout duration updated to %v", duration) +} + +// SetPendingTimeout allows changing the timeout duration for pending commands +func (ts *TimeoutService) SetPendingTimeout(duration time.Duration) { + ts.pendingTimeout = duration + log.Printf("Pending timeout duration updated to %v", duration) } \ No newline at end of file diff --git a/aggregator-web/src/components/AgentScanners.tsx b/aggregator-web/src/components/AgentScanners.tsx index c944bba..538e54b 100644 --- a/aggregator-web/src/components/AgentScanners.tsx +++ b/aggregator-web/src/components/AgentScanners.tsx @@ -8,9 +8,14 @@ import { Cpu, Container, Package, + Shield, + Fingerprint, + CheckCircle, + AlertCircle, + XCircle, } from 'lucide-react'; import { formatRelativeTime } from '@/lib/utils'; -import { agentApi } from '@/lib/api'; +import { agentApi, securityApi } from '@/lib/api'; import toast from 'react-hot-toast'; import { cn } from '@/lib/utils'; import { AgentSubsystem } from '@/types'; @@ -60,6 +65,77 @@ export function AgentScanners({ agentId }: AgentScannersProps) { refetchInterval: 30000, // Refresh every 30 seconds }); + // Fetch security health status + const { data: securityOverview, isLoading: securityLoading } = useQuery({ + queryKey: ['security-overview'], + queryFn: async () => { + const data = await securityApi.getOverview(); + return data; + }, + refetchInterval: 60000, // Refresh every minute + }); + + // Helper function to get security status color and icon + const getSecurityStatusDisplay = (status: string) => { + switch (status) { + case 'healthy': + case 'enforced': + case 'operational': + return { + color: 'text-green-600 bg-green-100 border-green-200', + icon: + }; + case 'degraded': + return { + color: 'text-amber-600 bg-amber-100 border-amber-200', + icon: + }; + case 'unhealthy': + case 'unavailable': + return { + color: 'text-red-600 bg-red-100 border-red-200', + icon: + }; + default: + return { + color: 'text-gray-600 bg-gray-100 border-gray-200', + icon: + }; + } + }; + + // Get security icon for subsystem type + const getSecurityIcon = (type: string) => { + switch (type) { + case 'ed25519_signing': + return ; + case 'nonce_validation': + return ; + case 'machine_binding': + return ; + case 'command_validation': + return ; + default: + return ; + } + }; + + // Get display name for security subsystem + const getSecurityDisplayName = (type: string) => { + switch (type) { + case 'ed25519_signing': + return 'Ed25519 Signing'; + case 'nonce_validation': + return 'Nonce Protection'; + case 'machine_binding': + return 'Machine Binding'; + case 'command_validation': + return 'Command Validation'; + default: + return type; + } + }; + // Toggle subsystem enabled/disabled const toggleSubsystemMutation = useMutation({ mutationFn: async ({ subsystem, enabled }: { subsystem: string; enabled: boolean }) => { @@ -176,6 +252,124 @@ export function AgentScanners({ agentId }: AgentScannersProps) { + {/* Security Health Section */} +
+
+
+ +

Security Health

+
+ +
+ + {securityLoading ? ( +
+ + Loading security status... +
+ ) : securityOverview ? ( +
+ {/* Overall Security Status */} +
+
+
+
+

Overall Security Status

+

{securityOverview.overall_status}

+
+
+
+ {securityOverview.overall_status.toUpperCase()} +
+
+ + {/* Individual Security Subsystems */} +
+ {Object.entries(securityOverview.subsystems).map(([key, subsystem]) => { + const display = getSecurityStatusDisplay(subsystem.status); + return ( +
+
+
+ {getSecurityIcon(key)} +
+
+

+ {getSecurityDisplayName(key)} +

+

+ {subsystem.enabled ? 'Enabled' : 'Disabled'} +

+
+
+
+ {display.icon} + {subsystem.status.toUpperCase()} +
+
+ ); + })} +
+ + {/* Security Alerts and Recommendations */} + {(securityOverview.alerts.length > 0 || securityOverview.recommendations.length > 0) && ( +
+ {securityOverview.alerts.length > 0 && ( +
+

Security Alerts

+
    + {securityOverview.alerts.map((alert, index) => ( +
  • + + {alert} +
  • + ))} +
+
+ )} + + {securityOverview.recommendations.length > 0 && ( +
+

Recommendations

+
    + {securityOverview.recommendations.map((recommendation, index) => ( +
  • + + {recommendation} +
  • + ))} +
+
+ )} +
+ )} +
+ ) : ( +
+ +

Unable to load security status

+

Security monitoring may be unavailable

+
+ )} +
+ {/* Subsystem Configuration Table */}
diff --git a/aggregator-web/src/components/AgentUpdatesEnhanced.tsx b/aggregator-web/src/components/AgentUpdatesEnhanced.tsx index ece0cb1..254c6f8 100644 --- a/aggregator-web/src/components/AgentUpdatesEnhanced.tsx +++ b/aggregator-web/src/components/AgentUpdatesEnhanced.tsx @@ -2,13 +2,9 @@ import { useState } from 'react'; import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; import { Search, - Package, - Download, Upload, - CheckCircle, RefreshCw, Terminal, - Filter, ChevronDown, ChevronRight, Check, diff --git a/aggregator-web/src/lib/api.ts b/aggregator-web/src/lib/api.ts index 9db9081..1d377d3 100644 --- a/aggregator-web/src/lib/api.ts +++ b/aggregator-web/src/lib/api.ts @@ -692,4 +692,126 @@ export const adminApi = { }, }; +// Security API endpoints +export const securityApi = { + // Get comprehensive security overview + getOverview: async (): Promise<{ + timestamp: string; + overall_status: 'healthy' | 'degraded' | 'unhealthy'; + subsystems: { + ed25519_signing: { status: string; enabled: boolean }; + nonce_validation: { status: string; enabled: boolean }; + machine_binding: { status: string; enabled: boolean }; + command_validation: { status: string; enabled: boolean }; + }; + alerts: string[]; + recommendations: string[]; + }> => { + const response = await api.get('/security/overview'); + return response.data; + }, + + // Get Ed25519 signing service status + getSigningStatus: async (): Promise<{ + status: string; + timestamp: string; + checks: { + service_initialized: boolean; + public_key_available: boolean; + signing_operational: boolean; + }; + public_key_fingerprint?: string; + algorithm?: string; + }> => { + const response = await api.get('/security/signing'); + return response.data; + }, + + // Get nonce validation status + getNonceStatus: async (): Promise<{ + status: string; + timestamp: string; + checks: { + validation_enabled: boolean; + max_age_minutes: number; + recent_validations: number; + validation_failures: number; + }; + details: { + nonce_format: string; + signature_algorithm: string; + replay_protection: string; + }; + }> => { + const response = await api.get('/security/nonce'); + return response.data; + }, + + // Get machine binding status + getMachineBindingStatus: async (): Promise<{ + status: string; + timestamp: string; + checks: { + binding_enforced: boolean; + min_agent_version: string; + fingerprint_required: boolean; + recent_violations: number; + }; + details: { + enforcement_method: string; + binding_scope: string; + violation_action: string; + }; + }> => { + const response = await api.get('/security/machine-binding'); + return response.data; + }, + + // Get command validation status + getCommandValidationStatus: async (): Promise<{ + status: string; + timestamp: string; + metrics: { + total_pending_commands: number; + agents_with_pending: number; + commands_last_hour: number; + commands_last_24h: number; + }; + checks: { + command_processing: string; + backpressure_active: boolean; + agent_responsive: string; + }; + }> => { + const response = await api.get('/security/commands'); + return response.data; + }, + + // Get detailed security metrics + getMetrics: async (): Promise<{ + timestamp: string; + signing: { + public_key_fingerprint: string; + algorithm: string; + key_size: number; + configured: boolean; + }; + nonce: { + max_age_seconds: number; + format: string; + }; + machine_binding: { + min_version: string; + enforcement: string; + }; + command_processing: { + backpressure_threshold: number; + rate_limit_per_second: number; + }; + }> => { + const response = await api.get('/security/metrics'); + return response.data; + }, +}; + export default api; \ No newline at end of file