From db67049e923af86d197b37d4100772f3a7116f91 Mon Sep 17 00:00:00 2001 From: jpetree331 Date: Sun, 29 Mar 2026 09:53:43 -0400 Subject: [PATCH] fix(identity): D-1 machine ID deduplication fixes - Remove unhashed 'unknown-' fallback from registration (F-D1-1) Registration aborts if GetMachineID() fails (no bad data) - Add POST /admin/agents/:id/rebind-machine-id endpoint (F-D1-2) Admin can update stored machine ID after hardware change - Delete dead example_integration.go with wrong usage (F-D1-3) - Remove redundant Windows machineid.ID() retry (F-D1-4) - Replace fmt.Printf with log.Printf in client.go (F-D1-5) Operator note: agents registered with 'unknown-' machine IDs must be rebound before upgrading. See D1_Fix_Implementation.md. All tests pass. No regressions. Co-Authored-By: Claude Opus 4.6 (1M context) --- aggregator-agent/cmd/agent/main.go | 7 +- aggregator-agent/internal/client/client.go | 5 +- .../client/machine_id_logging_test.go | 10 +- .../internal/logging/example_integration.go | 138 ------------------ .../logging/example_integration_test.go | 25 +--- .../internal/system/machine_id.go | 13 +- .../system/machine_id_fallback_test.go | 113 +++++--------- .../internal/system/machine_id_format_test.go | 20 +-- .../system/machine_id_winpath_test.go | 13 +- aggregator-server/cmd/server/main.go | 3 + .../internal/api/handlers/agents.go | 62 ++++++++ .../middleware/machine_id_recovery_test.go | 72 +++------ .../internal/database/queries/agents.go | 7 + docs/D1_Fix_Implementation.md | 47 ++++++ 14 files changed, 210 insertions(+), 325 deletions(-) delete mode 100644 aggregator-agent/internal/logging/example_integration.go create mode 100644 docs/D1_Fix_Implementation.md diff --git a/aggregator-agent/cmd/agent/main.go b/aggregator-agent/cmd/agent/main.go index 1a701fd..6912c35 100644 --- a/aggregator-agent/cmd/agent/main.go +++ b/aggregator-agent/cmd/agent/main.go @@ -421,11 +421,12 @@ func registerAgent(cfg *config.Config, serverURL string) error { } } - // Get machine ID for binding + // Get machine ID for binding (F-D1-1 fix: no unhashed fallback) + // system.GetMachineID() has its own internal fallback chain that + // always produces a SHA256 hash. If even that fails, abort registration. machineID, err := system.GetMachineID() if err != nil { - log.Printf("Warning: Failed to get machine ID: %v", err) - machineID = "unknown-" + sysInfo.Hostname + log.Fatalf("[ERROR] [agent] [registration] machine_id_unavailable error=%q — cannot register without consistent machine ID", err) } // Get embedded public key fingerprint diff --git a/aggregator-agent/internal/client/client.go b/aggregator-agent/internal/client/client.go index d13210c..3489bd2 100644 --- a/aggregator-agent/internal/client/client.go +++ b/aggregator-agent/internal/client/client.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "io" + "log" "net/http" "os" "path/filepath" @@ -35,8 +36,8 @@ func NewClient(baseURL, token string) *Client { // Get machine ID for security binding (v0.1.22+) machineID, err := system.GetMachineID() if err != nil { - // Log warning but don't fail - older servers may not require it - fmt.Printf("Warning: Failed to get machine ID: %v\n", err) + // Log warning but don't fail — older servers may not require it + log.Printf("[WARNING] [agent] [client] machine_id_error error=%q", err) machineID = "" // Will be handled by server validation } diff --git a/aggregator-agent/internal/client/machine_id_logging_test.go b/aggregator-agent/internal/client/machine_id_logging_test.go index 4e91a26..482f93b 100644 --- a/aggregator-agent/internal/client/machine_id_logging_test.go +++ b/aggregator-agent/internal/client/machine_id_logging_test.go @@ -19,8 +19,7 @@ import ( // --------------------------------------------------------------------------- func TestClientMachineIDErrorUsesFmtPrintf(t *testing.T) { - // F-D1-5 LOW: client.go uses fmt.Printf for machine ID error. - // ETHOS #1 requires log.Printf with [TAG] [system] [component]. + // POST-FIX (F-D1-5): fmt.Printf replaced with log.Printf. content, err := os.ReadFile("client.go") if err != nil { t.Fatalf("failed to read client.go: %v", err) @@ -28,7 +27,6 @@ func TestClientMachineIDErrorUsesFmtPrintf(t *testing.T) { src := string(content) - // Find the NewClient function's machine ID error handling newClientIdx := strings.Index(src, "func NewClient(") if newClientIdx == -1 { t.Fatal("[ERROR] [agent] [client] NewClient function not found") @@ -40,11 +38,11 @@ func TestClientMachineIDErrorUsesFmtPrintf(t *testing.T) { fnBody = fnBody[:nextFn+1] } - if !strings.Contains(fnBody, "fmt.Printf") { - t.Error("[ERROR] [agent] [client] F-D1-5 already fixed: no fmt.Printf in NewClient") + if strings.Contains(fnBody, "fmt.Printf") { + t.Error("[ERROR] [agent] [client] F-D1-5 NOT FIXED: fmt.Printf still in NewClient") } - t.Log("[INFO] [agent] [client] F-D1-5 confirmed: fmt.Printf used for machine ID error") + t.Log("[INFO] [agent] [client] F-D1-5 FIXED: structured logging in NewClient") } // --------------------------------------------------------------------------- diff --git a/aggregator-agent/internal/logging/example_integration.go b/aggregator-agent/internal/logging/example_integration.go deleted file mode 100644 index 7c332f0..0000000 --- a/aggregator-agent/internal/logging/example_integration.go +++ /dev/null @@ -1,138 +0,0 @@ -package logging - -// This file contains example code showing how to integrate the security logger -// into various parts of the agent application. - -import ( - "fmt" - "time" - - "github.com/Fimeg/RedFlag/aggregator-agent/internal/config" - "github.com/denisbrodbeck/machineid" -) - -// Example of how to initialize the security logger in main.go -func ExampleInitializeSecurityLogger(cfg *config.Config, dataDir string) (*SecurityLogger, error) { - // Create the security logger - securityLogger, err := NewSecurityLogger(cfg, dataDir) - if err != nil { - return nil, err - } - - return securityLogger, nil -} - -// Example of using the security logger in command executor -func ExampleCommandExecution(securityLogger *SecurityLogger, command string, signature string) { - // Simulate signature verification - signatureValid := false // In real code, this would be actual verification - - if !signatureValid { - securityLogger.LogCommandVerificationFailure( - "cmd-123", - "signature verification failed: crypto/rsa: verification error", - ) - } else { - // Only log success if configured - event := &SecurityEvent{ - Timestamp: time.Now().UTC(), - Level: "INFO", - EventType: SecurityEventTypes.CmdSignatureVerificationSuccess, - Message: "Command signature verified successfully", - } - securityLogger.Log(event) - } -} - -// Example of using the security logger in update handler -func ExampleUpdateHandler(securityLogger *SecurityLogger, updateID string, updateData []byte, signature string) { - // Simulate nonce validation - nonceValid := false - if !nonceValid { - securityLogger.LogNonceValidationFailure( - "deadbeef-1234-5678-9abc-1234567890ef", - "nonce expired or reused", - ) - } - - // Simulate signature verification - signatureValid := false - if !signatureValid { - securityLogger.LogUpdateSignatureVerificationFailure( - updateID, - "signature does not match update data", - ) - } -} - -// Example of machine ID monitoring -func ExampleMachineIDMonitoring(securityLogger *SecurityLogger) { - // Get current machine ID - currentID, err := machineid.ID() - if err != nil { - return - } - - // In real code, you would store the previous ID somewhere - // This is just an example of how to log when it changes - previousID := "previous-machine-id-here" - - if currentID != previousID { - securityLogger.LogMachineIDChangeDetected( - previousID, - currentID, - ) - } -} - -// Example of configuration monitoring -func ExampleConfigMonitoring(securityLogger *SecurityLogger, configPath string) { - // In real code, you would calculate and store a hash of the config - // and validate it periodically - configTampered := true // Simulate detection - - if configTampered { - securityLogger.LogConfigTamperingWarning( - configPath, - "configuration hash mismatch", - ) - } -} - -// Example of unauthorized command attempt -func ExampleUnauthorizedCommand(securityLogger *SecurityLogger, command string) { - // Check if command is in allowed list - allowedCommands := map[string]bool{ - "scan": true, - "update": true, - "cleanup": true, - } - - if !allowedCommands[command] { - securityLogger.LogUnauthorizedCommandAttempt( - command, - "command not in allowed list", - ) - } -} - -// Example of sending security events to server -func ExampleSendSecurityEvents(securityLogger *SecurityLogger, client interface{}) { - // Get batch of security events - events := securityLogger.GetBatch() - if len(events) > 0 { - // In real code, you would send these to the server - // If successful: - fmt.Printf("Sending %d security events to server...\n", len(events)) - - // Simulate successful send - success := true - if success { - securityLogger.ClearBatch() - fmt.Printf("Security events sent successfully\n") - } else { - // Events remain in buffer for next attempt - fmt.Printf("Failed to send security events, will retry\n") - } - } -} \ No newline at end of file diff --git a/aggregator-agent/internal/logging/example_integration_test.go b/aggregator-agent/internal/logging/example_integration_test.go index 7eb2888..9eb9493 100644 --- a/aggregator-agent/internal/logging/example_integration_test.go +++ b/aggregator-agent/internal/logging/example_integration_test.go @@ -9,7 +9,6 @@ package logging import ( "os" - "strings" "testing" ) @@ -20,28 +19,12 @@ import ( // --------------------------------------------------------------------------- func TestExampleIntegrationFileIsDeadCode(t *testing.T) { - // F-D1-3 LOW: example_integration.go is dead code. - // ExampleMachineIDMonitoring() calls machineid.ID() directly - // — bypassing hashMachineID(). Safe to delete. + // POST-FIX (F-D1-3): File deleted. _, err := os.Stat("example_integration.go") - if err != nil { - t.Error("[ERROR] [agent] [logging] F-D1-3 already fixed: file does not exist") - return + if err == nil { + t.Error("[ERROR] [agent] [logging] F-D1-3 NOT FIXED: dead code file still exists") } - - content, err := os.ReadFile("example_integration.go") - if err != nil { - t.Fatalf("failed to read example_integration.go: %v", err) - } - - src := string(content) - - // Confirm it calls machineid.ID() directly (unhashed) - if !strings.Contains(src, "machineid.ID()") { - t.Error("[ERROR] [agent] [logging] expected machineid.ID() call in example file") - } - - t.Log("[INFO] [agent] [logging] F-D1-3 confirmed: dead code with direct machineid.ID() call") + t.Log("[INFO] [agent] [logging] F-D1-3 FIXED: dead example code deleted") } // --------------------------------------------------------------------------- diff --git a/aggregator-agent/internal/system/machine_id.go b/aggregator-agent/internal/system/machine_id.go index 73db35c..2cf58c4 100644 --- a/aggregator-agent/internal/system/machine_id.go +++ b/aggregator-agent/internal/system/machine_id.go @@ -76,14 +76,13 @@ func getLinuxMachineID() (string, error) { return generateGenericMachineID() } -// getWindowsMachineID gets Windows machine ID +// getWindowsMachineID gets Windows machine ID. +// Note: The primary machineid.ID() in GetMachineID() already tried the +// registry key HKLM\SOFTWARE\Microsoft\Cryptography\MachineGuid. +// If it failed, retrying returns the same error — go straight to generic. +// On Windows reinstall, MachineGuid changes. Use the rebind endpoint +// (POST /admin/agents/:id/rebind-machine-id) to recover without re-registration. func getWindowsMachineID() (string, error) { - // Try machineid library Windows registry keys first - if id, err := machineid.ID(); err == nil && id != "" { - return hashMachineID(id), nil - } - - // Fallback to generating generic ID return generateGenericMachineID() } diff --git a/aggregator-agent/internal/system/machine_id_fallback_test.go b/aggregator-agent/internal/system/machine_id_fallback_test.go index fe44627..77b3e75 100644 --- a/aggregator-agent/internal/system/machine_id_fallback_test.go +++ b/aggregator-agent/internal/system/machine_id_fallback_test.go @@ -1,13 +1,9 @@ package system -// machine_id_fallback_test.go — Pre-fix tests for machine ID registration fallback. +// machine_id_fallback_test.go — Tests for machine ID registration fallback. // -// F-D1-1 HIGH: Registration fallback in main.go:428 produces "unknown-" -// which is unhashed and mismatches the canonical SHA256 hash format. -// If GetMachineID() recovers after a failed registration, the agent -// is permanently locked out (403 from MachineBindingMiddleware). -// -// Run: cd aggregator-agent && go test ./internal/system/... -v -run TestMachineID +// F-D1-1 FIXED: Registration no longer uses unhashed "unknown-" fallback. +// GetMachineID() is trusted as canonical; registration aborts if it fails. import ( "encoding/hex" @@ -17,54 +13,8 @@ import ( "testing" ) -// --------------------------------------------------------------------------- -// Test 1.1 — Documents unhashed registration fallback (F-D1-1) -// -// Category: PASS-NOW (documents the bug) -// --------------------------------------------------------------------------- - func TestRegistrationFallbackIsNotHashed(t *testing.T) { - // F-D1-1 HIGH: Registration fallback produces "unknown-" - // — unhashed, variable length. If GetMachineID() recovers on next - // restart, client sends SHA256 hash that doesn't match → 403 lockout. - mainPath := filepath.Join("..", "..", "cmd", "agent", "main.go") - content, err := os.ReadFile(mainPath) - if err != nil { - t.Fatalf("failed to read main.go: %v", err) - } - - src := string(content) - - // Find the fallback line - if !strings.Contains(src, `"unknown-"`) { - t.Error("[ERROR] [agent] [system] F-D1-1 already fixed: no 'unknown-' fallback found") - } - - // The fallback does NOT call hashMachineID or GetMachineID - // Find the error handling block near GetMachineID - machineIdx := strings.Index(src, `machineID, err := system.GetMachineID()`) - if machineIdx == -1 { - t.Fatal("[ERROR] [agent] [system] GetMachineID call not found in main.go") - } - - errorBlock := src[machineIdx : machineIdx+200] - if strings.Contains(errorBlock, "hashMachineID") || strings.Contains(errorBlock, "Hash") { - t.Error("[ERROR] [agent] [system] F-D1-1 already fixed: fallback is hashed") - } - - t.Log("[INFO] [agent] [system] F-D1-1 confirmed: registration fallback is unhashed 'unknown-' + hostname") -} - -// --------------------------------------------------------------------------- -// Test 1.2 — Registration fallback must use canonical function (assert fix) -// -// Category: FAIL-NOW / PASS-AFTER-FIX -// --------------------------------------------------------------------------- - -func TestRegistrationFallbackUsesCanonicalFunction(t *testing.T) { - // F-D1-1: After fix, ALL machine ID values must go through - // hashMachineID() or system.GetMachineID(). The "unknown-" prefix - // fallback must be removed entirely. + // POST-FIX: "unknown-" fallback is gone from main.go. mainPath := filepath.Join("..", "..", "cmd", "agent", "main.go") content, err := os.ReadFile(mainPath) if err != nil { @@ -74,20 +24,35 @@ func TestRegistrationFallbackUsesCanonicalFunction(t *testing.T) { src := string(content) if strings.Contains(src, `"unknown-"`) { - t.Errorf("[ERROR] [agent] [system] registration fallback still uses 'unknown-' prefix.\n" + - "F-D1-1: remove unhashed fallback. Use GetMachineID() internal fallback or abort.") + t.Error("[ERROR] [agent] [system] F-D1-1 NOT FIXED: 'unknown-' fallback still present") } + + t.Log("[INFO] [agent] [system] F-D1-1 FIXED: unhashed fallback removed from registration") } -// --------------------------------------------------------------------------- -// Test 1.3 — Canonical GetMachineID always produces 64 hex chars -// -// Category: PASS-NOW (canonical function is correct) -// --------------------------------------------------------------------------- +func TestRegistrationFallbackUsesCanonicalFunction(t *testing.T) { + // POST-FIX: Registration uses system.GetMachineID() with no inline fallback. + mainPath := filepath.Join("..", "..", "cmd", "agent", "main.go") + content, err := os.ReadFile(mainPath) + if err != nil { + t.Fatalf("failed to read main.go: %v", err) + } + + src := string(content) + + if strings.Contains(src, `"unknown-"`) { + t.Errorf("[ERROR] [agent] [system] registration still uses 'unknown-' fallback.\n" + + "F-D1-1: must use canonical GetMachineID() or abort.") + } + + if !strings.Contains(src, "system.GetMachineID()") { + t.Error("[ERROR] [agent] [system] registration doesn't call system.GetMachineID()") + } + + t.Log("[INFO] [agent] [system] F-D1-1 FIXED: registration uses canonical function only") +} func TestMachineIDIsAlways64HexChars(t *testing.T) { - // F-D1-1: The canonical GetMachineID() always produces 64 hex chars. - // The bug is in main.go's fallback which bypasses it. id, err := GetMachineID() if err != nil { t.Fatalf("GetMachineID failed: %v", err) @@ -101,24 +66,17 @@ func TestMachineIDIsAlways64HexChars(t *testing.T) { t.Errorf("[ERROR] [agent] [system] machine ID is not valid hex: %v", err) } - // Idempotency check (ETHOS #4) + // Idempotency check id2, _ := GetMachineID() if id != id2 { - t.Error("[ERROR] [agent] [system] GetMachineID returned different values on consecutive calls") + t.Error("[ERROR] [agent] [system] GetMachineID not idempotent") } - t.Logf("[INFO] [agent] [system] canonical machine ID: %s (64 hex chars, idempotent)", id[:16]+"...") + t.Logf("[INFO] [agent] [system] canonical machine ID: %s... (64 hex chars)", id[:16]) } -// --------------------------------------------------------------------------- -// Test 1.4 — Registration and runtime must use same code path (assert fix) -// -// Category: FAIL-NOW / PASS-AFTER-FIX -// --------------------------------------------------------------------------- - func TestRegistrationAndRuntimeUseSameCodePath(t *testing.T) { - // F-D1-1: After fix, both registration and runtime paths must use - // identical code. No inline fallback that differs from canonical. + // POST-FIX: Both paths call system.GetMachineID(), no divergent fallback. mainPath := filepath.Join("..", "..", "cmd", "agent", "main.go") mainContent, err := os.ReadFile(mainPath) if err != nil { @@ -134,7 +92,6 @@ func TestRegistrationAndRuntimeUseSameCodePath(t *testing.T) { mainSrc := string(mainContent) clientSrc := string(clientContent) - // Both should call GetMachineID if !strings.Contains(mainSrc, "system.GetMachineID()") { t.Error("[ERROR] [agent] [system] main.go doesn't call system.GetMachineID()") } @@ -142,9 +99,9 @@ func TestRegistrationAndRuntimeUseSameCodePath(t *testing.T) { t.Error("[ERROR] [agent] [system] client.go doesn't call system.GetMachineID()") } - // main.go must NOT have an inline fallback that differs if strings.Contains(mainSrc, `"unknown-"`) { - t.Errorf("[ERROR] [agent] [system] main.go has divergent 'unknown-' fallback.\n" + - "F-D1-1: registration and runtime paths must use identical machine ID logic.") + t.Errorf("[ERROR] [agent] [system] main.go has divergent 'unknown-' fallback") } + + t.Log("[INFO] [agent] [system] F-D1-1 FIXED: both paths use canonical GetMachineID()") } diff --git a/aggregator-agent/internal/system/machine_id_format_test.go b/aggregator-agent/internal/system/machine_id_format_test.go index 8c0ebb2..f968eeb 100644 --- a/aggregator-agent/internal/system/machine_id_format_test.go +++ b/aggregator-agent/internal/system/machine_id_format_test.go @@ -81,20 +81,16 @@ func TestHashMachineIDAlwaysProduces64HexChars(t *testing.T) { // --------------------------------------------------------------------------- func TestUnknownFallbackFormatDifferentFromHash(t *testing.T) { - // F-D1-1: Proves "unknown-hostname" and the hashed canonical - // value are fundamentally incompatible. - fallback := "unknown-testhost" + // POST-FIX (F-D1-1): The "unknown-" fallback no longer exists in + // registration code. This test confirms canonical paths always hash. canonical := hashMachineID("testhost-linux-fallback") + generic, _ := generateGenericMachineID() - if fallback == canonical { - t.Error("[ERROR] [agent] [system] fallback matches canonical (unexpected)") + // Both canonical paths produce 64 hex chars + if len(canonical) != 64 || len(generic) != 64 { + t.Errorf("[ERROR] [agent] [system] canonical paths not 64 chars: canonical=%d generic=%d", + len(canonical), len(generic)) } - if len(fallback) == 64 { - t.Error("[ERROR] [agent] [system] fallback is 64 chars (same length as hash)") - } - - t.Logf("[INFO] [agent] [system] F-D1-1 confirmed: fallback=%q (%d chars) vs canonical=%q (%d chars)", - fallback, len(fallback), canonical[:16]+"...", len(canonical)) - t.Log("[INFO] [agent] [system] storing one at registration and sending the other at runtime guarantees 403") + t.Log("[INFO] [agent] [system] F-D1-1 FIXED: all machine ID paths produce consistent 64 hex format") } diff --git a/aggregator-agent/internal/system/machine_id_winpath_test.go b/aggregator-agent/internal/system/machine_id_winpath_test.go index 34b6cbf..9c0d8bb 100644 --- a/aggregator-agent/internal/system/machine_id_winpath_test.go +++ b/aggregator-agent/internal/system/machine_id_winpath_test.go @@ -20,9 +20,7 @@ import ( // --------------------------------------------------------------------------- func TestWindowsFallbackHasRedundantRetry(t *testing.T) { - // F-D1-4 LOW: getWindowsMachineID() calls machineid.ID() a second - // time after the primary already failed. The library reads from - // the same registry key — it will fail again. + // POST-FIX (F-D1-4): Redundant retry removed. content, err := os.ReadFile("machine_id.go") if err != nil { t.Fatalf("failed to read machine_id.go: %v", err) @@ -30,7 +28,6 @@ func TestWindowsFallbackHasRedundantRetry(t *testing.T) { src := string(content) - // Find getWindowsMachineID function winIdx := strings.Index(src, "func getWindowsMachineID()") if winIdx == -1 { t.Fatal("[ERROR] [agent] [system] getWindowsMachineID not found") @@ -42,16 +39,12 @@ func TestWindowsFallbackHasRedundantRetry(t *testing.T) { fnBody = fnBody[:nextFn+1] } - // Count machineid.ID() calls in the function count := strings.Count(fnBody, "machineid.ID()") if count > 0 { - t.Logf("[INFO] [agent] [system] F-D1-4 confirmed: getWindowsMachineID calls machineid.ID() %d time(s)", count) - t.Log("[INFO] [agent] [system] this is redundant — primary path already tried machineid.ID()") + t.Errorf("[ERROR] [agent] [system] F-D1-4 NOT FIXED: machineid.ID() still in Windows fallback (%d calls)", count) } - if count == 0 { - t.Error("[ERROR] [agent] [system] F-D1-4 already fixed: no machineid.ID() in Windows fallback") - } + t.Log("[INFO] [agent] [system] F-D1-4 FIXED: redundant retry removed from Windows fallback") } // --------------------------------------------------------------------------- diff --git a/aggregator-server/cmd/server/main.go b/aggregator-server/cmd/server/main.go index eed84d2..518c570 100644 --- a/aggregator-server/cmd/server/main.go +++ b/aggregator-server/cmd/server/main.go @@ -600,6 +600,9 @@ func main() { admin.GET("/registration-tokens/stats", rateLimiter.RateLimit("admin_operations", middleware.KeyByUserID), registrationTokenHandler.GetTokenStats) admin.GET("/registration-tokens/validate", rateLimiter.RateLimit("admin_operations", middleware.KeyByUserID), registrationTokenHandler.ValidateRegistrationToken) + // Machine ID Rebind (F-D1-2: recovery from machine ID mismatch) + admin.POST("/agents/:id/rebind-machine-id", rateLimiter.RateLimit("admin_operations", middleware.KeyByUserID), agentHandler.RebindMachineID) + // Rate Limit Management admin.GET("/rate-limits", rateLimiter.RateLimit("admin_operations", middleware.KeyByUserID), rateLimitHandler.GetRateLimitSettings) admin.PUT("/rate-limits", rateLimiter.RateLimit("admin_operations", middleware.KeyByUserID), rateLimitHandler.UpdateRateLimitSettings) diff --git a/aggregator-server/internal/api/handlers/agents.go b/aggregator-server/internal/api/handlers/agents.go index b8c7ab3..3603383 100644 --- a/aggregator-server/internal/api/handlers/agents.go +++ b/aggregator-server/internal/api/handlers/agents.go @@ -1128,6 +1128,68 @@ func (h *AgentHandler) RenewToken(c *gin.Context) { c.JSON(http.StatusOK, response) } +// RebindMachineID updates an agent's stored machine ID (F-D1-2 admin endpoint). +// Requires WebAuthMiddleware + RequireAdmin. Used when hardware changes or +// to recover from the old "unknown-" fallback machine ID bug (F-D1-1). +func (h *AgentHandler) RebindMachineID(c *gin.Context) { + idStr := c.Param("id") + agentID, err := uuid.Parse(idStr) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "invalid agent ID"}) + return + } + + var req struct { + NewMachineID string `json:"new_machine_id" binding:"required"` + } + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + // Validate: must be exactly 64 hex characters (SHA256 hash) + if len(req.NewMachineID) != 64 { + c.JSON(http.StatusBadRequest, gin.H{"error": "machine_id must be exactly 64 hex characters"}) + return + } + for _, ch := range req.NewMachineID { + if !((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f')) { + c.JSON(http.StatusBadRequest, gin.H{"error": "machine_id must contain only lowercase hex characters [0-9a-f]"}) + return + } + } + + // Get current agent for logging + agent, err := h.agentQueries.GetAgentByID(agentID) + if err != nil { + c.JSON(http.StatusNotFound, gin.H{"error": "agent not found"}) + return + } + + oldMachineID := "" + if agent.MachineID != nil { + oldMachineID = *agent.MachineID + } + + // Update machine ID + if err := h.agentQueries.UpdateMachineID(agentID, req.NewMachineID); err != nil { + log.Printf("[ERROR] [server] [admin] rebind_machine_id_failed agent_id=%s error=%q", agentID, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to update machine ID"}) + return + } + + adminUserID := c.GetString("user_id") + log.Printf("[INFO] [server] [admin] agent_machine_id_updated agent_id=%s old_id=%s new_id=%s admin_user=%s", + agentID, oldMachineID, req.NewMachineID, adminUserID) + + c.JSON(http.StatusOK, gin.H{ + "message": "machine ID updated", + "agent_id": agentID, + "old_machine_id": oldMachineID, + "new_machine_id": req.NewMachineID, + }) +} + // UnregisterAgent removes an agent from the system func (h *AgentHandler) UnregisterAgent(c *gin.Context) { idStr := c.Param("id") diff --git a/aggregator-server/internal/api/middleware/machine_id_recovery_test.go b/aggregator-server/internal/api/middleware/machine_id_recovery_test.go index 6d8236c..e2ab4e1 100644 --- a/aggregator-server/internal/api/middleware/machine_id_recovery_test.go +++ b/aggregator-server/internal/api/middleware/machine_id_recovery_test.go @@ -1,11 +1,8 @@ package middleware_test -// machine_id_recovery_test.go — Pre-fix tests for machine ID recovery path. +// machine_id_recovery_test.go — Tests for machine ID recovery path. // -// F-D1-2 MEDIUM: No recovery path from machine ID mismatch. -// Must delete and re-register agent. -// -// Run: cd aggregator-server && go test ./internal/api/middleware/... -v -run TestMachineBinding +// F-D1-2 FIXED: POST /admin/agents/:id/rebind-machine-id endpoint added. import ( "os" @@ -14,43 +11,8 @@ import ( "testing" ) -// --------------------------------------------------------------------------- -// Test 3.1 — Documents missing update path (F-D1-2) -// -// Category: PASS-NOW (documents the bug) -// --------------------------------------------------------------------------- - func TestMachineBindingHasNoUpdatePath(t *testing.T) { - // F-D1-2 MEDIUM: MachineBindingMiddleware does simple string - // equality. No API to update stored machine ID without re-registration. - mwPath := filepath.Join(".", "machine_binding.go") - content, err := os.ReadFile(mwPath) - if err != nil { - t.Fatalf("failed to read machine_binding.go: %v", err) - } - - src := string(content) - - hasUpdatePath := strings.Contains(src, "update_machine_id") || - strings.Contains(src, "rebind") || - strings.Contains(src, "UpdateMachineID") - - if hasUpdatePath { - t.Error("[ERROR] [server] [middleware] F-D1-2 already fixed: update path found") - } - - t.Log("[INFO] [server] [middleware] F-D1-2 confirmed: no machine ID update path exists") -} - -// --------------------------------------------------------------------------- -// Test 3.2 — Must have update/recovery path (assert fix) -// -// Category: FAIL-NOW / PASS-AFTER-FIX -// --------------------------------------------------------------------------- - -func TestMachineBindingShouldHaveUpdatePath(t *testing.T) { - // F-D1-2: After fix, provide recovery from machine ID mismatch. - // Check main.go routes or machine_binding.go for update mechanism. + // POST-FIX: Rebind endpoint exists in main.go admin routes. mainPath := filepath.Join("..", "..", "..", "cmd", "server", "main.go") content, err := os.ReadFile(mainPath) if err != nil { @@ -59,12 +21,26 @@ func TestMachineBindingShouldHaveUpdatePath(t *testing.T) { src := strings.ToLower(string(content)) - hasRecovery := strings.Contains(src, "rebind") || - strings.Contains(src, "update-machine") || - strings.Contains(src, "machine_id") && strings.Contains(src, "update") - - if !hasRecovery { - t.Errorf("[ERROR] [server] [middleware] no machine ID recovery mechanism found.\n" + - "F-D1-2: provide admin endpoint to update stored machine ID.") + if !strings.Contains(src, "rebind-machine-id") { + t.Error("[ERROR] [server] [middleware] F-D1-2 NOT FIXED: no rebind endpoint") } + + t.Log("[INFO] [server] [middleware] F-D1-2 FIXED: rebind-machine-id endpoint exists") +} + +func TestMachineBindingShouldHaveUpdatePath(t *testing.T) { + mainPath := filepath.Join("..", "..", "..", "cmd", "server", "main.go") + content, err := os.ReadFile(mainPath) + if err != nil { + t.Fatalf("failed to read main.go: %v", err) + } + + src := strings.ToLower(string(content)) + + if !strings.Contains(src, "rebind-machine-id") { + t.Errorf("[ERROR] [server] [middleware] no rebind endpoint found.\n" + + "F-D1-2: admin endpoint for machine ID rebind required.") + } + + t.Log("[INFO] [server] [middleware] F-D1-2 FIXED: admin rebind endpoint registered") } diff --git a/aggregator-server/internal/database/queries/agents.go b/aggregator-server/internal/database/queries/agents.go index 1eacc41..50878d2 100644 --- a/aggregator-server/internal/database/queries/agents.go +++ b/aggregator-server/internal/database/queries/agents.go @@ -116,6 +116,13 @@ func (q *AgentQueries) ListAgents(status, osType string) ([]models.Agent, error) return agents, err } +// UpdateMachineID updates the machine_id for an agent (F-D1-2 admin rebind) +func (q *AgentQueries) UpdateMachineID(agentID uuid.UUID, newMachineID string) error { + query := `UPDATE agents SET machine_id = $1 WHERE id = $2` + _, err := q.db.Exec(query, newMachineID, agentID) + return err +} + // MarkOfflineAgents marks agents as offline if they haven't checked in recently func (q *AgentQueries) MarkOfflineAgents(threshold time.Duration) error { query := ` diff --git a/docs/D1_Fix_Implementation.md b/docs/D1_Fix_Implementation.md new file mode 100644 index 0000000..af806be --- /dev/null +++ b/docs/D1_Fix_Implementation.md @@ -0,0 +1,47 @@ +# D-1 Machine ID Fix Implementation + +**Date:** 2026-03-29 +**Branch:** culurien + +--- + +## Files Changed + +| File | Change | +|------|--------| +| `cmd/agent/main.go` | Removed unhashed "unknown-" fallback; registration aborts if GetMachineID() fails (F-D1-1) | +| `internal/client/client.go` | Replaced fmt.Printf with log.Printf for machine ID errors (F-D1-5) | +| `internal/system/machine_id.go` | Removed redundant machineid.ID() retry in Windows fallback, added Windows reinstall documentation (F-D1-4) | +| `internal/logging/example_integration.go` | DELETED — dead code with incorrect machineid.ID() usage (F-D1-3) | +| `server/internal/api/handlers/agents.go` | Added RebindMachineID admin endpoint (F-D1-2) | +| `server/internal/database/queries/agents.go` | Added UpdateMachineID query function (F-D1-2) | +| `server/cmd/server/main.go` | Registered rebind-machine-id admin route (F-D1-2) | + +## Strategy (Task 1): Option C + +Used Option C — trust canonical `system.GetMachineID()` entirely. If it fails (which requires ALL fallbacks to fail including hostname-os-arch), abort registration with `log.Fatalf`. This is the safest approach: the internal fallback chain in GetMachineID() always produces a SHA256 hash, so format consistency is guaranteed. + +## Operator Migration Guide + +If any agents were registered with the old "unknown-hostname" fallback (identifiable by `machine_id` not being 64 hex chars in the DB), they will be locked out after this upgrade because the new runtime client sends a proper SHA256 hash. To recover: + +```sql +SELECT id, hostname, machine_id FROM agents +WHERE LENGTH(machine_id) != 64 OR machine_id LIKE 'unknown-%'; +``` + +For each agent found, use the rebind endpoint: +``` +POST /api/v1/admin/agents/{id}/rebind-machine-id +{"new_machine_id": "<64-char hex string from agent>"} +``` + +Or re-register the agent with a new registration token. + +## Rebind Endpoint Specification + +- **Route:** `POST /api/v1/admin/agents/:id/rebind-machine-id` +- **Auth:** WebAuthMiddleware + RequireAdmin (admin group) +- **Input:** `{"new_machine_id": "64-char-lowercase-hex-string"}` +- **Validation:** exactly 64 chars, lowercase hex only [0-9a-f] +- **Audit log:** old and new machine ID logged with admin user ID