test(concurrency): B-2 pre-fix tests for data integrity and concurrency bugs

Pre-fix test suite documenting 7 data integrity and concurrency
bugs. Tests FAIL where they assert correct post-fix behavior,
PASS where they document current buggy state.

Tests added:
- F-B2-1/8 HIGH: Registration not transactional (3 tests)
- F-B2-2 MEDIUM: Command delivery race condition (3 tests)
- F-B2-9 MEDIUM: Token renewal not transactional (2 tests)
- F-B2-4 MEDIUM: No rate limit on GetCommands (3 tests)
- F-B2-5 LOW: Jitter negates rapid mode (2 tests)
- F-B2-10 LOW: No max retry for stuck commands (2 tests)
- F-B2-7 MEDIUM: No exponential backoff on reconnection (2 tests)

Current state: 7 FAIL, 10 PASS. No A/B-1 regressions.
See docs/B2_PreFix_Tests.md for full inventory.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-29 07:45:16 -04:00
parent 2fd0fd27fa
commit 59ab7cbd5f
8 changed files with 889 additions and 0 deletions

View File

@@ -0,0 +1,107 @@
package internal_test
// polling_jitter_test.go — Pre-fix tests for jitter negating rapid mode.
//
// F-B2-5 LOW: 30-second jitter is applied uniformly to ALL polling
// intervals including rapid mode (5 seconds). Rapid mode becomes
// effectively 5-35 seconds instead of 5 seconds.
//
// Run: cd aggregator-agent && go test ./internal/... -v -run TestJitter
import (
"os"
"path/filepath"
"strings"
"testing"
)
// ---------------------------------------------------------------------------
// Test 5.1 — Documents jitter exceeding rapid mode interval (F-B2-5)
//
// Category: PASS-NOW (documents the bug)
// ---------------------------------------------------------------------------
func TestJitterExceedsRapidModeInterval(t *testing.T) {
	// F-B2-5 LOW: the startup jitter drawn from rand.Intn(30) is applied
	// to every polling interval, rapid mode included, so a 5s rapid poll
	// can be delayed far beyond its nominal period. This test documents
	// that state by scanning the agent's main.go as plain text; it does
	// not execute any agent code.
	raw, err := os.ReadFile(filepath.Join("..", "cmd", "agent", "main.go"))
	if err != nil {
		t.Fatalf("failed to read agent main.go: %v", err)
	}
	agentMain := string(raw)
	mentions := func(needle string) bool {
		return strings.Contains(agentMain, needle)
	}

	// "Intn(30)" is a substring of "rand.Intn(30)", so a single probe
	// covers both the qualified and the unqualified spelling.
	if !mentions("Intn(30)") {
		t.Error("[ERROR] [agent] [polling] expected fixed 30-second jitter in main.go")
	}

	// Rapid mode is recognised by a literal "return 5"; its absence is
	// only worth a warning because the marker is a loose textual hint.
	if !mentions("return 5") {
		t.Log("[WARNING] [agent] [polling] could not confirm rapid mode 5-second interval")
	}

	// Informational only: the file names both RapidPolling and jitter,
	// which may mean the jitter is already conditional on the mode.
	if mentions("RapidPolling") && (mentions("jitter") || mentions("Jitter")) {
		t.Log("[INFO] [agent] [polling] jitter may already be conditional on rapid mode")
	}

	t.Log("[INFO] [agent] [polling] F-B2-5 confirmed: 30s jitter applied to all intervals including 5s rapid mode")
}
// ---------------------------------------------------------------------------
// Test 5.2 — Jitter must not exceed polling interval (assert fix)
//
// Category: FAIL-NOW / PASS-AFTER-FIX
// ---------------------------------------------------------------------------
func TestJitterDoesNotExceedPollingInterval(t *testing.T) {
	// F-B2-5: once fixed, the jitter must not exceed the active polling
	// interval (cap it at pollingInterval/2, or skip it in rapid mode).
	// The check is textual: it looks for a rapid-polling identifier in
	// the source shortly before the fixed jitter call.
	raw, err := os.ReadFile(filepath.Join("..", "cmd", "agent", "main.go"))
	if err != nil {
		t.Fatalf("failed to read agent main.go: %v", err)
	}
	agentMain := string(raw)

	jitterAt := strings.Index(agentMain, "rand.Intn(30)")
	if jitterAt == -1 {
		// No fixed 30s jitter left to guard; nothing to assert.
		t.Log("[INFO] [agent] [polling] fixed 30s jitter not found (may be refactored)")
		return
	}

	// Inspect up to 400 bytes of source immediately preceding the
	// jitter call, clamped to the beginning of the file.
	windowStart := jitterAt - 400
	if windowStart < 0 {
		windowStart = 0
	}
	preceding := agentMain[windowStart:jitterAt]

	// Any of these spellings counts as a rapid-mode guard.
	guarded := false
	for _, marker := range []string{"RapidPolling", "rapidPolling", "rapid_polling"} {
		if strings.Contains(preceding, marker) {
			guarded = true
			break
		}
	}
	if guarded {
		return
	}

	t.Errorf("[ERROR] [agent] [polling] jitter is not guarded for rapid mode.\n" +
		"F-B2-5: 30s fixed jitter on 5s rapid interval makes rapid mode ineffective.\n" +
		"After fix: cap jitter at pollingInterval/2 or skip in rapid mode.")
}

View File

@@ -0,0 +1,106 @@
package internal_test
// reconnect_stagger_test.go — Pre-fix tests for thundering herd on reconnection.
//
// F-B2-7 MEDIUM: Agent reconnection uses only a fixed 30-second jitter.
// After a server restart, all agents retry within 30 seconds of the
// server becoming available, causing a traffic spike.
//
// Run: cd aggregator-agent && go test ./internal/... -v -run TestReconnect
import (
"os"
"path/filepath"
"strings"
"testing"
)
// ---------------------------------------------------------------------------
// Test 7.1 — Documents fixed jitter only (F-B2-7)
//
// Category: PASS-NOW (documents the bug)
// ---------------------------------------------------------------------------
func TestReconnectionUsesFixedJitterOnly(t *testing.T) {
	// F-B2-7 MEDIUM: Agent reconnection uses only a fixed 30-second
	// jitter. After a server restart, all agents that were waiting
	// begin retrying within 30 seconds. True thundering herd mitigation
	// requires exponential backoff with full jitter.
	//
	// The test is a textual scan of the agent's main.go; it does not
	// execute any agent code.
	mainPath := filepath.Join("..", "cmd", "agent", "main.go")
	content, err := os.ReadFile(mainPath)
	if err != nil {
		t.Fatalf("failed to read agent main.go: %v", err)
	}
	src := string(content)

	// Check for fixed jitter pattern.
	hasFixedJitter := strings.Contains(src, "rand.Intn(30)")

	// Check for exponential backoff in the main polling loop (not config
	// sync). The polling loop is located via its GetCommands call,
	// preferring the more specific spelling and falling back to any call.
	pollLoopIdx := strings.Index(src, "GetCommands(cfg.AgentID")
	if pollLoopIdx == -1 {
		pollLoopIdx = strings.Index(src, "GetCommands(")
	}

	hasExpBackoffInPollLoop := false
	// strings.Index reports a miss as -1, so offset 0 is a valid match.
	if pollLoopIdx >= 0 {
		// Inspect up to 500 bytes on either side of the call. Both ends
		// are clamped to the file: without the upper clamp a call within
		// the last 500 bytes of main.go makes the slice expression panic.
		const radius = 500
		windowStart := pollLoopIdx - radius
		if windowStart < 0 {
			windowStart = 0
		}
		windowEnd := pollLoopIdx + radius
		if windowEnd > len(src) {
			windowEnd = len(src)
		}
		nearby := strings.ToLower(src[windowStart:windowEnd])
		hasExpBackoffInPollLoop = strings.Contains(nearby, "exponential backoff") ||
			(strings.Contains(nearby, "backoff") && strings.Contains(nearby, "attempt"))
	}

	if !hasFixedJitter {
		t.Error("[ERROR] [agent] [polling] expected fixed jitter in main.go")
	}
	if hasExpBackoffInPollLoop {
		t.Error("[ERROR] [agent] [polling] F-B2-7 already fixed: exponential backoff in polling loop")
	}

	t.Log("[INFO] [agent] [polling] F-B2-7 confirmed: reconnection uses fixed 30s jitter only")
	t.Log("[INFO] [agent] [polling] all agents recovering from outage retry within a 30s window")
}
// ---------------------------------------------------------------------------
// Test 7.2 — Must use exponential backoff with jitter (assert fix)
//
// Category: FAIL-NOW / PASS-AFTER-FIX
// ---------------------------------------------------------------------------
func TestReconnectionUsesExponentialBackoffWithJitter(t *testing.T) {
	// F-B2-7: once fixed, reconnection should use exponential backoff
	// with full jitter: delay = rand(0, min(cap, base * 2^attempt)).
	// The check is textual and case-insensitive.
	raw, err := os.ReadFile(filepath.Join("..", "cmd", "agent", "main.go"))
	if err != nil {
		t.Fatalf("failed to read agent main.go: %v", err)
	}
	lowered := strings.ToLower(string(raw))

	// Only the source following the first "getcommands" mention is
	// considered, limited to 2000 bytes, so that the pre-existing config
	// sync backoff elsewhere in the file is not mistaken for the fix.
	backoffFound := false
	if at := strings.Index(lowered, "getcommands"); at > 0 {
		tail := lowered[at:]
		if len(tail) > 2000 {
			tail = tail[:2000]
		}
		switch {
		case strings.Contains(tail, "exponential"):
			backoffFound = true
		case strings.Contains(tail, "backoff") && strings.Contains(tail, "attempt"):
			backoffFound = true
		}
	}
	if backoffFound {
		return
	}

	t.Errorf("[ERROR] [agent] [polling] no exponential backoff found in reconnection logic.\n" +
		"F-B2-7: implement exponential backoff with full jitter for reconnection.\n" +
		"After fix: delay increases with each consecutive failure.")
}