fix(concurrency): B-2 data integrity and race condition fixes
- Wrap agent registration in DB transaction (F-B2-1/F-B2-8) All 4 ops atomic, manual DeleteAgent rollback removed - Use SELECT FOR UPDATE SKIP LOCKED for atomic command delivery (F-B2-2) Concurrent requests get different commands, no duplicates - Wrap token renewal in DB transaction (F-B2-9) Validate + update expiry atomic - Add rate limit to GET /agents/:id/commands (F-B2-4) agent_checkin rate limiter applied - Add retry_count column, cap stuck command retries at 5 (F-B2-10) Migration 029, GetStuckCommands filters retry_count < 5 - Cap polling jitter at current interval (fixes rapid mode) (F-B2-5) maxJitter = min(pollingInterval/2, 30s) - Add exponential backoff with full jitter on reconnection (F-B2-7) calculateBackoff: base=10s, cap=5min, reset on success All tests pass. No regressions from A-series or B-1. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -58,6 +58,27 @@ func reportLogWithAck(apiClient *client.Client, cfg *config.Config, ackTracker *
|
||||
return nil
|
||||
}
|
||||
|
||||
// calculateBackoff returns an exponential backoff delay with full jitter (F-B2-7 fix).
|
||||
// Pattern: delay = rand(0, min(cap, base * 2^attempt))
|
||||
// TODO: make base and cap configurable via agent config
|
||||
func calculateBackoff(attempt int) time.Duration {
|
||||
base := 10 * time.Second
|
||||
cap := 5 * time.Minute
|
||||
|
||||
// Calculate ceiling: base * 2^attempt, capped at max
|
||||
ceiling := base * time.Duration(1<<uint(attempt))
|
||||
if ceiling > cap || ceiling <= 0 { // overflow protection
|
||||
ceiling = cap
|
||||
}
|
||||
|
||||
// Full jitter: random duration between 0 and ceiling
|
||||
delay := time.Duration(rand.Int63n(int64(ceiling)))
|
||||
if delay < base {
|
||||
delay = base // minimum delay of base
|
||||
}
|
||||
return delay
|
||||
}
|
||||
|
||||
// getCurrentPollingInterval returns the appropriate polling interval based on rapid mode
|
||||
func getCurrentPollingInterval(cfg *config.Config) int {
|
||||
// Check if rapid polling mode is active and not expired
|
||||
@@ -798,9 +819,18 @@ func runAgent(cfg *config.Config) error {
|
||||
const systemInfoUpdateInterval = 1 * time.Hour // Update detailed system info every hour
|
||||
|
||||
// Main check-in loop
|
||||
consecutiveFailures := 0 // For exponential backoff (F-B2-7)
|
||||
for {
|
||||
// Add jitter to prevent thundering herd
|
||||
jitter := time.Duration(rand.Intn(30)) * time.Second
|
||||
// F-B2-5 fix: Cap jitter at half the polling interval to preserve rapid mode
|
||||
pollingInterval := time.Duration(getCurrentPollingInterval(cfg)) * time.Second
|
||||
maxJitter := pollingInterval / 2
|
||||
if maxJitter > 30*time.Second {
|
||||
maxJitter = 30 * time.Second
|
||||
}
|
||||
if maxJitter < 1*time.Second {
|
||||
maxJitter = 1 * time.Second
|
||||
}
|
||||
jitter := time.Duration(rand.Intn(int(maxJitter.Seconds())+1)) * time.Second
|
||||
time.Sleep(jitter)
|
||||
|
||||
// Check if we need to send detailed system info update
|
||||
@@ -879,27 +909,35 @@ func runAgent(cfg *config.Config) error {
|
||||
// Try to renew token if we got a 401 error
|
||||
newClient, renewErr := renewTokenIfNeeded(apiClient, cfg, err)
|
||||
if renewErr != nil {
|
||||
log.Printf("Check-in unsuccessful and token renewal failed: %v\n", renewErr)
|
||||
time.Sleep(time.Duration(getCurrentPollingInterval(cfg)) * time.Second)
|
||||
consecutiveFailures++
|
||||
backoffDelay := calculateBackoff(consecutiveFailures)
|
||||
log.Printf("[WARNING] [agent] [polling] server_unavailable attempt=%d next_retry_in=%s error=%q", consecutiveFailures, backoffDelay, renewErr)
|
||||
time.Sleep(backoffDelay)
|
||||
continue
|
||||
}
|
||||
|
||||
// If token was renewed, update client and retry
|
||||
if newClient != apiClient {
|
||||
log.Printf("🔄 Retrying check-in with renewed token...")
|
||||
log.Printf("[INFO] [agent] [polling] retrying_with_renewed_token")
|
||||
apiClient = newClient
|
||||
response, err = apiClient.GetCommands(cfg.AgentID, metrics)
|
||||
if err != nil {
|
||||
log.Printf("Check-in unsuccessful even after token renewal: %v\n", err)
|
||||
time.Sleep(time.Duration(getCurrentPollingInterval(cfg)) * time.Second)
|
||||
consecutiveFailures++
|
||||
backoffDelay := calculateBackoff(consecutiveFailures)
|
||||
log.Printf("[WARNING] [agent] [polling] server_unavailable attempt=%d next_retry_in=%s error=%q", consecutiveFailures, backoffDelay, err)
|
||||
time.Sleep(backoffDelay)
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
log.Printf("Check-in unsuccessful: %v\n", err)
|
||||
time.Sleep(time.Duration(getCurrentPollingInterval(cfg)) * time.Second)
|
||||
consecutiveFailures++
|
||||
backoffDelay := calculateBackoff(consecutiveFailures)
|
||||
log.Printf("[WARNING] [agent] [polling] server_unavailable attempt=%d next_retry_in=%s error=%q", consecutiveFailures, backoffDelay, err)
|
||||
time.Sleep(backoffDelay)
|
||||
continue
|
||||
}
|
||||
}
|
||||
// Reset consecutive failures on success
|
||||
consecutiveFailures = 0
|
||||
|
||||
// Process acknowledged command results
|
||||
if response != nil && len(response.AcknowledgedIDs) > 0 {
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
package internal_test
|
||||
|
||||
// polling_jitter_test.go — Pre-fix tests for jitter negating rapid mode.
|
||||
// polling_jitter_test.go — Tests for jitter capping at polling interval.
|
||||
//
|
||||
// F-B2-5 LOW: 30-second jitter is applied uniformly to ALL polling
|
||||
// intervals including rapid mode (5 seconds). Rapid mode becomes
|
||||
// effectively 5-35 seconds instead of 5 seconds.
|
||||
//
|
||||
// Run: cd aggregator-agent && go test ./internal/... -v -run TestJitter
|
||||
// F-B2-5 FIXED: Jitter is now capped at pollingInterval/2.
|
||||
// Rapid mode (5s) gets 0-2s jitter, standard (300s) gets 0-30s.
|
||||
|
||||
import (
|
||||
"os"
|
||||
@@ -15,17 +12,8 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 5.1 — Documents jitter exceeding rapid mode interval (F-B2-5)
|
||||
//
|
||||
// Category: PASS-NOW (documents the bug)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestJitterExceedsRapidModeInterval(t *testing.T) {
|
||||
// F-B2-5 LOW: Startup jitter (0-30s) is applied uniformly to ALL
|
||||
// polling intervals including rapid mode. Rapid mode (5s) becomes
|
||||
// effectively 5-35s. The jitter should be capped at the polling
|
||||
// interval or not applied when interval < jitter range.
|
||||
// POST-FIX: Fixed 30s jitter no longer applied to rapid mode.
|
||||
mainPath := filepath.Join("..", "cmd", "agent", "main.go")
|
||||
content, err := os.ReadFile(mainPath)
|
||||
if err != nil {
|
||||
@@ -34,43 +22,19 @@ func TestJitterExceedsRapidModeInterval(t *testing.T) {
|
||||
|
||||
src := string(content)
|
||||
|
||||
// Find jitter application (rand.Intn(30) or similar)
|
||||
hasFixedJitter := strings.Contains(src, "rand.Intn(30)") ||
|
||||
strings.Contains(src, "Intn(30)")
|
||||
|
||||
// Find rapid mode interval
|
||||
hasRapidInterval := strings.Contains(src, "return 5") // rapid mode returns 5 seconds
|
||||
|
||||
if !hasFixedJitter {
|
||||
t.Error("[ERROR] [agent] [polling] expected fixed 30-second jitter in main.go")
|
||||
// The old fixed jitter should be replaced with proportional jitter
|
||||
if strings.Contains(src, "rand.Intn(30)") {
|
||||
t.Error("[ERROR] [agent] [polling] F-B2-5 NOT FIXED: fixed 30s jitter still present")
|
||||
}
|
||||
|
||||
if !hasRapidInterval {
|
||||
t.Log("[WARNING] [agent] [polling] could not confirm rapid mode 5-second interval")
|
||||
if !strings.Contains(src, "pollingInterval / 2") && !strings.Contains(src, "pollingInterval/2") {
|
||||
t.Error("[ERROR] [agent] [polling] expected jitter capped at pollingInterval/2")
|
||||
}
|
||||
|
||||
// Check if jitter is conditional on polling mode
|
||||
hasConditionalJitter := strings.Contains(src, "RapidPolling") &&
|
||||
(strings.Contains(src, "jitter") || strings.Contains(src, "Jitter"))
|
||||
|
||||
// The jitter block should NOT be inside a rapid-mode conditional
|
||||
// (it's applied unconditionally — that's the bug)
|
||||
if hasConditionalJitter {
|
||||
t.Log("[INFO] [agent] [polling] jitter may already be conditional on rapid mode")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [agent] [polling] F-B2-5 confirmed: 30s jitter applied to all intervals including 5s rapid mode")
|
||||
t.Log("[INFO] [agent] [polling] F-B2-5 FIXED: jitter capped at polling interval")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 5.2 — Jitter must not exceed polling interval (assert fix)
|
||||
//
|
||||
// Category: FAIL-NOW / PASS-AFTER-FIX
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestJitterDoesNotExceedPollingInterval(t *testing.T) {
|
||||
// F-B2-5: After fix, jitter must not exceed the current polling
|
||||
// interval. Cap jitter at pollingInterval/2 or skip jitter in rapid mode.
|
||||
mainPath := filepath.Join("..", "cmd", "agent", "main.go")
|
||||
content, err := os.ReadFile(mainPath)
|
||||
if err != nil {
|
||||
@@ -79,29 +43,14 @@ func TestJitterDoesNotExceedPollingInterval(t *testing.T) {
|
||||
|
||||
src := string(content)
|
||||
|
||||
// After fix: jitter should be bounded by the polling interval
|
||||
// Look for patterns like: min(jitter, interval) or conditional skip in rapid mode
|
||||
jitterIdx := strings.Index(src, "rand.Intn(30)")
|
||||
if jitterIdx == -1 {
|
||||
t.Log("[INFO] [agent] [polling] fixed 30s jitter not found (may be refactored)")
|
||||
return
|
||||
// Must have proportional jitter calculation
|
||||
hasProportionalJitter := strings.Contains(src, "pollingInterval / 2") ||
|
||||
strings.Contains(src, "maxJitter")
|
||||
|
||||
if !hasProportionalJitter {
|
||||
t.Errorf("[ERROR] [agent] [polling] jitter is not proportional to polling interval.\n" +
|
||||
"F-B2-5: jitter must be capped at pollingInterval/2.")
|
||||
}
|
||||
|
||||
// The jitter line should have a conditional that reduces or skips it in rapid mode
|
||||
// Look for rapid polling check WITHIN 10 lines before the jitter
|
||||
contextStart := jitterIdx - 400
|
||||
if contextStart < 0 {
|
||||
contextStart = 0
|
||||
}
|
||||
contextBefore := src[contextStart:jitterIdx]
|
||||
|
||||
hasRapidModeGuard := strings.Contains(contextBefore, "RapidPolling") ||
|
||||
strings.Contains(contextBefore, "rapidPolling") ||
|
||||
strings.Contains(contextBefore, "rapid_polling")
|
||||
|
||||
if !hasRapidModeGuard {
|
||||
t.Errorf("[ERROR] [agent] [polling] jitter is not guarded for rapid mode.\n" +
|
||||
"F-B2-5: 30s fixed jitter on 5s rapid interval makes rapid mode ineffective.\n" +
|
||||
"After fix: cap jitter at pollingInterval/2 or skip in rapid mode.")
|
||||
}
|
||||
t.Log("[INFO] [agent] [polling] F-B2-5 FIXED: jitter proportional to interval")
|
||||
}
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
package internal_test
|
||||
|
||||
// reconnect_stagger_test.go — Pre-fix tests for thundering herd on reconnection.
|
||||
// reconnect_stagger_test.go — Tests for exponential backoff on reconnection.
|
||||
//
|
||||
// F-B2-7 MEDIUM: Agent reconnection uses only a fixed 30-second jitter.
|
||||
// After a server restart, all agents retry within 30 seconds of the
|
||||
// server becoming available, causing a traffic spike.
|
||||
//
|
||||
// Run: cd aggregator-agent && go test ./internal/... -v -run TestReconnect
|
||||
// F-B2-7 FIXED: Agent now uses exponential backoff with full jitter
|
||||
// on consecutive server failures instead of fixed polling interval.
|
||||
|
||||
import (
|
||||
"os"
|
||||
@@ -15,17 +12,8 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 7.1 — Documents fixed jitter only (F-B2-7)
|
||||
//
|
||||
// Category: PASS-NOW (documents the bug)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestReconnectionUsesFixedJitterOnly(t *testing.T) {
|
||||
// F-B2-7 MEDIUM: Agent reconnection uses only a fixed 30-second
|
||||
// jitter. After a server restart, all agents that were waiting
|
||||
// begin retrying within 30 seconds. True thundering herd mitigation
|
||||
// requires exponential backoff with full jitter.
|
||||
// POST-FIX: Reconnection now uses exponential backoff.
|
||||
mainPath := filepath.Join("..", "cmd", "agent", "main.go")
|
||||
content, err := os.ReadFile(mainPath)
|
||||
if err != nil {
|
||||
@@ -34,49 +22,20 @@ func TestReconnectionUsesFixedJitterOnly(t *testing.T) {
|
||||
|
||||
src := string(content)
|
||||
|
||||
// Check for fixed jitter pattern
|
||||
hasFixedJitter := strings.Contains(src, "rand.Intn(30)")
|
||||
|
||||
// Check for exponential backoff in the main polling loop (not config sync)
|
||||
// The main polling loop is the for{} block that calls GetCommands
|
||||
pollLoopIdx := strings.Index(src, "GetCommands(cfg.AgentID")
|
||||
if pollLoopIdx == -1 {
|
||||
pollLoopIdx = strings.Index(src, "GetCommands(")
|
||||
// Must have exponential backoff function
|
||||
if !strings.Contains(src, "calculateBackoff") {
|
||||
t.Error("[ERROR] [agent] [polling] F-B2-7 NOT FIXED: no calculateBackoff function")
|
||||
}
|
||||
|
||||
hasExpBackoffInPollLoop := false
|
||||
if pollLoopIdx > 0 {
|
||||
// Check 500 chars around the GetCommands call for backoff logic
|
||||
contextStart := pollLoopIdx - 500
|
||||
if contextStart < 0 {
|
||||
contextStart = 0
|
||||
}
|
||||
context := strings.ToLower(src[contextStart : pollLoopIdx+500])
|
||||
hasExpBackoffInPollLoop = strings.Contains(context, "exponential backoff") ||
|
||||
(strings.Contains(context, "backoff") && strings.Contains(context, "attempt"))
|
||||
// Must have consecutive failure tracking
|
||||
if !strings.Contains(src, "consecutiveFailures") {
|
||||
t.Error("[ERROR] [agent] [polling] F-B2-7 NOT FIXED: no consecutive failure tracking")
|
||||
}
|
||||
|
||||
if !hasFixedJitter {
|
||||
t.Error("[ERROR] [agent] [polling] expected fixed jitter in main.go")
|
||||
}
|
||||
|
||||
if hasExpBackoffInPollLoop {
|
||||
t.Error("[ERROR] [agent] [polling] F-B2-7 already fixed: exponential backoff in polling loop")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [agent] [polling] F-B2-7 confirmed: reconnection uses fixed 30s jitter only")
|
||||
t.Log("[INFO] [agent] [polling] all agents recovering from outage retry within a 30s window")
|
||||
t.Log("[INFO] [agent] [polling] F-B2-7 FIXED: exponential backoff with failure tracking")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 7.2 — Must use exponential backoff with jitter (assert fix)
|
||||
//
|
||||
// Category: FAIL-NOW / PASS-AFTER-FIX
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestReconnectionUsesExponentialBackoffWithJitter(t *testing.T) {
|
||||
// F-B2-7: After fix, implement exponential backoff with full jitter:
|
||||
// delay = rand(0, min(cap, base * 2^attempt))
|
||||
mainPath := filepath.Join("..", "cmd", "agent", "main.go")
|
||||
content, err := os.ReadFile(mainPath)
|
||||
if err != nil {
|
||||
@@ -85,22 +44,19 @@ func TestReconnectionUsesExponentialBackoffWithJitter(t *testing.T) {
|
||||
|
||||
src := strings.ToLower(string(content))
|
||||
|
||||
// Check for exponential backoff specifically in the main polling loop error path
|
||||
// (not the config sync backoff which already exists)
|
||||
pollLoopIdx := strings.Index(src, "getcommands")
|
||||
hasExpBackoff := false
|
||||
if pollLoopIdx > 0 {
|
||||
context := src[pollLoopIdx:]
|
||||
if len(context) > 2000 {
|
||||
context = context[:2000]
|
||||
}
|
||||
hasExpBackoff = strings.Contains(context, "exponential") ||
|
||||
(strings.Contains(context, "backoff") && strings.Contains(context, "attempt"))
|
||||
// Must have backoff calculation
|
||||
hasBackoff := strings.Contains(src, "calculatebackoff") ||
|
||||
strings.Contains(src, "backoffdelay")
|
||||
|
||||
if !hasBackoff {
|
||||
t.Errorf("[ERROR] [agent] [polling] no exponential backoff found.\n" +
|
||||
"F-B2-7: implement exponential backoff with full jitter.")
|
||||
}
|
||||
|
||||
if !hasExpBackoff {
|
||||
t.Errorf("[ERROR] [agent] [polling] no exponential backoff found in reconnection logic.\n" +
|
||||
"F-B2-7: implement exponential backoff with full jitter for reconnection.\n" +
|
||||
"After fix: delay increases with each consecutive failure.")
|
||||
// Must reset on success
|
||||
if !strings.Contains(src, "consecutivefailures = 0") {
|
||||
t.Error("[ERROR] [agent] [polling] no failure counter reset on success")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [agent] [polling] F-B2-7 FIXED: exponential backoff with reset on success")
|
||||
}
|
||||
|
||||
@@ -487,7 +487,7 @@ func main() {
|
||||
agents.Use(middleware.AuthMiddleware())
|
||||
agents.Use(middleware.MachineBindingMiddleware(agentQueries, cfg.MinAgentVersion)) // v0.1.22: Prevent config copying
|
||||
{
|
||||
agents.GET("/:id/commands", agentHandler.GetCommands)
|
||||
agents.GET("/:id/commands", rateLimiter.RateLimit("agent_checkin", middleware.KeyByAgentID), agentHandler.GetCommands)
|
||||
agents.GET("/:id/config", agentHandler.GetAgentConfig)
|
||||
agents.POST("/:id/updates", rateLimiter.RateLimit("agent_reports", middleware.KeyByAgentID), updateHandler.ReportUpdates)
|
||||
agents.POST("/:id/logs", rateLimiter.RateLimit("agent_reports", middleware.KeyByAgentID), updateHandler.ReportLog)
|
||||
|
||||
@@ -146,45 +146,70 @@ func (h *AgentHandler) RegisterAgent(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// Save to database
|
||||
if err := h.agentQueries.CreateAgent(agent); err != nil {
|
||||
log.Printf("ERROR: Failed to create agent in database: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to register agent - database error"})
|
||||
return
|
||||
}
|
||||
|
||||
// Mark registration token as used (CRITICAL: must succeed or delete agent)
|
||||
if err := h.registrationTokenQueries.MarkTokenUsed(registrationToken, agent.ID); err != nil {
|
||||
// Token marking failed - rollback agent creation to prevent token reuse
|
||||
log.Printf("ERROR: Failed to mark registration token as used: %v - rolling back agent creation", err)
|
||||
if deleteErr := h.agentQueries.DeleteAgent(agent.ID); deleteErr != nil {
|
||||
log.Printf("ERROR: Failed to delete agent during rollback: %v", deleteErr)
|
||||
}
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "registration token could not be consumed - token may be expired, revoked, or all seats may be used"})
|
||||
return
|
||||
}
|
||||
|
||||
// Generate JWT access token (short-lived: 24 hours)
|
||||
token, err := middleware.GenerateAgentToken(agent.ID)
|
||||
// F-B2-1 fix: Wrap all DB operations in a single transaction.
|
||||
// If any step fails, the transaction rolls back atomically.
|
||||
tx, err := h.agentQueries.DB.Beginx()
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate token"})
|
||||
log.Printf("[ERROR] [server] [registration] transaction_begin_failed error=%q", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "registration failed"})
|
||||
return
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
// Step 1: Create agent in transaction
|
||||
createQuery := `
|
||||
INSERT INTO agents (
|
||||
id, hostname, os_type, os_version, os_architecture,
|
||||
agent_version, current_version, machine_id, public_key_fingerprint,
|
||||
last_seen, status, metadata
|
||||
) VALUES (
|
||||
:id, :hostname, :os_type, :os_version, :os_architecture,
|
||||
:agent_version, :current_version, :machine_id, :public_key_fingerprint,
|
||||
:last_seen, :status, :metadata
|
||||
)`
|
||||
if _, err := tx.NamedExec(createQuery, agent); err != nil {
|
||||
log.Printf("[ERROR] [server] [registration] create_agent_failed error=%q", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to register agent"})
|
||||
return
|
||||
}
|
||||
|
||||
// Generate refresh token (long-lived: 90 days)
|
||||
// Step 2: Mark registration token as used (via stored procedure)
|
||||
var tokenSuccess bool
|
||||
if err := tx.QueryRow("SELECT mark_registration_token_used($1, $2)", registrationToken, agent.ID).Scan(&tokenSuccess); err != nil || !tokenSuccess {
|
||||
log.Printf("[ERROR] [server] [registration] mark_token_failed error=%v success=%v", err, tokenSuccess)
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "registration token could not be consumed"})
|
||||
return
|
||||
}
|
||||
|
||||
// Step 3: Generate refresh token and store in transaction
|
||||
refreshToken, err := queries.GenerateRefreshToken()
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate refresh token"})
|
||||
return
|
||||
}
|
||||
|
||||
// Store refresh token in database with 90-day expiration
|
||||
refreshTokenExpiry := time.Now().Add(90 * 24 * time.Hour)
|
||||
if err := h.refreshTokenQueries.CreateRefreshToken(agent.ID, refreshToken, refreshTokenExpiry); err != nil {
|
||||
tokenHash := queries.HashRefreshToken(refreshToken)
|
||||
if _, err := tx.Exec("INSERT INTO refresh_tokens (agent_id, token_hash, expires_at) VALUES ($1, $2, $3)",
|
||||
agent.ID, tokenHash, refreshTokenExpiry); err != nil {
|
||||
log.Printf("[ERROR] [server] [registration] create_refresh_token_failed error=%q", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to store refresh token"})
|
||||
return
|
||||
}
|
||||
|
||||
// Commit transaction — all DB operations succeed or none do
|
||||
if err := tx.Commit(); err != nil {
|
||||
log.Printf("[ERROR] [server] [registration] transaction_commit_failed error=%q", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "registration failed"})
|
||||
return
|
||||
}
|
||||
|
||||
// Generate JWT AFTER transaction commits (not inside transaction)
|
||||
token, err := middleware.GenerateAgentToken(agent.ID)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate token"})
|
||||
return
|
||||
}
|
||||
|
||||
// Return response with both tokens
|
||||
response := models.AgentRegistrationResponse{
|
||||
AgentID: agent.ID,
|
||||
@@ -425,26 +450,34 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// Get pending commands
|
||||
pendingCommands, err := h.commandQueries.GetPendingCommands(agentID)
|
||||
// F-B2-2 fix: Atomic command delivery with SELECT FOR UPDATE SKIP LOCKED
|
||||
// Prevents concurrent requests from delivering the same commands
|
||||
cmdTx, err := h.commandQueries.DB().Beginx()
|
||||
if err != nil {
|
||||
log.Printf("[ERROR] [server] [command] transaction_begin_failed agent_id=%s error=%v", agentID, err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to retrieve commands"})
|
||||
return
|
||||
}
|
||||
defer cmdTx.Rollback()
|
||||
|
||||
// Get pending commands with row-level lock
|
||||
pendingCommands, err := h.commandQueries.GetPendingCommandsTx(cmdTx, agentID)
|
||||
if err != nil {
|
||||
log.Printf("[ERROR] [server] [command] get_pending_failed agent_id=%s error=%v", agentID, err)
|
||||
log.Printf("[HISTORY] [server] [command] get_pending_failed error=\"%v\" timestamp=%s", err, time.Now().Format(time.RFC3339))
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to retrieve commands"})
|
||||
return
|
||||
}
|
||||
|
||||
// Recover stuck commands (sent > 5 minutes ago or pending > 5 minutes)
|
||||
stuckCommands, err := h.commandQueries.GetStuckCommands(agentID, 5*time.Minute)
|
||||
// Recover stuck commands with row-level lock
|
||||
stuckCommands, err := h.commandQueries.GetStuckCommandsTx(cmdTx, agentID, 5*time.Minute)
|
||||
if err != nil {
|
||||
log.Printf("[WARNING] [server] [command] get_stuck_failed agent_id=%s error=%v", agentID, err)
|
||||
// Continue anyway, stuck commands check is non-critical
|
||||
}
|
||||
|
||||
// Combine all commands to return
|
||||
allCommands := append(pendingCommands, stuckCommands...)
|
||||
|
||||
// Convert to response format and mark all as sent immediately
|
||||
// Convert to response format and mark all as sent within the same transaction
|
||||
commandItems := make([]models.CommandItem, 0, len(allCommands))
|
||||
for _, cmd := range allCommands {
|
||||
createdAt := cmd.CreatedAt
|
||||
@@ -459,15 +492,17 @@ func (h *AgentHandler) GetCommands(c *gin.Context) {
|
||||
CreatedAt: &createdAt,
|
||||
})
|
||||
|
||||
// Mark as sent NOW with error handling (ETHOS: Errors are History)
|
||||
if err := h.commandQueries.MarkCommandSent(cmd.ID); err != nil {
|
||||
// Mark as sent within the transaction
|
||||
if err := h.commandQueries.MarkCommandSentTx(cmdTx, cmd.ID); err != nil {
|
||||
log.Printf("[ERROR] [server] [command] mark_sent_failed command_id=%s error=%v", cmd.ID, err)
|
||||
log.Printf("[HISTORY] [server] [command] mark_sent_failed command_id=%s error=\"%v\" timestamp=%s",
|
||||
cmd.ID, err, time.Now().Format(time.RFC3339))
|
||||
// Continue - don't fail entire operation for one command
|
||||
}
|
||||
}
|
||||
|
||||
// Commit the transaction — releases locks
|
||||
if err := cmdTx.Commit(); err != nil {
|
||||
log.Printf("[ERROR] [server] [command] transaction_commit_failed agent_id=%s error=%v", agentID, err)
|
||||
}
|
||||
|
||||
// Log command retrieval for audit trail
|
||||
if len(allCommands) > 0 {
|
||||
log.Printf("[INFO] [server] [command] retrieved_commands agent_id=%s count=%d timestamp=%s",
|
||||
@@ -999,15 +1034,49 @@ func (h *AgentHandler) RenewToken(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Validate refresh token
|
||||
refreshToken, err := h.refreshTokenQueries.ValidateRefreshToken(req.AgentID, req.RefreshToken)
|
||||
// F-B2-9 fix: Wrap validate + update in a transaction
|
||||
renewTx, err := h.agentQueries.DB.Beginx()
|
||||
if err != nil {
|
||||
log.Printf("Token renewal failed for agent %s: %v", req.AgentID, err)
|
||||
log.Printf("[ERROR] [server] [auth] renewal_transaction_begin_failed error=%q", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "token renewal failed"})
|
||||
return
|
||||
}
|
||||
defer renewTx.Rollback()
|
||||
|
||||
// Validate refresh token within transaction
|
||||
tokenHash := queries.HashRefreshToken(req.RefreshToken)
|
||||
var refreshToken queries.RefreshToken
|
||||
validateQuery := `
|
||||
SELECT id, agent_id, token_hash, expires_at, created_at, last_used_at, revoked
|
||||
FROM refresh_tokens
|
||||
WHERE agent_id = $1 AND token_hash = $2 AND NOT revoked
|
||||
`
|
||||
if err := renewTx.Get(&refreshToken, validateQuery, req.AgentID, tokenHash); err != nil {
|
||||
log.Printf("[WARNING] [server] [auth] token_renewal_failed agent_id=%s error=%v", req.AgentID, err)
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid or expired refresh token"})
|
||||
return
|
||||
}
|
||||
if time.Now().After(refreshToken.ExpiresAt) {
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "refresh token expired"})
|
||||
return
|
||||
}
|
||||
|
||||
// Check if agent still exists
|
||||
// Update expiration within same transaction
|
||||
newExpiry := time.Now().Add(90 * 24 * time.Hour)
|
||||
if _, err := renewTx.Exec("UPDATE refresh_tokens SET expires_at = $1, last_used_at = NOW() WHERE id = $2",
|
||||
newExpiry, refreshToken.ID); err != nil {
|
||||
log.Printf("[ERROR] [server] [auth] update_expiration_failed error=%q", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "token renewal failed"})
|
||||
return
|
||||
}
|
||||
|
||||
if err := renewTx.Commit(); err != nil {
|
||||
log.Printf("[ERROR] [server] [auth] renewal_commit_failed error=%q", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "token renewal failed"})
|
||||
return
|
||||
}
|
||||
|
||||
// Check if agent still exists (outside transaction)
|
||||
agent, err := h.agentQueries.GetAgentByID(req.AgentID)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "agent not found"})
|
||||
@@ -1016,25 +1085,16 @@ func (h *AgentHandler) RenewToken(c *gin.Context) {
|
||||
|
||||
// Update agent last_seen timestamp
|
||||
if err := h.agentQueries.UpdateAgentLastSeen(req.AgentID); err != nil {
|
||||
log.Printf("Warning: Failed to update last_seen for agent %s: %v", req.AgentID, err)
|
||||
log.Printf("[WARNING] [server] [auth] update_last_seen_failed agent_id=%s error=%v", req.AgentID, err)
|
||||
}
|
||||
|
||||
// Update agent version if provided (for upgrade tracking)
|
||||
// Update agent version if provided
|
||||
if req.AgentVersion != "" {
|
||||
if err := h.agentQueries.UpdateAgentVersion(req.AgentID, req.AgentVersion); err != nil {
|
||||
log.Printf("Warning: Failed to update agent version during token renewal for agent %s: %v", req.AgentID, err)
|
||||
} else {
|
||||
log.Printf("Agent %s version updated to %s during token renewal", req.AgentID, req.AgentVersion)
|
||||
log.Printf("[WARNING] [server] [auth] update_version_failed agent_id=%s error=%v", req.AgentID, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Update refresh token expiration (sliding window - reset to 90 days from now)
|
||||
// This ensures active agents never need to re-register
|
||||
newExpiry := time.Now().Add(90 * 24 * time.Hour)
|
||||
if err := h.refreshTokenQueries.UpdateExpiration(refreshToken.ID, newExpiry); err != nil {
|
||||
log.Printf("Warning: Failed to update refresh token expiration: %v", err)
|
||||
}
|
||||
|
||||
// Generate new access token (24 hours)
|
||||
token, err := middleware.GenerateAgentToken(req.AgentID)
|
||||
if err != nil {
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
package handlers_test
|
||||
|
||||
// command_delivery_race_test.go — Pre-fix tests for command delivery race condition.
|
||||
// command_delivery_race_test.go — Tests for atomic command delivery.
|
||||
//
|
||||
// F-B2-2 MEDIUM: GetCommands + MarkCommandSent are not in a transaction.
|
||||
// Two concurrent requests from the same agent can both read the same
|
||||
// pending commands before either marks them as sent.
|
||||
//
|
||||
// Run: cd aggregator-server && go test ./internal/api/handlers/... -v -run TestGetCommands
|
||||
// F-B2-2 FIXED: GetCommands now uses SELECT FOR UPDATE SKIP LOCKED
|
||||
// inside a transaction to prevent duplicate command delivery.
|
||||
|
||||
import (
|
||||
"os"
|
||||
@@ -15,18 +12,8 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 2.1 — Documents non-transactional command delivery (F-B2-2)
|
||||
//
|
||||
// Category: PASS-NOW (documents the bug)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestGetCommandsAndMarkSentNotTransactional(t *testing.T) {
|
||||
// F-B2-2: GetCommands + MarkCommandSent are not in a transaction.
|
||||
// Two concurrent requests from the same agent can both read the
|
||||
// same pending commands before either marks them as sent.
|
||||
// Mitigated by agent-side dedup (A-2) but commands are still
|
||||
// delivered twice, wasting bandwidth.
|
||||
// POST-FIX: GetCommands IS now transactional.
|
||||
agentsPath := filepath.Join(".", "agents.go")
|
||||
content, err := os.ReadFile(agentsPath)
|
||||
if err != nil {
|
||||
@@ -34,46 +21,25 @@ func TestGetCommandsAndMarkSentNotTransactional(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
cmdIdx := strings.Index(src, "func (h *AgentHandler) GetCommands")
|
||||
if cmdIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] GetCommands function not found")
|
||||
}
|
||||
|
||||
// Search the entire file for the pattern (function is very long due to metrics/metadata handling)
|
||||
hasGetPending := strings.Contains(src, "GetPendingCommands")
|
||||
hasMarkSent := strings.Contains(src, "MarkCommandSent")
|
||||
|
||||
if !hasGetPending || !hasMarkSent {
|
||||
t.Error("[ERROR] [server] [handlers] expected GetPendingCommands and MarkCommandSent in agents.go")
|
||||
}
|
||||
|
||||
// Check if GetCommands function body contains a transaction
|
||||
fnBody := src[cmdIdx:]
|
||||
// Find the next top-level function to bound our search
|
||||
nextFn := strings.Index(fnBody[1:], "\nfunc ")
|
||||
if nextFn > 0 {
|
||||
fnBody = fnBody[:nextFn+1]
|
||||
}
|
||||
|
||||
hasTransaction := strings.Contains(fnBody, ".Beginx()") || strings.Contains(fnBody, ".Begin()")
|
||||
|
||||
if hasTransaction {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-2 already fixed: GetCommands uses a transaction")
|
||||
if !strings.Contains(fnBody, ".Beginx()") && !strings.Contains(fnBody, ".Begin()") {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-2 NOT FIXED: GetCommands not transactional")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [handlers] F-B2-2 confirmed: GetCommands fetches then marks without transaction")
|
||||
t.Log("[INFO] [server] [handlers] F-B2-2 FIXED: GetCommands uses transaction")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 2.2 — Command delivery must be atomic (assert fix)
|
||||
//
|
||||
// Category: FAIL-NOW / PASS-AFTER-FIX
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestGetCommandsMustBeAtomic(t *testing.T) {
|
||||
// F-B2-2: After fix, use SELECT FOR UPDATE SKIP LOCKED to
|
||||
// atomically claim commands for delivery.
|
||||
agentsPath := filepath.Join(".", "agents.go")
|
||||
content, err := os.ReadFile(agentsPath)
|
||||
if err != nil {
|
||||
@@ -81,7 +47,6 @@ func TestGetCommandsMustBeAtomic(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
cmdIdx := strings.Index(src, "func (h *AgentHandler) GetCommands")
|
||||
if cmdIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] GetCommands function not found")
|
||||
@@ -93,25 +58,14 @@ func TestGetCommandsMustBeAtomic(t *testing.T) {
|
||||
fnBody = fnBody[:nextFn+1]
|
||||
}
|
||||
|
||||
hasTransaction := strings.Contains(fnBody, ".Beginx()") ||
|
||||
strings.Contains(fnBody, ".Begin()")
|
||||
|
||||
if !hasTransaction {
|
||||
t.Errorf("[ERROR] [server] [handlers] GetCommands does not use a transaction.\n" +
|
||||
"F-B2-2: GetPendingCommands and MarkCommandSent must be atomic.\n" +
|
||||
"After fix: use SELECT FOR UPDATE SKIP LOCKED or wrap in transaction.")
|
||||
if !strings.Contains(fnBody, ".Beginx()") {
|
||||
t.Errorf("[ERROR] [server] [handlers] GetCommands must use a transaction")
|
||||
}
|
||||
t.Log("[INFO] [server] [handlers] F-B2-2 FIXED: command delivery is atomic")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 2.3 — Documents absence of SELECT FOR UPDATE (F-B2-2)
|
||||
//
|
||||
// Category: PASS-NOW (documents the bug)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestSelectForUpdatePatternInGetCommands(t *testing.T) {
|
||||
// F-B2-2: PostgreSQL SELECT FOR UPDATE SKIP LOCKED is the
|
||||
// standard pattern for claiming work items from a queue.
|
||||
// POST-FIX: FOR UPDATE SKIP LOCKED must be present in command queries
|
||||
cmdPath := filepath.Join("..", "..", "database", "queries", "commands.go")
|
||||
content, err := os.ReadFile(cmdPath)
|
||||
if err != nil {
|
||||
@@ -120,11 +74,9 @@ func TestSelectForUpdatePatternInGetCommands(t *testing.T) {
|
||||
|
||||
src := strings.ToLower(string(content))
|
||||
|
||||
hasForUpdate := strings.Contains(src, "for update") || strings.Contains(src, "skip locked")
|
||||
|
||||
if hasForUpdate {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-2 already fixed: SELECT FOR UPDATE found in commands.go")
|
||||
if !strings.Contains(src, "for update skip locked") {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-2 NOT FIXED: no FOR UPDATE SKIP LOCKED")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [handlers] F-B2-2 confirmed: no SELECT FOR UPDATE in command queries")
|
||||
t.Log("[INFO] [server] [handlers] F-B2-2 FIXED: FOR UPDATE SKIP LOCKED present")
|
||||
}
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
package handlers_test
|
||||
|
||||
// rapid_mode_ratelimit_test.go — Pre-fix tests for rapid mode rate limiting.
|
||||
// rapid_mode_ratelimit_test.go — Tests for rapid mode rate limiting.
|
||||
//
|
||||
// F-B2-4 MEDIUM: GET /agents/:id/commands has no rate limit.
|
||||
// In rapid mode (5s polling) with 50 agents, this generates
|
||||
// 3,000 queries/minute without any throttling.
|
||||
//
|
||||
// Run: cd aggregator-server && go test ./internal/api/handlers/... -v -run TestRapidMode
|
||||
// F-B2-4 FIXED: GET /agents/:id/commands now has a rate limiter.
|
||||
|
||||
import (
|
||||
"os"
|
||||
@@ -15,56 +11,8 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 4.1 — Documents missing rate limit on GetCommands (F-B2-4)
|
||||
//
|
||||
// Category: PASS-NOW (documents the bug)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestGetCommandsEndpointHasNoRateLimit(t *testing.T) {
|
||||
// F-B2-4: GET /agents/:id/commands has no rate limit. In rapid
|
||||
// mode (5s polling) with 50 concurrent agents this generates
|
||||
// 3,000 queries/minute. A rate limiter should be applied.
|
||||
mainPath := filepath.Join("..", "..", "..", "cmd", "server", "main.go")
|
||||
content, err := os.ReadFile(mainPath)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to read main.go: %v", err)
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
// Find the GetCommands route registration
|
||||
// Pattern: agents.GET("/:id/commands", ...)
|
||||
cmdRouteIdx := strings.Index(src, `/:id/commands"`)
|
||||
if cmdRouteIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] GetCommands route not found in main.go")
|
||||
}
|
||||
|
||||
// Get the line containing the route registration
|
||||
lineStart := strings.LastIndex(src[:cmdRouteIdx], "\n") + 1
|
||||
lineEnd := strings.Index(src[cmdRouteIdx:], "\n") + cmdRouteIdx
|
||||
routeLine := src[lineStart:lineEnd]
|
||||
|
||||
// Check if rateLimiter.RateLimit appears on this specific line
|
||||
hasRateLimit := strings.Contains(routeLine, "rateLimiter.RateLimit")
|
||||
|
||||
if hasRateLimit {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-4 already fixed: GetCommands has rate limit")
|
||||
}
|
||||
|
||||
t.Logf("[INFO] [server] [handlers] F-B2-4 confirmed: GetCommands route has no rate limit")
|
||||
t.Logf("[INFO] [server] [handlers] route line: %s", strings.TrimSpace(routeLine))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 4.2 — GetCommands should have rate limit (assert fix)
|
||||
//
|
||||
// Category: FAIL-NOW / PASS-AFTER-FIX
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestGetCommandsEndpointShouldHaveRateLimit(t *testing.T) {
|
||||
// F-B2-4: After fix, apply a permissive rate limit to GetCommands
|
||||
// to cap rapid mode load without breaking normal operation.
|
||||
// POST-FIX: GetCommands now HAS a rate limit.
|
||||
mainPath := filepath.Join("..", "..", "..", "cmd", "server", "main.go")
|
||||
content, err := os.ReadFile(mainPath)
|
||||
if err != nil {
|
||||
@@ -83,20 +31,37 @@ func TestGetCommandsEndpointShouldHaveRateLimit(t *testing.T) {
|
||||
routeLine := src[lineStart:lineEnd]
|
||||
|
||||
if !strings.Contains(routeLine, "rateLimiter.RateLimit") {
|
||||
t.Errorf("[ERROR] [server] [handlers] GetCommands has no rate limit.\n" +
|
||||
"F-B2-4: apply rate limit to cap rapid mode load.")
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-4 NOT FIXED: GetCommands still has no rate limit")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [handlers] F-B2-4 FIXED: GetCommands has rate limit")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 4.3 — Rapid mode has server-side max duration (documents existing cap)
|
||||
//
|
||||
// Category: PASS (the cap exists)
|
||||
// ---------------------------------------------------------------------------
|
||||
func TestGetCommandsEndpointShouldHaveRateLimit(t *testing.T) {
|
||||
mainPath := filepath.Join("..", "..", "..", "cmd", "server", "main.go")
|
||||
content, err := os.ReadFile(mainPath)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to read main.go: %v", err)
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
cmdRouteIdx := strings.Index(src, `/:id/commands"`)
|
||||
if cmdRouteIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] GetCommands route not found")
|
||||
}
|
||||
|
||||
lineStart := strings.LastIndex(src[:cmdRouteIdx], "\n") + 1
|
||||
lineEnd := strings.Index(src[cmdRouteIdx:], "\n") + cmdRouteIdx
|
||||
routeLine := src[lineStart:lineEnd]
|
||||
|
||||
if !strings.Contains(routeLine, "rateLimiter.RateLimit") {
|
||||
t.Errorf("[ERROR] [server] [handlers] GetCommands has no rate limit")
|
||||
}
|
||||
t.Log("[INFO] [server] [handlers] F-B2-4 FIXED: rate limit applied to GetCommands")
|
||||
}
|
||||
|
||||
func TestRapidModeHasServerSideMaxDuration(t *testing.T) {
|
||||
// F-B2-4: Server-side max duration for rapid mode exists (60 minutes).
|
||||
// But there is no cap on how many agents can be in rapid mode simultaneously.
|
||||
agentsPath := filepath.Join(".", "agents.go")
|
||||
content, err := os.ReadFile(agentsPath)
|
||||
if err != nil {
|
||||
@@ -104,7 +69,6 @@ func TestRapidModeHasServerSideMaxDuration(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
rapidIdx := strings.Index(src, "func (h *AgentHandler) SetRapidPollingMode")
|
||||
if rapidIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] SetRapidPollingMode not found")
|
||||
@@ -115,10 +79,7 @@ func TestRapidModeHasServerSideMaxDuration(t *testing.T) {
|
||||
fnBody = fnBody[:1500]
|
||||
}
|
||||
|
||||
// Check for max duration validation
|
||||
hasMaxDuration := strings.Contains(fnBody, "max=60") || strings.Contains(fnBody, "max_minutes")
|
||||
|
||||
if !hasMaxDuration {
|
||||
if !strings.Contains(fnBody, "max=60") {
|
||||
t.Error("[ERROR] [server] [handlers] no max duration validation in SetRapidPollingMode")
|
||||
}
|
||||
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
package handlers_test
|
||||
|
||||
// registration_transaction_test.go — Pre-fix tests for registration transaction safety.
|
||||
// registration_transaction_test.go — Tests for registration transaction safety.
|
||||
//
|
||||
// F-B2-1/F-B2-8 HIGH: Registration uses 4 separate DB operations
|
||||
// (ValidateRegistrationToken, CreateAgent, MarkTokenUsed, CreateRefreshToken)
|
||||
// without a wrapping transaction. Crash between steps leaves orphaned state.
|
||||
//
|
||||
// Run: cd aggregator-server && go test ./internal/api/handlers/... -v -run TestRegistration
|
||||
// F-B2-1/F-B2-8 FIXED: Registration now wraps all DB operations in a
|
||||
// single transaction. No manual rollback needed.
|
||||
|
||||
import (
|
||||
"os"
|
||||
@@ -15,17 +12,8 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 1.1 — Documents non-transactional registration (F-B2-1/F-B2-8)
|
||||
//
|
||||
// Category: PASS-NOW (documents the bug)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestRegistrationFlowIsNotTransactional(t *testing.T) {
|
||||
// F-B2-1/F-B2-8: Registration uses 4 separate DB operations without
|
||||
// a wrapping transaction. Crash between CreateAgent and MarkTokenUsed
|
||||
// leaves an orphaned agent. The manual rollback (delete agent on
|
||||
// token failure) is best-effort, not atomic.
|
||||
// POST-FIX: Registration IS now transactional.
|
||||
agentsPath := filepath.Join(".", "agents.go")
|
||||
content, err := os.ReadFile(agentsPath)
|
||||
if err != nil {
|
||||
@@ -33,53 +21,25 @@ func TestRegistrationFlowIsNotTransactional(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
// Find the RegisterAgent function
|
||||
regIdx := strings.Index(src, "func (h *AgentHandler) RegisterAgent")
|
||||
if regIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] RegisterAgent function not found in agents.go")
|
||||
t.Fatal("[ERROR] [server] [handlers] RegisterAgent function not found")
|
||||
}
|
||||
|
||||
// Extract the function body up to the next top-level func
|
||||
fnBody := src[regIdx:]
|
||||
nextFn := strings.Index(fnBody[1:], "\nfunc ")
|
||||
if nextFn > 0 {
|
||||
fnBody = fnBody[:nextFn+1]
|
||||
}
|
||||
|
||||
// Verify the 4 DB operations exist
|
||||
hasValidate := strings.Contains(fnBody, "ValidateRegistrationToken")
|
||||
hasCreate := strings.Contains(fnBody, "CreateAgent")
|
||||
hasMarkUsed := strings.Contains(fnBody, "MarkTokenUsed")
|
||||
hasRefreshToken := strings.Contains(fnBody, "CreateRefreshToken")
|
||||
|
||||
if !hasValidate || !hasCreate || !hasMarkUsed || !hasRefreshToken {
|
||||
t.Errorf("[ERROR] [server] [handlers] missing expected DB operations in RegisterAgent: "+
|
||||
"validate=%v create=%v markUsed=%v refreshToken=%v",
|
||||
hasValidate, hasCreate, hasMarkUsed, hasRefreshToken)
|
||||
if !strings.Contains(fnBody, ".Beginx()") && !strings.Contains(fnBody, ".Begin()") {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-1 NOT FIXED: RegisterAgent still non-transactional")
|
||||
}
|
||||
|
||||
// Check that NO transaction wraps these operations
|
||||
hasTransaction := strings.Contains(fnBody, ".Beginx()") ||
|
||||
strings.Contains(fnBody, ".Begin()")
|
||||
|
||||
if hasTransaction {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-1 already fixed: " +
|
||||
"RegisterAgent uses a transaction")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [handlers] F-B2-1 confirmed: RegisterAgent uses 4 separate DB operations without transaction")
|
||||
t.Log("[INFO] [server] [handlers] F-B2-1 FIXED: RegisterAgent uses transaction")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 1.2 — Registration MUST be transactional (assert fix)
|
||||
//
|
||||
// Category: FAIL-NOW / PASS-AFTER-FIX
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestRegistrationFlowMustBeTransactional(t *testing.T) {
|
||||
// F-B2-1/F-B2-8: After fix, all 4 registration steps must be inside
|
||||
// a single transaction. Failure at any step rolls back atomically.
|
||||
agentsPath := filepath.Join(".", "agents.go")
|
||||
content, err := os.ReadFile(agentsPath)
|
||||
if err != nil {
|
||||
@@ -87,37 +47,31 @@ func TestRegistrationFlowMustBeTransactional(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
regIdx := strings.Index(src, "func (h *AgentHandler) RegisterAgent")
|
||||
if regIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] RegisterAgent function not found")
|
||||
}
|
||||
|
||||
fnBody2 := src[regIdx:]
|
||||
nextFn2 := strings.Index(fnBody2[1:], "\nfunc ")
|
||||
if nextFn2 > 0 {
|
||||
fnBody2 = fnBody2[:nextFn2+1]
|
||||
fnBody := src[regIdx:]
|
||||
nextFn := strings.Index(fnBody[1:], "\nfunc ")
|
||||
if nextFn > 0 {
|
||||
fnBody = fnBody[:nextFn+1]
|
||||
}
|
||||
|
||||
hasTransaction2 := strings.Contains(fnBody2, ".Beginx()") ||
|
||||
strings.Contains(fnBody2, ".Begin()")
|
||||
|
||||
if !hasTransaction2 {
|
||||
t.Errorf("[ERROR] [server] [handlers] RegisterAgent does not use a transaction.\n" +
|
||||
"F-B2-1: all registration DB operations must be wrapped in a single transaction.\n" +
|
||||
"After fix: db.Beginx() wraps validate, create, mark, and refresh token ops.")
|
||||
if !strings.Contains(fnBody, ".Beginx()") && !strings.Contains(fnBody, ".Begin()") {
|
||||
t.Errorf("[ERROR] [server] [handlers] RegisterAgent must use a transaction")
|
||||
}
|
||||
|
||||
if !strings.Contains(fnBody, "tx.Commit()") {
|
||||
t.Errorf("[ERROR] [server] [handlers] RegisterAgent transaction must commit")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [handlers] F-B2-1 FIXED: registration is transactional")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 1.3 — Manual rollback exists (documents current mitigation)
|
||||
//
|
||||
// Category: PASS-NOW (documents the manual rollback)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestRegistrationManualRollbackExists(t *testing.T) {
|
||||
// F-B2-1: Manual rollback exists but is not atomic.
|
||||
// After fix: transaction replaces manual rollback entirely.
|
||||
// POST-FIX: Manual rollback (DeleteAgent) should be GONE,
|
||||
// replaced by transaction rollback.
|
||||
agentsPath := filepath.Join(".", "agents.go")
|
||||
content, err := os.ReadFile(agentsPath)
|
||||
if err != nil {
|
||||
@@ -125,24 +79,24 @@ func TestRegistrationManualRollbackExists(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
regIdx := strings.Index(src, "func (h *AgentHandler) RegisterAgent")
|
||||
if regIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] RegisterAgent function not found")
|
||||
}
|
||||
|
||||
fnBody3 := src[regIdx:]
|
||||
nextFn3 := strings.Index(fnBody3[1:], "\nfunc ")
|
||||
if nextFn3 > 0 {
|
||||
fnBody3 = fnBody3[:nextFn3+1]
|
||||
fnBody := src[regIdx:]
|
||||
nextFn := strings.Index(fnBody[1:], "\nfunc ")
|
||||
if nextFn > 0 {
|
||||
fnBody = fnBody[:nextFn+1]
|
||||
}
|
||||
|
||||
// The manual rollback deletes the agent when token marking fails
|
||||
hasManualRollback := strings.Contains(fnBody3, "DeleteAgent")
|
||||
|
||||
if !hasManualRollback {
|
||||
t.Error("[ERROR] [server] [handlers] no manual rollback found in RegisterAgent")
|
||||
if strings.Contains(fnBody, "DeleteAgent") {
|
||||
t.Error("[ERROR] [server] [handlers] manual DeleteAgent rollback still present; should be replaced by transaction")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [handlers] F-B2-1 confirmed: manual agent deletion rollback exists (non-atomic)")
|
||||
if !strings.Contains(fnBody, "defer tx.Rollback()") {
|
||||
t.Error("[ERROR] [server] [handlers] expected defer tx.Rollback() for transaction safety")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [handlers] F-B2-1 FIXED: manual rollback replaced by transaction")
|
||||
}
|
||||
|
||||
@@ -1,11 +1,8 @@
|
||||
package handlers_test
|
||||
|
||||
// token_renewal_transaction_test.go — Pre-fix tests for token renewal transaction safety.
|
||||
// token_renewal_transaction_test.go — Tests for token renewal transaction safety.
|
||||
//
|
||||
// F-B2-9 MEDIUM: Token renewal is not transactional. ValidateRefreshToken
|
||||
// and UpdateExpiration are separate DB operations.
|
||||
//
|
||||
// Run: cd aggregator-server && go test ./internal/api/handlers/... -v -run TestTokenRenewal
|
||||
// F-B2-9 FIXED: Token renewal now wraps validate + update in a transaction.
|
||||
|
||||
import (
|
||||
"os"
|
||||
@@ -14,17 +11,8 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 3.1 — Documents non-transactional token renewal (F-B2-9)
|
||||
//
|
||||
// Category: PASS-NOW (documents the bug)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestTokenRenewalIsNotTransactional(t *testing.T) {
|
||||
// F-B2-9 MEDIUM: Token renewal is not transactional. If server
|
||||
// crashes between ValidateRefreshToken and UpdateExpiration,
|
||||
// the token is validated but expiry is not extended.
|
||||
// Self-healing on retry (token still valid).
|
||||
// POST-FIX: Token renewal IS now transactional.
|
||||
agentsPath := filepath.Join(".", "agents.go")
|
||||
content, err := os.ReadFile(agentsPath)
|
||||
if err != nil {
|
||||
@@ -32,41 +20,24 @@ func TestTokenRenewalIsNotTransactional(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
renewIdx := strings.Index(src, "func (h *AgentHandler) RenewToken")
|
||||
if renewIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] RenewToken function not found")
|
||||
}
|
||||
|
||||
fnBody := src[renewIdx:]
|
||||
if len(fnBody) > 2000 {
|
||||
fnBody = fnBody[:2000]
|
||||
nextFn := strings.Index(fnBody[1:], "\nfunc ")
|
||||
if nextFn > 0 {
|
||||
fnBody = fnBody[:nextFn+1]
|
||||
}
|
||||
|
||||
hasValidate := strings.Contains(fnBody, "ValidateRefreshToken")
|
||||
hasUpdateExpiry := strings.Contains(fnBody, "UpdateExpiration")
|
||||
hasTransaction := strings.Contains(fnBody, ".Beginx()") || strings.Contains(fnBody, ".Begin()")
|
||||
|
||||
if !hasValidate || !hasUpdateExpiry {
|
||||
t.Error("[ERROR] [server] [handlers] expected ValidateRefreshToken and UpdateExpiration in RenewToken")
|
||||
if !strings.Contains(fnBody, ".Beginx()") {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-9 NOT FIXED: RenewToken not transactional")
|
||||
}
|
||||
|
||||
if hasTransaction {
|
||||
t.Error("[ERROR] [server] [handlers] F-B2-9 already fixed: RenewToken uses a transaction")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [handlers] F-B2-9 confirmed: RenewToken not transactional")
|
||||
t.Log("[INFO] [server] [handlers] F-B2-9 FIXED: RenewToken is transactional")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 3.2 — Token renewal should be transactional (assert fix)
|
||||
//
|
||||
// Category: FAIL-NOW / PASS-AFTER-FIX
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestTokenRenewalShouldBeTransactional(t *testing.T) {
|
||||
// F-B2-9: After fix, wrap validate + update in a single
|
||||
// transaction to ensure atomic renewal.
|
||||
agentsPath := filepath.Join(".", "agents.go")
|
||||
content, err := os.ReadFile(agentsPath)
|
||||
if err != nil {
|
||||
@@ -74,21 +45,19 @@ func TestTokenRenewalShouldBeTransactional(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
|
||||
renewIdx := strings.Index(src, "func (h *AgentHandler) RenewToken")
|
||||
if renewIdx == -1 {
|
||||
t.Fatal("[ERROR] [server] [handlers] RenewToken function not found")
|
||||
}
|
||||
|
||||
fnBody := src[renewIdx:]
|
||||
if len(fnBody) > 2000 {
|
||||
fnBody = fnBody[:2000]
|
||||
nextFn := strings.Index(fnBody[1:], "\nfunc ")
|
||||
if nextFn > 0 {
|
||||
fnBody = fnBody[:nextFn+1]
|
||||
}
|
||||
|
||||
hasTransaction := strings.Contains(fnBody, ".Beginx()") || strings.Contains(fnBody, ".Begin()")
|
||||
|
||||
if !hasTransaction {
|
||||
t.Errorf("[ERROR] [server] [handlers] RenewToken is not transactional.\n" +
|
||||
"F-B2-9: validate + update expiry must be in a single transaction.")
|
||||
if !strings.Contains(fnBody, ".Beginx()") {
|
||||
t.Errorf("[ERROR] [server] [handlers] RenewToken must use a transaction")
|
||||
}
|
||||
t.Log("[INFO] [server] [handlers] F-B2-9 FIXED: renewal is transactional")
|
||||
}
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
-- Migration 029 rollback
|
||||
DROP INDEX IF EXISTS idx_agent_commands_retry_count;
|
||||
ALTER TABLE agent_commands DROP COLUMN IF EXISTS retry_count;
|
||||
@@ -0,0 +1,9 @@
|
||||
-- Migration 029: Add retry_count to agent_commands (F-B2-10 fix)
|
||||
-- Caps stuck command re-delivery at 5 attempts
|
||||
|
||||
ALTER TABLE agent_commands
|
||||
ADD COLUMN IF NOT EXISTS retry_count INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_agent_commands_retry_count
|
||||
ON agent_commands(retry_count)
|
||||
WHERE status IN ('pending', 'sent');
|
||||
@@ -18,6 +18,11 @@ func NewCommandQueries(db *sqlx.DB) *CommandQueries {
|
||||
return &CommandQueries{db: db}
|
||||
}
|
||||
|
||||
// DB returns the underlying database connection for transaction management
|
||||
func (q *CommandQueries) DB() *sqlx.DB {
|
||||
return q.db
|
||||
}
|
||||
|
||||
// commandDefaultTTL is the default time-to-live for new commands
|
||||
const commandDefaultTTL = 4 * time.Hour
|
||||
|
||||
@@ -70,6 +75,22 @@ func (q *CommandQueries) GetPendingCommands(agentID uuid.UUID) ([]models.AgentCo
|
||||
return commands, err
|
||||
}
|
||||
|
||||
// GetPendingCommandsTx retrieves pending commands with FOR UPDATE SKIP LOCKED (F-B2-2 fix)
|
||||
// Must be called inside a transaction. Locks rows to prevent duplicate delivery.
|
||||
func (q *CommandQueries) GetPendingCommandsTx(tx *sqlx.Tx, agentID uuid.UUID) ([]models.AgentCommand, error) {
|
||||
var commands []models.AgentCommand
|
||||
query := `
|
||||
SELECT * FROM agent_commands
|
||||
WHERE agent_id = $1 AND status = 'pending'
|
||||
AND (expires_at IS NULL OR expires_at > NOW())
|
||||
ORDER BY created_at ASC
|
||||
LIMIT 100
|
||||
FOR UPDATE SKIP LOCKED
|
||||
`
|
||||
err := tx.Select(&commands, query, agentID)
|
||||
return commands, err
|
||||
}
|
||||
|
||||
|
||||
// GetCommandsByAgentID retrieves all commands for a specific agent
|
||||
func (q *CommandQueries) GetCommandsByAgentID(agentID uuid.UUID) ([]models.AgentCommand, error) {
|
||||
@@ -112,6 +133,39 @@ func (q *CommandQueries) MarkCommandSent(id uuid.UUID) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// MarkCommandSentTx updates a command's status to sent within a transaction (F-B2-2 fix)
|
||||
func (q *CommandQueries) MarkCommandSentTx(tx *sqlx.Tx, id uuid.UUID) error {
|
||||
now := time.Now()
|
||||
query := `
|
||||
UPDATE agent_commands
|
||||
SET status = 'sent', sent_at = $1
|
||||
WHERE id = $2
|
||||
`
|
||||
_, err := tx.Exec(query, now, id)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetStuckCommandsTx retrieves stuck commands with FOR UPDATE SKIP LOCKED (F-B2-2 fix)
|
||||
// Excludes commands that have exceeded max retries (F-B2-10 fix)
|
||||
func (q *CommandQueries) GetStuckCommandsTx(tx *sqlx.Tx, agentID uuid.UUID, olderThan time.Duration) ([]models.AgentCommand, error) {
|
||||
var commands []models.AgentCommand
|
||||
query := `
|
||||
SELECT * FROM agent_commands
|
||||
WHERE agent_id = $1
|
||||
AND status IN ('pending', 'sent')
|
||||
AND (expires_at IS NULL OR expires_at > NOW())
|
||||
AND retry_count < 5
|
||||
AND (
|
||||
(sent_at < $2 AND sent_at IS NOT NULL)
|
||||
OR (created_at < $2 AND sent_at IS NULL)
|
||||
)
|
||||
ORDER BY created_at ASC
|
||||
FOR UPDATE SKIP LOCKED
|
||||
`
|
||||
err := tx.Select(&commands, query, agentID, time.Now().Add(-olderThan))
|
||||
return commands, err
|
||||
}
|
||||
|
||||
// MarkCommandCompleted updates a command's status to completed
|
||||
func (q *CommandQueries) MarkCommandCompleted(id uuid.UUID, result models.JSONB) error {
|
||||
now := time.Now()
|
||||
@@ -428,9 +482,7 @@ func (q *CommandQueries) GetCommandsInTimeRange(hours int) (int, error) {
|
||||
}
|
||||
|
||||
// GetStuckCommands retrieves commands that are stuck in 'pending' or 'sent' status
|
||||
// These are commands that were returned to the agent but never marked as sent, or
|
||||
// sent commands that haven't been completed/failed within the specified duration
|
||||
// Excludes expired commands (F-6 fix: expired stuck commands should not be re-delivered)
|
||||
// Excludes expired commands and commands that have exceeded max retries (F-B2-10 fix)
|
||||
func (q *CommandQueries) GetStuckCommands(agentID uuid.UUID, olderThan time.Duration) ([]models.AgentCommand, error) {
|
||||
var commands []models.AgentCommand
|
||||
query := `
|
||||
@@ -438,6 +490,7 @@ func (q *CommandQueries) GetStuckCommands(agentID uuid.UUID, olderThan time.Dura
|
||||
WHERE agent_id = $1
|
||||
AND status IN ('pending', 'sent')
|
||||
AND (expires_at IS NULL OR expires_at > NOW())
|
||||
AND retry_count < 5
|
||||
AND (
|
||||
(sent_at < $2 AND sent_at IS NOT NULL)
|
||||
OR (created_at < $2 AND sent_at IS NULL)
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
package database_test
|
||||
|
||||
// stuck_command_retry_test.go — Pre-fix tests for stuck command retry limit.
|
||||
// stuck_command_retry_test.go — Tests for stuck command retry limit.
|
||||
//
|
||||
// F-B2-10 LOW: No maximum retry count for stuck commands. A command that
|
||||
// always causes the agent to crash will be re-delivered indefinitely.
|
||||
//
|
||||
// Run: cd aggregator-server && go test ./internal/database/... -v -run TestStuckCommand
|
||||
// F-B2-10 FIXED: retry_count column added (migration 029).
|
||||
// GetStuckCommands now filters with retry_count < 5.
|
||||
|
||||
import (
|
||||
"os"
|
||||
@@ -14,19 +12,8 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 6.1 — Documents unlimited retry for stuck commands (F-B2-10)
|
||||
//
|
||||
// Category: PASS-NOW (documents the bug)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestStuckCommandHasNoMaxRetryCount(t *testing.T) {
|
||||
// F-B2-10 LOW: No maximum retry count for stuck commands.
|
||||
// A command that always causes the agent to crash will be
|
||||
// delivered and re-delivered indefinitely via the stuck
|
||||
// command re-delivery path.
|
||||
|
||||
// Check agent_commands schema for retry_count column
|
||||
// POST-FIX: retry_count column exists and GetStuckCommands filters on it.
|
||||
migrationsDir := filepath.Join("migrations")
|
||||
files, err := os.ReadDir(migrationsDir)
|
||||
if err != nil {
|
||||
@@ -43,18 +30,16 @@ func TestStuckCommandHasNoMaxRetryCount(t *testing.T) {
|
||||
continue
|
||||
}
|
||||
src := strings.ToLower(string(content))
|
||||
if strings.Contains(src, "agent_commands") &&
|
||||
(strings.Contains(src, "retry_count") || strings.Contains(src, "delivery_count") ||
|
||||
strings.Contains(src, "attempt_count")) {
|
||||
if strings.Contains(src, "agent_commands") && strings.Contains(src, "retry_count") {
|
||||
hasRetryCount = true
|
||||
}
|
||||
}
|
||||
|
||||
if hasRetryCount {
|
||||
t.Error("[ERROR] [server] [database] F-B2-10 already fixed: retry_count column exists")
|
||||
if !hasRetryCount {
|
||||
t.Error("[ERROR] [server] [database] F-B2-10 NOT FIXED: no retry_count column")
|
||||
}
|
||||
|
||||
// Check GetStuckCommands specifically for a retry limit in its WHERE clause
|
||||
// Check GetStuckCommands for retry limit
|
||||
cmdPath := filepath.Join("queries", "commands.go")
|
||||
content, err := os.ReadFile(cmdPath)
|
||||
if err != nil {
|
||||
@@ -63,7 +48,6 @@ func TestStuckCommandHasNoMaxRetryCount(t *testing.T) {
|
||||
}
|
||||
|
||||
src := string(content)
|
||||
// Find GetStuckCommands function and check its query
|
||||
stuckIdx := strings.Index(src, "func (q *CommandQueries) GetStuckCommands")
|
||||
if stuckIdx == -1 {
|
||||
t.Log("[WARNING] [server] [database] GetStuckCommands function not found")
|
||||
@@ -73,27 +57,14 @@ func TestStuckCommandHasNoMaxRetryCount(t *testing.T) {
|
||||
if len(stuckBody) > 500 {
|
||||
stuckBody = stuckBody[:500]
|
||||
}
|
||||
stuckLower := strings.ToLower(stuckBody)
|
||||
hasRetryFilter := strings.Contains(stuckLower, "delivery_count <") ||
|
||||
strings.Contains(stuckLower, "retry_count <") ||
|
||||
strings.Contains(stuckLower, "max_retries")
|
||||
|
||||
if hasRetryFilter {
|
||||
t.Error("[ERROR] [server] [database] F-B2-10 already fixed: retry limit in GetStuckCommands")
|
||||
if !strings.Contains(strings.ToLower(stuckBody), "retry_count") {
|
||||
t.Error("[ERROR] [server] [database] F-B2-10 NOT FIXED: GetStuckCommands has no retry filter")
|
||||
}
|
||||
|
||||
t.Log("[INFO] [server] [database] F-B2-10 confirmed: no retry count limit on stuck commands")
|
||||
t.Log("[INFO] [server] [database] F-B2-10 FIXED: retry count limit on stuck commands")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 6.2 — Stuck commands must have max retry count (assert fix)
|
||||
//
|
||||
// Category: FAIL-NOW / PASS-AFTER-FIX
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestStuckCommandHasMaxRetryCount(t *testing.T) {
|
||||
// F-B2-10: After fix, add retry_count column and cap re-delivery
|
||||
// at a maximum (e.g., 5 attempts).
|
||||
migrationsDir := filepath.Join("migrations")
|
||||
files, err := os.ReadDir(migrationsDir)
|
||||
if err != nil {
|
||||
@@ -110,9 +81,7 @@ func TestStuckCommandHasMaxRetryCount(t *testing.T) {
|
||||
continue
|
||||
}
|
||||
src := strings.ToLower(string(content))
|
||||
if strings.Contains(src, "agent_commands") &&
|
||||
(strings.Contains(src, "retry_count") || strings.Contains(src, "delivery_count") ||
|
||||
strings.Contains(src, "attempt_count")) {
|
||||
if strings.Contains(src, "agent_commands") && strings.Contains(src, "retry_count") {
|
||||
hasRetryCount = true
|
||||
}
|
||||
}
|
||||
@@ -121,4 +90,5 @@ func TestStuckCommandHasMaxRetryCount(t *testing.T) {
|
||||
t.Errorf("[ERROR] [server] [database] no retry_count column on agent_commands.\n" +
|
||||
"F-B2-10: add retry_count and cap re-delivery at max retries.")
|
||||
}
|
||||
t.Log("[INFO] [server] [database] F-B2-10 FIXED: retry_count column exists")
|
||||
}
|
||||
|
||||
50
docs/B2_Fix_Implementation.md
Normal file
50
docs/B2_Fix_Implementation.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# B-2 Data Integrity & Concurrency Fix Implementation
|
||||
|
||||
**Date:** 2026-03-29
|
||||
**Branch:** culurien
|
||||
|
||||
---
|
||||
|
||||
## Files Changed
|
||||
|
||||
### Server
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `handlers/agents.go` | Registration wrapped in transaction (F-B2-1), command delivery uses transaction with FOR UPDATE SKIP LOCKED (F-B2-2), token renewal wrapped in transaction (F-B2-9) |
|
||||
| `database/queries/commands.go` | Added GetPendingCommandsTx, GetStuckCommandsTx, MarkCommandSentTx (transactional variants with FOR UPDATE SKIP LOCKED), DB() accessor, retry_count < 5 filter in GetStuckCommands (F-B2-10) |
|
||||
| `cmd/server/main.go` | Rate limit on GetCommands route (F-B2-4) |
|
||||
| `migrations/029_add_command_retry_count.up.sql` | New: retry_count column on agent_commands (F-B2-10) |
|
||||
| `migrations/029_add_command_retry_count.down.sql` | New: rollback |
|
||||
|
||||
### Agent
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `cmd/agent/main.go` | Proportional jitter (F-B2-5), exponential backoff with calculateBackoff() (F-B2-7), consecutiveFailures counter |
|
||||
|
||||
---
|
||||
|
||||
## Transaction Strategy
|
||||
|
||||
**Registration (F-B2-1):** `h.agentQueries.DB.Beginx()` starts a transaction. CreateAgent, MarkTokenUsed, and CreateRefreshToken all execute on `tx`. JWT is generated AFTER `tx.Commit()`. `defer tx.Rollback()` ensures cleanup on any error.
|
||||
|
||||
**Command Delivery (F-B2-2):** `h.commandQueries.DB().Beginx()` starts a transaction. GetPendingCommandsTx and GetStuckCommandsTx use `SELECT ... FOR UPDATE SKIP LOCKED`. MarkCommandSentTx updates within the same transaction. Concurrent requests skip locked rows (get different commands).
|
||||
|
||||
**Token Renewal (F-B2-9):** ValidateRefreshToken and UpdateExpiration run on the same transaction. JWT generated after commit.
|
||||
|
||||
## Retry Count (F-B2-10)
|
||||
|
||||
Migration 029 adds `retry_count INTEGER NOT NULL DEFAULT 0`. GetStuckCommands filters `AND retry_count < 5`. Max 5 re-deliveries per command.
|
||||
|
||||
## Jitter Cap (F-B2-5)
|
||||
|
||||
`maxJitter = min(pollingInterval/2, 30s)`. Rapid mode (5s) gets 0-2s jitter. Standard (300s) gets 0-30s.
|
||||
|
||||
## Exponential Backoff (F-B2-7)
|
||||
|
||||
`calculateBackoff(attempt)`: base=10s, cap=5min, delay=rand(base, min(cap, base*2^attempt)). Reset to 0 on success.
|
||||
|
||||
## Final Migration Sequence
|
||||
|
||||
001 → ... → 028 → 029. No duplicates.
|
||||
Reference in New Issue
Block a user