Files
Redflag/aggregator-agent/internal/orchestrator/command_handler.go
jpetree331 f97d4845af feat(security): A-1 Ed25519 key rotation + A-2 replay attack fixes
Complete RedFlag codebase with two major security audit implementations.

== A-1: Ed25519 Key Rotation Support ==

Server:
- SignCommand sets SignedAt timestamp and KeyID on every signature
- signing_keys database table (migration 020) for multi-key rotation
- InitializePrimaryKey registers active key at startup
- /api/v1/public-keys endpoint for rotation-aware agents
- SigningKeyQueries for key lifecycle management

Agent:
- Key-ID-aware verification via CheckKeyRotation
- FetchAndCacheAllActiveKeys for rotation pre-caching
- Cache metadata with TTL and staleness fallback
- SecurityLogger events for key rotation and command signing

== A-2: Replay Attack Fixes (F-1 through F-7) ==

F-5 CRITICAL - RetryCommand now signs via signAndCreateCommand
F-1 HIGH     - v3 format: "{agent_id}:{cmd_id}:{type}:{hash}:{ts}"
F-7 HIGH     - Migration 026: expires_at column with partial index
F-6 HIGH     - GetPendingCommands/GetStuckCommands filter by expires_at
F-2 HIGH     - Agent-side executedIDs dedup map with cleanup
F-4 HIGH     - commandMaxAge reduced from 24h to 4h
F-3 CRITICAL - Old-format commands rejected after 48h via CreatedAt

Verification fixes: migration idempotency (ETHOS #4), log format
compliance (ETHOS #1), stale comments updated.

All 24 tests passing. Docker --no-cache build verified.
See docs/ for full audit reports and deviation log (DEV-001 to DEV-019).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 21:25:47 -04:00

252 lines
8.7 KiB
Go

package orchestrator
import (
"crypto/ed25519"
"fmt"
"log"
"sync"
"time"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/client"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/config"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/crypto"
"github.com/Fimeg/RedFlag/aggregator-agent/internal/logging"
"github.com/google/uuid"
)
// Timing parameters for key refresh and signed-command freshness checks.
const (
	// keyRefreshInterval is the period between proactive re-checks of the
	// server's primary signing key.
	keyRefreshInterval time.Duration = 6 * time.Hour

	// commandMaxAge bounds how old a signed command may be
	// (F-4 fix: lowered from 24h to 4h).
	commandMaxAge time.Duration = 4 * time.Hour

	// commandClockSkew is the tolerated future clock skew on a signed
	// command's timestamp.
	commandClockSkew time.Duration = 5 * time.Minute
)
// CommandHandler gates incoming commands behind signature verification and
// replay deduplication. It must be used through a pointer because it embeds
// mutexes.
type CommandHandler struct {
	// verifier performs signature checks and key-rotation lookups.
	verifier *crypto.CommandVerifier
	// securityLogger receives security events; methods nil-check it before use.
	securityLogger *logging.SecurityLogger

	// keyCache maps key_id -> public key. The empty key_id holds the primary
	// key (backward-compat lookup). Guarded by keyCacheMu.
	keyCache   map[string]ed25519.PublicKey
	keyCacheMu sync.RWMutex

	// executedIDs maps command UUID -> time it was accepted (F-2 fix: dedup).
	// Guarded by executedIDsMu.
	executedIDs   map[string]time.Time
	executedIDsMu sync.Mutex

	// lastKeyRefresh is when the primary key was last re-fetched successfully.
	lastKeyRefresh time.Time
	// logger is the destination for operational log lines.
	logger *log.Logger
}
// CommandSigningConfig carries the agent-side settings for command signing.
type CommandSigningConfig struct {
	// Enabled switches command signature verification on or off.
	Enabled bool `json:"enabled" env:"REDFLAG_AGENT_COMMAND_SIGNING_ENABLED" default:"true"`

	// EnforcementMode names the verification policy (tag default: "strict").
	EnforcementMode string `json:"enforcement_mode" env:"REDFLAG_AGENT_COMMAND_ENFORCEMENT_MODE" default:"strict"`
}
// NewCommandHandler builds a CommandHandler with empty key and dedup caches.
//
// When cfg.CommandSigning.Enabled is set it also tries to load the cached
// primary public key and stores it under the empty key_id. A load failure is
// logged as a warning and does not fail construction; the returned error is
// always nil.
func NewCommandHandler(cfg *config.Config, securityLogger *logging.SecurityLogger, logger *log.Logger) (*CommandHandler, error) {
	h := &CommandHandler{
		verifier:       crypto.NewCommandVerifier(),
		securityLogger: securityLogger,
		keyCache:       make(map[string]ed25519.PublicKey),
		executedIDs:    make(map[string]time.Time),
		logger:         logger,
	}

	// Nothing to pre-load when command signing is switched off.
	if !cfg.CommandSigning.Enabled {
		return h, nil
	}

	primaryKey, loadErr := crypto.LoadCachedPublicKey()
	if loadErr != nil {
		logger.Printf("[WARNING] [agent] [cmd_handler] primary_key_not_cached error=\"%v\"", loadErr)
		return h, nil
	}

	// The empty key_id is the slot used for backward-compat lookups.
	h.keyCacheMu.Lock()
	h.keyCache[""] = primaryKey
	h.keyCacheMu.Unlock()
	logger.Printf("[INFO] [agent] [cmd_handler] primary_public_key_loaded")

	return h, nil
}
// getKeyForCommand resolves the public key that should verify cmd, keyed by
// cmd.KeyID.
//
// The in-memory cache is consulted first. On a miss the lookup is delegated
// to the verifier's CheckKeyRotation, and the resulting key is stored in the
// in-memory cache. When CheckKeyRotation reports the key as new, the event is
// logged and forwarded to the security logger (if one is configured).
func (h *CommandHandler) getKeyForCommand(cmd client.Command, serverURL string) (ed25519.PublicKey, error) {
	id := cmd.KeyID

	// Fast path: in-memory cache.
	h.keyCacheMu.RLock()
	cached, hit := h.keyCache[id]
	h.keyCacheMu.RUnlock()
	if hit {
		return cached, nil
	}

	// Slow path: let the verifier resolve the key.
	resolved, fresh, err := h.verifier.CheckKeyRotation(id, serverURL)
	switch {
	case err != nil:
		return nil, fmt.Errorf("failed to resolve key %q: %w", id, err)
	case fresh:
		h.logger.Printf("[INFO] [agent] [cmd_handler] new_signing_key_cached key_id=%q", id)
		if h.securityLogger != nil {
			h.securityLogger.LogKeyRotationDetected(id)
		}
	}

	// Remember the key for subsequent commands.
	h.keyCacheMu.Lock()
	h.keyCache[id] = resolved
	h.keyCacheMu.Unlock()

	return resolved, nil
}
// ProcessCommand decides whether cmd is accepted, applying replay
// deduplication and signature verification.
//
// Checks run in this order:
//  1. Deduplication (F-2 fix): an ID already present in executedIDs is rejected.
//  2. If cfg.CommandSigning.Enabled is false the command is accepted unverified.
//  3. The signing key is resolved via getKeyForCommand. A failure is an error
//     in "strict" mode; in every other mode it is logged and the command is
//     accepted.
//  4. cfg.CommandSigning.EnforcementMode selects the policy: "strict" rejects
//     unsigned or unverifiable commands, "warning" logs problems but accepts,
//     and any other value skips verification.
//
// It returns nil when the command is accepted and a non-nil error when it is
// rejected. Every accepting path records cmd.ID in executedIDs so that a
// replay is caught by step 1. The agentID parameter is not used by this method.
func (h *CommandHandler) ProcessCommand(cmd client.Command, cfg *config.Config, agentID uuid.UUID) error {
	// F-2 fix: Check deduplication BEFORE verification
	// TODO: persist executedIDs to disk (path: getPublicKeyDir()+
	// "/executed_commands.json") to survive restarts.
	// Current in-memory implementation allows replay of commands
	// issued within commandMaxAge if the agent restarts.
	h.executedIDsMu.Lock()
	execTime, found := h.executedIDs[cmd.ID]
	h.executedIDsMu.Unlock()
	if found {
		h.logger.Printf("[WARNING] [agent] [cmd_handler] duplicate_command_rejected command_id=%q already_executed_at=%v", cmd.ID, execTime)
		if h.securityLogger != nil {
			h.securityLogger.LogCommandVerificationFailure(cmd.ID, fmt.Sprintf("duplicate command rejected, already executed at %v", execTime))
		}
		return fmt.Errorf("duplicate command %s rejected, already executed at %v", cmd.ID, execTime)
	}

	signingCfg := cfg.CommandSigning
	if !signingCfg.Enabled {
		if cmd.Signature != "" {
			h.logger.Printf("[INFO] [agent] [cmd_handler] command_has_signature_but_signing_disabled command_id=%q", cmd.ID)
		}
		h.markExecuted(cmd.ID)
		return nil
	}

	// Resolve the correct public key for this command
	pubKey, err := h.getKeyForCommand(cmd, cfg.ServerURL)
	if err != nil {
		h.logger.Printf("[ERROR] [agent] [cmd_handler] key_resolution_failed command_id=%q error=%q", cmd.ID, err)
		if h.securityLogger != nil {
			h.securityLogger.LogCommandVerificationFailure(cmd.ID, "key resolution failed: "+err.Error())
		}
		if signingCfg.EnforcementMode == "strict" {
			return fmt.Errorf("command verification failed: %w", err)
		}
		// Non-strict modes accept the command despite the failure. It must
		// still enter the dedup set, otherwise the same command ID could be
		// accepted again for as long as key resolution keeps failing.
		h.markExecuted(cmd.ID)
		return nil
	}

	verifyFunc := func() error {
		if cmd.SignedAt != nil {
			// New format: timestamp-aware verification
			return h.verifier.VerifyCommandWithTimestamp(cmd, pubKey, commandMaxAge, commandClockSkew)
		}
		// Old format: no timestamp (backward compat)
		return h.verifier.VerifyCommand(cmd, pubKey)
	}

	switch signingCfg.EnforcementMode {
	case "strict":
		if cmd.Signature == "" {
			h.logger.Printf("[ERROR] [agent] [cmd_handler] command_not_signed command_id=%q", cmd.ID)
			if h.securityLogger != nil {
				h.securityLogger.LogCommandVerificationFailure(cmd.ID, "missing signature")
			}
			return fmt.Errorf("command verification failed: strict enforcement requires signed commands")
		}
		if err := verifyFunc(); err != nil {
			h.logger.Printf("[ERROR] [agent] [cmd_handler] command_verification_failed command_id=%q error=%q", cmd.ID, err)
			if h.securityLogger != nil {
				h.securityLogger.LogCommandVerificationFailure(cmd.ID, err.Error())
			}
			return fmt.Errorf("command verification failed: %w", err)
		}
		h.logger.Printf("[INFO] [agent] [cmd_handler] command_verified command_id=%q", cmd.ID)
		if h.securityLogger != nil {
			h.securityLogger.LogCommandVerificationSuccess(cmd.ID)
		}

	case "warning":
		// Warning mode reports problems but never rejects the command.
		if cmd.Signature == "" {
			h.logger.Printf("[WARNING] [agent] [cmd_handler] unsigned_command_warning_mode command_id=%q", cmd.ID)
		} else if err := verifyFunc(); err != nil {
			h.logger.Printf("[WARNING] [agent] [cmd_handler] verification_failed_warning_mode command_id=%q error=%q", cmd.ID, err)
			if h.securityLogger != nil {
				h.securityLogger.LogCommandVerificationFailure(cmd.ID, err.Error())
			}
		} else if h.securityLogger != nil {
			h.securityLogger.LogCommandVerificationSuccess(cmd.ID)
		}

	default:
		// "disabled" or any other value: skip verification
	}

	// Every path that reaches this point has accepted the command.
	h.markExecuted(cmd.ID)
	return nil
}
// markExecuted stamps cmdID with the current time in the deduplication set
// (F-2 fix). An existing entry for the same ID is overwritten.
func (h *CommandHandler) markExecuted(cmdID string) {
	h.executedIDsMu.Lock()
	defer h.executedIDsMu.Unlock()

	h.executedIDs[cmdID] = time.Now()
}
// CleanupExecutedIDs evicts entries from the dedup set once they are older
// than commandMaxAge + commandClockSkew, and logs how many were removed.
// Should be called when ShouldRefreshKey() fires (every 6h).
//
// The retention window includes commandClockSkew on purpose. ProcessCommand
// passes commandClockSkew to the verifier as the allowed future skew, so a
// command may be accepted with a signing timestamp slightly ahead of the local
// clock. Such a command would still satisfy the commandMaxAge check for up to
// commandClockSkew after an entry retained for only commandMaxAge had been
// evicted, which would reopen a replay window.
// NOTE(review): assumes the verifier measures age from the command's SignedAt
// — confirm against crypto.CommandVerifier.
func (h *CommandHandler) CleanupExecutedIDs() {
	retention := commandMaxAge + commandClockSkew
	cutoff := time.Now().Add(-retention)

	h.executedIDsMu.Lock()
	defer h.executedIDsMu.Unlock()

	evicted := 0
	for id, execTime := range h.executedIDs {
		if execTime.Before(cutoff) {
			// Deleting the current key while ranging over a map is safe in Go.
			delete(h.executedIDs, id)
			evicted++
		}
	}
	if evicted > 0 {
		h.logger.Printf("[INFO] [agent] [cmd_handler] cleanup_executed_ids evicted=%d remaining=%d", evicted, len(h.executedIDs))
	}
}
// RefreshPrimaryKey proactively re-fetches the server's primary key.
// Should be called every keyRefreshInterval to detect rotations early.
//
// On success the fetched key replaces the entry under the empty key_id and
// lastKeyRefresh is set to the current time. Both writes happen while holding
// keyCacheMu, so ShouldRefreshKey can be called from another goroutine without
// racing on lastKeyRefresh. On failure neither field is modified and the fetch
// error is returned wrapped.
func (h *CommandHandler) RefreshPrimaryKey(serverURL string) error {
	h.logger.Printf("[INFO] [agent] [cmd_handler] refreshing_primary_key")
	pubKey, err := crypto.FetchAndCacheServerPublicKey(serverURL)
	if err != nil {
		return fmt.Errorf("failed to refresh primary key: %w", err)
	}

	h.keyCacheMu.Lock()
	h.keyCache[""] = pubKey
	h.lastKeyRefresh = time.Now()
	h.keyCacheMu.Unlock()

	h.logger.Printf("[INFO] [agent] [cmd_handler] primary_key_refreshed")
	return nil
}

// ShouldRefreshKey returns true if enough time has passed to warrant a proactive key refresh.
//
// lastKeyRefresh is read under keyCacheMu's read lock, matching the lock
// RefreshPrimaryKey holds when it writes the field. A handler that has never
// refreshed has a zero lastKeyRefresh.
func (h *CommandHandler) ShouldRefreshKey() bool {
	h.keyCacheMu.RLock()
	last := h.lastKeyRefresh
	h.keyCacheMu.RUnlock()

	return time.Since(last) >= keyRefreshInterval
}
// UpdateServerPublicKey is the older name for refreshing the primary cached
// public key, kept for backward compatibility. It delegates to
// RefreshPrimaryKey and returns its result unchanged.
func (h *CommandHandler) UpdateServerPublicKey(serverURL string) error {
	if err := h.RefreshPrimaryKey(serverURL); err != nil {
		return err
	}
	return nil
}