11 KiB
11 KiB
P4-003: Agent File Management and Migration System
Priority: P4 (Technical Debt)
Source Reference: From analysis of needsfixingbeforepush.md lines 1477-1517 and DEVELOPMENT_TODOS.md lines 1611-1635
Date Identified: 2025-11-12
Problem Description
The agent does not validate that its working files belong to the current agent binary and version. Stale files from previous agent installations interfere with current operations, causing timeouts and data corruption. Mixed directory naming adds confusion and maintenance overhead.
Impact
- Data Corruption: Stale `last_scan.json` files with wrong agent IDs cause parsing timeouts
- Installation Conflicts: No clean migration between agent versions
- Path Inconsistency: Mixed `/var/lib/aggregator` vs `/var/lib/redflag` paths
- Security Risk: Without file validation, nothing prevents potential file poisoning attacks
- Maintenance Burden: Manual cleanup required for corrupted files
Current Issues Identified
1. Stale File Problem
// /var/lib/aggregator/last_scan.json from October 14th
{
"last_scan_time": "2025-10-14T10:19:23.20489739-04:00", // OLD!
"agent_id": "49f9a1e8-66db-4d21-b3f4-f416e0523ed1", // OLD!
"updates": [/* 50,000+ lines causing timeouts */]
}
2. Path Inconsistency
- Old paths: `/var/lib/aggregator`, `/etc/aggregator`
- New paths: `/var/lib/redflag`, `/etc/redflag`
- Mixed usage across codebase
- No standardized migration strategy
3. No Version Validation
- Agent doesn't validate file ownership
- No binary signature validation of working files
- Stale files accumulate and cause issues
- No cleanup mechanisms
Proposed Solution
Implement comprehensive file management and migration system:
1. File Validation and Migration System
// FileManager validates and migrates the agent's on-disk working files.
type FileManager struct {
// CurrentAgentID is compared against the agent ID recorded in a file's metadata.
CurrentAgentID string
// CurrentVersion is the version of the running agent.
// NOTE(review): presumably consulted by isVersionCompatible — confirm.
CurrentVersion string
// BasePaths holds the config, state, backup, and log locations.
BasePaths PathConfig
// MigrationConfig controls legacy-path migration and backup behavior.
MigrationConfig MigrationConfig
}
// PathConfig lists the filesystem locations the agent works with.
type PathConfig struct {
Config string // configuration file, e.g. /etc/redflag/config.json
State string // state directory, e.g. /var/lib/redflag/
Backup string // backup directory, e.g. /var/lib/redflag/backups/
Logs string // log directory, e.g. /var/log/redflag/
}
// MigrationConfig configures legacy-path migration and the backup of removed files.
type MigrationConfig struct {
OldPaths []string // Legacy paths to migrate from
BackupEnabled bool // when false, backupAndRemove deletes the file without copying it first
MaxBackups int // NOTE(review): presumably the number of backups to retain; no visible code enforces it — confirm
}
// ValidateAndMigrate runs the file-management pipeline in a fixed order:
// legacy path migration, then file ownership validation, then stale file
// cleanup. It stops at the first failing step and returns that step's error
// wrapped with a message naming the step.
func (fm *FileManager) ValidateAndMigrate() error {
	pipeline := []struct {
		failure string
		run     func() error
	}{
		{"path migration failed", fm.migrateLegacyPaths},
		{"file ownership validation failed", fm.validateFileOwnership},
		{"stale file cleanup failed", fm.cleanupStaleFiles},
	}

	for _, step := range pipeline {
		if err := step.run(); err != nil {
			return fmt.Errorf("%s: %w", step.failure, err)
		}
	}
	return nil
}
2. Agent File Ownership Validation
// FileMetadata describes which agent produced a working file and how to
// verify it.
type FileMetadata struct {
	// AgentID identifies the agent that wrote the file.
	AgentID string `json:"agent_id"`
	// Version is the agent version that wrote the file.
	Version string `json:"version"`
	// CreatedAt and UpdatedAt are the file's recorded timestamps.
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
	// Checksum is the value passed to validateFileIntegrity.
	Checksum string `json:"checksum"`
	// BinarySignature is the signature of the agent binary that wrote the
	// file; validateBinarySignature compares it against the running binary.
	// It is omitted from JSON when empty, so metadata written without it
	// still decodes and is treated as "no signature recorded".
	BinarySignature string `json:"binary_signature,omitempty"`
}
// ValidateFile checks that the working file at filePath belongs to the
// current agent.
//
// A missing file is not an error. A file whose metadata cannot be read is
// handed to handleLegacyFile; an agent ID or version mismatch is handed to
// handleMismatchedFile or handleVersionMismatch respectively, and their result
// is returned. Otherwise the file's integrity is checked against the recorded
// checksum.
func (fm *FileManager) ValidateFile(filePath string) error {
	// Check if file exists
	if _, err := os.Stat(filePath); err != nil {
		if os.IsNotExist(err) {
			return nil // No file to validate
		}
		// Any other stat failure (permissions, I/O) means the file cannot be
		// inspected. Report it rather than falling through and misclassifying
		// the file as a legacy file.
		return fmt.Errorf("stat %s: %w", filePath, err)
	}

	// Read file metadata
	metadata, err := fm.readFileMetadata(filePath)
	if err != nil {
		// No metadata found - treat as legacy file
		return fm.handleLegacyFile(filePath)
	}

	// Validate agent ID matches
	if metadata.AgentID != fm.CurrentAgentID {
		return fm.handleMismatchedFile(filePath, metadata)
	}

	// Validate version compatibility
	if !fm.isVersionCompatible(metadata.Version) {
		return fm.handleVersionMismatch(filePath, metadata)
	}

	// Validate file integrity
	if err := fm.validateFileIntegrity(filePath, metadata.Checksum); err != nil {
		return fmt.Errorf("file integrity check failed for %s: %w", filePath, err)
	}
	return nil
}
3. Stale File Detection and Cleanup
// cleanupStaleFiles validates the known state files and, for each one that
// ValidateFile reports as stale, backs it up and removes it.
//
// Cleanup is best-effort: every failure is logged and the remaining files are
// still processed, so the function always returns nil.
func (fm *FileManager) cleanupStaleFiles() error {
	files := []string{
		filepath.Join(fm.BasePaths.State, "last_scan.json"),
		filepath.Join(fm.BasePaths.State, "pending_acks.json"),
		filepath.Join(fm.BasePaths.State, "command_history.json"),
	}

	for _, file := range files {
		validateErr := fm.ValidateFile(file)
		if validateErr == nil {
			continue
		}

		if !isStaleFileError(validateErr) {
			// Not a stale file, but still a validation failure. Log it so
			// the problem is visible instead of being silently dropped.
			log.Printf("Warning: Validation failed for %s: %v", file, validateErr)
			continue
		}

		// Backup and remove stale file
		if err := fm.backupAndRemove(file); err != nil {
			log.Printf("Warning: Failed to backup stale file %s: %v", file, err)
			continue
		}
		log.Printf("Cleaned up stale file: %s", file)
	}
	return nil
}
// backupAndRemove deletes filePath. When backups are enabled it first copies
// the file into the backup directory under "<base name>.<timestamp>", creating
// that directory if needed; the original is removed only if the copy succeeds.
func (fm *FileManager) backupAndRemove(filePath string) error {
	if fm.MigrationConfig.BackupEnabled {
		// Timestamped name keeps successive backups of the same file apart.
		stamp := time.Now().Format("20060102-150405")
		backupDir := fm.BasePaths.Backup
		backupName := fmt.Sprintf("%s.%s", filepath.Base(filePath), stamp)
		dest := filepath.Join(backupDir, backupName)

		if mkErr := os.MkdirAll(backupDir, 0755); mkErr != nil {
			return mkErr
		}
		if cpErr := copyFile(filePath, dest); cpErr != nil {
			return cpErr
		}
	}

	return os.Remove(filePath)
}
4. Path Standardization
// Standardized default paths, defined once so every caller uses the same
// locations. GetStandardPaths returns them as a PathConfig.
const (
// DefaultConfigPath is the agent configuration file.
DefaultConfigPath = "/etc/redflag/config.json"
// DefaultStatePath is the directory holding agent state files.
DefaultStatePath = "/var/lib/redflag/"
// DefaultBackupPath is the directory holding backups.
DefaultBackupPath = "/var/lib/redflag/backups/"
// DefaultLogPath is the directory holding agent logs.
DefaultLogPath = "/var/log/redflag/"
)
// GetStandardPaths returns a PathConfig filled in with the default config,
// state, backup, and log locations.
func GetStandardPaths() PathConfig {
	var paths PathConfig
	paths.Config = DefaultConfigPath
	paths.State = DefaultStatePath
	paths.Backup = DefaultBackupPath
	paths.Logs = DefaultLogPath
	return paths
}
// migrateLegacyPaths migrates data out of every legacy directory that exists
// on disk.
//
// The directories come from MigrationConfig.OldPaths. When that list is empty
// the built-in aggregator paths are used, so callers that never configure
// OldPaths see the same behavior as before. Migration stops at the first
// failure and returns it wrapped with the offending path.
func (fm *FileManager) migrateLegacyPaths() error {
	legacyPaths := fm.MigrationConfig.OldPaths
	if len(legacyPaths) == 0 {
		legacyPaths = []string{
			"/etc/aggregator",
			"/var/lib/aggregator",
		}
	}

	for _, legacyPath := range legacyPaths {
		// Only paths that can be stat'ed are migrated; a missing or
		// inaccessible legacy path is skipped.
		if _, err := os.Stat(legacyPath); err != nil {
			continue
		}
		if err := fm.migrateFromPath(legacyPath); err != nil {
			return fmt.Errorf("failed to migrate from %s: %w", legacyPath, err)
		}
	}
	return nil
}
5. Binary Signature Validation
// validateBinarySignature reports an error when the file at filePath records
// a binary signature that differs from the signature of the running
// executable. A file with no recorded signature passes. Failures to locate the
// executable, compute its signature, or read the file's metadata are returned
// unchanged.
func (fm *FileManager) validateBinarySignature(filePath string) error {
	// Signature of the binary that is running right now.
	exePath, err := os.Executable()
	if err != nil {
		return err
	}
	runningSig, err := fm.calculateFileSignature(exePath)
	if err != nil {
		return err
	}

	// Signature the file says it was written by.
	meta, err := fm.readFileMetadata(filePath)
	if err != nil {
		return err
	}

	recorded := meta.BinarySignature
	if recorded == "" || recorded == runningSig {
		return nil
	}
	return fmt.Errorf("file was created by different binary version")
}
Definition of Done
- File validation system checks agent ID and version compatibility
- Automatic cleanup of stale files from previous installations
- Path standardization implemented across codebase
- Migration system handles legacy path transitions
- Backup system preserves important files during cleanup
- Binary signature validation prevents file poisoning
- Configuration options for migration behavior
- Comprehensive logging for debugging file issues
Implementation Details
File Locations
- Primary: `aggregator-agent/internal/filesystem/` (new package)
- Integration: `aggregator-agent/cmd/agent/main.go` (initialization)
- Config: `aggregator-agent/internal/config/config.go`
Configuration Options
{
"file_management": {
"paths": {
"config": "/etc/redflag/config.json",
"state": "/var/lib/redflag/",
"backup": "/var/lib/redflag/backups/",
"logs": "/var/log/redflag/"
},
"migration": {
"cleanup_stale_files": true,
"backup_on_cleanup": true,
"max_backups": 10,
"migrate_legacy_paths": true
},
"validation": {
"validate_agent_id": true,
"validate_version": true,
"validate_binary_signature": false
}
}
}
Integration Points
// Agent initialization: after the existing setup, build the file manager and
// run validation and migration once. A failure there aborts initialization.
func (a *Agent) initialize() error {
	// Existing initialization...

	// File management setup
	fm := filesystem.NewFileManager(a.config, a.agentID, AgentVersion)
	err := fm.ValidateAndMigrate()
	if err != nil {
		return fmt.Errorf("file management initialization failed: %w", err)
	}

	a.fileManager = fm
	return nil
}
// Before scan operations: re-validate the working files, but treat a failure
// as a warning so that the scan still runs.
// NOTE(review): this sketch elides the scan body and its return statement.
// ValidateAndMigrate also runs legacy-path migration, so it would repeat on
// every scan — confirm that is intended.
func (a *Agent) scanForUpdates() error {
// Validate files before operation
if err := a.fileManager.ValidateAndMigrate(); err != nil {
// Deliberately non-fatal: log and continue with the scan.
log.Printf("Warning: File validation failed, proceeding anyway: %v", err)
}
// Continue with scan...
}
Testing Strategy
Unit Tests
- File validation logic
- Migration path handling
- Backup and cleanup operations
- Signature validation
Integration Tests
- Full migration scenarios
- Stale file detection
- Path transition testing
- Configuration validation
Manual Test Scenarios
- Stale File Cleanup:
  - Install agent v1, create state files
  - Install agent v2 with different agent ID
  - Verify stale files are backed up and cleaned
- Path Migration:
  - Install agent with old paths
  - Upgrade to new version
  - Verify files are moved to new locations
- File Corruption Recovery:
  - Corrupt state files manually
  - Restart agent
  - Verify recovery or graceful degradation
Prerequisites
- Configuration system supports nested structures
- Logging infrastructure supports structured output
- Agent has unique ID and version information
- File system permissions allow access to required paths
Effort Estimate
Complexity: Medium-High
Effort: 3-4 days
- Day 1: File validation and cleanup system
- Day 2: Path migration and standardization
- Day 3: Binary signature validation
- Day 4: Integration testing and configuration
Success Metrics
- Elimination of timeout issues from stale files
- Zero manual intervention required for upgrades
- Consistent path usage across codebase
- No data loss during migration operations
- Improved system startup reliability
- Enhanced security through file validation
Monitoring
Track these metrics after implementation:
- File validation error rate
- Migration success rate
- Stale file cleanup frequency
- Path standardization compliance
- Agent startup time improvement
- User-reported file issues reduction