Add docs and project files - force for Culurien
This commit is contained in:
219
docs/session_2025-12-18-issue1-proper-design.md
Normal file
219
docs/session_2025-12-18-issue1-proper-design.md
Normal file
@@ -0,0 +1,219 @@
|
||||
# Issue #1 Proper Solution Design
|
||||
|
||||
## Problem Root Cause
|
||||
Agent check-in interval was being incorrectly overridden by scanner subsystem intervals.
|
||||
|
||||
## Current State After Kimi's "Fast Fix"
|
||||
- Line that overrode check-in interval was removed
|
||||
- Scanner intervals are logged but not applied to agent polling
|
||||
- Separation exists but without validation or protection
|
||||
|
||||
## What's Missing (Why It's Not "Proper" Yet)
|
||||
|
||||
### 1. No Validation
|
||||
- No bounds checking on interval values
|
||||
- Could accept negative intervals
|
||||
- Could accept intervals that are too short (causing server overload)
|
||||
- Could accept intervals that are too long (causing agent to appear dead)
|
||||
|
||||
### 2. No Idempotency Verification
|
||||
- Not tested that operations can be run multiple times safely
|
||||
- Config updates might not be idempotent
|
||||
- No verification that rapid mode toggling is safe
|
||||
|
||||
### 3. No Protection Against Regressions
|
||||
- No guardrails to prevent future developer from re-introducing the bug
|
||||
- No comments explaining WHY separation is critical
|
||||
- No architectural documentation
|
||||
- No tests that would catch if someone re-introduces the override
|
||||
|
||||
### 4. Insufficient Error Handling
|
||||
- syncServerConfig runs in goroutine with no error recovery
|
||||
- No retry logic if server temporarily unavailable
|
||||
- No degraded mode operation
|
||||
- No circuit breaker pattern
|
||||
|
||||
### 5. No Comprehensive Logging
|
||||
- No context about WHAT changed in interval
|
||||
- No history of interval changes over time
|
||||
- No error context for debugging
|
||||
|
||||
## Proper Solution Design
|
||||
|
||||
### Component 1: Validation Layer
|
||||
```go
|
||||
type IntervalValidator struct {
|
||||
minCheckInSeconds int // 60 seconds (1 minute)
|
||||
maxCheckInSeconds int // 3600 seconds (1 hour)
|
||||
minScannerMinutes int // 1 minute
|
||||
maxScannerMinutes int // 1440 minutes (24 hours)
|
||||
}
|
||||
|
||||
func (v *IntervalValidator) ValidateCheckInInterval(seconds int) error {
|
||||
if seconds < v.minCheckInSeconds {
|
||||
return fmt.Errorf("check-in interval %d seconds below minimum %d",
|
||||
seconds, v.minCheckInSeconds)
|
||||
}
|
||||
if seconds > v.maxCheckInSeconds {
|
||||
return fmt.Errorf("check-in interval %d seconds above maximum %d",
|
||||
seconds, v.maxCheckInSeconds)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (v *IntervalValidator) ValidateScannerInterval(minutes int) error {
|
||||
if minutes < v.minScannerMinutes {
|
||||
return fmt.Errorf("scanner interval %d minutes below minimum %d",
|
||||
minutes, v.minScannerMinutes)
|
||||
}
|
||||
if minutes > v.maxScannerMinutes {
|
||||
return fmt.Errorf("scanner interval %d minutes above maximum %d",
|
||||
minutes, v.maxScannerMinutes)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
```
|
||||
|
||||
### Component 2: Idempotency Protection
|
||||
```go
|
||||
type IntervalGuardian struct {
|
||||
lastValidatedInterval int
|
||||
violationCount int
|
||||
}
|
||||
|
||||
func (g *IntervalGuardian) CheckForOverrideAttempt(currentInterval, proposedInterval int) error {
|
||||
if currentInterval != proposedInterval {
|
||||
g.violationCount++
|
||||
return fmt.Errorf("INTERVAL_OVERRIDE_DETECTED: current=%d, proposed=%d, violations=%d",
|
||||
currentInterval, proposedInterval, g.violationCount)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *IntervalGuardian) GetViolationCount() int {
|
||||
return g.violationCount
|
||||
}
|
||||
```
|
||||
|
||||
### Component 3: Error Recovery with Retry
|
||||
```go
|
||||
|
||||
func syncServerConfigWithRetry(apiClient *client.Client, cfg *config.Config, maxRetries int) error {
|
||||
var lastErr error
|
||||
|
||||
for attempt := 1; attempt <= maxRetries; attempt++ {
|
||||
if err := syncServerConfigProper(apiClient, cfg); err != nil {
|
||||
lastErr = err
|
||||
log.Printf("[ERROR] [agent] [config] sync attempt %d/%d failed: %v",
|
||||
attempt, maxRetries, err)
|
||||
|
||||
if attempt < maxRetries {
|
||||
// Exponential backoff: 1s, 2s, 4s, 8s...
|
||||
backoff := time.Duration(1<<uint(attempt-1)) * time.Second
|
||||
time.Sleep(backoff)
|
||||
}
|
||||
continue
|
||||
}
|
||||
log.Printf("[SUCCESS] [agent] [config] synced after %d attempts", attempt)
|
||||
return nil
|
||||
}
|
||||
|
||||
// After maxRetries, enter degraded mode
|
||||
cfg.SetDegradedMode(true)
|
||||
log.Printf("[WARNING] [agent] [config] entering degraded mode after %d failed attempts: %v",
|
||||
maxRetries, lastErr)
|
||||
|
||||
return lastErr
|
||||
}
|
||||
```
|
||||
|
||||
### Component 4: Comprehensive Logging
|
||||
```go
|
||||
func logIntervalChange(subsystem string, oldInterval, newInterval int, reason string) {
|
||||
log.Printf("[HISTORY] [agent] [interval] subsystem=%s old=%d new=%d reason=%s timestamp=%s",
|
||||
subsystem, oldInterval, newInterval, reason, time.Now().Format(time.RFC3339))
|
||||
|
||||
// Also save to history table for audit
|
||||
// This would write to history.sqlite3 or similar
|
||||
}
|
||||
```
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Validation Layer (30 minutes)
|
||||
- [ ] Create `interval_validator.go` with bounds checking
|
||||
- [ ] Add validation calls in syncServerConfig
|
||||
- [ ] Add validation error logging
|
||||
|
||||
### Phase 2: Guardians (30 minutes)
|
||||
- [ ] Create `interval_guardian.go` with override detection
|
||||
- [ ] Add guardian check after server config deserialization
|
||||
- [ ] Add violation counter to state.json
|
||||
|
||||
### Phase 3: Error Recovery (45 minutes)
|
||||
- [ ] Create `retry_sync.go` with exponential backoff
|
||||
- [ ] Add degraded mode to config struct
|
||||
- [ ] Test retry logic manually
|
||||
|
||||
### Phase 4: Comprehensive Logging (30 minutes)
|
||||
- [ ] Add history logging function
|
||||
- [ ] Log all interval changes in structured format
|
||||
- [ ] Create history table schema if not exists
|
||||
|
||||
### Phase 5: Tests (90 minutes)
|
||||
- [ ] Unit tests for validation bounds
|
||||
- [ ] Unit tests for guardian violations
|
||||
- [ ] Unit tests for retry logic
|
||||
- [ ] Integration test for full sync flow
|
||||
- [ ] Integration test for error scenarios
|
||||
- [ ] Integration test for rapid polling edge cases
|
||||
|
||||
### Phase 6: Documentation & Cleanup (30 minutes)
|
||||
- [ ] Update inline comments explaining WHY separation is critical
|
||||
- [ ] Add architecture documentation
|
||||
- [ ] Update session documentation
|
||||
- [ ] Verify all ETHOS checklist items
|
||||
|
||||
**Total Estimated Time**: 4.5 hours for proper fix
|
||||
|
||||
## Idempotency Verification
|
||||
|
||||
Test that operations can be run 3x safely:
|
||||
|
||||
```go
|
||||
func TestSyncServerConfigIdempotency(t *testing.T) {
|
||||
cfg := createTestConfig()
|
||||
apiClient := createTestAPIClient()
|
||||
|
||||
// Run sync 3 times
|
||||
for i := 0; i < 3; i++ {
|
||||
err := syncServerConfigProper(apiClient, cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("Sync %d failed: %v", i+1, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify check-in interval unchanged
|
||||
if cfg.CheckInInterval != 300 {
|
||||
t.Fatalf("Check-in interval changed after 3 runs: %d", cfg.CheckInInterval)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Why This Is "Proper"
|
||||
|
||||
1. **Validation**: Prevents invalid states from ever being accepted
|
||||
2. **Protection**: Guardians catch regressions immediately
|
||||
3. **Recovery**: Robust error handling for transient failures
|
||||
4. **Observability**: Complete history of all changes
|
||||
5. **Tested**: Comprehensive automated tests
|
||||
6. **Documented**: Clear comments and architecture documentation
|
||||
7. **Idempotent**: Verified by tests that operations can be repeated
|
||||
8. **Follows ETHOS**: No shortcuts, honest logging, proper error handling
|
||||
|
||||
## Technical Debt Resolution
|
||||
|
||||
**Debt from Kimi's fast fix**: NONE (Kimi actually did this one mostly right)
|
||||
**New debt introduced**: NONE (if we do these phases properly)
|
||||
|
||||
The only "debt" is that we need to implement all phases, but that’s not debt - that's proper engineering.
|
||||
Reference in New Issue
Block a user