diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b983833
--- /dev/null
+++ b/README.md
@@ -0,0 +1,446 @@
# RedFlag

> ## Alpha Release Notice
> **⚠️ ALPHA SOFTWARE - USE WITH CAUTION**
>
> This is actively developed software that works, but continues to evolve. While core functionality is stable, occasional bugs may appear and breaking changes can happen between versions. Suitable for production use cases where you understand the alpha nature and can manage updates. See [Current Status](#current-status) below for what's implemented.

This is alpha software built for homelabs and self-hosters. It's functional and actively used, but:

- Expect occasional bugs
- Back up your data
- The security model is solid but not audited
- Breaking changes may happen between versions
- Documentation is a work in progress

That said, it works well for its intended use case. Issues and feedback welcome!

**Self-hosted update management for homelabs and small MSPs**

Cross-platform agents • Web dashboard • Hardware binding • Ed25519 signing • Full error transparency • No enterprise BS

```
v0.1.27 - Alpha Release (Dec 2025)
```

**Latest:** Implemented a proper storage metrics subsystem with a dedicated table and models. AgentHealth scanner improvements with OS-aware badges and extended defaults. Agent migration system for version management. Error transparency system and hardware binding security. Ed25519 cryptographic signing. Removed 2,369 lines of dead code. Curiously, the program stopped working after that... (just kidding). [Update instructions below](#updating).

---

## What It Does

RedFlag lets you manage software updates across all your servers from one dashboard. Track pending updates, approve installs, and monitor basic system health without SSHing into every machine.

RedFlag implements:
- **Hardware binding** - Machine fingerprint prevents token sharing between machines
- **Registration tokens** - One-time use tokens for secure agent enrollment
- **Refresh tokens** - 90-day sliding window, auto-renewal for active agents
- **Ed25519 signing** - All commands and updates cryptographically signed
- **SHA-256 hashing** - All tokens hashed at rest
- **Rate limiting** - 60 req/min per agent (configurable policies)
- **Minimal privileges** - Agents run with least required permissions
- **Error transparency** - All errors logged locally with full context (not sanitized)

**Trust Model:**
- Initial agent registration uses token + TLS
- Public key fetched and cached on first run (TOFU model)
- Hardware fingerprint binding prevents config-copying attacks
- All subsequent communications verified via Ed25519 signatures
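To make the trust model concrete: a hardware fingerprint is just a digest of stable machine identifiers, so the server can compare hashes without storing raw hardware details. The sketch below is illustrative only - RedFlag's actual fingerprint inputs are an implementation detail and may differ:

```go
// Illustrative sketch - not the agent's exact implementation.
// Derive a fingerprint by hashing stable identifiers; here the
// systemd machine ID plus the hostname (Linux-only example).
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"os"
)

func main() {
	machineID, err := os.ReadFile("/etc/machine-id") // stable per-install identifier
	if err != nil {
		panic(err)
	}
	hostname, _ := os.Hostname()

	sum := sha256.Sum256(append(machineID, []byte(hostname)...))
	fmt.Println("fingerprint:", hex.EncodeToString(sum[:]))
}
```

Because the digest is derived from the machine itself, copying an agent's `config.json` (token included) to another host yields a mismatched fingerprint, and the server rejects the request.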
---

## Screenshots

| Dashboard | Agent Details | Update Management |
|-----------|---------------|-------------------|
| ![Dashboard](Screenshots/RedFlag%20Default%20Dashboard.png) | ![Linux Agent](Screenshots/RedFlag%20Linux%20Agent%20Details.png) | ![Updates](Screenshots/RedFlag%20Updates%20Dashboard.png) |

| Live Operations | History Tracking | Docker Integration |
|-----------------|------------------|-------------------|
| ![Live Ops](Screenshots/RedFlag%20Live%20Operations%20-%20Failed%20Dashboard.png) | ![History](Screenshots/RedFlag%20History%20Dashboard.png) | ![Docker](Screenshots/RedFlag%20Docker%20Dashboard.png) |

<details>
<summary>More Screenshots (click to expand)</summary>

| Heartbeat System | Registration Tokens | Settings Page |
|------------------|---------------------|---------------|
| ![Heartbeat](Screenshots/RedFlag%20Heartbeat%20System.png) | ![Tokens](Screenshots/RedFlag%20Registration%20Tokens.jpg) | ![Settings](Screenshots/RedFlag%20Settings%20Page.jpg) |

| Linux Update History | Windows Agent Details | Agent List |
|---------------------|----------------------|------------|
| ![Linux History](Screenshots/RedFlag%20Linux%20Agent%20History%20Extended.png) | ![Windows Agent](Screenshots/RedFlag%20Windows%20Agent%20Details.png) | ![Agent List](Screenshots/RedFlag%20Agent%20List.png) |

| Windows Update History |
|------------------------|
| ![Windows History](Screenshots/RedFlag%20Windows%20Agent%20History%20Extended.png) |

</details>
+ +--- + +## Quick Start + +### Server Deployment (Docker) + +```bash +# Clone and configure +git clone https://github.com/Fimeg/RedFlag.git +cd RedFlag +cp config/.env.bootstrap.example config/.env +docker-compose build +docker-compose up -d + +# Access web UI and run setup +open http://localhost:3000 +# Follow setup wizard, then copy generated .env content + +# Restart with new configuration +docker-compose down +docker-compose up -d +``` + +--- + +### Agent Installation + +**Linux (one-liner):** +```bash +curl -sfL https://your-server.com/install | sudo bash -s -- your-registration-token +``` + +**Windows (PowerShell):** +```powershell +iwr https://your-server.com/install.ps1 | iex +``` + +**Manual installation:** +```bash +# Download agent binary +wget https://your-server.com/download/linux/amd64/redflag-agent + +# Register and install +chmod +x redflag-agent +sudo ./redflag-agent --server https://your-server.com --token your-token --register +``` + +Get registration tokens from the web dashboard under **Settings → Token Management**. + +--- + +### Updating + +To update to the latest version: + +```bash +git pull && docker-compose down && docker-compose build --no-cache && docker-compose up -d +``` + +--- + +
<details>
<summary>Full Reinstall (Nuclear Option)</summary>

If things get really broken or you want to start completely fresh:

```bash
docker-compose down -v --remove-orphans && \
  rm config/.env && \
  docker-compose build --no-cache && \
  cp config/.env.bootstrap.example config/.env && \
  docker-compose up -d
```

**What this does:**
- `down -v` - Stops containers and **wipes all data** (including the database)
- `--remove-orphans` - Cleans up leftover containers
- `rm config/.env` - Removes old server config
- `build --no-cache` - Rebuilds images from scratch
- `cp config/.env.bootstrap.example` - Resets to bootstrap mode for setup wizard
- `up -d` - Starts fresh in background

**Warning:** This deletes everything - all agents, update history, configurations. You'll need to handle existing agents:

**Option 1 - Re-register agents:**
- Remove agent config: `sudo rm /etc/aggregator/config.json` (Linux) or `C:\ProgramData\RedFlag\config.json` (Windows)
- Re-run the one-liner installer with a new registration token
- Scripts handle override/update automatically (one agent per OS install)

**Option 2 - Clean uninstall/reinstall:**
- Uninstall the agent completely first
- Then run the installer with a new token

</details>
+ +--- + +
<details>
<summary>Full Uninstall</summary>

**Uninstall Server:**
```bash
docker-compose down -v --remove-orphans
rm config/.env
```

**Uninstall Linux Agent:**
```bash
# Using uninstall script (recommended)
sudo bash aggregator-agent/uninstall.sh

# Remove agent configuration
sudo rm /etc/aggregator/config.json

# Remove agent user (optional - preserves logs)
sudo userdel -r redflag-agent
```

**Uninstall Windows Agent:**
```powershell
# Stop and remove service
Stop-Service RedFlagAgent
sc.exe delete RedFlagAgent

# Remove files
Remove-Item "C:\Program Files\RedFlag\redflag-agent.exe"
Remove-Item "C:\ProgramData\RedFlag\config.json"
```

</details>
+ +--- + +## Key Features + +✓ **Hardware Binding** - Machine fingerprint prevents config copying between agents +✓ **Ed25519 Signing** - All updates cryptographically verified before installation +✓ **Secure by Default** - Registration tokens, JWT auth with refresh, rate limiting +✓ **Error Transparency** - All errors logged with full context (no sanitization) +✓ **Idempotent Installs** - Re-running installers won't create duplicate agents +✓ **Real-time Heartbeat** - Interactive operations with rapid polling mode +✓ **Dependency Handling** - Dry-run checks before installing updates +✓ **Multi-seat Tokens** - One token can register multiple agents +✓ **Audit Trails** - Complete history of all operations +✓ **Proxy Support** - HTTP/HTTPS/SOCKS5 for restricted networks +✓ **Native Services** - systemd on Linux, Windows Services on Windows +✓ **Self-hosted** - No cloud dependencies, runs entirely on your infrastructure + +--- + +## Architecture + +``` +┌─────────────────┐ +│ Web Dashboard │ React + TypeScript +│ Port: 3000 │ +└────────┬────────┘ + │ HTTPS + JWT Auth + Machine Binding +┌────────▼────────┐ +│ Server (Go) │ PostgreSQL +│ Port: 8080 │ Ed25519 Signing Service +└────────┬────────┘ + │ Pull-based (agents check in every 5 min) + ┌────┴────┬────────┐ + │ │ │ +┌───▼──┐ ┌──▼──┐ ┌──▼───┐ +│Linux │ │Windows│ │Docker│ +│Agent │ │Agent │ │Agent │ +└──────┘ └───────┘ └──────┘ + └─ APT └─ WUA └─ Images + └─ DNF └─ Winget +``` + +**Key Security Flow:** +1. Agent registers with machine fingerprint + public key +2. Server stores hardware binding in database +3. Every agent request validated against stored fingerprint +4. Commands signed with server Ed25519 private key +5. Agent verifies signature + nonce + timestamp before execution +6. All updates have checksum verification + rollback on failure + +--- + +## Documentation + +- **[API Reference](docs/API.md)** - Complete API documentation +- **[Configuration](docs/CONFIGURATION.md)** - CLI flags, env vars, config files +- **[Architecture](docs/ARCHITECTURE.md)** - System design and database schema +- **[Development](docs/DEVELOPMENT.md)** - Build from source, testing, contributing + +--- + +## Current Status + +**What Works:** +- ✅ Cross-platform agent registration and updates +- ✅ Update scanning for all supported package managers +- ✅ Dry-run dependency checking before installation +- ✅ Real-time heartbeat and rapid polling +- ✅ Multi-seat registration tokens +- ✅ Native service integration (systemd, Windows Services) +- ✅ Web dashboard with full agent management +- ✅ Docker integration for container image updates + +**Known Issues:** +- Windows Winget detection occasionally misses packages (Windows API limitation) +- Some Windows Updates may reappear after installation (known Windows Update quirk) +- Limited mobile dashboard optimization (usable but not ideal) +--- + +## License + +MIT License - See [LICENSE](LICENSE) for details + +**Third-Party Components:** +- Windows Update integration based on [windowsupdate](https://github.com/ceshihao/windowsupdate) (Apache 2.0) + +--- + +## Competitive Position + +**Why This Matters:** + +ConnectWise charges $50/agent/month. For 1000 agents, that's **$600,000 per year**. + +RedFlag costs $0/agent/month + the cost of your VM ($50/month). + +That's not a feature difference - that's a **business model disruption**. 
---

## Documentation

- **[API Reference](docs/API.md)** - Complete API documentation
- **[Configuration](docs/CONFIGURATION.md)** - CLI flags, env vars, config files
- **[Architecture](docs/ARCHITECTURE.md)** - System design and database schema
- **[Development](docs/DEVELOPMENT.md)** - Build from source, testing, contributing

---

## Current Status

**What Works:**
- ✅ Cross-platform agent registration and updates
- ✅ Update scanning for all supported package managers
- ✅ Dry-run dependency checking before installation
- ✅ Real-time heartbeat and rapid polling
- ✅ Multi-seat registration tokens
- ✅ Native service integration (systemd, Windows Services)
- ✅ Web dashboard with full agent management
- ✅ Docker integration for container image updates

**Known Issues:**
- Windows Winget detection occasionally misses packages (Windows API limitation)
- Some Windows Updates may reappear after installation (known Windows Update quirk)
- Limited mobile dashboard optimization (usable but not ideal)

---

## License

MIT License - See [LICENSE](LICENSE) for details

**Third-Party Components:**
- Windows Update integration based on [windowsupdate](https://github.com/ceshihao/windowsupdate) (Apache 2.0)

---

## Competitive Position

**Why This Matters:**

ConnectWise charges $50/agent/month. For 1000 agents, that's **$600,000 per year**.

RedFlag costs $0/agent/month plus the cost of your VM (roughly $50/month).

That's not a feature difference - that's a **business model disruption**.

**What ConnectWise can't do** (architectural limitations):
- ❌ Hardware binding (their cloud model prevents it)
- ❌ Self-hosted by design (they push "MSP Cloud")
- ❌ Code transparency (proprietary, can't audit claims)
- ❌ Ed25519 cryptographic verification (opaque signing process)

**What RedFlag does** (architectural advantages):
- ✅ Hardware fingerprint binding (machine_id + public_key)
- ✅ Self-hosted by design (runs entirely on your infrastructure)
- ✅ Ed25519 signing throughout (verifiable supply chain)
- ✅ Error transparency (all logs local with full context)
- ✅ $600k/year savings (undeniable math)

**This isn't about replacing ConnectWise feature-for-feature.**

It's about: **80% of the functionality for 0% of the cost, plus three security advantages they can't match without breaking their business model.**

**Bottom line**: Built from scratch with hardware binding, Ed25519 signing, and complete error transparency. Works for homelabs and small MSPs who value control, privacy, and cost sanity. Enterprises can keep paying their $600k/year. That's fine. Different tools for different needs.

---

## Cleanup Instructions (Important for Upgrades)

### Removing Old Versions (Pre-v0.1.20)

If you're upgrading from versions older than v0.1.20, old agent installations used different paths. It is highly recommended to simply uninstall the old version (instructions below). An early-stage migration path exists, but it's unreliable - don't try it unless you like pain. You can't have more than, like, 100 agents already, yeah? If so... call me.

**Old Agent Locations (to remove if present):**
- `/etc/aggregator/` - Old agent configuration directory
- `/etc/redflag/` - Old configuration (moved to `/etc/redflag-agent/`)
- `/usr/local/bin/aggregator-agent` - Old binary location
- `/var/lib/aggregator/` - Old data directory

**New Agent Locations (v0.1.20+):**
- `/etc/redflag-agent/` - Agent configuration and keys
- `/usr/local/bin/redflag-agent` - Agent binary (Linux)
- `C:\Program Files\RedFlag\` - Agent install (Windows)
- `/var/lib/redflag-agent/` - Agent data and logs (if used)

**Cleanup Commands:**
```bash
# Linux cleanup (if upgrading from old versions)
sudo rm -rf /etc/aggregator/
sudo rm -rf /usr/local/bin/aggregator-agent
sudo rm -rf /var/lib/aggregator/

# Then install new agent normally
curl -sfL https://your-server.com/install | sudo bash -s -- your-token
```

**Windows Cleanup (if upgrading):**
```powershell
# Remove old agent locations
Remove-Item "C:\Program Files\Aggregator\*" -Recurse -ErrorAction SilentlyContinue
Remove-Item "C:\ProgramData\Aggregator\*" -Recurse -ErrorAction SilentlyContinue

# Then install new agent
iwr https://your-server.com/install.ps1 | iex
```

### Full Fresh Install (If Things Are Messy)

If you want to completely remove everything and start fresh:

**Option 1: Re-register (preserves most data)**
```bash
# Remove agent config (keeps logs)
sudo rm /etc/redflag-agent/config.json
# Or on Windows
Remove-Item "C:\ProgramData\RedFlag\config.json"

# Re-run installer (agent will re-register)
curl -sfL https://your-server.com/install | sudo bash -s -- your-new-token
```
**Option 2: Complete removal (start completely fresh)**
```bash
# Use uninstall script (preserves logs for debugging)
sudo bash /usr/local/bin/redflag-agent/uninstall.sh

# Or manual removal
sudo systemctl stop redflag-agent
sudo userdel -r redflag-agent  # Optional: removes agent user and home directory
sudo rm /etc/redflag-agent/config.json
sudo rm /usr/local/bin/redflag-agent

# Then reinstall from scratch
curl -sfL https://your-server.com/install | sudo bash -s -- your-new-token
```

**Note**: Re-registering is usually sufficient. Complete removal is only needed if the agent state is corrupted or you want to change the agent user.

---

## Homelab Philosophy

This software follows ETHOS principles:
- **Honest** - What you see is what you get
- **Transparent** - All errors logged with full context (no sanitization)
- **Secure** - Hardware binding, cryptographic verification, local logging
- **Open Standards** - No vendor lock-in, self-hosted by design

Made for homelabbers and small MSPs who:
- Value control over their infrastructure
- Want cost sanity ($0 vs $600k/year)
- Prefer transparency over enterprise marketing
- Can handle "alpha software" that actually works

---

## Project Goals

RedFlag aims to be:
- **Simple** - Deploy in 5 minutes, understand in 10
- **Honest** - No enterprise marketing speak, no upsell, just useful software
- **Homelab-first** - Built for real use cases, not investor pitches
- **Self-hosted** - Your data, your infrastructure

If you're looking for an enterprise-grade solution with SLAs and support contracts, this isn't it. Passing the buck has to stop somewhere: if you own your infrastructure, it stays sovereign to you.

---

**Made with ☕ for homelabbers, by homelabbers**

---

## 📜 **TLDR Changelog: Don't trust the transport layer**

**v0.1.27 (Dec 2025, Christmas Release) 🎄**:
- ✅ Hardware binding with machine fingerprinting (security differentiator)
- ✅ Ed25519 cryptographic signing for all updates (supply chain protection)
- ✅ Error transparency system with full context logging (ETHOS #1)
- ✅ Circuit breakers and retry logic throughout (reliability)
- ✅ Agent auto-update system fully implemented (was marked "placeholder")
- ✅ Rate limiting active (60 req/min, configurable)
- ✅ Command deduplication and idempotency

diff --git a/aggregator-agent/cmd/agent/subsystem_handlers.go b/aggregator-agent/cmd/agent/subsystem_handlers.go
index 75d784b..215c5f4 100644
--- a/aggregator-agent/cmd/agent/subsystem_handlers.go
+++ b/aggregator-agent/cmd/agent/subsystem_handlers.go
@@ -144,7 +144,7 @@ func handleScanSystem(apiClient *client.Client, cfg *config.Config, ackTracker *
 
 	// Report system metrics to server using dedicated endpoint
 	// Get system scanner and use proper interface
-	systemScanner := orchestrator.NewSystemScanner("unknown") // TODO: Get actual agent version
+	systemScanner := orchestrator.NewSystemScanner(cfg.AgentVersion)
 	var metrics []orchestrator.SystemMetric // Declare outside if block for ReportLog access
 	if systemScanner.IsAvailable() {
 		var err error
diff --git a/aggregator-server/Dockerfile b/aggregator-server/Dockerfile
index f6d8a5b..de66dde 100644
--- a/aggregator-server/Dockerfile
+++ b/aggregator-server/Dockerfile
@@ -3,15 +3,40 @@
 FROM golang:1.24-alpine AS server-builder
 
 WORKDIR /app
 
-# Install git for module resolution
+# Install git for version detection
 RUN apk add --no-cache git
 
 # Copy go.mod and go.sum
 COPY aggregator-server/go.mod aggregator-server/go.sum ./
 RUN go mod download
 
+# Copy .git to get version info
+COPY .git/ ./.git/
+
+# Extract semantic version from git (BASE_VERSION.COMMIT_COUNT)
+RUN cd /app && \
+    # Get latest tag or default to 0.1.0 \
+    if git describe --tags --dirty --always >/dev/null 2>&1; then \
+        LATEST_TAG=$(git describe --tags --dirty --always); \
+        BASE_VERSION=$(echo "$LATEST_TAG" | sed 's/^v//' | cut -d. -f1-3); \
+    else \
+        BASE_VERSION="0.1.0"; \
+    fi && \
+    # Count commits since tag \
+    COMMITS_SINCE=$(git rev-list $(git describe --tags --dirty --always 2>/dev/null)..HEAD 2>/dev/null | wc -l | tr -d ' ') && \
+    if [ "$COMMITS_SINCE" = "" ] || [ "$COMMITS_SINCE" -eq 0 ]; then BUILD=0; else BUILD=$COMMITS_SINCE; fi && \
+    VERSION="${BASE_VERSION}.${BUILD}" && \
+    echo "Building server version: $VERSION" && \
+    echo "$VERSION" > /app/version.txt
+
+# Copy aggregator-server contents to /app (maintains correct directory structure)
 COPY aggregator-server/ ./
-RUN CGO_ENABLED=0 go build -o redflag-server cmd/server/main.go
+
+# Build server with version injection
+RUN VERSION=$(cat /app/version.txt) && \
+    CGO_ENABLED=0 go build \
+        -ldflags "-X github.com/Fimeg/RedFlag/aggregator-server/internal/version.AgentVersion=$VERSION" \
+        -o redflag-server cmd/server/main.go
 
 # Stage 2: Build agent binaries for all platforms
 FROM golang:1.24-alpine AS agent-builder
@@ -86,6 +111,7 @@ COPY --from=server-builder /app/internal/database ./internal/database
 COPY --from=agent-builder /build/binaries ./binaries
 
 # Copy and setup entrypoint script
+# File is in aggregator-server/ directory relative to build context
 COPY aggregator-server/docker-entrypoint.sh /usr/local/bin/
 RUN chmod +x /usr/local/bin/docker-entrypoint.sh
diff --git a/aggregator-server/cmd/server/main.go b/aggregator-server/cmd/server/main.go
index ff967f5..956eccb 100644
--- a/aggregator-server/cmd/server/main.go
+++ b/aggregator-server/cmd/server/main.go
@@ -19,6 +19,7 @@ import (
 	"github.com/Fimeg/RedFlag/aggregator-server/internal/logging"
 	"github.com/Fimeg/RedFlag/aggregator-server/internal/scheduler"
 	"github.com/Fimeg/RedFlag/aggregator-server/internal/services"
+	"github.com/Fimeg/RedFlag/aggregator-server/internal/version"
 	"github.com/gin-gonic/gin"
 )
@@ -128,15 +129,15 @@ func main() {
 	// Parse command line flags
 	var setup bool
 	var migrate bool
-	var version bool
+	var showVersion bool
 	flag.BoolVar(&setup, "setup", false, "Run setup wizard")
 	flag.BoolVar(&migrate, "migrate", false, "Run database migrations only")
-	flag.BoolVar(&version, "version", false, "Show version information")
+	flag.BoolVar(&showVersion, "version", false, "Show version information")
 	flag.Parse()
 
 	// Handle special commands
-	if version {
-		fmt.Printf("RedFlag Server v0.1.0-alpha\n")
+	if showVersion {
+		fmt.Printf("RedFlag Server v%s\n", version.AgentVersion)
 		fmt.Printf("Self-hosted update management platform\n")
 		return
 	}
@@ -491,6 +492,11 @@
 	dashboard.POST("/agents/:id/subsystems/:subsystem/auto-run", subsystemHandler.SetAutoRun)
 	dashboard.POST("/agents/:id/subsystems/:subsystem/interval", subsystemHandler.SetInterval)
 
+	// Client error logging (authenticated)
+	clientErrorHandler := handlers.NewClientErrorHandler(db.DB)
+	dashboard.POST("/logs/client-error", clientErrorHandler.LogError)
+	dashboard.GET("/logs/client-errors", clientErrorHandler.GetErrors)
+
 	dashboard.GET("/updates", updateHandler.ListUpdates)
 	dashboard.GET("/updates/:id", updateHandler.GetUpdate)
 	dashboard.GET("/updates/:id/logs", updateHandler.GetUpdateLogs)
diff --git a/aggregator-server/internal/api/handlers/agents.go b/aggregator-server/internal/api/handlers/agents.go
index dce5acb..d83271e 100644
--- a/aggregator-server/internal/api/handlers/agents.go
+++ b/aggregator-server/internal/api/handlers/agents.go
@@ -1248,7 +1248,7 @@ func (h *AgentHandler) EnableRapidPollingMode(agentID uuid.UUID, durationMinutes
 }
 
 // SetRapidPollingMode enables rapid polling mode for an agent
-// TODO: Rate limiting should be implemented for rapid polling endpoints to prevent abuse (technical debt)
+// Rate limiting is implemented at router level in cmd/server/main.go
 func (h *AgentHandler) SetRapidPollingMode(c *gin.Context) {
 	idStr := c.Param("id")
 	agentID, err := uuid.Parse(idStr)
diff --git a/aggregator-server/internal/api/handlers/client_errors.go b/aggregator-server/internal/api/handlers/client_errors.go
new file mode 100644
index 0000000..721e886
--- /dev/null
+++ b/aggregator-server/internal/api/handlers/client_errors.go
@@ -0,0 +1,223 @@
package handlers

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/google/uuid"
	"github.com/jmoiron/sqlx"
)

// ClientErrorHandler handles frontend error logging per ETHOS #1
type ClientErrorHandler struct {
	db *sqlx.DB
}

// NewClientErrorHandler creates a new error handler
func NewClientErrorHandler(db *sqlx.DB) *ClientErrorHandler {
	return &ClientErrorHandler{db: db}
}

// GetErrorsResponse represents paginated error list
type GetErrorsResponse struct {
	Errors     []ClientErrorResponse `json:"errors"`
	Total      int64                 `json:"total"`
	Page       int                   `json:"page"`
	PageSize   int                   `json:"page_size"`
	TotalPages int                   `json:"total_pages"`
}

// ClientErrorResponse represents a single error in response
type ClientErrorResponse struct {
	ID        string          `json:"id" db:"id"`
	AgentID   *string         `json:"agent_id,omitempty" db:"agent_id"` // nullable: NULL for pre-auth errors
	Subsystem string          `json:"subsystem" db:"subsystem"`
	ErrorType string          `json:"error_type" db:"error_type"`
	Message   string          `json:"message" db:"message"`
	Metadata  json.RawMessage `json:"metadata,omitempty" db:"metadata"` // JSONB scans into RawMessage and marshals inline
	URL       string          `json:"url" db:"url"`
	CreatedAt time.Time       `json:"created_at" db:"created_at"`
}

// GetErrors returns paginated error logs (admin only)
func (h *ClientErrorHandler) GetErrors(c *gin.Context) {
	// Parse pagination params
	page := 1
	pageSize := 50
	if p, ok := c.GetQuery("page"); ok {
		fmt.Sscanf(p, "%d", &page)
	}
	if ps, ok := c.GetQuery("page_size"); ok {
		fmt.Sscanf(ps, "%d", &pageSize)
	}
	if page < 1 {
		page = 1
	}
	if pageSize < 1 {
		pageSize = 50
	}
	if pageSize > 100 {
		pageSize = 100 // Max page size
	}

	// Parse filters
	subsystem := c.Query("subsystem")
	errorType := c.Query("error_type")
	agentIDStr := c.Query("agent_id")

	// Build the WHERE clause once and share it between the page query and the count query
	where := ` WHERE 1=1`
	params := map[string]interface{}{}

	if subsystem != "" {
		where += " AND subsystem = :subsystem"
		params["subsystem"] = subsystem
	}
	if errorType != "" {
		where += " AND error_type = :error_type"
		params["error_type"] = errorType
	}
	if agentIDStr != "" {
		where += " AND agent_id = :agent_id"
		params["agent_id"] = agentIDStr
	}

	query := `SELECT id, agent_id, subsystem, error_type, message, metadata, url, created_at
	          FROM client_errors` + where + ` ORDER BY created_at DESC LIMIT :limit OFFSET :offset`
	params["limit"] = pageSize
	params["offset"] = (page - 1) * pageSize

	// sqlx.Select does not bind named parameters from a map directly; expand them first
	boundQuery, args, err := sqlx.Named(query, params)
	if err != nil {
		log.Printf("[ERROR] [server] [client_error] query_build_failed error=\"%v\"", err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "query failed"})
		return
	}
	boundQuery = h.db.Rebind(boundQuery)

	// Execute query
	var errorRows []ClientErrorResponse
	if err := h.db.Select(&errorRows, boundQuery, args...); err != nil {
		log.Printf("[ERROR] [server] [client_error] query_failed error=\"%v\"", err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "query failed"})
		return
	}

	// Get total count using the same filters
	countQuery := `SELECT COUNT(*) FROM client_errors` + where
	boundCount, countArgs, err := sqlx.Named(countQuery, params)
	if err != nil {
		log.Printf("[ERROR] [server] [client_error] count_build_failed error=\"%v\"", err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "count failed"})
		return
	}

	var total int64
	if err := h.db.Get(&total, h.db.Rebind(boundCount), countArgs...); err != nil {
		log.Printf("[ERROR] [server] [client_error] count_failed error=\"%v\"", err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "count failed"})
		return
	}

	totalPages := int((total + int64(pageSize) - 1) / int64(pageSize))

	response := GetErrorsResponse{
		Errors:     errorRows,
		Total:      total,
		Page:       page,
		PageSize:   pageSize,
		TotalPages: totalPages,
	}

	c.JSON(http.StatusOK, response)
}

// LogErrorRequest represents a client error log entry
type LogErrorRequest struct {
	Subsystem  string                 `json:"subsystem" binding:"required"`
	ErrorType  string                 `json:"error_type" binding:"required,oneof=javascript_error api_error ui_error validation_error"`
	Message    string                 `json:"message" binding:"required,max=10000"`
	StackTrace string                 `json:"stack_trace,omitempty"`
	Metadata   map[string]interface{} `json:"metadata,omitempty"`
	URL        string                 `json:"url" binding:"required"`
}

// LogError processes and stores frontend errors
func (h *ClientErrorHandler) LogError(c *gin.Context) {
	var req LogErrorRequest
	if err := c.ShouldBindJSON(&req); err != nil {
		log.Printf("[ERROR] [server] [client_error] validation_failed error=\"%v\"", err)
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request data"})
		return
	}

	// Extract agent ID from auth middleware if available
	var agentID interface{}
	if agentIDValue, exists := c.Get("agentID"); exists {
		if id, ok := agentIDValue.(uuid.UUID); ok {
			agentID = id
		}
	}

	// Log to console with HISTORY prefix
	log.Printf("[ERROR] [server] [client] [%s] agent_id=%v subsystem=%s message=\"%s\"",
		req.ErrorType, agentID, req.Subsystem, truncate(req.Message, 200))
	log.Printf("[HISTORY] [server] [client_error] agent_id=%v subsystem=%s type=%s url=\"%s\" message=\"%s\" timestamp=%s",
		agentID, req.Subsystem, req.ErrorType, req.URL, req.Message, time.Now().Format(time.RFC3339))

	// Store in database with retry logic
	if err := h.storeError(agentID, req, c); err != nil {
		log.Printf("[ERROR] [server] [client_error] store_failed error=\"%v\"", err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to store error"})
		return
	}

	c.JSON(http.StatusOK, gin.H{"logged": true})
}

// storeError persists error to database with retry
func (h *ClientErrorHandler) storeError(agentID interface{}, req LogErrorRequest, c *gin.Context) error {
	const maxRetries = 3
	var lastErr error

	for attempt := 1; attempt <= maxRetries; attempt++ {
		query := `INSERT INTO client_errors (agent_id, subsystem, error_type, message, stack_trace, metadata, url, user_agent)
		          VALUES (:agent_id, :subsystem, :error_type, :message, :stack_trace, :metadata, :url, :user_agent)`

		// Convert metadata map to JSON for PostgreSQL JSONB column
		var metadataJSON json.RawMessage
		if len(req.Metadata) > 0 {
			jsonBytes, err := json.Marshal(req.Metadata)
			if err != nil {
				log.Printf("[ERROR] [server] [client_error] metadata_marshal_failed error=\"%v\"", err)
				metadataJSON = nil
			} else {
				metadataJSON = json.RawMessage(jsonBytes)
			}
		}

		_, err := h.db.NamedExec(query, map[string]interface{}{
			"agent_id":    agentID,
			"subsystem":   req.Subsystem,
			"error_type":  req.ErrorType,
			"message":     req.Message,
			"stack_trace": req.StackTrace,
			"metadata":    metadataJSON,
			"url":         req.URL,
			"user_agent":  c.GetHeader("User-Agent"),
		})

		if err == nil {
			return nil
		}

		lastErr = err
		if attempt < maxRetries {
			time.Sleep(time.Duration(attempt) * time.Second)
			continue
		}
	}

	return fmt.Errorf("failed after %d attempts: %w", maxRetries, lastErr)
}

func truncate(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	return s[:maxLen] + "..."
}
diff --git a/aggregator-server/internal/command/factory.go b/aggregator-server/internal/command/factory.go
new file mode 100644
index 0000000..7dd25d4
--- /dev/null
+++ b/aggregator-server/internal/command/factory.go
@@ -0,0 +1,66 @@
package command

import (
	"fmt"
	"time"

	"github.com/Fimeg/RedFlag/aggregator-server/internal/models"
	"github.com/google/uuid"
)

// Factory creates validated AgentCommand instances
type Factory struct {
	validator *Validator
}

// NewFactory creates a new command factory
func NewFactory() *Factory {
	return &Factory{
		validator: NewValidator(),
	}
}

// Create generates a new validated AgentCommand with unique ID
func (f *Factory) Create(agentID uuid.UUID, commandType string, params map[string]interface{}) (*models.AgentCommand, error) {
	cmd := &models.AgentCommand{
		ID:          uuid.New(),
		AgentID:     agentID,
		CommandType: commandType,
		Status:      "pending",
		Source:      determineSource(commandType),
		Params:      params,
		CreatedAt:   time.Now(),
		UpdatedAt:   time.Now(),
	}

	if err := f.validator.Validate(cmd); err != nil {
		return nil, fmt.Errorf("command validation failed: %w", err)
	}

	return cmd, nil
}

// determineSource classifies command source based on type
func determineSource(commandType string) string {
	if isSystemCommand(commandType) {
		return "system"
	}
	return "manual"
}

func isSystemCommand(commandType string) bool {
	systemCommands := []string{
		"enable_heartbeat",
		"disable_heartbeat",
		"update_check",
		"cleanup_old_logs",
	}

	for _, cmd := range systemCommands {
		if commandType == cmd {
			return true
		}
	}
	return false
}
diff --git a/aggregator-server/internal/command/validator.go b/aggregator-server/internal/command/validator.go
new file mode 100644
index 0000000..1ecf65a
--- /dev/null
+++ b/aggregator-server/internal/command/validator.go
@@ -0,0 +1,123 @@
package command

import (
	"errors"
	"fmt"

	"github.com/Fimeg/RedFlag/aggregator-server/internal/models"
	"github.com/google/uuid"
)

// Validator validates command parameters
type Validator struct {
	minCheckInSeconds int
	maxCheckInSeconds int
	minScannerMinutes int
	maxScannerMinutes int
}

// NewValidator creates a new command validator
func NewValidator() *Validator {
	return &Validator{
		minCheckInSeconds: 60,   // 1 minute minimum
		maxCheckInSeconds: 3600, // 1 hour maximum
		minScannerMinutes: 1,    // 1 minute minimum
		maxScannerMinutes: 1440, // 24 hours maximum
	}
}

// Validate performs comprehensive command validation
func (v *Validator) Validate(cmd *models.AgentCommand) error {
	if cmd == nil {
		return errors.New("command cannot be nil")
	}

	if cmd.ID == uuid.Nil {
		return errors.New("command ID cannot be zero UUID")
	}

	if cmd.AgentID == uuid.Nil {
		return errors.New("agent ID is required")
	}

	if cmd.CommandType == "" {
		return errors.New("command type is required")
	}

	if cmd.Status == "" {
		return errors.New("status is required")
	}

	validStatuses := []string{"pending", "running", "completed", "failed", "cancelled"}
	if !contains(validStatuses, cmd.Status) {
		return fmt.Errorf("invalid status: %s", cmd.Status)
	}

	if cmd.Source != "manual" && cmd.Source != "system" {
fmt.Errorf("source must be 'manual' or 'system', got: %s", cmd.Source) + } + + // Validate command type format + if err := v.validateCommandType(cmd.CommandType); err != nil { + return err + } + + return nil +} + +// ValidateSubsystemAction validates subsystem-specific actions +func (v *Validator) ValidateSubsystemAction(subsystem string, action string) error { + validActions := map[string][]string{ + "storage": {"trigger", "enable", "disable", "set_interval"}, + "system": {"trigger", "enable", "disable", "set_interval"}, + "docker": {"trigger", "enable", "disable", "set_interval"}, + "updates": {"trigger", "enable", "disable", "set_interval"}, + } + + actions, ok := validActions[subsystem] + if !ok { + return fmt.Errorf("unknown subsystem: %s", subsystem) + } + + if !contains(actions, action) { + return fmt.Errorf("invalid action '%s' for subsystem '%s'", action, subsystem) + } + + return nil +} + +// ValidateInterval ensures scanner intervals are within bounds +func (v *Validator) ValidateInterval(subsystem string, minutes int) error { + if minutes < v.minScannerMinutes { + return fmt.Errorf("interval %d minutes below minimum %d for subsystem %s", + minutes, v.minScannerMinutes, subsystem) + } + + if minutes > v.maxScannerMinutes { + return fmt.Errorf("interval %d minutes above maximum %d for subsystem %s", + minutes, v.maxScannerMinutes, subsystem) + } + + return nil +} + +func (v *Validator) validateCommandType(commandType string) error { + validPrefixes := []string{"scan_", "install_", "update_", "enable_", "disable_", "reboot"} + + for _, prefix := range validPrefixes { + if len(commandType) >= len(prefix) && commandType[:len(prefix)] == prefix { + return nil + } + } + + return fmt.Errorf("invalid command type format: %s", commandType) +} + +func contains(slice []string, item string) bool { + for _, s := range slice { + if s == item { + return true + } + } + return false +} diff --git a/aggregator-server/internal/database/migrations/017_add_machine_id.up.sql b/aggregator-server/internal/database/migrations/017_add_machine_id.up.sql index 5289210..33a5590 100644 --- a/aggregator-server/internal/database/migrations/017_add_machine_id.up.sql +++ b/aggregator-server/internal/database/migrations/017_add_machine_id.up.sql @@ -6,7 +6,8 @@ DROP INDEX IF EXISTS idx_agents_machine_id; -- Create unique index to prevent duplicate machine IDs (allows multiple NULLs) -CREATE UNIQUE INDEX CONCURRENTLY idx_agents_machine_id_unique ON agents(machine_id) WHERE machine_id IS NOT NULL; +-- Note: CONCURRENTLY removed to allow transaction-based migration +CREATE UNIQUE INDEX idx_agents_machine_id_unique ON agents(machine_id) WHERE machine_id IS NOT NULL; -- Add comment for documentation COMMENT ON COLUMN agents.machine_id IS 'SHA-256 hash of hardware fingerprint (prevents agent impersonation via config copying)'; diff --git a/aggregator-server/internal/database/migrations/023_client_error_logging.down.sql b/aggregator-server/internal/database/migrations/023_client_error_logging.down.sql new file mode 100644 index 0000000..ecb4a97 --- /dev/null +++ b/aggregator-server/internal/database/migrations/023_client_error_logging.down.sql @@ -0,0 +1,3 @@ +-- Rollback migration 023: Client Error Logging Schema + +DROP TABLE IF EXISTS client_errors; diff --git a/aggregator-server/internal/database/migrations/023_client_error_logging.up.sql b/aggregator-server/internal/database/migrations/023_client_error_logging.up.sql new file mode 100644 index 0000000..40b1434 --- /dev/null +++ 
+++ b/aggregator-server/internal/database/migrations/023_client_error_logging.up.sql
@@ -0,0 +1,28 @@
-- Migration 023: Client Error Logging Schema
-- Implements ETHOS #1: Errors are History, Not /dev/null

CREATE TABLE client_errors (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    agent_id UUID REFERENCES agents(id) ON DELETE SET NULL,
    subsystem VARCHAR(50) NOT NULL,
    error_type VARCHAR(50) NOT NULL,
    message TEXT NOT NULL,
    stack_trace TEXT,
    metadata JSONB,
    url TEXT NOT NULL,
    user_agent TEXT,
    created_at TIMESTAMP DEFAULT NOW()
);

-- Indexes for efficient querying
CREATE INDEX idx_client_errors_agent_time ON client_errors(agent_id, created_at DESC);
CREATE INDEX idx_client_errors_subsystem_time ON client_errors(subsystem, created_at DESC);
CREATE INDEX idx_client_errors_error_type_time ON client_errors(error_type, created_at DESC);
CREATE INDEX idx_client_errors_created_at ON client_errors(created_at DESC);

-- Comments for documentation
COMMENT ON TABLE client_errors IS 'Frontend error logs for debugging and auditing. Implements ETHOS #1.';
COMMENT ON COLUMN client_errors.agent_id IS 'Agent active when error occurred (NULL for pre-auth errors)';
COMMENT ON COLUMN client_errors.subsystem IS 'RedFlag subsystem being used (storage, system, docker, etc.)';
COMMENT ON COLUMN client_errors.error_type IS 'Error category: javascript_error, api_error, ui_error, validation_error';
COMMENT ON COLUMN client_errors.metadata IS 'Additional context (component name, API response, user actions)';
diff --git a/aggregator-server/internal/database/migrations/023a_command_deduplication.down.sql b/aggregator-server/internal/database/migrations/023a_command_deduplication.down.sql
new file mode 100644
index 0000000..fb887b2
--- /dev/null
+++ b/aggregator-server/internal/database/migrations/023a_command_deduplication.down.sql
@@ -0,0 +1,5 @@
-- Rollback migration 023a: Command Deduplication Schema

DROP INDEX IF EXISTS idx_agent_pending_subsystem;
ALTER TABLE agent_commands DROP COLUMN IF EXISTS idempotency_key;
DROP INDEX IF EXISTS idx_agent_commands_idempotency_key;
diff --git a/aggregator-server/internal/database/migrations/023a_command_deduplication.up.sql b/aggregator-server/internal/database/migrations/023a_command_deduplication.up.sql
new file mode 100644
index 0000000..f46b52b
--- /dev/null
+++ b/aggregator-server/internal/database/migrations/023a_command_deduplication.up.sql
@@ -0,0 +1,16 @@
-- Migration 023a: Command Deduplication Schema
-- Prevents multiple pending scan commands per subsystem per agent

-- Add unique constraint to enforce single pending command per subsystem
CREATE UNIQUE INDEX idx_agent_pending_subsystem
ON agent_commands(agent_id, command_type, status)
WHERE status = 'pending';

-- Add idempotency key support for retry scenarios
ALTER TABLE agent_commands ADD COLUMN idempotency_key VARCHAR(64) UNIQUE NULL;
CREATE INDEX idx_agent_commands_idempotency_key ON agent_commands(idempotency_key);

-- Comments for documentation
COMMENT ON TABLE agent_commands IS 'Commands sent to agents for execution';
COMMENT ON COLUMN agent_commands.idempotency_key IS
    'Prevents duplicate command creation from retry logic. Based on agent_id + subsystem + timestamp window.';
diff --git a/aggregator-server/internal/models/command.go b/aggregator-server/internal/models/command.go
index f1c67bc..4c7f6d3 100644
--- a/aggregator-server/internal/models/command.go
+++ b/aggregator-server/internal/models/command.go
@@ -1,6 +1,7 @@
 package models
 
 import (
+	"errors"
 	"time"
 
 	"github.com/google/uuid"
@@ -16,12 +17,52 @@
 	Source        string     `json:"source" db:"source"`
 	Signature     string     `json:"signature,omitempty" db:"signature"`
 	CreatedAt     time.Time  `json:"created_at" db:"created_at"`
+	UpdatedAt     time.Time  `json:"updated_at" db:"updated_at"`
 	SentAt        *time.Time `json:"sent_at,omitempty" db:"sent_at"`
 	CompletedAt   *time.Time `json:"completed_at,omitempty" db:"completed_at"`
 	Result        JSONB      `json:"result,omitempty" db:"result"`
 	RetriedFromID *uuid.UUID `json:"retried_from_id,omitempty" db:"retried_from_id"`
 }
 
+// Validate checks if the command has all required fields
+func (c *AgentCommand) Validate() error {
+	if c.ID == uuid.Nil {
+		return ErrCommandIDRequired
+	}
+	if c.AgentID == uuid.Nil {
+		return ErrAgentIDRequired
+	}
+	if c.CommandType == "" {
+		return ErrCommandTypeRequired
+	}
+	if c.Status == "" {
+		return ErrStatusRequired
+	}
+	if c.Source != "manual" && c.Source != "system" {
+		return ErrInvalidSource
+	}
+	return nil
+}
+
+// IsTerminal returns true if the command is in a terminal state
+func (c *AgentCommand) IsTerminal() bool {
+	return c.Status == "completed" || c.Status == "failed" || c.Status == "cancelled"
+}
+
+// CanRetry returns true if the command can be retried
+func (c *AgentCommand) CanRetry() bool {
+	return c.Status == "failed" && c.RetriedFromID == nil
+}
+
+// Predefined errors for validation
+var (
+	ErrCommandIDRequired   = errors.New("command ID cannot be zero UUID")
+	ErrAgentIDRequired     = errors.New("agent ID is required")
+	ErrCommandTypeRequired = errors.New("command type is required")
+	ErrStatusRequired      = errors.New("status is required")
+	ErrInvalidSource       = errors.New("source must be 'manual' or 'system'")
+)
+
 // CommandsResponse is returned when an agent checks in for commands
 type CommandsResponse struct {
 	Commands []CommandItem `json:"commands"`
diff --git a/aggregator-server/internal/version/versions.go b/aggregator-server/internal/version/versions.go
index 5198134..09b8c91 100644
--- a/aggregator-server/internal/version/versions.go
+++ b/aggregator-server/internal/version/versions.go
@@ -7,24 +7,37 @@ import (
 
 // Version coordination for Server Authority model
 // The server is the single source of truth for all version information
+//
+// Version Sources:
+//   - Agent versions: Compiled into agent via ldflags during build (see agent/internal/version)
+//   - Server versions: Compiled into server via ldflags during build (injected below)
+//   - Database: agents table stores agent_version at registration
 
-// CurrentVersions holds the authoritative version information
+// Build-time injected version information (SERVER AUTHORITY)
+// Injected by build script during server compilation
+var (
+	AgentVersion    = "dev" // Server's agent version (format: 0.1.27)
+	ConfigVersion   = "dev" // Config schema version (format: 3)
+	MinAgentVersion = "dev" // Minimum supported agent version
+)
+
+// CurrentVersions holds the authoritative version information for API responses
 type CurrentVersions struct {
-	AgentVersion    string    `json:"agent_version"`     // e.g., "0.1.23.6"
-	ConfigVersion   string    `json:"config_version"`    // e.g., "6"
+	AgentVersion    string    `json:"agent_version"`     // e.g., "0.1.27"
"0.1.27" + ConfigVersion string `json:"config_version"` // e.g., "3" MinAgentVersion string `json:"min_agent_version"` // e.g., "0.1.22" BuildTime time.Time `json:"build_time"` } // GetCurrentVersions returns the current version information -// In production, this would come from a version file, database, or environment +// Version is compiled into the server binary at build time via ldflags func GetCurrentVersions() CurrentVersions { - // TODO: For production, load this from version file or database - // For now, use environment variables with defaults + // Build-time injection allows version updates without code changes + // See Dockerfile for injection via: -ldflags "-X .../version.AgentVersion=0.1.27" return CurrentVersions{ - AgentVersion: "0.1.23", // Should match current branch - ConfigVersion: "3", // Should map from agent version (0.1.23 -> "3") - MinAgentVersion: "0.1.22", + AgentVersion: AgentVersion, + ConfigVersion: ConfigVersion, + MinAgentVersion: MinAgentVersion, BuildTime: time.Now(), } } diff --git a/aggregator-web/src/hooks/useScanState.ts b/aggregator-web/src/hooks/useScanState.ts new file mode 100644 index 0000000..4e5e8bd --- /dev/null +++ b/aggregator-web/src/hooks/useScanState.ts @@ -0,0 +1,108 @@ +import { useState, useCallback } from 'react'; +import { api } from '@/lib/api'; +import { toastWithLogging } from '@/lib/toast-with-logging'; + +interface ScanState { + isScanning: boolean; + commandId?: string; + error?: string; +} + +/** + * Hook for managing scan button state and preventing duplicate scans + * Integrates with backend deduplication (409 Conflict responses) + */ +export function useScanState(agentId: string, subsystem: string) { + const [state, setState] = useState({ + isScanning: false, + }); + + const triggerScan = useCallback(async () => { + if (state.isScanning) { + toastWithLogging.info('Scan already in progress', { subsystem }); + return; + } + + setState({ isScanning: true, commandId: undefined, error: undefined }); + + try { + const result = await api.post(`/agents/${agentId}/subsystems/${subsystem}/trigger`); + + setState(prev => ({ + ...prev, + commandId: result.data.command_id, + })); + + // Poll for completion or wait for subscription update + await waitForScanComplete(agentId, result.data.command_id); + + setState({ isScanning: false, commandId: result.data.command_id }); + + toastWithLogging.success(`${subsystem} scan completed`, { subsystem }); + } catch (error: any) { + const isAlreadyRunning = error.response?.status === 409; + + if (isAlreadyRunning) { + const existingCommandId = error.response?.data?.command_id; + setState({ + isScanning: false, + commandId: existingCommandId, + error: 'Scan already in progress', + }); + + toastWithLogging.info(`Scan already running (command: ${existingCommandId})`, { subsystem }); + } else { + const errorMessage = error.response?.data?.error || error.message; + setState({ + isScanning: false, + error: errorMessage, + }); + + toastWithLogging.error(`Failed to trigger scan: ${errorMessage}`, { subsystem }); + } + } + }, [agentId, subsystem, state.isScanning]); + + const reset = useCallback(() => { + setState({ isScanning: false, commandId: undefined, error: undefined }); + }, []); + + return { + isScanning: state.isScanning, + commandId: state.commandId, + error: state.error, + triggerScan, + reset, + }; +} + +/** + * Wait for scan to complete by polling command status + * Max wait: 5 minutes + */ +async function waitForScanComplete(agentId: string, commandId: string): Promise { + const maxWaitMs = 
  const maxWaitMs = 300000; // 5 minutes max
  const startTime = Date.now();
  const pollInterval = 2000; // Poll every 2 seconds

  return new Promise<void>((resolve, reject) => {
    const interval = setInterval(async () => {
      try {
        const result = await api.get(`/agents/${agentId}/commands/${commandId}`);

        if (result.data.status === 'completed' || result.data.status === 'failed') {
          clearInterval(interval);
          resolve();
          return; // stop this tick so the timeout check can't settle the promise again
        }
      } catch (error) {
        clearInterval(interval);
        reject(error);
        return;
      }

      if (Date.now() - startTime > maxWaitMs) {
        clearInterval(interval);
        reject(new Error('Scan timeout'));
      }
    }, pollInterval);
  });
}
diff --git a/aggregator-web/src/lib/api.ts b/aggregator-web/src/lib/api.ts
index 92cb2ae..dbdb76c 100644
--- a/aggregator-web/src/lib/api.ts
+++ b/aggregator-web/src/lib/api.ts
@@ -64,6 +64,43 @@
   }
 );
 
+// Error logging interceptor
+import { clientErrorLogger } from './client-error-logger';
+api.interceptors.response.use(
+  (response) => response,
+  async (error) => {
+    // Don't log errors from the error logger itself
+    if (error.config?.headers?.['X-Error-Logger-Request']) {
+      return Promise.reject(error);
+    }
+
+    // Extract subsystem from URL
+    const subsystem = extractSubsystem(error.config?.url);
+
+    // Log API errors
+    clientErrorLogger.logError({
+      subsystem,
+      error_type: 'api_error',
+      message: error.message,
+      metadata: {
+        status_code: error.response?.status,
+        endpoint: error.config?.url,
+        method: error.config?.method,
+        response_data: error.response?.data,
+      },
+    }).catch(() => {
+      // Don't let logging errors hide the original error
+    });
+
+    return Promise.reject(error);
+  }
+);
+
+function extractSubsystem(url: string = ''): string {
+  const matches = url.match(/\/(storage|system|docker|updates|agent)/);
+  return matches ? matches[1] : 'unknown';
+}
+
 // API endpoints
 export const agentApi = {
   // Get all agents
@@ -876,4 +913,7 @@ export const storageMetricsApi = {
   },
 };
 
+// Named export for api instance
+export { api };
+
 export default api;
\ No newline at end of file
diff --git a/aggregator-web/src/lib/client-error-logger.ts b/aggregator-web/src/lib/client-error-logger.ts
new file mode 100644
index 0000000..e1f186d
--- /dev/null
+++ b/aggregator-web/src/lib/client-error-logger.ts
@@ -0,0 +1,166 @@
import { api } from './api';

export interface ClientErrorLog {
  subsystem: string;
  error_type: 'javascript_error' | 'api_error' | 'ui_error' | 'validation_error';
  message: string;
  stack_trace?: string;
  metadata?: Record<string, any>;
  url: string;
  timestamp: string;
}

/**
 * ClientErrorLogger provides reliable frontend error logging with retry logic
 * Implements ETHOS #3: Assume Failure; Build for Resilience
 */
export class ClientErrorLogger {
  private maxRetries = 3;
  private baseDelayMs = 1000;
  private localStorageKey = 'redflag-error-queue';
  private offlineBuffer: ClientErrorLog[] = [];
  private isOnline = navigator.onLine;

  constructor() {
    // Listen for online/offline events
    window.addEventListener('online', () => {
      this.isOnline = true; // mark online again so flushOfflineBuffer doesn't bail early
      this.flushOfflineBuffer();
    });
    window.addEventListener('offline', () => { this.isOnline = false; });
  }

  /**
   * Log an error with automatic retry and offline queuing
   */
  async logError(errorData: Omit<ClientErrorLog, 'url' | 'timestamp'>): Promise<void> {
    const fullError: ClientErrorLog = {
      ...errorData,
      url: window.location.href,
      timestamp: new Date().toISOString(),
    };

    // Try to send immediately
    try {
      await this.sendWithRetry(fullError);
      return;
    } catch (error) {
      // If failed after retries, queue for later
      this.queueForRetry(fullError);
    }
  }

  /**
   * Send error to backend with exponential backoff retry
   */
  private async sendWithRetry(error: ClientErrorLog): Promise<void> {
    for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
      try {
        await api.post('/logs/client-error', error, {
          headers: { 'X-Error-Logger-Request': 'true' },
        });

        // Success, remove from queue if it was there
        this.removeFromQueue(error);
        return;
      } catch (err) {
        if (attempt === this.maxRetries) {
          throw err; // Rethrow after final attempt
        }

        // Exponential backoff
        await this.sleep(this.baseDelayMs * Math.pow(2, attempt - 1));
      }
    }
  }

  /**
   * Queue error for retry when network is available
   */
  private queueForRetry(error: ClientErrorLog): void {
    try {
      const queue = this.getQueue();
      queue.push({
        ...error,
        retryCount: (error as any).retryCount || 0,
        queuedAt: new Date().toISOString(),
      });

      // Save to localStorage for persistence
      localStorage.setItem(this.localStorageKey, JSON.stringify(queue));

      // Also keep in memory buffer
      this.offlineBuffer.push(error);
    } catch (storageError) {
      // localStorage might be full or unavailable
      console.warn('Failed to queue error for retry:', storageError);
    }
  }

  /**
   * Flush offline buffer when coming back online
   */
  private async flushOfflineBuffer(): Promise<void> {
    if (!this.isOnline) return;

    const queue = this.getQueue();
    if (queue.length === 0) return;

    const failed: typeof queue = [];

    for (const queuedError of queue) {
      try {
        await this.sendWithRetry(queuedError);
      } catch (error) {
        failed.push(queuedError);
      }
    }

    // Update queue with remaining failed items
    if (failed.length < queue.length) {
      localStorage.setItem(this.localStorageKey, JSON.stringify(failed));
    }
  }

  /**
   * Get current error queue from localStorage
   */
  private getQueue(): any[] {
    try {
      const stored = localStorage.getItem(this.localStorageKey);
      return stored ? JSON.parse(stored) : [];
    } catch {
      return [];
    }
  }

  /**
   * Remove successfully sent error from queue
   */
  private removeFromQueue(sentError: ClientErrorLog): void {
    try {
      const queue = this.getQueue();
      const filtered = queue.filter(queued =>
        queued.timestamp !== sentError.timestamp ||
        queued.message !== sentError.message
      );

      if (filtered.length < queue.length) {
        localStorage.setItem(this.localStorageKey, JSON.stringify(filtered));
      }
    } catch {
      // Best effort cleanup
    }
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Singleton instance
export const clientErrorLogger = new ClientErrorLogger();

// Auto-retry failed logs on app load
if (typeof window !== 'undefined') {
  window.addEventListener('load', () => {
    clientErrorLogger.flushOfflineBuffer().catch(() => {});
  });
}
diff --git a/aggregator-web/src/lib/toast-with-logging.ts b/aggregator-web/src/lib/toast-with-logging.ts
new file mode 100644
index 0000000..49b8104
--- /dev/null
+++ b/aggregator-web/src/lib/toast-with-logging.ts
@@ -0,0 +1,76 @@
import toast, { ToastOptions } from 'react-hot-toast';
import { clientErrorLogger } from './client-error-logger';

/**
 * Extract subsystem from current route
 */
function getCurrentSubsystem(): string {
  if (typeof window === 'undefined') return 'unknown';

  const path = window.location.pathname;

  // Map routes to subsystems
  if (path.includes('/storage')) return 'storage';
  if (path.includes('/system')) return 'system';
  if (path.includes('/docker')) return 'docker';
  if (path.includes('/updates')) return 'updates';
  if (path.includes('/agent/')) return 'agent';

  return 'unknown';
}

/**
 * Wrap toast.error to automatically log errors to backend
 * Implements ETHOS #1: Errors are History
 */
export const toastWithLogging = {
  error: (message: string, options?: ToastOptions & { subsystem?: string }) => {
    const subsystem = options?.subsystem || getCurrentSubsystem();

    // Log to backend asynchronously - don't block UI
    clientErrorLogger.logError({
      subsystem,
      error_type: 'ui_error',
      message: message.substring(0, 5000), // Prevent excessively long messages
      metadata: {
        component: options?.id,
        duration: options?.duration,
        position: options?.position,
        timestamp: new Date().toISOString(),
      },
    }).catch(() => {
      // Silently ignore logging failures - don't crash the UI
    });

    // Show toast to user
    return toast.error(message, options);
  },

  // Passthrough methods. react-hot-toast has no built-in info/warning variants,
  // so those fall back to the base toast() after stripping the subsystem field.
  success: (message: string, options?: ToastOptions & { subsystem?: string }) => {
    const { subsystem: _ignored, ...toastOpts } = options ?? {};
    return toast.success(message, toastOpts);
  },
  info: (message: string, options?: ToastOptions & { subsystem?: string }) => {
    const { subsystem: _ignored, ...toastOpts } = options ?? {};
    return toast(message, toastOpts);
  },
  warning: (message: string, options?: ToastOptions & { subsystem?: string }) => {
    const { subsystem: _ignored, ...toastOpts } = options ?? {};
    return toast(message, toastOpts);
  },
  loading: toast.loading,
  dismiss: toast.dismiss,
  remove: toast.remove,
  promise: toast.promise,
};

/**
 * React hook for toast with automatic subsystem detection
 */
export function useToastWithLogging() {
  return {
    error: (message: string, options?: ToastOptions & { subsystem?: string }) => {
      return toastWithLogging.error(message, {
        ...options,
        subsystem: options?.subsystem || getCurrentSubsystem(),
      });
    },
    success: toastWithLogging.success,
    info: toastWithLogging.info,
    warning: toastWithLogging.warning,
    loading: toast.loading,
    dismiss: toast.dismiss,
  };
}