Files
Redflag/aggregator-server/internal/services/timeout.go
Fimeg 57be3754c6 fix: agent acknowledgment recursion and subsystem UI improvements
- Fix recursive call in reportLogWithAck that caused infinite loop
- Add machine binding and security API endpoints
- Enhance AgentScanners component with security status display
- Update scheduler and timeout service reliability
- Remove deprecated install.sh script
- Add subsystem configuration and logging improvements
2025-11-03 21:02:57 -05:00

261 lines
9.0 KiB
Go

package services
import (
"fmt"
"log"
"time"
"github.com/Fimeg/RedFlag/aggregator-server/internal/database/queries"
"github.com/Fimeg/RedFlag/aggregator-server/internal/models"
"github.com/google/uuid"
)
// TimeoutService watches agent commands in the background and times out the
// ones that have stayed in the 'sent' or 'pending' state for too long.
type TimeoutService struct {
	// Query helpers used to read and update commands, packages and logs.
	commandQueries *queries.CommandQueries
	updateQueries  *queries.UpdateQueries

	// ticker is created by Start and drives the periodic check.
	ticker *time.Ticker
	// stopChan is closed by Stop to end the monitoring goroutine.
	stopChan chan bool

	// sentTimeout applies to commands already sent to agents;
	// pendingTimeout applies to commands stuck in queue.
	sentTimeout, pendingTimeout time.Duration
}
// NewTimeoutService builds a TimeoutService around the given command and
// update query helpers. No monitoring happens until Start is called.
//
// Defaults: 2 hours for commands already sent to agents and 30 minutes for
// commands stuck in queue.
// TODO: Make these timeout durations user-adjustable in settings
func NewTimeoutService(cq *queries.CommandQueries, uq *queries.UpdateQueries) *TimeoutService {
	svc := new(TimeoutService)
	svc.commandQueries, svc.updateQueries = cq, uq
	svc.sentTimeout = 2 * time.Hour
	svc.pendingTimeout = 30 * time.Minute
	svc.stopChan = make(chan bool)
	return svc
}
// Start launches the background goroutine that runs checkForTimeouts every
// five minutes until Stop closes the stop channel. The ticker is stored on the
// service and stopped by the goroutine when it exits.
func (ts *TimeoutService) Start() {
	log.Printf("Starting timeout service with %v sent timeout, %v pending timeout", ts.sentTimeout, ts.pendingTimeout)

	// Interval between two timeout sweeps.
	const checkInterval = 5 * time.Minute
	ts.ticker = time.NewTicker(checkInterval)

	monitor := func() {
		for {
			select {
			case <-ts.stopChan:
				ts.ticker.Stop()
				log.Println("Timeout service stopped")
				return
			case <-ts.ticker.C:
				ts.checkForTimeouts()
			}
		}
	}
	go monitor()
}
// Stop signals the monitoring goroutine launched by Start to exit by closing
// the stop channel.
//
// Calling Stop again after it has already run is a no-op: closing an
// already-closed channel panics, so the channel is probed first. The
// probe-then-close sequence is not atomic, so Stop must not be called from
// several goroutines at the same time.
func (ts *TimeoutService) Stop() {
	select {
	case <-ts.stopChan:
		// Already closed by an earlier Stop; nothing left to do.
	default:
		close(ts.stopChan)
	}
}
// checkForTimeouts finds commands that have outlived their timeout and marks
// each of them as timed out.
//
// Two populations are examined:
//   - 'sent' commands whose SentAt is older than sentTimeout (commands without
//     a SentAt are skipped), and
//   - 'pending' commands whose CreatedAt is older than pendingTimeout.
//
// A failure to load either population is logged and does not stop the other
// from being processed. A failure to time out one command is logged and the
// remaining commands are still handled.
func (ts *TimeoutService) checkForTimeouts() {
	log.Println("Checking for timed out operations...")

	// Use a single reference instant so both thresholds are consistent.
	now := time.Now()
	sentTimeoutThreshold := now.Add(-ts.sentTimeout)
	pendingTimeoutThreshold := now.Add(-ts.pendingTimeout)

	var timedOutCommands []models.AgentCommand

	// Check 'sent' commands (already delivered to an agent).
	sentCommands, err := ts.commandQueries.GetCommandsByStatus(models.CommandStatusSent)
	if err != nil {
		log.Printf("Error getting sent commands: %v", err)
	} else {
		for _, command := range sentCommands {
			if command.SentAt != nil && command.SentAt.Before(sentTimeoutThreshold) {
				timedOutCommands = append(timedOutCommands, command)
			}
		}
	}
	// Everything collected so far came from the 'sent' population.
	sentTimedOut := len(timedOutCommands)

	// Check 'pending' commands (stuck in queue).
	pendingCommands, err := ts.commandQueries.GetCommandsByStatus(models.CommandStatusPending)
	if err != nil {
		log.Printf("Error getting pending commands: %v", err)
	} else {
		for _, command := range pendingCommands {
			if command.CreatedAt.Before(pendingTimeoutThreshold) {
				timedOutCommands = append(timedOutCommands, command)
				log.Printf("Found stuck pending command %s (type: %s, created: %s, age: %v)",
					command.ID, command.CommandType, command.CreatedAt.Format(time.RFC3339), time.Since(command.CreatedAt))
			}
		}
	}
	pendingTimedOut := len(timedOutCommands) - sentTimedOut

	if len(timedOutCommands) == 0 {
		log.Println("No timed out operations found")
		return
	}

	// Report how many commands actually timed out in each category, together
	// with the limits that are currently configured.
	log.Printf("Found %d timed out commands (%d sent >%v, %d stuck pending >%v)",
		len(timedOutCommands), sentTimedOut, ts.sentTimeout, pendingTimedOut, ts.pendingTimeout)

	for i := range timedOutCommands {
		// Point at the slice element rather than at a loop variable.
		command := &timedOutCommands[i]
		if err := ts.timeoutCommand(command); err != nil {
			log.Printf("Error timing out command %s: %v", command.ID, err)
		}
	}
}
// timeoutCommand flags one command as timed out and records the event.
//
// Steps, in order:
//  1. set the command status to timed_out,
//  2. store a result payload describing the timeout,
//  3. mark the related update package as failed (best effort),
//  4. write an update log entry (best effort).
//
// Only failures in steps 1 and 2 are returned; failures in steps 3 and 4 are
// logged as warnings because the command itself has already been timed out.
func (ts *TimeoutService) timeoutCommand(command *models.AgentCommand) error {
	// A 'sent' command was held to the sent limit; anything else is treated as
	// having hit the pending limit.
	appliedTimeout := ts.pendingTimeout
	if command.Status == models.CommandStatusSent {
		appliedTimeout = ts.sentTimeout
	}

	log.Printf("Timing out command %s (type: %s, agent: %s)",
		command.ID, command.CommandType, command.AgentID)

	statusErr := ts.commandQueries.UpdateCommandStatus(command.ID, models.CommandStatusTimedOut)
	if statusErr != nil {
		return fmt.Errorf("failed to update command status: %w", statusErr)
	}

	timeoutInfo := models.JSONB{
		"error":      "operation timed out",
		"timeout_at": time.Now(),
		"duration":   appliedTimeout.String(),
		"command_id": command.ID.String(),
	}
	resultErr := ts.commandQueries.UpdateCommandResult(command.ID, timeoutInfo)
	if resultErr != nil {
		return fmt.Errorf("failed to update command result: %w", resultErr)
	}

	// Best effort from here on: the main timeout operation has succeeded.
	if pkgErr := ts.updateRelatedPackageStatus(command, appliedTimeout); pkgErr != nil {
		log.Printf("Warning: failed to update related package status: %v", pkgErr)
	}

	entry := models.UpdateLog{
		ID:              uuid.New(),
		AgentID:         command.AgentID,
		UpdatePackageID: ts.extractUpdatePackageID(command),
		Action:          command.CommandType,
		Result:          "failed", // Use 'failed' to comply with database constraint
		Stdout:          "",
		Stderr:          fmt.Sprintf("Command %s timed out after %v (timeout_id: %s)", command.CommandType, appliedTimeout, command.ID),
		ExitCode:        124, // Standard timeout exit code
		DurationSeconds: int(appliedTimeout.Seconds()),
		ExecutedAt:      time.Now(),
	}
	if logErr := ts.updateQueries.CreateUpdateLog(&entry); logErr != nil {
		log.Printf("Warning: failed to create timeout log entry: %v", logErr)
	}

	log.Printf("Successfully timed out command %s", command.ID)
	return nil
}
// updateRelatedPackageStatus marks the update package tied to a timed-out
// command as 'failed', attaching metadata that explains the timeout.
//
// Commands whose params carry no string "update_id" are not tied to a package
// and are ignored (nil is returned). For the others, "package_type" and
// "package_name" must both be string params; if either is missing or has a
// different type an error is returned instead of panicking on the type
// assertion. Any error from the package status update is returned as well.
func (ts *TimeoutService) updateRelatedPackageStatus(command *models.AgentCommand, appliedTimeout time.Duration) error {
	// The update_id value itself is not needed here; its presence is the
	// marker that this command relates to an update package.
	if _, ok := command.Params["update_id"].(string); !ok {
		return nil
	}

	packageType, ok := command.Params["package_type"].(string)
	if !ok {
		return fmt.Errorf("command %s has an update_id but no string package_type param", command.ID)
	}
	packageName, ok := command.Params["package_name"].(string)
	if !ok {
		return fmt.Errorf("command %s has an update_id but no string package_name param", command.ID)
	}

	// Update the package status to 'failed' with timeout reason
	metadata := models.JSONB{
		"timeout":          true,
		"timeout_at":       time.Now(),
		"timeout_duration": appliedTimeout.String(),
		"command_id":       command.ID.String(),
		"failure_reason":   "operation timed out",
	}

	return ts.updateQueries.UpdatePackageStatus(command.AgentID,
		packageType,
		packageName,
		"failed",
		metadata,
		nil) // nil = use time.Now() for timeout operations
}
// extractUpdatePackageID reads the "update_id" command param and returns it as
// a UUID pointer. It returns nil when the param is absent, is not a string, or
// does not parse as a UUID.
func (ts *TimeoutService) extractUpdatePackageID(command *models.AgentCommand) *uuid.UUID {
	raw, isString := command.Params["update_id"].(string)
	if isString {
		if parsed, err := uuid.Parse(raw); err == nil {
			return &parsed
		}
	}
	return nil
}
// GetTimeoutStatus reports counters describing timeout activity:
// the number of timed-out commands, the number of 'sent' (active) commands,
// how many active commands are approaching their timeout, the two configured
// timeout durations, and the time at which this report was produced.
//
// A command counts as approaching timeout when its SentAt is older than
// (sentTimeout - 5 minutes); commands already past the limit but still in the
// 'sent' state are therefore counted too. An error is returned if either
// command lookup fails.
func (ts *TimeoutService) GetTimeoutStatus() (map[string]interface{}, error) {
	timedOutCommands, err := ts.commandQueries.GetCommandsByStatus(models.CommandStatusTimedOut)
	if err != nil {
		return nil, fmt.Errorf("failed to get timed out commands: %w", err)
	}

	activeCommands, err := ts.commandQueries.GetCommandsByStatus(models.CommandStatusSent)
	if err != nil {
		return nil, fmt.Errorf("failed to get active commands: %w", err)
	}

	// Commands sent before this instant are within 5 minutes of timing out.
	warnAge := ts.sentTimeout - 5*time.Minute
	cutoff := time.Now().Add(-warnAge)

	approachingTimeout := 0
	for i := range activeCommands {
		sentAt := activeCommands[i].SentAt
		if sentAt == nil || !sentAt.Before(cutoff) {
			continue
		}
		approachingTimeout++
	}

	status := make(map[string]interface{}, 6)
	status["total_timed_out"] = len(timedOutCommands)
	status["total_active"] = len(activeCommands)
	status["approaching_timeout"] = approachingTimeout
	status["sent_timeout_duration"] = ts.sentTimeout.String()
	status["pending_timeout_duration"] = ts.pendingTimeout.String()
	status["last_check"] = time.Now()
	return status, nil
}
// SetTimeoutDuration replaces the timeout applied to commands that have
// already been sent to agents and logs the new value.
// TODO: This should be deprecated in favor of SetSentTimeout and SetPendingTimeout
func (ts *TimeoutService) SetTimeoutDuration(duration time.Duration) {
	ts.sentTimeout = duration
	log.Printf("Sent timeout duration updated to %v", ts.sentTimeout)
}
// SetSentTimeout replaces the timeout applied to commands that have already
// been sent to agents and logs the new value.
func (ts *TimeoutService) SetSentTimeout(duration time.Duration) {
	ts.sentTimeout = duration
	log.Printf("Sent timeout duration updated to %v", ts.sentTimeout)
}
// SetPendingTimeout replaces the timeout applied to commands still waiting in
// the pending state and logs the new value.
func (ts *TimeoutService) SetPendingTimeout(duration time.Duration) {
	ts.pendingTimeout = duration
	log.Printf("Pending timeout duration updated to %v", ts.pendingTimeout)
}