feat: clean up keep-alive fallback handling (#663)

2026-01-27 17:50:37 -08:00
parent c2fd63f5b5
commit 0df8e51dac
4 changed files with 96 additions and 321 deletions
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -9,10 +9,8 @@ import type { ApprovalCreate } from "@letta-ai/letta-client/resources/agents/mes
 import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
 import type { ApprovalResult } from "./agent/approval-execution";
 import {
-  buildApprovalRecoveryMessage,
  fetchRunErrorDetail,
  isApprovalPendingError,
-  isApprovalStateDesyncError,
  isConversationBusyError,
  isInvalidToolCallIdsError,
 } from "./agent/approval-recovery";
@@ -1385,26 +1383,11 @@ export async function handleHeadlessCommand(
        }
      }

-      // Detect approval desync once per turn
+      // Fetch run error detail for invalid tool call ID detection
      const detailFromRun = await fetchRunErrorDetail(lastRunId);
-      const approvalDesynced =
-        currentInput.length === 1 &&
-        currentInput[0]?.type === "approval" &&
-        (isApprovalStateDesyncError(detailFromRun) ||
-          isApprovalStateDesyncError(latestErrorText));
-
-      // Track last failure text for emitting on exit
-      const lastFailureText =
-        latestErrorText ||
-        detailFromRun ||
-        (lastRunId
-          ? `An error occurred during agent execution\n(run_id: ${lastRunId}, stop_reason: ${stopReason})`
-          : `An error occurred during agent execution\n(stop_reason: ${stopReason})`);

      // Case 3: Transient LLM API error - retry with exponential backoff up to a limit
      if (stopReason === "llm_api_error") {
-        const shouldUseApprovalRecovery = approvalDesynced;
-
        if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
          const attempt = llmApiErrorRetries + 1;
          const baseDelayMs = 1000;
@@ -1426,104 +1409,64 @@ export async function handleHeadlessCommand(
            console.log(JSON.stringify(retryMsg));
          } else {
            const delaySeconds = Math.round(delayMs / 1000);
-            const recoveryNote = shouldUseApprovalRecovery
-              ? " (approval state desynced - sending keep-going prompt)"
-              : "";
            console.error(
-              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`,
+              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
            );
          }

          // Exponential backoff before retrying the same input
          await new Promise((resolve) => setTimeout(resolve, delayMs));

-          if (shouldUseApprovalRecovery) {
-            currentInput = [buildApprovalRecoveryMessage()];
-          }
          continue;
        }
      }

-      // Fallback: if we were sending only approvals and hit an internal error that
-      // says there is no pending approval, resend using the keep-alive recovery prompt.
-      if (approvalDesynced) {
-        // "Invalid tool call IDs" means server HAS pending approvals but with different IDs.
-        // Fetch the actual pending approvals and process them before retrying.
-        if (
-          isInvalidToolCallIdsError(detailFromRun) ||
-          isInvalidToolCallIdsError(latestErrorText)
-        ) {
-          if (outputFormat === "stream-json") {
-            const recoveryMsg: RecoveryMessage = {
-              type: "recovery",
-              recovery_type: "invalid_tool_call_ids",
-              message:
-                "Tool call ID mismatch; fetching actual pending approvals and resyncing",
-              run_id: lastRunId ?? undefined,
-              session_id: sessionId,
-              uuid: `recovery-${lastRunId || crypto.randomUUID()}`,
-            };
-            console.log(JSON.stringify(recoveryMsg));
-          } else {
-            console.error(
-              "Tool call ID mismatch; fetching actual pending approvals...",
-            );
-          }
+      // "Invalid tool call IDs" means server HAS pending approvals but with different IDs.
+      // Fetch the actual pending approvals and process them before retrying.
+      const invalidIdsDetected =
+        isInvalidToolCallIdsError(detailFromRun) ||
+        isInvalidToolCallIdsError(latestErrorText);

-          try {
-            // Fetch and process actual pending approvals from server
-            await resolveAllPendingApprovals();
-            // After processing, continue to next iteration (fresh state)
-            continue;
-          } catch {
-            // If fetch fails, fall through to general desync recovery
-          }
-        }
-
-        if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
-          llmApiErrorRetries += 1;
-
-          const retryReason = stopReason ?? "error";
-          if (outputFormat === "stream-json") {
-            const retryMsg: RetryMessage = {
-              type: "retry",
-              reason: retryReason,
-              attempt: llmApiErrorRetries,
-              max_attempts: LLM_API_ERROR_MAX_RETRIES,
-              delay_ms: 0,
-              run_id: lastRunId ?? undefined,
-              session_id: sessionId,
-              uuid: `retry-${lastRunId || crypto.randomUUID()}`,
-            };
-            console.log(JSON.stringify(retryMsg));
-          } else {
-            console.error(
-              "Approval state desynced; resending keep-alive recovery prompt...",
-            );
-          }
-
-          // Small pause to avoid rapid-fire retries
-          await new Promise((resolve) => setTimeout(resolve, 250));
-
-          currentInput = [buildApprovalRecoveryMessage()];
-          continue;
-        }
-
-        // No retries left or non-retriable: emit error and exit
+      if (invalidIdsDetected) {
        if (outputFormat === "stream-json") {
-          const errorMsg: ErrorMessage = {
-            type: "error",
-            message: lastFailureText,
-            stop_reason: stopReason,
+          const recoveryMsg: RecoveryMessage = {
+            type: "recovery",
+            recovery_type: "invalid_tool_call_ids",
+            message:
+              "Tool call ID mismatch; fetching actual pending approvals and resyncing",
            run_id: lastRunId ?? undefined,
            session_id: sessionId,
-            uuid: `error-${lastRunId || crypto.randomUUID()}`,
+            uuid: `recovery-${lastRunId || crypto.randomUUID()}`,
          };
-          console.log(JSON.stringify(errorMsg));
+          console.log(JSON.stringify(recoveryMsg));
        } else {
-          console.error(lastFailureText);
+          console.error(
+            "Tool call ID mismatch; fetching actual pending approvals...",
+          );
+        }
+
+        try {
+          // Fetch and process actual pending approvals from server
+          await resolveAllPendingApprovals();
+          // After processing, continue to next iteration (fresh state)
+          continue;
+        } catch {
+          // If fetch fails, exit with error
+          if (outputFormat === "stream-json") {
+            const errorMsg: ErrorMessage = {
+              type: "error",
+              message: "Failed to fetch pending approvals for resync",
+              stop_reason: stopReason,
+              run_id: lastRunId ?? undefined,
+              session_id: sessionId,
+              uuid: `error-${lastRunId || crypto.randomUUID()}`,
+            };
+            console.log(JSON.stringify(errorMsg));
+          } else {
+            console.error("Failed to fetch pending approvals for resync");
+          }
+          process.exit(1);
        }
-        process.exit(1);
      }

      // Unexpected stop reason (error, llm_api_error, etc.)