fix: add retry on approval error from desync (#449)

2026-01-02 14:52:41 -08:00
parent aba42731bf
commit 321519a1b7
5 changed files with 187 additions and 4 deletions
--- a/src/agent/approval-recovery.ts
+++ b/src/agent/approval-recovery.ts
@@ -0,0 +1,50 @@
+import type { MessageCreate } from "@letta-ai/letta-client/resources/agents/agents";
+import { getClient } from "./client";
+import { APPROVAL_RECOVERY_PROMPT } from "./promptAssets";
+
+// Lowercase fragment of the error detail that marks an approval-state desync.
+// It is matched against lowercased text, so it must stay lowercase itself.
+const APPROVAL_RECOVERY_DETAIL_FRAGMENT =
+  "no tool call is currently awaiting approval";
+
+// Error fields read from run metadata; the same three optional fields may
+// appear at the top level and again one level down under `error`.
+type ErrorFields = { error_type?: string; message?: string; detail?: string };
+type RunErrorMetadata =
+  | (ErrorFields & { error?: ErrorFields })
+  | undefined
+  | null;
+// Case-insensitive match on the desync fragment; non-strings never match.
+export function isApprovalStateDesyncError(detail: unknown): boolean {
+  const lowered = typeof detail === "string" ? detail.toLowerCase() : null;
+  return lowered?.includes(APPROVAL_RECOVERY_DETAIL_FRAGMENT) ?? false;
+}
+
+/**
+ * Best-effort: resolves to the first non-empty string among metadata.error's
+ * detail, message, error.detail and error.message; otherwise resolves to null.
+ */
+export async function fetchRunErrorDetail(
+  runId: string | null | undefined,
+): Promise<string | null> {
+  if (!runId) return null;
+  try {
+    const client = await getClient();
+    const run = await client.runs.retrieve(runId);
+    const meta = run.metadata?.error as RunErrorMetadata;
+    const sub = meta?.error;
+    const fields = [meta?.detail, meta?.message, sub?.detail, sub?.message];
+    const found = fields.find((v) => typeof v === "string" && v !== "");
+    return found ?? null; // cast is unchecked, so only real text is returned
+  } catch {
+    return null; // lookup failures are treated the same as "no detail"
+  }
+}
+
+/**
+ * Builds the user-role text message that carries APPROVAL_RECOVERY_PROMPT.
+ */
+export function buildApprovalRecoveryMessage(): MessageCreate {
+  const text = APPROVAL_RECOVERY_PROMPT;
+  return { type: "message", role: "user", content: [{ type: "text", text }] };
+}
--- a/src/agent/promptAssets.ts
+++ b/src/agent/promptAssets.ts
@@ -1,5 +1,6 @@
 // Additional system prompts for /system command

+import approvalRecoveryAlert from "./prompts/approval_recovery_alert.txt";
 import anthropicPrompt from "./prompts/claude.md";
 import codexPrompt from "./prompts/codex.md";
 import geminiPrompt from "./prompts/gemini.md";
@@ -28,6 +29,7 @@ export const SKILL_UNLOAD_REMINDER = skillUnloadReminder;
 export const SKILL_CREATOR_PROMPT = skillCreatorModePrompt;
 export const REMEMBER_PROMPT = rememberPrompt;
 export const MEMORY_CHECK_REMINDER = memoryCheckReminder;
+export const APPROVAL_RECOVERY_PROMPT = approvalRecoveryAlert;

 export const MEMORY_PROMPTS: Record<string, string> = {
  "persona.mdx": personaPrompt,
--- a/src/agent/prompts/approval_recovery_alert.txt
+++ b/src/agent/prompts/approval_recovery_alert.txt
@@ -0,0 +1 @@
+<system-alert>Automated keep-alive ping. Ignore this message and continue from where you stopped.</system-alert>
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -18,6 +18,11 @@ import {
  type ApprovalResult,
  executeAutoAllowedTools,
 } from "../agent/approval-execution";
+import {
+  buildApprovalRecoveryMessage,
+  fetchRunErrorDetail,
+  isApprovalStateDesyncError,
+} from "../agent/approval-recovery";
 import { prefetchAvailableModelHandles } from "../agent/available-models";
 import { getResumeData } from "../agent/check-approval";
 import { getClient } from "../agent/client";
@@ -1027,7 +1032,8 @@ export default function App({
      initialInput: Array<MessageCreate | ApprovalCreate>,
      options?: { allowReentry?: boolean },
    ): Promise<void> => {
-      const currentInput = initialInput;
+      // Copy so we can safely mutate for retry recovery flows
+      const currentInput = [...initialInput];
      const allowReentry = options?.allowReentry ?? false;

      // Guard against concurrent processConversation calls
@@ -1665,6 +1671,58 @@ export default function App({
          }

          // Unexpected stop reason (error, llm_api_error, etc.)
+          // Check for approval desync errors even if stop_reason isn't llm_api_error.
+          const isApprovalPayload =
+            currentInput.length === 1 && currentInput[0]?.type === "approval";
+
+          const approvalDesyncDetected = async () => {
+            // Signal 1: the error text recorded on the run itself.
+            const runDetail = await fetchRunErrorDetail(lastRunId);
+            if (isApprovalStateDesyncError(runDetail)) return true;
+
+            // Signal 2: the newest buffered "error" entry (older ones ignored).
+            const { order, byId } = buffersRef.current;
+            for (const id of [...order].reverse()) {
+              if (!id) continue;
+              const entry = byId.get(id);
+              if (entry?.kind === "error") {
+                return isApprovalStateDesyncError(entry.text);
+              }
+            }
+            return false;
+          };
+
+          if (isApprovalPayload && (await approvalDesyncDetected())) {
+            // Cap recovery attempts (shared LLM API retry budget) to avoid loops
+            if (llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES) {
+              llmApiErrorRetriesRef.current += 1;
+              const statusId = uid("status");
+              buffersRef.current.byId.set(statusId, {
+                kind: "status",
+                id: statusId,
+                lines: [
+                  "Approval state desynced; resending keep-alive recovery prompt...",
+                ],
+              });
+              buffersRef.current.order.push(statusId);
+              refreshDerived();
+
+              currentInput.splice(
+                0,
+                currentInput.length,
+                buildApprovalRecoveryMessage(),
+              );
+
+              // Remove the transient status before retrying
+              buffersRef.current.byId.delete(statusId);
+              buffersRef.current.order = buffersRef.current.order.filter(
+                (id) => id !== statusId,
+              );
+              refreshDerived();
+              continue;
+            }
+          }
+
          // Check if this is a retriable error (transient LLM API error)
          const retriable = await isRetriableError(
            stopReasonToHandle,
@@ -1681,10 +1739,13 @@ export default function App({

            // Show subtle grey status message
            const statusId = uid("status");
+            const statusLines = [
+              "Unexpected downstream LLM API error, retrying...",
+            ];
            buffersRef.current.byId.set(statusId, {
              kind: "status",
              id: statusId,
-              lines: ["Unexpected downstream LLM API error, retrying..."],
+              lines: statusLines,
            });
            buffersRef.current.order.push(statusId);
            refreshDerived();
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -10,6 +10,11 @@ import type {
 } from "@letta-ai/letta-client/resources/agents/messages";
 import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
 import type { ApprovalResult } from "./agent/approval-execution";
+import {
+  buildApprovalRecoveryMessage,
+  fetchRunErrorDetail,
+  isApprovalStateDesyncError,
+} from "./agent/approval-recovery";
 import { getClient } from "./agent/client";
 import { initializeLoadedSkillsFlag, setAgentContext } from "./agent/context";
 import { createAgent } from "./agent/create";
@@ -1061,6 +1066,11 @@ export async function handleHeadlessCommand(

      // Case 3: Transient LLM API error - retry with exponential backoff up to a limit
      if (stopReason === "llm_api_error") {
+        const shouldUseApprovalRecovery =
+          currentInput.length === 1 &&
+          currentInput[0]?.type === "approval" &&
+          isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId));
+
        if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
          const attempt = llmApiErrorRetries + 1;
          const baseDelayMs = 1000;
@@ -1082,17 +1092,76 @@ export async function handleHeadlessCommand(
            console.log(JSON.stringify(retryMsg));
          } else {
            const delaySeconds = Math.round(delayMs / 1000);
+            const recoveryNote = shouldUseApprovalRecovery
+              ? " (approval state desynced - sending keep-going prompt)"
+              : "";
            console.error(
-              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
+              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`,
            );
          }

          // Exponential backoff before retrying the same input
          await new Promise((resolve) => setTimeout(resolve, delayMs));
+
+          if (shouldUseApprovalRecovery) {
+            currentInput = [buildApprovalRecoveryMessage()];
+          }
          continue;
        }
      }

+      // Fallback: if we were sending only approvals and hit an internal error that
+      // says there is no pending approval, resend using the keep-alive recovery prompt.
+      const isApprovalPayload =
+        currentInput.length === 1 && currentInput[0]?.type === "approval";
+      const approvalDesynced =
+        isApprovalPayload &&
+        (isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId)) ||
+          (() => {
+            const lines = toLines(buffers);
+            for (let i = lines.length - 1; i >= 0; i -= 1) {
+              const line = lines[i];
+              if (!line) continue;
+              if (
+                line.kind === "error" &&
+                "text" in line &&
+                typeof line.text === "string"
+              ) {
+                return isApprovalStateDesyncError(line.text ?? null);
+              }
+            }
+            return false;
+          })());
+
+      if (approvalDesynced && llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
+        llmApiErrorRetries += 1;
+
+        const retryReason = stopReason ?? "error";
+        if (outputFormat === "stream-json") {
+          const retryMsg: RetryMessage = {
+            type: "retry",
+            reason: retryReason,
+            attempt: llmApiErrorRetries,
+            max_attempts: LLM_API_ERROR_MAX_RETRIES,
+            delay_ms: 0,
+            run_id: lastRunId ?? undefined,
+            session_id: sessionId,
+            uuid: `retry-${lastRunId || crypto.randomUUID()}`,
+          };
+          console.log(JSON.stringify(retryMsg));
+        } else {
+          console.error(
+            "Approval state desynced; resending keep-alive recovery prompt...",
+          );
+        }
+
+        // Small pause to avoid rapid-fire retries
+        await new Promise((resolve) => setTimeout(resolve, 250));
+
+        currentInput = [buildApprovalRecoveryMessage()];
+        continue;
+      }
+
      // Unexpected stop reason (error, llm_api_error, etc.)
      // Before failing, check run metadata to see if this is a retriable llm_api_error
      // Fallback check: in case stop_reason is "error" but metadata indicates LLM error
@@ -1415,7 +1484,7 @@ async function runBidirectionalMode(
  // Helper to get next line (from queue or wait)
  async function getNextLine(): Promise<string | null> {
    if (lineQueue.length > 0) {
-      return lineQueue.shift()!;
+      return lineQueue.shift() ?? null;
    }
    return new Promise<string | null>((resolve) => {
      lineResolver = resolve;
				`@@ -0,0 +1 @@`
				`<system-alert>Automated keep-alive ping. Ignore this message and continue from where you stopped.</system-alert>`