fix: add retry on approval error from desync (#449)

2026-01-02 14:52:41 -08:00
parent aba42731bf
commit 321519a1b7
5 changed files with 187 additions and 4 deletions
--- a/src/agent/approval-recovery.ts
+++ b/src/agent/approval-recovery.ts
@@ -0,0 +1,50 @@
 import type { MessageCreate } from "@letta-ai/letta-client/resources/agents/agents";
 import { getClient } from "./client";
 import { APPROVAL_RECOVERY_PROMPT } from "./promptAssets";
 /**
  * Lower-case fragment searched for inside an error detail string; its presence
  * is what marks the error as an approval-state desync.
  */
 const APPROVAL_RECOVERY_DETAIL_FRAGMENT =
  "no tool call is currently awaiting approval";

 /** Error fields that may appear on a run's metadata (every field optional). */
 type RunErrorFields = {
  error_type?: string;
  message?: string;
  detail?: string;
 };

 /**
  * Shape probed on `run.metadata.error`: the error fields either directly on the
  * object or one level down under `error`. The whole value may be absent.
  */
 type RunErrorMetadata =
  | (RunErrorFields & { error?: RunErrorFields })
  | undefined
  | null;

 /**
  * Decide whether an error detail describes an approval-state desync.
  *
  * @param detail - Candidate error text; anything that is not a string is
  *   treated as "not a desync".
  * @returns `true` when the text contains the desync fragment, compared
  *   case-insensitively.
  */
 export function isApprovalStateDesyncError(detail: unknown): boolean {
  const normalized = typeof detail === "string" ? detail.toLowerCase() : null;
  return (
    normalized !== null &&
    normalized.includes(APPROVAL_RECOVERY_DETAIL_FRAGMENT)
  );
 }
 /**
  * Retrieve a run and pull a textual error detail out of its metadata.
  *
  * Candidates are probed in priority order: `error.detail`, `error.message`,
  * then the same two fields nested one level deeper under `error.error`. The
  * first candidate that is actually a string wins.
  *
  * @param runId - Identifier of the run to inspect; a missing or empty id
  *   short-circuits to `null` without any client call.
  * @returns The first string-valued candidate, or `null` when none exists or
  *   the lookup fails.
  */
 export async function fetchRunErrorDetail(
  runId: string | null | undefined,
 ): Promise<string | null> {
  if (!runId) return null;
  try {
    const client = await getClient();
    const run = await client.runs.retrieve(runId);
    // The cast only names the fields we probe; nothing validates the metadata
    // payload, so each candidate is narrowed at runtime below.
    const metaError = run.metadata?.error as RunErrorMetadata;
    const candidates: unknown[] = [
      metaError?.detail,
      metaError?.message,
      metaError?.error?.detail,
      metaError?.error?.message,
    ];
    for (const candidate of candidates) {
      // Skip null/undefined and any non-string payload so the declared
      // `string | null` return type holds at runtime.
      if (typeof candidate === "string") return candidate;
    }
    return null;
  } catch {
    // Best-effort lookup: any failure while fetching or reading the run is
    // reported as "no detail" instead of propagating to the caller.
    return null;
  }
 }
 /**
  * Build the user-role message whose single text part carries
  * `APPROVAL_RECOVERY_PROMPT`.
  *
  * @returns A fresh `MessageCreate` object on every call.
  */
 export function buildApprovalRecoveryMessage(): MessageCreate {
  const recoveryMessage: MessageCreate = {
    type: "message",
    role: "user",
    content: [{ type: "text", text: APPROVAL_RECOVERY_PROMPT }],
  };
  return recoveryMessage;
 }
--- a/src/agent/promptAssets.ts
+++ b/src/agent/promptAssets.ts
@@ -1,5 +1,6 @@
 // Additional system prompts for /system command
 import approvalRecoveryAlert from "./prompts/approval_recovery_alert.txt";
 import anthropicPrompt from "./prompts/claude.md";
 import codexPrompt from "./prompts/codex.md";
 import geminiPrompt from "./prompts/gemini.md";
@@ -28,6 +29,7 @@ export const SKILL_UNLOAD_REMINDER = skillUnloadReminder;
 export const SKILL_CREATOR_PROMPT = skillCreatorModePrompt;
 export const REMEMBER_PROMPT = rememberPrompt;
 export const MEMORY_CHECK_REMINDER = memoryCheckReminder;
 export const APPROVAL_RECOVERY_PROMPT = approvalRecoveryAlert;
 export const MEMORY_PROMPTS: Record<string, string> = {
  "persona.mdx": personaPrompt,
--- a/src/agent/prompts/approval_recovery_alert.txt
+++ b/src/agent/prompts/approval_recovery_alert.txt
@@ -0,0 +1 @@
 <system-alert>Automated keep-alive ping. Ignore this message and continue from where you stopped.</system-alert>
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -18,6 +18,11 @@ import {
  type ApprovalResult,
  executeAutoAllowedTools,
 } from "../agent/approval-execution";
 import {
  buildApprovalRecoveryMessage,
  fetchRunErrorDetail,
  isApprovalStateDesyncError,
 } from "../agent/approval-recovery";
 import { prefetchAvailableModelHandles } from "../agent/available-models";
 import { getResumeData } from "../agent/check-approval";
 import { getClient } from "../agent/client";
@@ -1027,7 +1032,8 @@ export default function App({
      initialInput: Array<MessageCreate | ApprovalCreate>,
      options?: { allowReentry?: boolean },
    ): Promise<void> => {
-      const currentInput = initialInput;
+      // Copy so we can safely mutate for retry recovery flows
      const currentInput = [...initialInput];
      const allowReentry = options?.allowReentry ?? false;
      // Guard against concurrent processConversation calls
@@ -1665,6 +1671,58 @@ export default function App({
          }
          // Unexpected stop reason (error, llm_api_error, etc.)
          // Check for approval desync errors even if stop_reason isn't llm_api_error.
          const isApprovalPayload =
            currentInput.length === 1 && currentInput[0]?.type === "approval";
          // Resolves to true when the failure looks like an approval-state desync,
          // judged first from the run's metadata and then from the buffered lines.
          const approvalDesyncDetected = async () => {
            // Primary signal: error detail recorded on the last run.
            const runDetail = await fetchRunErrorDetail(lastRunId);
            if (isApprovalStateDesyncError(runDetail)) {
              return true;
            }
            // Secondary signal: walk the buffer from the newest entry backwards
            // and let the first error entry found decide the outcome.
            const { order, byId } = buffersRef.current;
            for (const entryId of [...order].reverse()) {
              if (!entryId) continue;
              const candidate = byId.get(entryId);
              if (candidate?.kind === "error") {
                return isApprovalStateDesyncError(candidate.text);
              }
            }
            return false;
          };
          if (isApprovalPayload && (await approvalDesyncDetected())) {
            // Limit how many times we try this recovery to avoid loops
            if (llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES) {
              llmApiErrorRetriesRef.current += 1;
              const statusId = uid("status");
              buffersRef.current.byId.set(statusId, {
                kind: "status",
                id: statusId,
                lines: [
                  "Approval state desynced; resending keep-alive recovery prompt...",
                ],
              });
              buffersRef.current.order.push(statusId);
              refreshDerived();
              currentInput.splice(
                0,
                currentInput.length,
                buildApprovalRecoveryMessage(),
              );
              // Remove the transient status before retrying
              buffersRef.current.byId.delete(statusId);
              buffersRef.current.order = buffersRef.current.order.filter(
                (id) => id !== statusId,
              );
              refreshDerived();
              continue;
            }
          }
          // Check if this is a retriable error (transient LLM API error)
          const retriable = await isRetriableError(
            stopReasonToHandle,
@@ -1681,10 +1739,13 @@ export default function App({
            // Show subtle grey status message
            const statusId = uid("status");
            const statusLines = [
              "Unexpected downstream LLM API error, retrying...",
            ];
            buffersRef.current.byId.set(statusId, {
              kind: "status",
              id: statusId,
-              lines: ["Unexpected downstream LLM API error, retrying..."],
+              lines: statusLines,
            });
            buffersRef.current.order.push(statusId);
            refreshDerived();
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -10,6 +10,11 @@ import type {
 } from "@letta-ai/letta-client/resources/agents/messages";
 import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
 import type { ApprovalResult } from "./agent/approval-execution";
 import {
  buildApprovalRecoveryMessage,
  fetchRunErrorDetail,
  isApprovalStateDesyncError,
 } from "./agent/approval-recovery";
 import { getClient } from "./agent/client";
 import { initializeLoadedSkillsFlag, setAgentContext } from "./agent/context";
 import { createAgent } from "./agent/create";
@@ -1061,6 +1066,11 @@ export async function handleHeadlessCommand(
      // Case 3: Transient LLM API error - retry with exponential backoff up to a limit
      if (stopReason === "llm_api_error") {
        const shouldUseApprovalRecovery =
          currentInput.length === 1 &&
          currentInput[0]?.type === "approval" &&
          isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId));
        if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
          const attempt = llmApiErrorRetries + 1;
          const baseDelayMs = 1000;
@@ -1082,17 +1092,76 @@ export async function handleHeadlessCommand(
            console.log(JSON.stringify(retryMsg));
          } else {
            const delaySeconds = Math.round(delayMs / 1000);
            const recoveryNote = shouldUseApprovalRecovery
              ? " (approval state desynced - sending keep-going prompt)"
              : "";
            console.error(
-              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
+              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`,
            );
          }
          // Exponential backoff before retrying the same input
          await new Promise((resolve) => setTimeout(resolve, delayMs));
          if (shouldUseApprovalRecovery) {
            currentInput = [buildApprovalRecoveryMessage()];
          }
          continue;
        }
      }
      // Fallback: if we were sending only approvals and hit an internal error that
      // says there is no pending approval, resend using the keep-alive recovery prompt.
      const isApprovalPayload =
        currentInput.length === 1 && currentInput[0]?.type === "approval";
      const approvalDesynced =
        isApprovalPayload &&
        (isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId)) ||
          (() => {
            const lines = toLines(buffers);
            for (let i = lines.length - 1; i >= 0; i -= 1) {
              const line = lines[i];
              if (!line) continue;
              if (
                line.kind === "error" &&
                "text" in line &&
                typeof line.text === "string"
              ) {
                return isApprovalStateDesyncError(line.text ?? null);
              }
            }
            return false;
          })());
      if (approvalDesynced && llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
        llmApiErrorRetries += 1;
        const retryReason = stopReason ?? "error";
        if (outputFormat === "stream-json") {
          const retryMsg: RetryMessage = {
            type: "retry",
            reason: retryReason,
            attempt: llmApiErrorRetries,
            max_attempts: LLM_API_ERROR_MAX_RETRIES,
            delay_ms: 0,
            run_id: lastRunId ?? undefined,
            session_id: sessionId,
            uuid: `retry-${lastRunId || crypto.randomUUID()}`,
          };
          console.log(JSON.stringify(retryMsg));
        } else {
          console.error(
            "Approval state desynced; resending keep-alive recovery prompt...",
          );
        }
        // Small pause to avoid rapid-fire retries
        await new Promise((resolve) => setTimeout(resolve, 250));
        currentInput = [buildApprovalRecoveryMessage()];
        continue;
      }
      // Unexpected stop reason (error, llm_api_error, etc.)
      // Before failing, check run metadata to see if this is a retriable llm_api_error
      // Fallback check: in case stop_reason is "error" but metadata indicates LLM error
@@ -1415,7 +1484,7 @@ async function runBidirectionalMode(
  // Helper to get next line (from queue or wait)
  async function getNextLine(): Promise<string | null> {
    if (lineQueue.length > 0) {
-      return lineQueue.shift()!;
+      return lineQueue.shift() ?? null;
    }
    return new Promise<string | null>((resolve) => {
      lineResolver = resolve;
		`@@ -0,0 +1 @@`
							`<system-alert>Automated keep-alive ping. Ignore this message and continue from where you stopped.</system-alert>`