fix: add retry on approval error from desync (#449)

This commit is contained in:
Charles Packer
2026-01-02 14:52:41 -08:00
committed by GitHub
parent aba42731bf
commit 321519a1b7
5 changed files with 187 additions and 4 deletions

View File

@@ -0,0 +1,50 @@
import type { MessageCreate } from "@letta-ai/letta-client/resources/agents/agents";
import { getClient } from "./client";
import { APPROVAL_RECOVERY_PROMPT } from "./promptAssets";
/**
 * Lowercase substring searched for in an error detail to recognise an
 * approval-state desync. It must stay lowercase: the detail text is lowercased
 * before the substring search is performed.
 */
const APPROVAL_RECOVERY_DETAIL_FRAGMENT = "no tool call is currently awaiting approval";
/** Optional descriptive fields an error payload may carry. */
type RunErrorFields = {
  error_type?: string;
  message?: string;
  detail?: string;
};

/**
 * Shapes looked for under a run's `metadata.error`: the descriptive fields
 * either at the top level or one level down under `error`. The value may also
 * be absent entirely (`null` / `undefined`).
 */
type RunErrorMetadata =
  | (RunErrorFields & { error?: RunErrorFields })
  | null
  | undefined;
/**
 * Reports whether an error detail indicates an approval-state desync.
 *
 * @param detail - Candidate error text; any non-string value yields `false`.
 * @returns `true` when the lowercased text contains
 *   `APPROVAL_RECOVERY_DETAIL_FRAGMENT`.
 */
export function isApprovalStateDesyncError(detail: unknown): boolean {
  // Lowercase first so the comparison is case-insensitive.
  const normalized = typeof detail === "string" ? detail.toLowerCase() : null;
  return (
    normalized !== null &&
    normalized.includes(APPROVAL_RECOVERY_DETAIL_FRAGMENT)
  );
}
/**
 * Looks up a run and extracts a human-readable error description from its
 * metadata.
 *
 * Candidates are checked in priority order: `detail`, `message`, then the same
 * two fields nested under `error`. The first candidate that is a non-empty
 * string wins.
 *
 * This is a best-effort lookup: a missing run id, a failed retrieval, or
 * metadata without a usable string all resolve to `null` rather than throwing.
 *
 * @param runId - Identifier of the run to inspect; falsy values short-circuit.
 * @returns The first usable error string, or `null` when none is available.
 */
export async function fetchRunErrorDetail(
  runId: string | null | undefined,
): Promise<string | null> {
  if (!runId) return null;
  try {
    const client = await getClient();
    const run = await client.runs.retrieve(runId);
    // The cast only documents the shapes we look for; it is not checked at
    // runtime, so each candidate is validated below before being returned.
    const metaError = run.metadata?.error as RunErrorMetadata;
    const candidates: unknown[] = [
      metaError?.detail,
      metaError?.message,
      metaError?.error?.detail,
      metaError?.error?.message,
    ];
    for (const candidate of candidates) {
      // Skip non-strings (keeps the declared return type honest) and empty
      // strings (they carry no information and would shadow later fields).
      if (typeof candidate === "string" && candidate.length > 0) {
        return candidate;
      }
    }
    return null;
  } catch {
    // Deliberate best-effort: callers treat "no detail" and "lookup failed"
    // the same way.
    return null;
  }
}
/**
 * Creates the message sent in place of an approval payload during desync
 * recovery.
 *
 * @returns A user-role message whose only content part is the text of
 *   `APPROVAL_RECOVERY_PROMPT`.
 */
export function buildApprovalRecoveryMessage(): MessageCreate {
  const recoveryMessage: MessageCreate = {
    type: "message",
    role: "user",
    content: [
      {
        type: "text",
        text: APPROVAL_RECOVERY_PROMPT,
      },
    ],
  };
  return recoveryMessage;
}

View File

@@ -1,5 +1,6 @@
// Additional system prompts for /system command
import approvalRecoveryAlert from "./prompts/approval_recovery_alert.txt";
import anthropicPrompt from "./prompts/claude.md";
import codexPrompt from "./prompts/codex.md";
import geminiPrompt from "./prompts/gemini.md";
@@ -28,6 +29,7 @@ export const SKILL_UNLOAD_REMINDER = skillUnloadReminder;
export const SKILL_CREATOR_PROMPT = skillCreatorModePrompt;
export const REMEMBER_PROMPT = rememberPrompt;
export const MEMORY_CHECK_REMINDER = memoryCheckReminder;
export const APPROVAL_RECOVERY_PROMPT = approvalRecoveryAlert;
export const MEMORY_PROMPTS: Record<string, string> = {
"persona.mdx": personaPrompt,

View File

@@ -0,0 +1 @@
<system-alert>Automated keep-alive ping. Ignore this message and continue from where you stopped.</system-alert>

View File

@@ -18,6 +18,11 @@ import {
type ApprovalResult,
executeAutoAllowedTools,
} from "../agent/approval-execution";
import {
buildApprovalRecoveryMessage,
fetchRunErrorDetail,
isApprovalStateDesyncError,
} from "../agent/approval-recovery";
import { prefetchAvailableModelHandles } from "../agent/available-models";
import { getResumeData } from "../agent/check-approval";
import { getClient } from "../agent/client";
@@ -1027,7 +1032,8 @@ export default function App({
initialInput: Array<MessageCreate | ApprovalCreate>,
options?: { allowReentry?: boolean },
): Promise<void> => {
const currentInput = initialInput;
// Copy so we can safely mutate for retry recovery flows
const currentInput = [...initialInput];
const allowReentry = options?.allowReentry ?? false;
// Guard against concurrent processConversation calls
@@ -1665,6 +1671,58 @@ export default function App({
}
// Unexpected stop reason (error, llm_api_error, etc.)
// Check for approval desync errors even if stop_reason isn't llm_api_error.
const isApprovalPayload =
currentInput.length === 1 && currentInput[0]?.type === "approval";
const approvalDesyncDetected = async () => {
// 1) Check run metadata
const detailFromRun = await fetchRunErrorDetail(lastRunId);
if (isApprovalStateDesyncError(detailFromRun)) return true;
// 2) Check the most recent streamed error line in this turn
for (let i = buffersRef.current.order.length - 1; i >= 0; i -= 1) {
const id = buffersRef.current.order[i];
if (!id) continue;
const entry = buffersRef.current.byId.get(id);
if (entry?.kind === "error") {
return isApprovalStateDesyncError(entry.text);
}
}
return false;
};
if (isApprovalPayload && (await approvalDesyncDetected())) {
// Limit how many times we try this recovery to avoid loops
if (llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES) {
llmApiErrorRetriesRef.current += 1;
const statusId = uid("status");
buffersRef.current.byId.set(statusId, {
kind: "status",
id: statusId,
lines: [
"Approval state desynced; resending keep-alive recovery prompt...",
],
});
buffersRef.current.order.push(statusId);
refreshDerived();
currentInput.splice(
0,
currentInput.length,
buildApprovalRecoveryMessage(),
);
// Remove the transient status before retrying
buffersRef.current.byId.delete(statusId);
buffersRef.current.order = buffersRef.current.order.filter(
(id) => id !== statusId,
);
refreshDerived();
continue;
}
}
// Check if this is a retriable error (transient LLM API error)
const retriable = await isRetriableError(
stopReasonToHandle,
@@ -1681,10 +1739,13 @@ export default function App({
// Show subtle grey status message
const statusId = uid("status");
const statusLines = [
"Unexpected downstream LLM API error, retrying...",
];
buffersRef.current.byId.set(statusId, {
kind: "status",
id: statusId,
lines: ["Unexpected downstream LLM API error, retrying..."],
lines: statusLines,
});
buffersRef.current.order.push(statusId);
refreshDerived();

View File

@@ -10,6 +10,11 @@ import type {
} from "@letta-ai/letta-client/resources/agents/messages";
import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
import type { ApprovalResult } from "./agent/approval-execution";
import {
buildApprovalRecoveryMessage,
fetchRunErrorDetail,
isApprovalStateDesyncError,
} from "./agent/approval-recovery";
import { getClient } from "./agent/client";
import { initializeLoadedSkillsFlag, setAgentContext } from "./agent/context";
import { createAgent } from "./agent/create";
@@ -1061,6 +1066,11 @@ export async function handleHeadlessCommand(
// Case 3: Transient LLM API error - retry with exponential backoff up to a limit
if (stopReason === "llm_api_error") {
const shouldUseApprovalRecovery =
currentInput.length === 1 &&
currentInput[0]?.type === "approval" &&
isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId));
if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
const attempt = llmApiErrorRetries + 1;
const baseDelayMs = 1000;
@@ -1082,17 +1092,76 @@ export async function handleHeadlessCommand(
console.log(JSON.stringify(retryMsg));
} else {
const delaySeconds = Math.round(delayMs / 1000);
const recoveryNote = shouldUseApprovalRecovery
? " (approval state desynced - sending keep-going prompt)"
: "";
console.error(
`LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
`LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`,
);
}
// Exponential backoff before retrying the same input
await new Promise((resolve) => setTimeout(resolve, delayMs));
if (shouldUseApprovalRecovery) {
currentInput = [buildApprovalRecoveryMessage()];
}
continue;
}
}
// Fallback: if we were sending only approvals and hit an internal error that
// says there is no pending approval, resend using the keep-alive recovery prompt.
const isApprovalPayload =
currentInput.length === 1 && currentInput[0]?.type === "approval";
const approvalDesynced =
isApprovalPayload &&
(isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId)) ||
(() => {
const lines = toLines(buffers);
for (let i = lines.length - 1; i >= 0; i -= 1) {
const line = lines[i];
if (!line) continue;
if (
line.kind === "error" &&
"text" in line &&
typeof line.text === "string"
) {
return isApprovalStateDesyncError(line.text ?? null);
}
}
return false;
})());
if (approvalDesynced && llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
llmApiErrorRetries += 1;
const retryReason = stopReason ?? "error";
if (outputFormat === "stream-json") {
const retryMsg: RetryMessage = {
type: "retry",
reason: retryReason,
attempt: llmApiErrorRetries,
max_attempts: LLM_API_ERROR_MAX_RETRIES,
delay_ms: 0,
run_id: lastRunId ?? undefined,
session_id: sessionId,
uuid: `retry-${lastRunId || crypto.randomUUID()}`,
};
console.log(JSON.stringify(retryMsg));
} else {
console.error(
"Approval state desynced; resending keep-alive recovery prompt...",
);
}
// Small pause to avoid rapid-fire retries
await new Promise((resolve) => setTimeout(resolve, 250));
currentInput = [buildApprovalRecoveryMessage()];
continue;
}
// Unexpected stop reason (error, llm_api_error, etc.)
// Before failing, check run metadata to see if this is a retriable llm_api_error
// Fallback check: in case stop_reason is "error" but metadata indicates LLM error
@@ -1415,7 +1484,7 @@ async function runBidirectionalMode(
// Helper to get next line (from queue or wait)
async function getNextLine(): Promise<string | null> {
if (lineQueue.length > 0) {
return lineQueue.shift()!;
return lineQueue.shift() ?? null;
}
return new Promise<string | null>((resolve) => {
lineResolver = resolve;