fix: add retry on approval error from desync (#449)
This commit is contained in:
50
src/agent/approval-recovery.ts
Normal file
50
src/agent/approval-recovery.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
import type { MessageCreate } from "@letta-ai/letta-client/resources/agents/agents";
|
||||
import { getClient } from "./client";
|
||||
import { APPROVAL_RECOVERY_PROMPT } from "./promptAssets";
|
||||
|
||||
/**
 * Substring searched for in error detail text to recognise an approval-state
 * desync. Kept in lower case because candidates are lower-cased before matching.
 */
const APPROVAL_RECOVERY_DETAIL_FRAGMENT =
  "no tool call is currently awaiting approval";

/** Optional text fields an error record may carry. */
type RunErrorFields = {
  error_type?: string;
  message?: string;
  detail?: string;
};

/**
 * Expected shape of `run.metadata.error`: the error fields themselves, plus an
 * optional nested `error` record with the same fields. May be absent entirely.
 */
type RunErrorMetadata =
  | (RunErrorFields & { error?: RunErrorFields })
  | undefined
  | null;

/**
 * Reports whether an error detail contains the approval-desync marker text.
 *
 * @param detail - Any value; everything other than a string yields `false`.
 * @returns `true` only when `detail` is a string that contains
 *   `APPROVAL_RECOVERY_DETAIL_FRAGMENT`, compared case-insensitively.
 */
export function isApprovalStateDesyncError(detail: unknown): boolean {
  return (
    typeof detail === "string" &&
    detail.toLowerCase().includes(APPROVAL_RECOVERY_DETAIL_FRAGMENT)
  );
}
|
||||
|
||||
/**
 * Best-effort lookup of the error text recorded on a run.
 *
 * Retrieves the run through the shared client and inspects
 * `run.metadata.error`, returning the first string found among, in priority
 * order: `detail`, `message`, `error.detail`, `error.message`.
 *
 * The metadata is untyped at runtime, so each candidate is narrowed with
 * `typeof` instead of trusting the `RunErrorMetadata` assertion. A non-string
 * value (for example a structured `detail` payload) is skipped so that it
 * cannot leak out of a `string | null` return type or hide a usable string
 * further down the list.
 *
 * @param runId - Id of the run to inspect. A missing or empty id returns
 *   `null` without issuing any request.
 * @returns The error text, or `null` when no string field is present or the
 *   lookup throws.
 */
export async function fetchRunErrorDetail(
  runId: string | null | undefined,
): Promise<string | null> {
  if (!runId) return null;

  try {
    const client = await getClient();
    const run = await client.runs.retrieve(runId);
    // The assertion only documents the shape we hope for; it proves nothing at runtime.
    const metaError = run.metadata?.error as RunErrorMetadata;

    const candidates: unknown[] = [
      metaError?.detail,
      metaError?.message,
      metaError?.error?.detail,
      metaError?.error?.message,
    ];

    for (const candidate of candidates) {
      if (typeof candidate === "string") return candidate;
    }
    return null;
  } catch {
    // Deliberately best-effort: a failed lookup is reported as "no detail available".
    return null;
  }
}
|
||||
|
||||
/**
 * Creates the message sent in place of an approval payload during
 * approval-desync recovery.
 *
 * @returns A new user-role message whose only content part is the text of
 *   `APPROVAL_RECOVERY_PROMPT`.
 */
export function buildApprovalRecoveryMessage(): MessageCreate {
  const recoveryMessage: MessageCreate = {
    type: "message",
    role: "user",
    content: [{ type: "text", text: APPROVAL_RECOVERY_PROMPT }],
  };
  return recoveryMessage;
}
|
||||
@@ -1,5 +1,6 @@
|
||||
// Additional system prompts for /system command
|
||||
|
||||
import approvalRecoveryAlert from "./prompts/approval_recovery_alert.txt";
|
||||
import anthropicPrompt from "./prompts/claude.md";
|
||||
import codexPrompt from "./prompts/codex.md";
|
||||
import geminiPrompt from "./prompts/gemini.md";
|
||||
@@ -28,6 +29,7 @@ export const SKILL_UNLOAD_REMINDER = skillUnloadReminder;
|
||||
export const SKILL_CREATOR_PROMPT = skillCreatorModePrompt;
|
||||
export const REMEMBER_PROMPT = rememberPrompt;
|
||||
export const MEMORY_CHECK_REMINDER = memoryCheckReminder;
|
||||
export const APPROVAL_RECOVERY_PROMPT = approvalRecoveryAlert;
|
||||
|
||||
export const MEMORY_PROMPTS: Record<string, string> = {
|
||||
"persona.mdx": personaPrompt,
|
||||
|
||||
1
src/agent/prompts/approval_recovery_alert.txt
Normal file
1
src/agent/prompts/approval_recovery_alert.txt
Normal file
@@ -0,0 +1 @@
|
||||
<system-alert>Automated keep-alive ping. Ignore this message and continue from where you stopped.</system-alert>
|
||||
@@ -18,6 +18,11 @@ import {
|
||||
type ApprovalResult,
|
||||
executeAutoAllowedTools,
|
||||
} from "../agent/approval-execution";
|
||||
import {
|
||||
buildApprovalRecoveryMessage,
|
||||
fetchRunErrorDetail,
|
||||
isApprovalStateDesyncError,
|
||||
} from "../agent/approval-recovery";
|
||||
import { prefetchAvailableModelHandles } from "../agent/available-models";
|
||||
import { getResumeData } from "../agent/check-approval";
|
||||
import { getClient } from "../agent/client";
|
||||
@@ -1027,7 +1032,8 @@ export default function App({
|
||||
initialInput: Array<MessageCreate | ApprovalCreate>,
|
||||
options?: { allowReentry?: boolean },
|
||||
): Promise<void> => {
|
||||
const currentInput = initialInput;
|
||||
// Copy so we can safely mutate for retry recovery flows
|
||||
const currentInput = [...initialInput];
|
||||
const allowReentry = options?.allowReentry ?? false;
|
||||
|
||||
// Guard against concurrent processConversation calls
|
||||
@@ -1665,6 +1671,58 @@ export default function App({
|
||||
}
|
||||
|
||||
// Unexpected stop reason (error, llm_api_error, etc.)
|
||||
// Check for approval desync errors even if stop_reason isn't llm_api_error.
|
||||
const isApprovalPayload =
|
||||
currentInput.length === 1 && currentInput[0]?.type === "approval";
|
||||
|
||||
const approvalDesyncDetected = async () => {
|
||||
// 1) Check run metadata
|
||||
const detailFromRun = await fetchRunErrorDetail(lastRunId);
|
||||
if (isApprovalStateDesyncError(detailFromRun)) return true;
|
||||
|
||||
// 2) Check the most recent streamed error line in this turn
|
||||
for (let i = buffersRef.current.order.length - 1; i >= 0; i -= 1) {
|
||||
const id = buffersRef.current.order[i];
|
||||
if (!id) continue;
|
||||
const entry = buffersRef.current.byId.get(id);
|
||||
if (entry?.kind === "error") {
|
||||
return isApprovalStateDesyncError(entry.text);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
if (isApprovalPayload && (await approvalDesyncDetected())) {
|
||||
// Limit how many times we try this recovery to avoid loops
|
||||
if (llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES) {
|
||||
llmApiErrorRetriesRef.current += 1;
|
||||
const statusId = uid("status");
|
||||
buffersRef.current.byId.set(statusId, {
|
||||
kind: "status",
|
||||
id: statusId,
|
||||
lines: [
|
||||
"Approval state desynced; resending keep-alive recovery prompt...",
|
||||
],
|
||||
});
|
||||
buffersRef.current.order.push(statusId);
|
||||
refreshDerived();
|
||||
|
||||
currentInput.splice(
|
||||
0,
|
||||
currentInput.length,
|
||||
buildApprovalRecoveryMessage(),
|
||||
);
|
||||
|
||||
// Remove the transient status before retrying
|
||||
buffersRef.current.byId.delete(statusId);
|
||||
buffersRef.current.order = buffersRef.current.order.filter(
|
||||
(id) => id !== statusId,
|
||||
);
|
||||
refreshDerived();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if this is a retriable error (transient LLM API error)
|
||||
const retriable = await isRetriableError(
|
||||
stopReasonToHandle,
|
||||
@@ -1681,10 +1739,13 @@ export default function App({
|
||||
|
||||
// Show subtle grey status message
|
||||
const statusId = uid("status");
|
||||
const statusLines = [
|
||||
"Unexpected downstream LLM API error, retrying...",
|
||||
];
|
||||
buffersRef.current.byId.set(statusId, {
|
||||
kind: "status",
|
||||
id: statusId,
|
||||
lines: ["Unexpected downstream LLM API error, retrying..."],
|
||||
lines: statusLines,
|
||||
});
|
||||
buffersRef.current.order.push(statusId);
|
||||
refreshDerived();
|
||||
|
||||
@@ -10,6 +10,11 @@ import type {
|
||||
} from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
|
||||
import type { ApprovalResult } from "./agent/approval-execution";
|
||||
import {
|
||||
buildApprovalRecoveryMessage,
|
||||
fetchRunErrorDetail,
|
||||
isApprovalStateDesyncError,
|
||||
} from "./agent/approval-recovery";
|
||||
import { getClient } from "./agent/client";
|
||||
import { initializeLoadedSkillsFlag, setAgentContext } from "./agent/context";
|
||||
import { createAgent } from "./agent/create";
|
||||
@@ -1061,6 +1066,11 @@ export async function handleHeadlessCommand(
|
||||
|
||||
// Case 3: Transient LLM API error - retry with exponential backoff up to a limit
|
||||
if (stopReason === "llm_api_error") {
|
||||
const shouldUseApprovalRecovery =
|
||||
currentInput.length === 1 &&
|
||||
currentInput[0]?.type === "approval" &&
|
||||
isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId));
|
||||
|
||||
if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
|
||||
const attempt = llmApiErrorRetries + 1;
|
||||
const baseDelayMs = 1000;
|
||||
@@ -1082,17 +1092,76 @@ export async function handleHeadlessCommand(
|
||||
console.log(JSON.stringify(retryMsg));
|
||||
} else {
|
||||
const delaySeconds = Math.round(delayMs / 1000);
|
||||
const recoveryNote = shouldUseApprovalRecovery
|
||||
? " (approval state desynced - sending keep-going prompt)"
|
||||
: "";
|
||||
console.error(
|
||||
`LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
|
||||
`LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`,
|
||||
);
|
||||
}
|
||||
|
||||
// Exponential backoff before retrying (same input, unless approval recovery replaces it below)
|
||||
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
||||
|
||||
if (shouldUseApprovalRecovery) {
|
||||
currentInput = [buildApprovalRecoveryMessage()];
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: if we were sending only approvals and hit an internal error that
|
||||
// says there is no pending approval, resend using the keep-alive recovery prompt.
|
||||
const isApprovalPayload =
|
||||
currentInput.length === 1 && currentInput[0]?.type === "approval";
|
||||
const approvalDesynced =
|
||||
isApprovalPayload &&
|
||||
(isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId)) ||
|
||||
(() => {
|
||||
const lines = toLines(buffers);
|
||||
for (let i = lines.length - 1; i >= 0; i -= 1) {
|
||||
const line = lines[i];
|
||||
if (!line) continue;
|
||||
if (
|
||||
line.kind === "error" &&
|
||||
"text" in line &&
|
||||
typeof line.text === "string"
|
||||
) {
|
||||
return isApprovalStateDesyncError(line.text ?? null);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
})());
|
||||
|
||||
if (approvalDesynced && llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
|
||||
llmApiErrorRetries += 1;
|
||||
|
||||
const retryReason = stopReason ?? "error";
|
||||
if (outputFormat === "stream-json") {
|
||||
const retryMsg: RetryMessage = {
|
||||
type: "retry",
|
||||
reason: retryReason,
|
||||
attempt: llmApiErrorRetries,
|
||||
max_attempts: LLM_API_ERROR_MAX_RETRIES,
|
||||
delay_ms: 0,
|
||||
run_id: lastRunId ?? undefined,
|
||||
session_id: sessionId,
|
||||
uuid: `retry-${lastRunId || crypto.randomUUID()}`,
|
||||
};
|
||||
console.log(JSON.stringify(retryMsg));
|
||||
} else {
|
||||
console.error(
|
||||
"Approval state desynced; resending keep-alive recovery prompt...",
|
||||
);
|
||||
}
|
||||
|
||||
// Small pause to avoid rapid-fire retries
|
||||
await new Promise((resolve) => setTimeout(resolve, 250));
|
||||
|
||||
currentInput = [buildApprovalRecoveryMessage()];
|
||||
continue;
|
||||
}
|
||||
|
||||
// Unexpected stop reason (error, llm_api_error, etc.)
|
||||
// Before failing, check run metadata to see if this is a retriable llm_api_error
|
||||
// Fallback check: in case stop_reason is "error" but metadata indicates LLM error
|
||||
@@ -1415,7 +1484,7 @@ async function runBidirectionalMode(
|
||||
// Helper to get next line (from queue or wait)
|
||||
async function getNextLine(): Promise<string | null> {
|
||||
if (lineQueue.length > 0) {
|
||||
return lineQueue.shift()!;
|
||||
return lineQueue.shift() ?? null;
|
||||
}
|
||||
return new Promise<string | null>((resolve) => {
|
||||
lineResolver = resolve;
|
||||
|
||||
Reference in New Issue
Block a user