fix: add retry on approval error from desync (#449)
This commit is contained in:
50
src/agent/approval-recovery.ts
Normal file
50
src/agent/approval-recovery.ts
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
import type { MessageCreate } from "@letta-ai/letta-client/resources/agents/agents";
|
||||||
|
import { getClient } from "./client";
|
||||||
|
import { APPROVAL_RECOVERY_PROMPT } from "./promptAssets";
|
||||||
|
|
||||||
|
/**
 * Lower-case substring looked for inside an error detail to recognise an
 * approval-state desync error.
 */
const APPROVAL_RECOVERY_DETAIL_FRAGMENT =
  "no tool call is currently awaiting approval";
|
||||||
|
|
||||||
|
/** Fields an error record may carry; every one of them is optional. */
type RunErrorFields = {
  error_type?: string;
  message?: string;
  detail?: string;
};

/**
 * Shape this module expects for the `error` entry of a run's metadata: a flat
 * error record that may wrap a nested `error` record with the same fields, or
 * `null` / `undefined` when no error entry is present.
 */
type RunErrorMetadata =
  | (RunErrorFields & { error?: RunErrorFields })
  | null
  | undefined;
|
||||||
|
|
||||||
|
/**
 * Reports whether an error detail is the approval-state desync message.
 *
 * @param detail - Candidate value; anything that is not a string yields `false`.
 * @returns `true` when the lower-cased text contains
 *   `APPROVAL_RECOVERY_DETAIL_FRAGMENT`, `false` otherwise.
 */
export function isApprovalStateDesyncError(detail: unknown): boolean {
  const normalized = typeof detail === "string" ? detail.toLowerCase() : null;
  return (
    normalized !== null &&
    normalized.includes(APPROVAL_RECOVERY_DETAIL_FRAGMENT)
  );
}
|
||||||
|
|
||||||
|
/**
 * Retrieves a run and extracts the error text stored under `metadata.error`.
 *
 * Candidates are tried in priority order: `detail`, `message`, then the nested
 * `error.detail` and `error.message`. The first candidate that is a non-blank
 * string wins, so an empty or non-string field no longer masks a usable value
 * further down the list.
 *
 * @param runId - Identifier of the run to inspect; a falsy id short-circuits
 *   to `null` without contacting the client.
 * @returns The first usable error text, or `null` when there is none or the
 *   lookup fails.
 */
export async function fetchRunErrorDetail(
  runId: string | null | undefined,
): Promise<string | null> {
  if (!runId) return null;

  try {
    const client = await getClient();
    const run = await client.runs.retrieve(runId);
    // The cast only documents the expected shape; the metadata is not
    // validated, so every candidate is re-checked at runtime below.
    const metaError = run.metadata?.error as RunErrorMetadata;

    const candidates: unknown[] = [
      metaError?.detail,
      metaError?.message,
      metaError?.error?.detail,
      metaError?.error?.message,
    ];

    for (const candidate of candidates) {
      if (typeof candidate === "string" && candidate.trim() !== "") {
        return candidate;
      }
    }
    return null;
  } catch {
    // Deliberately best-effort: a failed lookup is reported as "no detail"
    // rather than raised to the caller.
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Creates the user message that carries the approval-recovery prompt.
 *
 * @returns A `MessageCreate` with role `"user"` and a single text part whose
 *   text is `APPROVAL_RECOVERY_PROMPT`.
 */
export function buildApprovalRecoveryMessage(): MessageCreate {
  const promptPart = { type: "text" as const, text: APPROVAL_RECOVERY_PROMPT };
  const recoveryMessage: MessageCreate = {
    type: "message",
    role: "user",
    content: [promptPart],
  };
  return recoveryMessage;
}
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
// Additional system prompts for /system command
|
// Additional system prompts for /system command
|
||||||
|
|
||||||
|
import approvalRecoveryAlert from "./prompts/approval_recovery_alert.txt";
|
||||||
import anthropicPrompt from "./prompts/claude.md";
|
import anthropicPrompt from "./prompts/claude.md";
|
||||||
import codexPrompt from "./prompts/codex.md";
|
import codexPrompt from "./prompts/codex.md";
|
||||||
import geminiPrompt from "./prompts/gemini.md";
|
import geminiPrompt from "./prompts/gemini.md";
|
||||||
@@ -28,6 +29,7 @@ export const SKILL_UNLOAD_REMINDER = skillUnloadReminder;
|
|||||||
export const SKILL_CREATOR_PROMPT = skillCreatorModePrompt;
|
export const SKILL_CREATOR_PROMPT = skillCreatorModePrompt;
|
||||||
export const REMEMBER_PROMPT = rememberPrompt;
|
export const REMEMBER_PROMPT = rememberPrompt;
|
||||||
export const MEMORY_CHECK_REMINDER = memoryCheckReminder;
|
export const MEMORY_CHECK_REMINDER = memoryCheckReminder;
|
||||||
|
export const APPROVAL_RECOVERY_PROMPT = approvalRecoveryAlert;
|
||||||
|
|
||||||
export const MEMORY_PROMPTS: Record<string, string> = {
|
export const MEMORY_PROMPTS: Record<string, string> = {
|
||||||
"persona.mdx": personaPrompt,
|
"persona.mdx": personaPrompt,
|
||||||
|
|||||||
1
src/agent/prompts/approval_recovery_alert.txt
Normal file
1
src/agent/prompts/approval_recovery_alert.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<system-alert>Automated keep-alive ping. Ignore this message and continue from where you stopped.</system-alert>
|
||||||
@@ -18,6 +18,11 @@ import {
|
|||||||
type ApprovalResult,
|
type ApprovalResult,
|
||||||
executeAutoAllowedTools,
|
executeAutoAllowedTools,
|
||||||
} from "../agent/approval-execution";
|
} from "../agent/approval-execution";
|
||||||
|
import {
|
||||||
|
buildApprovalRecoveryMessage,
|
||||||
|
fetchRunErrorDetail,
|
||||||
|
isApprovalStateDesyncError,
|
||||||
|
} from "../agent/approval-recovery";
|
||||||
import { prefetchAvailableModelHandles } from "../agent/available-models";
|
import { prefetchAvailableModelHandles } from "../agent/available-models";
|
||||||
import { getResumeData } from "../agent/check-approval";
|
import { getResumeData } from "../agent/check-approval";
|
||||||
import { getClient } from "../agent/client";
|
import { getClient } from "../agent/client";
|
||||||
@@ -1027,7 +1032,8 @@ export default function App({
|
|||||||
initialInput: Array<MessageCreate | ApprovalCreate>,
|
initialInput: Array<MessageCreate | ApprovalCreate>,
|
||||||
options?: { allowReentry?: boolean },
|
options?: { allowReentry?: boolean },
|
||||||
): Promise<void> => {
|
): Promise<void> => {
|
||||||
const currentInput = initialInput;
|
// Copy so we can safely mutate for retry recovery flows
|
||||||
|
const currentInput = [...initialInput];
|
||||||
const allowReentry = options?.allowReentry ?? false;
|
const allowReentry = options?.allowReentry ?? false;
|
||||||
|
|
||||||
// Guard against concurrent processConversation calls
|
// Guard against concurrent processConversation calls
|
||||||
@@ -1665,6 +1671,58 @@ export default function App({
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Unexpected stop reason (error, llm_api_error, etc.)
|
// Unexpected stop reason (error, llm_api_error, etc.)
|
||||||
|
// Check for approval desync errors even if stop_reason isn't llm_api_error.
|
||||||
|
const isApprovalPayload =
|
||||||
|
currentInput.length === 1 && currentInput[0]?.type === "approval";
|
||||||
|
|
||||||
|
const approvalDesyncDetected = async () => {
|
||||||
|
// 1) Check run metadata
|
||||||
|
const detailFromRun = await fetchRunErrorDetail(lastRunId);
|
||||||
|
if (isApprovalStateDesyncError(detailFromRun)) return true;
|
||||||
|
|
||||||
|
// 2) Check the most recent streamed error line in this turn
|
||||||
|
for (let i = buffersRef.current.order.length - 1; i >= 0; i -= 1) {
|
||||||
|
const id = buffersRef.current.order[i];
|
||||||
|
if (!id) continue;
|
||||||
|
const entry = buffersRef.current.byId.get(id);
|
||||||
|
if (entry?.kind === "error") {
|
||||||
|
return isApprovalStateDesyncError(entry.text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (isApprovalPayload && (await approvalDesyncDetected())) {
|
||||||
|
// Limit how many times we try this recovery to avoid loops
|
||||||
|
if (llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES) {
|
||||||
|
llmApiErrorRetriesRef.current += 1;
|
||||||
|
const statusId = uid("status");
|
||||||
|
buffersRef.current.byId.set(statusId, {
|
||||||
|
kind: "status",
|
||||||
|
id: statusId,
|
||||||
|
lines: [
|
||||||
|
"Approval state desynced; resending keep-alive recovery prompt...",
|
||||||
|
],
|
||||||
|
});
|
||||||
|
buffersRef.current.order.push(statusId);
|
||||||
|
refreshDerived();
|
||||||
|
|
||||||
|
currentInput.splice(
|
||||||
|
0,
|
||||||
|
currentInput.length,
|
||||||
|
buildApprovalRecoveryMessage(),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Remove the transient status before retrying
|
||||||
|
buffersRef.current.byId.delete(statusId);
|
||||||
|
buffersRef.current.order = buffersRef.current.order.filter(
|
||||||
|
(id) => id !== statusId,
|
||||||
|
);
|
||||||
|
refreshDerived();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Check if this is a retriable error (transient LLM API error)
|
// Check if this is a retriable error (transient LLM API error)
|
||||||
const retriable = await isRetriableError(
|
const retriable = await isRetriableError(
|
||||||
stopReasonToHandle,
|
stopReasonToHandle,
|
||||||
@@ -1681,10 +1739,13 @@ export default function App({
|
|||||||
|
|
||||||
// Show subtle grey status message
|
// Show subtle grey status message
|
||||||
const statusId = uid("status");
|
const statusId = uid("status");
|
||||||
|
const statusLines = [
|
||||||
|
"Unexpected downstream LLM API error, retrying...",
|
||||||
|
];
|
||||||
buffersRef.current.byId.set(statusId, {
|
buffersRef.current.byId.set(statusId, {
|
||||||
kind: "status",
|
kind: "status",
|
||||||
id: statusId,
|
id: statusId,
|
||||||
lines: ["Unexpected downstream LLM API error, retrying..."],
|
lines: statusLines,
|
||||||
});
|
});
|
||||||
buffersRef.current.order.push(statusId);
|
buffersRef.current.order.push(statusId);
|
||||||
refreshDerived();
|
refreshDerived();
|
||||||
|
|||||||
@@ -10,6 +10,11 @@ import type {
|
|||||||
} from "@letta-ai/letta-client/resources/agents/messages";
|
} from "@letta-ai/letta-client/resources/agents/messages";
|
||||||
import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
|
import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
|
||||||
import type { ApprovalResult } from "./agent/approval-execution";
|
import type { ApprovalResult } from "./agent/approval-execution";
|
||||||
|
import {
|
||||||
|
buildApprovalRecoveryMessage,
|
||||||
|
fetchRunErrorDetail,
|
||||||
|
isApprovalStateDesyncError,
|
||||||
|
} from "./agent/approval-recovery";
|
||||||
import { getClient } from "./agent/client";
|
import { getClient } from "./agent/client";
|
||||||
import { initializeLoadedSkillsFlag, setAgentContext } from "./agent/context";
|
import { initializeLoadedSkillsFlag, setAgentContext } from "./agent/context";
|
||||||
import { createAgent } from "./agent/create";
|
import { createAgent } from "./agent/create";
|
||||||
@@ -1061,6 +1066,11 @@ export async function handleHeadlessCommand(
|
|||||||
|
|
||||||
// Case 3: Transient LLM API error - retry with exponential backoff up to a limit
|
// Case 3: Transient LLM API error - retry with exponential backoff up to a limit
|
||||||
if (stopReason === "llm_api_error") {
|
if (stopReason === "llm_api_error") {
|
||||||
|
const shouldUseApprovalRecovery =
|
||||||
|
currentInput.length === 1 &&
|
||||||
|
currentInput[0]?.type === "approval" &&
|
||||||
|
isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId));
|
||||||
|
|
||||||
if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
|
if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
|
||||||
const attempt = llmApiErrorRetries + 1;
|
const attempt = llmApiErrorRetries + 1;
|
||||||
const baseDelayMs = 1000;
|
const baseDelayMs = 1000;
|
||||||
@@ -1082,17 +1092,76 @@ export async function handleHeadlessCommand(
|
|||||||
console.log(JSON.stringify(retryMsg));
|
console.log(JSON.stringify(retryMsg));
|
||||||
} else {
|
} else {
|
||||||
const delaySeconds = Math.round(delayMs / 1000);
|
const delaySeconds = Math.round(delayMs / 1000);
|
||||||
|
const recoveryNote = shouldUseApprovalRecovery
|
||||||
|
? " (approval state desynced - sending keep-going prompt)"
|
||||||
|
: "";
|
||||||
console.error(
|
console.error(
|
||||||
`LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
|
`LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Exponential backoff before retrying the same input
|
// Exponential backoff before retrying the same input
|
||||||
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
||||||
|
|
||||||
|
if (shouldUseApprovalRecovery) {
|
||||||
|
currentInput = [buildApprovalRecoveryMessage()];
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fallback: if we were sending only approvals and hit an internal error that
|
||||||
|
// says there is no pending approval, resend using the keep-alive recovery prompt.
|
||||||
|
const isApprovalPayload =
|
||||||
|
currentInput.length === 1 && currentInput[0]?.type === "approval";
|
||||||
|
const approvalDesynced =
|
||||||
|
isApprovalPayload &&
|
||||||
|
(isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId)) ||
|
||||||
|
(() => {
|
||||||
|
const lines = toLines(buffers);
|
||||||
|
for (let i = lines.length - 1; i >= 0; i -= 1) {
|
||||||
|
const line = lines[i];
|
||||||
|
if (!line) continue;
|
||||||
|
if (
|
||||||
|
line.kind === "error" &&
|
||||||
|
"text" in line &&
|
||||||
|
typeof line.text === "string"
|
||||||
|
) {
|
||||||
|
return isApprovalStateDesyncError(line.text ?? null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
})());
|
||||||
|
|
||||||
|
if (approvalDesynced && llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
|
||||||
|
llmApiErrorRetries += 1;
|
||||||
|
|
||||||
|
const retryReason = stopReason ?? "error";
|
||||||
|
if (outputFormat === "stream-json") {
|
||||||
|
const retryMsg: RetryMessage = {
|
||||||
|
type: "retry",
|
||||||
|
reason: retryReason,
|
||||||
|
attempt: llmApiErrorRetries,
|
||||||
|
max_attempts: LLM_API_ERROR_MAX_RETRIES,
|
||||||
|
delay_ms: 0,
|
||||||
|
run_id: lastRunId ?? undefined,
|
||||||
|
session_id: sessionId,
|
||||||
|
uuid: `retry-${lastRunId || crypto.randomUUID()}`,
|
||||||
|
};
|
||||||
|
console.log(JSON.stringify(retryMsg));
|
||||||
|
} else {
|
||||||
|
console.error(
|
||||||
|
"Approval state desynced; resending keep-alive recovery prompt...",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Small pause to avoid rapid-fire retries
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 250));
|
||||||
|
|
||||||
|
currentInput = [buildApprovalRecoveryMessage()];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Unexpected stop reason (error, llm_api_error, etc.)
|
// Unexpected stop reason (error, llm_api_error, etc.)
|
||||||
// Before failing, check run metadata to see if this is a retriable llm_api_error
|
// Before failing, check run metadata to see if this is a retriable llm_api_error
|
||||||
// Fallback check: in case stop_reason is "error" but metadata indicates LLM error
|
// Fallback check: in case stop_reason is "error" but metadata indicates LLM error
|
||||||
@@ -1415,7 +1484,7 @@ async function runBidirectionalMode(
|
|||||||
// Helper to get next line (from queue or wait)
|
// Helper to get next line (from queue or wait)
|
||||||
async function getNextLine(): Promise<string | null> {
|
async function getNextLine(): Promise<string | null> {
|
||||||
if (lineQueue.length > 0) {
|
if (lineQueue.length > 0) {
|
||||||
return lineQueue.shift()!;
|
return lineQueue.shift() ?? null;
|
||||||
}
|
}
|
||||||
return new Promise<string | null>((resolve) => {
|
return new Promise<string | null>((resolve) => {
|
||||||
lineResolver = resolve;
|
lineResolver = resolve;
|
||||||
|
|||||||
Reference in New Issue
Block a user