diff --git a/src/agent/approval-recovery.ts b/src/agent/approval-recovery.ts new file mode 100644 index 0000000..caae481 --- /dev/null +++ b/src/agent/approval-recovery.ts @@ -0,0 +1,50 @@ +import type { MessageCreate } from "@letta-ai/letta-client/resources/agents/agents"; +import { getClient } from "./client"; +import { APPROVAL_RECOVERY_PROMPT } from "./promptAssets"; + +const APPROVAL_RECOVERY_DETAIL_FRAGMENT = + "no tool call is currently awaiting approval"; + +type RunErrorMetadata = + | { + error_type?: string; + message?: string; + detail?: string; + error?: { error_type?: string; message?: string; detail?: string }; + } + | undefined + | null; + +export function isApprovalStateDesyncError(detail: unknown): boolean { + if (typeof detail !== "string") return false; + return detail.toLowerCase().includes(APPROVAL_RECOVERY_DETAIL_FRAGMENT); +} + +export async function fetchRunErrorDetail( + runId: string | null | undefined, +): Promise<string | null> { + if (!runId) return null; + try { + const client = await getClient(); + const run = await client.runs.retrieve(runId); + const metaError = run.metadata?.error as RunErrorMetadata; + + return ( + metaError?.detail ?? + metaError?.message ?? + metaError?.error?.detail ?? + metaError?.error?.message ?? 
+ null + ); + } catch { + return null; + } +} + +export function buildApprovalRecoveryMessage(): MessageCreate { + return { + type: "message", + role: "user", + content: [{ type: "text", text: APPROVAL_RECOVERY_PROMPT }], + }; +} diff --git a/src/agent/promptAssets.ts b/src/agent/promptAssets.ts index 6c58b98..5a8056e 100644 --- a/src/agent/promptAssets.ts +++ b/src/agent/promptAssets.ts @@ -1,5 +1,6 @@ // Additional system prompts for /system command +import approvalRecoveryAlert from "./prompts/approval_recovery_alert.txt"; import anthropicPrompt from "./prompts/claude.md"; import codexPrompt from "./prompts/codex.md"; import geminiPrompt from "./prompts/gemini.md"; @@ -28,6 +29,7 @@ export const SKILL_UNLOAD_REMINDER = skillUnloadReminder; export const SKILL_CREATOR_PROMPT = skillCreatorModePrompt; export const REMEMBER_PROMPT = rememberPrompt; export const MEMORY_CHECK_REMINDER = memoryCheckReminder; +export const APPROVAL_RECOVERY_PROMPT = approvalRecoveryAlert; export const MEMORY_PROMPTS: Record<string, string> = { "persona.mdx": personaPrompt, diff --git a/src/agent/prompts/approval_recovery_alert.txt b/src/agent/prompts/approval_recovery_alert.txt new file mode 100644 index 0000000..a62df6d --- /dev/null +++ b/src/agent/prompts/approval_recovery_alert.txt @@ -0,0 +1 @@ +Automated keep-alive ping. Ignore this message and continue from where you stopped. 
diff --git a/src/cli/App.tsx b/src/cli/App.tsx index fac6255..0de5dc2 100644 --- a/src/cli/App.tsx +++ b/src/cli/App.tsx @@ -18,6 +18,11 @@ import { type ApprovalResult, executeAutoAllowedTools, } from "../agent/approval-execution"; +import { + buildApprovalRecoveryMessage, + fetchRunErrorDetail, + isApprovalStateDesyncError, +} from "../agent/approval-recovery"; import { prefetchAvailableModelHandles } from "../agent/available-models"; import { getResumeData } from "../agent/check-approval"; import { getClient } from "../agent/client"; @@ -1027,7 +1032,8 @@ export default function App({ initialInput: Array<MessageCreate>, options?: { allowReentry?: boolean }, ): Promise<void> => { - const currentInput = initialInput; + // Copy so we can safely mutate for retry recovery flows + const currentInput = [...initialInput]; const allowReentry = options?.allowReentry ?? false; // Guard against concurrent processConversation calls @@ -1665,6 +1671,58 @@ export default function App({ } // Unexpected stop reason (error, llm_api_error, etc.) + // Check for approval desync errors even if stop_reason isn't llm_api_error. 
+ const isApprovalPayload = + currentInput.length === 1 && currentInput[0]?.type === "approval"; + + const approvalDesyncDetected = async () => { + // 1) Check run metadata + const detailFromRun = await fetchRunErrorDetail(lastRunId); + if (isApprovalStateDesyncError(detailFromRun)) return true; + + // 2) Check the most recent streamed error line in this turn + for (let i = buffersRef.current.order.length - 1; i >= 0; i -= 1) { + const id = buffersRef.current.order[i]; + if (!id) continue; + const entry = buffersRef.current.byId.get(id); + if (entry?.kind === "error") { + return isApprovalStateDesyncError(entry.text); + } + } + return false; + }; + + if (isApprovalPayload && (await approvalDesyncDetected())) { + // Limit how many times we try this recovery to avoid loops + if (llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES) { + llmApiErrorRetriesRef.current += 1; + const statusId = uid("status"); + buffersRef.current.byId.set(statusId, { + kind: "status", + id: statusId, + lines: [ + "Approval state desynced; resending keep-alive recovery prompt...", + ], + }); + buffersRef.current.order.push(statusId); + refreshDerived(); + + currentInput.splice( + 0, + currentInput.length, + buildApprovalRecoveryMessage(), + ); + + // Remove the transient status before retrying + buffersRef.current.byId.delete(statusId); + buffersRef.current.order = buffersRef.current.order.filter( + (id) => id !== statusId, + ); + refreshDerived(); + continue; + } + } + // Check if this is a retriable error (transient LLM API error) const retriable = await isRetriableError( stopReasonToHandle, @@ -1681,10 +1739,13 @@ export default function App({ // Show subtle grey status message const statusId = uid("status"); + const statusLines = [ + "Unexpected downstream LLM API error, retrying...", + ]; buffersRef.current.byId.set(statusId, { kind: "status", id: statusId, - lines: ["Unexpected downstream LLM API error, retrying..."], + lines: statusLines, }); 
buffersRef.current.order.push(statusId); refreshDerived(); diff --git a/src/headless.ts b/src/headless.ts index c9c9e4e..2b29573 100644 --- a/src/headless.ts +++ b/src/headless.ts @@ -10,6 +10,11 @@ import type { } from "@letta-ai/letta-client/resources/agents/messages"; import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs"; import type { ApprovalResult } from "./agent/approval-execution"; +import { + buildApprovalRecoveryMessage, + fetchRunErrorDetail, + isApprovalStateDesyncError, +} from "./agent/approval-recovery"; import { getClient } from "./agent/client"; import { initializeLoadedSkillsFlag, setAgentContext } from "./agent/context"; import { createAgent } from "./agent/create"; @@ -1061,6 +1066,11 @@ export async function handleHeadlessCommand( // Case 3: Transient LLM API error - retry with exponential backoff up to a limit if (stopReason === "llm_api_error") { + const shouldUseApprovalRecovery = + currentInput.length === 1 && + currentInput[0]?.type === "approval" && + isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId)); + if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) { const attempt = llmApiErrorRetries + 1; const baseDelayMs = 1000; @@ -1082,17 +1092,76 @@ export async function handleHeadlessCommand( console.log(JSON.stringify(retryMsg)); } else { const delaySeconds = Math.round(delayMs / 1000); + const recoveryNote = shouldUseApprovalRecovery + ? 
" (approval state desynced - sending keep-going prompt)" + : ""; console.error( - `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`, + `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`, ); } // Exponential backoff before retrying the same input await new Promise((resolve) => setTimeout(resolve, delayMs)); + + if (shouldUseApprovalRecovery) { + currentInput = [buildApprovalRecoveryMessage()]; + } continue; } } + // Fallback: if we were sending only approvals and hit an internal error that + // says there is no pending approval, resend using the keep-alive recovery prompt. + const isApprovalPayload = + currentInput.length === 1 && currentInput[0]?.type === "approval"; + const approvalDesynced = + isApprovalPayload && + (isApprovalStateDesyncError(await fetchRunErrorDetail(lastRunId)) || + (() => { + const lines = toLines(buffers); + for (let i = lines.length - 1; i >= 0; i -= 1) { + const line = lines[i]; + if (!line) continue; + if ( + line.kind === "error" && + "text" in line && + typeof line.text === "string" + ) { + return isApprovalStateDesyncError(line.text ?? null); + } + } + return false; + })()); + + if (approvalDesynced && llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) { + llmApiErrorRetries += 1; + + const retryReason = stopReason ?? "error"; + if (outputFormat === "stream-json") { + const retryMsg: RetryMessage = { + type: "retry", + reason: retryReason, + attempt: llmApiErrorRetries, + max_attempts: LLM_API_ERROR_MAX_RETRIES, + delay_ms: 0, + run_id: lastRunId ?? 
undefined, + session_id: sessionId, + uuid: `retry-${lastRunId || crypto.randomUUID()}`, + }; + console.log(JSON.stringify(retryMsg)); + } else { + console.error( + "Approval state desynced; resending keep-alive recovery prompt...", + ); + } + + // Small pause to avoid rapid-fire retries + await new Promise((resolve) => setTimeout(resolve, 250)); + + currentInput = [buildApprovalRecoveryMessage()]; + continue; + } + // Unexpected stop reason (error, llm_api_error, etc.) // Before failing, check run metadata to see if this is a retriable llm_api_error // Fallback check: in case stop_reason is "error" but metadata indicates LLM error @@ -1415,7 +1484,7 @@ async function runBidirectionalMode( // Helper to get next line (from queue or wait) async function getNextLine(): Promise<string | null> { if (lineQueue.length > 0) { - return lineQueue.shift()!; + return lineQueue.shift() ?? null; } return new Promise((resolve) => { lineResolver = resolve;