feat: clean up keep-alive fallback handling (#663)

This commit is contained in:
cthomas
2026-01-27 17:50:37 -08:00
committed by GitHub
parent c2fd63f5b5
commit 0df8e51dac
4 changed files with 96 additions and 321 deletions

View File

@@ -9,10 +9,8 @@ import type { ApprovalCreate } from "@letta-ai/letta-client/resources/agents/mes
import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
import type { ApprovalResult } from "./agent/approval-execution";
import {
buildApprovalRecoveryMessage,
fetchRunErrorDetail,
isApprovalPendingError,
isApprovalStateDesyncError,
isConversationBusyError,
isInvalidToolCallIdsError,
} from "./agent/approval-recovery";
@@ -1385,26 +1383,11 @@ export async function handleHeadlessCommand(
}
}
// Detect approval desync once per turn
// Fetch run error detail for invalid tool call ID detection
const detailFromRun = await fetchRunErrorDetail(lastRunId);
const approvalDesynced =
currentInput.length === 1 &&
currentInput[0]?.type === "approval" &&
(isApprovalStateDesyncError(detailFromRun) ||
isApprovalStateDesyncError(latestErrorText));
// Track last failure text for emitting on exit
const lastFailureText =
latestErrorText ||
detailFromRun ||
(lastRunId
? `An error occurred during agent execution\n(run_id: ${lastRunId}, stop_reason: ${stopReason})`
: `An error occurred during agent execution\n(stop_reason: ${stopReason})`);
// Case 3: Transient LLM API error - retry with exponential backoff up to a limit
if (stopReason === "llm_api_error") {
const shouldUseApprovalRecovery = approvalDesynced;
if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
const attempt = llmApiErrorRetries + 1;
const baseDelayMs = 1000;
@@ -1426,104 +1409,64 @@ export async function handleHeadlessCommand(
console.log(JSON.stringify(retryMsg));
} else {
const delaySeconds = Math.round(delayMs / 1000);
const recoveryNote = shouldUseApprovalRecovery
? " (approval state desynced - sending keep-going prompt)"
: "";
console.error(
`LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`,
`LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
);
}
// Exponential backoff before retrying the same input
await new Promise((resolve) => setTimeout(resolve, delayMs));
if (shouldUseApprovalRecovery) {
currentInput = [buildApprovalRecoveryMessage()];
}
continue;
}
}
// Fallback: if we were sending only approvals and hit an internal error that
// says there is no pending approval, resend using the keep-alive recovery prompt.
if (approvalDesynced) {
// "Invalid tool call IDs" means server HAS pending approvals but with different IDs.
// Fetch the actual pending approvals and process them before retrying.
if (
isInvalidToolCallIdsError(detailFromRun) ||
isInvalidToolCallIdsError(latestErrorText)
) {
if (outputFormat === "stream-json") {
const recoveryMsg: RecoveryMessage = {
type: "recovery",
recovery_type: "invalid_tool_call_ids",
message:
"Tool call ID mismatch; fetching actual pending approvals and resyncing",
run_id: lastRunId ?? undefined,
session_id: sessionId,
uuid: `recovery-${lastRunId || crypto.randomUUID()}`,
};
console.log(JSON.stringify(recoveryMsg));
} else {
console.error(
"Tool call ID mismatch; fetching actual pending approvals...",
);
}
// "Invalid tool call IDs" means server HAS pending approvals but with different IDs.
// Fetch the actual pending approvals and process them before retrying.
const invalidIdsDetected =
isInvalidToolCallIdsError(detailFromRun) ||
isInvalidToolCallIdsError(latestErrorText);
try {
// Fetch and process actual pending approvals from server
await resolveAllPendingApprovals();
// After processing, continue to next iteration (fresh state)
continue;
} catch {
// If fetch fails, fall through to general desync recovery
}
}
if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
llmApiErrorRetries += 1;
const retryReason = stopReason ?? "error";
if (outputFormat === "stream-json") {
const retryMsg: RetryMessage = {
type: "retry",
reason: retryReason,
attempt: llmApiErrorRetries,
max_attempts: LLM_API_ERROR_MAX_RETRIES,
delay_ms: 0,
run_id: lastRunId ?? undefined,
session_id: sessionId,
uuid: `retry-${lastRunId || crypto.randomUUID()}`,
};
console.log(JSON.stringify(retryMsg));
} else {
console.error(
"Approval state desynced; resending keep-alive recovery prompt...",
);
}
// Small pause to avoid rapid-fire retries
await new Promise((resolve) => setTimeout(resolve, 250));
currentInput = [buildApprovalRecoveryMessage()];
continue;
}
// No retries left or non-retriable: emit error and exit
if (invalidIdsDetected) {
if (outputFormat === "stream-json") {
const errorMsg: ErrorMessage = {
type: "error",
message: lastFailureText,
stop_reason: stopReason,
const recoveryMsg: RecoveryMessage = {
type: "recovery",
recovery_type: "invalid_tool_call_ids",
message:
"Tool call ID mismatch; fetching actual pending approvals and resyncing",
run_id: lastRunId ?? undefined,
session_id: sessionId,
uuid: `error-${lastRunId || crypto.randomUUID()}`,
uuid: `recovery-${lastRunId || crypto.randomUUID()}`,
};
console.log(JSON.stringify(errorMsg));
console.log(JSON.stringify(recoveryMsg));
} else {
console.error(lastFailureText);
console.error(
"Tool call ID mismatch; fetching actual pending approvals...",
);
}
try {
// Fetch and process actual pending approvals from server
await resolveAllPendingApprovals();
// After processing, continue to next iteration (fresh state)
continue;
} catch {
// If fetch fails, exit with error
if (outputFormat === "stream-json") {
const errorMsg: ErrorMessage = {
type: "error",
message: "Failed to fetch pending approvals for resync",
stop_reason: stopReason,
run_id: lastRunId ?? undefined,
session_id: sessionId,
uuid: `error-${lastRunId || crypto.randomUUID()}`,
};
console.log(JSON.stringify(errorMsg));
} else {
console.error("Failed to fetch pending approvals for resync");
}
process.exit(1);
}
process.exit(1);
}
// Unexpected stop reason (error, llm_api_error, etc.)