From d5b02158d53725e7a640e5a842de0d5729842732 Mon Sep 17 00:00:00 2001 From: Kevin Lin Date: Thu, 11 Dec 2025 15:54:34 -0800 Subject: [PATCH] feat: Exponential backoff for `llm_api_error` (#181) --- src/headless.ts | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/headless.ts b/src/headless.ts index 57e31e8..981d224 100644 --- a/src/headless.ts +++ b/src/headless.ts @@ -24,6 +24,12 @@ import { drainStreamWithResume } from "./cli/helpers/stream"; import { settingsManager } from "./settings-manager"; import { checkToolPermission } from "./tools/manager"; +// Maximum number of times to retry a turn when the backend +// reports an `llm_api_error` stop reason. This helps smooth +// over transient LLM/backend issues without requiring the +// caller to manually resubmit the prompt. +const LLM_API_ERROR_MAX_RETRIES = 3; + export async function handleHeadlessCommand( argv: string[], model?: string, @@ -439,6 +445,7 @@ export async function handleHeadlessCommand( // Track lastRunId outside the while loop so it's available in catch block let lastKnownRunId: string | null = null; + let llmApiErrorRetries = 0; try { while (true) { @@ -789,6 +796,39 @@ export async function handleHeadlessCommand( continue; } + // Case 3: Transient LLM API error - retry with exponential backoff up to a limit + if (stopReason === "llm_api_error") { + if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) { + const attempt = llmApiErrorRetries + 1; + const baseDelayMs = 1000; + const delayMs = baseDelayMs * 2 ** (attempt - 1); + + llmApiErrorRetries = attempt; + + if (outputFormat === "stream-json") { + console.log( + JSON.stringify({ + type: "retry", + reason: "llm_api_error", + attempt, + max_attempts: LLM_API_ERROR_MAX_RETRIES, + delay_ms: delayMs, + run_id: lastRunId, + }), + ); + } else { + const delaySeconds = Math.round(delayMs / 1000); + console.error( + `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`, + ); + } + + // Exponential backoff before retrying the same input + await new Promise((resolve) => setTimeout(resolve, delayMs)); + continue; + } + } + // Unexpected stop reason (error, llm_api_error, etc.) // Mark incomplete tool calls as cancelled to prevent stuck state markIncompleteToolsAsCancelled(buffers);