feat: Exponential backoff for llm_api_error (#181)

2025-12-11 15:54:34 -08:00
parent a21336bd16
commit d5b02158d5
1 changed files with 40 additions and 0 deletions
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -24,6 +24,12 @@ import { drainStreamWithResume } from "./cli/helpers/stream";
 import { settingsManager } from "./settings-manager";
 import { checkToolPermission } from "./tools/manager";

+// Maximum number of automatic retries, per headless invocation,
+// when the backend reports an `llm_api_error` stop reason (the
+// counter is not reset between turns). This smooths over transient
+// LLM/backend issues without the caller resubmitting the prompt.
+const LLM_API_ERROR_MAX_RETRIES = 3;
+
 export async function handleHeadlessCommand(
  argv: string[],
  model?: string,
@@ -439,6 +445,7 @@ export async function handleHeadlessCommand(

  // Track lastRunId outside the while loop so it's available in catch block
  let lastKnownRunId: string | null = null;
+  let llmApiErrorRetries = 0;

  try {
    while (true) {
@@ -789,6 +796,39 @@ export async function handleHeadlessCommand(
        continue;
      }

+      // Case 3: Transient LLM API error - retry with exponential backoff up to a limit
+      if (stopReason === "llm_api_error") {
+        if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
+          const attempt = llmApiErrorRetries + 1;
+          const baseDelayMs = 1000;
+          const delayMs = baseDelayMs * 2 ** (attempt - 1);
+
+          llmApiErrorRetries = attempt;
+
+          if (outputFormat === "stream-json") {
+            console.log(
+              JSON.stringify({
+                type: "retry",
+                reason: "llm_api_error",
+                attempt,
+                max_attempts: LLM_API_ERROR_MAX_RETRIES,
+                delay_ms: delayMs,
+                run_id: lastRunId,
+              }),
+            );
+          } else {
+            const delaySeconds = Math.round(delayMs / 1000);
+            console.error(
+              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
+            );
+          }
+
+          // Exponential backoff before retrying the same input
+          await new Promise((resolve) => setTimeout(resolve, delayMs));
+          continue;
+        }
+      }
+
      // Unexpected stop reason (error, llm_api_error, etc.)
      // Mark incomplete tool calls as cancelled to prevent stuck state
      markIncompleteToolsAsCancelled(buffers);