From a58dddaf3d587d03ab3fc58dc929aee52b2bd4cb Mon Sep 17 00:00:00 2001
From: Kevin Lin <klin5061@gmail.com>
Date: Fri, 12 Dec 2025 09:51:49 -0800
Subject: [PATCH] feat: Add retries for stop_reason `error` with metadata
 (#191)

---
 src/headless.ts | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/src/headless.ts b/src/headless.ts
index 63f4273..4df8a12 100644
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -830,6 +830,55 @@ export async function handleHeadlessCommand(
       }
 
       // Unexpected stop reason (error, llm_api_error, etc.)
+      // Before failing, check run metadata to see if this is a retriable llm_api_error
+      if (
+        stopReason === "error" &&
+        lastRunId &&
+        llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES
+      ) {
+        try {
+          const run = await client.runs.retrieve(lastRunId);
+          const metaError = run.metadata?.error as
+            | {
+                type?: string;
+                message?: string;
+                detail?: string;
+              }
+            | undefined;
+
+          if (metaError?.type === "llm_api_error") {
+            const attempt = llmApiErrorRetries + 1;
+            const baseDelayMs = 1000;
+            const delayMs = baseDelayMs * 2 ** (attempt - 1);
+
+            llmApiErrorRetries = attempt;
+
+            if (outputFormat === "stream-json") {
+              console.log(
+                JSON.stringify({
+                  type: "retry",
+                  reason: "llm_api_error",
+                  attempt,
+                  max_attempts: LLM_API_ERROR_MAX_RETRIES,
+                  delay_ms: delayMs,
+                  run_id: lastRunId,
+                }),
+              );
+            } else {
+              const delaySeconds = Math.round(delayMs / 1000);
+              console.error(
+                `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
+              );
+            }
+
+            await new Promise((resolve) => setTimeout(resolve, delayMs));
+            continue;
+          }
+        } catch (_e) {
+          // If we can't fetch run metadata, fall through to normal error handling
+        }
+      }
+
       // Mark incomplete tool calls as cancelled to prevent stuck state
       markIncompleteToolsAsCancelled(buffers);