feat: Add retries for stop_reason error with metadata (#191)

2025-12-12 09:51:49 -08:00
parent 97e6b7e2a7
commit a58dddaf3d
1 changed files with 49 additions and 0 deletions
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -830,6 +830,55 @@ export async function handleHeadlessCommand(
      }

      // Unexpected stop reason (error, llm_api_error, etc.)
+      // Before failing, check run metadata to see if this is a retriable llm_api_error
+      if (
+        stopReason === "error" &&
+        lastRunId && // a run id is needed to look up the run's metadata
+        llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES // retry budget not yet used up
+      ) {
+        try {
+          const run = await client.runs.retrieve(lastRunId);
+          const metaError = run.metadata?.error as
+            | {
+                type?: string;
+                message?: string;
+                detail?: string;
+              }
+            | undefined; // NOTE(review): unchecked cast — the metadata shape is assumed, not validated
+
+          if (metaError?.type === "llm_api_error") { // any other error type falls through to the failure path
+            const attempt = llmApiErrorRetries + 1; // 1-based attempt number
+            const baseDelayMs = 1000;
+            const delayMs = baseDelayMs * 2 ** (attempt - 1); // exponential backoff: base delay doubled per prior attempt
+
+            llmApiErrorRetries = attempt; // count this attempt before waiting
+
+            if (outputFormat === "stream-json") { // machine-readable retry event on stdout
+              console.log(
+                JSON.stringify({
+                  type: "retry",
+                  reason: "llm_api_error",
+                  attempt,
+                  max_attempts: LLM_API_ERROR_MAX_RETRIES,
+                  delay_ms: delayMs,
+                  run_id: lastRunId,
+                }),
+              );
+            } else { // human-readable notice on stderr
+              const delaySeconds = Math.round(delayMs / 1000); // whole seconds, for display only
+              console.error(
+                `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
+              );
+            }
+
+            await new Promise((resolve) => setTimeout(resolve, delayMs)); // wait out the backoff delay
+            continue; // NOTE(review): presumably re-enters the enclosing loop to retry — the loop is outside this hunk
+          }
+        } catch (_e) {
+          // Best-effort lookup: if we can't fetch run metadata, fall through to normal error handling
+        }
+      }
+
      // Mark incomplete tool calls as cancelled to prevent stuck state
      markIncompleteToolsAsCancelled(buffers);