fix: expand chatgpt oauth retry classification (#980)

2026-02-16 14:36:15 -08:00
parent f2f59e4591
commit 5435f44c45
10 changed files with 430 additions and 49 deletions
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -14,6 +14,8 @@ import {
  getPreStreamErrorAction,
  isApprovalPendingError,
  isInvalidToolCallIdsError,
+  parseRetryAfterHeaderMs,
+  shouldRetryRunMetadataError,
 } from "./agent/approval-recovery";
 import { getClient } from "./agent/client";
 import { setAgentContext, setConversationId } from "./agent/context";
@@ -1235,6 +1237,14 @@ ${SYSTEM_REMINDER_CLOSE}
          errorDetail,
          conversationBusyRetries,
          CONVERSATION_BUSY_MAX_RETRIES,
+          {
+            status:
+              preStreamError instanceof APIError
+                ? preStreamError.status
+                : undefined,
+            transientRetries: llmApiErrorRetries,
+            maxTransientRetries: LLM_API_ERROR_MAX_RETRIES,
+          },
        );

        // Check for pending approval blocking new messages - resolve and retry.
@@ -1290,6 +1300,41 @@ ${SYSTEM_REMINDER_CLOSE}
          continue;
        }

+        if (preStreamAction === "retry_transient") {
+          const attempt = llmApiErrorRetries + 1;
+          const retryAfterMs =
+            preStreamError instanceof APIError
+              ? parseRetryAfterHeaderMs(
+                  preStreamError.headers?.get("retry-after"),
+                )
+              : null;
+          const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
+
+          llmApiErrorRetries = attempt;
+
+          if (outputFormat === "stream-json") {
+            const retryMsg: RetryMessage = {
+              type: "retry",
+              reason: "llm_api_error",
+              attempt,
+              max_attempts: LLM_API_ERROR_MAX_RETRIES,
+              delay_ms: delayMs,
+              session_id: sessionId,
+              uuid: `retry-pre-stream-${crypto.randomUUID()}`,
+            };
+            console.log(JSON.stringify(retryMsg));
+          } else {
+            const delaySeconds = Math.round(delayMs / 1000);
+            console.error(
+              `Transient API error before streaming (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
+            );
+          }
+
+          await new Promise((resolve) => setTimeout(resolve, delayMs));
+          conversationBusyRetries = 0;
+          continue;
+        }
+
        // Reset conversation busy retry counter on other errors
        conversationBusyRetries = 0;

@@ -1696,31 +1741,9 @@ ${SYSTEM_REMINDER_CLOSE}
          const errorType =
            metaError?.error_type ?? metaError?.error?.error_type;

-          // Fallback: detect LLM provider errors from detail even if misclassified
-          // Patterns are derived from handle_llm_error() message formats in the backend
          const detail = metaError?.detail ?? metaError?.error?.detail ?? "";

-          // Don't retry 4xx client errors (validation, auth, malformed requests)
-          // These are not transient and won't succeed on retry
-          const is4xxError = /Error code: 4\d{2}/.test(detail);
-
-          const llmProviderPatterns = [
-            "Anthropic API error", // anthropic_client.py:759
-            "OpenAI API error", // openai_client.py:1034
-            "Google Vertex API error", // google_vertex_client.py:848
-            "overloaded", // anthropic_client.py:753 - used for LLMProviderOverloaded
-            "api_error", // Anthropic SDK error type field
-            "Network error", // Transient network failures during streaming
-            "Connection error during Anthropic streaming", // Peer disconnections, incomplete chunked reads
-          ];
-          const isLlmErrorFromDetail = llmProviderPatterns.some((pattern) =>
-            detail.includes(pattern),
-          );
-
-          if (
-            (errorType === "llm_error" || isLlmErrorFromDetail) &&
-            !is4xxError
-          ) {
+          if (shouldRetryRunMetadataError(errorType, detail)) {
            const attempt = llmApiErrorRetries + 1;
            const baseDelayMs = 1000;
            const delayMs = baseDelayMs * 2 ** (attempt - 1);
@@ -2397,6 +2420,7 @@ async function runBidirectionalMode(
        let numTurns = 0;
        let lastStopReason: StopReasonType | null = null; // Track for result subtype
        let sawStreamError = false; // Track if we emitted an error during streaming
+        let preStreamTransientRetries = 0;

        // Inject available skills as system-reminder for bidirectional mode (LET-7353)
        let enrichedContent = userContent;
@@ -2468,7 +2492,14 @@ async function runBidirectionalMode(

            // Route through shared pre-stream conflict classifier (parity with main loop + TUI)
            // Bidir mode has no conversation-busy retry budget, so pass 0/0 to disable busy-retry.
-            const preStreamAction = getPreStreamErrorAction(errorDetail, 0, 0);
+            const preStreamAction = getPreStreamErrorAction(errorDetail, 0, 0, {
+              status:
+                preStreamError instanceof APIError
+                  ? preStreamError.status
+                  : undefined,
+              transientRetries: preStreamTransientRetries,
+              maxTransientRetries: LLM_API_ERROR_MAX_RETRIES,
+            });

            if (preStreamAction === "resolve_approval_pending") {
              const recoveryMsg: RecoveryMessage = {
@@ -2484,8 +2515,35 @@ async function runBidirectionalMode(
              continue;
            }

+            if (preStreamAction === "retry_transient") {
+              const attempt = preStreamTransientRetries + 1;
+              const retryAfterMs =
+                preStreamError instanceof APIError
+                  ? parseRetryAfterHeaderMs(
+                      preStreamError.headers?.get("retry-after"),
+                    )
+                  : null;
+              const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
+              preStreamTransientRetries = attempt;
+
+              const retryMsg: RetryMessage = {
+                type: "retry",
+                reason: "llm_api_error",
+                attempt,
+                max_attempts: LLM_API_ERROR_MAX_RETRIES,
+                delay_ms: delayMs,
+                session_id: sessionId,
+                uuid: `retry-bidir-${crypto.randomUUID()}`,
+              };
+              console.log(JSON.stringify(retryMsg));
+
+              await new Promise((resolve) => setTimeout(resolve, delayMs));
+              continue;
+            }
+
            throw preStreamError;
          }
+          preStreamTransientRetries = 0;
          const streamJsonHook: DrainStreamHook = ({
            chunk,
            shouldOutput,