fix: expand chatgpt oauth retry classification (#980)

2026-02-16 14:36:15 -08:00
parent f2f59e4591
commit 5435f44c45
10 changed files with 430 additions and 49 deletions
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -34,8 +34,10 @@ import {
  getPreStreamErrorAction,
  isApprovalPendingError,
  isInvalidToolCallIdsError,
+  parseRetryAfterHeaderMs,
  rebuildInputWithFreshDenials,
  shouldAttemptApprovalRecovery,
+  shouldRetryRunMetadataError,
 } from "../agent/approval-recovery";
 import { prefetchAvailableModelHandles } from "../agent/available-models";
 import { getResumeData } from "../agent/check-approval";
@@ -482,29 +484,7 @@ async function isRetriableError(
      const errorType = metaError?.error_type ?? metaError?.error?.error_type;
      const detail = metaError?.detail ?? metaError?.error?.detail ?? "";

-      // Don't retry 4xx client errors (validation, auth, malformed requests)
-      // These are not transient and won't succeed on retry
-      const is4xxError = /Error code: 4\d{2}/.test(detail);
-
-      if (errorType === "llm_error" && !is4xxError) return true;
-
-      // Fallback: detect LLM provider errors from detail even if misclassified
-      // This handles edge cases where streaming errors weren't properly converted to LLMError
-      // Patterns are derived from handle_llm_error() message formats in the backend
-      const llmProviderPatterns = [
-        "Anthropic API error", // anthropic_client.py:759
-        "OpenAI API error", // openai_client.py:1034
-        "ChatGPT API error", // chatgpt_oauth_client.py - upstream connect errors
-        "Google Vertex API error", // google_vertex_client.py:848
-        "overloaded", // anthropic_client.py:753 - used for LLMProviderOverloaded
-        "api_error", // Anthropic SDK error type field
-        "Network error", // Transient network failures during streaming
-        "Connection error during", // Peer disconnections, incomplete chunked reads (Anthropic, ChatGPT streaming)
-      ];
-      if (
-        llmProviderPatterns.some((pattern) => detail.includes(pattern)) &&
-        !is4xxError
-      ) {
+      if (shouldRetryRunMetadataError(errorType, detail)) {
        return true;
      }

@@ -3156,6 +3136,14 @@ export default function App({
              errorDetail,
              conversationBusyRetriesRef.current,
              CONVERSATION_BUSY_MAX_RETRIES,
+              {
+                status:
+                  preStreamError instanceof APIError
+                    ? preStreamError.status
+                    : undefined,
+                transientRetries: llmApiErrorRetriesRef.current,
+                maxTransientRetries: LLM_API_ERROR_MAX_RETRIES,
+              },
            );

            // Resolve stale approval conflict: fetch real pending approvals, auto-deny, retry.
@@ -3238,6 +3226,54 @@ export default function App({
              // User pressed ESC - fall through to error handling
            }

+            // Retry pre-stream transient errors (429/5xx/network) with shared LLM retry budget
+            if (preStreamAction === "retry_transient") {
+              llmApiErrorRetriesRef.current += 1;
+              const attempt = llmApiErrorRetriesRef.current;
+              const retryAfterMs =
+                preStreamError instanceof APIError
+                  ? parseRetryAfterHeaderMs(
+                      preStreamError.headers?.get("retry-after"),
+                    )
+                  : null;
+              const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
+
+              const statusId = uid("status");
+              buffersRef.current.byId.set(statusId, {
+                kind: "status",
+                id: statusId,
+                lines: [getRetryStatusMessage(errorDetail)],
+              });
+              buffersRef.current.order.push(statusId);
+              refreshDerived();
+
+              let cancelled = false;
+              const startTime = Date.now();
+              while (Date.now() - startTime < delayMs) {
+                if (
+                  abortControllerRef.current?.signal.aborted ||
+                  userCancelledRef.current
+                ) {
+                  cancelled = true;
+                  break;
+                }
+                await new Promise((resolve) => setTimeout(resolve, 100));
+              }
+
+              buffersRef.current.byId.delete(statusId);
+              buffersRef.current.order = buffersRef.current.order.filter(
+                (id) => id !== statusId,
+              );
+              refreshDerived();
+
+              if (!cancelled) {
+                buffersRef.current.interrupted = false;
+                conversationBusyRetriesRef.current = 0;
+                continue;
+              }
+              // Retry wait was cancelled (abort signal or user ESC) - fall through to error handling
+            }
+
            // Reset conversation busy retry counter on non-busy error
            conversationBusyRetriesRef.current = 0;

--- a/src/cli/helpers/errorFormatter.ts
+++ b/src/cli/helpers/errorFormatter.ts
@@ -448,6 +448,22 @@ export function getRetryStatusMessage(

  if (errorDetail.includes("Anthropic API is overloaded"))
    return "Anthropic API is overloaded, retrying...";
+  if (
+    errorDetail.includes("ChatGPT API error") ||
+    errorDetail.includes("ChatGPT server error") ||
+    errorDetail.includes("upstream connect error")
+  ) {
+    return "OpenAI ChatGPT backend connection failed, retrying...";
+  }
+  if (
+    errorDetail.includes("Connection error during streaming") ||
+    errorDetail.includes("incomplete chunked read") ||
+    errorDetail.includes("connection termination")
+  ) {
+    return "OpenAI ChatGPT streaming connection dropped, retrying...";
+  }
+  if (errorDetail.includes("OpenAI API error"))
+    return "OpenAI API error, retrying...";

  return DEFAULT_RETRY_MESSAGE;
 }
--- a/src/cli/helpers/stream.ts
+++ b/src/cli/helpers/stream.ts
@@ -226,8 +226,9 @@ export async function drainStream(
      fallbackError = errorMessage;
    }

-    // Set error stop reason so drainStreamWithResume can try to reconnect
-    stopReason = "error";
+    // Preserve a stop reason already parsed from stream chunks (e.g. llm_api_error)
+    // and only fall back to generic "error" when none is available.
+    stopReason = streamProcessor.stopReason || "error";
    markIncompleteToolsAsCancelled(buffers, true, "stream_error");
    queueMicrotask(refresh);
  } finally {