fix(retry): increase Cloudflare transient backoff to 5/10/20 (#1307)

2026-03-09 14:51:41 -07:00
parent 28039dcb43
commit a57cf84e03
6 changed files with 207 additions and 21 deletions
--- a/src/agent/approval-recovery.ts
+++ b/src/agent/approval-recovery.ts
@@ -13,12 +13,15 @@ export type {
  PreStreamConflictKind,
  PreStreamErrorAction,
  PreStreamErrorOptions,
+  RetryDelayCategory,
 } from "./turn-recovery-policy";
 // ── Re-export pure policy helpers (single source of truth) ──────────
 export {
  classifyPreStreamConflict,
  extractConflictDetail,
  getPreStreamErrorAction,
+  getRetryDelayMs,
+  getTransientRetryDelayMs,
  isApprovalPendingError,
  isConversationBusyError,
  isEmptyResponseError,
--- a/src/agent/turn-recovery-policy.ts
+++ b/src/agent/turn-recovery-policy.ts
@@ -65,6 +65,11 @@ const NON_RETRYABLE_QUOTA_DETAIL_PATTERNS = [
 ];
 const NON_RETRYABLE_4XX_PATTERN = /Error code:\s*4(0[0-8]|1\d|2\d|3\d|4\d|51)/i;
 const RETRYABLE_429_PATTERN = /Error code:\s*429|rate limit|too many requests/i;
+const DEFAULT_TRANSIENT_RETRY_BASE_DELAY_MS = 1000;
+const CLOUDFLARE_EDGE_52X_RETRY_BASE_DELAY_MS = 5000;
+const CONVERSATION_BUSY_RETRY_BASE_DELAY_MS = 10000;
+const EMPTY_RESPONSE_RETRY_BASE_DELAY_MS = 500;
+
 function isCloudflareEdge52xDetail(detail: unknown): boolean {
  if (typeof detail !== "string") return false;
  return isCloudflareEdge52xHtmlError(detail);
@@ -206,6 +211,56 @@ export function parseRetryAfterHeaderMs(
  return delayMs > 0 ? delayMs : 0;
 }

+export type RetryDelayCategory =
+  | "transient_provider"
+  | "conversation_busy"
+  | "empty_response";
+
+/**
+ * Compute retry delay for known retry classes.
+ * - `transient_provider`: exponential (Cloudflare-specific base) with Retry-After override
+ * - `conversation_busy`: exponential
+ * - `empty_response`: linear
+ */
+export function getRetryDelayMs(opts: {
+  category: RetryDelayCategory;
+  attempt: number;
+  detail?: unknown;
+  retryAfterMs?: number | null;
+}): number {
+  const { category, attempt, detail, retryAfterMs = null } = opts;
+
+  if (category === "transient_provider") {
+    if (retryAfterMs !== null) return retryAfterMs;
+    const baseDelayMs = isCloudflareEdge52xDetail(detail)
+      ? CLOUDFLARE_EDGE_52X_RETRY_BASE_DELAY_MS
+      : DEFAULT_TRANSIENT_RETRY_BASE_DELAY_MS;
+    return baseDelayMs * 2 ** (attempt - 1);
+  }
+
+  if (category === "conversation_busy") {
+    return CONVERSATION_BUSY_RETRY_BASE_DELAY_MS * 2 ** (attempt - 1);
+  }
+
+  return EMPTY_RESPONSE_RETRY_BASE_DELAY_MS * attempt;
+}
+
+/**
+ * Backward-compatible wrapper for transient provider retries.
+ */
+export function getTransientRetryDelayMs(opts: {
+  attempt: number;
+  detail: unknown;
+  retryAfterMs?: number | null;
+}): number {
+  return getRetryDelayMs({
+    category: "transient_provider",
+    attempt: opts.attempt,
+    detail: opts.detail,
+    retryAfterMs: opts.retryAfterMs,
+  });
+}
+
 // ── Pre-stream conflict routing ─────────────────────────────────────

 export type PreStreamConflictKind =
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -32,6 +32,7 @@ import {
  extractConflictDetail,
  fetchRunErrorDetail,
  getPreStreamErrorAction,
+  getRetryDelayMs,
  isApprovalPendingError,
  isEmptyResponseRetryable,
  isInvalidToolCallIdsError,
@@ -331,7 +332,6 @@ const EMPTY_RESPONSE_MAX_RETRIES = 2;

 // Retry config for 409 "conversation busy" errors (exponential backoff)
 const CONVERSATION_BUSY_MAX_RETRIES = 3; // 10s -> 20s -> 40s
-const CONVERSATION_BUSY_RETRY_BASE_DELAY_MS = 10000; // 10 seconds

 // Message shown when user interrupts the stream
 const INTERRUPT_MESSAGE =
@@ -4072,9 +4072,10 @@ export default function App({
            // Check for 409 "conversation busy" error - retry with exponential backoff
            if (preStreamAction === "retry_conversation_busy") {
              conversationBusyRetriesRef.current += 1;
-              const retryDelayMs =
-                CONVERSATION_BUSY_RETRY_BASE_DELAY_MS *
-                2 ** (conversationBusyRetriesRef.current - 1);
+              const retryDelayMs = getRetryDelayMs({
+                category: "conversation_busy",
+                attempt: conversationBusyRetriesRef.current,
+              });

              // Log the conversation-busy error
              telemetry.trackError(
@@ -4142,7 +4143,12 @@ export default function App({
                      preStreamError.headers?.get("retry-after"),
                    )
                  : null;
-              const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
+              const delayMs = getRetryDelayMs({
+                category: "transient_provider",
+                attempt,
+                detail: errorDetail,
+                retryAfterMs,
+              });

              // Log the error that triggered the retry
              telemetry.trackError(
@@ -5348,7 +5354,10 @@ export default function App({
          ) {
            emptyResponseRetriesRef.current += 1;
            const attempt = emptyResponseRetriesRef.current;
-            const delayMs = 500 * attempt;
+            const delayMs = getRetryDelayMs({
+              category: "empty_response",
+              attempt,
+            });

            // Only append a nudge on the last attempt
            if (attempt >= EMPTY_RESPONSE_MAX_RETRIES) {
@@ -5397,7 +5406,11 @@ export default function App({
          ) {
            llmApiErrorRetriesRef.current += 1;
            const attempt = llmApiErrorRetriesRef.current;
-            const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s
+            const delayMs = getRetryDelayMs({
+              category: "transient_provider",
+              attempt,
+              detail: detailFromRun ?? fallbackError,
+            });

            // Log the error that triggered the retry
            telemetry.trackError(
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -12,6 +12,7 @@ import {
  extractConflictDetail,
  fetchRunErrorDetail,
  getPreStreamErrorAction,
+  getRetryDelayMs,
  isApprovalPendingError,
  isEmptyResponseRetryable,
  isInvalidToolCallIdsError,
@@ -133,7 +134,6 @@ const EMPTY_RESPONSE_MAX_RETRIES = 2;

 // Retry config for 409 "conversation busy" errors (exponential backoff)
 const CONVERSATION_BUSY_MAX_RETRIES = 3; // 10s -> 20s -> 40s
-const CONVERSATION_BUSY_RETRY_BASE_DELAY_MS = 10000; // 10 seconds

 export type BidirectionalQueuedInput = QueuedTurnInput<
  MessageCreate["content"]
@@ -1544,9 +1544,10 @@ ${SYSTEM_REMINDER_CLOSE}
        // Check for 409 "conversation busy" error - retry once with delay
        if (preStreamAction === "retry_conversation_busy") {
          conversationBusyRetries += 1;
-          const retryDelayMs =
-            CONVERSATION_BUSY_RETRY_BASE_DELAY_MS *
-            2 ** (conversationBusyRetries - 1);
+          const retryDelayMs = getRetryDelayMs({
+            category: "conversation_busy",
+            attempt: conversationBusyRetries,
+          });

          // Emit retry message for stream-json mode
          if (outputFormat === "stream-json") {
@@ -1579,7 +1580,12 @@ ${SYSTEM_REMINDER_CLOSE}
                  preStreamError.headers?.get("retry-after"),
                )
              : null;
-          const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
+          const delayMs = getRetryDelayMs({
+            category: "transient_provider",
+            attempt,
+            detail: errorDetail,
+            retryAfterMs,
+          });

          llmApiErrorRetries = attempt;

@@ -1910,8 +1916,11 @@ ${SYSTEM_REMINDER_CLOSE}
      if (stopReason === "llm_api_error") {
        if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
          const attempt = llmApiErrorRetries + 1;
-          const baseDelayMs = 1000;
-          const delayMs = baseDelayMs * 2 ** (attempt - 1);
+          const delayMs = getRetryDelayMs({
+            category: "transient_provider",
+            attempt,
+            detail: detailFromRun,
+          });

          llmApiErrorRetries = attempt;

@@ -2038,7 +2047,10 @@ ${SYSTEM_REMINDER_CLOSE}
            )
          ) {
            const attempt = emptyResponseRetries + 1;
-            const delayMs = 500 * attempt;
+            const delayMs = getRetryDelayMs({
+              category: "empty_response",
+              attempt,
+            });

            emptyResponseRetries = attempt;

@@ -2075,8 +2087,11 @@ ${SYSTEM_REMINDER_CLOSE}

          if (shouldRetryRunMetadataError(errorType, detail)) {
            const attempt = llmApiErrorRetries + 1;
-            const baseDelayMs = 1000;
-            const delayMs = baseDelayMs * 2 ** (attempt - 1);
+            const delayMs = getRetryDelayMs({
+              category: "transient_provider",
+              attempt,
+              detail,
+            });

            llmApiErrorRetries = attempt;

@@ -3169,7 +3184,12 @@ async function runBidirectionalMode(
                      preStreamError.headers?.get("retry-after"),
                    )
                  : null;
-              const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
+              const delayMs = getRetryDelayMs({
+                category: "transient_provider",
+                attempt,
+                detail: errorDetail,
+                retryAfterMs,
+              });
              preStreamTransientRetries = attempt;

              const retryMsg: RetryMessage = {
--- a/src/tests/turn-recovery-policy.test.ts
+++ b/src/tests/turn-recovery-policy.test.ts
@@ -3,6 +3,8 @@ import {
  classifyPreStreamConflict,
  extractConflictDetail,
  getPreStreamErrorAction,
+  getRetryDelayMs,
+  getTransientRetryDelayMs,
  isApprovalPendingError,
  isConversationBusyError,
  isEmptyResponseError,
@@ -284,6 +286,90 @@ describe("parseRetryAfterHeaderMs", () => {
  });
 });

+describe("getRetryDelayMs", () => {
+  test("uses default transient backoff for non-Cloudflare details", () => {
+    expect(
+      getRetryDelayMs({
+        category: "transient_provider",
+        attempt: 1,
+        detail: "Connection error during streaming",
+      }),
+    ).toBe(1000);
+    expect(
+      getRetryDelayMs({
+        category: "transient_provider",
+        attempt: 2,
+        detail: "Connection error during streaming",
+      }),
+    ).toBe(2000);
+  });
+
+  test("uses larger transient base for Cloudflare edge 52x details", () => {
+    const detail =
+      "521 <!DOCTYPE html><html><head><title>api.letta.com | 521: Web server is down</title></head><body>Cloudflare Ray ID: 9d431b5f6f656c08</body></html>";
+
+    expect(
+      getRetryDelayMs({
+        category: "transient_provider",
+        attempt: 1,
+        detail,
+      }),
+    ).toBe(5000);
+    expect(
+      getRetryDelayMs({
+        category: "transient_provider",
+        attempt: 3,
+        detail,
+      }),
+    ).toBe(20000);
+  });
+
+  test("uses Retry-After delay when provided for transient retries", () => {
+    const detail =
+      "521 <!DOCTYPE html><html><head><title>api.letta.com | 521: Web server is down</title></head><body>Cloudflare Ray ID: 9d431b5f6f656c08</body></html>";
+
+    expect(
+      getRetryDelayMs({
+        category: "transient_provider",
+        attempt: 3,
+        detail,
+        retryAfterMs: 7000,
+      }),
+    ).toBe(7000);
+  });
+
+  test("uses exponential conversation_busy profile", () => {
+    expect(getRetryDelayMs({ category: "conversation_busy", attempt: 1 })).toBe(
+      10000,
+    );
+    expect(getRetryDelayMs({ category: "conversation_busy", attempt: 2 })).toBe(
+      20000,
+    );
+  });
+
+  test("uses linear empty_response profile", () => {
+    expect(getRetryDelayMs({ category: "empty_response", attempt: 1 })).toBe(
+      500,
+    );
+    expect(getRetryDelayMs({ category: "empty_response", attempt: 2 })).toBe(
+      1000,
+    );
+  });
+});
+
+describe("getTransientRetryDelayMs", () => {
+  test("matches transient_provider category behavior", () => {
+    const detail = "Connection error during streaming";
+    expect(getTransientRetryDelayMs({ attempt: 2, detail })).toBe(
+      getRetryDelayMs({
+        category: "transient_provider",
+        attempt: 2,
+        detail,
+      }),
+    );
+  });
+});
+
 // ── Error text extraction ───────────────────────────────────────────

 describe("extractConflictDetail", () => {
--- a/src/websocket/listen-client.ts
+++ b/src/websocket/listen-client.ts
@@ -23,6 +23,7 @@ import { getStreamToolContextId, sendMessageStream } from "../agent/message";
 import {
  extractConflictDetail,
  getPreStreamErrorAction,
+  getRetryDelayMs,
  isApprovalPendingError,
  isInvalidToolCallIdsError,
  parseRetryAfterHeaderMs,
@@ -1536,7 +1537,7 @@ async function sendMessageStreamWithRetry(
  let transientRetries = 0;
  let conversationBusyRetries = 0;
  let preStreamRecoveryAttempts = 0;
-  const MAX_CONVERSATION_BUSY_RETRIES = 1;
+  const MAX_CONVERSATION_BUSY_RETRIES = 3;

  // eslint-disable-next-line no-constant-condition
  while (true) {
@@ -1609,7 +1610,12 @@ async function sendMessageStreamWithRetry(
                preStreamError.headers?.get("retry-after"),
              )
            : null;
-        const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
+        const delayMs = getRetryDelayMs({
+          category: "transient_provider",
+          attempt,
+          detail: errorDetail,
+          retryAfterMs,
+        });
        transientRetries = attempt;

        emitToWS(socket, {
@@ -1631,7 +1637,10 @@ async function sendMessageStreamWithRetry(

      if (action === "retry_conversation_busy") {
        const attempt = conversationBusyRetries + 1;
-        const delayMs = 2500;
+        const delayMs = getRetryDelayMs({
+          category: "conversation_busy",
+          attempt,
+        });
        conversationBusyRetries = attempt;

        emitToWS(socket, {