From a57cf84e035873451c92612fa6f1192ae67e5be3 Mon Sep 17 00:00:00 2001 From: jnjpng Date: Mon, 9 Mar 2026 14:51:41 -0700 Subject: [PATCH] fix(retry): increase Cloudflare transient backoff to 5/10/20 (#1307) --- src/agent/approval-recovery.ts | 3 + src/agent/turn-recovery-policy.ts | 55 ++++++++++++++++ src/cli/App.tsx | 27 +++++--- src/headless.ts | 42 +++++++++---- src/tests/turn-recovery-policy.test.ts | 86 ++++++++++++++++++++++++++ src/websocket/listen-client.ts | 15 ++++- 6 files changed, 207 insertions(+), 21 deletions(-) diff --git a/src/agent/approval-recovery.ts b/src/agent/approval-recovery.ts index f00c30a..d1ec9f3 100644 --- a/src/agent/approval-recovery.ts +++ b/src/agent/approval-recovery.ts @@ -13,12 +13,15 @@ export type { PreStreamConflictKind, PreStreamErrorAction, PreStreamErrorOptions, + RetryDelayCategory, } from "./turn-recovery-policy"; // ── Re-export pure policy helpers (single source of truth) ────────── export { classifyPreStreamConflict, extractConflictDetail, getPreStreamErrorAction, + getRetryDelayMs, + getTransientRetryDelayMs, isApprovalPendingError, isConversationBusyError, isEmptyResponseError, diff --git a/src/agent/turn-recovery-policy.ts b/src/agent/turn-recovery-policy.ts index f7c7104..3277a89 100644 --- a/src/agent/turn-recovery-policy.ts +++ b/src/agent/turn-recovery-policy.ts @@ -65,6 +65,11 @@ const NON_RETRYABLE_QUOTA_DETAIL_PATTERNS = [ ]; const NON_RETRYABLE_4XX_PATTERN = /Error code:\s*4(0[0-8]|1\d|2\d|3\d|4\d|51)/i; const RETRYABLE_429_PATTERN = /Error code:\s*429|rate limit|too many requests/i; +const DEFAULT_TRANSIENT_RETRY_BASE_DELAY_MS = 1000; +const CLOUDFLARE_EDGE_52X_RETRY_BASE_DELAY_MS = 5000; +const CONVERSATION_BUSY_RETRY_BASE_DELAY_MS = 10000; +const EMPTY_RESPONSE_RETRY_BASE_DELAY_MS = 500; + function isCloudflareEdge52xDetail(detail: unknown): boolean { if (typeof detail !== "string") return false; return isCloudflareEdge52xHtmlError(detail); @@ -206,6 +211,56 @@ export function parseRetryAfterHeaderMs( return delayMs > 0 ? delayMs : 0; } +export type RetryDelayCategory = + | "transient_provider" + | "conversation_busy" + | "empty_response"; + +/** + * Compute retry delay for known retry classes. + * - `transient_provider`: exponential (Cloudflare-specific base) with Retry-After override + * - `conversation_busy`: exponential + * - `empty_response`: linear + */ +export function getRetryDelayMs(opts: { + category: RetryDelayCategory; + attempt: number; + detail?: unknown; + retryAfterMs?: number | null; +}): number { + const { category, attempt, detail, retryAfterMs = null } = opts; + + if (category === "transient_provider") { + if (retryAfterMs !== null) return retryAfterMs; + const baseDelayMs = isCloudflareEdge52xDetail(detail) + ? CLOUDFLARE_EDGE_52X_RETRY_BASE_DELAY_MS + : DEFAULT_TRANSIENT_RETRY_BASE_DELAY_MS; + return baseDelayMs * 2 ** (attempt - 1); + } + + if (category === "conversation_busy") { + return CONVERSATION_BUSY_RETRY_BASE_DELAY_MS * 2 ** (attempt - 1); + } + + return EMPTY_RESPONSE_RETRY_BASE_DELAY_MS * attempt; +} + +/** + * Backward-compatible wrapper for transient provider retries. + */ +export function getTransientRetryDelayMs(opts: { + attempt: number; + detail: unknown; + retryAfterMs?: number | null; +}): number { + return getRetryDelayMs({ + category: "transient_provider", + attempt: opts.attempt, + detail: opts.detail, + retryAfterMs: opts.retryAfterMs, + }); +} + // ── Pre-stream conflict routing ───────────────────────────────────── export type PreStreamConflictKind = diff --git a/src/cli/App.tsx b/src/cli/App.tsx index d3218fe..5efb444 100644 --- a/src/cli/App.tsx +++ b/src/cli/App.tsx @@ -32,6 +32,7 @@ import { extractConflictDetail, fetchRunErrorDetail, getPreStreamErrorAction, + getRetryDelayMs, isApprovalPendingError, isEmptyResponseRetryable, isInvalidToolCallIdsError, @@ -331,7 +332,6 @@ const EMPTY_RESPONSE_MAX_RETRIES = 2; // Retry config for 409 "conversation busy" errors (exponential backoff) const CONVERSATION_BUSY_MAX_RETRIES = 3; // 10s -> 20s -> 40s -const CONVERSATION_BUSY_RETRY_BASE_DELAY_MS = 10000; // 10 seconds // Message shown when user interrupts the stream const INTERRUPT_MESSAGE = @@ -4072,9 +4072,10 @@ export default function App({ // Check for 409 "conversation busy" error - retry with exponential backoff if (preStreamAction === "retry_conversation_busy") { conversationBusyRetriesRef.current += 1; - const retryDelayMs = - CONVERSATION_BUSY_RETRY_BASE_DELAY_MS * - 2 ** (conversationBusyRetriesRef.current - 1); + const retryDelayMs = getRetryDelayMs({ + category: "conversation_busy", + attempt: conversationBusyRetriesRef.current, + }); // Log the conversation-busy error telemetry.trackError( @@ -4142,7 +4143,12 @@ export default function App({ preStreamError.headers?.get("retry-after"), ) : null; - const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1); + const delayMs = getRetryDelayMs({ + category: "transient_provider", + attempt, + detail: errorDetail, + retryAfterMs, + }); // Log the error that triggered the retry telemetry.trackError( @@ -5348,7 +5354,10 @@ export default function App({ ) { emptyResponseRetriesRef.current += 1; const attempt = emptyResponseRetriesRef.current; - const delayMs = 500 * attempt; + const delayMs = getRetryDelayMs({ + category: "empty_response", + attempt, + }); // Only append a nudge on the last attempt if (attempt >= EMPTY_RESPONSE_MAX_RETRIES) { @@ -5397,7 +5406,11 @@ export default function App({ ) { llmApiErrorRetriesRef.current += 1; const attempt = llmApiErrorRetriesRef.current; - const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s + const delayMs = getRetryDelayMs({ + category: "transient_provider", + attempt, + detail: detailFromRun ?? fallbackError, + }); // Log the error that triggered the retry telemetry.trackError( diff --git a/src/headless.ts b/src/headless.ts index 78a1857..43b6df6 100644 --- a/src/headless.ts +++ b/src/headless.ts @@ -12,6 +12,7 @@ import { extractConflictDetail, fetchRunErrorDetail, getPreStreamErrorAction, + getRetryDelayMs, isApprovalPendingError, isEmptyResponseRetryable, isInvalidToolCallIdsError, @@ -133,7 +134,6 @@ const EMPTY_RESPONSE_MAX_RETRIES = 2; // Retry config for 409 "conversation busy" errors (exponential backoff) const CONVERSATION_BUSY_MAX_RETRIES = 3; // 10s -> 20s -> 40s -const CONVERSATION_BUSY_RETRY_BASE_DELAY_MS = 10000; // 10 seconds export type BidirectionalQueuedInput = QueuedTurnInput< MessageCreate["content"] @@ -1544,9 +1544,10 @@ ${SYSTEM_REMINDER_CLOSE} // Check for 409 "conversation busy" error - retry once with delay if (preStreamAction === "retry_conversation_busy") { conversationBusyRetries += 1; - const retryDelayMs = - CONVERSATION_BUSY_RETRY_BASE_DELAY_MS * - 2 ** (conversationBusyRetries - 1); + const retryDelayMs = getRetryDelayMs({ + category: "conversation_busy", + attempt: conversationBusyRetries, + }); // Emit retry message for stream-json mode if (outputFormat === "stream-json") { @@ -1579,7 +1580,12 @@ ${SYSTEM_REMINDER_CLOSE} preStreamError.headers?.get("retry-after"), ) : null; - const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1); + const delayMs = getRetryDelayMs({ + category: "transient_provider", + attempt, + detail: errorDetail, + retryAfterMs, + }); llmApiErrorRetries = attempt; @@ -1910,8 +1916,11 @@ ${SYSTEM_REMINDER_CLOSE} if (stopReason === "llm_api_error") { if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) { const attempt = llmApiErrorRetries + 1; - const baseDelayMs = 1000; - const delayMs = baseDelayMs * 2 ** (attempt - 1); + const delayMs = getRetryDelayMs({ + category: "transient_provider", + attempt, + detail: detailFromRun, + }); llmApiErrorRetries = attempt; @@ -2038,7 +2047,10 @@ ${SYSTEM_REMINDER_CLOSE} ) ) { const attempt = emptyResponseRetries + 1; - const delayMs = 500 * attempt; + const delayMs = getRetryDelayMs({ + category: "empty_response", + attempt, + }); emptyResponseRetries = attempt; @@ -2075,8 +2087,11 @@ ${SYSTEM_REMINDER_CLOSE} if (shouldRetryRunMetadataError(errorType, detail)) { const attempt = llmApiErrorRetries + 1; - const baseDelayMs = 1000; - const delayMs = baseDelayMs * 2 ** (attempt - 1); + const delayMs = getRetryDelayMs({ + category: "transient_provider", + attempt, + detail, + }); llmApiErrorRetries = attempt; @@ -3169,7 +3184,12 @@ async function runBidirectionalMode( preStreamError.headers?.get("retry-after"), ) : null; - const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1); + const delayMs = getRetryDelayMs({ + category: "transient_provider", + attempt, + detail: errorDetail, + retryAfterMs, + }); preStreamTransientRetries = attempt; const retryMsg: RetryMessage = { diff --git a/src/tests/turn-recovery-policy.test.ts b/src/tests/turn-recovery-policy.test.ts index e9a0101..5f3d33e 100644 --- a/src/tests/turn-recovery-policy.test.ts +++ b/src/tests/turn-recovery-policy.test.ts @@ -3,6 +3,8 @@ import { classifyPreStreamConflict, extractConflictDetail, getPreStreamErrorAction, + getRetryDelayMs, + getTransientRetryDelayMs, isApprovalPendingError, isConversationBusyError, isEmptyResponseError, @@ -284,6 +286,90 @@ describe("parseRetryAfterHeaderMs", () => { }); }); +describe("getRetryDelayMs", () => { + test("uses default transient backoff for non-Cloudflare details", () => { + expect( + getRetryDelayMs({ + category: "transient_provider", + attempt: 1, + detail: "Connection error during streaming", + }), + ).toBe(1000); + expect( + getRetryDelayMs({ + category: "transient_provider", + attempt: 2, + detail: "Connection error during streaming", + }), + ).toBe(2000); + }); + + test("uses larger transient base for Cloudflare edge 52x details", () => { + const detail = + "521 api.letta.com | 521: Web server is downCloudflare Ray ID: 9d431b5f6f656c08"; + + expect( + getRetryDelayMs({ + category: "transient_provider", + attempt: 1, + detail, + }), + ).toBe(5000); + expect( + getRetryDelayMs({ + category: "transient_provider", + attempt: 3, + detail, + }), + ).toBe(20000); + }); + + test("uses Retry-After delay when provided for transient retries", () => { + const detail = + "521 api.letta.com | 521: Web server is downCloudflare Ray ID: 9d431b5f6f656c08"; + + expect( + getRetryDelayMs({ + category: "transient_provider", + attempt: 3, + detail, + retryAfterMs: 7000, + }), + ).toBe(7000); + }); + + test("uses exponential conversation_busy profile", () => { + expect(getRetryDelayMs({ category: "conversation_busy", attempt: 1 })).toBe( + 10000, + ); + expect(getRetryDelayMs({ category: "conversation_busy", attempt: 2 })).toBe( + 20000, + ); + }); + + test("uses linear empty_response profile", () => { + expect(getRetryDelayMs({ category: "empty_response", attempt: 1 })).toBe( + 500, + ); + expect(getRetryDelayMs({ category: "empty_response", attempt: 2 })).toBe( + 1000, + ); + }); +}); + +describe("getTransientRetryDelayMs", () => { + test("matches transient_provider category behavior", () => { + const detail = "Connection error during streaming"; + expect(getTransientRetryDelayMs({ attempt: 2, detail })).toBe( + getRetryDelayMs({ + category: "transient_provider", + attempt: 2, + detail, + }), + ); + }); +}); + // ── Error text extraction ─────────────────────────────────────────── describe("extractConflictDetail", () => { diff --git a/src/websocket/listen-client.ts b/src/websocket/listen-client.ts index fa6ddc8..69ed17f 100644 --- a/src/websocket/listen-client.ts +++ b/src/websocket/listen-client.ts @@ -23,6 +23,7 @@ import { getStreamToolContextId, sendMessageStream } from "../agent/message"; import { extractConflictDetail, getPreStreamErrorAction, + getRetryDelayMs, isApprovalPendingError, isInvalidToolCallIdsError, parseRetryAfterHeaderMs, @@ -1536,7 +1537,7 @@ async function sendMessageStreamWithRetry( let transientRetries = 0; let conversationBusyRetries = 0; let preStreamRecoveryAttempts = 0; - const MAX_CONVERSATION_BUSY_RETRIES = 1; + const MAX_CONVERSATION_BUSY_RETRIES = 3; // eslint-disable-next-line no-constant-condition while (true) { @@ -1609,7 +1610,12 @@ async function sendMessageStreamWithRetry( preStreamError.headers?.get("retry-after"), ) : null; - const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1); + const delayMs = getRetryDelayMs({ + category: "transient_provider", + attempt, + detail: errorDetail, + retryAfterMs, + }); transientRetries = attempt; emitToWS(socket, { @@ -1631,7 +1637,10 @@ async function sendMessageStreamWithRetry( if (action === "retry_conversation_busy") { const attempt = conversationBusyRetries + 1; - const delayMs = 2500; + const delayMs = getRetryDelayMs({ + category: "conversation_busy", + attempt, + }); conversationBusyRetries = attempt; emitToWS(socket, {