feat: resume server-side run on pre-stream 409 conversation busy (#1370)

Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
jnjpng
2026-03-12 17:59:45 -06:00
committed by GitHub
parent af45355617
commit 0992c00a03
4 changed files with 231 additions and 49 deletions

View File

@@ -52,7 +52,11 @@ import {
ensureMemoryFilesystemDirs, ensureMemoryFilesystemDirs,
getMemoryFilesystemRoot, getMemoryFilesystemRoot,
} from "../agent/memoryFilesystem"; } from "../agent/memoryFilesystem";
import { getStreamToolContextId, sendMessageStream } from "../agent/message"; import {
getStreamToolContextId,
type StreamRequestContext,
sendMessageStream,
} from "../agent/message";
import { import {
getModelInfo, getModelInfo,
getModelInfoForLlmConfig, getModelInfoForLlmConfig,
@@ -268,7 +272,13 @@ import {
import { formatStatusLineHelp } from "./helpers/statusLineHelp"; import { formatStatusLineHelp } from "./helpers/statusLineHelp";
import { buildStatusLinePayload } from "./helpers/statusLinePayload"; import { buildStatusLinePayload } from "./helpers/statusLinePayload";
import { executeStatusLineCommand } from "./helpers/statusLineRuntime"; import { executeStatusLineCommand } from "./helpers/statusLineRuntime";
import { type ApprovalRequest, drainStreamWithResume } from "./helpers/stream"; import {
type ApprovalRequest,
type DrainResult,
discoverFallbackRunIdWithTimeout,
drainStream,
drainStreamWithResume,
} from "./helpers/stream";
import { import {
collectFinishedTaskToolCalls, collectFinishedTaskToolCalls,
createSubagentGroupItem, createSubagentGroupItem,
@@ -3942,6 +3952,10 @@ export default function App({
clearCompletedSubagents(); clearCompletedSubagents();
} }
// Capture once before the retry loop so the temporal filter in
// discoverFallbackRunIdWithTimeout covers runs created by any attempt.
const requestStartedAtMs = Date.now();
while (true) { while (true) {
// Capture the signal BEFORE any async operations // Capture the signal BEFORE any async operations
// This prevents a race where handleInterrupt nulls the ref during await // This prevents a race where handleInterrupt nulls the ref during await
@@ -3985,15 +3999,18 @@ export default function App({
// Wrap in try-catch to handle pre-stream desync errors (when sendMessageStream // Wrap in try-catch to handle pre-stream desync errors (when sendMessageStream
// throws before streaming begins, e.g., retry after LLM error when backend // throws before streaming begins, e.g., retry after LLM error when backend
// already cleared the approval) // already cleared the approval)
let stream: Awaited<ReturnType<typeof sendMessageStream>>; let stream: Awaited<ReturnType<typeof sendMessageStream>> | null =
null;
let turnToolContextId: string | null = null; let turnToolContextId: string | null = null;
let preStreamResumeResult: DrainResult | null = null;
try { try {
stream = await sendMessageStream( const nextStream = await sendMessageStream(
conversationIdRef.current, conversationIdRef.current,
currentInput, currentInput,
{ agentId: agentIdRef.current }, { agentId: agentIdRef.current },
); );
turnToolContextId = getStreamToolContextId(stream); stream = nextStream;
turnToolContextId = getStreamToolContextId(nextStream);
} catch (preStreamError) { } catch (preStreamError) {
debugLog( debugLog(
"stream", "stream",
@@ -4082,42 +4099,134 @@ export default function App({
}, },
); );
// Show status message // Attempt to discover and resume the in-flight run before waiting
const statusId = uid("status"); try {
buffersRef.current.byId.set(statusId, { const resumeCtx: StreamRequestContext = {
kind: "status", conversationId: conversationIdRef.current,
id: statusId, resolvedConversationId: conversationIdRef.current,
lines: ["Conversation is busy, waiting and retrying…"], agentId: agentIdRef.current,
}); requestStartedAtMs,
buffersRef.current.order.push(statusId); };
refreshDerived(); debugLog(
"stream",
"Conversation busy: attempting run discovery for resume (conv=%s, agent=%s)",
resumeCtx.conversationId,
resumeCtx.agentId,
);
const client = await getClient();
const discoveredRunId = await discoverFallbackRunIdWithTimeout(
client,
resumeCtx,
);
debugLog(
"stream",
"Run discovery result: %s",
discoveredRunId ?? "none",
);
// Wait with abort checking (same pattern as LLM API error retry) if (discoveredRunId) {
let cancelled = false; if (signal?.aborted || userCancelledRef.current) {
const startTime = Date.now(); const isStaleAtAbort =
while (Date.now() - startTime < retryDelayMs) { myGeneration !== conversationGenerationRef.current;
if ( if (!isStaleAtAbort) {
abortControllerRef.current?.signal.aborted || setStreaming(false);
userCancelledRef.current }
) { return;
cancelled = true; }
break;
// Found a running run — resume its stream
buffersRef.current.interrupted = false;
buffersRef.current.commitGeneration =
(buffersRef.current.commitGeneration || 0) + 1;
const resumeStream = await client.runs.messages.stream(
discoveredRunId,
{
starting_after: 0,
batch_size: 1000,
},
);
preStreamResumeResult = await drainStream(
resumeStream,
buffersRef.current,
refreshDerivedThrottled,
signal,
undefined, // no handleFirstMessage on resume
undefined,
contextTrackerRef.current,
);
// Attach the discovered run ID
if (!preStreamResumeResult.lastRunId) {
preStreamResumeResult.lastRunId = discoveredRunId;
}
debugLog(
"stream",
"Pre-stream resume succeeded (runId=%s, stopReason=%s)",
discoveredRunId,
preStreamResumeResult.stopReason,
);
// Fall through — preStreamResumeResult will short-circuit drainStreamWithResume
} }
await new Promise((resolve) => setTimeout(resolve, 100)); } catch (resumeError) {
if (signal?.aborted || userCancelledRef.current) {
const isStaleAtAbort =
myGeneration !== conversationGenerationRef.current;
if (!isStaleAtAbort) {
setStreaming(false);
}
return;
}
debugLog(
"stream",
"Pre-stream resume failed, falling back to wait/retry: %s",
resumeError instanceof Error
? resumeError.message
: String(resumeError),
);
// Fall through to existing wait/retry behavior
} }
// Remove status message // If resume succeeded, skip the wait/retry loop
buffersRef.current.byId.delete(statusId); if (!preStreamResumeResult) {
buffersRef.current.order = buffersRef.current.order.filter( // Show status message
(id) => id !== statusId, const statusId = uid("status");
); buffersRef.current.byId.set(statusId, {
refreshDerived(); kind: "status",
id: statusId,
lines: ["Conversation is busy, waiting and retrying…"],
});
buffersRef.current.order.push(statusId);
refreshDerived();
if (!cancelled) { // Wait with abort checking (same pattern as LLM API error retry)
// Reset interrupted flag so retry stream chunks are processed let cancelled = false;
buffersRef.current.interrupted = false; const startTime = Date.now();
restorePinnedPermissionMode(); while (Date.now() - startTime < retryDelayMs) {
continue; if (
abortControllerRef.current?.signal.aborted ||
userCancelledRef.current
) {
cancelled = true;
break;
}
await new Promise((resolve) => setTimeout(resolve, 100));
}
// Remove status message
buffersRef.current.byId.delete(statusId);
buffersRef.current.order = buffersRef.current.order.filter(
(id) => id !== statusId,
);
refreshDerived();
if (!cancelled) {
// Reset interrupted flag so retry stream chunks are processed
buffersRef.current.interrupted = false;
restorePinnedPermissionMode();
continue;
}
} }
// User pressed ESC - fall through to error handling // User pressed ESC - fall through to error handling
} }
@@ -4297,7 +4406,10 @@ export default function App({
} }
// Not a recoverable desync - re-throw to outer catch // Not a recoverable desync - re-throw to outer catch
throw preStreamError; // (unless pre-stream resume already succeeded)
if (!preStreamResumeResult) {
throw preStreamError;
}
} }
// Check again after network call - user may have pressed Escape during sendMessageStream // Check again after network call - user may have pressed Escape during sendMessageStream
@@ -4403,6 +4515,25 @@ export default function App({
contextTrackerRef.current.currentTurnId++; contextTrackerRef.current.currentTurnId++;
} }
const drainResult = preStreamResumeResult
? preStreamResumeResult
: (() => {
if (!stream) {
throw new Error(
"Expected stream when pre-stream resume did not succeed",
);
}
return drainStreamWithResume(
stream,
buffersRef.current,
refreshDerivedThrottled,
signal, // Use captured signal, not ref (which may be nulled by handleInterrupt)
handleFirstMessage,
undefined,
contextTrackerRef.current,
);
})();
const { const {
stopReason, stopReason,
approval, approval,
@@ -4410,15 +4541,7 @@ export default function App({
apiDurationMs, apiDurationMs,
lastRunId, lastRunId,
fallbackError, fallbackError,
} = await drainStreamWithResume( } = await drainResult;
stream,
buffersRef.current,
refreshDerivedThrottled,
signal, // Use captured signal, not ref (which may be nulled by handleInterrupt)
handleFirstMessage,
undefined,
contextTrackerRef.current,
);
// Update currentRunId for error reporting in catch block // Update currentRunId for error reporting in catch block
currentRunId = lastRunId ?? undefined; currentRunId = lastRunId ?? undefined;

View File

@@ -16,7 +16,7 @@ import {
type StreamRequestContext, type StreamRequestContext,
} from "../../agent/message"; } from "../../agent/message";
import { telemetry } from "../../telemetry"; import { telemetry } from "../../telemetry";
import { debugWarn } from "../../utils/debug"; import { debugLog, debugWarn } from "../../utils/debug";
import { formatDuration, logTiming } from "../../utils/timing"; import { formatDuration, logTiming } from "../../utils/timing";
import { import {
@@ -57,7 +57,7 @@ export type DrainStreamHook = (
| undefined | undefined
| Promise<DrainStreamHookResult | undefined>; | Promise<DrainStreamHookResult | undefined>;
type DrainResult = { export type DrainResult = {
stopReason: StopReasonType; stopReason: StopReasonType;
lastRunId?: string | null; lastRunId?: string | null;
lastSeqId?: number | null; lastSeqId?: number | null;
@@ -101,7 +101,7 @@ function parseRunCreatedAtMs(run: Run): number {
return Number.isFinite(parsed) ? parsed : 0; return Number.isFinite(parsed) ? parsed : 0;
} }
async function discoverFallbackRunIdWithTimeout( export async function discoverFallbackRunIdWithTimeout(
client: RunsListClient, client: RunsListClient,
ctx: StreamRequestContext, ctx: StreamRequestContext,
): Promise<string | null> { ): Promise<string | null> {
@@ -512,6 +512,9 @@ export async function drainStreamWithResume(
); );
let runIdToResume = result.lastRunId ?? null; let runIdToResume = result.lastRunId ?? null;
let runIdSource: "stream_chunk" | "discovery" | null = result.lastRunId
? "stream_chunk"
: null;
// If the stream failed before exposing run_id, try to discover the latest // If the stream failed before exposing run_id, try to discover the latest
// running/created run for this conversation that was created after send start. // running/created run for this conversation that was created after send start.
@@ -523,13 +526,25 @@ export async function drainStreamWithResume(
!abortSignal.aborted !abortSignal.aborted
) { ) {
try { try {
debugLog(
"stream",
"Mid-stream resume: attempting run discovery (conv=%s, agent=%s)",
streamRequestContext.conversationId,
streamRequestContext.agentId,
);
const client = await lazyClient(); const client = await lazyClient();
runIdToResume = await discoverFallbackRunIdWithTimeout( runIdToResume = await discoverFallbackRunIdWithTimeout(
client, client,
streamRequestContext, streamRequestContext,
); );
debugLog(
"stream",
"Mid-stream resume: run discovery result: %s",
runIdToResume ?? "none",
);
if (runIdToResume) { if (runIdToResume) {
result.lastRunId = runIdToResume; result.lastRunId = runIdToResume;
runIdSource = "discovery";
} }
} catch (lookupError) { } catch (lookupError) {
const lookupErrorMsg = const lookupErrorMsg =
@@ -574,6 +589,21 @@ export async function drainStreamWithResume(
}, },
); );
debugLog(
"stream",
"Mid-stream resume: fetching run stream (source=%s, runId=%s, lastSeqId=%s)",
runIdSource ?? "unknown",
runIdToResume,
result.lastSeqId ?? 0,
);
debugLog(
"stream",
"Mid-stream resume: attempting resume (runId=%s, lastSeqId=%s)",
runIdToResume,
result.lastSeqId ?? 0,
);
try { try {
const client = await lazyClient(); const client = await lazyClient();
@@ -613,6 +643,12 @@ export async function drainStreamWithResume(
// Use the resume result (should have proper stop_reason now) // Use the resume result (should have proper stop_reason now)
// Clear the original stream error since we recovered // Clear the original stream error since we recovered
debugLog(
"stream",
"Mid-stream resume succeeded (runId=%s, stopReason=%s)",
runIdToResume,
resumeResult.stopReason,
);
result = resumeResult; result = resumeResult;
// The resumed stream uses a fresh streamProcessor that won't have // The resumed stream uses a fresh streamProcessor that won't have
@@ -635,6 +671,12 @@ export async function drainStreamWithResume(
resumeError instanceof Error resumeError instanceof Error
? resumeError.message ? resumeError.message
: String(resumeError); : String(resumeError);
debugLog(
"stream",
"Mid-stream resume failed (runId=%s): %s",
runIdToResume,
resumeErrorMsg,
);
telemetry.trackError( telemetry.trackError(
"stream_resume_failed", "stream_resume_failed",
resumeErrorMsg, resumeErrorMsg,
@@ -655,6 +697,11 @@ export async function drainStreamWithResume(
// Only log if we actually skipped for a reason (i.e., we didn't enter the resume branch above) // Only log if we actually skipped for a reason (i.e., we didn't enter the resume branch above)
if (skipReasons.length > 0) { if (skipReasons.length > 0) {
debugLog(
"stream",
"Mid-stream resume skipped: %s",
skipReasons.join(", "),
);
telemetry.trackError( telemetry.trackError(
"stream_resume_skipped", "stream_resume_skipped",
`${result.fallbackError || "Stream error (no client-side detail)"} [skip: ${skipReasons.join(", ")}]`, `${result.fallbackError || "Stream error (no client-side detail)"} [skip: ${skipReasons.join(", ")}]`,

View File

@@ -1610,6 +1610,11 @@ ${SYSTEM_REMINDER_CLOSE}
} }
// Check for 409 "conversation busy" error - retry once with delay // Check for 409 "conversation busy" error - retry once with delay
// TODO: Add pre-stream resume logic for parity with App.tsx.
// Before waiting, attempt to discover the in-flight run via
// discoverFallbackRunIdWithTimeout() and resume its stream with
// client.runs.messages.stream() + drainStream(). See App.tsx
// retry_conversation_busy handler for reference implementation.
if (preStreamAction === "retry_conversation_busy") { if (preStreamAction === "retry_conversation_busy") {
conversationBusyRetries += 1; conversationBusyRetries += 1;
const retryDelayMs = getRetryDelayMs({ const retryDelayMs = getRetryDelayMs({

View File

@@ -2216,6 +2216,13 @@ async function sendMessageStreamWithRetry(
} }
if (action === "retry_conversation_busy") { if (action === "retry_conversation_busy") {
// TODO: Add pre-stream resume logic for parity with App.tsx.
// Before waiting, attempt to discover the in-flight run via
// discoverFallbackRunIdWithTimeout() and resume its stream with
// client.runs.messages.stream() + drainStream(). This avoids
// blind wait/retry cycles when the server already created a run
// from the original request. See App.tsx retry_conversation_busy
// handler for reference implementation.
const attempt = conversationBusyRetries + 1; const attempt = conversationBusyRetries + 1;
const delayMs = getRetryDelayMs({ const delayMs = getRetryDelayMs({
category: "conversation_busy", category: "conversation_busy",