feat: resume server-side run on pre-stream 409 conversation busy (#1370)

Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
jnjpng
2026-03-12 17:59:45 -06:00
committed by GitHub
parent af45355617
commit 0992c00a03
4 changed files with 231 additions and 49 deletions

View File

@@ -52,7 +52,11 @@ import {
ensureMemoryFilesystemDirs,
getMemoryFilesystemRoot,
} from "../agent/memoryFilesystem";
import { getStreamToolContextId, sendMessageStream } from "../agent/message";
import {
getStreamToolContextId,
type StreamRequestContext,
sendMessageStream,
} from "../agent/message";
import {
getModelInfo,
getModelInfoForLlmConfig,
@@ -268,7 +272,13 @@ import {
import { formatStatusLineHelp } from "./helpers/statusLineHelp";
import { buildStatusLinePayload } from "./helpers/statusLinePayload";
import { executeStatusLineCommand } from "./helpers/statusLineRuntime";
import { type ApprovalRequest, drainStreamWithResume } from "./helpers/stream";
import {
type ApprovalRequest,
type DrainResult,
discoverFallbackRunIdWithTimeout,
drainStream,
drainStreamWithResume,
} from "./helpers/stream";
import {
collectFinishedTaskToolCalls,
createSubagentGroupItem,
@@ -3942,6 +3952,10 @@ export default function App({
clearCompletedSubagents();
}
// Capture once before the retry loop so the temporal filter in
// discoverFallbackRunIdWithTimeout covers runs created by any attempt.
const requestStartedAtMs = Date.now();
while (true) {
// Capture the signal BEFORE any async operations
// This prevents a race where handleInterrupt nulls the ref during await
@@ -3985,15 +3999,18 @@ export default function App({
// Wrap in try-catch to handle pre-stream desync errors (when sendMessageStream
// throws before streaming begins, e.g., retry after LLM error when backend
// already cleared the approval)
let stream: Awaited<ReturnType<typeof sendMessageStream>>;
let stream: Awaited<ReturnType<typeof sendMessageStream>> | null =
null;
let turnToolContextId: string | null = null;
let preStreamResumeResult: DrainResult | null = null;
try {
stream = await sendMessageStream(
const nextStream = await sendMessageStream(
conversationIdRef.current,
currentInput,
{ agentId: agentIdRef.current },
);
turnToolContextId = getStreamToolContextId(stream);
stream = nextStream;
turnToolContextId = getStreamToolContextId(nextStream);
} catch (preStreamError) {
debugLog(
"stream",
@@ -4082,42 +4099,134 @@ export default function App({
},
);
// Show status message
const statusId = uid("status");
buffersRef.current.byId.set(statusId, {
kind: "status",
id: statusId,
lines: ["Conversation is busy, waiting and retrying…"],
});
buffersRef.current.order.push(statusId);
refreshDerived();
// Attempt to discover and resume the in-flight run before waiting
try {
const resumeCtx: StreamRequestContext = {
conversationId: conversationIdRef.current,
resolvedConversationId: conversationIdRef.current,
agentId: agentIdRef.current,
requestStartedAtMs,
};
debugLog(
"stream",
"Conversation busy: attempting run discovery for resume (conv=%s, agent=%s)",
resumeCtx.conversationId,
resumeCtx.agentId,
);
const client = await getClient();
const discoveredRunId = await discoverFallbackRunIdWithTimeout(
client,
resumeCtx,
);
debugLog(
"stream",
"Run discovery result: %s",
discoveredRunId ?? "none",
);
// Wait with abort checking (same pattern as LLM API error retry)
let cancelled = false;
const startTime = Date.now();
while (Date.now() - startTime < retryDelayMs) {
if (
abortControllerRef.current?.signal.aborted ||
userCancelledRef.current
) {
cancelled = true;
break;
if (discoveredRunId) {
if (signal?.aborted || userCancelledRef.current) {
const isStaleAtAbort =
myGeneration !== conversationGenerationRef.current;
if (!isStaleAtAbort) {
setStreaming(false);
}
return;
}
// Found a running run — resume its stream
buffersRef.current.interrupted = false;
buffersRef.current.commitGeneration =
(buffersRef.current.commitGeneration || 0) + 1;
const resumeStream = await client.runs.messages.stream(
discoveredRunId,
{
starting_after: 0,
batch_size: 1000,
},
);
preStreamResumeResult = await drainStream(
resumeStream,
buffersRef.current,
refreshDerivedThrottled,
signal,
undefined, // no handleFirstMessage on resume
undefined,
contextTrackerRef.current,
);
// Attach the discovered run ID
if (!preStreamResumeResult.lastRunId) {
preStreamResumeResult.lastRunId = discoveredRunId;
}
debugLog(
"stream",
"Pre-stream resume succeeded (runId=%s, stopReason=%s)",
discoveredRunId,
preStreamResumeResult.stopReason,
);
// Fall through — preStreamResumeResult will short-circuit drainStreamWithResume
}
await new Promise((resolve) => setTimeout(resolve, 100));
} catch (resumeError) {
if (signal?.aborted || userCancelledRef.current) {
const isStaleAtAbort =
myGeneration !== conversationGenerationRef.current;
if (!isStaleAtAbort) {
setStreaming(false);
}
return;
}
debugLog(
"stream",
"Pre-stream resume failed, falling back to wait/retry: %s",
resumeError instanceof Error
? resumeError.message
: String(resumeError),
);
// Fall through to existing wait/retry behavior
}
// Remove status message
buffersRef.current.byId.delete(statusId);
buffersRef.current.order = buffersRef.current.order.filter(
(id) => id !== statusId,
);
refreshDerived();
// If resume succeeded, skip the wait/retry loop
if (!preStreamResumeResult) {
// Show status message
const statusId = uid("status");
buffersRef.current.byId.set(statusId, {
kind: "status",
id: statusId,
lines: ["Conversation is busy, waiting and retrying…"],
});
buffersRef.current.order.push(statusId);
refreshDerived();
if (!cancelled) {
// Reset interrupted flag so retry stream chunks are processed
buffersRef.current.interrupted = false;
restorePinnedPermissionMode();
continue;
// Wait with abort checking (same pattern as LLM API error retry)
let cancelled = false;
const startTime = Date.now();
while (Date.now() - startTime < retryDelayMs) {
if (
abortControllerRef.current?.signal.aborted ||
userCancelledRef.current
) {
cancelled = true;
break;
}
await new Promise((resolve) => setTimeout(resolve, 100));
}
// Remove status message
buffersRef.current.byId.delete(statusId);
buffersRef.current.order = buffersRef.current.order.filter(
(id) => id !== statusId,
);
refreshDerived();
if (!cancelled) {
// Reset interrupted flag so retry stream chunks are processed
buffersRef.current.interrupted = false;
restorePinnedPermissionMode();
continue;
}
}
// User pressed ESC - fall through to error handling
}
@@ -4297,7 +4406,10 @@ export default function App({
}
// Not a recoverable desync - re-throw to outer catch
throw preStreamError;
// (unless pre-stream resume already succeeded)
if (!preStreamResumeResult) {
throw preStreamError;
}
}
// Check again after network call - user may have pressed Escape during sendMessageStream
@@ -4403,6 +4515,25 @@ export default function App({
contextTrackerRef.current.currentTurnId++;
}
const drainResult = preStreamResumeResult
? preStreamResumeResult
: (() => {
if (!stream) {
throw new Error(
"Expected stream when pre-stream resume did not succeed",
);
}
return drainStreamWithResume(
stream,
buffersRef.current,
refreshDerivedThrottled,
signal, // Use captured signal, not ref (which may be nulled by handleInterrupt)
handleFirstMessage,
undefined,
contextTrackerRef.current,
);
})();
const {
stopReason,
approval,
@@ -4410,15 +4541,7 @@ export default function App({
apiDurationMs,
lastRunId,
fallbackError,
} = await drainStreamWithResume(
stream,
buffersRef.current,
refreshDerivedThrottled,
signal, // Use captured signal, not ref (which may be nulled by handleInterrupt)
handleFirstMessage,
undefined,
contextTrackerRef.current,
);
} = await drainResult;
// Update currentRunId for error reporting in catch block
currentRunId = lastRunId ?? undefined;

View File

@@ -16,7 +16,7 @@ import {
type StreamRequestContext,
} from "../../agent/message";
import { telemetry } from "../../telemetry";
import { debugWarn } from "../../utils/debug";
import { debugLog, debugWarn } from "../../utils/debug";
import { formatDuration, logTiming } from "../../utils/timing";
import {
@@ -57,7 +57,7 @@ export type DrainStreamHook = (
| undefined
| Promise<DrainStreamHookResult | undefined>;
type DrainResult = {
export type DrainResult = {
stopReason: StopReasonType;
lastRunId?: string | null;
lastSeqId?: number | null;
@@ -101,7 +101,7 @@ function parseRunCreatedAtMs(run: Run): number {
return Number.isFinite(parsed) ? parsed : 0;
}
async function discoverFallbackRunIdWithTimeout(
export async function discoverFallbackRunIdWithTimeout(
client: RunsListClient,
ctx: StreamRequestContext,
): Promise<string | null> {
@@ -512,6 +512,9 @@ export async function drainStreamWithResume(
);
let runIdToResume = result.lastRunId ?? null;
let runIdSource: "stream_chunk" | "discovery" | null = result.lastRunId
? "stream_chunk"
: null;
// If the stream failed before exposing run_id, try to discover the latest
// running/created run for this conversation that was created after send start.
@@ -523,13 +526,25 @@ export async function drainStreamWithResume(
!abortSignal.aborted
) {
try {
debugLog(
"stream",
"Mid-stream resume: attempting run discovery (conv=%s, agent=%s)",
streamRequestContext.conversationId,
streamRequestContext.agentId,
);
const client = await lazyClient();
runIdToResume = await discoverFallbackRunIdWithTimeout(
client,
streamRequestContext,
);
debugLog(
"stream",
"Mid-stream resume: run discovery result: %s",
runIdToResume ?? "none",
);
if (runIdToResume) {
result.lastRunId = runIdToResume;
runIdSource = "discovery";
}
} catch (lookupError) {
const lookupErrorMsg =
@@ -574,6 +589,21 @@ export async function drainStreamWithResume(
},
);
debugLog(
"stream",
"Mid-stream resume: fetching run stream (source=%s, runId=%s, lastSeqId=%s)",
runIdSource ?? "unknown",
runIdToResume,
result.lastSeqId ?? 0,
);
debugLog(
"stream",
"Mid-stream resume: attempting resume (runId=%s, lastSeqId=%s)",
runIdToResume,
result.lastSeqId ?? 0,
);
try {
const client = await lazyClient();
@@ -613,6 +643,12 @@ export async function drainStreamWithResume(
// Use the resume result (should have proper stop_reason now)
// Clear the original stream error since we recovered
debugLog(
"stream",
"Mid-stream resume succeeded (runId=%s, stopReason=%s)",
runIdToResume,
resumeResult.stopReason,
);
result = resumeResult;
// The resumed stream uses a fresh streamProcessor that won't have
@@ -635,6 +671,12 @@ export async function drainStreamWithResume(
resumeError instanceof Error
? resumeError.message
: String(resumeError);
debugLog(
"stream",
"Mid-stream resume failed (runId=%s): %s",
runIdToResume,
resumeErrorMsg,
);
telemetry.trackError(
"stream_resume_failed",
resumeErrorMsg,
@@ -655,6 +697,11 @@ export async function drainStreamWithResume(
// Only log if we actually skipped for a reason (i.e., we didn't enter the resume branch above)
if (skipReasons.length > 0) {
debugLog(
"stream",
"Mid-stream resume skipped: %s",
skipReasons.join(", "),
);
telemetry.trackError(
"stream_resume_skipped",
`${result.fallbackError || "Stream error (no client-side detail)"} [skip: ${skipReasons.join(", ")}]`,

View File

@@ -1610,6 +1610,11 @@ ${SYSTEM_REMINDER_CLOSE}
}
// Check for 409 "conversation busy" error - retry once with delay
// TODO: Add pre-stream resume logic for parity with App.tsx.
// Before waiting, attempt to discover the in-flight run via
// discoverFallbackRunIdWithTimeout() and resume its stream with
// client.runs.messages.stream() + drainStream(). See App.tsx
// retry_conversation_busy handler for reference implementation.
if (preStreamAction === "retry_conversation_busy") {
conversationBusyRetries += 1;
const retryDelayMs = getRetryDelayMs({

View File

@@ -2216,6 +2216,13 @@ async function sendMessageStreamWithRetry(
}
if (action === "retry_conversation_busy") {
// TODO: Add pre-stream resume logic for parity with App.tsx.
// Before waiting, attempt to discover the in-flight run via
// discoverFallbackRunIdWithTimeout() and resume its stream with
// client.runs.messages.stream() + drainStream(). This avoids
// blind wait/retry cycles when the server already created a run
// from the original request. See App.tsx retry_conversation_busy
// handler for reference implementation.
const attempt = conversationBusyRetries + 1;
const delayMs = getRetryDelayMs({
category: "conversation_busy",