feat: resume server-side run on pre-stream 409 conversation busy (#1370)

Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
jnjpng
2026-03-12 17:59:45 -06:00
committed by GitHub
parent af45355617
commit 0992c00a03
4 changed files with 231 additions and 49 deletions

View File

@@ -52,7 +52,11 @@ import {
ensureMemoryFilesystemDirs, ensureMemoryFilesystemDirs,
getMemoryFilesystemRoot, getMemoryFilesystemRoot,
} from "../agent/memoryFilesystem"; } from "../agent/memoryFilesystem";
import { getStreamToolContextId, sendMessageStream } from "../agent/message"; import {
getStreamToolContextId,
type StreamRequestContext,
sendMessageStream,
} from "../agent/message";
import { import {
getModelInfo, getModelInfo,
getModelInfoForLlmConfig, getModelInfoForLlmConfig,
@@ -268,7 +272,13 @@ import {
import { formatStatusLineHelp } from "./helpers/statusLineHelp"; import { formatStatusLineHelp } from "./helpers/statusLineHelp";
import { buildStatusLinePayload } from "./helpers/statusLinePayload"; import { buildStatusLinePayload } from "./helpers/statusLinePayload";
import { executeStatusLineCommand } from "./helpers/statusLineRuntime"; import { executeStatusLineCommand } from "./helpers/statusLineRuntime";
import { type ApprovalRequest, drainStreamWithResume } from "./helpers/stream"; import {
type ApprovalRequest,
type DrainResult,
discoverFallbackRunIdWithTimeout,
drainStream,
drainStreamWithResume,
} from "./helpers/stream";
import { import {
collectFinishedTaskToolCalls, collectFinishedTaskToolCalls,
createSubagentGroupItem, createSubagentGroupItem,
@@ -3942,6 +3952,10 @@ export default function App({
clearCompletedSubagents(); clearCompletedSubagents();
} }
// Capture once before the retry loop so the temporal filter in
// discoverFallbackRunIdWithTimeout covers runs created by any attempt.
const requestStartedAtMs = Date.now();
while (true) { while (true) {
// Capture the signal BEFORE any async operations // Capture the signal BEFORE any async operations
// This prevents a race where handleInterrupt nulls the ref during await // This prevents a race where handleInterrupt nulls the ref during await
@@ -3985,15 +3999,18 @@ export default function App({
// Wrap in try-catch to handle pre-stream desync errors (when sendMessageStream // Wrap in try-catch to handle pre-stream desync errors (when sendMessageStream
// throws before streaming begins, e.g., retry after LLM error when backend // throws before streaming begins, e.g., retry after LLM error when backend
// already cleared the approval) // already cleared the approval)
let stream: Awaited<ReturnType<typeof sendMessageStream>>; let stream: Awaited<ReturnType<typeof sendMessageStream>> | null =
null;
let turnToolContextId: string | null = null; let turnToolContextId: string | null = null;
let preStreamResumeResult: DrainResult | null = null;
try { try {
stream = await sendMessageStream( const nextStream = await sendMessageStream(
conversationIdRef.current, conversationIdRef.current,
currentInput, currentInput,
{ agentId: agentIdRef.current }, { agentId: agentIdRef.current },
); );
turnToolContextId = getStreamToolContextId(stream); stream = nextStream;
turnToolContextId = getStreamToolContextId(nextStream);
} catch (preStreamError) { } catch (preStreamError) {
debugLog( debugLog(
"stream", "stream",
@@ -4082,42 +4099,134 @@ export default function App({
}, },
); );
// Show status message // Attempt to discover and resume the in-flight run before waiting
const statusId = uid("status"); try {
buffersRef.current.byId.set(statusId, { const resumeCtx: StreamRequestContext = {
kind: "status", conversationId: conversationIdRef.current,
id: statusId, resolvedConversationId: conversationIdRef.current,
lines: ["Conversation is busy, waiting and retrying…"], agentId: agentIdRef.current,
}); requestStartedAtMs,
buffersRef.current.order.push(statusId); };
refreshDerived(); debugLog(
"stream",
"Conversation busy: attempting run discovery for resume (conv=%s, agent=%s)",
resumeCtx.conversationId,
resumeCtx.agentId,
);
const client = await getClient();
const discoveredRunId = await discoverFallbackRunIdWithTimeout(
client,
resumeCtx,
);
debugLog(
"stream",
"Run discovery result: %s",
discoveredRunId ?? "none",
);
// Wait with abort checking (same pattern as LLM API error retry) if (discoveredRunId) {
let cancelled = false; if (signal?.aborted || userCancelledRef.current) {
const startTime = Date.now(); const isStaleAtAbort =
while (Date.now() - startTime < retryDelayMs) { myGeneration !== conversationGenerationRef.current;
if ( if (!isStaleAtAbort) {
abortControllerRef.current?.signal.aborted || setStreaming(false);
userCancelledRef.current }
) { return;
cancelled = true; }
break;
// Found a running run — resume its stream
buffersRef.current.interrupted = false;
buffersRef.current.commitGeneration =
(buffersRef.current.commitGeneration || 0) + 1;
const resumeStream = await client.runs.messages.stream(
discoveredRunId,
{
starting_after: 0,
batch_size: 1000,
},
);
preStreamResumeResult = await drainStream(
resumeStream,
buffersRef.current,
refreshDerivedThrottled,
signal,
undefined, // no handleFirstMessage on resume
undefined,
contextTrackerRef.current,
);
// Attach the discovered run ID
if (!preStreamResumeResult.lastRunId) {
preStreamResumeResult.lastRunId = discoveredRunId;
}
debugLog(
"stream",
"Pre-stream resume succeeded (runId=%s, stopReason=%s)",
discoveredRunId,
preStreamResumeResult.stopReason,
);
// Fall through — preStreamResumeResult will short-circuit drainStreamWithResume
} }
await new Promise((resolve) => setTimeout(resolve, 100)); } catch (resumeError) {
if (signal?.aborted || userCancelledRef.current) {
const isStaleAtAbort =
myGeneration !== conversationGenerationRef.current;
if (!isStaleAtAbort) {
setStreaming(false);
}
return;
}
debugLog(
"stream",
"Pre-stream resume failed, falling back to wait/retry: %s",
resumeError instanceof Error
? resumeError.message
: String(resumeError),
);
// Fall through to existing wait/retry behavior
} }
// Remove status message // If resume succeeded, skip the wait/retry loop
buffersRef.current.byId.delete(statusId); if (!preStreamResumeResult) {
buffersRef.current.order = buffersRef.current.order.filter( // Show status message
(id) => id !== statusId, const statusId = uid("status");
); buffersRef.current.byId.set(statusId, {
refreshDerived(); kind: "status",
id: statusId,
lines: ["Conversation is busy, waiting and retrying…"],
});
buffersRef.current.order.push(statusId);
refreshDerived();
if (!cancelled) { // Wait with abort checking (same pattern as LLM API error retry)
// Reset interrupted flag so retry stream chunks are processed let cancelled = false;
buffersRef.current.interrupted = false; const startTime = Date.now();
restorePinnedPermissionMode(); while (Date.now() - startTime < retryDelayMs) {
continue; if (
abortControllerRef.current?.signal.aborted ||
userCancelledRef.current
) {
cancelled = true;
break;
}
await new Promise((resolve) => setTimeout(resolve, 100));
}
// Remove status message
buffersRef.current.byId.delete(statusId);
buffersRef.current.order = buffersRef.current.order.filter(
(id) => id !== statusId,
);
refreshDerived();
if (!cancelled) {
// Reset interrupted flag so retry stream chunks are processed
buffersRef.current.interrupted = false;
restorePinnedPermissionMode();
continue;
}
} }
// User pressed ESC - fall through to error handling // User pressed ESC - fall through to error handling
} }
@@ -4297,7 +4406,10 @@ export default function App({
} }
// Not a recoverable desync - re-throw to outer catch // Not a recoverable desync - re-throw to outer catch
throw preStreamError; // (unless pre-stream resume already succeeded)
if (!preStreamResumeResult) {
throw preStreamError;
}
} }
// Check again after network call - user may have pressed Escape during sendMessageStream // Check again after network call - user may have pressed Escape during sendMessageStream
@@ -4403,6 +4515,25 @@ export default function App({
contextTrackerRef.current.currentTurnId++; contextTrackerRef.current.currentTurnId++;
} }
const drainResult = preStreamResumeResult
? preStreamResumeResult
: (() => {
if (!stream) {
throw new Error(
"Expected stream when pre-stream resume did not succeed",
);
}
return drainStreamWithResume(
stream,
buffersRef.current,
refreshDerivedThrottled,
signal, // Use captured signal, not ref (which may be nulled by handleInterrupt)
handleFirstMessage,
undefined,
contextTrackerRef.current,
);
})();
const { const {
stopReason, stopReason,
approval, approval,
@@ -4410,15 +4541,7 @@ export default function App({
apiDurationMs, apiDurationMs,
lastRunId, lastRunId,
fallbackError, fallbackError,
} = await drainStreamWithResume( } = await drainResult;
stream,
buffersRef.current,
refreshDerivedThrottled,
signal, // Use captured signal, not ref (which may be nulled by handleInterrupt)
handleFirstMessage,
undefined,
contextTrackerRef.current,
);
// Update currentRunId for error reporting in catch block // Update currentRunId for error reporting in catch block
currentRunId = lastRunId ?? undefined; currentRunId = lastRunId ?? undefined;

View File

@@ -16,7 +16,7 @@ import {
type StreamRequestContext, type StreamRequestContext,
} from "../../agent/message"; } from "../../agent/message";
import { telemetry } from "../../telemetry"; import { telemetry } from "../../telemetry";
import { debugWarn } from "../../utils/debug"; import { debugLog, debugWarn } from "../../utils/debug";
import { formatDuration, logTiming } from "../../utils/timing"; import { formatDuration, logTiming } from "../../utils/timing";
import { import {
@@ -57,7 +57,7 @@ export type DrainStreamHook = (
| undefined | undefined
| Promise<DrainStreamHookResult | undefined>; | Promise<DrainStreamHookResult | undefined>;
type DrainResult = { export type DrainResult = {
stopReason: StopReasonType; stopReason: StopReasonType;
lastRunId?: string | null; lastRunId?: string | null;
lastSeqId?: number | null; lastSeqId?: number | null;
@@ -101,7 +101,7 @@ function parseRunCreatedAtMs(run: Run): number {
return Number.isFinite(parsed) ? parsed : 0; return Number.isFinite(parsed) ? parsed : 0;
} }
async function discoverFallbackRunIdWithTimeout( export async function discoverFallbackRunIdWithTimeout(
client: RunsListClient, client: RunsListClient,
ctx: StreamRequestContext, ctx: StreamRequestContext,
): Promise<string | null> { ): Promise<string | null> {
@@ -512,6 +512,9 @@ export async function drainStreamWithResume(
); );
let runIdToResume = result.lastRunId ?? null; let runIdToResume = result.lastRunId ?? null;
let runIdSource: "stream_chunk" | "discovery" | null = result.lastRunId
? "stream_chunk"
: null;
// If the stream failed before exposing run_id, try to discover the latest // If the stream failed before exposing run_id, try to discover the latest
// running/created run for this conversation that was created after send start. // running/created run for this conversation that was created after send start.
@@ -523,13 +526,25 @@ export async function drainStreamWithResume(
!abortSignal.aborted !abortSignal.aborted
) { ) {
try { try {
debugLog(
"stream",
"Mid-stream resume: attempting run discovery (conv=%s, agent=%s)",
streamRequestContext.conversationId,
streamRequestContext.agentId,
);
const client = await lazyClient(); const client = await lazyClient();
runIdToResume = await discoverFallbackRunIdWithTimeout( runIdToResume = await discoverFallbackRunIdWithTimeout(
client, client,
streamRequestContext, streamRequestContext,
); );
debugLog(
"stream",
"Mid-stream resume: run discovery result: %s",
runIdToResume ?? "none",
);
if (runIdToResume) { if (runIdToResume) {
result.lastRunId = runIdToResume; result.lastRunId = runIdToResume;
runIdSource = "discovery";
} }
} catch (lookupError) { } catch (lookupError) {
const lookupErrorMsg = const lookupErrorMsg =
@@ -574,6 +589,21 @@ export async function drainStreamWithResume(
}, },
); );
debugLog(
"stream",
"Mid-stream resume: fetching run stream (source=%s, runId=%s, lastSeqId=%s)",
runIdSource ?? "unknown",
runIdToResume,
result.lastSeqId ?? 0,
);
debugLog(
"stream",
"Mid-stream resume: attempting resume (runId=%s, lastSeqId=%s)",
runIdToResume,
result.lastSeqId ?? 0,
);
try { try {
const client = await lazyClient(); const client = await lazyClient();
@@ -613,6 +643,12 @@ export async function drainStreamWithResume(
// Use the resume result (should have proper stop_reason now) // Use the resume result (should have proper stop_reason now)
// Clear the original stream error since we recovered // Clear the original stream error since we recovered
debugLog(
"stream",
"Mid-stream resume succeeded (runId=%s, stopReason=%s)",
runIdToResume,
resumeResult.stopReason,
);
result = resumeResult; result = resumeResult;
// The resumed stream uses a fresh streamProcessor that won't have // The resumed stream uses a fresh streamProcessor that won't have
@@ -635,6 +671,12 @@ export async function drainStreamWithResume(
resumeError instanceof Error resumeError instanceof Error
? resumeError.message ? resumeError.message
: String(resumeError); : String(resumeError);
debugLog(
"stream",
"Mid-stream resume failed (runId=%s): %s",
runIdToResume,
resumeErrorMsg,
);
telemetry.trackError( telemetry.trackError(
"stream_resume_failed", "stream_resume_failed",
resumeErrorMsg, resumeErrorMsg,
@@ -655,6 +697,11 @@ export async function drainStreamWithResume(
// Only log if we actually skipped for a reason (i.e., we didn't enter the resume branch above) // Only log if we actually skipped for a reason (i.e., we didn't enter the resume branch above)
if (skipReasons.length > 0) { if (skipReasons.length > 0) {
debugLog(
"stream",
"Mid-stream resume skipped: %s",
skipReasons.join(", "),
);
telemetry.trackError( telemetry.trackError(
"stream_resume_skipped", "stream_resume_skipped",
`${result.fallbackError || "Stream error (no client-side detail)"} [skip: ${skipReasons.join(", ")}]`, `${result.fallbackError || "Stream error (no client-side detail)"} [skip: ${skipReasons.join(", ")}]`,

View File

@@ -1610,6 +1610,11 @@ ${SYSTEM_REMINDER_CLOSE}
} }
// Check for 409 "conversation busy" error - retry once with delay // Check for 409 "conversation busy" error - retry once with delay
// TODO: Add pre-stream resume logic for parity with App.tsx.
// Before waiting, attempt to discover the in-flight run via
// discoverFallbackRunIdWithTimeout() and resume its stream with
// client.runs.messages.stream() + drainStream(). See App.tsx
// retry_conversation_busy handler for reference implementation.
if (preStreamAction === "retry_conversation_busy") { if (preStreamAction === "retry_conversation_busy") {
conversationBusyRetries += 1; conversationBusyRetries += 1;
const retryDelayMs = getRetryDelayMs({ const retryDelayMs = getRetryDelayMs({

View File

@@ -2216,6 +2216,13 @@ async function sendMessageStreamWithRetry(
} }
if (action === "retry_conversation_busy") { if (action === "retry_conversation_busy") {
// TODO: Add pre-stream resume logic for parity with App.tsx.
// Before waiting, attempt to discover the in-flight run via
// discoverFallbackRunIdWithTimeout() and resume its stream with
// client.runs.messages.stream() + drainStream(). This avoids
// blind wait/retry cycles when the server already created a run
// from the original request. See App.tsx retry_conversation_busy
// handler for reference implementation.
const attempt = conversationBusyRetries + 1; const attempt = conversationBusyRetries + 1;
const delayMs = getRetryDelayMs({ const delayMs = getRetryDelayMs({
category: "conversation_busy", category: "conversation_busy",