feat: resume server-side run on pre-stream 409 conversation busy (#1370)
Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
215
src/cli/App.tsx
215
src/cli/App.tsx
@@ -52,7 +52,11 @@ import {
|
||||
ensureMemoryFilesystemDirs,
|
||||
getMemoryFilesystemRoot,
|
||||
} from "../agent/memoryFilesystem";
|
||||
import { getStreamToolContextId, sendMessageStream } from "../agent/message";
|
||||
import {
|
||||
getStreamToolContextId,
|
||||
type StreamRequestContext,
|
||||
sendMessageStream,
|
||||
} from "../agent/message";
|
||||
import {
|
||||
getModelInfo,
|
||||
getModelInfoForLlmConfig,
|
||||
@@ -268,7 +272,13 @@ import {
|
||||
import { formatStatusLineHelp } from "./helpers/statusLineHelp";
|
||||
import { buildStatusLinePayload } from "./helpers/statusLinePayload";
|
||||
import { executeStatusLineCommand } from "./helpers/statusLineRuntime";
|
||||
import { type ApprovalRequest, drainStreamWithResume } from "./helpers/stream";
|
||||
import {
|
||||
type ApprovalRequest,
|
||||
type DrainResult,
|
||||
discoverFallbackRunIdWithTimeout,
|
||||
drainStream,
|
||||
drainStreamWithResume,
|
||||
} from "./helpers/stream";
|
||||
import {
|
||||
collectFinishedTaskToolCalls,
|
||||
createSubagentGroupItem,
|
||||
@@ -3942,6 +3952,10 @@ export default function App({
|
||||
clearCompletedSubagents();
|
||||
}
|
||||
|
||||
// Capture once before the retry loop so the temporal filter in
|
||||
// discoverFallbackRunIdForResume covers runs created by any attempt.
|
||||
const requestStartedAtMs = Date.now();
|
||||
|
||||
while (true) {
|
||||
// Capture the signal BEFORE any async operations
|
||||
// This prevents a race where handleInterrupt nulls the ref during await
|
||||
@@ -3985,15 +3999,18 @@ export default function App({
|
||||
// Wrap in try-catch to handle pre-stream desync errors (when sendMessageStream
|
||||
// throws before streaming begins, e.g., retry after LLM error when backend
|
||||
// already cleared the approval)
|
||||
let stream: Awaited<ReturnType<typeof sendMessageStream>>;
|
||||
let stream: Awaited<ReturnType<typeof sendMessageStream>> | null =
|
||||
null;
|
||||
let turnToolContextId: string | null = null;
|
||||
let preStreamResumeResult: DrainResult | null = null;
|
||||
try {
|
||||
stream = await sendMessageStream(
|
||||
const nextStream = await sendMessageStream(
|
||||
conversationIdRef.current,
|
||||
currentInput,
|
||||
{ agentId: agentIdRef.current },
|
||||
);
|
||||
turnToolContextId = getStreamToolContextId(stream);
|
||||
stream = nextStream;
|
||||
turnToolContextId = getStreamToolContextId(nextStream);
|
||||
} catch (preStreamError) {
|
||||
debugLog(
|
||||
"stream",
|
||||
@@ -4082,42 +4099,134 @@ export default function App({
|
||||
},
|
||||
);
|
||||
|
||||
// Show status message
|
||||
const statusId = uid("status");
|
||||
buffersRef.current.byId.set(statusId, {
|
||||
kind: "status",
|
||||
id: statusId,
|
||||
lines: ["Conversation is busy, waiting and retrying…"],
|
||||
});
|
||||
buffersRef.current.order.push(statusId);
|
||||
refreshDerived();
|
||||
// Attempt to discover and resume the in-flight run before waiting
|
||||
try {
|
||||
const resumeCtx: StreamRequestContext = {
|
||||
conversationId: conversationIdRef.current,
|
||||
resolvedConversationId: conversationIdRef.current,
|
||||
agentId: agentIdRef.current,
|
||||
requestStartedAtMs,
|
||||
};
|
||||
debugLog(
|
||||
"stream",
|
||||
"Conversation busy: attempting run discovery for resume (conv=%s, agent=%s)",
|
||||
resumeCtx.conversationId,
|
||||
resumeCtx.agentId,
|
||||
);
|
||||
const client = await getClient();
|
||||
const discoveredRunId = await discoverFallbackRunIdWithTimeout(
|
||||
client,
|
||||
resumeCtx,
|
||||
);
|
||||
debugLog(
|
||||
"stream",
|
||||
"Run discovery result: %s",
|
||||
discoveredRunId ?? "none",
|
||||
);
|
||||
|
||||
// Wait with abort checking (same pattern as LLM API error retry)
|
||||
let cancelled = false;
|
||||
const startTime = Date.now();
|
||||
while (Date.now() - startTime < retryDelayMs) {
|
||||
if (
|
||||
abortControllerRef.current?.signal.aborted ||
|
||||
userCancelledRef.current
|
||||
) {
|
||||
cancelled = true;
|
||||
break;
|
||||
if (discoveredRunId) {
|
||||
if (signal?.aborted || userCancelledRef.current) {
|
||||
const isStaleAtAbort =
|
||||
myGeneration !== conversationGenerationRef.current;
|
||||
if (!isStaleAtAbort) {
|
||||
setStreaming(false);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Found a running run — resume its stream
|
||||
buffersRef.current.interrupted = false;
|
||||
buffersRef.current.commitGeneration =
|
||||
(buffersRef.current.commitGeneration || 0) + 1;
|
||||
|
||||
const resumeStream = await client.runs.messages.stream(
|
||||
discoveredRunId,
|
||||
{
|
||||
starting_after: 0,
|
||||
batch_size: 1000,
|
||||
},
|
||||
);
|
||||
|
||||
preStreamResumeResult = await drainStream(
|
||||
resumeStream,
|
||||
buffersRef.current,
|
||||
refreshDerivedThrottled,
|
||||
signal,
|
||||
undefined, // no handleFirstMessage on resume
|
||||
undefined,
|
||||
contextTrackerRef.current,
|
||||
);
|
||||
// Attach the discovered run ID
|
||||
if (!preStreamResumeResult.lastRunId) {
|
||||
preStreamResumeResult.lastRunId = discoveredRunId;
|
||||
}
|
||||
debugLog(
|
||||
"stream",
|
||||
"Pre-stream resume succeeded (runId=%s, stopReason=%s)",
|
||||
discoveredRunId,
|
||||
preStreamResumeResult.stopReason,
|
||||
);
|
||||
// Fall through — preStreamResumeResult will short-circuit drainStreamWithResume
|
||||
}
|
||||
await new Promise((resolve) => setTimeout(resolve, 100));
|
||||
} catch (resumeError) {
|
||||
if (signal?.aborted || userCancelledRef.current) {
|
||||
const isStaleAtAbort =
|
||||
myGeneration !== conversationGenerationRef.current;
|
||||
if (!isStaleAtAbort) {
|
||||
setStreaming(false);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
debugLog(
|
||||
"stream",
|
||||
"Pre-stream resume failed, falling back to wait/retry: %s",
|
||||
resumeError instanceof Error
|
||||
? resumeError.message
|
||||
: String(resumeError),
|
||||
);
|
||||
// Fall through to existing wait/retry behavior
|
||||
}
|
||||
|
||||
// Remove status message
|
||||
buffersRef.current.byId.delete(statusId);
|
||||
buffersRef.current.order = buffersRef.current.order.filter(
|
||||
(id) => id !== statusId,
|
||||
);
|
||||
refreshDerived();
|
||||
// If resume succeeded, skip the wait/retry loop
|
||||
if (!preStreamResumeResult) {
|
||||
// Show status message
|
||||
const statusId = uid("status");
|
||||
buffersRef.current.byId.set(statusId, {
|
||||
kind: "status",
|
||||
id: statusId,
|
||||
lines: ["Conversation is busy, waiting and retrying…"],
|
||||
});
|
||||
buffersRef.current.order.push(statusId);
|
||||
refreshDerived();
|
||||
|
||||
if (!cancelled) {
|
||||
// Reset interrupted flag so retry stream chunks are processed
|
||||
buffersRef.current.interrupted = false;
|
||||
restorePinnedPermissionMode();
|
||||
continue;
|
||||
// Wait with abort checking (same pattern as LLM API error retry)
|
||||
let cancelled = false;
|
||||
const startTime = Date.now();
|
||||
while (Date.now() - startTime < retryDelayMs) {
|
||||
if (
|
||||
abortControllerRef.current?.signal.aborted ||
|
||||
userCancelledRef.current
|
||||
) {
|
||||
cancelled = true;
|
||||
break;
|
||||
}
|
||||
await new Promise((resolve) => setTimeout(resolve, 100));
|
||||
}
|
||||
|
||||
// Remove status message
|
||||
buffersRef.current.byId.delete(statusId);
|
||||
buffersRef.current.order = buffersRef.current.order.filter(
|
||||
(id) => id !== statusId,
|
||||
);
|
||||
refreshDerived();
|
||||
|
||||
if (!cancelled) {
|
||||
// Reset interrupted flag so retry stream chunks are processed
|
||||
buffersRef.current.interrupted = false;
|
||||
restorePinnedPermissionMode();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// User pressed ESC - fall through to error handling
|
||||
}
|
||||
@@ -4297,7 +4406,10 @@ export default function App({
|
||||
}
|
||||
|
||||
// Not a recoverable desync - re-throw to outer catch
|
||||
throw preStreamError;
|
||||
// (unless pre-stream resume already succeeded)
|
||||
if (!preStreamResumeResult) {
|
||||
throw preStreamError;
|
||||
}
|
||||
}
|
||||
|
||||
// Check again after network call - user may have pressed Escape during sendMessageStream
|
||||
@@ -4403,6 +4515,25 @@ export default function App({
|
||||
contextTrackerRef.current.currentTurnId++;
|
||||
}
|
||||
|
||||
const drainResult = preStreamResumeResult
|
||||
? preStreamResumeResult
|
||||
: (() => {
|
||||
if (!stream) {
|
||||
throw new Error(
|
||||
"Expected stream when pre-stream resume did not succeed",
|
||||
);
|
||||
}
|
||||
return drainStreamWithResume(
|
||||
stream,
|
||||
buffersRef.current,
|
||||
refreshDerivedThrottled,
|
||||
signal, // Use captured signal, not ref (which may be nulled by handleInterrupt)
|
||||
handleFirstMessage,
|
||||
undefined,
|
||||
contextTrackerRef.current,
|
||||
);
|
||||
})();
|
||||
|
||||
const {
|
||||
stopReason,
|
||||
approval,
|
||||
@@ -4410,15 +4541,7 @@ export default function App({
|
||||
apiDurationMs,
|
||||
lastRunId,
|
||||
fallbackError,
|
||||
} = await drainStreamWithResume(
|
||||
stream,
|
||||
buffersRef.current,
|
||||
refreshDerivedThrottled,
|
||||
signal, // Use captured signal, not ref (which may be nulled by handleInterrupt)
|
||||
handleFirstMessage,
|
||||
undefined,
|
||||
contextTrackerRef.current,
|
||||
);
|
||||
} = await drainResult;
|
||||
|
||||
// Update currentRunId for error reporting in catch block
|
||||
currentRunId = lastRunId ?? undefined;
|
||||
|
||||
@@ -16,7 +16,7 @@ import {
|
||||
type StreamRequestContext,
|
||||
} from "../../agent/message";
|
||||
import { telemetry } from "../../telemetry";
|
||||
import { debugWarn } from "../../utils/debug";
|
||||
import { debugLog, debugWarn } from "../../utils/debug";
|
||||
import { formatDuration, logTiming } from "../../utils/timing";
|
||||
|
||||
import {
|
||||
@@ -57,7 +57,7 @@ export type DrainStreamHook = (
|
||||
| undefined
|
||||
| Promise<DrainStreamHookResult | undefined>;
|
||||
|
||||
type DrainResult = {
|
||||
export type DrainResult = {
|
||||
stopReason: StopReasonType;
|
||||
lastRunId?: string | null;
|
||||
lastSeqId?: number | null;
|
||||
@@ -101,7 +101,7 @@ function parseRunCreatedAtMs(run: Run): number {
|
||||
return Number.isFinite(parsed) ? parsed : 0;
|
||||
}
|
||||
|
||||
async function discoverFallbackRunIdWithTimeout(
|
||||
export async function discoverFallbackRunIdWithTimeout(
|
||||
client: RunsListClient,
|
||||
ctx: StreamRequestContext,
|
||||
): Promise<string | null> {
|
||||
@@ -512,6 +512,9 @@ export async function drainStreamWithResume(
|
||||
);
|
||||
|
||||
let runIdToResume = result.lastRunId ?? null;
|
||||
let runIdSource: "stream_chunk" | "discovery" | null = result.lastRunId
|
||||
? "stream_chunk"
|
||||
: null;
|
||||
|
||||
// If the stream failed before exposing run_id, try to discover the latest
|
||||
// running/created run for this conversation that was created after send start.
|
||||
@@ -523,13 +526,25 @@ export async function drainStreamWithResume(
|
||||
!abortSignal.aborted
|
||||
) {
|
||||
try {
|
||||
debugLog(
|
||||
"stream",
|
||||
"Mid-stream resume: attempting run discovery (conv=%s, agent=%s)",
|
||||
streamRequestContext.conversationId,
|
||||
streamRequestContext.agentId,
|
||||
);
|
||||
const client = await lazyClient();
|
||||
runIdToResume = await discoverFallbackRunIdWithTimeout(
|
||||
client,
|
||||
streamRequestContext,
|
||||
);
|
||||
debugLog(
|
||||
"stream",
|
||||
"Mid-stream resume: run discovery result: %s",
|
||||
runIdToResume ?? "none",
|
||||
);
|
||||
if (runIdToResume) {
|
||||
result.lastRunId = runIdToResume;
|
||||
runIdSource = "discovery";
|
||||
}
|
||||
} catch (lookupError) {
|
||||
const lookupErrorMsg =
|
||||
@@ -574,6 +589,21 @@ export async function drainStreamWithResume(
|
||||
},
|
||||
);
|
||||
|
||||
debugLog(
|
||||
"stream",
|
||||
"Mid-stream resume: fetching run stream (source=%s, runId=%s, lastSeqId=%s)",
|
||||
runIdSource ?? "unknown",
|
||||
runIdToResume,
|
||||
result.lastSeqId ?? 0,
|
||||
);
|
||||
|
||||
debugLog(
|
||||
"stream",
|
||||
"Mid-stream resume: attempting resume (runId=%s, lastSeqId=%s)",
|
||||
runIdToResume,
|
||||
result.lastSeqId ?? 0,
|
||||
);
|
||||
|
||||
try {
|
||||
const client = await lazyClient();
|
||||
|
||||
@@ -613,6 +643,12 @@ export async function drainStreamWithResume(
|
||||
|
||||
// Use the resume result (should have proper stop_reason now)
|
||||
// Clear the original stream error since we recovered
|
||||
debugLog(
|
||||
"stream",
|
||||
"Mid-stream resume succeeded (runId=%s, stopReason=%s)",
|
||||
runIdToResume,
|
||||
resumeResult.stopReason,
|
||||
);
|
||||
result = resumeResult;
|
||||
|
||||
// The resumed stream uses a fresh streamProcessor that won't have
|
||||
@@ -635,6 +671,12 @@ export async function drainStreamWithResume(
|
||||
resumeError instanceof Error
|
||||
? resumeError.message
|
||||
: String(resumeError);
|
||||
debugLog(
|
||||
"stream",
|
||||
"Mid-stream resume failed (runId=%s): %s",
|
||||
runIdToResume,
|
||||
resumeErrorMsg,
|
||||
);
|
||||
telemetry.trackError(
|
||||
"stream_resume_failed",
|
||||
resumeErrorMsg,
|
||||
@@ -655,6 +697,11 @@ export async function drainStreamWithResume(
|
||||
|
||||
// Only log if we actually skipped for a reason (i.e., we didn't enter the resume branch above)
|
||||
if (skipReasons.length > 0) {
|
||||
debugLog(
|
||||
"stream",
|
||||
"Mid-stream resume skipped: %s",
|
||||
skipReasons.join(", "),
|
||||
);
|
||||
telemetry.trackError(
|
||||
"stream_resume_skipped",
|
||||
`${result.fallbackError || "Stream error (no client-side detail)"} [skip: ${skipReasons.join(", ")}]`,
|
||||
|
||||
@@ -1610,6 +1610,11 @@ ${SYSTEM_REMINDER_CLOSE}
|
||||
}
|
||||
|
||||
// Check for 409 "conversation busy" error - retry once with delay
|
||||
// TODO: Add pre-stream resume logic for parity with App.tsx.
|
||||
// Before waiting, attempt to discover the in-flight run via
|
||||
// discoverFallbackRunIdWithTimeout() and resume its stream with
|
||||
// client.runs.messages.stream() + drainStream(). See App.tsx
|
||||
// retry_conversation_busy handler for reference implementation.
|
||||
if (preStreamAction === "retry_conversation_busy") {
|
||||
conversationBusyRetries += 1;
|
||||
const retryDelayMs = getRetryDelayMs({
|
||||
|
||||
@@ -2216,6 +2216,13 @@ async function sendMessageStreamWithRetry(
|
||||
}
|
||||
|
||||
if (action === "retry_conversation_busy") {
|
||||
// TODO: Add pre-stream resume logic for parity with App.tsx.
|
||||
// Before waiting, attempt to discover the in-flight run via
|
||||
// discoverFallbackRunIdWithTimeout() and resume its stream with
|
||||
// client.runs.messages.stream() + drainStream(). This avoids
|
||||
// blind wait/retry cycles when the server already created a run
|
||||
// from the original request. See App.tsx retry_conversation_busy
|
||||
// handler for reference implementation.
|
||||
const attempt = conversationBusyRetries + 1;
|
||||
const delayMs = getRetryDelayMs({
|
||||
category: "conversation_busy",
|
||||
|
||||
Reference in New Issue
Block a user