fix: recover run_id for stream resume after early disconnect (#1212)
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -17,6 +17,13 @@ import { getClient } from "./client";
|
||||
|
||||
/**
 * Start-of-request `performance.now()` readings, keyed by stream object.
 * An entry exists only when a start time was captured for that stream.
 */
const streamRequestStartTimes = new WeakMap<object, number>();

/** Tool context ids, keyed by the stream object they were issued for. */
const streamToolContextIds = new WeakMap<object, string>();

/**
 * Request metadata remembered for a stream so that a run can still be looked
 * up if the stream fails before it yields any run_id-bearing chunk.
 */
export type StreamRequestContext = {
  /** Conversation id the message was sent to; `"default"` denotes the default conversation. */
  conversationId: string;
  /** Conversation id the request was routed through (typically the agent id for the default conversation). */
  resolvedConversationId: string;
  /** Agent id supplied with the request, or `null` when none was given. */
  agentId: string | null;
  /** Wall-clock time (`Date.now()`, milliseconds since epoch) captured when the send request began. */
  requestStartedAtMs: number;
};

/** Request contexts keyed by stream object. */
const streamRequestContexts = new WeakMap<object, StreamRequestContext>();
|
||||
|
||||
export function getStreamRequestStartTime(
|
||||
stream: Stream<LettaStreamingResponse>,
|
||||
@@ -30,6 +37,12 @@ export function getStreamToolContextId(
|
||||
return streamToolContextIds.get(stream as object) ?? null;
|
||||
}
|
||||
|
||||
/**
 * Look up the request context that was recorded for a stream.
 *
 * @param stream - Stream whose recorded context should be returned.
 * @returns The stored context, or `undefined` when nothing was recorded for
 *   this stream object.
 */
export function getStreamRequestContext(
  stream: Stream<LettaStreamingResponse>,
): StreamRequestContext | undefined {
  return streamRequestContexts.get(stream as object);
}
|
||||
|
||||
/**
|
||||
* Send a message to a conversation and return a streaming response.
|
||||
* Uses the conversations API for all conversations.
|
||||
@@ -52,6 +65,7 @@ export async function sendMessageStream(
|
||||
requestOptions: { maxRetries?: number } = { maxRetries: 0 },
|
||||
): Promise<Stream<LettaStreamingResponse>> {
|
||||
const requestStartTime = isTimingsEnabled() ? performance.now() : undefined;
|
||||
const requestStartedAtMs = Date.now();
|
||||
const client = await getClient();
|
||||
|
||||
// Wait for any in-progress toolset switch to complete before reading tools
|
||||
@@ -93,6 +107,12 @@ export async function sendMessageStream(
|
||||
streamRequestStartTimes.set(stream as object, requestStartTime);
|
||||
}
|
||||
streamToolContextIds.set(stream as object, contextId);
|
||||
streamRequestContexts.set(stream as object, {
|
||||
conversationId,
|
||||
resolvedConversationId,
|
||||
agentId: opts.agentId ?? null,
|
||||
requestStartedAtMs,
|
||||
});
|
||||
|
||||
return stream;
|
||||
}
|
||||
|
||||
@@ -1,13 +1,20 @@
|
||||
import { APIError } from "@letta-ai/letta-client/core/error";
|
||||
import type { Stream } from "@letta-ai/letta-client/core/streaming";
|
||||
import type { LettaStreamingResponse } from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import type {
|
||||
LettaStreamingResponse,
|
||||
Run,
|
||||
} from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
|
||||
import {
|
||||
clearLastSDKDiagnostic,
|
||||
consumeLastSDKDiagnostic,
|
||||
getClient,
|
||||
} from "../../agent/client";
|
||||
import { getStreamRequestStartTime } from "../../agent/message";
|
||||
import {
|
||||
getStreamRequestContext,
|
||||
getStreamRequestStartTime,
|
||||
type StreamRequestContext,
|
||||
} from "../../agent/message";
|
||||
import { telemetry } from "../../telemetry";
|
||||
import { debugWarn } from "../../utils/debug";
|
||||
import { formatDuration, logTiming } from "../../utils/timing";
|
||||
@@ -60,6 +67,143 @@ type DrainResult = {
|
||||
fallbackError?: string | null; // Error message for when we can't fetch details from server (no run_id)
|
||||
};
|
||||
|
||||
/**
 * Shapes that `runs.list` may resolve to: a plain array of runs, or a
 * page-like object exposing its items through `getPaginatedItems()`.
 */
type RunsListResponse =
  | Run[]
  | {
      getPaginatedItems?: () => Run[];
    };

/**
 * Minimal slice of the API client needed for fallback run discovery.
 * Declared structurally so tests can supply a lightweight stand-in.
 */
type RunsListClient = {
  runs: {
    list: (query: {
      conversation_id?: string | null;
      agent_id?: string | null;
      statuses?: string[] | null;
      order?: string | null;
      limit?: number | null;
    }) => Promise<RunsListResponse>;
  };
};

/** Upper bound, in milliseconds, on fallback run discovery before it is abandoned. */
const FALLBACK_RUN_DISCOVERY_TIMEOUT_MS = 5000;
|
||||
|
||||
/**
 * Type guard: true when the list response is a non-array object that carries
 * a callable `getPaginatedItems` member.
 *
 * @param response - Value resolved by `runs.list`.
 * @returns Whether `response.getPaginatedItems()` can be invoked.
 */
function hasPaginatedItems(
  response: RunsListResponse,
): response is { getPaginatedItems: () => Run[] } {
  return (
    !Array.isArray(response) && typeof response.getPaginatedItems === "function"
  );
}
|
||||
|
||||
/**
 * Convert a run's `created_at` timestamp to epoch milliseconds.
 *
 * @param run - Run whose creation time should be parsed.
 * @returns The parsed time in milliseconds, or `0` when `created_at` is
 *   missing/empty or cannot be parsed by `Date.parse`.
 */
function parseRunCreatedAtMs(run: Run): number {
  if (!run.created_at) return 0;
  const parsed = Date.parse(run.created_at);
  // Date.parse yields NaN for unparseable input; treat that as "unknown" (0).
  return Number.isFinite(parsed) ? parsed : 0;
}
|
||||
|
||||
/**
 * Run fallback run discovery, giving up after
 * `FALLBACK_RUN_DISCOVERY_TIMEOUT_MS` milliseconds.
 *
 * @param client - Client used to list runs.
 * @param ctx - Request context recorded when the message was sent.
 * @returns The discovered run id, or `null` when no suitable run was found.
 * @throws Error when discovery does not settle within the timeout, or
 *   whatever error the underlying discovery rejects with.
 */
async function discoverFallbackRunIdWithTimeout(
  client: RunsListClient,
  ctx: StreamRequestContext,
): Promise<string | null> {
  return withTimeout(
    discoverFallbackRunIdForResume(client, ctx),
    FALLBACK_RUN_DISCOVERY_TIMEOUT_MS,
    `Fallback run discovery timed out after ${FALLBACK_RUN_DISCOVERY_TIMEOUT_MS}ms`,
  );
}
|
||||
|
||||
/**
 * Settle with the outcome of `promise`, unless `timeoutMs` elapses first, in
 * which case reject with `new Error(timeoutMessage)`.
 *
 * The timer is cleared as soon as `promise` settles so it cannot fire later.
 * The wrapped promise itself is not cancelled on timeout; its eventual
 * outcome is simply ignored.
 *
 * @param promise - Operation to await.
 * @param timeoutMs - Maximum time to wait, in milliseconds.
 * @param timeoutMessage - Message of the error used on timeout.
 * @returns A promise mirroring `promise`, or rejecting on timeout.
 */
function withTimeout<T>(
  promise: Promise<T>,
  timeoutMs: number,
  timeoutMessage: string,
): Promise<T> {
  return new Promise<T>((resolve, reject) => {
    const timer = setTimeout(
      () => reject(new Error(timeoutMessage)),
      timeoutMs,
    );
    promise.then(
      (value) => {
        clearTimeout(timer);
        resolve(value);
      },
      (error) => {
        clearTimeout(timer);
        reject(error);
      },
    );
  });
}
|
||||
|
||||
/**
 * Normalize a `runs.list` response to a plain array of runs.
 *
 * @param listResponse - Array of runs, or a page-like object.
 * @returns The array itself, the result of `getPaginatedItems()` (or `[]`
 *   if that returns a nullish value), or `[]` for any other shape.
 */
function toRunsArray(listResponse: RunsListResponse): Run[] {
  if (Array.isArray(listResponse)) return listResponse;
  if (hasPaginatedItems(listResponse)) {
    return listResponse.getPaginatedItems() ?? [];
  }
  return [];
}
|
||||
|
||||
/**
 * Attempt to discover a run ID to resume when the initial stream failed before
 * any run_id-bearing chunk arrived.
 *
 * Lookups are tried in order and the first one yielding a match wins:
 *   1. the conversation (for the `"default"` conversation, the resolved
 *      conversation id; otherwise the explicit id, then the resolved id when
 *      it differs),
 *   2. the agent, when an agent id is known.
 *
 * Each lookup asks for the single most recent `running` run and accepts it
 * only if it was created at or after `ctx.requestStartedAtMs`.
 *
 * A lookup that throws does not prevent the remaining lookups from being
 * tried; the first error is rethrown only when no lookup produced a run.
 *
 * @param client - Client used to list runs.
 * @param ctx - Request context recorded when the message was sent.
 * @returns The id of the run to resume, or `null` when none qualifies.
 * @throws The first lookup error, when at least one lookup failed and no
 *   lookup produced a candidate.
 */
export async function discoverFallbackRunIdForResume(
  client: RunsListClient,
  ctx: StreamRequestContext,
): Promise<string | null> {
  type LookupQuery = {
    conversation_id?: string | null;
    agent_id?: string | null;
  };

  const statuses = ["running"];
  const requestStartedAtMs = ctx.requestStartedAtMs;

  const listCandidates = async (query: LookupQuery): Promise<Run[]> => {
    const response = await client.runs.list({
      ...query,
      statuses,
      order: "desc",
      limit: 1,
    });
    return toRunsArray(response).filter((run) => {
      if (!run.id) return false;
      if (run.status !== "running") return false;
      // Best-effort temporal filter: only consider runs created after
      // this send request started. In rare concurrent-send races within
      // the same conversation, this heuristic can still pick a neighbor run.
      return parseRunCreatedAtMs(run) >= requestStartedAtMs;
    });
  };

  const lookupQueries: LookupQuery[] = [];

  if (ctx.conversationId === "default") {
    // Default conversation routes through resolvedConversationId (typically agent ID).
    lookupQueries.push({ conversation_id: ctx.resolvedConversationId });
  } else {
    // Named conversation: first use the explicit conversation id.
    lookupQueries.push({ conversation_id: ctx.conversationId });

    // Keep resolved route as backup only when it differs.
    if (ctx.resolvedConversationId !== ctx.conversationId) {
      lookupQueries.push({ conversation_id: ctx.resolvedConversationId });
    }
  }

  if (ctx.agentId) {
    lookupQueries.push({ agent_id: ctx.agentId });
  }

  // Remember only the first failure so the caller still learns why discovery
  // failed, without letting one bad lookup short-circuit the fallbacks.
  let lookupFailed = false;
  let firstLookupError: unknown;

  for (const query of lookupQueries) {
    try {
      const candidates = await listCandidates(query);
      if (candidates[0]?.id) return candidates[0].id;
    } catch (error) {
      if (!lookupFailed) {
        lookupFailed = true;
        firstLookupError = error;
      }
    }
  }

  if (lookupFailed) throw firstLookupError;

  return null;
}
|
||||
|
||||
export async function drainStream(
|
||||
stream: Stream<LettaStreamingResponse>,
|
||||
buffers: ReturnType<typeof createBuffers>,
|
||||
@@ -346,6 +490,15 @@ export async function drainStreamWithResume(
|
||||
contextTracker?: ContextTracker,
|
||||
): Promise<DrainResult> {
|
||||
const overallStartTime = performance.now();
|
||||
const streamRequestContext = getStreamRequestContext(stream);
|
||||
|
||||
let _client: Awaited<ReturnType<typeof getClient>> | undefined;
|
||||
const lazyClient = async () => {
|
||||
if (!_client) {
|
||||
_client = await getClient();
|
||||
}
|
||||
return _client;
|
||||
};
|
||||
|
||||
// Attempt initial drain
|
||||
let result = await drainStream(
|
||||
@@ -358,12 +511,51 @@ export async function drainStreamWithResume(
|
||||
contextTracker,
|
||||
);
|
||||
|
||||
let runIdToResume = result.lastRunId ?? null;
|
||||
|
||||
// If the stream failed before exposing run_id, try to discover the latest
|
||||
// running run for this conversation that was created after send start.
|
||||
if (
|
||||
result.stopReason === "error" &&
|
||||
!runIdToResume &&
|
||||
streamRequestContext &&
|
||||
abortSignal &&
|
||||
!abortSignal.aborted
|
||||
) {
|
||||
try {
|
||||
const client = await lazyClient();
|
||||
runIdToResume = await discoverFallbackRunIdWithTimeout(
|
||||
client,
|
||||
streamRequestContext,
|
||||
);
|
||||
if (runIdToResume) {
|
||||
result.lastRunId = runIdToResume;
|
||||
}
|
||||
} catch (lookupError) {
|
||||
const lookupErrorMsg =
|
||||
lookupError instanceof Error
|
||||
? lookupError.message
|
||||
: String(lookupError);
|
||||
telemetry.trackError(
|
||||
"stream_resume_lookup_failed",
|
||||
lookupErrorMsg,
|
||||
"stream_resume",
|
||||
);
|
||||
|
||||
debugWarn(
|
||||
"drainStreamWithResume",
|
||||
"Fallback run_id lookup failed:",
|
||||
lookupError,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// If stream ended without proper stop_reason and we have resume info, try once to reconnect
|
||||
// Only resume if we have an abortSignal AND it's not aborted (explicit check prevents
|
||||
// undefined abortSignal from accidentally allowing resume after user cancellation)
|
||||
if (
|
||||
result.stopReason === "error" &&
|
||||
result.lastRunId &&
|
||||
runIdToResume &&
|
||||
abortSignal &&
|
||||
!abortSignal.aborted
|
||||
) {
|
||||
@@ -378,12 +570,12 @@ export async function drainStreamWithResume(
|
||||
originalFallbackError || "Stream error (no client-side detail)",
|
||||
"stream_resume",
|
||||
{
|
||||
runId: result.lastRunId,
|
||||
runId: result.lastRunId ?? undefined,
|
||||
},
|
||||
);
|
||||
|
||||
try {
|
||||
const client = await getClient();
|
||||
const client = await lazyClient();
|
||||
|
||||
// Reset interrupted flag so resumed chunks can be processed by onChunk.
|
||||
// Without this, tool_return_message for server-side tools (web_search, fetch_webpage)
|
||||
@@ -397,7 +589,7 @@ export async function drainStreamWithResume(
|
||||
// TODO: Re-enable once issues are resolved - disabled retries were causing problems
|
||||
// Disable SDK retries - state management happens outside, retries would create race conditions
|
||||
const resumeStream = await client.runs.messages.stream(
|
||||
result.lastRunId,
|
||||
runIdToResume,
|
||||
{
|
||||
// If lastSeqId is null the stream failed before any seq_id-bearing
|
||||
// chunk arrived; use 0 to replay the run from the beginning.
|
||||
|
||||
148
src/tests/cli/stream-resume-fallback.test.ts
Normal file
148
src/tests/cli/stream-resume-fallback.test.ts
Normal file
@@ -0,0 +1,148 @@
|
||||
import { describe, expect, test } from "bun:test";
|
||||
import type { Run } from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import { discoverFallbackRunIdForResume } from "../../cli/helpers/stream";
|
||||
|
||||
/** Structural stand-in for the part of the client that lists runs. */
type RunsListClient = {
  runs: {
    list: (query: {
      conversation_id?: string | null;
      agent_id?: string | null;
      statuses?: string[] | null;
      order?: string | null;
      limit?: number | null;
    }) => Promise<Run[] | { getPaginatedItems?: () => Run[] }>;
  };
};

/**
 * Wrap a `runs.list` implementation in the client shape expected by
 * `discoverFallbackRunIdForResume`.
 *
 * @param runsList - Function to expose as `client.runs.list`.
 * @returns A client object whose `runs.list` is `runsList`.
 */
function makeRunsListClient(
  runsList: RunsListClient["runs"]["list"],
): RunsListClient {
  return { runs: { list: runsList } };
}
|
||||
|
||||
/**
 * Build a `running` run fixture owned by `agent-test`.
 *
 * @param id - Run id.
 * @param createdAt - Creation timestamp string for `created_at`.
 * @returns The run fixture.
 */
function run(id: string, createdAt: string): Run {
  return {
    id,
    agent_id: "agent-test",
    created_at: createdAt,
    status: "running",
  };
}
|
||||
|
||||
// Covers fallback run discovery: lookup order, the request-start time filter,
// and the restriction to runs whose status is "running".
describe("discoverFallbackRunIdForResume", () => {
  test("returns the latest conversation-scoped running run after request start", async () => {
    const runsList = async (query: {
      conversation_id?: string | null;
      agent_id?: string | null;
    }): Promise<Run[]> => {
      if (query.conversation_id === "conv-123") {
        // The lookup must ask only for the single newest running run.
        expect(query).toMatchObject({
          statuses: ["running"],
          order: "desc",
          limit: 1,
        });
        return [run("run-new", "2026-02-27T10:01:10.000Z")];
      }
      return [];
    };

    const candidate = await discoverFallbackRunIdForResume(
      makeRunsListClient(runsList),
      {
        conversationId: "conv-123",
        resolvedConversationId: "conv-123",
        agentId: "agent-test",
        requestStartedAtMs: Date.parse("2026-02-27T10:01:00.000Z"),
      },
    );

    expect(candidate).toBe("run-new");
  });

  test("for default conversation falls back to agent lookup when conversation lookup misses", async () => {
    const calls: Array<{
      conversation_id?: string | null;
      agent_id?: string | null;
    }> = [];

    const runsList = async (query: {
      conversation_id?: string | null;
      agent_id?: string | null;
    }): Promise<Run[]> => {
      // Record the scope of every lookup so the order can be asserted below.
      calls.push({
        conversation_id: query.conversation_id,
        agent_id: query.agent_id,
      });

      if (query.agent_id === "agent-test") {
        return [run("run-agent-fallback", "2026-02-27T11:00:05.000Z")];
      }

      return [];
    };

    const candidate = await discoverFallbackRunIdForResume(
      makeRunsListClient(runsList),
      {
        conversationId: "default",
        resolvedConversationId: "agent-test",
        agentId: "agent-test",
        requestStartedAtMs: Date.parse("2026-02-27T11:00:00.000Z"),
      },
    );

    expect(candidate).toBe("run-agent-fallback");
    expect(calls).toEqual([
      { conversation_id: "agent-test", agent_id: undefined },
      { conversation_id: undefined, agent_id: "agent-test" },
    ]);
  });

  test("returns null when latest running run is older than request start", async () => {
    const runsList = async (): Promise<Run[]> => [
      run("run-old-1", "2026-02-27T09:59:58.000Z"),
      run("run-old-2", "2026-02-27T09:59:59.000Z"),
    ];

    const candidate = await discoverFallbackRunIdForResume(
      makeRunsListClient(runsList),
      {
        conversationId: "conv-abc",
        resolvedConversationId: "conv-abc",
        agentId: "agent-test",
        requestStartedAtMs: Date.parse("2026-02-27T10:00:00.000Z"),
      },
    );

    expect(candidate).toBeNull();
  });

  test("ignores created runs when selecting fallback resume run", async () => {
    const runsList = async (query: {
      conversation_id?: string | null;
      agent_id?: string | null;
    }): Promise<Run[]> => {
      expect(query).toMatchObject({ statuses: ["running"], limit: 1 });
      // Return a run that is new enough but not in the "running" state.
      return [
        {
          id: "run-created",
          agent_id: "agent-test",
          created_at: "2026-02-27T12:00:01.000Z",
          status: "created",
        },
      ];
    };

    const candidate = await discoverFallbackRunIdForResume(
      makeRunsListClient(runsList),
      {
        conversationId: "conv-created",
        resolvedConversationId: "conv-created",
        agentId: "agent-test",
        requestStartedAtMs: Date.parse("2026-02-27T12:00:00.000Z"),
      },
    );

    expect(candidate).toBeNull();
  });
});
|
||||
Reference in New Issue
Block a user