feat: add 409 retry, error improvements, and queue restoration (#618)

Co-authored-by: Letta <noreply@letta.com>
2026-01-21 14:57:48 -08:00
parent 802136c868
commit 6a0bcdd683
5 changed files with 281 additions and 36 deletions
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -13,6 +13,7 @@ import {
  fetchRunErrorDetail,
  isApprovalPendingError,
  isApprovalStateDesyncError,
+  isConversationBusyError,
 } from "./agent/approval-recovery";
 import { getClient } from "./agent/client";
 import { initializeLoadedSkillsFlag, setAgentContext } from "./agent/context";
@@ -59,6 +60,10 @@ import {
 // caller to manually resubmit the prompt.
 const LLM_API_ERROR_MAX_RETRIES = 3;

+// Retry config for 409 "conversation busy" errors thrown by sendMessageStream before a stream is returned
+const CONVERSATION_BUSY_MAX_RETRIES = 1; // Only retry once; a 2nd 409 is re-thrown to the outer catch
+const CONVERSATION_BUSY_RETRY_DELAY_MS = 2500; // 2.5 seconds, waited before re-sending the same input
+
 export async function handleHeadlessCommand(
  argv: string[],
  model?: string,
@@ -945,15 +950,83 @@ export async function handleHeadlessCommand(
  // Track lastRunId outside the while loop so it's available in catch block
  let lastKnownRunId: string | null = null;
  let llmApiErrorRetries = 0;
+  let conversationBusyRetries = 0;

  markMilestone("HEADLESS_FIRST_STREAM_START");
  measureSinceMilestone("headless-setup-total", "HEADLESS_CLIENT_READY");

  try {
    while (true) {
-      const stream = await sendMessageStream(conversationId, currentInput, {
-        agentId: agent.id,
-      });
+      // Wrap sendMessageStream in try-catch to handle pre-stream errors (e.g., 409)
+      let stream: Awaited<ReturnType<typeof sendMessageStream>>;
+      try {
+        stream = await sendMessageStream(conversationId, currentInput, {
+          agentId: agent.id,
+        });
+      } catch (preStreamError) {
+        // Extract error detail from APIError
+        let errorDetail = "";
+        if (
+          preStreamError instanceof APIError &&
+          preStreamError.error &&
+          typeof preStreamError.error === "object"
+        ) {
+          const errObj = preStreamError.error as Record<string, unknown>;
+          if (
+            errObj.error &&
+            typeof errObj.error === "object" &&
+            "detail" in errObj.error
+          ) {
+            const nested = errObj.error as Record<string, unknown>;
+            errorDetail =
+              typeof nested.detail === "string" ? nested.detail : "";
+          }
+          if (!errorDetail && typeof errObj.detail === "string") {
+            errorDetail = errObj.detail;
+          }
+        }
+        if (!errorDetail && preStreamError instanceof Error) {
+          errorDetail = preStreamError.message;
+        }
+
+        // Check for 409 "conversation busy" error - retry once with delay
+        if (
+          isConversationBusyError(errorDetail) &&
+          conversationBusyRetries < CONVERSATION_BUSY_MAX_RETRIES
+        ) {
+          conversationBusyRetries += 1;
+
+          // Emit retry message for stream-json mode
+          if (outputFormat === "stream-json") {
+            const retryMsg: RetryMessage = {
+              type: "retry",
+              reason: "error", // 409 conversation busy is a pre-stream error
+              attempt: conversationBusyRetries,
+              max_attempts: CONVERSATION_BUSY_MAX_RETRIES,
+              delay_ms: CONVERSATION_BUSY_RETRY_DELAY_MS,
+              session_id: sessionId,
+              uuid: `retry-conversation-busy-${crypto.randomUUID()}`,
+            };
+            console.log(JSON.stringify(retryMsg));
+          } else {
+            console.error(
+              `Conversation is busy, waiting ${CONVERSATION_BUSY_RETRY_DELAY_MS / 1000}s and retrying...`,
+            );
+          }
+
+          // Wait before retry
+          await new Promise((resolve) =>
+            setTimeout(resolve, CONVERSATION_BUSY_RETRY_DELAY_MS),
+          );
+          continue;
+        }
+
+        // Reset conversation busy retry counter on other errors
+        conversationBusyRetries = 0;
+
+        // Re-throw to outer catch for other errors
+        throw preStreamError;
+      }

      // For stream-json, output each chunk as it arrives
      let stopReason: StopReasonType | null = null;
@@ -1147,6 +1220,9 @@ export async function handleHeadlessCommand(

      // Case 1: Turn ended normally
      if (stopReason === "end_turn") {
+        // Reset retry counters on success
+        llmApiErrorRetries = 0;
+        conversationBusyRetries = 0;
        break;
      }