feat: clean up keep-alive fallback handling (#663)

2026-01-27 17:50:37 -08:00
parent c2fd63f5b5
commit 0df8e51dac
4 changed files with 96 additions and 321 deletions
--- a/src/agent/approval-recovery.ts
+++ b/src/agent/approval-recovery.ts
@@ -1,10 +1,4 @@
-import type { MessageCreate } from "@letta-ai/letta-client/resources/agents/agents";
 import { getClient } from "./client";
-import { APPROVAL_RECOVERY_PROMPT } from "./promptAssets";
-
-// Error when trying to SEND approval but server has no pending approval
-const APPROVAL_RECOVERY_DETAIL_FRAGMENT =
-  "no tool call is currently awaiting approval";

 // Error when approval tool call IDs don't match what server expects
 // Format: "Invalid tool call IDs: Expected [...], got [...]"
@@ -30,15 +24,6 @@ type RunErrorMetadata =
  | undefined
  | null;

-export function isApprovalStateDesyncError(detail: unknown): boolean {
-  if (typeof detail !== "string") return false;
-  const lower = detail.toLowerCase();
-  return (
-    lower.includes(APPROVAL_RECOVERY_DETAIL_FRAGMENT) ||
-    lower.includes(INVALID_TOOL_CALL_IDS_FRAGMENT)
-  );
-}
-
 /**
 * Check if error specifically indicates tool call ID mismatch.
 * This is a subtype of desync where the server HAS pending approvals,
@@ -100,11 +85,3 @@ export async function fetchRunErrorDetail(
    return null;
  }
 }
-
-export function buildApprovalRecoveryMessage(): MessageCreate {
-  return {
-    type: "message",
-    role: "user",
-    content: [{ type: "text", text: APPROVAL_RECOVERY_PROMPT }],
-  };
-}
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -29,10 +29,8 @@ import {
  getDisplayableToolReturn,
 } from "../agent/approval-execution";
 import {
-  buildApprovalRecoveryMessage,
  fetchRunErrorDetail,
  isApprovalPendingError,
-  isApprovalStateDesyncError,
  isConversationBusyError,
  isInvalidToolCallIdsError,
 } from "../agent/approval-recovery";
@@ -2506,72 +2504,11 @@ export default function App({
                    sendDesktopNotification("Approval needed");
                    return;
                  }
-                  // No approvals found - fall through to general desync recovery
+                  // No approvals found - fall through to error handling below
                } catch {
-                  // Fetch failed - fall through to general desync recovery
+                  // Fetch failed - fall through to error handling below
                }
              }
-
-              // General desync: "no tool call awaiting" or fetch failed above
-              // Recover with keep-alive prompt or strip stale approvals
-              if (
-                isApprovalStateDesyncError(errorDetail) &&
-                llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES
-              ) {
-                llmApiErrorRetriesRef.current += 1;
-
-                // Show transient status (matches post-stream desync handler UX)
-                const statusId = uid("status");
-                buffersRef.current.byId.set(statusId, {
-                  kind: "status",
-                  id: statusId,
-                  lines: [
-                    "Approval state desynced; resending keep-alive recovery prompt...",
-                  ],
-                });
-                buffersRef.current.order.push(statusId);
-                refreshDerived();
-
-                // Swap payload to recovery message (or strip stale approvals)
-                const isApprovalOnlyPayload =
-                  hasApprovalInPayload && currentInput.length === 1;
-                if (isApprovalOnlyPayload) {
-                  currentInput.splice(
-                    0,
-                    currentInput.length,
-                    buildApprovalRecoveryMessage(),
-                  );
-                } else {
-                  // Mixed payload: strip stale approvals, keep user message
-                  const messageItems = currentInput.filter(
-                    (item) => item?.type !== "approval",
-                  );
-                  if (messageItems.length > 0) {
-                    currentInput.splice(
-                      0,
-                      currentInput.length,
-                      ...messageItems,
-                    );
-                  } else {
-                    currentInput.splice(
-                      0,
-                      currentInput.length,
-                      buildApprovalRecoveryMessage(),
-                    );
-                  }
-                }
-
-                // Remove transient status before retry
-                buffersRef.current.byId.delete(statusId);
-                buffersRef.current.order = buffersRef.current.order.filter(
-                  (id) => id !== statusId,
-                );
-                refreshDerived();
-
-                // Reset interrupted flag so retry stream chunks are processed
-                buffersRef.current.interrupted = false;
-                continue;
-              }
            }

            // Not a recoverable desync - re-throw to outer catch
@@ -3332,8 +3269,6 @@ export default function App({
          const hasApprovalInPayload = currentInput.some(
            (item) => item?.type === "approval",
          );
-          const isApprovalOnlyPayload =
-            hasApprovalInPayload && currentInput.length === 1;

          // Capture the most recent error text in this turn (if any)
          let latestErrorText: string | null = null;
@@ -3347,17 +3282,9 @@ export default function App({
            }
          }

-          // Detect approval desync once per turn
-          const detailFromRun = await fetchRunErrorDetail(lastRunId);
-          const desyncDetected =
-            isApprovalStateDesyncError(detailFromRun) ||
-            isApprovalStateDesyncError(latestErrorText);
-
-          // Track last failure info so we can emit it if retries stop
-          const lastFailureMessage = latestErrorText || detailFromRun || null;
-
-          // "Invalid tool call IDs" means server HAS pending approvals but with different IDs.
+          // Check for "Invalid tool call IDs" error - server HAS pending approvals but with different IDs.
          // Fetch the actual pending approvals and show them to the user.
+          const detailFromRun = await fetchRunErrorDetail(lastRunId);
          const invalidIdsDetected =
            isInvalidToolCallIdsError(detailFromRun) ||
            isInvalidToolCallIdsError(latestErrorText);
@@ -3438,85 +3365,12 @@ export default function App({
                sendDesktopNotification("Approval needed");
                return;
              }
-              // No approvals found - fall through to general desync recovery
+              // No approvals found - fall through to error handling below
            } catch {
-              // Fetch failed - fall through to general desync recovery
+              // Fetch failed - fall through to error handling below
            }
          }

-          // Check for approval desync errors even if stop_reason isn't llm_api_error.
-          // Handle both approval-only payloads and mixed [approval, message] payloads.
-          if (hasApprovalInPayload && desyncDetected) {
-            if (llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES) {
-              llmApiErrorRetriesRef.current += 1;
-              const statusId = uid("status");
-              buffersRef.current.byId.set(statusId, {
-                kind: "status",
-                id: statusId,
-                lines: [
-                  "Approval state desynced; resending keep-alive recovery prompt...",
-                ],
-              });
-              buffersRef.current.order.push(statusId);
-              refreshDerived();
-
-              if (isApprovalOnlyPayload) {
-                // Approval-only payload: send recovery prompt
-                currentInput.splice(
-                  0,
-                  currentInput.length,
-                  buildApprovalRecoveryMessage(),
-                );
-              } else {
-                // Mixed payload [approval, message]: strip stale approval, keep user message
-                const messageItems = currentInput.filter(
-                  (item) => item?.type !== "approval",
-                );
-                if (messageItems.length > 0) {
-                  currentInput.splice(0, currentInput.length, ...messageItems);
-                } else {
-                  // Fallback if somehow no message items remain
-                  currentInput.splice(
-                    0,
-                    currentInput.length,
-                    buildApprovalRecoveryMessage(),
-                  );
-                }
-              }
-
-              // Remove the transient status before retrying
-              buffersRef.current.byId.delete(statusId);
-              buffersRef.current.order = buffersRef.current.order.filter(
-                (id) => id !== statusId,
-              );
-              refreshDerived();
-
-              // Reset interrupted flag so retry stream chunks are processed
-              buffersRef.current.interrupted = false;
-              continue;
-            }
-
-            // No retries left: emit the failure and exit
-            const errorToShow =
-              lastFailureMessage ||
-              `An error occurred during agent execution\n(run_id: ${lastRunId ?? "unknown"}, stop_reason: ${stopReasonToHandle})`;
-            appendError(errorToShow, true);
-            appendError(ERROR_FEEDBACK_HINT, true);
-
-            // Restore dequeued message to input on error
-            if (lastDequeuedMessageRef.current) {
-              setRestoredInput(lastDequeuedMessageRef.current);
-              lastDequeuedMessageRef.current = null;
-            }
-            // Clear any remaining queue on error
-            setMessageQueue([]);
-
-            setStreaming(false);
-            sendDesktopNotification("Agent execution error", "error");
-            refreshDerived();
-            return;
-          }
-
          // Check for approval pending error (sent user message while approval waiting)
          // This is the lazy recovery path for when needsEagerApprovalCheck is false
          const approvalPendingDetected =
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -9,10 +9,8 @@ import type { ApprovalCreate } from "@letta-ai/letta-client/resources/agents/mes
 import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
 import type { ApprovalResult } from "./agent/approval-execution";
 import {
-  buildApprovalRecoveryMessage,
  fetchRunErrorDetail,
  isApprovalPendingError,
-  isApprovalStateDesyncError,
  isConversationBusyError,
  isInvalidToolCallIdsError,
 } from "./agent/approval-recovery";
@@ -1385,26 +1383,11 @@ export async function handleHeadlessCommand(
        }
      }

-      // Detect approval desync once per turn
+      // Fetch run error detail for invalid tool call ID detection
      const detailFromRun = await fetchRunErrorDetail(lastRunId);
-      const approvalDesynced =
-        currentInput.length === 1 &&
-        currentInput[0]?.type === "approval" &&
-        (isApprovalStateDesyncError(detailFromRun) ||
-          isApprovalStateDesyncError(latestErrorText));
-
-      // Track last failure text for emitting on exit
-      const lastFailureText =
-        latestErrorText ||
-        detailFromRun ||
-        (lastRunId
-          ? `An error occurred during agent execution\n(run_id: ${lastRunId}, stop_reason: ${stopReason})`
-          : `An error occurred during agent execution\n(stop_reason: ${stopReason})`);

      // Case 3: Transient LLM API error - retry with exponential backoff up to a limit
      if (stopReason === "llm_api_error") {
-        const shouldUseApprovalRecovery = approvalDesynced;
-
        if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
          const attempt = llmApiErrorRetries + 1;
          const baseDelayMs = 1000;
@@ -1426,104 +1409,64 @@ export async function handleHeadlessCommand(
            console.log(JSON.stringify(retryMsg));
          } else {
            const delaySeconds = Math.round(delayMs / 1000);
-            const recoveryNote = shouldUseApprovalRecovery
-              ? " (approval state desynced - sending keep-going prompt)"
-              : "";
            console.error(
-              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...${recoveryNote}`,
+              `LLM API error encountered (attempt ${attempt} of ${LLM_API_ERROR_MAX_RETRIES}), retrying in ${delaySeconds}s...`,
            );
          }

          // Exponential backoff before retrying the same input
          await new Promise((resolve) => setTimeout(resolve, delayMs));

-          if (shouldUseApprovalRecovery) {
-            currentInput = [buildApprovalRecoveryMessage()];
-          }
          continue;
        }
      }

-      // Fallback: if we were sending only approvals and hit an internal error that
-      // says there is no pending approval, resend using the keep-alive recovery prompt.
-      if (approvalDesynced) {
-        // "Invalid tool call IDs" means server HAS pending approvals but with different IDs.
-        // Fetch the actual pending approvals and process them before retrying.
-        if (
-          isInvalidToolCallIdsError(detailFromRun) ||
-          isInvalidToolCallIdsError(latestErrorText)
-        ) {
-          if (outputFormat === "stream-json") {
-            const recoveryMsg: RecoveryMessage = {
-              type: "recovery",
-              recovery_type: "invalid_tool_call_ids",
-              message:
-                "Tool call ID mismatch; fetching actual pending approvals and resyncing",
-              run_id: lastRunId ?? undefined,
-              session_id: sessionId,
-              uuid: `recovery-${lastRunId || crypto.randomUUID()}`,
-            };
-            console.log(JSON.stringify(recoveryMsg));
-          } else {
-            console.error(
-              "Tool call ID mismatch; fetching actual pending approvals...",
-            );
-          }
+      // "Invalid tool call IDs" means server HAS pending approvals but with different IDs.
+      // Fetch the actual pending approvals and process them before retrying.
+      const invalidIdsDetected =
+        isInvalidToolCallIdsError(detailFromRun) ||
+        isInvalidToolCallIdsError(latestErrorText);

-          try {
-            // Fetch and process actual pending approvals from server
-            await resolveAllPendingApprovals();
-            // After processing, continue to next iteration (fresh state)
-            continue;
-          } catch {
-            // If fetch fails, fall through to general desync recovery
-          }
-        }
-
-        if (llmApiErrorRetries < LLM_API_ERROR_MAX_RETRIES) {
-          llmApiErrorRetries += 1;
-
-          const retryReason = stopReason ?? "error";
-          if (outputFormat === "stream-json") {
-            const retryMsg: RetryMessage = {
-              type: "retry",
-              reason: retryReason,
-              attempt: llmApiErrorRetries,
-              max_attempts: LLM_API_ERROR_MAX_RETRIES,
-              delay_ms: 0,
-              run_id: lastRunId ?? undefined,
-              session_id: sessionId,
-              uuid: `retry-${lastRunId || crypto.randomUUID()}`,
-            };
-            console.log(JSON.stringify(retryMsg));
-          } else {
-            console.error(
-              "Approval state desynced; resending keep-alive recovery prompt...",
-            );
-          }
-
-          // Small pause to avoid rapid-fire retries
-          await new Promise((resolve) => setTimeout(resolve, 250));
-
-          currentInput = [buildApprovalRecoveryMessage()];
-          continue;
-        }
-
-        // No retries left or non-retriable: emit error and exit
+      if (invalidIdsDetected) {
        if (outputFormat === "stream-json") {
-          const errorMsg: ErrorMessage = {
-            type: "error",
-            message: lastFailureText,
-            stop_reason: stopReason,
+          const recoveryMsg: RecoveryMessage = {
+            type: "recovery",
+            recovery_type: "invalid_tool_call_ids",
+            message:
+              "Tool call ID mismatch; fetching actual pending approvals and resyncing",
            run_id: lastRunId ?? undefined,
            session_id: sessionId,
-            uuid: `error-${lastRunId || crypto.randomUUID()}`,
+            uuid: `recovery-${lastRunId || crypto.randomUUID()}`,
          };
-          console.log(JSON.stringify(errorMsg));
+          console.log(JSON.stringify(recoveryMsg));
        } else {
-          console.error(lastFailureText);
+          console.error(
+            "Tool call ID mismatch; fetching actual pending approvals...",
+          );
+        }
+
+        try {
+          // Fetch and process actual pending approvals from server
+          await resolveAllPendingApprovals();
+          // After processing, continue to next iteration (fresh state)
+          continue;
+        } catch {
+          // If fetch fails, exit with error
+          if (outputFormat === "stream-json") {
+            const errorMsg: ErrorMessage = {
+              type: "error",
+              message: "Failed to fetch pending approvals for resync",
+              stop_reason: stopReason,
+              run_id: lastRunId ?? undefined,
+              session_id: sessionId,
+              uuid: `error-${lastRunId || crypto.randomUUID()}`,
+            };
+            console.log(JSON.stringify(errorMsg));
+          } else {
+            console.error("Failed to fetch pending approvals for resync");
+          }
+          process.exit(1);
        }
-        process.exit(1);
      }

      // Unexpected stop reason (error, llm_api_error, etc.)
--- a/src/tests/approval-recovery.test.ts
+++ b/src/tests/approval-recovery.test.ts
@@ -2,7 +2,7 @@ import { describe, expect, test } from "bun:test";
 import type { Message } from "@letta-ai/letta-client/resources/agents/messages";
 import {
  isApprovalPendingError,
-  isApprovalStateDesyncError,
+  isConversationBusyError,
  isInvalidToolCallIdsError,
 } from "../agent/approval-recovery";
 import { extractApprovals } from "../agent/check-approval";
@@ -10,56 +10,12 @@ import { extractApprovals } from "../agent/check-approval";
 /**
 * Tests for approval error detection helpers (LET-7101).
 *
- * These functions detect two opposite error conditions:
- * 1. isApprovalStateDesyncError: Sent approval, but server has no pending approval
+ * These functions detect different error conditions related to approvals:
+ * 1. isInvalidToolCallIdsError: Tool call IDs don't match server's pending approvals
 * 2. isApprovalPendingError: Sent user message, but server has pending approval waiting
+ * 3. isConversationBusyError: Another request is being processed (409 CONFLICT)
 */

-describe("isApprovalStateDesyncError", () => {
-  test("detects desync error in detail string", () => {
-    const detail = "No tool call is currently awaiting approval";
-    expect(isApprovalStateDesyncError(detail)).toBe(true);
-  });
-
-  test("detects desync error case-insensitively", () => {
-    const detail = "NO TOOL CALL IS CURRENTLY AWAITING APPROVAL";
-    expect(isApprovalStateDesyncError(detail)).toBe(true);
-  });
-
-  test("detects desync error in longer message", () => {
-    const detail =
-      "Error: No tool call is currently awaiting approval. The approval request may have expired.";
-    expect(isApprovalStateDesyncError(detail)).toBe(true);
-  });
-
-  test("detects invalid tool call IDs error", () => {
-    const detail =
-      "Invalid tool call IDs: Expected ['tc_abc123'], got ['tc_xyz789']";
-    expect(isApprovalStateDesyncError(detail)).toBe(true);
-  });
-
-  test("detects invalid tool call IDs error case-insensitively", () => {
-    expect(
-      isApprovalStateDesyncError("INVALID TOOL CALL IDS: Expected X, got Y"),
-    ).toBe(true);
-    expect(isApprovalStateDesyncError("invalid tool call ids: mismatch")).toBe(
-      true,
-    );
-  });
-
-  test("returns false for unrelated errors", () => {
-    expect(isApprovalStateDesyncError("Connection timeout")).toBe(false);
-    expect(isApprovalStateDesyncError("Internal server error")).toBe(false);
-  });
-
-  test("returns false for non-string input", () => {
-    expect(isApprovalStateDesyncError(null)).toBe(false);
-    expect(isApprovalStateDesyncError(undefined)).toBe(false);
-    expect(isApprovalStateDesyncError(123)).toBe(false);
-    expect(isApprovalStateDesyncError({ error: "test" })).toBe(false);
-  });
-});
-
 describe("isInvalidToolCallIdsError", () => {
  test("detects invalid tool call IDs error", () => {
    const detail =
@@ -137,6 +93,51 @@ describe("isApprovalPendingError", () => {
  });
 });

+describe("isConversationBusyError", () => {
+  const REAL_ERROR_DETAIL =
+    "CONFLICT: Cannot send a new message: Another request is currently being processed for this conversation.";
+
+  test("detects conversation busy error in real error format", () => {
+    expect(isConversationBusyError(REAL_ERROR_DETAIL)).toBe(true);
+  });
+
+  test("detects conversation busy error case-insensitively", () => {
+    expect(
+      isConversationBusyError("ANOTHER REQUEST IS CURRENTLY BEING PROCESSED"),
+    ).toBe(true);
+    expect(
+      isConversationBusyError("another request is currently being processed"),
+    ).toBe(true);
+  });
+
+  test("detects partial match in longer message", () => {
+    const detail =
+      "Error: Another request is currently being processed. Please wait.";
+    expect(isConversationBusyError(detail)).toBe(true);
+  });
+
+  test("returns false for approval pending errors (different case)", () => {
+    expect(
+      isConversationBusyError(
+        "Cannot send a new message: The agent is waiting for approval",
+      ),
+    ).toBe(false);
+  });
+
+  test("returns false for unrelated errors", () => {
+    expect(isConversationBusyError("Connection timeout")).toBe(false);
+    expect(isConversationBusyError("Rate limit exceeded")).toBe(false);
+    expect(isConversationBusyError("Invalid API key")).toBe(false);
+  });
+
+  test("returns false for non-string input", () => {
+    expect(isConversationBusyError(null)).toBe(false);
+    expect(isConversationBusyError(undefined)).toBe(false);
+    expect(isConversationBusyError(123)).toBe(false);
+    expect(isConversationBusyError({ detail: REAL_ERROR_DETAIL })).toBe(false);
+  });
+});
+
 /**
 * Tests for parallel tool call approval extraction.
 * Ensures lazy recovery handles multiple simultaneous tool calls correctly.