feat: add automatic retry for transient LLM API errors (#389)

Co-authored-by: Letta <noreply@letta.com>
2025-12-25 09:49:19 -08:00
parent 6410bdece7
commit f4c71adab6
2 changed files with 98 additions and 2 deletions
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -11,6 +11,7 @@ import type {
  Message,
 } from "@letta-ai/letta-client/resources/agents/messages";
 import type { LlmConfig } from "@letta-ai/letta-client/resources/models/models";
+import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
 import { Box, Static, Text } from "ink";
 import { useCallback, useEffect, useMemo, useRef, useState } from "react";
 import {
@@ -133,11 +134,41 @@ const CHECK_PENDING_APPROVALS_BEFORE_SEND = true;
 // When false, wait for backend to send "cancelled" stop_reason (useful for testing backend behavior)
 const EAGER_CANCEL = true;

+// Maximum retries for transient LLM API errors (matches headless.ts)
+const LLM_API_ERROR_MAX_RETRIES = 3;
+
 // tiny helper for unique ids (avoid overwriting prior user lines)
 function uid(prefix: string) {
  return `${prefix}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
 }

+// Decide whether a failed turn may be retried: resolves true for stop_reason "llm_api_error", or for "error" when the run's metadata.error.error_type is "llm_error"; false otherwise.
+async function isRetriableError(
+  stopReason: StopReasonType, // stop reason of the turn being classified
+  lastRunId: string | null | undefined, // run to inspect; a null/undefined/empty id skips the metadata fallback
+): Promise<boolean> {
+  // Primary check: backend sets stop_reason=llm_api_error for LLMError exceptions, so no run lookup is needed
+  if (stopReason === "llm_api_error") return true;
+
+  // Fallback check: stop_reason is the generic "error" but the run metadata may still indicate an LLM error.
+  // This could happen if there's a backend edge case where LLMError is raised but
+  // stop_reason isn't set correctly. The metadata.error is a LettaErrorMessage with
+  // error_type="llm_error" for LLM errors (see streaming_service.py:402-411)
+  if (stopReason === "error" && lastRunId) {
+    try {
+      const client = await getClient();
+      const run = await client.runs.retrieve(lastRunId); // look the run up to read its metadata
+      const metaError = run.metadata?.error as // asserted (not validated) shape: only error_type is read
+        | { error_type?: string }
+        | undefined;
+      return metaError?.error_type === "llm_error"; // missing metadata or a different error_type yields false
+    } catch { // best effort: any failure while obtaining the client or fetching the run means "do not retry"
+      return false;
+    }
+  }
+  return false; // neither the primary nor the fallback check matched
+}
+
 // Save current agent as lastAgent before exiting
 // This ensures subagent overwrites during the session don't persist
 function saveLastAgentBeforeExit() {
@@ -538,6 +569,9 @@ export default function App({
  // Track if user wants to cancel (persists across state updates)
  const userCancelledRef = useRef(false);

+  // Retry counter for transient LLM API errors (ref for synchronous access in loop)
+  const llmApiErrorRetriesRef = useRef(0);
+
  // Message queue state for queueing messages during streaming
  const [messageQueue, setMessageQueue] = useState<string[]>([]);

@@ -884,6 +918,11 @@ export default function App({
      }
      processingConversationRef.current += 1;

+      // Reset retry counter for new conversation turns (fresh budget per user message)
+      if (!allowReentry) {
+        llmApiErrorRetriesRef.current = 0;
+      }
+
      // Track last run ID for error reporting (accessible in catch block)
      let currentRunId: string | undefined;

@@ -1007,6 +1046,7 @@ export default function App({
          // Case 1: Turn ended normally
          if (stopReason === "end_turn") {
            setStreaming(false);
+            llmApiErrorRetriesRef.current = 0; // Reset retry counter on success

            // Check if we were waiting for cancel but stream finished naturally
            if (waitingForQueueCancelRef.current) {
@@ -1409,6 +1449,58 @@ export default function App({
          }

          // Unexpected stop reason (error, llm_api_error, etc.)
+          // Check if this is a retriable error (transient LLM API error)
+          const retriable = await isRetriableError(stopReason, lastRunId);
+
+          if (
+            retriable &&
+            llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES
+          ) {
+            llmApiErrorRetriesRef.current += 1;
+            const attempt = llmApiErrorRetriesRef.current;
+            const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s
+
+            // Show subtle grey status message
+            const statusId = uid("status");
+            buffersRef.current.byId.set(statusId, {
+              kind: "status",
+              id: statusId,
+              lines: ["Unexpected downstream LLM API error, retrying..."],
+            });
+            buffersRef.current.order.push(statusId);
+            refreshDerived();
+
+            // Wait before retry (check abort signal periodically for ESC cancellation)
+            let cancelled = false;
+            const startTime = Date.now();
+            while (Date.now() - startTime < delayMs) {
+              if (
+                abortControllerRef.current?.signal.aborted ||
+                userCancelledRef.current
+              ) {
+                cancelled = true;
+                break;
+              }
+              await new Promise((resolve) => setTimeout(resolve, 100)); // Check every 100ms
+            }
+
+            // Remove status message
+            buffersRef.current.byId.delete(statusId);
+            buffersRef.current.order = buffersRef.current.order.filter(
+              (id) => id !== statusId,
+            );
+            refreshDerived();
+
+            if (!cancelled) {
+              // Retry by continuing the while loop (same currentInput)
+              continue;
+            }
+            // User pressed ESC - fall through to error handling
+          }
+
+          // Reset retry counter on non-retriable error, max retries exceeded, or a retry wait cancelled by the user
+          llmApiErrorRetriesRef.current = 0;
+
          // Mark incomplete tool calls as finished to prevent stuck blinking UI
          markIncompleteToolsAsCancelled(buffersRef.current);

--- a/src/headless.ts
+++ b/src/headless.ts
@@ -956,6 +956,10 @@ export async function handleHeadlessCommand(

      // Unexpected stop reason (error, llm_api_error, etc.)
      // Before failing, check run metadata to see if this is a retriable llm_api_error
+      // Fallback check: in case stop_reason is "error" but metadata indicates LLM error
+      // This could happen if there's a backend edge case where LLMError is raised but
+      // stop_reason isn't set correctly. The metadata.error is a LettaErrorMessage with
+      // error_type="llm_error" for LLM errors (see streaming_service.py:402-411)
      if (
        stopReason === "error" &&
        lastRunId &&
@@ -965,13 +969,13 @@ export async function handleHeadlessCommand(
          const run = await client.runs.retrieve(lastRunId);
          const metaError = run.metadata?.error as
            | {
-                type?: string;
+                error_type?: string;
                message?: string;
                detail?: string;
              }
            | undefined;

-          if (metaError?.type === "llm_api_error") {
+          if (metaError?.error_type === "llm_error") {
            const attempt = llmApiErrorRetries + 1;
            const baseDelayMs = 1000;
            const delayMs = baseDelayMs * 2 ** (attempt - 1);