diff --git a/src/cli/App.tsx b/src/cli/App.tsx index 818ebcc..9c468ee 100644 --- a/src/cli/App.tsx +++ b/src/cli/App.tsx @@ -11,6 +11,7 @@ import type { Message, } from "@letta-ai/letta-client/resources/agents/messages"; import type { LlmConfig } from "@letta-ai/letta-client/resources/models/models"; +import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs"; import { Box, Static, Text } from "ink"; import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import { @@ -133,11 +134,41 @@ const CHECK_PENDING_APPROVALS_BEFORE_SEND = true; // When false, wait for backend to send "cancelled" stop_reason (useful for testing backend behavior) const EAGER_CANCEL = true; +// Maximum retries for transient LLM API errors (matches headless.ts) +const LLM_API_ERROR_MAX_RETRIES = 3; + // tiny helper for unique ids (avoid overwriting prior user lines) function uid(prefix: string) { return `${prefix}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`; } +// Check if error is retriable based on stop reason and run metadata +async function isRetriableError( + stopReason: StopReasonType, + lastRunId: string | null | undefined, +): Promise { + // Primary check: backend sets stop_reason=llm_api_error for LLMError exceptions + if (stopReason === "llm_api_error") return true; + + // Fallback check: in case stop_reason is "error" but metadata indicates LLM error + // This could happen if there's a backend edge case where LLMError is raised but + // stop_reason isn't set correctly. The metadata.error is a LettaErrorMessage with + // error_type="llm_error" for LLM errors (see streaming_service.py:402-411) + if (stopReason === "error" && lastRunId) { + try { + const client = await getClient(); + const run = await client.runs.retrieve(lastRunId); + const metaError = run.metadata?.error as + | { error_type?: string } + | undefined; + return metaError?.error_type === "llm_error"; + } catch { + return false; + } + } + return false; +} + // Save current agent as lastAgent before exiting // This ensures subagent overwrites during the session don't persist function saveLastAgentBeforeExit() { @@ -538,6 +569,9 @@ export default function App({ // Track if user wants to cancel (persists across state updates) const userCancelledRef = useRef(false); + // Retry counter for transient LLM API errors (ref for synchronous access in loop) + const llmApiErrorRetriesRef = useRef(0); + // Message queue state for queueing messages during streaming const [messageQueue, setMessageQueue] = useState([]); @@ -884,6 +918,11 @@ export default function App({ } processingConversationRef.current += 1; + // Reset retry counter for new conversation turns (fresh budget per user message) + if (!allowReentry) { + llmApiErrorRetriesRef.current = 0; + } + // Track last run ID for error reporting (accessible in catch block) let currentRunId: string | undefined; @@ -1007,6 +1046,7 @@ export default function App({ // Case 1: Turn ended normally if (stopReason === "end_turn") { setStreaming(false); + llmApiErrorRetriesRef.current = 0; // Reset retry counter on success // Check if we were waiting for cancel but stream finished naturally if (waitingForQueueCancelRef.current) { @@ -1409,6 +1449,58 @@ export default function App({ } // Unexpected stop reason (error, llm_api_error, etc.) + // Check if this is a retriable error (transient LLM API error) + const retriable = await isRetriableError(stopReason, lastRunId); + + if ( + retriable && + llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES + ) { + llmApiErrorRetriesRef.current += 1; + const attempt = llmApiErrorRetriesRef.current; + const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s + + // Show subtle grey status message + const statusId = uid("status"); + buffersRef.current.byId.set(statusId, { + kind: "status", + id: statusId, + lines: ["Unexpected downstream LLM API error, retrying..."], + }); + buffersRef.current.order.push(statusId); + refreshDerived(); + + // Wait before retry (check abort signal periodically for ESC cancellation) + let cancelled = false; + const startTime = Date.now(); + while (Date.now() - startTime < delayMs) { + if ( + abortControllerRef.current?.signal.aborted || + userCancelledRef.current + ) { + cancelled = true; + break; + } + await new Promise((resolve) => setTimeout(resolve, 100)); // Check every 100ms + } + + // Remove status message + buffersRef.current.byId.delete(statusId); + buffersRef.current.order = buffersRef.current.order.filter( + (id) => id !== statusId, + ); + refreshDerived(); + + if (!cancelled) { + // Retry by continuing the while loop (same currentInput) + continue; + } + // User pressed ESC - fall through to error handling + } + + // Reset retry counter on non-retriable error (or max retries exceeded) + llmApiErrorRetriesRef.current = 0; + // Mark incomplete tool calls as finished to prevent stuck blinking UI markIncompleteToolsAsCancelled(buffersRef.current); diff --git a/src/headless.ts b/src/headless.ts index 7421e52..132e20f 100644 --- a/src/headless.ts +++ b/src/headless.ts @@ -956,6 +956,10 @@ export async function handleHeadlessCommand( // Unexpected stop reason (error, llm_api_error, etc.) // Before failing, check run metadata to see if this is a retriable llm_api_error + // Fallback check: in case stop_reason is "error" but metadata indicates LLM error + // This could happen if there's a backend edge case where LLMError is raised but + // stop_reason isn't set correctly. The metadata.error is a LettaErrorMessage with + // error_type="llm_error" for LLM errors (see streaming_service.py:402-411) if ( stopReason === "error" && lastRunId && @@ -965,13 +969,13 @@ export async function handleHeadlessCommand( const run = await client.runs.retrieve(lastRunId); const metaError = run.metadata?.error as | { - type?: string; + error_type?: string; message?: string; detail?: string; } | undefined; - if (metaError?.type === "llm_api_error") { + if (metaError?.error_type === "llm_error") { const attempt = llmApiErrorRetries + 1; const baseDelayMs = 1000; const delayMs = baseDelayMs * 2 ** (attempt - 1);