feat: add automatic retry for transient LLM API errors (#389)

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
Charles Packer
2025-12-25 09:49:19 -08:00
committed by GitHub
parent 6410bdece7
commit f4c71adab6
2 changed files with 98 additions and 2 deletions

View File

@@ -11,6 +11,7 @@ import type {
Message,
} from "@letta-ai/letta-client/resources/agents/messages";
import type { LlmConfig } from "@letta-ai/letta-client/resources/models/models";
import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
import { Box, Static, Text } from "ink";
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
import {
@@ -133,11 +134,41 @@ const CHECK_PENDING_APPROVALS_BEFORE_SEND = true;
// When false, wait for backend to send "cancelled" stop_reason (useful for testing backend behavior)
const EAGER_CANCEL = true;
// Maximum retries for transient LLM API errors (matches headless.ts)
const LLM_API_ERROR_MAX_RETRIES = 3;
// tiny helper for unique ids (avoid overwriting prior user lines)
function uid(prefix: string) {
return `${prefix}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
}
// Check if error is retriable based on stop reason and run metadata
async function isRetriableError(
stopReason: StopReasonType,
lastRunId: string | null | undefined,
): Promise<boolean> {
// Primary check: backend sets stop_reason=llm_api_error for LLMError exceptions
if (stopReason === "llm_api_error") return true;
// Fallback check: in case stop_reason is "error" but metadata indicates LLM error
// This could happen if there's a backend edge case where LLMError is raised but
// stop_reason isn't set correctly. The metadata.error is a LettaErrorMessage with
// error_type="llm_error" for LLM errors (see streaming_service.py:402-411)
if (stopReason === "error" && lastRunId) {
try {
const client = await getClient();
const run = await client.runs.retrieve(lastRunId);
const metaError = run.metadata?.error as
| { error_type?: string }
| undefined;
return metaError?.error_type === "llm_error";
} catch {
return false;
}
}
return false;
}
// Save current agent as lastAgent before exiting
// This ensures subagent overwrites during the session don't persist
function saveLastAgentBeforeExit() {
@@ -538,6 +569,9 @@ export default function App({
// Track if user wants to cancel (persists across state updates)
const userCancelledRef = useRef(false);
// Retry counter for transient LLM API errors (ref for synchronous access in loop)
const llmApiErrorRetriesRef = useRef(0);
// Message queue state for queueing messages during streaming
const [messageQueue, setMessageQueue] = useState<string[]>([]);
@@ -884,6 +918,11 @@ export default function App({
}
processingConversationRef.current += 1;
// Reset retry counter for new conversation turns (fresh budget per user message)
if (!allowReentry) {
llmApiErrorRetriesRef.current = 0;
}
// Track last run ID for error reporting (accessible in catch block)
let currentRunId: string | undefined;
@@ -1007,6 +1046,7 @@ export default function App({
// Case 1: Turn ended normally
if (stopReason === "end_turn") {
setStreaming(false);
llmApiErrorRetriesRef.current = 0; // Reset retry counter on success
// Check if we were waiting for cancel but stream finished naturally
if (waitingForQueueCancelRef.current) {
@@ -1409,6 +1449,58 @@ export default function App({
}
// Unexpected stop reason (error, llm_api_error, etc.)
// Check if this is a retriable error (transient LLM API error)
const retriable = await isRetriableError(stopReason, lastRunId);
if (
retriable &&
llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES
) {
llmApiErrorRetriesRef.current += 1;
const attempt = llmApiErrorRetriesRef.current;
const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s
// Show subtle grey status message
const statusId = uid("status");
buffersRef.current.byId.set(statusId, {
kind: "status",
id: statusId,
lines: ["Unexpected downstream LLM API error, retrying..."],
});
buffersRef.current.order.push(statusId);
refreshDerived();
// Wait before retry (check abort signal periodically for ESC cancellation)
let cancelled = false;
const startTime = Date.now();
while (Date.now() - startTime < delayMs) {
if (
abortControllerRef.current?.signal.aborted ||
userCancelledRef.current
) {
cancelled = true;
break;
}
await new Promise((resolve) => setTimeout(resolve, 100)); // Check every 100ms
}
// Remove status message
buffersRef.current.byId.delete(statusId);
buffersRef.current.order = buffersRef.current.order.filter(
(id) => id !== statusId,
);
refreshDerived();
if (!cancelled) {
// Retry by continuing the while loop (same currentInput)
continue;
}
// User pressed ESC - fall through to error handling
}
// Reset retry counter on non-retriable error (or max retries exceeded)
llmApiErrorRetriesRef.current = 0;
// Mark incomplete tool calls as finished to prevent stuck blinking UI
markIncompleteToolsAsCancelled(buffersRef.current);

View File

@@ -956,6 +956,10 @@ export async function handleHeadlessCommand(
// Unexpected stop reason (error, llm_api_error, etc.)
// Before failing, check run metadata to see if this is a retriable llm_api_error
// Fallback check: in case stop_reason is "error" but metadata indicates LLM error
// This could happen if there's a backend edge case where LLMError is raised but
// stop_reason isn't set correctly. The metadata.error is a LettaErrorMessage with
// error_type="llm_error" for LLM errors (see streaming_service.py:402-411)
if (
stopReason === "error" &&
lastRunId &&
@@ -965,13 +969,13 @@ export async function handleHeadlessCommand(
const run = await client.runs.retrieve(lastRunId);
const metaError = run.metadata?.error as
| {
type?: string;
error_type?: string;
message?: string;
detail?: string;
}
| undefined;
if (metaError?.type === "llm_api_error") {
if (metaError?.error_type === "llm_error") {
const attempt = llmApiErrorRetries + 1;
const baseDelayMs = 1000;
const delayMs = baseDelayMs * 2 ** (attempt - 1);