feat: add automatic retry for transient LLM API errors (#389)
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -11,6 +11,7 @@ import type {
|
||||
Message,
|
||||
} from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import type { LlmConfig } from "@letta-ai/letta-client/resources/models/models";
|
||||
import type { StopReasonType } from "@letta-ai/letta-client/resources/runs/runs";
|
||||
import { Box, Static, Text } from "ink";
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
|
||||
import {
|
||||
@@ -133,11 +134,41 @@ const CHECK_PENDING_APPROVALS_BEFORE_SEND = true;
|
||||
// When false, wait for backend to send "cancelled" stop_reason (useful for testing backend behavior)
|
||||
const EAGER_CANCEL = true;
|
||||
|
||||
// Maximum retries for transient LLM API errors (matches headless.ts)
|
||||
const LLM_API_ERROR_MAX_RETRIES = 3;
|
||||
|
||||
// tiny helper for unique ids (avoid overwriting prior user lines)
|
||||
function uid(prefix: string) {
|
||||
return `${prefix}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
|
||||
}
|
||||
|
||||
// Check if error is retriable based on stop reason and run metadata
|
||||
async function isRetriableError(
|
||||
stopReason: StopReasonType,
|
||||
lastRunId: string | null | undefined,
|
||||
): Promise<boolean> {
|
||||
// Primary check: backend sets stop_reason=llm_api_error for LLMError exceptions
|
||||
if (stopReason === "llm_api_error") return true;
|
||||
|
||||
// Fallback check: in case stop_reason is "error" but metadata indicates LLM error
|
||||
// This could happen if there's a backend edge case where LLMError is raised but
|
||||
// stop_reason isn't set correctly. The metadata.error is a LettaErrorMessage with
|
||||
// error_type="llm_error" for LLM errors (see streaming_service.py:402-411)
|
||||
if (stopReason === "error" && lastRunId) {
|
||||
try {
|
||||
const client = await getClient();
|
||||
const run = await client.runs.retrieve(lastRunId);
|
||||
const metaError = run.metadata?.error as
|
||||
| { error_type?: string }
|
||||
| undefined;
|
||||
return metaError?.error_type === "llm_error";
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Save current agent as lastAgent before exiting
|
||||
// This ensures subagent overwrites during the session don't persist
|
||||
function saveLastAgentBeforeExit() {
|
||||
@@ -538,6 +569,9 @@ export default function App({
|
||||
// Track if user wants to cancel (persists across state updates)
|
||||
const userCancelledRef = useRef(false);
|
||||
|
||||
// Retry counter for transient LLM API errors (ref for synchronous access in loop)
|
||||
const llmApiErrorRetriesRef = useRef(0);
|
||||
|
||||
// Message queue state for queueing messages during streaming
|
||||
const [messageQueue, setMessageQueue] = useState<string[]>([]);
|
||||
|
||||
@@ -884,6 +918,11 @@ export default function App({
|
||||
}
|
||||
processingConversationRef.current += 1;
|
||||
|
||||
// Reset retry counter for new conversation turns (fresh budget per user message)
|
||||
if (!allowReentry) {
|
||||
llmApiErrorRetriesRef.current = 0;
|
||||
}
|
||||
|
||||
// Track last run ID for error reporting (accessible in catch block)
|
||||
let currentRunId: string | undefined;
|
||||
|
||||
@@ -1007,6 +1046,7 @@ export default function App({
|
||||
// Case 1: Turn ended normally
|
||||
if (stopReason === "end_turn") {
|
||||
setStreaming(false);
|
||||
llmApiErrorRetriesRef.current = 0; // Reset retry counter on success
|
||||
|
||||
// Check if we were waiting for cancel but stream finished naturally
|
||||
if (waitingForQueueCancelRef.current) {
|
||||
@@ -1409,6 +1449,58 @@ export default function App({
|
||||
}
|
||||
|
||||
// Unexpected stop reason (error, llm_api_error, etc.)
|
||||
// Check if this is a retriable error (transient LLM API error)
|
||||
const retriable = await isRetriableError(stopReason, lastRunId);
|
||||
|
||||
if (
|
||||
retriable &&
|
||||
llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES
|
||||
) {
|
||||
llmApiErrorRetriesRef.current += 1;
|
||||
const attempt = llmApiErrorRetriesRef.current;
|
||||
const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s
|
||||
|
||||
// Show subtle grey status message
|
||||
const statusId = uid("status");
|
||||
buffersRef.current.byId.set(statusId, {
|
||||
kind: "status",
|
||||
id: statusId,
|
||||
lines: ["Unexpected downstream LLM API error, retrying..."],
|
||||
});
|
||||
buffersRef.current.order.push(statusId);
|
||||
refreshDerived();
|
||||
|
||||
// Wait before retry (check abort signal periodically for ESC cancellation)
|
||||
let cancelled = false;
|
||||
const startTime = Date.now();
|
||||
while (Date.now() - startTime < delayMs) {
|
||||
if (
|
||||
abortControllerRef.current?.signal.aborted ||
|
||||
userCancelledRef.current
|
||||
) {
|
||||
cancelled = true;
|
||||
break;
|
||||
}
|
||||
await new Promise((resolve) => setTimeout(resolve, 100)); // Check every 100ms
|
||||
}
|
||||
|
||||
// Remove status message
|
||||
buffersRef.current.byId.delete(statusId);
|
||||
buffersRef.current.order = buffersRef.current.order.filter(
|
||||
(id) => id !== statusId,
|
||||
);
|
||||
refreshDerived();
|
||||
|
||||
if (!cancelled) {
|
||||
// Retry by continuing the while loop (same currentInput)
|
||||
continue;
|
||||
}
|
||||
// User pressed ESC - fall through to error handling
|
||||
}
|
||||
|
||||
// Reset retry counter on non-retriable error (or max retries exceeded)
|
||||
llmApiErrorRetriesRef.current = 0;
|
||||
|
||||
// Mark incomplete tool calls as finished to prevent stuck blinking UI
|
||||
markIncompleteToolsAsCancelled(buffersRef.current);
|
||||
|
||||
|
||||
@@ -956,6 +956,10 @@ export async function handleHeadlessCommand(
|
||||
|
||||
// Unexpected stop reason (error, llm_api_error, etc.)
|
||||
// Before failing, check run metadata to see if this is a retriable llm_api_error
|
||||
// Fallback check: in case stop_reason is "error" but metadata indicates LLM error
|
||||
// This could happen if there's a backend edge case where LLMError is raised but
|
||||
// stop_reason isn't set correctly. The metadata.error is a LettaErrorMessage with
|
||||
// error_type="llm_error" for LLM errors (see streaming_service.py:402-411)
|
||||
if (
|
||||
stopReason === "error" &&
|
||||
lastRunId &&
|
||||
@@ -965,13 +969,13 @@ export async function handleHeadlessCommand(
|
||||
const run = await client.runs.retrieve(lastRunId);
|
||||
const metaError = run.metadata?.error as
|
||||
| {
|
||||
type?: string;
|
||||
error_type?: string;
|
||||
message?: string;
|
||||
detail?: string;
|
||||
}
|
||||
| undefined;
|
||||
|
||||
if (metaError?.type === "llm_api_error") {
|
||||
if (metaError?.error_type === "llm_error") {
|
||||
const attempt = llmApiErrorRetries + 1;
|
||||
const baseDelayMs = 1000;
|
||||
const delayMs = baseDelayMs * 2 ** (attempt - 1);
|
||||
|
||||
Reference in New Issue
Block a user