fix: don't retry llm_error for 4xx client errors (#725)

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
cthomas
2026-01-28 14:41:27 -08:00
committed by GitHub
parent 03db8545ad
commit 5f5252e5a2
2 changed files with 20 additions and 4 deletions

View File

@@ -375,12 +375,17 @@ async function isRetriableError(
// Check for llm_error at top level or nested (handles error.error nesting)
const errorType = metaError?.error_type ?? metaError?.error?.error_type;
if (errorType === "llm_error") return true;
const detail = metaError?.detail ?? metaError?.error?.detail ?? "";
// Don't retry 4xx client errors (validation, auth, malformed requests)
// These are not transient and won't succeed on retry
const is4xxError = /Error code: 4\d{2}/.test(detail);
if (errorType === "llm_error" && !is4xxError) return true;
// Fallback: detect LLM provider errors from detail even if misclassified
// This handles edge cases where streaming errors weren't properly converted to LLMError
// Patterns are derived from handle_llm_error() message formats in the backend
const detail = metaError?.detail ?? metaError?.error?.detail ?? "";
const llmProviderPatterns = [
"Anthropic API error", // anthropic_client.py:759
"OpenAI API error", // openai_client.py:1034
@@ -390,7 +395,10 @@ async function isRetriableError(
"Network error", // Transient network failures during streaming
"Connection error during Anthropic streaming", // Peer disconnections, incomplete chunked reads
];
if (llmProviderPatterns.some((pattern) => detail.includes(pattern))) {
if (
llmProviderPatterns.some((pattern) => detail.includes(pattern)) &&
!is4xxError
) {
return true;
}

View File

@@ -1507,6 +1507,11 @@ export async function handleHeadlessCommand(
// Fallback: detect LLM provider errors from detail even if misclassified
// Patterns are derived from handle_llm_error() message formats in the backend
const detail = metaError?.detail ?? metaError?.error?.detail ?? "";
// Don't retry 4xx client errors (validation, auth, malformed requests)
// These are not transient and won't succeed on retry
const is4xxError = /Error code: 4\d{2}/.test(detail);
const llmProviderPatterns = [
"Anthropic API error", // anthropic_client.py:759
"OpenAI API error", // openai_client.py:1034
@@ -1520,7 +1525,10 @@ export async function handleHeadlessCommand(
detail.includes(pattern),
);
if (errorType === "llm_error" || isLlmErrorFromDetail) {
if (
(errorType === "llm_error" || isLlmErrorFromDetail) &&
!is4xxError
) {
const attempt = llmApiErrorRetries + 1;
const baseDelayMs = 1000;
const delayMs = baseDelayMs * 2 ** (attempt - 1);