feat: add telemetry tracking for retry-inducing errors (#1131)
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -3491,6 +3491,20 @@ export default function App({
|
||||
: null;
|
||||
const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
|
||||
|
||||
// Log the error that triggered the retry
|
||||
telemetry.trackError(
|
||||
"retry_pre_stream_transient",
|
||||
errorDetail || "Pre-stream transient error",
|
||||
"pre_stream_retry",
|
||||
{
|
||||
httpStatus:
|
||||
preStreamError instanceof APIError
|
||||
? preStreamError.status
|
||||
: undefined,
|
||||
modelId: currentModelId || undefined,
|
||||
},
|
||||
);
|
||||
|
||||
const statusId = uid("status");
|
||||
buffersRef.current.byId.set(statusId, {
|
||||
kind: "status",
|
||||
@@ -4674,6 +4688,19 @@ export default function App({
|
||||
const attempt = llmApiErrorRetriesRef.current;
|
||||
const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s
|
||||
|
||||
// Log the error that triggered the retry
|
||||
telemetry.trackError(
|
||||
"retry_post_stream_error",
|
||||
detailFromRun ||
|
||||
fallbackError ||
|
||||
`Stream stopped: ${stopReasonToHandle}`,
|
||||
"post_stream_retry",
|
||||
{
|
||||
modelId: currentModelId || undefined,
|
||||
runId: lastRunId ?? undefined,
|
||||
},
|
||||
);
|
||||
|
||||
// Show subtle grey status message
|
||||
const statusId = uid("status");
|
||||
const statusLines = [getRetryStatusMessage(detailFromRun)];
|
||||
@@ -4740,9 +4767,9 @@ export default function App({
|
||||
},
|
||||
);
|
||||
|
||||
// If we have a client-side stream error (e.g., JSON parse error), show it directly
|
||||
// Fallback error: no run_id available, show whatever error message we have
|
||||
if (fallbackError) {
|
||||
// If we have a client-side stream error with no run_id, show it directly.
|
||||
// When lastRunId is present, prefer the richer server-side error details below.
|
||||
if (fallbackError && !lastRunId) {
|
||||
setNetworkPhase("error");
|
||||
const errorMsg = lastRunId
|
||||
? `Stream error: ${fallbackError}\n(run_id: ${lastRunId})`
|
||||
|
||||
@@ -223,11 +223,11 @@ export async function drainStream(
|
||||
}
|
||||
}
|
||||
|
||||
// Only set fallbackError if we don't have a run_id - if we have a run_id,
|
||||
// App.tsx will fetch detailed error info from the server which is better
|
||||
if (!streamProcessor.lastRunId) {
|
||||
fallbackError = errorMessage;
|
||||
}
|
||||
// Always capture the client-side error message. Even when we have a run_id
|
||||
// (and App.tsx can fetch server-side detail), the client-side exception is
|
||||
// valuable for telemetry — e.g. stream disconnections where the server run
|
||||
// is still in-progress and has no error metadata yet.
|
||||
fallbackError = errorMessage;
|
||||
|
||||
// Preserve a stop reason already parsed from stream chunks (e.g. llm_api_error)
|
||||
// and only fall back to generic "error" when none is available.
|
||||
|
||||
Reference in New Issue
Block a user