feat: add telemetry tracking for retry-inducing errors (#1131)

Co-authored-by: Letta <noreply@letta.com>
2026-02-24 22:20:46 -08:00
parent 35920fbc91
commit 423215fd56
2 changed files with 35 additions and 8 deletions
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -3491,6 +3491,20 @@ export default function App({
                  : null;
              const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);

+              // Report the pre-stream error that triggered this retry to telemetry
+              telemetry.trackError(
+                "retry_pre_stream_transient",
+                errorDetail || "Pre-stream transient error",
+                "pre_stream_retry",
+                {
+                  httpStatus:
+                    preStreamError instanceof APIError
+                      ? preStreamError.status
+                      : undefined,
+                  modelId: currentModelId || undefined,
+                },
+              );
+
              const statusId = uid("status");
              buffersRef.current.byId.set(statusId, {
                kind: "status",
@@ -4674,6 +4688,19 @@ export default function App({
            const attempt = llmApiErrorRetriesRef.current;
            const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s

+            // Report the post-stream error that triggered this retry to telemetry
+            telemetry.trackError(
+              "retry_post_stream_error",
+              detailFromRun ||
+                fallbackError ||
+                `Stream stopped: ${stopReasonToHandle}`,
+              "post_stream_retry",
+              {
+                modelId: currentModelId || undefined,
+                runId: lastRunId ?? undefined,
+              },
+            );
+
            // Show subtle grey status message
            const statusId = uid("status");
            const statusLines = [getRetryStatusMessage(detailFromRun)];
@@ -4740,9 +4767,9 @@ export default function App({
            },
          );

-          // If we have a client-side stream error (e.g., JSON parse error), show it directly
-          // Fallback error: no run_id available, show whatever error message we have
-          if (fallbackError) {
+          // If we have a client-side stream error with no run_id, show it directly.
+          // When lastRunId is present, prefer the richer server-side error details below.
+          if (fallbackError && !lastRunId) {
            setNetworkPhase("error");
            const errorMsg = lastRunId
              ? `Stream error: ${fallbackError}\n(run_id: ${lastRunId})`
--- a/src/cli/helpers/stream.ts
+++ b/src/cli/helpers/stream.ts
@@ -223,11 +223,11 @@ export async function drainStream(
      }
    }

-    // Only set fallbackError if we don't have a run_id - if we have a run_id,
-    // App.tsx will fetch detailed error info from the server which is better
-    if (!streamProcessor.lastRunId) {
-      fallbackError = errorMessage;
-    }
+    // Always capture the client-side error message. Even when we have a run_id
+    // (and App.tsx can fetch server-side detail), the client-side exception is
+    // valuable for telemetry — e.g. stream disconnections where the server run
+    // is still in-progress and has no error metadata yet.
+    fallbackError = errorMessage;

    // Preserve a stop reason already parsed from stream chunks (e.g. llm_api_error)
    // and only fall back to generic "error" when none is available.