From 423215fd565c37c031f408f6547811438440bdff Mon Sep 17 00:00:00 2001
From: jnjpng <jinjpeng@gmail.com>
Date: Tue, 24 Feb 2026 22:20:46 -0800
Subject: [PATCH] feat: add telemetry tracking for retry-inducing errors
 (#1131)

Co-authored-by: Letta <noreply@letta.com>
---
 src/cli/App.tsx           | 33 ++++++++++++++++++++++++++++++---
 src/cli/helpers/stream.ts | 10 +++++-----
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/src/cli/App.tsx b/src/cli/App.tsx
index bbafdeb..7690c84 100644
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -3491,6 +3491,20 @@ export default function App({
                   : null;
               const delayMs = retryAfterMs ?? 1000 * 2 ** (attempt - 1);
 
+              // Report the error that triggered this retry to telemetry
+              telemetry.trackError(
+                "retry_pre_stream_transient",
+                errorDetail || "Pre-stream transient error",
+                "pre_stream_retry",
+                {
+                  httpStatus:
+                    preStreamError instanceof APIError
+                      ? preStreamError.status
+                      : undefined,
+                  modelId: currentModelId || undefined,
+                },
+              );
+
               const statusId = uid("status");
               buffersRef.current.byId.set(statusId, {
                 kind: "status",
@@ -4674,6 +4688,19 @@ export default function App({
             const attempt = llmApiErrorRetriesRef.current;
             const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s
 
+            // Report the error that triggered this retry to telemetry
+            telemetry.trackError(
+              "retry_post_stream_error",
+              detailFromRun ||
+                fallbackError ||
+                `Stream stopped: ${stopReasonToHandle}`,
+              "post_stream_retry",
+              {
+                modelId: currentModelId || undefined,
+                runId: lastRunId ?? undefined,
+              },
+            );
+
             // Show subtle grey status message
             const statusId = uid("status");
             const statusLines = [getRetryStatusMessage(detailFromRun)];
@@ -4740,9 +4767,9 @@ export default function App({
             },
           );
 
-          // If we have a client-side stream error (e.g., JSON parse error), show it directly
-          // Fallback error: no run_id available, show whatever error message we have
-          if (fallbackError) {
+          // No run_id: show the client-side stream error directly; with a run_id, prefer the
+          // richer server-side details below. TODO: the lastRunId ternary in this branch is now dead.
+          if (fallbackError && !lastRunId) {
             setNetworkPhase("error");
             const errorMsg = lastRunId
               ? `Stream error: ${fallbackError}\n(run_id: ${lastRunId})`
diff --git a/src/cli/helpers/stream.ts b/src/cli/helpers/stream.ts
index 02cdb96..2dce382 100644
--- a/src/cli/helpers/stream.ts
+++ b/src/cli/helpers/stream.ts
@@ -223,11 +223,11 @@ export async function drainStream(
       }
     }
 
-    // Only set fallbackError if we don't have a run_id - if we have a run_id,
-    // App.tsx will fetch detailed error info from the server which is better
-    if (!streamProcessor.lastRunId) {
-      fallbackError = errorMessage;
-    }
+    // Always capture the client-side error message. Even when we have a run_id
+    // (and App.tsx can fetch server-side detail), the client-side exception is
+    // valuable for telemetry — e.g. stream disconnections where the server run
+    // is still in-progress and has no error metadata yet.
+    fallbackError = errorMessage;
 
     // Preserve a stop reason already parsed from stream chunks (e.g. llm_api_error)
     // and only fall back to generic "error" when none is available.