From d23f0f9328875987c89877886853376c60e54f6d Mon Sep 17 00:00:00 2001
From: Cameron <cameron@pfiffer.org>
Date: Mon, 9 Mar 2026 13:22:26 -0700
Subject: [PATCH] fix: harden voice memo delivery diagnostics (#536)

Co-authored-by: Letta Code <noreply@letta.com>
---
 docs/voice.md                  | 23 ++++++++++++-
 skills/voice-memo/lettabot-tts | 46 +++++++++++++++++++++-----
 src/channels/telegram.test.ts  | 46 ++++++++++++++++++++++++++
 src/channels/telegram.ts       | 37 ++++++++++++++++++---
 src/core/bot.ts                | 59 ++++++++++++++++++++++++++++++++--
 5 files changed, 195 insertions(+), 16 deletions(-)

diff --git a/docs/voice.md b/docs/voice.md
index e73d101..0128b85 100644
--- a/docs/voice.md
+++ b/docs/voice.md
@@ -187,7 +187,28 @@ All environment variables can be overridden by the equivalent YAML config fields
 
 1. Check that a TTS provider is configured -- either in `lettabot.yaml` under `tts` or via `ELEVENLABS_API_KEY` / `OPENAI_API_KEY`
 2. Check that `jq` and `curl` are installed (required by the `lettabot-tts` script)
-3. Check logs for TTS API errors (HTTP status codes, rate limits)
+3. Check logs for voice pipeline events:
+   - `[Bot] Directive voice: generating memo (...)`
+   - `[Bot] Directive voice: generated file ...`
+   - `[Bot] Directive voice failed: ...`
+   - `[Telegram] sendVoice failed with VOICE_MESSAGES_FORBIDDEN, falling back to sendAudio: ...`
+4. Check logs for TTS API errors (HTTP status codes, rate limits)
+
+### Docker checklist for voice
+
+For container images, ensure these binaries are available:
+
+- `bash` (required by `lettabot-tts` shebang)
+- `curl` and `jq` (required for TTS API calls)
+- `ffmpeg` (recommended for full inbound voice transcription compatibility)
+- `ca-certificates` (required for HTTPS API calls)
+
+Quick runtime validation from inside the container:
+
+```bash
+which bash curl jq ffmpeg
+lettabot-tts "TTS health check"
+```
 
 ### Telegram voice privacy
 
diff --git a/skills/voice-memo/lettabot-tts b/skills/voice-memo/lettabot-tts
index 8c09f1b..6a0caf4 100755
--- a/skills/voice-memo/lettabot-tts
+++ b/skills/voice-memo/lettabot-tts
@@ -27,6 +27,20 @@ OUTBOUND_DIR="${LETTABOT_WORKING_DIR:-$(pwd)}/data/outbound"
 
 PROVIDER="${TTS_PROVIDER:-elevenlabs}"
 
+# Exit with status 1 and a message on stderr unless `command -v` can resolve the name in $1.
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 && return 0
+  echo "Error: Required command '$1' is not installed or not on PATH" >&2
+  exit 1
+}
+
+# Fail fast, before any other work, if curl or jq cannot be found (require_cmd exits on a miss).
+preflight() {
+  require_cmd curl && require_cmd jq
+}
+
+preflight
+
 # Ensure output directory exists
 mkdir -p "$OUTBOUND_DIR"
 
@@ -52,7 +66,7 @@ tts_elevenlabs() {
   local model_id="${ELEVENLABS_MODEL_ID:-eleven_multilingual_v2}"
 
   local http_code
-  http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \
+  http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
     "https://api.elevenlabs.io/v1/text-to-speech/${voice_id}" \
     -H "xi-api-key: ${ELEVENLABS_API_KEY}" \
     -H "Content-Type: application/json" \
@@ -67,13 +81,21 @@ tts_elevenlabs() {
     )")
 
   if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
-    echo "Error: ElevenLabs API returned HTTP $http_code" >&2
-    if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
-      cat "$OUTPUT" >&2
+    echo "Error: ElevenLabs API returned HTTP $http_code (model=$model_id voice_id=$voice_id)" >&2
+    if [ -s "$OUTPUT" ]; then
+      echo "Error response preview:" >&2
+      head -c 2000 "$OUTPUT" >&2 || true
+      echo >&2
     fi
     rm -f "$OUTPUT"
     exit 1
   fi
+
+  if [ ! -s "$OUTPUT" ]; then
+    echo "Error: ElevenLabs TTS response was empty" >&2
+    rm -f "$OUTPUT"
+    exit 1
+  fi
 }
 
 # ---------------------------------------------------------------------------
@@ -89,7 +111,7 @@ tts_openai() {
   local model="${OPENAI_TTS_MODEL:-tts-1}"
 
   local http_code
-  http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \
+  http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
     "https://api.openai.com/v1/audio/speech" \
     -H "Authorization: Bearer ${OPENAI_API_KEY}" \
     -H "Content-Type: application/json" \
@@ -106,13 +128,21 @@ tts_openai() {
     )")
 
   if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
-    echo "Error: OpenAI TTS API returned HTTP $http_code" >&2
-    if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
-      cat "$OUTPUT" >&2
+    echo "Error: OpenAI TTS API returned HTTP $http_code (model=$model voice=$voice)" >&2
+    if [ -s "$OUTPUT" ]; then
+      echo "Error response preview:" >&2
+      head -c 2000 "$OUTPUT" >&2 || true
+      echo >&2
     fi
     rm -f "$OUTPUT"
     exit 1
   fi
+
+  if [ ! -s "$OUTPUT" ]; then
+    echo "Error: OpenAI TTS response was empty" >&2
+    rm -f "$OUTPUT"
+    exit 1
+  fi
 }
 
 # ---------------------------------------------------------------------------
diff --git a/src/channels/telegram.test.ts b/src/channels/telegram.test.ts
index 4f008e7..6c3f946 100644
--- a/src/channels/telegram.test.ts
+++ b/src/channels/telegram.test.ts
@@ -32,3 +32,49 @@ describe('TelegramAdapter reactions', () => {
     ]);
   });
 });
+
+// sendFile tries sendVoice first for audio; sendAudio is only a fallback for
+// VOICE_MESSAGES_FORBIDDEN, and any other sendVoice failure must propagate unchanged.
+describe('TelegramAdapter audio fallback', () => {
+  const voiceFile = { chatId: '123', filePath: '/tmp/voice.ogg', kind: 'audio' } as const;
+
+  // Builds an adapter whose sendVoice rejects with `voiceError` and whose sendAudio resolves.
+  function stubApi(voiceError: unknown, audioMessageId: number) {
+    const adapter = new TelegramAdapter({ token: 'test-token' });
+    const api = adapter.getBot().api;
+    const sendVoice = vi.spyOn(api, 'sendVoice').mockRejectedValue(voiceError as any);
+    const sendAudio = vi
+      .spyOn(api, 'sendAudio')
+      .mockResolvedValue({ message_id: audioMessageId } as any);
+    return { adapter, sendVoice, sendAudio };
+  }
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('falls back to sendAudio for VOICE_MESSAGES_FORBIDDEN errors', async () => {
+    // The rejection carries its reason in `description`, the field the fallback check reads.
+    const { adapter, sendVoice, sendAudio } = stubApi(
+      { description: 'Bad Request: VOICE_MESSAGES_FORBIDDEN' },
+      987,
+    );
+
+    const result = await adapter.sendFile({ ...voiceFile });
+
+    expect(sendVoice).toHaveBeenCalledTimes(1);
+    expect(sendAudio).toHaveBeenCalledTimes(1);
+    expect(result).toEqual({ messageId: '987' });
+  });
+
+  it('does not fall back to sendAudio for non-voice transport failures', async () => {
+    const timeoutError = new Error('socket hang up');
+    const { adapter, sendVoice, sendAudio } = stubApi(timeoutError, 999);
+
+    // A transport error is rethrown as the same object, never retried through sendAudio.
+    await expect(adapter.sendFile({ ...voiceFile })).rejects.toBe(timeoutError);
+
+    expect(sendVoice).toHaveBeenCalledTimes(1);
+    expect(sendAudio).not.toHaveBeenCalled();
+  });
+});
diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts
index 04bc67d..93e6655 100644
--- a/src/channels/telegram.ts
+++ b/src/channels/telegram.ts
@@ -26,6 +26,27 @@ import { resolveDailyLimits, checkDailyLimit, type GroupModeConfig } from './gro
 import { createLogger } from '../logger.js';
 
 const log = createLogger('Telegram');
+
+/** Returns a loggable reason: the error's `description`, then `message`, else a stringified form. */
+function getTelegramErrorReason(err: unknown): string {
+  if (!err || typeof err !== 'object') return String(err);
+  const { description, message } = err as { description?: unknown; message?: unknown };
+  for (const text of [description, message]) {
+    if (typeof text === 'string' && text.trim().length > 0) return text;
+  }
+  // Plain objects stringify to "[object Object]", which says nothing in a log line; use JSON.
+  const fallback = String(err);
+  if (fallback !== '[object Object]') return fallback;
+  try { return JSON.stringify(err) ?? fallback; } catch { return fallback; }
+}
+
+/** True only when `err.description` contains VOICE_MESSAGES_FORBIDDEN, the one case retried via sendAudio. */
+function shouldFallbackToAudio(err: unknown): boolean {
+  if (err === null || typeof err !== 'object') return false;
+  const { description } = err as { description?: unknown };
+  return typeof description === 'string' && description.includes('VOICE_MESSAGES_FORBIDDEN');
+}
+
 export interface TelegramConfig {
   token: string;
   dmPolicy?: DmPolicy;           // 'pairing' (default), 'allowlist', or 'open'
@@ -593,13 +614,21 @@ export class TelegramAdapter implements ChannelAdapter {
         const result = await this.bot.api.sendVoice(file.chatId, input, { caption });
         return { messageId: String(result.message_id) };
       } catch (err: any) {
-        // Fall back to sendAudio if voice messages are restricted (Telegram Premium privacy setting)
-        if (err?.description?.includes('VOICE_MESSAGES_FORBIDDEN')) {
-          log.warn('sendVoice forbidden, falling back to sendAudio');
+        const reason = getTelegramErrorReason(err);
+        // Only retry with sendAudio for deterministic voice-policy rejections.
+        // For network/timeout errors we rethrow to avoid possible duplicate sends.
+        if (!shouldFallbackToAudio(err)) {
+          throw err;
+        }
+        log.warn('sendVoice failed with VOICE_MESSAGES_FORBIDDEN, falling back to sendAudio:', reason);
+        try {
           const result = await this.bot.api.sendAudio(file.chatId, new InputFile(file.filePath), { caption });
           return { messageId: String(result.message_id) };
+        } catch (fallbackErr: any) {
+          const fallbackReason = getTelegramErrorReason(fallbackErr);
+          log.error('sendAudio fallback also failed:', fallbackReason);
+          throw fallbackErr;
         }
-        throw err;
       }
     }
 
diff --git a/src/core/bot.ts b/src/core/bot.ts
index 5d0a9d5..50e2e85 100644
--- a/src/core/bot.ts
+++ b/src/core/bot.ts
@@ -461,11 +461,24 @@ export class LettaBot implements AgentSession {
           .map(dir => join(dir, 'lettabot-tts'))
           .find(p => existsSync(p));
 
+        const ttsProvider = (process.env.TTS_PROVIDER || 'elevenlabs').toLowerCase();
+        const ttsVoice = ttsProvider === 'openai'
+          ? (process.env.OPENAI_TTS_VOICE || 'alloy')
+          : (process.env.ELEVENLABS_VOICE_ID || 'onwK4e9ZLuTAKqWW03F9');
+        const ttsModel = ttsProvider === 'openai'
+          ? (process.env.OPENAI_TTS_MODEL || 'tts-1')
+          : (process.env.ELEVENLABS_MODEL_ID || 'eleven_multilingual_v2');
+
         if (!ttsPath) {
           log.warn('Directive voice skipped: lettabot-tts not found in skill dirs');
           continue;
         }
 
+        log.info(
+          `Directive voice: generating memo (provider=${ttsProvider}, model=${ttsModel}, voice=${ttsVoice}, textLen=${directive.text.length})`,
+        );
+        log.info(`Directive voice: helper=${ttsPath}`);
+
         try {
           const outputPath = await new Promise<string>((resolve, reject) => {
             execFile(ttsPath, [directive.text], {
@@ -474,13 +487,37 @@ export class LettaBot implements AgentSession {
               timeout: 30_000,
             }, (err, stdout, stderr) => {
               if (err) {
-                reject(new Error(stderr?.trim() || err.message));
+                const execErr = new Error(stderr?.trim() || err.message) as Error & {
+                  code?: string | number | null;
+                  signal?: NodeJS.Signals;
+                  stdout?: string;
+                  stderr?: string;
+                };
+                execErr.code = err.code;
+                execErr.signal = err.signal;
+                execErr.stdout = stdout?.trim();
+                execErr.stderr = stderr?.trim();
+                reject(execErr);
               } else {
-                resolve(stdout.trim());
+                const output = stdout.trim();
+                if (!output) {
+                  reject(new Error('lettabot-tts returned an empty output path'));
+                  return;
+                }
+                if (stderr?.trim()) {
+                  log.warn('Directive voice: lettabot-tts stderr:', stderr.trim());
+                }
+                resolve(output.split('\n').at(-1)?.trim() || output);
               }
             });
           });
 
+          const outputStats = await stat(outputPath);
+          if (!outputStats.isFile()) {
+            throw new Error(`Generated TTS output is not a file: ${outputPath}`);
+          }
+          log.info(`Directive voice: generated file ${outputPath} (${outputStats.size} bytes)`);
+
           await adapter.sendFile({
             chatId,
             filePath: outputPath,
@@ -493,7 +530,23 @@ export class LettaBot implements AgentSession {
           // Clean up generated file
           try { await unlink(outputPath); } catch {}
         } catch (err) {
-          log.warn('Directive voice failed:', err instanceof Error ? err.message : err);
+          const execErr = err as Error & {
+            code?: string | number | null;
+            signal?: NodeJS.Signals;
+            stdout?: string;
+            stderr?: string;
+          };
+          log.warn('Directive voice failed:', {
+            message: execErr?.message || String(err),
+            code: execErr?.code,
+            signal: execErr?.signal,
+            stdout: typeof execErr?.stdout === 'string' ? execErr.stdout.slice(0, 300) : undefined,
+            stderr: typeof execErr?.stderr === 'string' ? execErr.stderr.slice(0, 1200) : undefined,
+            provider: ttsProvider,
+            model: ttsModel,
+            voice: ttsVoice,
+            helper: ttsPath,
+          });
         }
       }
     }