fix: harden voice memo delivery diagnostics (#536)

Co-authored-by: Letta Code <noreply@letta.com>
2026-03-09 13:22:26 -07:00
parent 7da991206f
commit d23f0f9328
5 changed files with 195 additions and 16 deletions
--- a/docs/voice.md
+++ b/docs/voice.md
@@ -187,7 +187,28 @@ All environment variables can be overridden by the equivalent YAML config fields

 1. Check that a TTS provider is configured -- either in `lettabot.yaml` under `tts` or via `ELEVENLABS_API_KEY` / `OPENAI_API_KEY`
 2. Check that `jq` and `curl` are installed (required by the `lettabot-tts` script)
-3. Check logs for TTS API errors (HTTP status codes, rate limits)
+3. Check logs for voice pipeline events:
+   - `[Bot] Directive voice: generating memo (...)`
+   - `[Bot] Directive voice: generated file ...`
+   - `[Bot] Directive voice failed: ...`
+   - `[Telegram] sendVoice failed with VOICE_MESSAGES_FORBIDDEN, falling back to sendAudio: ...`
+4. Check logs for TTS API errors (HTTP status codes, rate limits)
+
+### Docker checklist for voice
+
+For container images, ensure these binaries are available:
+
+- `bash` (required by `lettabot-tts` shebang)
+- `curl` and `jq` (required for TTS API calls)
+- `ffmpeg` (recommended for full inbound voice transcription compatibility)
+- `ca-certificates` (required for HTTPS API calls)
+
+Quick runtime validation from inside the container:
+
+```bash
+which bash curl jq ffmpeg
+lettabot-tts "TTS health check"
+```

 ### Telegram voice privacy

--- a/skills/voice-memo/lettabot-tts
+++ b/skills/voice-memo/lettabot-tts
@@ -27,6 +27,20 @@ OUTBOUND_DIR="${LETTABOT_WORKING_DIR:-$(pwd)}/data/outbound"

 PROVIDER="${TTS_PROVIDER:-elevenlabs}"

+require_cmd() {  # Abort with an error on stderr unless the command named by "$1" resolves via `command -v`.
+  if ! command -v "$1" >/dev/null 2>&1; then  # Probe quietly: stdout and stderr of the lookup are discarded.
+    echo "Error: Required command '$1' is not installed or not on PATH" >&2
+    exit 1  # Uses exit (not return), so the calling script stops here with status 1.
+  fi
+}
+
+preflight() {  # Fail fast, before any other work, if an external tool checked below is missing.
+  require_cmd curl  # curl performs the TTS HTTP requests made by this script.
+  require_cmd jq  # NOTE(review): jq usage is outside this view; the docs list it as required for TTS API calls.
+}
+
+preflight
+
 # Ensure output directory exists
 mkdir -p "$OUTBOUND_DIR"

@@ -52,7 +66,7 @@ tts_elevenlabs() {
  local model_id="${ELEVENLABS_MODEL_ID:-eleven_multilingual_v2}"

  local http_code
-  http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \
+  http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
    "https://api.elevenlabs.io/v1/text-to-speech/${voice_id}" \
    -H "xi-api-key: ${ELEVENLABS_API_KEY}" \
    -H "Content-Type: application/json" \
@@ -67,13 +81,21 @@ tts_elevenlabs() {
    )")

  if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
-    echo "Error: ElevenLabs API returned HTTP $http_code" >&2
-    if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
-      cat "$OUTPUT" >&2
+    echo "Error: ElevenLabs API returned HTTP $http_code (model=$model_id voice_id=$voice_id)" >&2
+    if [ -s "$OUTPUT" ]; then
+      echo "Error response preview:" >&2
+      head -c 2000 "$OUTPUT" >&2 || true
+      echo >&2
    fi
    rm -f "$OUTPUT"
    exit 1
  fi
+
+  if [ ! -s "$OUTPUT" ]; then
+    echo "Error: ElevenLabs TTS response was empty" >&2
+    rm -f "$OUTPUT"
+    exit 1
+  fi
 }

 # ---------------------------------------------------------------------------
@@ -89,7 +111,7 @@ tts_openai() {
  local model="${OPENAI_TTS_MODEL:-tts-1}"

  local http_code
-  http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \
+  http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
    "https://api.openai.com/v1/audio/speech" \
    -H "Authorization: Bearer ${OPENAI_API_KEY}" \
    -H "Content-Type: application/json" \
@@ -106,13 +128,21 @@ tts_openai() {
    )")

  if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
-    echo "Error: OpenAI TTS API returned HTTP $http_code" >&2
-    if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
-      cat "$OUTPUT" >&2
+    echo "Error: OpenAI TTS API returned HTTP $http_code (model=$model voice=$voice)" >&2
+    if [ -s "$OUTPUT" ]; then
+      echo "Error response preview:" >&2
+      head -c 2000 "$OUTPUT" >&2 || true
+      echo >&2
    fi
    rm -f "$OUTPUT"
    exit 1
  fi
+
+  if [ ! -s "$OUTPUT" ]; then
+    echo "Error: OpenAI TTS response was empty" >&2
+    rm -f "$OUTPUT"
+    exit 1
+  fi
 }

 # ---------------------------------------------------------------------------
--- a/src/channels/telegram.test.ts
+++ b/src/channels/telegram.test.ts
@@ -32,3 +32,49 @@ describe('TelegramAdapter reactions', () => {
    ]);
  });
 });
+
+describe('TelegramAdapter audio fallback', () => { // Covers sendFile's sendVoice -> sendAudio fallback for kind: 'audio'.
+  afterEach(() => {
+    vi.restoreAllMocks(); // Remove the api spies installed by each test so they do not leak between tests.
+  });
+
+  it('falls back to sendAudio for VOICE_MESSAGES_FORBIDDEN errors', async () => {
+    const adapter = new TelegramAdapter({ token: 'test-token' });
+    const sendVoice = vi
+      .spyOn(adapter.getBot().api, 'sendVoice')
+      .mockRejectedValue({ description: 'Bad Request: VOICE_MESSAGES_FORBIDDEN' } as any); // Rejection whose `description` carries the forbidden marker.
+    const sendAudio = vi
+      .spyOn(adapter.getBot().api, 'sendAudio')
+      .mockResolvedValue({ message_id: 987 } as any);
+
+    const result = await adapter.sendFile({
+      chatId: '123',
+      filePath: '/tmp/voice.ogg',
+      kind: 'audio',
+    });
+
+    expect(sendVoice).toHaveBeenCalledTimes(1);
+    expect(sendAudio).toHaveBeenCalledTimes(1); // Exactly one fallback attempt.
+    expect(result).toEqual({ messageId: '987' }); // The id comes from the sendAudio result, stringified.
+  });
+
+  it('does not fall back to sendAudio for non-voice transport failures', async () => {
+    const adapter = new TelegramAdapter({ token: 'test-token' });
+    const timeoutError = new Error('socket hang up'); // Plain Error: has `message` but no `description`.
+    const sendVoice = vi
+      .spyOn(adapter.getBot().api, 'sendVoice')
+      .mockRejectedValue(timeoutError);
+    const sendAudio = vi
+      .spyOn(adapter.getBot().api, 'sendAudio')
+      .mockResolvedValue({ message_id: 999 } as any);
+
+    await expect(adapter.sendFile({
+      chatId: '123',
+      filePath: '/tmp/voice.ogg',
+      kind: 'audio',
+    })).rejects.toBe(timeoutError); // The very same error object must be rethrown, not a wrapper.
+
+    expect(sendVoice).toHaveBeenCalledTimes(1);
+    expect(sendAudio).not.toHaveBeenCalled(); // No retry, so an ambiguous transport failure cannot produce a duplicate send.
+  });
+});
--- a/src/channels/telegram.ts
+++ b/src/channels/telegram.ts
@@ -26,6 +26,27 @@ import { resolveDailyLimits, checkDailyLimit, type GroupModeConfig } from './gro
 import { createLogger } from '../logger.js';

 const log = createLogger('Telegram');
+
+function getTelegramErrorReason(err: unknown): string { // Picks a loggable reason string out of an arbitrary thrown value.
+  if (err && typeof err === 'object') { // Only non-null objects can carry the fields inspected below.
+    const maybeError = err as { description?: string; message?: string };
+    if (typeof maybeError.description === 'string' && maybeError.description.trim().length > 0) {
+      return maybeError.description; // First choice: a non-blank `description`, returned untrimmed.
+    }
+    if (typeof maybeError.message === 'string' && maybeError.message.trim().length > 0) {
+      return maybeError.message; // Second choice: a non-blank `message`, returned untrimmed.
+    }
+  }
+  return String(err); // NOTE(review): for a plain object with neither field this yields "[object Object]".
+}
+
+function shouldFallbackToAudio(err: unknown): boolean { // True only when err.description contains 'VOICE_MESSAGES_FORBIDDEN'.
+  if (!err || typeof err !== 'object') return false; // null, undefined and primitives never qualify.
+  const description = (err as { description?: string }).description;
+  if (typeof description !== 'string') return false; // Errors without a string `description` (e.g. a plain Error) never qualify.
+  return description.includes('VOICE_MESSAGES_FORBIDDEN'); // Substring match; `message` is deliberately not consulted.
+}
+
 export interface TelegramConfig {
  token: string;
  dmPolicy?: DmPolicy;           // 'pairing' (default), 'allowlist', or 'open'
@@ -593,13 +614,21 @@ export class TelegramAdapter implements ChannelAdapter {
        const result = await this.bot.api.sendVoice(file.chatId, input, { caption });
        return { messageId: String(result.message_id) };
      } catch (err: any) {
-        // Fall back to sendAudio if voice messages are restricted (Telegram Premium privacy setting)
-        if (err?.description?.includes('VOICE_MESSAGES_FORBIDDEN')) {
-          log.warn('sendVoice forbidden, falling back to sendAudio');
+        const reason = getTelegramErrorReason(err);
+        // Only retry with sendAudio for deterministic voice-policy rejections.
+        // For network/timeout errors we rethrow to avoid possible duplicate sends.
+        if (!shouldFallbackToAudio(err)) {
+          throw err;
+        }
+        log.warn('sendVoice failed with VOICE_MESSAGES_FORBIDDEN, falling back to sendAudio:', reason);
+        try {
          const result = await this.bot.api.sendAudio(file.chatId, new InputFile(file.filePath), { caption });
          return { messageId: String(result.message_id) };
+        } catch (fallbackErr: any) {
+          const fallbackReason = getTelegramErrorReason(fallbackErr);
+          log.error('sendAudio fallback also failed:', fallbackReason);
+          throw fallbackErr;
        }
-        throw err;
      }
    }

--- a/src/core/bot.ts
+++ b/src/core/bot.ts
@@ -461,11 +461,24 @@ export class LettaBot implements AgentSession {
          .map(dir => join(dir, 'lettabot-tts'))
          .find(p => existsSync(p));

+        const ttsProvider = (process.env.TTS_PROVIDER || 'elevenlabs').toLowerCase();
+        const ttsVoice = ttsProvider === 'openai'
+          ? (process.env.OPENAI_TTS_VOICE || 'alloy')
+          : (process.env.ELEVENLABS_VOICE_ID || 'onwK4e9ZLuTAKqWW03F9');
+        const ttsModel = ttsProvider === 'openai'
+          ? (process.env.OPENAI_TTS_MODEL || 'tts-1')
+          : (process.env.ELEVENLABS_MODEL_ID || 'eleven_multilingual_v2');
+
        if (!ttsPath) {
          log.warn('Directive voice skipped: lettabot-tts not found in skill dirs');
          continue;
        }

+        log.info(
+          `Directive voice: generating memo (provider=${ttsProvider}, model=${ttsModel}, voice=${ttsVoice}, textLen=${directive.text.length})`,
+        );
+        log.info(`Directive voice: helper=${ttsPath}`);
+
        try {
          const outputPath = await new Promise<string>((resolve, reject) => {
            execFile(ttsPath, [directive.text], {
@@ -474,13 +487,37 @@ export class LettaBot implements AgentSession {
              timeout: 30_000,
            }, (err, stdout, stderr) => {
              if (err) {
-                reject(new Error(stderr?.trim() || err.message));
+                const execErr = new Error(stderr?.trim() || err.message) as Error & {
+                  code?: string | number | null;
+                  signal?: NodeJS.Signals;
+                  stdout?: string;
+                  stderr?: string;
+                };
+                execErr.code = err.code;
+                execErr.signal = err.signal;
+                execErr.stdout = stdout?.trim();
+                execErr.stderr = stderr?.trim();
+                reject(execErr);
              } else {
-                resolve(stdout.trim());
+                const output = stdout.trim();
+                if (!output) {
+                  reject(new Error('lettabot-tts returned an empty output path'));
+                  return;
+                }
+                if (stderr?.trim()) {
+                  log.warn('Directive voice: lettabot-tts stderr:', stderr.trim());
+                }
+                resolve(output.split('\n').at(-1)?.trim() || output);
              }
            });
          });

+          const outputStats = await stat(outputPath);
+          if (!outputStats.isFile()) {
+            throw new Error(`Generated TTS output is not a file: ${outputPath}`);
+          }
+          log.info(`Directive voice: generated file ${outputPath} (${outputStats.size} bytes)`);
+
          await adapter.sendFile({
            chatId,
            filePath: outputPath,
@@ -493,7 +530,23 @@ export class LettaBot implements AgentSession {
          // Clean up generated file
          try { await unlink(outputPath); } catch {}
        } catch (err) {
-          log.warn('Directive voice failed:', err instanceof Error ? err.message : err);
+          const execErr = err as Error & {
+            code?: string | number | null;
+            signal?: NodeJS.Signals;
+            stdout?: string;
+            stderr?: string;
+          };
+          log.warn('Directive voice failed:', {
+            message: execErr?.message || String(err),
+            code: execErr?.code,
+            signal: execErr?.signal,
+            stdout: typeof execErr?.stdout === 'string' ? execErr.stdout.slice(0, 300) : undefined,
+            stderr: typeof execErr?.stderr === 'string' ? execErr.stderr.slice(0, 1200) : undefined,
+            provider: ttsProvider,
+            model: ttsModel,
+            voice: ttsVoice,
+            helper: ttsPath,
+          });
        }
      }
    }