From d23f0f9328875987c89877886853376c60e54f6d Mon Sep 17 00:00:00 2001 From: Cameron Date: Mon, 9 Mar 2026 13:22:26 -0700 Subject: [PATCH] fix: harden voice memo delivery diagnostics (#536) Co-authored-by: Letta Code --- docs/voice.md | 23 ++++++++++++- skills/voice-memo/lettabot-tts | 46 +++++++++++++++++++++----- src/channels/telegram.test.ts | 46 ++++++++++++++++++++++++++ src/channels/telegram.ts | 37 ++++++++++++++++++--- src/core/bot.ts | 59 ++++++++++++++++++++++++++++++++-- 5 files changed, 195 insertions(+), 16 deletions(-) diff --git a/docs/voice.md b/docs/voice.md index e73d101..0128b85 100644 --- a/docs/voice.md +++ b/docs/voice.md @@ -187,7 +187,28 @@ All environment variables can be overridden by the equivalent YAML config fields 1. Check that a TTS provider is configured -- either in `lettabot.yaml` under `tts` or via `ELEVENLABS_API_KEY` / `OPENAI_API_KEY` 2. Check that `jq` and `curl` are installed (required by the `lettabot-tts` script) -3. Check logs for TTS API errors (HTTP status codes, rate limits) +3. Check logs for voice pipeline events: + - `[Bot] Directive voice: generating memo (...)` + - `[Bot] Directive voice: generated file ...` + - `[Bot] Directive voice failed: ...` + - `[Telegram] sendVoice failed, falling back to sendAudio: ...` +4. Check logs for TTS API errors (HTTP status codes, rate limits) + +### Docker checklist for voice + +For container images, ensure these binaries are available: + +- `bash` (required by `lettabot-tts` shebang) +- `curl` and `jq` (required for TTS API calls) +- `ffmpeg` (recommended for full inbound voice transcription compatibility) +- `ca-certificates` (required for HTTPS API calls) + +Quick runtime validation from inside the container: + +```bash +which bash curl jq ffmpeg +lettabot-tts "TTS health check" +``` ### Telegram voice privacy diff --git a/skills/voice-memo/lettabot-tts b/skills/voice-memo/lettabot-tts index 8c09f1b..6a0caf4 100755 --- a/skills/voice-memo/lettabot-tts +++ b/skills/voice-memo/lettabot-tts @@ -27,6 +27,20 @@ OUTBOUND_DIR="${LETTABOT_WORKING_DIR:-$(pwd)}/data/outbound" PROVIDER="${TTS_PROVIDER:-elevenlabs}" +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "Error: Required command '$1' is not installed or not on PATH" >&2 + exit 1 + fi +} + +preflight() { + require_cmd curl + require_cmd jq +} + +preflight + # Ensure output directory exists mkdir -p "$OUTBOUND_DIR" @@ -52,7 +66,7 @@ tts_elevenlabs() { local model_id="${ELEVENLABS_MODEL_ID:-eleven_multilingual_v2}" local http_code - http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \ + http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \ "https://api.elevenlabs.io/v1/text-to-speech/${voice_id}" \ -H "xi-api-key: ${ELEVENLABS_API_KEY}" \ -H "Content-Type: application/json" \ @@ -67,13 +81,21 @@ tts_elevenlabs() { )") if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then - echo "Error: ElevenLabs API returned HTTP $http_code" >&2 - if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then - cat "$OUTPUT" >&2 + echo "Error: ElevenLabs API returned HTTP $http_code (model=$model_id voice_id=$voice_id)" >&2 + if [ -s "$OUTPUT" ]; then + echo "Error response preview:" >&2 + head -c 2000 "$OUTPUT" >&2 || true + echo >&2 fi rm -f "$OUTPUT" exit 1 fi + + if [ ! -s "$OUTPUT" ]; then + echo "Error: ElevenLabs TTS response was empty" >&2 + rm -f "$OUTPUT" + exit 1 + fi } # --------------------------------------------------------------------------- @@ -89,7 +111,7 @@ tts_openai() { local model="${OPENAI_TTS_MODEL:-tts-1}" local http_code - http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \ + http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \ "https://api.openai.com/v1/audio/speech" \ -H "Authorization: Bearer ${OPENAI_API_KEY}" \ -H "Content-Type: application/json" \ @@ -106,13 +128,21 @@ tts_openai() { )") if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then - echo "Error: OpenAI TTS API returned HTTP $http_code" >&2 - if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then - cat "$OUTPUT" >&2 + echo "Error: OpenAI TTS API returned HTTP $http_code (model=$model voice=$voice)" >&2 + if [ -s "$OUTPUT" ]; then + echo "Error response preview:" >&2 + head -c 2000 "$OUTPUT" >&2 || true + echo >&2 fi rm -f "$OUTPUT" exit 1 fi + + if [ ! -s "$OUTPUT" ]; then + echo "Error: OpenAI TTS response was empty" >&2 + rm -f "$OUTPUT" + exit 1 + fi } # --------------------------------------------------------------------------- diff --git a/src/channels/telegram.test.ts b/src/channels/telegram.test.ts index 4f008e7..6c3f946 100644 --- a/src/channels/telegram.test.ts +++ b/src/channels/telegram.test.ts @@ -32,3 +32,49 @@ describe('TelegramAdapter reactions', () => { ]); }); }); + +describe('TelegramAdapter audio fallback', () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('falls back to sendAudio for VOICE_MESSAGES_FORBIDDEN errors', async () => { + const adapter = new TelegramAdapter({ token: 'test-token' }); + const sendVoice = vi + .spyOn(adapter.getBot().api, 'sendVoice') + .mockRejectedValue({ description: 'Bad Request: VOICE_MESSAGES_FORBIDDEN' } as any); + const sendAudio = vi + .spyOn(adapter.getBot().api, 'sendAudio') + .mockResolvedValue({ message_id: 987 } as any); + + const result = await adapter.sendFile({ + chatId: '123', + filePath: '/tmp/voice.ogg', + kind: 'audio', + }); + + expect(sendVoice).toHaveBeenCalledTimes(1); + expect(sendAudio).toHaveBeenCalledTimes(1); + expect(result).toEqual({ messageId: '987' }); + }); + + it('does not fall back to sendAudio for non-voice transport failures', async () => { + const adapter = new TelegramAdapter({ token: 'test-token' }); + const timeoutError = new Error('socket hang up'); + const sendVoice = vi + .spyOn(adapter.getBot().api, 'sendVoice') + .mockRejectedValue(timeoutError); + const sendAudio = vi + .spyOn(adapter.getBot().api, 'sendAudio') + .mockResolvedValue({ message_id: 999 } as any); + + await expect(adapter.sendFile({ + chatId: '123', + filePath: '/tmp/voice.ogg', + kind: 'audio', + })).rejects.toBe(timeoutError); + + expect(sendVoice).toHaveBeenCalledTimes(1); + expect(sendAudio).not.toHaveBeenCalled(); + }); +}); diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts index 04bc67d..93e6655 100644 --- a/src/channels/telegram.ts +++ b/src/channels/telegram.ts @@ -26,6 +26,27 @@ import { resolveDailyLimits, checkDailyLimit, type GroupModeConfig } from './gro import { createLogger } from '../logger.js'; const log = createLogger('Telegram'); + +function getTelegramErrorReason(err: unknown): string { + if (err && typeof err === 'object') { + const maybeError = err as { description?: string; message?: string }; + if (typeof maybeError.description === 'string' && maybeError.description.trim().length > 0) { + return maybeError.description; + } + if (typeof maybeError.message === 'string' && maybeError.message.trim().length > 0) { + return maybeError.message; + } + } + return String(err); +} + +function shouldFallbackToAudio(err: unknown): boolean { + if (!err || typeof err !== 'object') return false; + const description = (err as { description?: string }).description; + if (typeof description !== 'string') return false; + return description.includes('VOICE_MESSAGES_FORBIDDEN'); +} + export interface TelegramConfig { token: string; dmPolicy?: DmPolicy; // 'pairing' (default), 'allowlist', or 'open' @@ -593,13 +614,21 @@ export class TelegramAdapter implements ChannelAdapter { const result = await this.bot.api.sendVoice(file.chatId, input, { caption }); return { messageId: String(result.message_id) }; } catch (err: any) { - // Fall back to sendAudio if voice messages are restricted (Telegram Premium privacy setting) - if (err?.description?.includes('VOICE_MESSAGES_FORBIDDEN')) { - log.warn('sendVoice forbidden, falling back to sendAudio'); + const reason = getTelegramErrorReason(err); + // Only retry with sendAudio for deterministic voice-policy rejections. + // For network/timeout errors we rethrow to avoid possible duplicate sends. + if (!shouldFallbackToAudio(err)) { + throw err; + } + log.warn('sendVoice failed with VOICE_MESSAGES_FORBIDDEN, falling back to sendAudio:', reason); + try { const result = await this.bot.api.sendAudio(file.chatId, new InputFile(file.filePath), { caption }); return { messageId: String(result.message_id) }; + } catch (fallbackErr: any) { + const fallbackReason = getTelegramErrorReason(fallbackErr); + log.error('sendAudio fallback also failed:', fallbackReason); + throw fallbackErr; } - throw err; } } diff --git a/src/core/bot.ts b/src/core/bot.ts index 5d0a9d5..50e2e85 100644 --- a/src/core/bot.ts +++ b/src/core/bot.ts @@ -461,11 +461,24 @@ export class LettaBot implements AgentSession { .map(dir => join(dir, 'lettabot-tts')) .find(p => existsSync(p)); + const ttsProvider = (process.env.TTS_PROVIDER || 'elevenlabs').toLowerCase(); + const ttsVoice = ttsProvider === 'openai' + ? (process.env.OPENAI_TTS_VOICE || 'alloy') + : (process.env.ELEVENLABS_VOICE_ID || 'onwK4e9ZLuTAKqWW03F9'); + const ttsModel = ttsProvider === 'openai' + ? (process.env.OPENAI_TTS_MODEL || 'tts-1') + : (process.env.ELEVENLABS_MODEL_ID || 'eleven_multilingual_v2'); + if (!ttsPath) { log.warn('Directive voice skipped: lettabot-tts not found in skill dirs'); continue; } + log.info( + `Directive voice: generating memo (provider=${ttsProvider}, model=${ttsModel}, voice=${ttsVoice}, textLen=${directive.text.length})`, + ); + log.info(`Directive voice: helper=${ttsPath}`); + try { const outputPath = await new Promise((resolve, reject) => { execFile(ttsPath, [directive.text], { @@ -474,13 +487,37 @@ export class LettaBot implements AgentSession { timeout: 30_000, }, (err, stdout, stderr) => { if (err) { - reject(new Error(stderr?.trim() || err.message)); + const execErr = new Error(stderr?.trim() || err.message) as Error & { + code?: string | number | null; + signal?: NodeJS.Signals; + stdout?: string; + stderr?: string; + }; + execErr.code = err.code; + execErr.signal = err.signal; + execErr.stdout = stdout?.trim(); + execErr.stderr = stderr?.trim(); + reject(execErr); } else { - resolve(stdout.trim()); + const output = stdout.trim(); + if (!output) { + reject(new Error('lettabot-tts returned an empty output path')); + return; + } + if (stderr?.trim()) { + log.warn('Directive voice: lettabot-tts stderr:', stderr.trim()); + } + resolve(output.split('\n').at(-1)?.trim() || output); } }); }); + const outputStats = await stat(outputPath); + if (!outputStats.isFile()) { + throw new Error(`Generated TTS output is not a file: ${outputPath}`); + } + log.info(`Directive voice: generated file ${outputPath} (${outputStats.size} bytes)`); + await adapter.sendFile({ chatId, filePath: outputPath, @@ -493,7 +530,23 @@ export class LettaBot implements AgentSession { // Clean up generated file try { await unlink(outputPath); } catch {} } catch (err) { - log.warn('Directive voice failed:', err instanceof Error ? err.message : err); + const execErr = err as Error & { + code?: string | number | null; + signal?: NodeJS.Signals; + stdout?: string; + stderr?: string; + }; + log.warn('Directive voice failed:', { + message: execErr?.message || String(err), + code: execErr?.code, + signal: execErr?.signal, + stdout: typeof execErr?.stdout === 'string' ? execErr.stdout.slice(0, 300) : undefined, + stderr: typeof execErr?.stderr === 'string' ? execErr.stderr.slice(0, 1200) : undefined, + provider: ttsProvider, + model: ttsModel, + voice: ttsVoice, + helper: ttsPath, + }); } } }