From d6113cab66767f94b7e352f712f603627bac538d Mon Sep 17 00:00:00 2001
From: Cameron
Date: Wed, 4 Feb 2026 19:31:50 -0800
Subject: [PATCH] fix: graceful transcription fallback when ffmpeg unavailable (#155)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: graceful transcription fallback when ffmpeg unavailable

When voice transcription fails (e.g., ffmpeg not installed), the agent now
receives informative error messages instead of silent failures.

Changes:
- transcribeAudio() returns TranscriptionResult with success/error/audioPath
- Tiered fallback: try format rename first, then ffmpeg, then fail gracefully
- Check ffmpeg availability once and cache result
- All channel adapters updated to show transcription errors to agent
- Agent can explain to user why transcription failed

Before:
  Agent sees: "[Voice message received]"
  Agent: "I received your voice message but there's no content..."

After:
  Agent sees: "[Voice message - transcription failed: Cannot transcribe .aac format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a). Audio saved to: /path/to/file.aac]"
  Agent: "I couldn't transcribe your voice message because ffmpeg isn't installed. You could type your message instead."

Fixes voice transcription on systems without ffmpeg.

Written by Cameron ◯ Letta Code

"Fail gracefully, inform clearly."
- Error handling wisdom * fix: handle undefined transcription errors better * fix: correct API param for tool approval + workaround letta-client type bug --- src/channels/discord.ts | 12 ++- src/channels/signal.ts | 21 ++++- src/channels/slack.ts | 12 ++- src/channels/telegram.ts | 29 +++++-- src/tools/letta-api.ts | 6 +- src/transcription/index.ts | 2 +- src/transcription/openai.ts | 168 +++++++++++++++++++++++++++++------- 7 files changed, 202 insertions(+), 48 deletions(-) diff --git a/src/channels/discord.ts b/src/channels/discord.ts index 67640b8..836f354 100644 --- a/src/channels/discord.ts +++ b/src/channels/discord.ts @@ -161,13 +161,19 @@ Ask the bot owner to approve with: const { transcribeAudio } = await import('../transcription/index.js'); const ext = audioAttachment.contentType?.split('/')[1] || 'mp3'; - const transcript = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`); + const result = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`); - console.log(`[Discord] Transcribed audio: "${transcript.slice(0, 50)}..."`); - content = (content ? content + '\n' : '') + `[Voice message]: ${transcript}`; + if (result.success && result.text) { + console.log(`[Discord] Transcribed audio: "${result.text.slice(0, 50)}..."`); + content = (content ? content + '\n' : '') + `[Voice message]: ${result.text}`; + } else { + console.error(`[Discord] Transcription failed: ${result.error}`); + content = (content ? content + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`; + } } } catch (error) { console.error('[Discord] Error transcribing audio:', error); + content = (content ? content + '\n' : '') + `[Voice message - error: ${error instanceof Error ? 
error.message : 'unknown error'}]`; } } diff --git a/src/channels/signal.ts b/src/channels/signal.ts index 11d4058..389885b 100644 --- a/src/channels/signal.ts +++ b/src/channels/signal.ts @@ -628,13 +628,28 @@ This code expires in 1 hour.`; const { transcribeAudio } = await import('../transcription/index.js'); const ext = voiceAttachment.contentType?.split('/')[1] || 'ogg'; - const transcript = await transcribeAudio(buffer, `voice.${ext}`); + const result = await transcribeAudio(buffer, `voice.${ext}`, { audioPath: attachmentPath }); - console.log(`[Signal] Transcribed voice message: "${transcript.slice(0, 50)}..."`); - messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${transcript}`; + if (result.success) { + if (result.text) { + console.log(`[Signal] Transcribed voice message: "${result.text.slice(0, 50)}..."`); + messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${result.text}`; + } else { + console.warn(`[Signal] Transcription returned empty text`); + messageText = (messageText ? messageText + '\n' : '') + `[Voice message - transcription returned empty]`; + } + } else { + const errorMsg = result.error || 'Unknown transcription error'; + console.error(`[Signal] Transcription failed: ${errorMsg}`); + const errorInfo = result.audioPath + ? `[Voice message - transcription failed: ${errorMsg}. Audio saved to: ${result.audioPath}]` + : `[Voice message - transcription failed: ${errorMsg}]`; + messageText = (messageText ? messageText + '\n' : '') + errorInfo; + } } } catch (error) { console.error('[Signal] Error transcribing voice message:', error); + messageText = (messageText ? messageText + '\n' : '') + `[Voice message - error: ${error instanceof Error ? 
error.message : 'unknown error'}]`; } } else if (attachments?.some(a => a.contentType?.startsWith('audio/'))) { // Audio attachment exists but has no ID diff --git a/src/channels/slack.ts b/src/channels/slack.ts index 8af82e7..bb54840 100644 --- a/src/channels/slack.ts +++ b/src/channels/slack.ts @@ -83,13 +83,19 @@ export class SlackAdapter implements ChannelAdapter { const { transcribeAudio } = await import('../transcription/index.js'); const ext = audioFile.mimetype?.split('/')[1] || 'mp3'; - const transcript = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`); + const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`); - console.log(`[Slack] Transcribed audio: "${transcript.slice(0, 50)}..."`); - text = (text ? text + '\n' : '') + `[Voice message]: ${transcript}`; + if (result.success && result.text) { + console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`); + text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`; + } else { + console.error(`[Slack] Transcription failed: ${result.error}`); + text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`; + } } } catch (error) { console.error('[Slack] Error transcribing audio:', error); + text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? 
error.message : 'unknown error'}]`; } } diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts index 2c97fcc..e2d8207 100644 --- a/src/channels/telegram.ts +++ b/src/channels/telegram.ts @@ -247,11 +247,18 @@ export class TelegramAdapter implements ChannelAdapter { // Transcribe const { transcribeAudio } = await import('../transcription/index.js'); - const transcript = await transcribeAudio(buffer, 'voice.ogg'); + const result = await transcribeAudio(buffer, 'voice.ogg'); - console.log(`[Telegram] Transcribed voice message: "${transcript.slice(0, 50)}..."`); + let messageText: string; + if (result.success && result.text) { + console.log(`[Telegram] Transcribed voice message: "${result.text.slice(0, 50)}..."`); + messageText = `[Voice message]: ${result.text}`; + } else { + console.error(`[Telegram] Transcription failed: ${result.error}`); + messageText = `[Voice message - transcription failed: ${result.error}]`; + } - // Send to agent as text with prefix + // Send to agent if (this.onMessage) { await this.onMessage({ channel: 'telegram', @@ -259,14 +266,24 @@ export class TelegramAdapter implements ChannelAdapter { userId: String(userId), userName: ctx.from.username || ctx.from.first_name, messageId: String(ctx.message.message_id), - text: `[Voice message]: ${transcript}`, + text: messageText, timestamp: new Date(), }); } } catch (error) { console.error('[Telegram] Error processing voice message:', error); - // Optionally notify user - await ctx.reply('Sorry, I could not transcribe that voice message.'); + // Send error to agent so it can explain + if (this.onMessage) { + await this.onMessage({ + channel: 'telegram', + chatId: String(chatId), + userId: String(userId), + userName: ctx.from?.username || ctx.from?.first_name, + messageId: String(ctx.message.message_id), + text: `[Voice message - error: ${error instanceof Error ? 
error.message : 'unknown error'}]`, + timestamp: new Date(), + }); + } } }); diff --git a/src/tools/letta-api.ts b/src/tools/letta-api.ts index f4d8a0c..255f404 100644 --- a/src/tools/letta-api.ts +++ b/src/tools/letta-api.ts @@ -352,10 +352,12 @@ export async function disableToolApproval( ): Promise { try { const client = getClient(); + // Note: API expects 'requires_approval' but client types say 'body_requires_approval' + // This is a bug in @letta-ai/letta-client - filed issue, using workaround await client.agents.tools.updateApproval(toolName, { agent_id: agentId, - body_requires_approval: false, - }); + requires_approval: false, + } as unknown as Parameters[1]); console.log(`[Letta API] Disabled approval requirement for tool ${toolName} on agent ${agentId}`); return true; } catch (e) { diff --git a/src/transcription/index.ts b/src/transcription/index.ts index dab5d69..a7d9f2d 100644 --- a/src/transcription/index.ts +++ b/src/transcription/index.ts @@ -4,4 +4,4 @@ * Currently supports OpenAI Whisper. Future providers can be added here. */ -export { transcribeAudio } from './openai.js'; +export { transcribeAudio, type TranscriptionResult } from './openai.js'; diff --git a/src/transcription/openai.ts b/src/transcription/openai.ts index c355458..ff4c99e 100644 --- a/src/transcription/openai.ts +++ b/src/transcription/openai.ts @@ -1,5 +1,10 @@ /** * OpenAI Whisper transcription service + * + * Supports tiered fallback: + * 1. Try format rename (AAC → M4A, etc.) - no external deps + * 2. Try ffmpeg conversion if available + * 3. 
Return informative error if both fail */ import OpenAI from 'openai'; @@ -16,6 +21,16 @@ const CHUNK_DURATION_SECONDS = 600; let openaiClient: OpenAI | null = null; +/** + * Result of a transcription attempt + */ +export interface TranscriptionResult { + success: boolean; + text?: string; + error?: string; + audioPath?: string; // Path to original audio (for agent to reference) +} + function getClient(): OpenAI { if (!openaiClient) { const config = loadConfig(); @@ -34,40 +49,129 @@ function getModel(): string { return config.transcription?.model || process.env.TRANSCRIPTION_MODEL || 'whisper-1'; } -/** - * Transcribe audio using OpenAI Whisper API - * - * @param audioBuffer - The audio data as a Buffer - * @param filename - Filename with extension (e.g., 'voice.ogg') - * @returns The transcribed text - */ -export async function transcribeAudio(audioBuffer: Buffer, filename: string = 'audio.ogg'): Promise { - const ext = filename.split('.').pop()?.toLowerCase() || ''; - - // Check if format needs conversion (not just renaming) - let finalBuffer = audioBuffer; - let finalExt = ext; - - if (NEEDS_CONVERSION.includes(ext)) { - console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`); - finalBuffer = convertAudioToMp3(audioBuffer, ext); - finalExt = 'mp3'; +// Cache ffmpeg availability check +let ffmpegAvailable: boolean | null = null; + +function isFfmpegAvailable(): boolean { + if (ffmpegAvailable === null) { + try { + execSync('which ffmpeg', { stdio: 'ignore' }); + ffmpegAvailable = true; + } catch { + ffmpegAvailable = false; + console.warn('[Transcription] ffmpeg not found - audio conversion will be skipped'); + } } - - // Check if file is too large and needs chunking - if (finalBuffer.length > MAX_FILE_SIZE) { - console.log(`[Transcription] File too large (${(finalBuffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`); - return transcribeInChunks(finalBuffer, finalExt); - } - - // Single file transcription - return 
transcribeSingleFile(finalBuffer, filename, finalExt); + return ffmpegAvailable; } /** - * Transcribe a single audio file (under size limit) + * Transcribe audio using OpenAI Whisper API + * + * Returns a result object instead of throwing, so callers can handle failures gracefully. + * + * @param audioBuffer - The audio data as a Buffer + * @param filename - Filename with extension (e.g., 'voice.ogg') + * @param options - Optional settings + * @returns TranscriptionResult with success/text or error info */ -async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: string, ext: string): Promise { +export async function transcribeAudio( + audioBuffer: Buffer, + filename: string = 'audio.ogg', + options?: { audioPath?: string } +): Promise { + const ext = filename.split('.').pop()?.toLowerCase() || ''; + + try { + let finalBuffer = audioBuffer; + let finalExt = ext; + + // Check if format needs handling + if (NEEDS_CONVERSION.includes(ext)) { + // Tier 1: Try format mapping first (just rename, no conversion) + const mapped = FORMAT_MAP[ext]; + if (mapped) { + console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`); + finalExt = mapped; + + // Try without conversion first + try { + const text = await attemptTranscription(finalBuffer, filename, finalExt); + return { success: true, text }; + } catch (renameError) { + console.log(`[Transcription] Rename approach failed: ${renameError instanceof Error ? 
renameError.message : renameError}`); + + // Tier 2: Try ffmpeg conversion if available + if (isFfmpegAvailable()) { + console.log(`[Transcription] Attempting ffmpeg conversion .${ext} → .mp3`); + try { + finalBuffer = convertAudioToMp3(audioBuffer, ext); + finalExt = 'mp3'; + const text = await attemptTranscription(finalBuffer, filename, finalExt); + console.log(`[Transcription] Success after conversion, text length: ${text?.length || 0}`); + return { success: true, text }; + } catch (conversionError: unknown) { + // Both approaches failed + console.error(`[Transcription] Failed after conversion:`, conversionError); + const errorMsg = conversionError instanceof Error + ? conversionError.message + : (conversionError ? String(conversionError) : 'Unknown error after conversion'); + return { + success: false, + error: `Transcription failed after conversion: ${errorMsg}`, + audioPath: options?.audioPath, + }; + } + } else { + // No ffmpeg, rename failed + return { + success: false, + error: `Cannot transcribe .${ext} format. 
Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a).`, + audioPath: options?.audioPath, + }; + } + } + } else { + // No mapping available + if (isFfmpegAvailable()) { + console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`); + finalBuffer = convertAudioToMp3(audioBuffer, ext); + finalExt = 'mp3'; + } else { + return { + success: false, + error: `Unsupported format .${ext} and ffmpeg not available for conversion.`, + audioPath: options?.audioPath, + }; + } + } + } + + // Check file size and chunk if needed + if (finalBuffer.length > MAX_FILE_SIZE) { + console.log(`[Transcription] File too large (${(finalBuffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`); + const text = await transcribeInChunks(finalBuffer, finalExt); + return { success: true, text }; + } + + // Single file transcription + const text = await attemptTranscription(finalBuffer, filename, finalExt); + return { success: true, text }; + + } catch (error) { + const errorMsg = error instanceof Error ? 
error.message : String(error); + return { + success: false, + error: errorMsg, + audioPath: options?.audioPath, + }; + } +} + +/** + * Attempt a single transcription (may throw) + */ +async function attemptTranscription(audioBuffer: Buffer, originalFilename: string, ext: string): Promise { const client = getClient(); const finalFilename = normalizeFilename(originalFilename.replace(/\.[^.]+$/, `.${ext}`)); @@ -87,6 +191,10 @@ async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: strin * Split large audio into chunks and transcribe each */ async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise { + if (!isFfmpegAvailable()) { + throw new Error('Cannot split large audio files without ffmpeg'); + } + const tempDir = join(tmpdir(), 'lettabot-transcription', `chunks-${Date.now()}`); mkdirSync(tempDir, { recursive: true }); @@ -122,7 +230,7 @@ async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise