fix: graceful transcription fallback when ffmpeg unavailable (#155)

* fix: graceful transcription fallback when ffmpeg unavailable

When voice transcription fails (e.g., ffmpeg not installed), the agent
now receives informative error messages instead of silent failures.

Changes:
- transcribeAudio() returns TranscriptionResult with success/error/audioPath
- Tiered fallback: try format rename first, then ffmpeg, then fail gracefully
- Check ffmpeg availability once and cache result
- All channel adapters updated to show transcription errors to agent
- Agent can explain to user why transcription failed

Before:
  Agent sees: "[Voice message received]"
  Agent: "I received your voice message but there's no content..."

After:
  Agent sees: "[Voice message - transcription failed: Cannot transcribe .aac format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a). Audio saved to: /path/to/file.aac]"
  Agent: "I couldn't transcribe your voice message because ffmpeg isn't installed. You could type your message instead."

Fixes voice transcription on systems without ffmpeg.

Written by Cameron ◯ Letta Code

"Fail gracefully, inform clearly." - Error handling wisdom

* fix: handle undefined transcription errors better

* fix: correct API param for tool approval + workaround letta-client type bug
This commit is contained in:
Cameron
2026-02-04 19:31:50 -08:00
committed by GitHub
parent b4058f17ce
commit d6113cab66
7 changed files with 202 additions and 48 deletions

View File

@@ -161,13 +161,19 @@ Ask the bot owner to approve with:
const { transcribeAudio } = await import('../transcription/index.js');
const ext = audioAttachment.contentType?.split('/')[1] || 'mp3';
const transcript = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
const result = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
console.log(`[Discord] Transcribed audio: "${transcript.slice(0, 50)}..."`);
content = (content ? content + '\n' : '') + `[Voice message]: ${transcript}`;
if (result.success && result.text) {
console.log(`[Discord] Transcribed audio: "${result.text.slice(0, 50)}..."`);
content = (content ? content + '\n' : '') + `[Voice message]: ${result.text}`;
} else {
console.error(`[Discord] Transcription failed: ${result.error}`);
content = (content ? content + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
}
}
} catch (error) {
console.error('[Discord] Error transcribing audio:', error);
content = (content ? content + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
}
}

View File

@@ -628,13 +628,28 @@ This code expires in 1 hour.`;
const { transcribeAudio } = await import('../transcription/index.js');
const ext = voiceAttachment.contentType?.split('/')[1] || 'ogg';
const transcript = await transcribeAudio(buffer, `voice.${ext}`);
const result = await transcribeAudio(buffer, `voice.${ext}`, { audioPath: attachmentPath });
console.log(`[Signal] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${transcript}`;
if (result.success) {
if (result.text) {
console.log(`[Signal] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${result.text}`;
} else {
console.warn(`[Signal] Transcription returned empty text`);
messageText = (messageText ? messageText + '\n' : '') + `[Voice message - transcription returned empty]`;
}
} else {
const errorMsg = result.error || 'Unknown transcription error';
console.error(`[Signal] Transcription failed: ${errorMsg}`);
const errorInfo = result.audioPath
? `[Voice message - transcription failed: ${errorMsg}. Audio saved to: ${result.audioPath}]`
: `[Voice message - transcription failed: ${errorMsg}]`;
messageText = (messageText ? messageText + '\n' : '') + errorInfo;
}
}
} catch (error) {
console.error('[Signal] Error transcribing voice message:', error);
messageText = (messageText ? messageText + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
}
} else if (attachments?.some(a => a.contentType?.startsWith('audio/'))) {
// Audio attachment exists but has no ID

View File

@@ -83,13 +83,19 @@ export class SlackAdapter implements ChannelAdapter {
const { transcribeAudio } = await import('../transcription/index.js');
const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
const transcript = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
console.log(`[Slack] Transcribed audio: "${transcript.slice(0, 50)}..."`);
text = (text ? text + '\n' : '') + `[Voice message]: ${transcript}`;
if (result.success && result.text) {
console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`);
text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`;
} else {
console.error(`[Slack] Transcription failed: ${result.error}`);
text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
}
}
} catch (error) {
console.error('[Slack] Error transcribing audio:', error);
text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
}
}

View File

@@ -247,11 +247,18 @@ export class TelegramAdapter implements ChannelAdapter {
// Transcribe
const { transcribeAudio } = await import('../transcription/index.js');
const transcript = await transcribeAudio(buffer, 'voice.ogg');
const result = await transcribeAudio(buffer, 'voice.ogg');
console.log(`[Telegram] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
let messageText: string;
if (result.success && result.text) {
console.log(`[Telegram] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
messageText = `[Voice message]: ${result.text}`;
} else {
console.error(`[Telegram] Transcription failed: ${result.error}`);
messageText = `[Voice message - transcription failed: ${result.error}]`;
}
// Send to agent as text with prefix
// Send to agent
if (this.onMessage) {
await this.onMessage({
channel: 'telegram',
@@ -259,14 +266,24 @@ export class TelegramAdapter implements ChannelAdapter {
userId: String(userId),
userName: ctx.from.username || ctx.from.first_name,
messageId: String(ctx.message.message_id),
text: `[Voice message]: ${transcript}`,
text: messageText,
timestamp: new Date(),
});
}
} catch (error) {
console.error('[Telegram] Error processing voice message:', error);
// Optionally notify user
await ctx.reply('Sorry, I could not transcribe that voice message.');
// Send error to agent so it can explain
if (this.onMessage) {
await this.onMessage({
channel: 'telegram',
chatId: String(chatId),
userId: String(userId),
userName: ctx.from?.username || ctx.from?.first_name,
messageId: String(ctx.message.message_id),
text: `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`,
timestamp: new Date(),
});
}
}
});

View File

@@ -352,10 +352,12 @@ export async function disableToolApproval(
): Promise<boolean> {
try {
const client = getClient();
// Note: API expects 'requires_approval' but client types say 'body_requires_approval'
// This is a bug in @letta-ai/letta-client - filed issue, using workaround
await client.agents.tools.updateApproval(toolName, {
agent_id: agentId,
body_requires_approval: false,
});
requires_approval: false,
} as unknown as Parameters<typeof client.agents.tools.updateApproval>[1]);
console.log(`[Letta API] Disabled approval requirement for tool ${toolName} on agent ${agentId}`);
return true;
} catch (e) {

View File

@@ -4,4 +4,4 @@
* Currently supports OpenAI Whisper. Future providers can be added here.
*/
export { transcribeAudio } from './openai.js';
export { transcribeAudio, type TranscriptionResult } from './openai.js';

View File

@@ -1,5 +1,10 @@
/**
* OpenAI Whisper transcription service
*
* Supports tiered fallback:
* 1. Try format rename (AAC → M4A, etc.) - no external deps
* 2. Try ffmpeg conversion if available
* 3. Return informative error if both fail
*/
import OpenAI from 'openai';
@@ -16,6 +21,16 @@ const CHUNK_DURATION_SECONDS = 600;
let openaiClient: OpenAI | null = null;
/**
 * Result of a transcription attempt.
 *
 * On success, `text` holds the transcript. On failure, `error` carries a
 * human-readable reason that the channel adapters forward to the agent
 * (so it can explain the failure to the user instead of seeing silence).
 */
export interface TranscriptionResult {
  success: boolean; // true when transcription produced usable output
  text?: string; // transcript text (present on success)
  error?: string; // failure reason (present when success is false)
  audioPath?: string; // Path to original audio (for agent to reference)
}
function getClient(): OpenAI {
if (!openaiClient) {
const config = loadConfig();
@@ -34,40 +49,129 @@ function getModel(): string {
return config.transcription?.model || process.env.TRANSCRIPTION_MODEL || 'whisper-1';
}
/**
* Transcribe audio using OpenAI Whisper API
*
* @param audioBuffer - The audio data as a Buffer
* @param filename - Filename with extension (e.g., 'voice.ogg')
* @returns The transcribed text
*/
export async function transcribeAudio(audioBuffer: Buffer, filename: string = 'audio.ogg'): Promise<string> {
const ext = filename.split('.').pop()?.toLowerCase() || '';
// Check if format needs conversion (not just renaming)
let finalBuffer = audioBuffer;
let finalExt = ext;
if (NEEDS_CONVERSION.includes(ext)) {
console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`);
finalBuffer = convertAudioToMp3(audioBuffer, ext);
finalExt = 'mp3';
// Cache the ffmpeg availability probe so we shell out at most once per process.
let ffmpegAvailable: boolean | null = null;

/**
 * Check whether ffmpeg is runnable, caching the result.
 *
 * Probes with `ffmpeg -version` rather than `which ffmpeg`: `which` is not
 * available on all platforms (notably Windows), while invoking ffmpeg itself
 * works anywhere it is on the PATH and also catches a present-but-broken binary.
 */
function isFfmpegAvailable(): boolean {
  if (ffmpegAvailable === null) {
    try {
      execSync('ffmpeg -version', { stdio: 'ignore' });
      ffmpegAvailable = true;
    } catch {
      ffmpegAvailable = false;
      console.warn('[Transcription] ffmpeg not found - audio conversion will be skipped');
    }
  }
  return ffmpegAvailable;
}
/**
* Transcribe a single audio file (under size limit)
* Transcribe audio using OpenAI Whisper API
*
* Returns a result object instead of throwing, so callers can handle failures gracefully.
*
* @param audioBuffer - The audio data as a Buffer
* @param filename - Filename with extension (e.g., 'voice.ogg')
* @param options - Optional settings
* @returns TranscriptionResult with success/text or error info
*/
async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: string, ext: string): Promise<string> {
/**
 * Transcribe audio using OpenAI Whisper API.
 *
 * Returns a result object instead of throwing, so callers can handle failures
 * gracefully. Tiered fallback for formats Whisper rejects:
 *   1. Extension remap (e.g. AAC -> M4A) — no external dependencies.
 *   2. ffmpeg conversion to mp3, if ffmpeg is available.
 *   3. Informative error (including where the original audio was saved).
 *
 * Bug fix vs. previous version: every attempt now goes through
 * `transcribeSized`, so the MAX_FILE_SIZE chunking check applies on the
 * rename and conversion paths too — previously those paths returned early
 * and sent oversized files to the API whole.
 *
 * @param audioBuffer - The audio data as a Buffer
 * @param filename - Filename with extension (e.g., 'voice.ogg')
 * @param options - Optional settings (audioPath: where the original file is saved)
 * @returns TranscriptionResult with success/text or error info
 */
export async function transcribeAudio(
  audioBuffer: Buffer,
  filename: string = 'audio.ogg',
  options?: { audioPath?: string }
): Promise<TranscriptionResult> {
  const ext = filename.split('.').pop()?.toLowerCase() || '';

  // Chunk when over the API size limit, otherwise transcribe in one call.
  // May throw; callers below decide how to fall back.
  const transcribeSized = async (buffer: Buffer, finalExt: string): Promise<string> => {
    if (buffer.length > MAX_FILE_SIZE) {
      console.log(`[Transcription] File too large (${(buffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
      return transcribeInChunks(buffer, finalExt);
    }
    return attemptTranscription(buffer, filename, finalExt);
  };

  const failure = (error: string): TranscriptionResult => ({
    success: false,
    error,
    audioPath: options?.audioPath,
  });

  try {
    if (NEEDS_CONVERSION.includes(ext)) {
      const mapped = FORMAT_MAP[ext];
      if (mapped) {
        // Tier 1: try a pure extension remap (no conversion, no external deps).
        console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`);
        try {
          const text = await transcribeSized(audioBuffer, mapped);
          return { success: true, text };
        } catch (renameError) {
          console.log(`[Transcription] Rename approach failed: ${renameError instanceof Error ? renameError.message : renameError}`);
        }
        // Tier 2: ffmpeg conversion, if available.
        if (!isFfmpegAvailable()) {
          return failure(`Cannot transcribe .${ext} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a).`);
        }
        console.log(`[Transcription] Attempting ffmpeg conversion .${ext} → .mp3`);
        try {
          const converted = convertAudioToMp3(audioBuffer, ext);
          const text = await transcribeSized(converted, 'mp3');
          console.log(`[Transcription] Success after conversion, text length: ${text?.length || 0}`);
          return { success: true, text };
        } catch (conversionError: unknown) {
          // Tier 3: both approaches failed — report why.
          console.error(`[Transcription] Failed after conversion:`, conversionError);
          const errorMsg = conversionError instanceof Error
            ? conversionError.message
            : (conversionError ? String(conversionError) : 'Unknown error after conversion');
          return failure(`Transcription failed after conversion: ${errorMsg}`);
        }
      }
      // No extension remap known for this format: conversion is mandatory.
      if (!isFfmpegAvailable()) {
        return failure(`Unsupported format .${ext} and ffmpeg not available for conversion.`);
      }
      console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`);
      const text = await transcribeSized(convertAudioToMp3(audioBuffer, ext), 'mp3');
      return { success: true, text };
    }

    // Natively supported format: transcribe as-is (chunking if oversized).
    const text = await transcribeSized(audioBuffer, ext);
    return { success: true, text };
  } catch (error) {
    const errorMsg = error instanceof Error ? error.message : String(error);
    return failure(errorMsg);
  }
}
/**
* Attempt a single transcription (may throw)
*/
async function attemptTranscription(audioBuffer: Buffer, originalFilename: string, ext: string): Promise<string> {
const client = getClient();
const finalFilename = normalizeFilename(originalFilename.replace(/\.[^.]+$/, `.${ext}`));
@@ -87,6 +191,10 @@ async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: strin
* Split large audio into chunks and transcribe each
*/
async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
if (!isFfmpegAvailable()) {
throw new Error('Cannot split large audio files without ffmpeg');
}
const tempDir = join(tmpdir(), 'lettabot-transcription', `chunks-${Date.now()}`);
mkdirSync(tempDir, { recursive: true });
@@ -122,7 +230,7 @@ async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<str
console.log(`[Transcription] Transcribing chunk ${i + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
const text = await transcribeSingleFile(chunkBuffer, chunkFiles[i], 'mp3');
const text = await attemptTranscription(chunkBuffer, chunkFiles[i], 'mp3');
if (text.trim()) {
transcriptions.push(text.trim());
}