From d6113cab66767f94b7e352f712f603627bac538d Mon Sep 17 00:00:00 2001
From: Cameron <cameron@pfiffer.org>
Date: Wed, 4 Feb 2026 19:31:50 -0800
Subject: [PATCH] fix: graceful transcription fallback when ffmpeg unavailable
 (#155)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: graceful transcription fallback when ffmpeg unavailable

When voice transcription fails (e.g., ffmpeg not installed), the agent
now receives informative error messages instead of silent failures.

Changes:
- transcribeAudio() returns a TranscriptionResult (success/text/error/audioPath) instead of a plain string
- Tiered fallback: try format rename first, then ffmpeg, then fail gracefully
- Check ffmpeg availability once and cache result
- All channel adapters updated to show transcription errors to agent
- Agent can explain to user why transcription failed

Before:
  Agent sees: "[Voice message received]"
  Agent: "I received your voice message but there's no content..."

After:
  Agent sees: "[Voice message - transcription failed: Cannot transcribe .aac format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a). Audio saved to: /path/to/file.aac]"
  Agent: "I couldn't transcribe your voice message because ffmpeg isn't installed. You could type your message instead."

Fixes silent voice transcription failures on systems without ffmpeg.

Written by Cameron ◯ Letta Code

"Fail gracefully, inform clearly." - Error handling wisdom

* fix: handle undefined transcription errors better

* fix: correct API param for tool approval + workaround letta-client type bug
---
 src/channels/discord.ts     |  12 ++-
 src/channels/signal.ts      |  21 ++++-
 src/channels/slack.ts       |  12 ++-
 src/channels/telegram.ts    |  29 +++++--
 src/tools/letta-api.ts      |   6 +-
 src/transcription/index.ts  |   2 +-
 src/transcription/openai.ts | 168 +++++++++++++++++++++++++++++-------
 7 files changed, 202 insertions(+), 48 deletions(-)

diff --git a/src/channels/discord.ts b/src/channels/discord.ts
index 67640b8..836f354 100644
--- a/src/channels/discord.ts
+++ b/src/channels/discord.ts
@@ -161,13 +161,19 @@ Ask the bot owner to approve with:
             
             const { transcribeAudio } = await import('../transcription/index.js');
             const ext = audioAttachment.contentType?.split('/')[1] || 'mp3';
-            const transcript = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
+            const result = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
             
-            console.log(`[Discord] Transcribed audio: "${transcript.slice(0, 50)}..."`);
-            content = (content ? content + '\n' : '') + `[Voice message]: ${transcript}`;
+            if (result.success && result.text) {
+              console.log(`[Discord] Transcribed audio: "${result.text.slice(0, 50)}..."`);
+              content = (content ? content + '\n' : '') + `[Voice message]: ${result.text}`;
+            } else {
+              console.error(`[Discord] Transcription failed: ${result.error}`);
+              content = (content ? content + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
+            }
           }
         } catch (error) {
           console.error('[Discord] Error transcribing audio:', error);
+          content = (content ? content + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
         }
       }
 
diff --git a/src/channels/signal.ts b/src/channels/signal.ts
index 11d4058..389885b 100644
--- a/src/channels/signal.ts
+++ b/src/channels/signal.ts
@@ -628,13 +628,28 @@ This code expires in 1 hour.`;
             
             const { transcribeAudio } = await import('../transcription/index.js');
             const ext = voiceAttachment.contentType?.split('/')[1] || 'ogg';
-            const transcript = await transcribeAudio(buffer, `voice.${ext}`);
+            const result = await transcribeAudio(buffer, `voice.${ext}`, { audioPath: attachmentPath });
             
-            console.log(`[Signal] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
-            messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${transcript}`;
+            if (result.success) {
+              if (result.text) {
+                console.log(`[Signal] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
+                messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${result.text}`;
+              } else {
+                console.warn(`[Signal] Transcription returned empty text`);
+                messageText = (messageText ? messageText + '\n' : '') + `[Voice message - transcription returned empty]`;
+              }
+            } else {
+              const errorMsg = result.error || 'Unknown transcription error';
+              console.error(`[Signal] Transcription failed: ${errorMsg}`);
+              const errorInfo = result.audioPath 
+                ? `[Voice message - transcription failed: ${errorMsg}. Audio saved to: ${result.audioPath}]`
+                : `[Voice message - transcription failed: ${errorMsg}]`;
+              messageText = (messageText ? messageText + '\n' : '') + errorInfo;
+            }
           }
         } catch (error) {
           console.error('[Signal] Error transcribing voice message:', error);
+          messageText = (messageText ? messageText + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
         }
       } else if (attachments?.some(a => a.contentType?.startsWith('audio/'))) {
         // Audio attachment exists but has no ID
diff --git a/src/channels/slack.ts b/src/channels/slack.ts
index 8af82e7..bb54840 100644
--- a/src/channels/slack.ts
+++ b/src/channels/slack.ts
@@ -83,13 +83,19 @@ export class SlackAdapter implements ChannelAdapter {
             
             const { transcribeAudio } = await import('../transcription/index.js');
             const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
-            const transcript = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
+            const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
             
-            console.log(`[Slack] Transcribed audio: "${transcript.slice(0, 50)}..."`);
-            text = (text ? text + '\n' : '') + `[Voice message]: ${transcript}`;
+            if (result.success && result.text) {
+              console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`);
+              text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`;
+            } else {
+              console.error(`[Slack] Transcription failed: ${result.error}`);
+              text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
+            }
           }
         } catch (error) {
           console.error('[Slack] Error transcribing audio:', error);
+          text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
         }
       }
       
diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts
index 2c97fcc..e2d8207 100644
--- a/src/channels/telegram.ts
+++ b/src/channels/telegram.ts
@@ -247,11 +247,18 @@ export class TelegramAdapter implements ChannelAdapter {
 
         // Transcribe
         const { transcribeAudio } = await import('../transcription/index.js');
-        const transcript = await transcribeAudio(buffer, 'voice.ogg');
+        const result = await transcribeAudio(buffer, 'voice.ogg');
 
-        console.log(`[Telegram] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
+        let messageText: string;
+        if (result.success && result.text) {
+          console.log(`[Telegram] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
+          messageText = `[Voice message]: ${result.text}`;
+        } else {
+          console.error(`[Telegram] Transcription failed: ${result.error}`);
+          messageText = `[Voice message - transcription failed: ${result.error}]`;
+        }
 
-        // Send to agent as text with prefix
+        // Send to agent
         if (this.onMessage) {
           await this.onMessage({
             channel: 'telegram',
@@ -259,14 +266,24 @@ export class TelegramAdapter implements ChannelAdapter {
             userId: String(userId),
             userName: ctx.from.username || ctx.from.first_name,
             messageId: String(ctx.message.message_id),
-            text: `[Voice message]: ${transcript}`,
+            text: messageText,
             timestamp: new Date(),
           });
         }
       } catch (error) {
         console.error('[Telegram] Error processing voice message:', error);
-        // Optionally notify user
-        await ctx.reply('Sorry, I could not transcribe that voice message.');
+        // Send error to agent so it can explain
+        if (this.onMessage) {
+          await this.onMessage({
+            channel: 'telegram',
+            chatId: String(chatId),
+            userId: String(userId),
+            userName: ctx.from?.username || ctx.from?.first_name,
+            messageId: String(ctx.message.message_id),
+            text: `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`,
+            timestamp: new Date(),
+          });
+        }
       }
     });
 
diff --git a/src/tools/letta-api.ts b/src/tools/letta-api.ts
index f4d8a0c..255f404 100644
--- a/src/tools/letta-api.ts
+++ b/src/tools/letta-api.ts
@@ -352,10 +352,12 @@ export async function disableToolApproval(
 ): Promise<boolean> {
   try {
     const client = getClient();
+    // Note: API expects 'requires_approval' but client types say 'body_requires_approval'
+    // This is a bug in @letta-ai/letta-client (issue filed); the cast on the call below works around the incorrect type
     await client.agents.tools.updateApproval(toolName, {
       agent_id: agentId,
-      body_requires_approval: false,
-    });
+      requires_approval: false,
+    } as unknown as Parameters<typeof client.agents.tools.updateApproval>[1]);
     console.log(`[Letta API] Disabled approval requirement for tool ${toolName} on agent ${agentId}`);
     return true;
   } catch (e) {
diff --git a/src/transcription/index.ts b/src/transcription/index.ts
index dab5d69..a7d9f2d 100644
--- a/src/transcription/index.ts
+++ b/src/transcription/index.ts
@@ -4,4 +4,4 @@
  * Currently supports OpenAI Whisper. Future providers can be added here.
  */
 
-export { transcribeAudio } from './openai.js';
+export { transcribeAudio, type TranscriptionResult } from './openai.js';
diff --git a/src/transcription/openai.ts b/src/transcription/openai.ts
index c355458..ff4c99e 100644
--- a/src/transcription/openai.ts
+++ b/src/transcription/openai.ts
@@ -1,5 +1,10 @@
 /**
  * OpenAI Whisper transcription service
+ * 
+ * Supports tiered fallback:
+ * 1. Try format rename (AAC → M4A, etc.) - no external deps
+ * 2. Try ffmpeg conversion if available
+ * 3. Return informative error if both fail
  */
 
 import OpenAI from 'openai';
@@ -16,6 +21,16 @@ const CHUNK_DURATION_SECONDS = 600;
 
 let openaiClient: OpenAI | null = null;
 
+/**
+ * Result of a transcription attempt: `text` is set on success; `error` (and `audioPath`, when the caller supplied one) on failure
+ */
+export interface TranscriptionResult {
+  success: boolean;
+  text?: string;
+  error?: string;
+  audioPath?: string;  // Path to original audio (for agent to reference)
+}
+
 function getClient(): OpenAI {
   if (!openaiClient) {
     const config = loadConfig();
@@ -34,40 +49,129 @@ function getModel(): string {
   return config.transcription?.model || process.env.TRANSCRIPTION_MODEL || 'whisper-1';
 }
 
-/**
- * Transcribe audio using OpenAI Whisper API
- * 
- * @param audioBuffer - The audio data as a Buffer
- * @param filename - Filename with extension (e.g., 'voice.ogg')
- * @returns The transcribed text
- */
-export async function transcribeAudio(audioBuffer: Buffer, filename: string = 'audio.ogg'): Promise<string> {
-  const ext = filename.split('.').pop()?.toLowerCase() || '';
-  
-  // Check if format needs conversion (not just renaming)
-  let finalBuffer = audioBuffer;
-  let finalExt = ext;
-  
-  if (NEEDS_CONVERSION.includes(ext)) {
-    console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`);
-    finalBuffer = convertAudioToMp3(audioBuffer, ext);
-    finalExt = 'mp3';
+// Cached ffmpeg availability: null until the first check, after which the stored true/false is reused
+let ffmpegAvailable: boolean | null = null;
+
+function isFfmpegAvailable(): boolean {
+  if (ffmpegAvailable === null) {
+    try {
+      execSync('which ffmpeg', { stdio: 'ignore' });
+      ffmpegAvailable = true;
+    } catch {
+      ffmpegAvailable = false;
+      console.warn('[Transcription] ffmpeg not found - audio conversion will be skipped');
+    }
   }
-  
-  // Check if file is too large and needs chunking
-  if (finalBuffer.length > MAX_FILE_SIZE) {
-    console.log(`[Transcription] File too large (${(finalBuffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
-    return transcribeInChunks(finalBuffer, finalExt);
-  }
-  
-  // Single file transcription
-  return transcribeSingleFile(finalBuffer, filename, finalExt);
+  return ffmpegAvailable;
 }
 
 /**
- * Transcribe a single audio file (under size limit)
+ * Transcribe audio using OpenAI Whisper API
+ * 
+ * Returns a result object instead of throwing, so callers can handle failures gracefully.
+ * 
+ * @param audioBuffer - The audio data as a Buffer
+ * @param filename - Filename with extension (e.g., 'voice.ogg')
+ * @param options - Optional settings; `audioPath` is echoed back on failed results so callers can report where the audio was saved
+ * @returns TranscriptionResult with success/text or error info
  */
-async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: string, ext: string): Promise<string> {
+export async function transcribeAudio(
+  audioBuffer: Buffer, 
+  filename: string = 'audio.ogg',
+  options?: { audioPath?: string }
+): Promise<TranscriptionResult> {
+  const ext = filename.split('.').pop()?.toLowerCase() || '';
+  
+  try {
+    let finalBuffer = audioBuffer;
+    let finalExt = ext;
+    
+    // Check if format needs handling
+    if (NEEDS_CONVERSION.includes(ext)) {
+      // Tier 1: Try format mapping first (just rename, no conversion)
+      const mapped = FORMAT_MAP[ext];
+      if (mapped) {
+        console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`);
+        finalExt = mapped;
+        
+        // Try without conversion first
+        try {
+          const text = await attemptTranscription(finalBuffer, filename, finalExt);
+          return { success: true, text };
+        } catch (renameError) {
+          console.log(`[Transcription] Rename approach failed: ${renameError instanceof Error ? renameError.message : renameError}`);
+          
+          // Tier 2: Try ffmpeg conversion if available
+          if (isFfmpegAvailable()) {
+            console.log(`[Transcription] Attempting ffmpeg conversion .${ext} → .mp3`);
+            try {
+              finalBuffer = convertAudioToMp3(audioBuffer, ext);
+              finalExt = 'mp3';
+              const text = await attemptTranscription(finalBuffer, filename, finalExt);
+              console.log(`[Transcription] Success after conversion, text length: ${text?.length || 0}`);
+              return { success: true, text };
+            } catch (conversionError: unknown) {
+              // Both approaches failed
+              console.error(`[Transcription] Failed after conversion:`, conversionError);
+              const errorMsg = conversionError instanceof Error 
+                ? conversionError.message 
+                : (conversionError ? String(conversionError) : 'Unknown error after conversion');
+              return {
+                success: false,
+                error: `Transcription failed after conversion: ${errorMsg}`,
+                audioPath: options?.audioPath,
+              };
+            }
+          } else {
+            // No ffmpeg, rename failed
+            return {
+              success: false,
+              error: `Cannot transcribe .${ext} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a).`,
+              audioPath: options?.audioPath,
+            };
+          }
+        }
+      } else {
+        // No rename mapping for this extension, so ffmpeg conversion is the only option
+        if (isFfmpegAvailable()) {
+          console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`);
+          finalBuffer = convertAudioToMp3(audioBuffer, ext);
+          finalExt = 'mp3';
+        } else {
+          return {
+            success: false,
+            error: `Unsupported format .${ext} and ffmpeg not available for conversion.`,
+            audioPath: options?.audioPath,
+          };
+        }
+      }
+    }
+    
+    // Check file size and chunk if needed
+    if (finalBuffer.length > MAX_FILE_SIZE) {
+      console.log(`[Transcription] File too large (${(finalBuffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
+      const text = await transcribeInChunks(finalBuffer, finalExt);
+      return { success: true, text };
+    }
+    
+    // Single file transcription
+    const text = await attemptTranscription(finalBuffer, filename, finalExt);
+    return { success: true, text };
+    
+  } catch (error) {
+    const errorMsg = error instanceof Error ? error.message : String(error);
+    return {
+      success: false,
+      error: errorMsg,
+      audioPath: options?.audioPath,
+    };
+  }
+}
+
+/**
+ * Attempt a single transcription (may throw)
+ */
+async function attemptTranscription(audioBuffer: Buffer, originalFilename: string, ext: string): Promise<string> {
   const client = getClient();
   const finalFilename = normalizeFilename(originalFilename.replace(/\.[^.]+$/, `.${ext}`));
   
@@ -87,6 +191,10 @@ async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: strin
  * Split large audio into chunks and transcribe each
  */
 async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
+  if (!isFfmpegAvailable()) {
+    throw new Error('Cannot split large audio files without ffmpeg');
+  }
+  
   const tempDir = join(tmpdir(), 'lettabot-transcription', `chunks-${Date.now()}`);
   mkdirSync(tempDir, { recursive: true });
   
@@ -122,7 +230,7 @@ async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<str
       
       console.log(`[Transcription] Transcribing chunk ${i + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
       
-      const text = await transcribeSingleFile(chunkBuffer, chunkFiles[i], 'mp3');
+      const text = await attemptTranscription(chunkBuffer, chunkFiles[i], 'mp3');
       if (text.trim()) {
         transcriptions.push(text.trim());
       }