fix: graceful transcription fallback when ffmpeg unavailable (#155)

* fix: graceful transcription fallback when ffmpeg unavailable

When voice transcription fails (e.g., ffmpeg not installed), the agent
now receives informative error messages instead of silent failures.

Changes:
- transcribeAudio() returns TranscriptionResult with success/error/audioPath
- Tiered fallback: try format rename first, then ffmpeg, then fail gracefully
- Check ffmpeg availability once and cache result
- All channel adapters updated to show transcription errors to agent
- Agent can explain to user why transcription failed

Before:
  Agent sees: "[Voice message received]"
  Agent: "I received your voice message but there's no content..."

After:
  Agent sees: "[Voice message - transcription failed: Cannot transcribe .aac format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a). Audio saved to: /path/to/file.aac]"
  Agent: "I couldn't transcribe your voice message because ffmpeg isn't installed. You could type your message instead."

Fixes voice transcription on systems without ffmpeg.

Written by Cameron ◯ Letta Code

"Fail gracefully, inform clearly." - Error handling wisdom

* fix: handle undefined transcription errors better

* fix: correct API param for tool approval + workaround letta-client type bug
This commit is contained in:
Cameron
2026-02-04 19:31:50 -08:00
committed by GitHub
parent b4058f17ce
commit d6113cab66
7 changed files with 202 additions and 48 deletions

View File

@@ -161,13 +161,19 @@ Ask the bot owner to approve with:
const { transcribeAudio } = await import('../transcription/index.js');
const ext = audioAttachment.contentType?.split('/')[1] || 'mp3';
const transcript = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
const result = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
console.log(`[Discord] Transcribed audio: "${transcript.slice(0, 50)}..."`);
content = (content ? content + '\n' : '') + `[Voice message]: ${transcript}`;
if (result.success && result.text) {
console.log(`[Discord] Transcribed audio: "${result.text.slice(0, 50)}..."`);
content = (content ? content + '\n' : '') + `[Voice message]: ${result.text}`;
} else {
console.error(`[Discord] Transcription failed: ${result.error}`);
content = (content ? content + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
}
}
} catch (error) {
console.error('[Discord] Error transcribing audio:', error);
content = (content ? content + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
}
}

View File

@@ -628,13 +628,28 @@ This code expires in 1 hour.`;
const { transcribeAudio } = await import('../transcription/index.js');
const ext = voiceAttachment.contentType?.split('/')[1] || 'ogg';
const transcript = await transcribeAudio(buffer, `voice.${ext}`);
const result = await transcribeAudio(buffer, `voice.${ext}`, { audioPath: attachmentPath });
console.log(`[Signal] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${transcript}`;
if (result.success) {
if (result.text) {
console.log(`[Signal] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${result.text}`;
} else {
console.warn(`[Signal] Transcription returned empty text`);
messageText = (messageText ? messageText + '\n' : '') + `[Voice message - transcription returned empty]`;
}
} else {
const errorMsg = result.error || 'Unknown transcription error';
console.error(`[Signal] Transcription failed: ${errorMsg}`);
const errorInfo = result.audioPath
? `[Voice message - transcription failed: ${errorMsg}. Audio saved to: ${result.audioPath}]`
: `[Voice message - transcription failed: ${errorMsg}]`;
messageText = (messageText ? messageText + '\n' : '') + errorInfo;
}
}
} catch (error) {
console.error('[Signal] Error transcribing voice message:', error);
messageText = (messageText ? messageText + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
}
} else if (attachments?.some(a => a.contentType?.startsWith('audio/'))) {
// Audio attachment exists but has no ID

View File

@@ -83,13 +83,19 @@ export class SlackAdapter implements ChannelAdapter {
const { transcribeAudio } = await import('../transcription/index.js');
const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
const transcript = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
console.log(`[Slack] Transcribed audio: "${transcript.slice(0, 50)}..."`);
text = (text ? text + '\n' : '') + `[Voice message]: ${transcript}`;
if (result.success && result.text) {
console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`);
text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`;
} else {
console.error(`[Slack] Transcription failed: ${result.error}`);
text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
}
}
} catch (error) {
console.error('[Slack] Error transcribing audio:', error);
text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
}
}

View File

@@ -247,11 +247,18 @@ export class TelegramAdapter implements ChannelAdapter {
// Transcribe
const { transcribeAudio } = await import('../transcription/index.js');
const transcript = await transcribeAudio(buffer, 'voice.ogg');
const result = await transcribeAudio(buffer, 'voice.ogg');
console.log(`[Telegram] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
let messageText: string;
if (result.success && result.text) {
console.log(`[Telegram] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
messageText = `[Voice message]: ${result.text}`;
} else {
console.error(`[Telegram] Transcription failed: ${result.error}`);
messageText = `[Voice message - transcription failed: ${result.error}]`;
}
// Send to agent as text with prefix
// Send to agent
if (this.onMessage) {
await this.onMessage({
channel: 'telegram',
@@ -259,14 +266,24 @@ export class TelegramAdapter implements ChannelAdapter {
userId: String(userId),
userName: ctx.from.username || ctx.from.first_name,
messageId: String(ctx.message.message_id),
text: `[Voice message]: ${transcript}`,
text: messageText,
timestamp: new Date(),
});
}
} catch (error) {
console.error('[Telegram] Error processing voice message:', error);
// Optionally notify user
await ctx.reply('Sorry, I could not transcribe that voice message.');
// Send error to agent so it can explain
if (this.onMessage) {
await this.onMessage({
channel: 'telegram',
chatId: String(chatId),
userId: String(userId),
userName: ctx.from?.username || ctx.from?.first_name,
messageId: String(ctx.message.message_id),
text: `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`,
timestamp: new Date(),
});
}
}
});

View File

@@ -352,10 +352,12 @@ export async function disableToolApproval(
): Promise<boolean> {
try {
const client = getClient();
// Note: API expects 'requires_approval' but client types say 'body_requires_approval'
// This is a bug in @letta-ai/letta-client - filed issue, using workaround
await client.agents.tools.updateApproval(toolName, {
agent_id: agentId,
body_requires_approval: false,
});
requires_approval: false,
} as unknown as Parameters<typeof client.agents.tools.updateApproval>[1]);
console.log(`[Letta API] Disabled approval requirement for tool ${toolName} on agent ${agentId}`);
return true;
} catch (e) {

View File

@@ -4,4 +4,4 @@
* Currently supports OpenAI Whisper. Future providers can be added here.
*/
export { transcribeAudio } from './openai.js';
export { transcribeAudio, type TranscriptionResult } from './openai.js';

View File

@@ -1,5 +1,10 @@
/**
* OpenAI Whisper transcription service
*
* Supports tiered fallback:
* 1. Try format rename (AAC → M4A, etc.) - no external deps
* 2. Try ffmpeg conversion if available
* 3. Return informative error if both fail
*/
import OpenAI from 'openai';
@@ -16,6 +21,16 @@ const CHUNK_DURATION_SECONDS = 600;
let openaiClient: OpenAI | null = null;
/**
 * Result of a transcription attempt.
 *
 * On success, `text` holds the transcript. On failure, `error` carries a
 * human-readable reason that the channel adapters forward to the agent
 * (so it can explain the failure to the user instead of seeing silence).
 */
export interface TranscriptionResult {
  success: boolean; // true when transcription produced usable output
  text?: string; // transcript text (present on success)
  error?: string; // failure reason (present when success is false)
  audioPath?: string; // Path to original audio (for agent to reference)
}
function getClient(): OpenAI {
if (!openaiClient) {
const config = loadConfig();
@@ -34,40 +49,129 @@ function getModel(): string {
return config.transcription?.model || process.env.TRANSCRIPTION_MODEL || 'whisper-1';
}
/**
* Transcribe audio using OpenAI Whisper API
*
* @param audioBuffer - The audio data as a Buffer
* @param filename - Filename with extension (e.g., 'voice.ogg')
* @returns The transcribed text
*/
export async function transcribeAudio(audioBuffer: Buffer, filename: string = 'audio.ogg'): Promise<string> {
const ext = filename.split('.').pop()?.toLowerCase() || '';
// Check if format needs conversion (not just renaming)
let finalBuffer = audioBuffer;
let finalExt = ext;
if (NEEDS_CONVERSION.includes(ext)) {
console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`);
finalBuffer = convertAudioToMp3(audioBuffer, ext);
finalExt = 'mp3';
// Cache the ffmpeg availability probe so we shell out at most once per process.
let ffmpegAvailable: boolean | null = null;

/**
 * Check whether ffmpeg is runnable, caching the result.
 *
 * Probes with `ffmpeg -version` rather than `which ffmpeg`: `which` is not
 * available on all platforms (notably Windows), while invoking ffmpeg itself
 * works anywhere it is on the PATH and also catches a present-but-broken binary.
 */
function isFfmpegAvailable(): boolean {
  if (ffmpegAvailable === null) {
    try {
      execSync('ffmpeg -version', { stdio: 'ignore' });
      ffmpegAvailable = true;
    } catch {
      ffmpegAvailable = false;
      console.warn('[Transcription] ffmpeg not found - audio conversion will be skipped');
    }
  }
  return ffmpegAvailable;
}
/**
* Transcribe a single audio file (under size limit)
* Transcribe audio using OpenAI Whisper API
*
* Returns a result object instead of throwing, so callers can handle failures gracefully.
*
* @param audioBuffer - The audio data as a Buffer
* @param filename - Filename with extension (e.g., 'voice.ogg')
* @param options - Optional settings
* @returns TranscriptionResult with success/text or error info
*/
async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: string, ext: string): Promise<string> {
/**
 * Transcribe audio using OpenAI Whisper API.
 *
 * Returns a result object instead of throwing, so callers can handle failures
 * gracefully. Tiered fallback for formats Whisper rejects:
 *   1. Extension remap (e.g. AAC -> M4A) — no external dependencies.
 *   2. ffmpeg conversion to mp3, if ffmpeg is available.
 *   3. Informative error (including where the original audio was saved).
 *
 * Bug fix vs. previous version: every attempt now goes through
 * `transcribeSized`, so the MAX_FILE_SIZE chunking check applies on the
 * rename and conversion paths too — previously those paths returned early
 * and sent oversized files to the API whole.
 *
 * @param audioBuffer - The audio data as a Buffer
 * @param filename - Filename with extension (e.g., 'voice.ogg')
 * @param options - Optional settings (audioPath: where the original file is saved)
 * @returns TranscriptionResult with success/text or error info
 */
export async function transcribeAudio(
  audioBuffer: Buffer,
  filename: string = 'audio.ogg',
  options?: { audioPath?: string }
): Promise<TranscriptionResult> {
  const ext = filename.split('.').pop()?.toLowerCase() || '';

  // Chunk when over the API size limit, otherwise transcribe in one call.
  // May throw; callers below decide how to fall back.
  const transcribeSized = async (buffer: Buffer, finalExt: string): Promise<string> => {
    if (buffer.length > MAX_FILE_SIZE) {
      console.log(`[Transcription] File too large (${(buffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
      return transcribeInChunks(buffer, finalExt);
    }
    return attemptTranscription(buffer, filename, finalExt);
  };

  const failure = (error: string): TranscriptionResult => ({
    success: false,
    error,
    audioPath: options?.audioPath,
  });

  try {
    if (NEEDS_CONVERSION.includes(ext)) {
      const mapped = FORMAT_MAP[ext];
      if (mapped) {
        // Tier 1: try a pure extension remap (no conversion, no external deps).
        console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`);
        try {
          const text = await transcribeSized(audioBuffer, mapped);
          return { success: true, text };
        } catch (renameError) {
          console.log(`[Transcription] Rename approach failed: ${renameError instanceof Error ? renameError.message : renameError}`);
        }
        // Tier 2: ffmpeg conversion, if available.
        if (!isFfmpegAvailable()) {
          return failure(`Cannot transcribe .${ext} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a).`);
        }
        console.log(`[Transcription] Attempting ffmpeg conversion .${ext} → .mp3`);
        try {
          const converted = convertAudioToMp3(audioBuffer, ext);
          const text = await transcribeSized(converted, 'mp3');
          console.log(`[Transcription] Success after conversion, text length: ${text?.length || 0}`);
          return { success: true, text };
        } catch (conversionError: unknown) {
          // Tier 3: both approaches failed — report why.
          console.error(`[Transcription] Failed after conversion:`, conversionError);
          const errorMsg = conversionError instanceof Error
            ? conversionError.message
            : (conversionError ? String(conversionError) : 'Unknown error after conversion');
          return failure(`Transcription failed after conversion: ${errorMsg}`);
        }
      }
      // No extension remap known for this format: conversion is mandatory.
      if (!isFfmpegAvailable()) {
        return failure(`Unsupported format .${ext} and ffmpeg not available for conversion.`);
      }
      console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`);
      const text = await transcribeSized(convertAudioToMp3(audioBuffer, ext), 'mp3');
      return { success: true, text };
    }

    // Natively supported format: transcribe as-is (chunking if oversized).
    const text = await transcribeSized(audioBuffer, ext);
    return { success: true, text };
  } catch (error) {
    const errorMsg = error instanceof Error ? error.message : String(error);
    return failure(errorMsg);
  }
}
/**
* Attempt a single transcription (may throw)
*/
async function attemptTranscription(audioBuffer: Buffer, originalFilename: string, ext: string): Promise<string> {
const client = getClient();
const finalFilename = normalizeFilename(originalFilename.replace(/\.[^.]+$/, `.${ext}`));
@@ -87,6 +191,10 @@ async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: strin
* Split large audio into chunks and transcribe each
*/
async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
if (!isFfmpegAvailable()) {
throw new Error('Cannot split large audio files without ffmpeg');
}
const tempDir = join(tmpdir(), 'lettabot-transcription', `chunks-${Date.now()}`);
mkdirSync(tempDir, { recursive: true });
@@ -122,7 +230,7 @@ async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<str
console.log(`[Transcription] Transcribing chunk ${i + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
const text = await transcribeSingleFile(chunkBuffer, chunkFiles[i], 'mp3');
const text = await attemptTranscription(chunkBuffer, chunkFiles[i], 'mp3');
if (text.trim()) {
transcriptions.push(text.trim());
}