fix: graceful transcription fallback when ffmpeg unavailable (#155)
* fix: graceful transcription fallback when ffmpeg unavailable

  When voice transcription fails (e.g., ffmpeg not installed), the agent now receives informative error messages instead of silent failures.

  Changes:
  - transcribeAudio() returns TranscriptionResult with success/error/audioPath
  - Tiered fallback: try format rename first, then ffmpeg, then fail gracefully
  - Check ffmpeg availability once and cache result
  - All channel adapters updated to show transcription errors to agent
  - Agent can explain to user why transcription failed

  Before:
    Agent sees: "[Voice message received]"
    Agent: "I received your voice message but there's no content..."

  After:
    Agent sees: "[Voice message - transcription failed: Cannot transcribe .aac format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a). Audio saved to: /path/to/file.aac]"
    Agent: "I couldn't transcribe your voice message because ffmpeg isn't installed. You could type your message instead."

  Fixes voice transcription on systems without ffmpeg.

  Written by Cameron ◯ Letta Code

  "Fail gracefully, inform clearly." - Error handling wisdom

* fix: handle undefined transcription errors better

* fix: correct API param for tool approval + workaround letta-client type bug
This commit is contained in:
@@ -161,13 +161,19 @@ Ask the bot owner to approve with:
|
||||
|
||||
const { transcribeAudio } = await import('../transcription/index.js');
|
||||
const ext = audioAttachment.contentType?.split('/')[1] || 'mp3';
|
||||
const transcript = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
|
||||
const result = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
|
||||
|
||||
console.log(`[Discord] Transcribed audio: "${transcript.slice(0, 50)}..."`);
|
||||
content = (content ? content + '\n' : '') + `[Voice message]: ${transcript}`;
|
||||
if (result.success && result.text) {
|
||||
console.log(`[Discord] Transcribed audio: "${result.text.slice(0, 50)}..."`);
|
||||
content = (content ? content + '\n' : '') + `[Voice message]: ${result.text}`;
|
||||
} else {
|
||||
console.error(`[Discord] Transcription failed: ${result.error}`);
|
||||
content = (content ? content + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[Discord] Error transcribing audio:', error);
|
||||
content = (content ? content + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -628,13 +628,28 @@ This code expires in 1 hour.`;
|
||||
|
||||
const { transcribeAudio } = await import('../transcription/index.js');
|
||||
const ext = voiceAttachment.contentType?.split('/')[1] || 'ogg';
|
||||
const transcript = await transcribeAudio(buffer, `voice.${ext}`);
|
||||
const result = await transcribeAudio(buffer, `voice.${ext}`, { audioPath: attachmentPath });
|
||||
|
||||
console.log(`[Signal] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
|
||||
messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${transcript}`;
|
||||
if (result.success) {
|
||||
if (result.text) {
|
||||
console.log(`[Signal] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
|
||||
messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${result.text}`;
|
||||
} else {
|
||||
console.warn(`[Signal] Transcription returned empty text`);
|
||||
messageText = (messageText ? messageText + '\n' : '') + `[Voice message - transcription returned empty]`;
|
||||
}
|
||||
} else {
|
||||
const errorMsg = result.error || 'Unknown transcription error';
|
||||
console.error(`[Signal] Transcription failed: ${errorMsg}`);
|
||||
const errorInfo = result.audioPath
|
||||
? `[Voice message - transcription failed: ${errorMsg}. Audio saved to: ${result.audioPath}]`
|
||||
: `[Voice message - transcription failed: ${errorMsg}]`;
|
||||
messageText = (messageText ? messageText + '\n' : '') + errorInfo;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[Signal] Error transcribing voice message:', error);
|
||||
messageText = (messageText ? messageText + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
|
||||
}
|
||||
} else if (attachments?.some(a => a.contentType?.startsWith('audio/'))) {
|
||||
// Audio attachment exists but has no ID
|
||||
|
||||
@@ -83,13 +83,19 @@ export class SlackAdapter implements ChannelAdapter {
|
||||
|
||||
const { transcribeAudio } = await import('../transcription/index.js');
|
||||
const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
|
||||
const transcript = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
|
||||
const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
|
||||
|
||||
console.log(`[Slack] Transcribed audio: "${transcript.slice(0, 50)}..."`);
|
||||
text = (text ? text + '\n' : '') + `[Voice message]: ${transcript}`;
|
||||
if (result.success && result.text) {
|
||||
console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`);
|
||||
text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`;
|
||||
} else {
|
||||
console.error(`[Slack] Transcription failed: ${result.error}`);
|
||||
text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[Slack] Error transcribing audio:', error);
|
||||
text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -247,11 +247,18 @@ export class TelegramAdapter implements ChannelAdapter {
|
||||
|
||||
// Transcribe
|
||||
const { transcribeAudio } = await import('../transcription/index.js');
|
||||
const transcript = await transcribeAudio(buffer, 'voice.ogg');
|
||||
const result = await transcribeAudio(buffer, 'voice.ogg');
|
||||
|
||||
console.log(`[Telegram] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
|
||||
let messageText: string;
|
||||
if (result.success && result.text) {
|
||||
console.log(`[Telegram] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
|
||||
messageText = `[Voice message]: ${result.text}`;
|
||||
} else {
|
||||
console.error(`[Telegram] Transcription failed: ${result.error}`);
|
||||
messageText = `[Voice message - transcription failed: ${result.error}]`;
|
||||
}
|
||||
|
||||
// Send to agent as text with prefix
|
||||
// Send to agent
|
||||
if (this.onMessage) {
|
||||
await this.onMessage({
|
||||
channel: 'telegram',
|
||||
@@ -259,14 +266,24 @@ export class TelegramAdapter implements ChannelAdapter {
|
||||
userId: String(userId),
|
||||
userName: ctx.from.username || ctx.from.first_name,
|
||||
messageId: String(ctx.message.message_id),
|
||||
text: `[Voice message]: ${transcript}`,
|
||||
text: messageText,
|
||||
timestamp: new Date(),
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[Telegram] Error processing voice message:', error);
|
||||
// Optionally notify user
|
||||
await ctx.reply('Sorry, I could not transcribe that voice message.');
|
||||
// Send error to agent so it can explain
|
||||
if (this.onMessage) {
|
||||
await this.onMessage({
|
||||
channel: 'telegram',
|
||||
chatId: String(chatId),
|
||||
userId: String(userId),
|
||||
userName: ctx.from?.username || ctx.from?.first_name,
|
||||
messageId: String(ctx.message.message_id),
|
||||
text: `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`,
|
||||
timestamp: new Date(),
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -352,10 +352,12 @@ export async function disableToolApproval(
|
||||
): Promise<boolean> {
|
||||
try {
|
||||
const client = getClient();
|
||||
// Note: API expects 'requires_approval' but client types say 'body_requires_approval'
|
||||
// This is a bug in @letta-ai/letta-client - filed issue, using workaround
|
||||
await client.agents.tools.updateApproval(toolName, {
|
||||
agent_id: agentId,
|
||||
body_requires_approval: false,
|
||||
});
|
||||
requires_approval: false,
|
||||
} as unknown as Parameters<typeof client.agents.tools.updateApproval>[1]);
|
||||
console.log(`[Letta API] Disabled approval requirement for tool ${toolName} on agent ${agentId}`);
|
||||
return true;
|
||||
} catch (e) {
|
||||
|
||||
@@ -4,4 +4,4 @@
|
||||
* Currently supports OpenAI Whisper. Future providers can be added here.
|
||||
*/
|
||||
|
||||
export { transcribeAudio } from './openai.js';
|
||||
export { transcribeAudio, type TranscriptionResult } from './openai.js';
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
/**
|
||||
* OpenAI Whisper transcription service
|
||||
*
|
||||
* Supports tiered fallback:
|
||||
* 1. Try format rename (AAC → M4A, etc.) - no external deps
|
||||
* 2. Try ffmpeg conversion if available
|
||||
* 3. Return informative error if both fail
|
||||
*/
|
||||
|
||||
import OpenAI from 'openai';
|
||||
@@ -16,6 +21,16 @@ const CHUNK_DURATION_SECONDS = 600;
|
||||
|
||||
let openaiClient: OpenAI | null = null;
|
||||
|
||||
/**
|
||||
* Result of a transcription attempt
*
* transcribeAudio resolves with this object for both outcomes rather than
* throwing, so callers branch on `success` instead of wrapping the call in
* try/catch.
|
||||
*/
|
||||
export interface TranscriptionResult {
|
||||
// True when a transcript was produced; false when every attempt failed.
success: boolean;
|
||||
// Transcribed text, present on success results. The channel adapters also
// guard against it being empty before using it.
text?: string;
|
||||
// Human-readable failure reason, present on failure results.
error?: string;
|
||||
// transcribeAudio sets this only on failure results, copying the
// caller-supplied options.audioPath through unchanged.
audioPath?: string; // Path to original audio (for agent to reference)
|
||||
}
|
||||
|
||||
function getClient(): OpenAI {
|
||||
if (!openaiClient) {
|
||||
const config = loadConfig();
|
||||
@@ -34,40 +49,129 @@ function getModel(): string {
|
||||
return config.transcription?.model || process.env.TRANSCRIPTION_MODEL || 'whisper-1';
|
||||
}
|
||||
|
||||
/**
|
||||
* Transcribe audio using OpenAI Whisper API
|
||||
*
|
||||
* @param audioBuffer - The audio data as a Buffer
|
||||
* @param filename - Filename with extension (e.g., 'voice.ogg')
|
||||
* @returns The transcribed text
|
||||
*/
|
||||
export async function transcribeAudio(audioBuffer: Buffer, filename: string = 'audio.ogg'): Promise<string> {
|
||||
const ext = filename.split('.').pop()?.toLowerCase() || '';
|
||||
|
||||
// Check if format needs conversion (not just renaming)
|
||||
let finalBuffer = audioBuffer;
|
||||
let finalExt = ext;
|
||||
|
||||
if (NEEDS_CONVERSION.includes(ext)) {
|
||||
console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`);
|
||||
finalBuffer = convertAudioToMp3(audioBuffer, ext);
|
||||
finalExt = 'mp3';
|
||||
// Cached result of the ffmpeg lookup; null means the lookup has not run yet.
let ffmpegAvailable: boolean | null = null;

/**
 * Report whether an `ffmpeg` executable can be located on the PATH.
 *
 * The lookup shells out once; its outcome is stored in `ffmpegAvailable` and
 * reused by every later call, so the "not found" warning is logged at most once.
 *
 * @returns true if the lookup command exited successfully, false otherwise
 */
function isFfmpegAvailable(): boolean {
  if (ffmpegAvailable === null) {
    // `which` is a POSIX tool; under Windows cmd.exe the equivalent is `where`.
    // Using `which` there would always fail and report ffmpeg as missing.
    const lookupCommand = process.platform === 'win32' ? 'where ffmpeg' : 'which ffmpeg';
    try {
      // execSync throws on a non-zero exit status, which is how "not found" surfaces.
      execSync(lookupCommand, { stdio: 'ignore' });
      ffmpegAvailable = true;
    } catch {
      ffmpegAvailable = false;
      console.warn('[Transcription] ffmpeg not found - audio conversion will be skipped');
    }
  }
  return ffmpegAvailable;
}
|
||||
|
||||
/**
 * Transcribe audio using OpenAI Whisper API
 *
 * Returns a result object instead of throwing, so callers can handle failures gracefully.
 *
 * Formats listed in NEEDS_CONVERSION are handled in tiers:
 *   1. If FORMAT_MAP has an entry for the extension, try the same bytes under
 *      the mapped extension (no re-encoding).
 *   2. If there is no mapping, or tier 1 throws, convert to mp3 with ffmpeg
 *      when ffmpeg is available.
 *   3. Otherwise return a failure result that says what is missing.
 *
 * Every attempt goes through the same size check: a buffer larger than
 * MAX_FILE_SIZE is handed to transcribeInChunks instead of being uploaded whole.
 *
 * @param audioBuffer - The audio data as a Buffer
 * @param filename - Filename with extension (e.g., 'voice.ogg')
 * @param options - Optional settings; `audioPath` is copied onto failure results
 * @returns TranscriptionResult with success/text or error info
 */
export async function transcribeAudio(
  audioBuffer: Buffer,
  filename: string = 'audio.ogg',
  options?: { audioPath?: string }
): Promise<TranscriptionResult> {
  const ext = filename.split('.').pop()?.toLowerCase() || '';

  // Build a failure result that carries the caller's audio path along.
  const fail = (error: string): TranscriptionResult => ({
    success: false,
    error,
    audioPath: options?.audioPath,
  });

  // Transcribe one buffer, splitting it first when it exceeds the upload limit.
  // Used by every tier so that no path can skip the size check.
  const transcribeSized = async (buffer: Buffer, bufferExt: string): Promise<string> => {
    if (buffer.length > MAX_FILE_SIZE) {
      console.log(`[Transcription] File too large (${(buffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
      return transcribeInChunks(buffer, bufferExt);
    }
    return attemptTranscription(buffer, filename, bufferExt);
  };

  try {
    // Formats the API accepts as-is need no special handling.
    if (!NEEDS_CONVERSION.includes(ext)) {
      const text = await transcribeSized(audioBuffer, ext);
      return { success: true, text };
    }

    const mapped = FORMAT_MAP[ext];

    if (!mapped) {
      // No mapping available: conversion is the only option.
      if (!isFfmpegAvailable()) {
        return fail(`Unsupported format .${ext} and ffmpeg not available for conversion.`);
      }
      console.log(`[Transcription] Converting .${ext} to .mp3 with ffmpeg`);
      const text = await transcribeSized(convertAudioToMp3(audioBuffer, ext), 'mp3');
      return { success: true, text };
    }

    // Tier 1: Try format mapping first (just rename, no conversion)
    console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`);
    try {
      const text = await transcribeSized(audioBuffer, mapped);
      return { success: true, text };
    } catch (renameError: unknown) {
      // Not fatal: fall through to the ffmpeg tier below.
      console.log(`[Transcription] Rename approach failed: ${renameError instanceof Error ? renameError.message : renameError}`);
    }

    // Tier 2: Try ffmpeg conversion if available
    if (!isFfmpegAvailable()) {
      // No ffmpeg, rename failed
      return fail(`Cannot transcribe .${ext} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, m4a).`);
    }

    console.log(`[Transcription] Attempting ffmpeg conversion .${ext} → .mp3`);
    try {
      const text = await transcribeSized(convertAudioToMp3(audioBuffer, ext), 'mp3');
      console.log(`[Transcription] Success after conversion, text length: ${text?.length || 0}`);
      return { success: true, text };
    } catch (conversionError: unknown) {
      // Both approaches failed
      console.error(`[Transcription] Failed after conversion:`, conversionError);
      const errorMsg = conversionError instanceof Error
        ? conversionError.message
        : (conversionError ? String(conversionError) : 'Unknown error after conversion');
      return fail(`Transcription failed after conversion: ${errorMsg}`);
    }
  } catch (error: unknown) {
    // Anything not handled by a tier above (API errors, conversion or chunking
    // failures on the direct paths) is reported to the caller, never thrown.
    const errorMsg = error instanceof Error ? error.message : String(error);
    return fail(errorMsg);
  }
}
|
||||
|
||||
/**
|
||||
* Attempt a single transcription (may throw)
|
||||
*/
|
||||
async function attemptTranscription(audioBuffer: Buffer, originalFilename: string, ext: string): Promise<string> {
|
||||
const client = getClient();
|
||||
const finalFilename = normalizeFilename(originalFilename.replace(/\.[^.]+$/, `.${ext}`));
|
||||
|
||||
@@ -87,6 +191,10 @@ async function transcribeSingleFile(audioBuffer: Buffer, originalFilename: strin
|
||||
* Split large audio into chunks and transcribe each
|
||||
*/
|
||||
async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
|
||||
if (!isFfmpegAvailable()) {
|
||||
throw new Error('Cannot split large audio files without ffmpeg');
|
||||
}
|
||||
|
||||
const tempDir = join(tmpdir(), 'lettabot-transcription', `chunks-${Date.now()}`);
|
||||
mkdirSync(tempDir, { recursive: true });
|
||||
|
||||
@@ -122,7 +230,7 @@ async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<str
|
||||
|
||||
console.log(`[Transcription] Transcribing chunk ${i + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
|
||||
|
||||
const text = await transcribeSingleFile(chunkBuffer, chunkFiles[i], 'mp3');
|
||||
const text = await attemptTranscription(chunkBuffer, chunkFiles[i], 'mp3');
|
||||
if (text.trim()) {
|
||||
transcriptions.push(text.trim());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user