From cae5b104b3004c15b4b06045b6ebe57b8fd19dc3 Mon Sep 17 00:00:00 2001 From: jamesdanielwhitford <70632508+jamesdanielwhitford@users.noreply.github.com> Date: Mon, 23 Feb 2026 23:37:12 +0200 Subject: [PATCH] feat: add Mistral Voxtral transcription support (#228) --- README.md | 22 +- docs/signal-setup.md | 29 ++- docs/slack-setup.md | 1 + src/channels/discord.ts | 7 +- src/channels/setup.ts | 6 +- src/channels/signal.ts | 12 +- src/channels/slack.ts | 50 ++++- src/channels/telegram.ts | 7 +- src/channels/whatsapp/inbound/extract.ts | 6 +- src/channels/whatsapp/inbound/media.ts | 57 +++++- src/config/types.ts | 6 +- src/onboard.ts | 53 +++-- src/setup/slack-wizard.ts | 21 +- src/transcription/index.ts | 40 +++- src/transcription/mistral.ts | 244 +++++++++++++++++++++++ 15 files changed, 496 insertions(+), 65 deletions(-) create mode 100644 src/transcription/mistral.ts diff --git a/README.md b/README.md index fb973c1..0830d8e 100644 --- a/README.md +++ b/README.md @@ -109,12 +109,14 @@ That's it! Message your bot on Telegram. ## Voice Messages -LettaBot can transcribe voice messages using OpenAI Whisper. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix. +LettaBot can transcribe voice messages using either OpenAI Whisper or Mistral Voxtral. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix. **Supported channels:** Telegram, WhatsApp, Signal, Slack, Discord ### Configuration +**Option 1: OpenAI Whisper** + Add your OpenAI API key to `lettabot.yaml`: ```yaml @@ -130,7 +132,23 @@ Or set via environment variable: export OPENAI_API_KEY=sk-... ``` -If no API key is configured, voice messages are silently ignored. +**Option 2: Mistral Voxtral** (2x faster, 2x cheaper) + +Add your Mistral API key to `lettabot.yaml`: + +```yaml +transcription: + provider: mistral + apiKey: ... +``` + +Or set via environment variable: + +```bash +export MISTRAL_API_KEY=... 
+``` + +If no API key is configured, users will receive an error message with a link to this section. ## Skills LettaBot is compatible with [skills.sh](https://skills.sh) and [Clawdhub](https://clawdhub.com/). diff --git a/docs/signal-setup.md b/docs/signal-setup.md index a8b0037..1110f19 100644 --- a/docs/signal-setup.md +++ b/docs/signal-setup.md @@ -19,7 +19,32 @@ brew install signal-cli ### 2. Register Your Phone Number -You need a phone number that can receive SMS for verification. +You have two options: + +#### Option A: Link as Secondary Device (Recommended) + +Link signal-cli to your existing Signal account without disrupting your phone app: + +```bash +# Generate a linking QR code/URI +signal-cli link -n "LettaBot" +``` + +This will display a `sgnl://linkdevice?uuid=...` URI. On your phone: +1. Open Signal → Settings (tap your profile) +2. Tap "Linked Devices" +3. Tap "Link New Device" (+ button) +4. Scan the QR code or enter the URI + +**Benefits:** +- Your phone's Signal app continues to work normally +- Bot runs as a linked device (like Signal Desktop) +- Both your phone and the bot receive messages +- You can unlink the bot anytime from your phone + +#### Option B: Primary Registration (Dedicated Number Only) + +Register signal-cli as the primary device (requires a dedicated phone number): ```bash # Request verification code (sent via SMS) @@ -29,7 +54,7 @@ signal-cli -a +1XXXXXXXXXX register signal-cli -a +1XXXXXXXXXX verify CODE ``` -**Note:** You can only have one Signal client per number. Registering signal-cli will log out your Signal mobile app. Consider using a secondary number. +**Warning:** This will log out your Signal mobile app. Only use this option with a dedicated bot number, not your personal number. ## Configuration diff --git a/docs/slack-setup.md b/docs/slack-setup.md index e046f2d..0070cef 100644 --- a/docs/slack-setup.md +++ b/docs/slack-setup.md @@ -48,6 +48,7 @@ Socket Mode lets your bot connect without exposing a public endpoint. 
|-------|---------| | `app_mentions:read` | React when someone @mentions your bot | | `chat:write` | Send messages | +| `files:read` | Download voice message attachments | | `im:history` | Read DM message history | | `im:read` | View DM channel info | | `im:write` | Start DM conversations | diff --git a/src/channels/discord.ts b/src/channels/discord.ts index 09b514c..d7cb605 100644 --- a/src/channels/discord.ts +++ b/src/channels/discord.ts @@ -180,10 +180,9 @@ Ask the bot owner to approve with: const audioAttachment = message.attachments.find(a => a.contentType?.startsWith('audio/')); if (audioAttachment?.url) { try { - const { loadConfig } = await import('../config/index.js'); - const config = loadConfig(); - if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) { - await message.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages'); + const { isTranscriptionConfigured } = await import('../transcription/index.js'); + if (!isTranscriptionConfigured()) { + await message.reply('Voice messages require a transcription API key. 
See: https://github.com/letta-ai/lettabot#voice-messages'); } else { // Download audio const response = await fetch(audioAttachment.url); diff --git a/src/channels/setup.ts b/src/channels/setup.ts index 43ddcbb..b2a5b88 100644 --- a/src/channels/setup.ts +++ b/src/channels/setup.ts @@ -494,9 +494,9 @@ export async function setupSignal(existing?: any): Promise { p.note( 'See docs/signal-setup.md for detailed instructions.\n' + - 'Requires signal-cli registered with your phone number.\n\n' + - '⚠️ Security: Has full access to your Signal account.\n' + - 'Can see all messages and send as you.', + 'Recommended: Link as secondary device (signal-cli link -n "LettaBot")\n' + + 'This keeps your phone\'s Signal app working normally.\n\n' + + 'Requires signal-cli registered or linked with your phone number.', 'Signal Setup' ); diff --git a/src/channels/signal.ts b/src/channels/signal.ts index 48617cc..bb8c1f5 100644 --- a/src/channels/signal.ts +++ b/src/channels/signal.ts @@ -623,14 +623,12 @@ This code expires in 1 hour.`; } try { - const { loadConfig } = await import('../config/index.js'); - const config = loadConfig(); - if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) { + const { isTranscriptionConfigured } = await import('../transcription/index.js'); + if (!isTranscriptionConfigured()) { if (chatId) { - const audioInfo = savedAudioPath ? ` Audio saved to: ${savedAudioPath}` : ''; - await this.sendMessage({ - chatId, - text: `Voice messages require OpenAI API key for transcription.${audioInfo} See: https://github.com/letta-ai/lettabot#voice-messages` + await this.sendMessage({ + chatId, + text: 'Voice messages require a transcription API key. 
See: https://github.com/letta-ai/lettabot#voice-messages' }); } } else { diff --git a/src/channels/slack.ts b/src/channels/slack.ts index bf83660..00a8129 100644 --- a/src/channels/slack.ts +++ b/src/channels/slack.ts @@ -60,9 +60,9 @@ export class SlackAdapter implements ChannelAdapter { // Handle messages this.app.message(async ({ message, say, client }) => { - // Type guard for regular messages - if (message.subtype !== undefined) return; - if (!('user' in message) || !('text' in message)) return; + // Type guard for regular messages (allow file_share for voice messages) + if (message.subtype !== undefined && message.subtype !== 'file_share') return; + if (!('user' in message)) return; const userId = message.user; let text = message.text || ''; @@ -74,10 +74,9 @@ export class SlackAdapter implements ChannelAdapter { const audioFile = files?.find(f => f.mimetype?.startsWith('audio/')); if (audioFile?.url_private_download) { try { - const { loadConfig } = await import('../config/index.js'); - const config = loadConfig(); - if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) { - await say('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages'); + const { isTranscriptionConfigured } = await import('../transcription/index.js'); + if (!isTranscriptionConfigured()) { + await say('Voice messages require a transcription API key. 
See: https://github.com/letta-ai/lettabot#voice-messages'); } else { // Download file (requires bot token for auth) const response = await fetch(audioFile.url_private_download, { @@ -173,10 +172,43 @@ export class SlackAdapter implements ChannelAdapter { // Handle app mentions (@bot) this.app.event('app_mention', async ({ event }) => { const userId = event.user || ''; - const text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention + let text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention const channelId = event.channel; const threadTs = event.thread_ts || event.ts; // Reply in thread, or start new thread from the mention - + + // Handle audio file attachments + const files = (event as any).files as Array<{ mimetype?: string; url_private_download?: string; name?: string }> | undefined; + const audioFile = files?.find(f => f.mimetype?.startsWith('audio/')); + if (audioFile?.url_private_download) { + try { + const { isTranscriptionConfigured } = await import('../transcription/index.js'); + if (!isTranscriptionConfigured()) { + await this.sendMessage({ chatId: channelId, text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages', threadId: threadTs }); + return; + } + // Download file (requires bot token for auth) + const response = await fetch(audioFile.url_private_download, { + headers: { 'Authorization': `Bearer ${this.config.botToken}` } + }); + const buffer = Buffer.from(await response.arrayBuffer()); + + const { transcribeAudio } = await import('../transcription/index.js'); + const ext = audioFile.mimetype?.split('/')[1] || 'mp3'; + const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`); + + if (result.success && result.text) { + console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`); + text = (text ? 
text + '\n' : '') + `[Voice message]: ${result.text}`; + } else { + console.error(`[Slack] Transcription failed: ${result.error}`); + text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`; + } + } catch (error) { + console.error('[Slack] Error transcribing audio:', error); + text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`; + } + } + if (this.config.allowedUsers && this.config.allowedUsers.length > 0) { if (!userId || !this.config.allowedUsers.includes(userId)) { // Can't use say() in app_mention event the same way diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts index 8b0f89b..d0f2551 100644 --- a/src/channels/telegram.ts +++ b/src/channels/telegram.ts @@ -346,10 +346,9 @@ export class TelegramAdapter implements ChannelAdapter { const { isGroup, groupName, wasMentioned, isListeningMode } = gating; // Check if transcription is configured (config or env) - const { loadConfig } = await import('../config/index.js'); - const config = loadConfig(); - if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) { - await ctx.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages'); + const { isTranscriptionConfigured } = await import('../transcription/index.js'); + if (!isTranscriptionConfigured()) { + await ctx.reply('Voice messages require a transcription API key. 
See: https://github.com/letta-ai/lettabot#voice-messages'); return; } diff --git a/src/channels/whatsapp/inbound/extract.ts b/src/channels/whatsapp/inbound/extract.ts index 242f6e0..ba37bdf 100644 --- a/src/channels/whatsapp/inbound/extract.ts +++ b/src/channels/whatsapp/inbound/extract.ts @@ -143,18 +143,22 @@ export async function extractInboundMessage( // Collect attachments if media present and config provided let attachments: InboundAttachment[] = []; + let voiceTranscription: string | undefined; if (preview.hasMedia && attachmentConfig) { const result = await collectAttachments({ messageContent, chatId: remoteJid, messageId: messageId || 'unknown', + sock, ...attachmentConfig, }); attachments = result.attachments; + voiceTranscription = result.voiceTranscription; } // Use caption as fallback text (for media-only messages) - const finalBody = body || preview.caption || ''; + // For voice messages, use transcription if available + const finalBody = voiceTranscription || body || preview.caption || ''; if (!finalBody && attachments.length === 0) { return null; // Skip messages with no text and no media } diff --git a/src/channels/whatsapp/inbound/media.ts b/src/channels/whatsapp/inbound/media.ts index 5251ca1..5a24283 100644 --- a/src/channels/whatsapp/inbound/media.ts +++ b/src/channels/whatsapp/inbound/media.ts @@ -55,19 +55,21 @@ export function extractMediaPreview(messageContent: any): { hasMedia: boolean; c * Handles 5 media types: image, video, audio, document, sticker. * Downloads using Baileys' downloadContentFromMessage and saves to disk. * Enforces size limits and supports metadata-only mode. + * Transcribes voice messages (ptt: true) using configured transcription provider. 
* * @param params - Attachment collection parameters - * @returns Attachments array and optional caption + * @returns Attachments array, optional caption, and optional transcribed text for voice messages */ export async function collectAttachments(params: { messageContent: any; chatId: string; messageId: string; downloadContentFromMessage: (message: any, type: string) => Promise>; + sock: import("@whiskeysockets/baileys").WASocket; attachmentsDir?: string; attachmentsMaxBytes?: number; -}): Promise<{ attachments: InboundAttachment[]; caption?: string }> { - const { messageContent, chatId, messageId, downloadContentFromMessage, attachmentsDir, attachmentsMaxBytes } = params; +}): Promise<{ attachments: InboundAttachment[]; caption?: string; voiceTranscription?: string }> { + const { messageContent, chatId, messageId, downloadContentFromMessage, sock, attachmentsDir, attachmentsMaxBytes } = params; const attachments: InboundAttachment[] = []; if (!messageContent) return { attachments }; @@ -122,6 +124,10 @@ export async function collectAttachments(params: { kind, }; + // Check if this is a voice message (ptt = push-to-talk) + const isPttVoiceMessage = mediaType === 'audio' && mediaMessage.ptt === true; + let voiceTranscription: string | undefined; + // Download if attachmentsDir is configured if (attachmentsDir) { // Metadata-only mode (attachmentsMaxBytes = 0) @@ -151,9 +157,52 @@ export async function collectAttachments(params: { } } + // Transcribe voice messages + if (isPttVoiceMessage) { + try { + const { isTranscriptionConfigured } = await import('../../../transcription/index.js'); + if (!isTranscriptionConfigured()) { + // Send error message directly to user (matches Telegram/Slack/Discord/Signal behavior) + try { + await sock.sendMessage(chatId, { + text: 'Voice messages require a transcription API key. 
See: https://github.com/letta-ai/lettabot#voice-messages' + }); + } catch (sendError) { + console.error('[WhatsApp] Failed to send transcription error message:', sendError); + } + // Don't forward error to agent - return early + const caption = mediaMessage.caption as string | undefined; + return { attachments, caption }; + } + + // Download audio buffer for transcription + const stream = await downloadContentFromMessage(mediaMessage, mediaType); + const chunks: Uint8Array[] = []; + for await (const chunk of stream) { + chunks.push(chunk); + } + const buffer = Buffer.concat(chunks); + + // Transcribe audio + const { transcribeAudio } = await import('../../../transcription/index.js'); + const result = await transcribeAudio(buffer, name); + + if (result.success && result.text) { + console.log(`[WhatsApp] Transcribed voice message: "${result.text.slice(0, 50)}..."`); + voiceTranscription = `[Voice message]: ${result.text}`; + } else { + console.error(`[WhatsApp] Transcription failed: ${result.error}`); + voiceTranscription = `[Voice message - transcription failed: ${result.error}]`; + } + } catch (error) { + console.error('[WhatsApp] Error transcribing voice message:', error); + voiceTranscription = `[Voice message - error: ${error instanceof Error ? 
error.message : 'unknown error'}]`; + } + } + attachments.push(attachment); const caption = mediaMessage.caption as string | undefined; - return { attachments, caption }; + return { attachments, caption, voiceTranscription }; } /** diff --git a/src/config/types.ts b/src/config/types.ts index 06a2210..fdc5fd8 100644 --- a/src/config/types.ts +++ b/src/config/types.ts @@ -183,9 +183,9 @@ export interface LettaBotConfig { } export interface TranscriptionConfig { - provider: 'openai'; // Only OpenAI supported currently - apiKey?: string; // Falls back to OPENAI_API_KEY env var - model?: string; // Defaults to 'whisper-1' + provider: 'openai' | 'mistral'; + apiKey?: string; // Falls back to OPENAI_API_KEY or MISTRAL_API_KEY env var + model?: string; // Defaults to 'whisper-1' (OpenAI) or 'voxtral-mini-latest' (Mistral) } export interface PollingYamlConfig { diff --git a/src/onboard.ts b/src/onboard.ts index 207fdc7..4f62a97 100644 --- a/src/onboard.ts +++ b/src/onboard.ts @@ -290,7 +290,7 @@ interface OnboardConfig { cron: boolean; // Transcription (voice messages) - transcription: { enabled: boolean; apiKey?: string; model?: string }; + transcription: { enabled: boolean; provider?: 'openai' | 'mistral'; apiKey?: string; model?: string }; } const isPlaceholder = (val?: string) => !val || /^(your_|sk-\.\.\.|placeholder|example)/i.test(val); @@ -665,6 +665,7 @@ async function stepProviders(config: OnboardConfig, env: Record) }); if (!p.isCancel(enableTranscription) && enableTranscription) { config.transcription.enabled = true; + config.transcription.provider = 'openai'; config.transcription.apiKey = providerKey; } } @@ -838,23 +839,39 @@ async function stepFeatures(config: OnboardConfig): Promise { // Voice Transcription Setup // ============================================================================ -async function stepTranscription(config: OnboardConfig): Promise { - // Skip if already configured from the providers step - if (config.transcription.enabled && 
config.transcription.apiKey) return; +async function stepTranscription(config: OnboardConfig, forcePrompt?: boolean): Promise { + // Skip if already configured (e.g. from OpenAI shortcut in stepProviders) + if (!forcePrompt && config.transcription.enabled && config.transcription.apiKey) return; const setupTranscription = await p.confirm({ - message: 'Enable voice message transcription? (uses OpenAI Whisper)', + message: 'Enable voice message transcription?', initialValue: config.transcription.enabled, }); if (p.isCancel(setupTranscription)) { p.cancel('Setup cancelled'); process.exit(0); } config.transcription.enabled = setupTranscription; if (setupTranscription) { - const existingKey = process.env.OPENAI_API_KEY; + const providerChoice = await p.select({ + message: 'Transcription provider', + options: [ + { value: 'openai', label: 'OpenAI Whisper', hint: 'whisper-1' }, + { value: 'mistral', label: 'Mistral Voxtral', hint: 'voxtral-mini-latest' }, + ], + initialValue: config.transcription.provider || 'openai', + }); + if (p.isCancel(providerChoice)) { p.cancel('Setup cancelled'); process.exit(0); } + config.transcription.provider = providerChoice as 'openai' | 'mistral'; + + const isMistral = config.transcription.provider === 'mistral'; + // Check env vars first, then check if key was already entered for LLM provider + const existingKey = isMistral + ? process.env.MISTRAL_API_KEY + : (process.env.OPENAI_API_KEY || config.providers?.find(p => p.id === 'openai')?.apiKey); + const providerLabel = isMistral ? 'Mistral' : 'OpenAI'; const apiKey = await p.text({ - message: 'OpenAI API Key (for Whisper transcription)', - placeholder: 'sk-...', + message: `${providerLabel} API Key`, + placeholder: isMistral ? '' : 'sk-...', initialValue: existingKey || '', validate: (v) => { if (!v) return 'API key is required for voice transcription'; @@ -1197,7 +1214,10 @@ function showSummary(config: OnboardConfig): void { lines.push(`Features: ${features.length > 0 ? 
features.join(', ') : 'None'}`); // Transcription - lines.push(`Voice: ${config.transcription.enabled ? 'Enabled (OpenAI Whisper)' : 'Disabled'}`); + const voiceLabel = config.transcription.enabled + ? `Enabled (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})` + : 'Disabled'; + lines.push(`Voice: ${voiceLabel}`); // Google if (config.google.enabled) { @@ -1243,7 +1263,7 @@ async function reviewLoop(config: OnboardConfig, env: Record): P } else if (choice === 'channels') await stepChannels(config, env); else if (choice === 'features') await stepFeatures(config); - else if (choice === 'transcription') await stepTranscription(config); + else if (choice === 'transcription') await stepTranscription(config, true); else if (choice === 'google') await stepGoogle(config); } } @@ -1473,7 +1493,8 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise { - p.log.step('Step 1/3: Create Slack App from Manifest'); + p.log.step('Step 1/4: Create Slack App from Manifest'); // Inline manifest for Socket Mode configuration const appName = process.env.SLACK_APP_NAME || process.env.LETTA_AGENT_NAME || 'LettaBot'; @@ -99,6 +102,7 @@ oauth_config: bot: - app_mentions:read - chat:write + - files:read - im:history - im:read - im:write @@ -117,7 +121,7 @@ settings: p.note( 'Creates app with everything pre-configured:\n' + ' • Socket Mode enabled\n' + - ' • 5 bot scopes (app_mentions:read, chat:write, im:*)\n' + + ' • 6 bot scopes (app_mentions:read, chat:write, files:read, im:*)\n' + ' • 2 event subscriptions (app_mention, message.im)\n\n' + 'Just review and click "Create"!', 'One-Click Setup' @@ -162,7 +166,7 @@ settings: } async function stepEnableSocketMode(existingToken?: string): Promise { - p.log.step('Step 3/3: Get App-Level Token'); + p.log.step('Step 4/4: Get App-Level Token'); p.note( '1. In the left sidebar, click "Socket Mode"\n' + @@ -197,6 +201,7 @@ async function stepConfigureScopes(): Promise { '3. 
Click "Add an OAuth Scope" for each:\n' + ' • app_mentions:read\n' + ' • chat:write\n' + + ' • files:read\n' + ' • im:history\n' + ' • im:read\n' + ' • im:write', @@ -244,7 +249,7 @@ async function stepConfigureEvents(): Promise { } async function stepConfigureAppHome(): Promise { - p.log.step('Step 5/6: Configure App Home'); + p.log.step('Step 2/4: Configure App Home'); p.note( '1. Go to "App Home" in left sidebar\n' + @@ -267,7 +272,7 @@ async function stepConfigureAppHome(): Promise { } async function stepInstallApp(existingToken?: string): Promise { - p.log.step('Step 6/6: Install to Workspace'); + p.log.step('Step 3/4: Install to Workspace'); p.note( '1. Go to "Install App" in left sidebar\n' + diff --git a/src/transcription/index.ts b/src/transcription/index.ts index a7d9f2d..b229c7f 100644 --- a/src/transcription/index.ts +++ b/src/transcription/index.ts @@ -1,7 +1,39 @@ /** - * Transcription service - * - * Currently supports OpenAI Whisper. Future providers can be added here. + * Transcription service router + * + * Delegates to the correct provider based on config.transcription.provider. + * Defaults to OpenAI Whisper for backwards compatibility. */ -export { transcribeAudio, type TranscriptionResult } from './openai.js'; +import { loadConfig } from '../config/index.js'; +import type { TranscriptionResult } from './openai.js'; +import { transcribeAudio as openaiTranscribe } from './openai.js'; +import { transcribeAudio as mistralTranscribe } from './mistral.js'; + +export type { TranscriptionResult } from './openai.js'; + +/** + * Check whether a transcription API key is available for the configured provider. + * Used by channel handlers to gate voice message processing. + */ +export function isTranscriptionConfigured(): boolean { + const config = loadConfig(); + const provider = config.transcription?.provider || 'openai'; + return !!(config.transcription?.apiKey + || (provider === 'mistral' ? 
process.env.MISTRAL_API_KEY : process.env.OPENAI_API_KEY));
+}
+
+export async function transcribeAudio(
+  audioBuffer: Buffer,
+  filename?: string,
+  options?: { audioPath?: string }
+): Promise<TranscriptionResult> {
+  const config = loadConfig();
+  const provider = config.transcription?.provider || 'openai';
+
+  if (provider === 'mistral') {
+    return mistralTranscribe(audioBuffer, filename, options);
+  }
+
+  return openaiTranscribe(audioBuffer, filename, options);
+}
diff --git a/src/transcription/mistral.ts b/src/transcription/mistral.ts
new file mode 100644
index 0000000..0644b9d
--- /dev/null
+++ b/src/transcription/mistral.ts
@@ -0,0 +1,307 @@
+/**
+ * Mistral Voxtral transcription service
+ *
+ * Uses Voxtral Transcribe 2 via the Mistral REST API.
+ * Simple multipart POST — no SDK dependency needed.
+ */
+
+import { loadConfig } from '../config/index.js';
+import { execFileSync } from 'node:child_process';
+import { writeFileSync, readFileSync, mkdtempSync, readdirSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+import type { TranscriptionResult } from './openai.js';
+
+/** Buffers larger than this many bytes are split into chunks before upload. */
+const MAX_FILE_SIZE = 20 * 1024 * 1024;
+/** Segment length, in seconds, passed to ffmpeg when splitting large audio. */
+const CHUNK_DURATION_SECONDS = 600;
+
+/**
+ * Resolve the Mistral API key from config (transcription.apiKey) or MISTRAL_API_KEY.
+ *
+ * @throws Error when neither source provides a key.
+ */
+function getApiKey(): string {
+  const config = loadConfig();
+  const apiKey = config.transcription?.apiKey || process.env.MISTRAL_API_KEY;
+  if (!apiKey) {
+    throw new Error('Mistral API key required for transcription. Set in config (transcription.apiKey) or MISTRAL_API_KEY env var.');
+  }
+  return apiKey;
+}
+
+/** Resolve the model name: config, then TRANSCRIPTION_MODEL, then 'voxtral-mini-latest'. */
+function getModel(): string {
+  const config = loadConfig();
+  return config.transcription?.model || process.env.TRANSCRIPTION_MODEL || 'voxtral-mini-latest';
+}
+
+/** Map a filename's extension to an audio MIME type, defaulting to 'audio/ogg'. */
+function getMimeType(filename: string): string {
+  const ext = filename.split('.').pop()?.toLowerCase();
+  const mimeTypes: Record<string, string> = {
+    'ogg': 'audio/ogg',
+    'oga': 'audio/ogg',
+    'mp3': 'audio/mpeg',
+    'mp4': 'audio/mp4',
+    'm4a': 'audio/mp4',
+    'wav': 'audio/wav',
+    'flac': 'audio/flac',
+    'webm': 'audio/webm',
+  };
+  return mimeTypes[ext || ''] || 'audio/ogg';
+}
+
+/** Extensions that are first retried under a renamed extension, then converted with ffmpeg. */
+const NEEDS_CONVERSION = ['aac', 'amr', 'caf', 'x-caf', '3gp', '3gpp'];
+
+/** Replacement extension used for the rename-only attempt. */
+const FORMAT_MAP: Record<string, string> = {
+  'aac': 'm4a',
+  'amr': 'mp3',
+  'opus': 'ogg',
+  'x-caf': 'm4a',
+  'caf': 'm4a',
+  '3gp': 'mp4',
+  '3gpp': 'mp4',
+};
+
+/** Cached result of the ffmpeg probe; null until the first check. */
+let ffmpegAvailable: boolean | null = null;
+
+/** Check once whether an ffmpeg binary can be executed, and cache the answer. */
+function isFfmpegAvailable(): boolean {
+  if (ffmpegAvailable === null) {
+    try {
+      // Run the binary directly instead of shelling out to `which`, which is not available on Windows.
+      execFileSync('ffmpeg', ['-version'], { stdio: 'ignore' });
+      ffmpegAvailable = true;
+    } catch {
+      ffmpegAvailable = false;
+    }
+  }
+  return ffmpegAvailable;
+}
+
+/**
+ * Reduce a filename-derived extension to characters that are safe in a temp file name.
+ *
+ * Filenames are supplied by callers from chat attachments, so the extension may contain
+ * path separators or other unexpected characters.
+ */
+function safeExtension(ext: string, fallback: string): string {
+  const cleaned = ext.toLowerCase().replace(/[^a-z0-9-]/g, '');
+  return cleaned || fallback;
+}
+
+/** Best-effort removal of a temp directory; cleanup failures must not mask the real result. */
+function removeTempDir(dir: string): void {
+  try {
+    rmSync(dir, { recursive: true, force: true });
+  } catch (error) {
+    console.warn(`[Transcription] Failed to remove temp dir ${dir}:`, error);
+  }
+}
+
+/**
+ * Convert an audio buffer to MP3 with ffmpeg.
+ *
+ * @param audioBuffer - Raw audio bytes.
+ * @param inputExt - Extension of the source format, used to name the temp input file.
+ * @returns The converted MP3 bytes.
+ * @throws If ffmpeg fails or exceeds the 30s timeout.
+ */
+function convertAudioToMp3(audioBuffer: Buffer, inputExt: string): Buffer {
+  // A private directory per call avoids filename collisions between concurrent processes.
+  const tempDir = mkdtempSync(join(tmpdir(), 'lettabot-transcription-'));
+  const inputPath = join(tempDir, `input.${safeExtension(inputExt, 'bin')}`);
+  const outputPath = join(tempDir, 'output.mp3');
+
+  try {
+    writeFileSync(inputPath, audioBuffer);
+    // Argument array (no shell), so paths are never interpreted as shell syntax.
+    execFileSync(
+      'ffmpeg',
+      ['-y', '-i', inputPath, '-acodec', 'libmp3lame', '-q:a', '2', outputPath],
+      { stdio: 'ignore', timeout: 30000 }
+    );
+    const converted = readFileSync(outputPath);
+    console.log(`[Transcription] Converted ${audioBuffer.length} bytes → ${converted.length} bytes`);
+    return converted;
+  } finally {
+    removeTempDir(tempDir);
+  }
+}
+
+/**
+ * Send a single buffer to the Voxtral API and return the text.
+ *
+ * @throws If the API key is missing, the HTTP status is not OK, or the response has no text.
+ */
+async function attemptTranscription(audioBuffer: Buffer, filename: string): Promise<string> {
+  const apiKey = getApiKey();
+  const model = getModel();
+
+  const file = new File([new Uint8Array(audioBuffer)], filename, {
+    type: getMimeType(filename),
+  });
+
+  const formData = new FormData();
+  formData.append('model', model);
+  formData.append('file', file);
+
+  const response = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
+    method: 'POST',
+    headers: { 'Authorization': `Bearer ${apiKey}` },
+    body: formData,
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text();
+    throw new Error(`Mistral API error (${response.status}): ${errorText}`);
+  }
+
+  const data = (await response.json()) as { text?: unknown };
+  // Guard against an unexpected payload so callers never receive undefined as text.
+  if (typeof data.text !== 'string') {
+    throw new Error('Mistral API returned a response without a text field');
+  }
+  return data.text;
+}
+
+/**
+ * Split large audio into chunks and transcribe each.
+ *
+ * @param audioBuffer - Raw audio bytes.
+ * @param ext - Extension of the source format, used to name the temp input file.
+ * @returns The chunk transcriptions, trimmed and joined with single spaces.
+ * @throws If ffmpeg is unavailable, splitting yields no chunks, or any chunk fails.
+ */
+async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
+  if (!isFfmpegAvailable()) {
+    throw new Error('Cannot split large audio files without ffmpeg');
+  }
+
+  const tempDir = mkdtempSync(join(tmpdir(), 'lettabot-transcription-chunks-'));
+  const inputPath = join(tempDir, `input.${safeExtension(ext, 'ogg')}`);
+  const outputPattern = join(tempDir, 'chunk-%03d.mp3');
+
+  try {
+    writeFileSync(inputPath, audioBuffer);
+
+    // Argument array (no shell): the extension comes from an attachment filename.
+    execFileSync(
+      'ffmpeg',
+      [
+        '-y', '-i', inputPath,
+        '-f', 'segment',
+        '-segment_time', String(CHUNK_DURATION_SECONDS),
+        '-reset_timestamps', '1',
+        '-acodec', 'libmp3lame', '-q:a', '2',
+        outputPattern,
+      ],
+      { stdio: 'ignore', timeout: 120000 }
+    );
+
+    const chunkFiles = readdirSync(tempDir)
+      .filter(f => f.startsWith('chunk-') && f.endsWith('.mp3'))
+      .sort();
+
+    if (chunkFiles.length === 0) {
+      throw new Error('Failed to split audio into chunks');
+    }
+
+    console.log(`[Transcription] Split into ${chunkFiles.length} chunks`);
+
+    // Chunks are sent one at a time, in order.
+    const transcriptions: string[] = [];
+    for (const [index, chunkFile] of chunkFiles.entries()) {
+      const chunkBuffer = readFileSync(join(tempDir, chunkFile));
+      console.log(`[Transcription] Transcribing chunk ${index + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
+      const text = (await attemptTranscription(chunkBuffer, chunkFile)).trim();
+      if (text) {
+        transcriptions.push(text);
+      }
+    }
+
+    const combined = transcriptions.join(' ');
+    console.log(`[Transcription] Combined ${transcriptions.length} chunks into ${combined.length} chars`);
+    return combined;
+  } finally {
+    removeTempDir(tempDir);
+  }
+}
+
+/**
+ * Transcribe audio using Mistral Voxtral API
+ *
+ * Voxtral supports: wav, mp3, flac, ogg, webm
+ * Telegram voice messages (OGG/Opus) work natively.
+ *
+ * Errors during conversion or upload are returned as `{ success: false, error }`, not thrown.
+ *
+ * @param audioBuffer - Raw audio bytes.
+ * @param filename - Name whose extension selects the MIME type and conversion path.
+ * @param options - `audioPath` is echoed back on failure results.
+ */
+export async function transcribeAudio(
+  audioBuffer: Buffer,
+  filename: string = 'audio.ogg',
+  options?: { audioPath?: string }
+): Promise<TranscriptionResult> {
+  const ext = filename.split('.').pop()?.toLowerCase() || '';
+
+  try {
+    let finalBuffer = audioBuffer;
+    let finalFilename = filename;
+
+    // Convert unsupported formats via ffmpeg
+    if (NEEDS_CONVERSION.includes(ext)) {
+      const mapped = FORMAT_MAP[ext];
+      if (mapped) {
+        console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`);
+        finalFilename = filename.replace(/\.[^.]+$/, `.${mapped}`);
+
+        try {
+          const text = await attemptTranscription(finalBuffer, finalFilename);
+          return { success: true, text };
+        } catch (renameError) {
+          // Keep the reason in the log; the ffmpeg fallback below may still succeed.
+          const reason = renameError instanceof Error ? renameError.message : String(renameError);
+          console.log(`[Transcription] Rename approach failed for .${ext}: ${reason}`);
+        }
+      }
+
+      if (isFfmpegAvailable()) {
+        console.log(`[Transcription] Converting .${ext} → .mp3 with ffmpeg`);
+        finalBuffer = convertAudioToMp3(audioBuffer, ext);
+        finalFilename = filename.replace(/\.[^.]+$/, '.mp3');
+      } else {
+        return {
+          success: false,
+          error: `Cannot transcribe .${ext} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, flac).`,
+          audioPath: options?.audioPath,
+        };
+      }
+    }
+
+    // Check file size and chunk if needed
+    if (finalBuffer.length > MAX_FILE_SIZE) {
+      const finalExt = finalFilename.split('.').pop()?.toLowerCase() || 'ogg';
+      console.log(`[Transcription] File too large (${(finalBuffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
+      const text = await transcribeInChunks(finalBuffer, finalExt);
+      return { success: true, text };
+    }
+
+    const text = await attemptTranscription(finalBuffer, finalFilename);
+    return { success: true, text };
+  } catch (error) {
+    const errorMsg = error instanceof Error ? error.message : String(error);
+    return {
+      success: false,
+      error: errorMsg,
+      audioPath: options?.audioPath,
+    };
+  }
+}