From cae5b104b3004c15b4b06045b6ebe57b8fd19dc3 Mon Sep 17 00:00:00 2001
From: jamesdanielwhitford
 <70632508+jamesdanielwhitford@users.noreply.github.com>
Date: Mon, 23 Feb 2026 23:37:12 +0200
Subject: [PATCH] feat: add Mistral Voxtral transcription support (#228)

---
 README.md                                |  22 +-
 docs/signal-setup.md                     |  29 ++-
 docs/slack-setup.md                      |   1 +
 src/channels/discord.ts                  |   7 +-
 src/channels/setup.ts                    |   6 +-
 src/channels/signal.ts                   |  12 +-
 src/channels/slack.ts                    |  50 ++++-
 src/channels/telegram.ts                 |   7 +-
 src/channels/whatsapp/inbound/extract.ts |   6 +-
 src/channels/whatsapp/inbound/media.ts   |  57 +++++-
 src/config/types.ts                      |   6 +-
 src/onboard.ts                           |  53 +++--
 src/setup/slack-wizard.ts                |  21 +-
 src/transcription/index.ts               |  40 +++-
 src/transcription/mistral.ts             | 244 +++++++++++++++++++++++
 15 files changed, 496 insertions(+), 65 deletions(-)
 create mode 100644 src/transcription/mistral.ts

diff --git a/README.md b/README.md
index fb973c1..0830d8e 100644
--- a/README.md
+++ b/README.md
@@ -109,12 +109,14 @@ That's it! Message your bot on Telegram.
 
 ## Voice Messages
 
-LettaBot can transcribe voice messages using OpenAI Whisper. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
+LettaBot can transcribe voice messages using either OpenAI Whisper or Mistral Voxtral. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
 
 **Supported channels:** Telegram, WhatsApp, Signal, Slack, Discord
 
 ### Configuration
 
+**Option 1: OpenAI Whisper**
+
 Add your OpenAI API key to `lettabot.yaml`:
 
 ```yaml
@@ -130,7 +132,23 @@ Or set via environment variable:
 export OPENAI_API_KEY=sk-...
 ```
 
-If no API key is configured, voice messages are silently ignored.
+**Option 2: Mistral Voxtral** (2x faster, 2x cheaper)
+
+Add your Mistral API key to `lettabot.yaml`:
+
+```yaml
+transcription:
+  provider: mistral
+  apiKey: ...
+```
+
+Or set via environment variable:
+
+```bash
+export MISTRAL_API_KEY=...
+```
+
+If no API key is configured for the selected provider, the bot replies to voice messages with an error message that links to this section.
 
 ## Skills
 LettaBot is compatible with [skills.sh](https://skills.sh) and [Clawdhub](https://clawdhub.com/). 
diff --git a/docs/signal-setup.md b/docs/signal-setup.md
index a8b0037..1110f19 100644
--- a/docs/signal-setup.md
+++ b/docs/signal-setup.md
@@ -19,7 +19,32 @@ brew install signal-cli
 
 ### 2. Register Your Phone Number
 
-You need a phone number that can receive SMS for verification.
+You have two options:
+
+#### Option A: Link as Secondary Device (Recommended)
+
+Link signal-cli to your existing Signal account without disrupting your phone app:
+
+```bash
+# Print a linking URI (render it as a QR code to scan with your phone)
+signal-cli link -n "LettaBot"
+```
+
+This will display a `sgnl://linkdevice?uuid=...` URI. On your phone:
+1. Open Signal → Settings (tap your profile)
+2. Tap "Linked Devices"
+3. Tap "Link New Device" (+ button)
+4. Scan a QR code generated from the URI (signal-cli prints only the URI; render it with a tool such as `qrencode`)
+
+**Benefits:**
+- Your phone's Signal app continues to work normally
+- Bot runs as a linked device (like Signal Desktop)
+- Both your phone and the bot receive messages
+- You can unlink the bot anytime from your phone
+
+#### Option B: Primary Registration (Dedicated Number Only)
+
+Register signal-cli as the primary device (requires a dedicated phone number):
 
 ```bash
 # Request verification code (sent via SMS)
@@ -29,7 +54,7 @@ signal-cli -a +1XXXXXXXXXX register
 signal-cli -a +1XXXXXXXXXX verify CODE
 ```
 
-**Note:** You can only have one Signal client per number. Registering signal-cli will log out your Signal mobile app. Consider using a secondary number.
+**Warning:** This will log out your Signal mobile app. Only use this option with a dedicated bot number, not your personal number.
 
 ## Configuration
 
diff --git a/docs/slack-setup.md b/docs/slack-setup.md
index e046f2d..0070cef 100644
--- a/docs/slack-setup.md
+++ b/docs/slack-setup.md
@@ -48,6 +48,7 @@ Socket Mode lets your bot connect without exposing a public endpoint.
 |-------|---------|
 | `app_mentions:read` | React when someone @mentions your bot |
 | `chat:write` | Send messages |
+| `files:read` | Download voice message attachments |
 | `im:history` | Read DM message history |
 | `im:read` | View DM channel info |
 | `im:write` | Start DM conversations |
diff --git a/src/channels/discord.ts b/src/channels/discord.ts
index 09b514c..d7cb605 100644
--- a/src/channels/discord.ts
+++ b/src/channels/discord.ts
@@ -180,10 +180,9 @@ Ask the bot owner to approve with:
       const audioAttachment = message.attachments.find(a => a.contentType?.startsWith('audio/'));
       if (audioAttachment?.url) {
         try {
-          const { loadConfig } = await import('../config/index.js');
-          const config = loadConfig();
-          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
-            await message.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+          const { isTranscriptionConfigured } = await import('../transcription/index.js');
+          if (!isTranscriptionConfigured()) {
+            await message.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
           } else {
             // Download audio
             const response = await fetch(audioAttachment.url);
diff --git a/src/channels/setup.ts b/src/channels/setup.ts
index 43ddcbb..b2a5b88 100644
--- a/src/channels/setup.ts
+++ b/src/channels/setup.ts
@@ -494,9 +494,9 @@ export async function setupSignal(existing?: any): Promise<any> {
   
   p.note(
     'See docs/signal-setup.md for detailed instructions.\n' +
-    'Requires signal-cli registered with your phone number.\n\n' +
-    '⚠️  Security: Has full access to your Signal account.\n' +
-    'Can see all messages and send as you.',
+    'Recommended: Link as secondary device (signal-cli link -n "LettaBot")\n' +
+    'This keeps your phone\'s Signal app working normally.\n\n' +
+    'Requires signal-cli registered or linked with your phone number.',
     'Signal Setup'
   );
   
diff --git a/src/channels/signal.ts b/src/channels/signal.ts
index 48617cc..bb8c1f5 100644
--- a/src/channels/signal.ts
+++ b/src/channels/signal.ts
@@ -623,14 +623,12 @@ This code expires in 1 hour.`;
         }
         
         try {
-          const { loadConfig } = await import('../config/index.js');
-          const config = loadConfig();
-          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
+          const { isTranscriptionConfigured } = await import('../transcription/index.js');
+          if (!isTranscriptionConfigured()) {
             if (chatId) {
-              const audioInfo = savedAudioPath ? ` Audio saved to: ${savedAudioPath}` : '';
-              await this.sendMessage({ 
-                chatId, 
-                text: `Voice messages require OpenAI API key for transcription.${audioInfo} See: https://github.com/letta-ai/lettabot#voice-messages` 
+              await this.sendMessage({
+                chatId,
+                text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
               });
             }
           } else {
diff --git a/src/channels/slack.ts b/src/channels/slack.ts
index bf83660..00a8129 100644
--- a/src/channels/slack.ts
+++ b/src/channels/slack.ts
@@ -60,9 +60,9 @@ export class SlackAdapter implements ChannelAdapter {
     
     // Handle messages
     this.app.message(async ({ message, say, client }) => {
-      // Type guard for regular messages
-      if (message.subtype !== undefined) return;
-      if (!('user' in message) || !('text' in message)) return;
+      // Type guard for regular messages (allow file_share for voice messages)
+      if (message.subtype !== undefined && message.subtype !== 'file_share') return;
+      if (!('user' in message)) return;
       
       const userId = message.user;
       let text = message.text || '';
@@ -74,10 +74,9 @@ export class SlackAdapter implements ChannelAdapter {
       const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
       if (audioFile?.url_private_download) {
         try {
-          const { loadConfig } = await import('../config/index.js');
-          const config = loadConfig();
-          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
-            await say('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+          const { isTranscriptionConfigured } = await import('../transcription/index.js');
+          if (!isTranscriptionConfigured()) {
+            await say('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
           } else {
             // Download file (requires bot token for auth)
             const response = await fetch(audioFile.url_private_download, {
@@ -173,10 +172,43 @@ export class SlackAdapter implements ChannelAdapter {
     // Handle app mentions (@bot)
     this.app.event('app_mention', async ({ event }) => {
       const userId = event.user || '';
-      const text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention
+      let text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention
       const channelId = event.channel;
       const threadTs = event.thread_ts || event.ts; // Reply in thread, or start new thread from the mention
-      
+
+      // Handle audio file attachments
+      const files = (event as any).files as Array<{ mimetype?: string; url_private_download?: string; name?: string }> | undefined;
+      const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
+      if (audioFile?.url_private_download) {
+        try {
+          const { isTranscriptionConfigured } = await import('../transcription/index.js');
+          if (!isTranscriptionConfigured()) {
+            await this.sendMessage({ chatId: channelId, text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages', threadId: threadTs });
+            return;
+          }
+          // Download file (requires bot token for auth)
+          const response = await fetch(audioFile.url_private_download, {
+            headers: { 'Authorization': `Bearer ${this.config.botToken}` }
+          });
+          const buffer = Buffer.from(await response.arrayBuffer());
+
+          const { transcribeAudio } = await import('../transcription/index.js');
+          const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
+          const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
+
+          if (result.success && result.text) {
+            console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`);
+            text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`;
+          } else {
+            console.error(`[Slack] Transcription failed: ${result.error}`);
+            text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
+          }
+        } catch (error) {
+          console.error('[Slack] Error transcribing audio:', error);
+          text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
+        }
+      }
+
       if (this.config.allowedUsers && this.config.allowedUsers.length > 0) {
         if (!userId || !this.config.allowedUsers.includes(userId)) {
           // Can't use say() in app_mention event the same way
diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts
index 8b0f89b..d0f2551 100644
--- a/src/channels/telegram.ts
+++ b/src/channels/telegram.ts
@@ -346,10 +346,9 @@ export class TelegramAdapter implements ChannelAdapter {
       const { isGroup, groupName, wasMentioned, isListeningMode } = gating;
 
       // Check if transcription is configured (config or env)
-      const { loadConfig } = await import('../config/index.js');
-      const config = loadConfig();
-      if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
-        await ctx.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+      const { isTranscriptionConfigured } = await import('../transcription/index.js');
+      if (!isTranscriptionConfigured()) {
+        await ctx.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
         return;
       }
 
diff --git a/src/channels/whatsapp/inbound/extract.ts b/src/channels/whatsapp/inbound/extract.ts
index 242f6e0..ba37bdf 100644
--- a/src/channels/whatsapp/inbound/extract.ts
+++ b/src/channels/whatsapp/inbound/extract.ts
@@ -143,18 +143,22 @@ export async function extractInboundMessage(
 
   // Collect attachments if media present and config provided
   let attachments: InboundAttachment[] = [];
+  let voiceTranscription: string | undefined;
   if (preview.hasMedia && attachmentConfig) {
     const result = await collectAttachments({
       messageContent,
       chatId: remoteJid,
       messageId: messageId || 'unknown',
+      sock,
       ...attachmentConfig,
     });
     attachments = result.attachments;
+    voiceTranscription = result.voiceTranscription;
   }
 
   // Use caption as fallback text (for media-only messages)
-  const finalBody = body || preview.caption || '';
+  // For voice messages, use transcription if available
+  const finalBody = voiceTranscription || body || preview.caption || '';
   if (!finalBody && attachments.length === 0) {
     return null; // Skip messages with no text and no media
   }
diff --git a/src/channels/whatsapp/inbound/media.ts b/src/channels/whatsapp/inbound/media.ts
index 5251ca1..5a24283 100644
--- a/src/channels/whatsapp/inbound/media.ts
+++ b/src/channels/whatsapp/inbound/media.ts
@@ -55,19 +55,21 @@ export function extractMediaPreview(messageContent: any): { hasMedia: boolean; c
  * Handles 5 media types: image, video, audio, document, sticker.
  * Downloads using Baileys' downloadContentFromMessage and saves to disk.
  * Enforces size limits and supports metadata-only mode.
+ * Transcribes voice messages (ptt: true) using configured transcription provider.
  *
  * @param params - Attachment collection parameters
- * @returns Attachments array and optional caption
+ * @returns Attachments array, optional caption, and optional transcribed text for voice messages
  */
 export async function collectAttachments(params: {
   messageContent: any;
   chatId: string;
   messageId: string;
   downloadContentFromMessage: (message: any, type: string) => Promise<AsyncIterable<Uint8Array>>;
+  sock: import("@whiskeysockets/baileys").WASocket;
   attachmentsDir?: string;
   attachmentsMaxBytes?: number;
-}): Promise<{ attachments: InboundAttachment[]; caption?: string }> {
-  const { messageContent, chatId, messageId, downloadContentFromMessage, attachmentsDir, attachmentsMaxBytes } = params;
+}): Promise<{ attachments: InboundAttachment[]; caption?: string; voiceTranscription?: string }> {
+  const { messageContent, chatId, messageId, downloadContentFromMessage, sock, attachmentsDir, attachmentsMaxBytes } = params;
   const attachments: InboundAttachment[] = [];
 
   if (!messageContent) return { attachments };
@@ -122,6 +124,10 @@ export async function collectAttachments(params: {
     kind,
   };
 
+  // Check if this is a voice message (ptt = push-to-talk)
+  const isPttVoiceMessage = mediaType === 'audio' && mediaMessage.ptt === true;
+  let voiceTranscription: string | undefined;
+
   // Download if attachmentsDir is configured
   if (attachmentsDir) {
     // Metadata-only mode (attachmentsMaxBytes = 0)
@@ -151,9 +157,52 @@ export async function collectAttachments(params: {
     }
   }
 
+  // Transcribe voice messages
+  if (isPttVoiceMessage) {
+    try {
+      const { isTranscriptionConfigured } = await import('../../../transcription/index.js');
+      if (!isTranscriptionConfigured()) {
+        // Send error message directly to user (matches Telegram/Slack/Discord/Signal behavior)
+        try {
+          await sock.sendMessage(chatId, {
+            text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
+          });
+        } catch (sendError) {
+          console.error('[WhatsApp] Failed to send transcription error message:', sendError);
+        }
+        // Don't forward error to agent - return early
+        const caption = mediaMessage.caption as string | undefined;
+        return { attachments, caption };
+      }
+
+      // Download audio buffer for transcription
+      const stream = await downloadContentFromMessage(mediaMessage, mediaType);
+      const chunks: Uint8Array[] = [];
+      for await (const chunk of stream) {
+        chunks.push(chunk);
+      }
+      const buffer = Buffer.concat(chunks);
+
+      // Transcribe audio
+      const { transcribeAudio } = await import('../../../transcription/index.js');
+      const result = await transcribeAudio(buffer, name);
+
+      if (result.success && result.text) {
+        console.log(`[WhatsApp] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
+        voiceTranscription = `[Voice message]: ${result.text}`;
+      } else {
+        console.error(`[WhatsApp] Transcription failed: ${result.error}`);
+        voiceTranscription = `[Voice message - transcription failed: ${result.error}]`;
+      }
+    } catch (error) {
+      console.error('[WhatsApp] Error transcribing voice message:', error);
+      voiceTranscription = `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
+    }
+  }
+
   attachments.push(attachment);
   const caption = mediaMessage.caption as string | undefined;
-  return { attachments, caption };
+  return { attachments, caption, voiceTranscription };
 }
 
 /**
diff --git a/src/config/types.ts b/src/config/types.ts
index 06a2210..fdc5fd8 100644
--- a/src/config/types.ts
+++ b/src/config/types.ts
@@ -183,9 +183,9 @@ export interface LettaBotConfig {
 }
 
 export interface TranscriptionConfig {
-  provider: 'openai';  // Only OpenAI supported currently
-  apiKey?: string;     // Falls back to OPENAI_API_KEY env var
-  model?: string;      // Defaults to 'whisper-1'
+  provider: 'openai' | 'mistral';
+  apiKey?: string;     // Falls back to OPENAI_API_KEY or MISTRAL_API_KEY env var
+  model?: string;      // Defaults to 'whisper-1' (OpenAI) or 'voxtral-mini-latest' (Mistral)
 }
 
 export interface PollingYamlConfig {
diff --git a/src/onboard.ts b/src/onboard.ts
index 207fdc7..4f62a97 100644
--- a/src/onboard.ts
+++ b/src/onboard.ts
@@ -290,7 +290,7 @@ interface OnboardConfig {
   cron: boolean;
 
   // Transcription (voice messages)
-  transcription: { enabled: boolean; apiKey?: string; model?: string };
+  transcription: { enabled: boolean; provider?: 'openai' | 'mistral'; apiKey?: string; model?: string };
 }
 
 const isPlaceholder = (val?: string) => !val || /^(your_|sk-\.\.\.|placeholder|example)/i.test(val);
@@ -665,6 +665,7 @@ async function stepProviders(config: OnboardConfig, env: Record<string, string>)
             });
             if (!p.isCancel(enableTranscription) && enableTranscription) {
               config.transcription.enabled = true;
+              config.transcription.provider = 'openai';
               config.transcription.apiKey = providerKey;
             }
           }
@@ -838,23 +839,39 @@ async function stepFeatures(config: OnboardConfig): Promise<void> {
 // Voice Transcription Setup
 // ============================================================================
 
-async function stepTranscription(config: OnboardConfig): Promise<void> {
-  // Skip if already configured from the providers step
-  if (config.transcription.enabled && config.transcription.apiKey) return;
+async function stepTranscription(config: OnboardConfig, forcePrompt?: boolean): Promise<void> {
+  // Skip if already configured (e.g. from OpenAI shortcut in stepProviders)
+  if (!forcePrompt && config.transcription.enabled && config.transcription.apiKey) return;
 
   const setupTranscription = await p.confirm({
-    message: 'Enable voice message transcription? (uses OpenAI Whisper)',
+    message: 'Enable voice message transcription?',
     initialValue: config.transcription.enabled,
   });
   if (p.isCancel(setupTranscription)) { p.cancel('Setup cancelled'); process.exit(0); }
   config.transcription.enabled = setupTranscription;
 
   if (setupTranscription) {
-    const existingKey = process.env.OPENAI_API_KEY;
+    const providerChoice = await p.select({
+      message: 'Transcription provider',
+      options: [
+        { value: 'openai', label: 'OpenAI Whisper', hint: 'whisper-1' },
+        { value: 'mistral', label: 'Mistral Voxtral', hint: 'voxtral-mini-latest' },
+      ],
+      initialValue: config.transcription.provider || 'openai',
+    });
+    if (p.isCancel(providerChoice)) { p.cancel('Setup cancelled'); process.exit(0); }
+    config.transcription.provider = providerChoice as 'openai' | 'mistral';
+
+    const isMistral = config.transcription.provider === 'mistral';
+    // Check env vars first, then check if key was already entered for LLM provider
+    const existingKey = isMistral
+      ? process.env.MISTRAL_API_KEY
+      : (process.env.OPENAI_API_KEY || config.providers?.find(p => p.id === 'openai')?.apiKey);
+    const providerLabel = isMistral ? 'Mistral' : 'OpenAI';
 
     const apiKey = await p.text({
-      message: 'OpenAI API Key (for Whisper transcription)',
-      placeholder: 'sk-...',
+      message: `${providerLabel} API Key`,
+      placeholder: isMistral ? '' : 'sk-...',
       initialValue: existingKey || '',
       validate: (v) => {
         if (!v) return 'API key is required for voice transcription';
@@ -1197,7 +1214,10 @@ function showSummary(config: OnboardConfig): void {
   lines.push(`Features:  ${features.length > 0 ? features.join(', ') : 'None'}`);
   
   // Transcription
-  lines.push(`Voice:     ${config.transcription.enabled ? 'Enabled (OpenAI Whisper)' : 'Disabled'}`);
+  const voiceLabel = config.transcription.enabled
+    ? `Enabled (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})`
+    : 'Disabled';
+  lines.push(`Voice:     ${voiceLabel}`);
 
   // Google
   if (config.google.enabled) {
@@ -1243,7 +1263,7 @@ async function reviewLoop(config: OnboardConfig, env: Record<string, string>): P
     }
     else if (choice === 'channels') await stepChannels(config, env);
     else if (choice === 'features') await stepFeatures(config);
-    else if (choice === 'transcription') await stepTranscription(config);
+    else if (choice === 'transcription') await stepTranscription(config, true);
     else if (choice === 'google') await stepGoogle(config);
   }
 }
@@ -1473,7 +1493,8 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
     },
     cron: existingConfig.features?.cron || false,
     transcription: {
-      enabled: !!existingConfig.transcription?.apiKey || !!process.env.OPENAI_API_KEY,
+      enabled: !!existingConfig.transcription?.apiKey || !!process.env.OPENAI_API_KEY || !!process.env.MISTRAL_API_KEY,
+      provider: existingConfig.transcription?.provider || 'openai',
       apiKey: existingConfig.transcription?.apiKey,
       model: existingConfig.transcription?.model,
     },
@@ -1639,7 +1660,11 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
   }
 
   if (config.transcription.enabled && config.transcription.apiKey) {
-    env.OPENAI_API_KEY = config.transcription.apiKey;
+    if (config.transcription.provider === 'mistral') {
+      env.MISTRAL_API_KEY = config.transcription.apiKey;
+    } else {
+      env.OPENAI_API_KEY = config.transcription.apiKey;
+    }
   }
 
   // Helper to format access control status
@@ -1670,7 +1695,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
     'Features:',
     config.heartbeat.enabled ? `  ✓ Heartbeat (${config.heartbeat.interval}min)` : '  ✗ Heartbeat',
     config.cron ? '  ✓ Cron jobs' : '  ✗ Cron jobs',
-    config.transcription.enabled ? '  ✓ Voice transcription (OpenAI Whisper)' : '  ✗ Voice transcription',
+    config.transcription.enabled ? `  ✓ Voice transcription (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})` : '  ✗ Voice transcription',
   ].join('\n');
   
   p.note(summary, 'Configuration Summary');
@@ -1782,7 +1807,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
     agents: [agentConfig],
     ...(config.transcription.enabled && config.transcription.apiKey ? {
       transcription: {
-        provider: 'openai' as const,
+        provider: config.transcription.provider || 'openai',
         apiKey: config.transcription.apiKey,
         ...(config.transcription.model ? { model: config.transcription.model } : {}),
       },
diff --git a/src/setup/slack-wizard.ts b/src/setup/slack-wizard.ts
index d2a3a49..91b696e 100644
--- a/src/setup/slack-wizard.ts
+++ b/src/setup/slack-wizard.ts
@@ -57,12 +57,15 @@ export async function runSlackWizard(existingConfig?: {
   // Step 1: Create Slack App from Manifest (scopes, events, Socket Mode all pre-configured)
   const createdApp = await stepCreateApp();
   if (!createdApp) return null;
-  
-  // Step 2: Install to Workspace + Get Bot Token
+
+  // Step 2: Configure App Home (enable DM messaging)
+  await stepConfigureAppHome();
+
+  // Step 3: Install to Workspace + Get Bot Token
   const botToken = await stepInstallApp(existingConfig?.botToken);
   if (!botToken) return null;
   
-  // Step 3: Enable Socket Mode + Get App Token
+  // Step 4: Enable Socket Mode + Get App Token
   const appToken = await stepEnableSocketMode(existingConfig?.appToken);
   if (!appToken) return null;
   
@@ -82,7 +85,7 @@ export async function runSlackWizard(existingConfig?: {
 }
 
 async function stepCreateApp(): Promise<boolean> {
-  p.log.step('Step 1/3: Create Slack App from Manifest');
+  p.log.step('Step 1/4: Create Slack App from Manifest');
   
   // Inline manifest for Socket Mode configuration
   const appName = process.env.SLACK_APP_NAME || process.env.LETTA_AGENT_NAME || 'LettaBot';
@@ -99,6 +102,7 @@ oauth_config:
     bot:
       - app_mentions:read
       - chat:write
+      - files:read
       - im:history
       - im:read
       - im:write
@@ -117,7 +121,7 @@ settings:
   p.note(
     'Creates app with everything pre-configured:\n' +
     '  • Socket Mode enabled\n' +
-    '  • 5 bot scopes (app_mentions:read, chat:write, im:*)\n' +
+    '  • 6 bot scopes (app_mentions:read, chat:write, files:read, im:*)\n' +
     '  • 2 event subscriptions (app_mention, message.im)\n\n' +
     'Just review and click "Create"!',
     'One-Click Setup'
@@ -162,7 +166,7 @@ settings:
 }
 
 async function stepEnableSocketMode(existingToken?: string): Promise<string | null> {
-  p.log.step('Step 3/3: Get App-Level Token');
+  p.log.step('Step 4/4: Get App-Level Token');
   
   p.note(
     '1. In the left sidebar, click "Socket Mode"\n' +
@@ -197,6 +201,7 @@ async function stepConfigureScopes(): Promise<boolean> {
     '3. Click "Add an OAuth Scope" for each:\n' +
     '   • app_mentions:read\n' +
     '   • chat:write\n' +
+    '   • files:read\n' +
     '   • im:history\n' +
     '   • im:read\n' +
     '   • im:write',
@@ -244,7 +249,7 @@ async function stepConfigureEvents(): Promise<boolean> {
 }
 
 async function stepConfigureAppHome(): Promise<boolean> {
-  p.log.step('Step 5/6: Configure App Home');
+  p.log.step('Step 2/4: Configure App Home');
   
   p.note(
     '1. Go to "App Home" in left sidebar\n' +
@@ -267,7 +272,7 @@ async function stepConfigureAppHome(): Promise<boolean> {
 }
 
 async function stepInstallApp(existingToken?: string): Promise<string | null> {
-  p.log.step('Step 6/6: Install to Workspace');
+  p.log.step('Step 3/4: Install to Workspace');
   
   p.note(
     '1. Go to "Install App" in left sidebar\n' +
diff --git a/src/transcription/index.ts b/src/transcription/index.ts
index a7d9f2d..b229c7f 100644
--- a/src/transcription/index.ts
+++ b/src/transcription/index.ts
@@ -1,7 +1,39 @@
 /**
- * Transcription service
- * 
- * Currently supports OpenAI Whisper. Future providers can be added here.
+ * Transcription service router
+ *
+ * Delegates to the correct provider based on config.transcription.provider.
+ * Defaults to OpenAI Whisper for backwards compatibility.
  */
 
-export { transcribeAudio, type TranscriptionResult } from './openai.js';
+import { loadConfig } from '../config/index.js';
+import type { TranscriptionResult } from './openai.js';
+import { transcribeAudio as openaiTranscribe } from './openai.js';
+import { transcribeAudio as mistralTranscribe } from './mistral.js';
+
+export type { TranscriptionResult } from './openai.js';
+
+/**
+ * Check whether a transcription API key is available for the configured provider.
+ * Used by channel handlers to gate voice message processing.
+ */
+export function isTranscriptionConfigured(): boolean {
+  const config = loadConfig();
+  const provider = config.transcription?.provider || 'openai';
+  return !!(config.transcription?.apiKey
+    || (provider === 'mistral' ? process.env.MISTRAL_API_KEY : process.env.OPENAI_API_KEY));
+}
+
+export async function transcribeAudio(
+  audioBuffer: Buffer,
+  filename?: string,
+  options?: { audioPath?: string }
+): Promise<TranscriptionResult> {
+  const config = loadConfig();
+  const provider = config.transcription?.provider || 'openai';
+
+  if (provider === 'mistral') {
+    return mistralTranscribe(audioBuffer, filename, options);
+  }
+
+  return openaiTranscribe(audioBuffer, filename, options);
+}
diff --git a/src/transcription/mistral.ts b/src/transcription/mistral.ts
new file mode 100644
index 0000000..0644b9d
--- /dev/null
+++ b/src/transcription/mistral.ts
@@ -0,0 +1,244 @@
+/**
+ * Mistral Voxtral transcription service
+ *
+ * Uses Voxtral Transcribe 2 via the Mistral REST API.
+ * Simple multipart POST — no SDK dependency needed.
+ */
+
+import { execFileSync, execSync } from 'node:child_process';
+import { mkdirSync, mkdtempSync, readdirSync, readFileSync, rmSync, unlinkSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { loadConfig } from '../config/index.js';
+import type { TranscriptionResult } from './openai.js';
+
+const MAX_FILE_SIZE = 20 * 1024 * 1024;
+const CHUNK_DURATION_SECONDS = 600;
+
+/** Resolve the Mistral API key: config `transcription.apiKey` first, then MISTRAL_API_KEY. */
+function getApiKey(): string {
+  const key = loadConfig().transcription?.apiKey || process.env.MISTRAL_API_KEY;
+  if (key) return key;
+
+  // Neither source yielded a non-empty key, so transcription cannot proceed.
+  throw new Error('Mistral API key required for transcription. Set in config (transcription.apiKey) or MISTRAL_API_KEY env var.');
+}
+
+/** Model id: config `transcription.model`, else TRANSCRIPTION_MODEL, else 'voxtral-mini-latest'. */
+function getModel(): string {
+  return loadConfig().transcription?.model || process.env.TRANSCRIPTION_MODEL || 'voxtral-mini-latest';
+}
+
+/** Map a filename's extension to an audio MIME type; unknown extensions fall back to audio/ogg. */
+function getMimeType(filename: string): string {
+  // A Map has no inherited keys: with a plain object, "x.constructor" resolved to a function.
+  const mimeByExt = new Map<string, string>([
+    ['ogg', 'audio/ogg'], ['oga', 'audio/ogg'],
+    ['mp3', 'audio/mpeg'],
+    ['mp4', 'audio/mp4'], ['m4a', 'audio/mp4'],
+    ['wav', 'audio/wav'],
+    ['flac', 'audio/flac'],
+    ['webm', 'audio/webm'],
+  ]);
+  const ext = filename.split('.').pop()?.toLowerCase() ?? '';
+  return mimeByExt.get(ext) ?? 'audio/ogg';
+}
+
+const NEEDS_CONVERSION = ['aac', 'amr', 'caf', 'x-caf', '3gp', '3gpp']; // extensions transcribeAudio renames or converts before upload
+
+const FORMAT_MAP: Record<string, string> = { // extension to retry under (same bytes) before falling back to ffmpeg
+  'aac': 'm4a',
+  'amr': 'mp3',
+  'opus': 'ogg', // not listed in NEEDS_CONVERSION, so transcribeAudio never looks this entry up
+  'x-caf': 'm4a',
+  'caf': 'm4a',
+  '3gp': 'mp4',
+  '3gpp': 'mp4',
+};
+
+let ffmpegAvailable: boolean | null = null; // probe cache; null = not checked yet
+
+/** Check once per process whether the `ffmpeg` executable can be launched. */
+function isFfmpegAvailable(): boolean {
+  if (ffmpegAvailable !== null) return ffmpegAvailable;
+  try {
+    // Launch ffmpeg itself instead of `which`, which does not exist on Windows.
+    execSync('ffmpeg -version', { stdio: 'ignore' });
+    return (ffmpegAvailable = true);
+  } catch {
+    return (ffmpegAvailable = false);
+  }
+}
+
+/** Convert audio to MP3 with the ffmpeg CLI (30s timeout); throws if ffmpeg fails. */
+function convertAudioToMp3(audioBuffer: Buffer, inputExt: string): Buffer {
+  const baseDir = join(tmpdir(), 'lettabot-transcription');
+  mkdirSync(baseDir, { recursive: true });
+  // Private directory per call: Date.now()-based names collided for concurrent conversions.
+  const workDir = mkdtempSync(join(baseDir, 'convert-'));
+  try {
+    const inputPath = join(workDir, `input.${inputExt.replace(/[^a-z0-9-]/gi, '')}`);
+    const outputPath = join(workDir, 'output.mp3');
+    writeFileSync(inputPath, audioBuffer);
+    // Argument array, no shell: paths are never interpreted and no POSIX-only redirect is needed.
+    const args = ['-y', '-i', inputPath, '-acodec', 'libmp3lame', '-q:a', '2', outputPath];
+    execFileSync('ffmpeg', args, { stdio: 'ignore', timeout: 30000 });
+    const converted = readFileSync(outputPath);
+    console.log(`[Transcription] Converted ${audioBuffer.length} bytes → ${converted.length} bytes`);
+    return converted;
+  } finally {
+    try { rmSync(workDir, { recursive: true, force: true }); } catch { /* best-effort cleanup */ }
+  }
+}
+
+/**
+ * Send a single buffer to the Voxtral API and return the transcribed text.
+ * Throws on a non-OK HTTP status or when the response JSON lacks a string `text`.
+ */
+async function attemptTranscription(audioBuffer: Buffer, filename: string): Promise<string> {
+  const apiKey = getApiKey();
+  const formData = new FormData();
+  formData.append('model', getModel());
+  formData.append(
+    'file',
+    new File([new Uint8Array(audioBuffer)], filename, { type: getMimeType(filename) }),
+  );
+
+  const response = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
+    method: 'POST',
+    headers: { 'Authorization': `Bearer ${apiKey}` },
+    body: formData,
+  });
+  if (!response.ok) {
+    const errorText = await response.text();
+    throw new Error(`Mistral API error (${response.status}): ${errorText}`);
+  }
+
+  // The body is untrusted JSON: verify its shape instead of asserting it with a cast.
+  const data: unknown = await response.json();
+  const text = typeof data === 'object' && data !== null ? (data as { text?: unknown }).text : undefined;
+  if (typeof text !== 'string') throw new Error('Mistral API response did not contain a transcription text');
+  return text;
+}
+
+/**
+ * Split large audio into chunks and transcribe each.
+ *
+ * ffmpeg cuts the audio into MP3 segments of CHUNK_DURATION_SECONDS; segments are
+ * transcribed sequentially and the non-empty results are joined with spaces.
+ *
+ * @throws Error if ffmpeg is unavailable or fails, no chunks result, or a chunk fails.
+ */
+async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
+  if (!isFfmpegAvailable()) {
+    throw new Error('Cannot split large audio files without ffmpeg');
+  }
+
+  const baseDir = join(tmpdir(), 'lettabot-transcription');
+  mkdirSync(baseDir, { recursive: true });
+  // mkdtemp yields a unique directory even when two splits start in the same millisecond.
+  const tempDir = mkdtempSync(join(baseDir, 'chunks-'));
+  // `ext` derives from the caller-supplied filename, which may be attacker-chosen: keep it inert.
+  const safeExt = ext.replace(/[^a-z0-9-]/gi, '') || 'bin';
+  const inputPath = join(tempDir, `input.${safeExt}`);
+  const outputPattern = join(tempDir, 'chunk-%03d.mp3');
+
+  try {
+    writeFileSync(inputPath, audioBuffer);
+    // Argument array (no shell): prevents command injection and needs no /dev/null redirect.
+    execFileSync('ffmpeg', [
+      '-y', '-i', inputPath,
+      '-f', 'segment', '-segment_time', String(CHUNK_DURATION_SECONDS), '-reset_timestamps', '1',
+      '-acodec', 'libmp3lame', '-q:a', '2',
+      outputPattern,
+    ], { stdio: 'ignore', timeout: 120000 });
+
+    const chunkFiles = readdirSync(tempDir)
+      .filter(f => f.startsWith('chunk-') && f.endsWith('.mp3'))
+      .sort();
+    if (chunkFiles.length === 0) {
+      throw new Error('Failed to split audio into chunks');
+    }
+    console.log(`[Transcription] Split into ${chunkFiles.length} chunks`);
+
+    const transcriptions: string[] = [];
+    for (let i = 0; i < chunkFiles.length; i++) {
+      const chunkBuffer = readFileSync(join(tempDir, chunkFiles[i]));
+      console.log(`[Transcription] Transcribing chunk ${i + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
+      const text = (await attemptTranscription(chunkBuffer, chunkFiles[i])).trim();
+      if (text) transcriptions.push(text);
+    }
+
+    const combined = transcriptions.join(' ');
+    console.log(`[Transcription] Combined ${transcriptions.length} chunks into ${combined.length} chars`);
+    return combined;
+  } finally {
+    // Best-effort cleanup; a failure here must not mask the transcription outcome.
+    try { rmSync(tempDir, { recursive: true, force: true }); } catch {}
+  }
+}
+
+/**
+ * Transcribe audio using Mistral Voxtral API
+ *
+ * Voxtral supports: wav, mp3, flac, ogg, webm
+ * Telegram voice messages (OGG/Opus) work natively.
+ *
+ * Extensions in NEEDS_CONVERSION are first retried as-is under the FORMAT_MAP
+ * name, then converted to MP3 with ffmpeg. Buffers above MAX_FILE_SIZE are
+ * split into chunks. Never throws: failures come back as `success: false`.
+ */
+export async function transcribeAudio(
+  audioBuffer: Buffer,
+  filename: string = 'audio.ogg',
+  options?: { audioPath?: string }
+): Promise<TranscriptionResult> {
+  // Only text after the last dot is an extension; a dotless name has none.
+  const dotIndex = filename.lastIndexOf('.');
+  const ext = dotIndex >= 0 ? filename.slice(dotIndex + 1).toLowerCase() : '';
+
+  try {
+    let finalBuffer = audioBuffer;
+    let finalFilename = filename;
+
+    if (NEEDS_CONVERSION.includes(ext)) {
+      const mapped = FORMAT_MAP[ext];
+      if (mapped) {
+        // Cheap first attempt: same bytes, different file name.
+        console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`);
+        try {
+          const text = await attemptTranscription(audioBuffer, filename.replace(/\.[^.]+$/, `.${mapped}`));
+          return { success: true, text };
+        } catch (renameError) {
+          const reason = renameError instanceof Error ? renameError.message : String(renameError);
+          console.log(`[Transcription] Rename approach failed for .${ext}: ${reason}`);
+        }
+      }
+
+      if (!isFfmpegAvailable()) {
+        return {
+          success: false,
+          error: `Cannot transcribe .${ext} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, flac).`,
+          audioPath: options?.audioPath,
+        };
+      }
+      console.log(`[Transcription] Converting .${ext} → .mp3 with ffmpeg`);
+      finalBuffer = convertAudioToMp3(audioBuffer, ext);
+      finalFilename = filename.replace(/\.[^.]+$/, '.mp3');
+    }
+
+    // Check file size and chunk if needed
+    if (finalBuffer.length > MAX_FILE_SIZE) {
+      const finalExt = finalFilename.split('.').pop()?.toLowerCase() || 'ogg';
+      console.log(`[Transcription] File too large (${(finalBuffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
+      const text = await transcribeInChunks(finalBuffer, finalExt);
+      return { success: true, text };
+    }
+
+    const text = await attemptTranscription(finalBuffer, finalFilename);
+    return { success: true, text };
+  } catch (error) {
+    const errorMsg = error instanceof Error ? error.message : String(error);
+    return { success: false, error: errorMsg, audioPath: options?.audioPath };
+  }
+}