feat: add Mistral Voxtral transcription support (#228)

2026-02-23 23:37:12 +02:00
parent 6bda859559
commit cae5b104b3
15 changed files with 496 additions and 65 deletions
--- a/README.md
+++ b/README.md
@@ -109,12 +109,14 @@ That's it! Message your bot on Telegram.

 ## Voice Messages

-LettaBot can transcribe voice messages using OpenAI Whisper. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
+LettaBot can transcribe voice messages using either OpenAI Whisper or Mistral Voxtral. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.

 **Supported channels:** Telegram, WhatsApp, Signal, Slack, Discord

 ### Configuration

+**Option 1: OpenAI Whisper**
+
 Add your OpenAI API key to `lettabot.yaml`:

 ```yaml
@@ -130,7 +132,23 @@ Or set via environment variable:
 export OPENAI_API_KEY=sk-...
 ```

-If no API key is configured, voice messages are silently ignored.
+**Option 2: Mistral Voxtral** (2x faster and 2x cheaper than OpenAI Whisper)
+
+Add your Mistral API key to `lettabot.yaml`:
+
+```yaml
+transcription:
+  provider: mistral
+  apiKey: ...
+```
+
+Or set via environment variable:
+
+```bash
+export MISTRAL_API_KEY=...
+```
+
+If no API key is configured, users will receive an error message with a link to this section.

 ## Skills
 LettaBot is compatible with [skills.sh](https://skills.sh) and [Clawdhub](https://clawdhub.com/). 
--- a/docs/signal-setup.md
+++ b/docs/signal-setup.md
@@ -19,7 +19,32 @@ brew install signal-cli

 ### 2. Register Your Phone Number

-You need a phone number that can receive SMS for verification.
+You have two options:
+
+#### Option A: Link as Secondary Device (Recommended)
+
+Link signal-cli to your existing Signal account without disrupting your phone app:
+
+```bash
+# Generate a linking QR code/URI
+signal-cli link -n "LettaBot"
+```
+
+This will display a `sgnl://linkdevice?uuid=...` URI. On your phone:
+1. Open Signal → Settings (tap your profile)
+2. Tap "Linked Devices"
+3. Tap "Link New Device" (+ button)
+4. Scan the QR code or enter the URI
+
+**Benefits:**
+- Your phone's Signal app continues to work normally
+- Bot runs as a linked device (like Signal Desktop)
+- Both your phone and the bot receive messages
+- You can unlink the bot anytime from your phone
+
+#### Option B: Primary Registration (Dedicated Number Only)
+
+Register signal-cli as the primary device (requires a dedicated phone number):

 ```bash
 # Request verification code (sent via SMS)
@@ -29,7 +54,7 @@ signal-cli -a +1XXXXXXXXXX register
 signal-cli -a +1XXXXXXXXXX verify CODE
 ```

-**Note:** You can only have one Signal client per number. Registering signal-cli will log out your Signal mobile app. Consider using a secondary number.
+**Warning:** This will log out your Signal mobile app. Only use this option with a dedicated bot number, not your personal number.

 ## Configuration

--- a/docs/slack-setup.md
+++ b/docs/slack-setup.md
@@ -48,6 +48,7 @@ Socket Mode lets your bot connect without exposing a public endpoint.
 |-------|---------|
 | `app_mentions:read` | React when someone @mentions your bot |
 | `chat:write` | Send messages |
+| `files:read` | Download voice message attachments |
 | `im:history` | Read DM message history |
 | `im:read` | View DM channel info |
 | `im:write` | Start DM conversations |
--- a/src/channels/discord.ts
+++ b/src/channels/discord.ts
@@ -180,10 +180,9 @@ Ask the bot owner to approve with:
      const audioAttachment = message.attachments.find(a => a.contentType?.startsWith('audio/'));
      if (audioAttachment?.url) {
        try {
-          const { loadConfig } = await import('../config/index.js');
-          const config = loadConfig();
-          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
-            await message.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+          const { isTranscriptionConfigured } = await import('../transcription/index.js');
+          if (!isTranscriptionConfigured()) {
+            await message.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
          } else {
            // Download audio
            const response = await fetch(audioAttachment.url);
--- a/src/channels/setup.ts
+++ b/src/channels/setup.ts
@@ -494,9 +494,9 @@ export async function setupSignal(existing?: any): Promise<any> {
  
  p.note(
    'See docs/signal-setup.md for detailed instructions.\n' +
-    'Requires signal-cli registered with your phone number.\n\n' +
-    '⚠️  Security: Has full access to your Signal account.\n' +
-    'Can see all messages and send as you.',
+    'Recommended: Link as secondary device (signal-cli link -n "LettaBot")\n' +
+    'This keeps your phone\'s Signal app working normally.\n\n' +
+    'Requires signal-cli registered or linked with your phone number.',
    'Signal Setup'
  );
  
--- a/src/channels/signal.ts
+++ b/src/channels/signal.ts
@@ -623,14 +623,12 @@ This code expires in 1 hour.`;
        }
        
        try {
-          const { loadConfig } = await import('../config/index.js');
-          const config = loadConfig();
-          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
+          const { isTranscriptionConfigured } = await import('../transcription/index.js');
+          if (!isTranscriptionConfigured()) {
            if (chatId) {
-              const audioInfo = savedAudioPath ? ` Audio saved to: ${savedAudioPath}` : '';
              await this.sendMessage({
                chatId,
-                text: `Voice messages require OpenAI API key for transcription.${audioInfo} See: https://github.com/letta-ai/lettabot#voice-messages` 
+                text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
              });
            }
          } else {
--- a/src/channels/slack.ts
+++ b/src/channels/slack.ts
@@ -60,9 +60,9 @@ export class SlackAdapter implements ChannelAdapter {
    
    // Handle messages
    this.app.message(async ({ message, say, client }) => {
-      // Type guard for regular messages
-      if (message.subtype !== undefined) return;
-      if (!('user' in message) || !('text' in message)) return;
+      // Type guard for regular messages (allow file_share for voice messages)
+      if (message.subtype !== undefined && message.subtype !== 'file_share') return;
+      if (!('user' in message)) return;
      
      const userId = message.user;
      let text = message.text || '';
@@ -74,10 +74,9 @@ export class SlackAdapter implements ChannelAdapter {
      const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
      if (audioFile?.url_private_download) {
        try {
-          const { loadConfig } = await import('../config/index.js');
-          const config = loadConfig();
-          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
-            await say('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+          const { isTranscriptionConfigured } = await import('../transcription/index.js');
+          if (!isTranscriptionConfigured()) {
+            await say('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
          } else {
            // Download file (requires bot token for auth)
            const response = await fetch(audioFile.url_private_download, {
@@ -173,10 +172,43 @@ export class SlackAdapter implements ChannelAdapter {
    // Handle app mentions (@bot)
    this.app.event('app_mention', async ({ event }) => {
      const userId = event.user || '';
-      const text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention
+      let text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention
      const channelId = event.channel;
      const threadTs = event.thread_ts || event.ts; // Reply in thread, or start new thread from the mention

+      // Handle audio file attachments
+      const files = (event as any).files as Array<{ mimetype?: string; url_private_download?: string; name?: string }> | undefined;
+      const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
+      if (audioFile?.url_private_download) {
+        try {
+          const { isTranscriptionConfigured } = await import('../transcription/index.js');
+          if (!isTranscriptionConfigured()) {
+            await this.sendMessage({ chatId: channelId, text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages', threadId: threadTs });
+            return;
+          }
+          // Download file (requires bot token for auth)
+          const response = await fetch(audioFile.url_private_download, {
+            headers: { 'Authorization': `Bearer ${this.config.botToken}` }
+          });
+          const buffer = Buffer.from(await response.arrayBuffer());
+
+          const { transcribeAudio } = await import('../transcription/index.js');
+          const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
+          const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
+
+          if (result.success && result.text) {
+            console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`);
+            text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`;
+          } else {
+            console.error(`[Slack] Transcription failed: ${result.error}`);
+            text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
+          }
+        } catch (error) {
+          console.error('[Slack] Error transcribing audio:', error);
+          text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
+        }
+      }
+
      if (this.config.allowedUsers && this.config.allowedUsers.length > 0) {
        if (!userId || !this.config.allowedUsers.includes(userId)) {
          // Can't use say() in app_mention event the same way
--- a/src/channels/telegram.ts
+++ b/src/channels/telegram.ts
@@ -346,10 +346,9 @@ export class TelegramAdapter implements ChannelAdapter {
      const { isGroup, groupName, wasMentioned, isListeningMode } = gating;

      // Check if transcription is configured (config or env)
-      const { loadConfig } = await import('../config/index.js');
-      const config = loadConfig();
-      if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
-        await ctx.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+      const { isTranscriptionConfigured } = await import('../transcription/index.js');
+      if (!isTranscriptionConfigured()) {
+        await ctx.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
        return;
      }

--- a/src/channels/whatsapp/inbound/extract.ts
+++ b/src/channels/whatsapp/inbound/extract.ts
@@ -143,18 +143,22 @@ export async function extractInboundMessage(

  // Collect attachments if media present and config provided
  let attachments: InboundAttachment[] = [];
+  let voiceTranscription: string | undefined;
  if (preview.hasMedia && attachmentConfig) {
    const result = await collectAttachments({
      messageContent,
      chatId: remoteJid,
      messageId: messageId || 'unknown',
+      sock,
      ...attachmentConfig,
    });
    attachments = result.attachments;
+    voiceTranscription = result.voiceTranscription;
  }

  // Use caption as fallback text (for media-only messages)
-  const finalBody = body || preview.caption || '';
+  // For voice messages, use transcription if available
+  const finalBody = voiceTranscription || body || preview.caption || '';
  if (!finalBody && attachments.length === 0) {
    return null; // Skip messages with no text and no media
  }
--- a/src/channels/whatsapp/inbound/media.ts
+++ b/src/channels/whatsapp/inbound/media.ts
@@ -55,19 +55,21 @@ export function extractMediaPreview(messageContent: any): { hasMedia: boolean; c
 * Handles 5 media types: image, video, audio, document, sticker.
 * Downloads using Baileys' downloadContentFromMessage and saves to disk.
 * Enforces size limits and supports metadata-only mode.
+ * Transcribes voice messages (ptt: true) using configured transcription provider.
 *
 * @param params - Attachment collection parameters
- * @returns Attachments array and optional caption
+ * @returns Attachments array, optional caption, and optional transcribed text for voice messages
 */
 export async function collectAttachments(params: {
  messageContent: any;
  chatId: string;
  messageId: string;
  downloadContentFromMessage: (message: any, type: string) => Promise<AsyncIterable<Uint8Array>>;
+  sock: import("@whiskeysockets/baileys").WASocket;
  attachmentsDir?: string;
  attachmentsMaxBytes?: number;
-}): Promise<{ attachments: InboundAttachment[]; caption?: string }> {
-  const { messageContent, chatId, messageId, downloadContentFromMessage, attachmentsDir, attachmentsMaxBytes } = params;
+}): Promise<{ attachments: InboundAttachment[]; caption?: string; voiceTranscription?: string }> {
+  const { messageContent, chatId, messageId, downloadContentFromMessage, sock, attachmentsDir, attachmentsMaxBytes } = params;
  const attachments: InboundAttachment[] = [];

  if (!messageContent) return { attachments };
@@ -122,6 +124,10 @@ export async function collectAttachments(params: {
    kind,
  };

+  // Check if this is a voice message (ptt = push-to-talk)
+  const isPttVoiceMessage = mediaType === 'audio' && mediaMessage.ptt === true;
+  let voiceTranscription: string | undefined;
+
  // Download if attachmentsDir is configured
  if (attachmentsDir) {
    // Metadata-only mode (attachmentsMaxBytes = 0)
@@ -151,11 +157,54 @@ export async function collectAttachments(params: {
    }
  }

-  attachments.push(attachment);
+  // Transcribe voice messages
+  if (isPttVoiceMessage) {
+    try {
+      const { isTranscriptionConfigured } = await import('../../../transcription/index.js');
+      if (!isTranscriptionConfigured()) {
+        // Send error message directly to user (matches Telegram/Slack/Discord/Signal behavior)
+        try {
+          await sock.sendMessage(chatId, {
+            text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
+          });
+        } catch (sendError) {
+          console.error('[WhatsApp] Failed to send transcription error message:', sendError);
+        }
+        // Don't forward error to agent - return early
        const caption = mediaMessage.caption as string | undefined;
        return { attachments, caption };
      }

+      // Download audio buffer for transcription
+      const stream = await downloadContentFromMessage(mediaMessage, mediaType);
+      const chunks: Uint8Array[] = [];
+      for await (const chunk of stream) {
+        chunks.push(chunk);
+      }
+      const buffer = Buffer.concat(chunks);
+
+      // Transcribe audio
+      const { transcribeAudio } = await import('../../../transcription/index.js');
+      const result = await transcribeAudio(buffer, name);
+
+      if (result.success && result.text) {
+        console.log(`[WhatsApp] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
+        voiceTranscription = `[Voice message]: ${result.text}`;
+      } else {
+        console.error(`[WhatsApp] Transcription failed: ${result.error}`);
+        voiceTranscription = `[Voice message - transcription failed: ${result.error}]`;
+      }
+    } catch (error) {
+      console.error('[WhatsApp] Error transcribing voice message:', error);
+      voiceTranscription = `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
+    }
+  }
+
+  attachments.push(attachment);
+  const caption = mediaMessage.caption as string | undefined;
+  return { attachments, caption, voiceTranscription };
+}
+
 /**
 * Extract file extension from MIME type.
 *
--- a/src/config/types.ts
+++ b/src/config/types.ts
@@ -183,9 +183,9 @@ export interface LettaBotConfig {
 }

 export interface TranscriptionConfig {
-  provider: 'openai';  // Only OpenAI supported currently
-  apiKey?: string;     // Falls back to OPENAI_API_KEY env var
-  model?: string;      // Defaults to 'whisper-1'
+  provider: 'openai' | 'mistral';
+  apiKey?: string;     // Falls back to OPENAI_API_KEY or MISTRAL_API_KEY env var
+  model?: string;      // Defaults to 'whisper-1' (OpenAI) or 'voxtral-mini-latest' (Mistral)
 }

 export interface PollingYamlConfig {
--- a/src/onboard.ts
+++ b/src/onboard.ts
@@ -290,7 +290,7 @@ interface OnboardConfig {
  cron: boolean;

  // Transcription (voice messages)
-  transcription: { enabled: boolean; apiKey?: string; model?: string };
+  transcription: { enabled: boolean; provider?: 'openai' | 'mistral'; apiKey?: string; model?: string };
 }

 const isPlaceholder = (val?: string) => !val || /^(your_|sk-\.\.\.|placeholder|example)/i.test(val);
@@ -665,6 +665,7 @@ async function stepProviders(config: OnboardConfig, env: Record<string, string>)
            });
            if (!p.isCancel(enableTranscription) && enableTranscription) {
              config.transcription.enabled = true;
+              config.transcription.provider = 'openai';
              config.transcription.apiKey = providerKey;
            }
          }
@@ -838,23 +839,39 @@ async function stepFeatures(config: OnboardConfig): Promise<void> {
 // Voice Transcription Setup
 // ============================================================================

-async function stepTranscription(config: OnboardConfig): Promise<void> {
-  // Skip if already configured from the providers step
-  if (config.transcription.enabled && config.transcription.apiKey) return;
+async function stepTranscription(config: OnboardConfig, forcePrompt?: boolean): Promise<void> {
+  // Skip if already configured (e.g. from OpenAI shortcut in stepProviders)
+  if (!forcePrompt && config.transcription.enabled && config.transcription.apiKey) return;

  const setupTranscription = await p.confirm({
-    message: 'Enable voice message transcription? (uses OpenAI Whisper)',
+    message: 'Enable voice message transcription?',
    initialValue: config.transcription.enabled,
  });
  if (p.isCancel(setupTranscription)) { p.cancel('Setup cancelled'); process.exit(0); }
  config.transcription.enabled = setupTranscription;

  if (setupTranscription) {
-    const existingKey = process.env.OPENAI_API_KEY;
+    const providerChoice = await p.select({
+      message: 'Transcription provider',
+      options: [
+        { value: 'openai', label: 'OpenAI Whisper', hint: 'whisper-1' },
+        { value: 'mistral', label: 'Mistral Voxtral', hint: 'voxtral-mini-latest' },
+      ],
+      initialValue: config.transcription.provider || 'openai',
+    });
+    if (p.isCancel(providerChoice)) { p.cancel('Setup cancelled'); process.exit(0); }
+    config.transcription.provider = providerChoice as 'openai' | 'mistral';
+
+    const isMistral = config.transcription.provider === 'mistral';
+    // Check env vars first, then check if key was already entered for LLM provider
+    const existingKey = isMistral
+      ? process.env.MISTRAL_API_KEY
+      : (process.env.OPENAI_API_KEY || config.providers?.find(p => p.id === 'openai')?.apiKey);
+    const providerLabel = isMistral ? 'Mistral' : 'OpenAI';

    const apiKey = await p.text({
-      message: 'OpenAI API Key (for Whisper transcription)',
-      placeholder: 'sk-...',
+      message: `${providerLabel} API Key`,
+      placeholder: isMistral ? '' : 'sk-...',
      initialValue: existingKey || '',
      validate: (v) => {
        if (!v) return 'API key is required for voice transcription';
@@ -1197,7 +1214,10 @@ function showSummary(config: OnboardConfig): void {
  lines.push(`Features:  ${features.length > 0 ? features.join(', ') : 'None'}`);
  
  // Transcription
-  lines.push(`Voice:     ${config.transcription.enabled ? 'Enabled (OpenAI Whisper)' : 'Disabled'}`);
+  const voiceLabel = config.transcription.enabled
+    ? `Enabled (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})`
+    : 'Disabled';
+  lines.push(`Voice:     ${voiceLabel}`);

  // Google
  if (config.google.enabled) {
@@ -1243,7 +1263,7 @@ async function reviewLoop(config: OnboardConfig, env: Record<string, string>): P
    }
    else if (choice === 'channels') await stepChannels(config, env);
    else if (choice === 'features') await stepFeatures(config);
-    else if (choice === 'transcription') await stepTranscription(config);
+    else if (choice === 'transcription') await stepTranscription(config, true);
    else if (choice === 'google') await stepGoogle(config);
  }
 }
@@ -1473,7 +1493,8 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
    },
    cron: existingConfig.features?.cron || false,
    transcription: {
-      enabled: !!existingConfig.transcription?.apiKey || !!process.env.OPENAI_API_KEY,
+      enabled: !!existingConfig.transcription?.apiKey || !!process.env.OPENAI_API_KEY || !!process.env.MISTRAL_API_KEY,
+      provider: existingConfig.transcription?.provider || 'openai',
      apiKey: existingConfig.transcription?.apiKey,
      model: existingConfig.transcription?.model,
    },
@@ -1639,8 +1660,12 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
  }

  if (config.transcription.enabled && config.transcription.apiKey) {
+    if (config.transcription.provider === 'mistral') {
+      env.MISTRAL_API_KEY = config.transcription.apiKey;
+    } else {
      env.OPENAI_API_KEY = config.transcription.apiKey;
    }
+  }

  // Helper to format access control status
  const formatAccess = (policy?: string, allowedUsers?: string[]) => {
@@ -1670,7 +1695,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
    'Features:',
    config.heartbeat.enabled ? `  ✓ Heartbeat (${config.heartbeat.interval}min)` : '  ✗ Heartbeat',
    config.cron ? '  ✓ Cron jobs' : '  ✗ Cron jobs',
-    config.transcription.enabled ? '  ✓ Voice transcription (OpenAI Whisper)' : '  ✗ Voice transcription',
+    config.transcription.enabled ? `  ✓ Voice transcription (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})` : '  ✗ Voice transcription',
  ].join('\n');
  
  p.note(summary, 'Configuration Summary');
@@ -1782,7 +1807,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
    agents: [agentConfig],
    ...(config.transcription.enabled && config.transcription.apiKey ? {
      transcription: {
-        provider: 'openai' as const,
+        provider: config.transcription.provider || 'openai',
        apiKey: config.transcription.apiKey,
        ...(config.transcription.model ? { model: config.transcription.model } : {}),
      },
--- a/src/setup/slack-wizard.ts
+++ b/src/setup/slack-wizard.ts
@@ -58,11 +58,14 @@ export async function runSlackWizard(existingConfig?: {
  const createdApp = await stepCreateApp();
  if (!createdApp) return null;

-  // Step 2: Install to Workspace + Get Bot Token
+  // Step 2: Configure App Home (enable DM messaging)
+  await stepConfigureAppHome();
+
+  // Step 3: Install to Workspace + Get Bot Token
  const botToken = await stepInstallApp(existingConfig?.botToken);
  if (!botToken) return null;
  
-  // Step 3: Enable Socket Mode + Get App Token
+  // Step 4: Enable Socket Mode + Get App Token
  const appToken = await stepEnableSocketMode(existingConfig?.appToken);
  if (!appToken) return null;
  
@@ -82,7 +85,7 @@ export async function runSlackWizard(existingConfig?: {
 }

 async function stepCreateApp(): Promise<boolean> {
-  p.log.step('Step 1/3: Create Slack App from Manifest');
+  p.log.step('Step 1/4: Create Slack App from Manifest');
  
  // Inline manifest for Socket Mode configuration
  const appName = process.env.SLACK_APP_NAME || process.env.LETTA_AGENT_NAME || 'LettaBot';
@@ -99,6 +102,7 @@ oauth_config:
    bot:
      - app_mentions:read
      - chat:write
+      - files:read
      - im:history
      - im:read
      - im:write
@@ -117,7 +121,7 @@ settings:
  p.note(
    'Creates app with everything pre-configured:\n' +
    '  • Socket Mode enabled\n' +
-    '  • 5 bot scopes (app_mentions:read, chat:write, im:*)\n' +
+    '  • 6 bot scopes (app_mentions:read, chat:write, files:read, im:*)\n' +
    '  • 2 event subscriptions (app_mention, message.im)\n\n' +
    'Just review and click "Create"!',
    'One-Click Setup'
@@ -162,7 +166,7 @@ settings:
 }

 async function stepEnableSocketMode(existingToken?: string): Promise<string | null> {
-  p.log.step('Step 3/3: Get App-Level Token');
+  p.log.step('Step 4/4: Get App-Level Token');
  
  p.note(
    '1. In the left sidebar, click "Socket Mode"\n' +
@@ -197,6 +201,7 @@ async function stepConfigureScopes(): Promise<boolean> {
    '3. Click "Add an OAuth Scope" for each:\n' +
    '   • app_mentions:read\n' +
    '   • chat:write\n' +
+    '   • files:read\n' +
    '   • im:history\n' +
    '   • im:read\n' +
    '   • im:write',
@@ -244,7 +249,7 @@ async function stepConfigureEvents(): Promise<boolean> {
 }

 async function stepConfigureAppHome(): Promise<boolean> {
-  p.log.step('Step 5/6: Configure App Home');
+  p.log.step('Step 2/4: Configure App Home');
  
  p.note(
    '1. Go to "App Home" in left sidebar\n' +
@@ -267,7 +272,7 @@ async function stepConfigureAppHome(): Promise<boolean> {
 }

 async function stepInstallApp(existingToken?: string): Promise<string | null> {
-  p.log.step('Step 6/6: Install to Workspace');
+  p.log.step('Step 3/4: Install to Workspace');
  
  p.note(
    '1. Go to "Install App" in left sidebar\n' +
--- a/src/transcription/index.ts
+++ b/src/transcription/index.ts
@@ -1,7 +1,39 @@
 /**
- * Transcription service
+ * Transcription service router
 *
- * Currently supports OpenAI Whisper. Future providers can be added here.
+ * Delegates to the correct provider based on config.transcription.provider.
+ * Defaults to OpenAI Whisper for backwards compatibility.
 */

-export { transcribeAudio, type TranscriptionResult } from './openai.js';
+import { loadConfig } from '../config/index.js';
+import type { TranscriptionResult } from './openai.js';
+import { transcribeAudio as openaiTranscribe } from './openai.js';
+import { transcribeAudio as mistralTranscribe } from './mistral.js';
+
+export type { TranscriptionResult } from './openai.js';
+
+/**
+ * Tell channel handlers whether voice transcription can run.
+ *
+ * True when the config file carries `transcription.apiKey`, or when the environment
+ * variable matching the selected provider is set (MISTRAL_API_KEY for 'mistral',
+ * OPENAI_API_KEY otherwise). The provider defaults to 'openai'.
+ */
+export function isTranscriptionConfigured(): boolean {
+  const transcription = loadConfig().transcription;
+  if (transcription?.apiKey) {
+    return true;
+  }
+
+  const selectedProvider = transcription?.provider || 'openai';
+  const envKey = selectedProvider === 'mistral'
+    ? process.env.MISTRAL_API_KEY
+    : process.env.OPENAI_API_KEY;
+  return Boolean(envKey);
+}
+
+/**
+ * Transcribe audio with whichever provider the config selects.
+ *
+ * Reads `transcription.provider` from the loaded config (defaulting to 'openai')
+ * and forwards all arguments unchanged to that provider's implementation.
+ *
+ * @param audioBuffer - Audio bytes to transcribe.
+ * @param filename - Optional file name, passed through to the provider.
+ * @param options - Optional extras, passed through to the provider.
+ * @returns The provider's transcription result.
+ */
+export async function transcribeAudio(
+  audioBuffer: Buffer,
+  filename?: string,
+  options?: { audioPath?: string }
+): Promise<TranscriptionResult> {
+  const selectedProvider = loadConfig().transcription?.provider || 'openai';
+
+  return selectedProvider === 'mistral'
+    ? mistralTranscribe(audioBuffer, filename, options)
+    : openaiTranscribe(audioBuffer, filename, options);
+}
--- a/src/transcription/mistral.ts
+++ b/src/transcription/mistral.ts
@@ -0,0 +1,244 @@
+/**
+ * Mistral Voxtral transcription service
+ *
+ * Uses Voxtral Transcribe 2 via the Mistral REST API.
+ * Simple multipart POST — no SDK dependency needed.
+ */
+
+import { loadConfig } from '../config/index.js';
+import { execSync } from 'node:child_process';
+import { writeFileSync, readFileSync, unlinkSync, mkdirSync, readdirSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+import type { TranscriptionResult } from './openai.js';
+
+// Size threshold in bytes (20 MiB): transcribeAudio() splits buffers larger than this into chunks.
+const MAX_FILE_SIZE = 20 * 1024 * 1024;
+// Segment length in seconds handed to ffmpeg (-segment_time) when splitting an oversized file.
+const CHUNK_DURATION_SECONDS = 600;
+
+/**
+ * Resolve the Mistral API key.
+ *
+ * The key from the config file (`transcription.apiKey`) wins; the MISTRAL_API_KEY
+ * environment variable is the fallback.
+ *
+ * @returns The first non-empty key found.
+ * @throws Error when neither source provides a key.
+ */
+function getApiKey(): string {
+  const fromConfig = loadConfig().transcription?.apiKey;
+  const resolved = fromConfig || process.env.MISTRAL_API_KEY;
+  if (resolved) {
+    return resolved;
+  }
+  throw new Error('Mistral API key required for transcription. Set in config (transcription.apiKey) or MISTRAL_API_KEY env var.');
+}
+
+/**
+ * Pick the transcription model name.
+ *
+ * Lookup order: `transcription.model` from the config, then the TRANSCRIPTION_MODEL
+ * environment variable, then the built-in default 'voxtral-mini-latest'.
+ */
+function getModel(): string {
+  const configured = loadConfig().transcription?.model;
+  if (configured) return configured;
+
+  const fromEnv = process.env.TRANSCRIPTION_MODEL;
+  if (fromEnv) return fromEnv;
+
+  return 'voxtral-mini-latest';
+}
+
+/**
+ * Map a file name to the audio MIME type sent with the upload.
+ *
+ * The extension is the text after the last '.', compared case-insensitively.
+ * Unknown or missing extensions fall back to 'audio/ogg'.
+ *
+ * A `switch` is used rather than a plain-object lookup so that extensions which
+ * happen to name Object.prototype members (e.g. "constructor", "__proto__")
+ * cannot return a non-string value.
+ *
+ * @param filename - File name (or path) whose extension selects the type.
+ * @returns A MIME type string; never empty.
+ */
+function getMimeType(filename: string): string {
+  const extension = filename.split('.').pop()?.toLowerCase();
+  switch (extension) {
+    case 'mp3':
+      return 'audio/mpeg';
+    case 'mp4':
+    case 'm4a':
+      return 'audio/mp4';
+    case 'wav':
+      return 'audio/wav';
+    case 'flac':
+      return 'audio/flac';
+    case 'webm':
+      return 'audio/webm';
+    default:
+      // Covers 'ogg', 'oga' and every unrecognised extension.
+      return 'audio/ogg';
+  }
+}
+
+// Extensions that transcribeAudio() does not upload as-is: it first retries the same
+// bytes under a renamed extension (see FORMAT_MAP) and otherwise converts to mp3 with ffmpeg.
+const NEEDS_CONVERSION = ['aac', 'amr', 'caf', 'x-caf', '3gp', '3gpp'];
+
+// Replacement extension for the "rename only" attempt, keyed by source extension.
+// NOTE(review): 'opus' is not listed in NEEDS_CONVERSION, so that entry looks unreachable — confirm.
+const FORMAT_MAP: Record<string, string> = {
+  'aac': 'm4a',
+  'amr': 'mp3',
+  'opus': 'ogg',
+  'x-caf': 'm4a',
+  'caf': 'm4a',
+  '3gp': 'mp4',
+  '3gpp': 'mp4',
+};
+
+// Memoized probe result; null means "not probed yet".
+let ffmpegAvailable: boolean | null = null;
+
+/**
+ * Report whether an `ffmpeg` executable can be launched from PATH.
+ *
+ * The probe runs at most once per process; the outcome is cached in `ffmpegAvailable`.
+ * `ffmpeg -version` is used as the probe because it works on every platform,
+ * whereas `which` is not available on Windows.
+ *
+ * @returns true when the probe command exits successfully, false otherwise.
+ */
+function isFfmpegAvailable(): boolean {
+  if (ffmpegAvailable === null) {
+    try {
+      execSync('ffmpeg -version', { stdio: 'ignore' });
+      ffmpegAvailable = true;
+    } catch {
+      ffmpegAvailable = false;
+    }
+  }
+  return ffmpegAvailable;
+}
+
+/**
+ * Convert an audio buffer to MP3 by writing it to a temp file and running ffmpeg on it.
+ *
+ * @param audioBuffer - Audio bytes in the source format.
+ * @param inputExt - Source extension without the dot; only used to name the temp input file.
+ * @returns The bytes of the MP3 file ffmpeg produced.
+ * @throws When ffmpeg fails or exceeds the 30 s timeout, or a temp file cannot be written/read.
+ */
+function convertAudioToMp3(audioBuffer: Buffer, inputExt: string): Buffer {
+  const tempDir = join(tmpdir(), 'lettabot-transcription');
+  mkdirSync(tempDir, { recursive: true });
+
+  // The extension becomes part of a shell command line below, so drop every
+  // character that is not alphanumeric or '-' before building the path.
+  const safeExt = inputExt.replace(/[^a-z0-9-]/gi, '') || 'bin';
+  // Date.now() alone can repeat across processes sharing the temp dir; add pid + random suffix.
+  const tag = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
+  const inputPath = join(tempDir, `input-${tag}.${safeExt}`);
+  const outputPath = join(tempDir, `output-${tag}.mp3`);
+
+  try {
+    writeFileSync(inputPath, audioBuffer);
+    execSync(`ffmpeg -y -i "${inputPath}" -acodec libmp3lame -q:a 2 "${outputPath}" 2>/dev/null`, {
+      timeout: 30000,
+    });
+    const converted = readFileSync(outputPath);
+    console.log(`[Transcription] Converted ${audioBuffer.length} bytes → ${converted.length} bytes`);
+    return converted;
+  } finally {
+    // Best-effort cleanup; a file that was never created is not an error.
+    try { unlinkSync(inputPath); } catch {}
+    try { unlinkSync(outputPath); } catch {}
+  }
+}
+
+/**
+ * Send a single buffer to the Voxtral API and return the text.
+ *
+ * @param audioBuffer - Audio bytes to upload as the `file` form field.
+ * @param filename - Name reported for the upload; its extension selects the MIME type.
+ * @returns The `text` field of the API response.
+ * @throws Error when no API key is configured, the HTTP status is not OK,
+ *         or the response body has no string `text` field.
+ */
+async function attemptTranscription(audioBuffer: Buffer, filename: string): Promise<string> {
+  const apiKey = getApiKey();
+  const model = getModel();
+
+  const file = new File([new Uint8Array(audioBuffer)], filename, {
+    type: getMimeType(filename),
+  });
+
+  const formData = new FormData();
+  formData.append('model', model);
+  formData.append('file', file);
+
+  const response = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
+    method: 'POST',
+    headers: { 'Authorization': `Bearer ${apiKey}` },
+    body: formData,
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text();
+    throw new Error(`Mistral API error (${response.status}): ${errorText}`);
+  }
+
+  // Validate the payload instead of trusting a cast: callers invoke string
+  // methods on the result and would otherwise crash on a malformed body.
+  const data: unknown = await response.json();
+  const text = (data as { text?: unknown } | null)?.text;
+  if (typeof text !== 'string') {
+    throw new Error('Mistral API returned a response without a "text" field');
+  }
+  return text;
+}
+
+/**
+ * Split large audio into chunks and transcribe each.
+ *
+ * The buffer is written to a private temp directory, segmented by ffmpeg into
+ * MP3 files of CHUNK_DURATION_SECONDS each, and the chunks are transcribed
+ * one after another in file-name order. Non-empty chunk texts are joined with
+ * single spaces. The temp directory is removed afterwards on a best-effort basis.
+ *
+ * @param audioBuffer - Complete audio bytes.
+ * @param ext - Extension (without dot) used to name the temp input file.
+ * @returns The concatenated transcription of all chunks.
+ * @throws Error when ffmpeg is unavailable, splitting yields no chunks,
+ *         ffmpeg fails or times out, or any chunk transcription fails.
+ */
+async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
+  if (!isFfmpegAvailable()) {
+    throw new Error('Cannot split large audio files without ffmpeg');
+  }
+
+  // `ext` is taken from the caller's file name, which may originate from an uploaded
+  // attachment, and it is interpolated into a shell command below. Keep only
+  // characters that cannot break out of the quoted path.
+  const safeExt = ext.replace(/[^a-z0-9-]/gi, '') || 'bin';
+
+  // Random suffix so two calls starting in the same millisecond never share a directory.
+  const runId = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
+  const tempDir = join(tmpdir(), 'lettabot-transcription', `chunks-${runId}`);
+  mkdirSync(tempDir, { recursive: true });
+
+  const inputPath = join(tempDir, `input.${safeExt}`);
+  const outputPattern = join(tempDir, 'chunk-%03d.mp3');
+
+  try {
+    writeFileSync(inputPath, audioBuffer);
+
+    execSync(
+      `ffmpeg -y -i "${inputPath}" -f segment -segment_time ${CHUNK_DURATION_SECONDS} -reset_timestamps 1 -acodec libmp3lame -q:a 2 "${outputPattern}" 2>/dev/null`,
+      { timeout: 120000 }
+    );
+
+    // Zero-padded names make lexicographic order equal to playback order.
+    const chunkFiles = readdirSync(tempDir)
+      .filter(f => f.startsWith('chunk-') && f.endsWith('.mp3'))
+      .sort();
+
+    if (chunkFiles.length === 0) {
+      throw new Error('Failed to split audio into chunks');
+    }
+
+    console.log(`[Transcription] Split into ${chunkFiles.length} chunks`);
+
+    const transcriptions: string[] = [];
+    for (let i = 0; i < chunkFiles.length; i++) {
+      const chunkPath = join(tempDir, chunkFiles[i]);
+      const chunkBuffer = readFileSync(chunkPath);
+      console.log(`[Transcription] Transcribing chunk ${i + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
+      const text = await attemptTranscription(chunkBuffer, chunkFiles[i]);
+      if (text.trim()) {
+        transcriptions.push(text.trim());
+      }
+    }
+
+    const combined = transcriptions.join(' ');
+    console.log(`[Transcription] Combined ${transcriptions.length} chunks into ${combined.length} chars`);
+    return combined;
+  } finally {
+    // Best-effort cleanup: one file that cannot be removed must not stop the others.
+    try {
+      for (const file of readdirSync(tempDir)) {
+        try { unlinkSync(join(tempDir, file)); } catch {}
+      }
+      execSync(`rmdir "${tempDir}" 2>/dev/null || true`);
+    } catch {}
+  }
+}
+
+/**
+ * Transcribe audio using the Mistral Voxtral API.
+ *
+ * Flow:
+ *  1. For extensions listed in NEEDS_CONVERSION, first upload the untouched bytes
+ *     under the renamed extension from FORMAT_MAP; if that fails, convert to MP3
+ *     with ffmpeg (or report failure when ffmpeg is missing).
+ *  2. Buffers larger than MAX_FILE_SIZE are split into chunks and transcribed piecewise.
+ *  3. Everything else is uploaded in a single request.
+ *
+ * Errors are never thrown to the caller; they are returned as `success: false`.
+ *
+ * @param audioBuffer - Audio bytes to transcribe.
+ * @param filename - File name whose extension drives format handling (default 'audio.ogg').
+ * @param options - `audioPath` is echoed back on failure results.
+ * @returns A result carrying the text on success, or the error message on failure.
+ */
+export async function transcribeAudio(
+  audioBuffer: Buffer,
+  filename: string = 'audio.ogg',
+  options?: { audioPath?: string }
+): Promise<TranscriptionResult> {
+  const sourceExt = filename.split('.').pop()?.toLowerCase() || '';
+
+  // Swap the trailing ".ext" of the original file name for another extension.
+  const withExtension = (newExt: string): string => filename.replace(/\.[^.]+$/, `.${newExt}`);
+  // Uniform shape for every failure result.
+  const failure = (message: string): TranscriptionResult => ({
+    success: false,
+    error: message,
+    audioPath: options?.audioPath,
+  });
+
+  try {
+    let payload = audioBuffer;
+    let payloadName = filename;
+
+    if (NEEDS_CONVERSION.includes(sourceExt)) {
+      // Cheap attempt first: same bytes, different extension.
+      const renamedExt = FORMAT_MAP[sourceExt];
+      if (renamedExt) {
+        console.log(`[Transcription] Trying .${sourceExt} as .${renamedExt} (no conversion)`);
+        try {
+          const renamedText = await attemptTranscription(audioBuffer, withExtension(renamedExt));
+          return { success: true, text: renamedText };
+        } catch {
+          console.log(`[Transcription] Rename approach failed for .${sourceExt}`);
+        }
+      }
+
+      // Real conversion needs ffmpeg; without it there is nothing left to try.
+      if (!isFfmpegAvailable()) {
+        return failure(`Cannot transcribe .${sourceExt} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, flac).`);
+      }
+
+      console.log(`[Transcription] Converting .${sourceExt} → .mp3 with ffmpeg`);
+      payload = convertAudioToMp3(audioBuffer, sourceExt);
+      payloadName = withExtension('mp3');
+    }
+
+    let text: string;
+    if (payload.length > MAX_FILE_SIZE) {
+      const chunkExt = payloadName.split('.').pop()?.toLowerCase() || 'ogg';
+      console.log(`[Transcription] File too large (${(payload.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
+      text = await transcribeInChunks(payload, chunkExt);
+    } else {
+      text = await attemptTranscription(payload, payloadName);
+    }
+    return { success: true, text };
+  } catch (err) {
+    return failure(err instanceof Error ? err.message : String(err));
+  }
+}