feat: add Mistral Voxtral transcription support (#228)

This commit is contained in:
jamesdanielwhitford
2026-02-23 23:37:12 +02:00
committed by GitHub
parent 6bda859559
commit cae5b104b3
15 changed files with 496 additions and 65 deletions

View File

@@ -109,12 +109,14 @@ That's it! Message your bot on Telegram.
## Voice Messages
LettaBot can transcribe voice messages using OpenAI Whisper. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
LettaBot can transcribe voice messages using either OpenAI Whisper or Mistral Voxtral. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
**Supported channels:** Telegram, WhatsApp, Signal, Slack, Discord
### Configuration
**Option 1: OpenAI Whisper**
Add your OpenAI API key to `lettabot.yaml`:
```yaml
@@ -130,7 +132,23 @@ Or set via environment variable:
export OPENAI_API_KEY=sk-...
```
If no API key is configured, voice messages are silently ignored.
**Option 2: Mistral Voxtral** (2x faster, 2x cheaper)
Add your Mistral API key to `lettabot.yaml`:
```yaml
transcription:
provider: mistral
apiKey: ...
```
Or set via environment variable:
```bash
export MISTRAL_API_KEY=...
```
If no API key is configured, users will receive an error message with a link to this section.
## Skills
LettaBot is compatible with [skills.sh](https://skills.sh) and [Clawdhub](https://clawdhub.com/).

View File

@@ -19,7 +19,32 @@ brew install signal-cli
### 2. Register Your Phone Number
You need a phone number that can receive SMS for verification.
You have two options:
#### Option A: Link as Secondary Device (Recommended)
Link signal-cli to your existing Signal account without disrupting your phone app:
```bash
# Generate a linking QR code/URI
signal-cli link -n "LettaBot"
```
This will display a `sgnl://linkdevice?uuid=...` URI. On your phone:
1. Open Signal → Settings (tap your profile)
2. Tap "Linked Devices"
3. Tap "Link New Device" (+ button)
4. Scan the QR code or enter the URI
**Benefits:**
- Your phone's Signal app continues to work normally
- Bot runs as a linked device (like Signal Desktop)
- Both your phone and the bot receive messages
- You can unlink the bot anytime from your phone
#### Option B: Primary Registration (Dedicated Number Only)
Register signal-cli as the primary device (requires a dedicated phone number):
```bash
# Request verification code (sent via SMS)
@@ -29,7 +54,7 @@ signal-cli -a +1XXXXXXXXXX register
signal-cli -a +1XXXXXXXXXX verify CODE
```
**Note:** You can only have one Signal client per number. Registering signal-cli will log out your Signal mobile app. Consider using a secondary number.
**Warning:** This will log out your Signal mobile app. Only use this option with a dedicated bot number, not your personal number.
## Configuration

View File

@@ -48,6 +48,7 @@ Socket Mode lets your bot connect without exposing a public endpoint.
|-------|---------|
| `app_mentions:read` | React when someone @mentions your bot |
| `chat:write` | Send messages |
| `files:read` | Download voice message attachments |
| `im:history` | Read DM message history |
| `im:read` | View DM channel info |
| `im:write` | Start DM conversations |

View File

@@ -180,10 +180,9 @@ Ask the bot owner to approve with:
const audioAttachment = message.attachments.find(a => a.contentType?.startsWith('audio/'));
if (audioAttachment?.url) {
try {
const { loadConfig } = await import('../config/index.js');
const config = loadConfig();
if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
await message.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
const { isTranscriptionConfigured } = await import('../transcription/index.js');
if (!isTranscriptionConfigured()) {
await message.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
} else {
// Download audio
const response = await fetch(audioAttachment.url);

View File

@@ -494,9 +494,9 @@ export async function setupSignal(existing?: any): Promise<any> {
p.note(
'See docs/signal-setup.md for detailed instructions.\n' +
'Requires signal-cli registered with your phone number.\n\n' +
'⚠️ Security: Has full access to your Signal account.\n' +
'Can see all messages and send as you.',
'Recommended: Link as secondary device (signal-cli link -n "LettaBot")\n' +
'This keeps your phone\'s Signal app working normally.\n\n' +
'Requires signal-cli registered or linked with your phone number.',
'Signal Setup'
);

View File

@@ -623,14 +623,12 @@ This code expires in 1 hour.`;
}
try {
const { loadConfig } = await import('../config/index.js');
const config = loadConfig();
if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
const { isTranscriptionConfigured } = await import('../transcription/index.js');
if (!isTranscriptionConfigured()) {
if (chatId) {
const audioInfo = savedAudioPath ? ` Audio saved to: ${savedAudioPath}` : '';
await this.sendMessage({
chatId,
text: `Voice messages require OpenAI API key for transcription.${audioInfo} See: https://github.com/letta-ai/lettabot#voice-messages`
text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
});
}
} else {

View File

@@ -60,9 +60,9 @@ export class SlackAdapter implements ChannelAdapter {
// Handle messages
this.app.message(async ({ message, say, client }) => {
// Type guard for regular messages
if (message.subtype !== undefined) return;
if (!('user' in message) || !('text' in message)) return;
// Type guard for regular messages (allow file_share for voice messages)
if (message.subtype !== undefined && message.subtype !== 'file_share') return;
if (!('user' in message)) return;
const userId = message.user;
let text = message.text || '';
@@ -74,10 +74,9 @@ export class SlackAdapter implements ChannelAdapter {
const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
if (audioFile?.url_private_download) {
try {
const { loadConfig } = await import('../config/index.js');
const config = loadConfig();
if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
await say('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
const { isTranscriptionConfigured } = await import('../transcription/index.js');
if (!isTranscriptionConfigured()) {
await say('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
} else {
// Download file (requires bot token for auth)
const response = await fetch(audioFile.url_private_download, {
@@ -173,10 +172,43 @@ export class SlackAdapter implements ChannelAdapter {
// Handle app mentions (@bot)
this.app.event('app_mention', async ({ event }) => {
const userId = event.user || '';
const text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention
let text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention
const channelId = event.channel;
const threadTs = event.thread_ts || event.ts; // Reply in thread, or start new thread from the mention
// Handle audio file attachments
const files = (event as any).files as Array<{ mimetype?: string; url_private_download?: string; name?: string }> | undefined;
const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
if (audioFile?.url_private_download) {
try {
const { isTranscriptionConfigured } = await import('../transcription/index.js');
if (!isTranscriptionConfigured()) {
await this.sendMessage({ chatId: channelId, text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages', threadId: threadTs });
return;
}
// Download file (requires bot token for auth)
const response = await fetch(audioFile.url_private_download, {
headers: { 'Authorization': `Bearer ${this.config.botToken}` }
});
const buffer = Buffer.from(await response.arrayBuffer());
const { transcribeAudio } = await import('../transcription/index.js');
const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
if (result.success && result.text) {
console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`);
text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`;
} else {
console.error(`[Slack] Transcription failed: ${result.error}`);
text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
}
} catch (error) {
console.error('[Slack] Error transcribing audio:', error);
text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
}
}
if (this.config.allowedUsers && this.config.allowedUsers.length > 0) {
if (!userId || !this.config.allowedUsers.includes(userId)) {
// Can't use say() in app_mention event the same way

View File

@@ -346,10 +346,9 @@ export class TelegramAdapter implements ChannelAdapter {
const { isGroup, groupName, wasMentioned, isListeningMode } = gating;
// Check if transcription is configured (config or env)
const { loadConfig } = await import('../config/index.js');
const config = loadConfig();
if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
await ctx.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
const { isTranscriptionConfigured } = await import('../transcription/index.js');
if (!isTranscriptionConfigured()) {
await ctx.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
return;
}

View File

@@ -143,18 +143,22 @@ export async function extractInboundMessage(
// Collect attachments if media present and config provided
let attachments: InboundAttachment[] = [];
let voiceTranscription: string | undefined;
if (preview.hasMedia && attachmentConfig) {
const result = await collectAttachments({
messageContent,
chatId: remoteJid,
messageId: messageId || 'unknown',
sock,
...attachmentConfig,
});
attachments = result.attachments;
voiceTranscription = result.voiceTranscription;
}
// Use caption as fallback text (for media-only messages)
const finalBody = body || preview.caption || '';
// For voice messages, use transcription if available
const finalBody = voiceTranscription || body || preview.caption || '';
if (!finalBody && attachments.length === 0) {
return null; // Skip messages with no text and no media
}

View File

@@ -55,19 +55,21 @@ export function extractMediaPreview(messageContent: any): { hasMedia: boolean; c
* Handles 5 media types: image, video, audio, document, sticker.
* Downloads using Baileys' downloadContentFromMessage and saves to disk.
* Enforces size limits and supports metadata-only mode.
* Transcribes voice messages (ptt: true) using configured transcription provider.
*
* @param params - Attachment collection parameters
* @returns Attachments array and optional caption
* @returns Attachments array, optional caption, and optional transcribed text for voice messages
*/
export async function collectAttachments(params: {
messageContent: any;
chatId: string;
messageId: string;
downloadContentFromMessage: (message: any, type: string) => Promise<AsyncIterable<Uint8Array>>;
sock: import("@whiskeysockets/baileys").WASocket;
attachmentsDir?: string;
attachmentsMaxBytes?: number;
}): Promise<{ attachments: InboundAttachment[]; caption?: string }> {
const { messageContent, chatId, messageId, downloadContentFromMessage, attachmentsDir, attachmentsMaxBytes } = params;
}): Promise<{ attachments: InboundAttachment[]; caption?: string; voiceTranscription?: string }> {
const { messageContent, chatId, messageId, downloadContentFromMessage, sock, attachmentsDir, attachmentsMaxBytes } = params;
const attachments: InboundAttachment[] = [];
if (!messageContent) return { attachments };
@@ -122,6 +124,10 @@ export async function collectAttachments(params: {
kind,
};
// Check if this is a voice message (ptt = push-to-talk)
const isPttVoiceMessage = mediaType === 'audio' && mediaMessage.ptt === true;
let voiceTranscription: string | undefined;
// Download if attachmentsDir is configured
if (attachmentsDir) {
// Metadata-only mode (attachmentsMaxBytes = 0)
@@ -151,9 +157,52 @@ export async function collectAttachments(params: {
}
}
// Transcribe voice messages
if (isPttVoiceMessage) {
try {
const { isTranscriptionConfigured } = await import('../../../transcription/index.js');
if (!isTranscriptionConfigured()) {
// Send error message directly to user (matches Telegram/Slack/Discord/Signal behavior)
try {
await sock.sendMessage(chatId, {
text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
});
} catch (sendError) {
console.error('[WhatsApp] Failed to send transcription error message:', sendError);
}
// Don't forward error to agent - return early
const caption = mediaMessage.caption as string | undefined;
return { attachments, caption };
}
// Download audio buffer for transcription
const stream = await downloadContentFromMessage(mediaMessage, mediaType);
const chunks: Uint8Array[] = [];
for await (const chunk of stream) {
chunks.push(chunk);
}
const buffer = Buffer.concat(chunks);
// Transcribe audio
const { transcribeAudio } = await import('../../../transcription/index.js');
const result = await transcribeAudio(buffer, name);
if (result.success && result.text) {
console.log(`[WhatsApp] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
voiceTranscription = `[Voice message]: ${result.text}`;
} else {
console.error(`[WhatsApp] Transcription failed: ${result.error}`);
voiceTranscription = `[Voice message - transcription failed: ${result.error}]`;
}
} catch (error) {
console.error('[WhatsApp] Error transcribing voice message:', error);
voiceTranscription = `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
}
}
attachments.push(attachment);
const caption = mediaMessage.caption as string | undefined;
return { attachments, caption };
return { attachments, caption, voiceTranscription };
}
/**

View File

@@ -183,9 +183,9 @@ export interface LettaBotConfig {
}
export interface TranscriptionConfig {
provider: 'openai'; // Only OpenAI supported currently
apiKey?: string; // Falls back to OPENAI_API_KEY env var
model?: string; // Defaults to 'whisper-1'
provider: 'openai' | 'mistral';
apiKey?: string; // Falls back to OPENAI_API_KEY or MISTRAL_API_KEY env var
model?: string; // Defaults to 'whisper-1' (OpenAI) or 'voxtral-mini-latest' (Mistral)
}
export interface PollingYamlConfig {

View File

@@ -290,7 +290,7 @@ interface OnboardConfig {
cron: boolean;
// Transcription (voice messages)
transcription: { enabled: boolean; apiKey?: string; model?: string };
transcription: { enabled: boolean; provider?: 'openai' | 'mistral'; apiKey?: string; model?: string };
}
const isPlaceholder = (val?: string) => !val || /^(your_|sk-\.\.\.|placeholder|example)/i.test(val);
@@ -665,6 +665,7 @@ async function stepProviders(config: OnboardConfig, env: Record<string, string>)
});
if (!p.isCancel(enableTranscription) && enableTranscription) {
config.transcription.enabled = true;
config.transcription.provider = 'openai';
config.transcription.apiKey = providerKey;
}
}
@@ -838,23 +839,39 @@ async function stepFeatures(config: OnboardConfig): Promise<void> {
// Voice Transcription Setup
// ============================================================================
async function stepTranscription(config: OnboardConfig): Promise<void> {
// Skip if already configured from the providers step
if (config.transcription.enabled && config.transcription.apiKey) return;
async function stepTranscription(config: OnboardConfig, forcePrompt?: boolean): Promise<void> {
// Skip if already configured (e.g. from OpenAI shortcut in stepProviders)
if (!forcePrompt && config.transcription.enabled && config.transcription.apiKey) return;
const setupTranscription = await p.confirm({
message: 'Enable voice message transcription? (uses OpenAI Whisper)',
message: 'Enable voice message transcription?',
initialValue: config.transcription.enabled,
});
if (p.isCancel(setupTranscription)) { p.cancel('Setup cancelled'); process.exit(0); }
config.transcription.enabled = setupTranscription;
if (setupTranscription) {
const existingKey = process.env.OPENAI_API_KEY;
const providerChoice = await p.select({
message: 'Transcription provider',
options: [
{ value: 'openai', label: 'OpenAI Whisper', hint: 'whisper-1' },
{ value: 'mistral', label: 'Mistral Voxtral', hint: 'voxtral-mini-latest' },
],
initialValue: config.transcription.provider || 'openai',
});
if (p.isCancel(providerChoice)) { p.cancel('Setup cancelled'); process.exit(0); }
config.transcription.provider = providerChoice as 'openai' | 'mistral';
const isMistral = config.transcription.provider === 'mistral';
// Check env vars first, then check if key was already entered for LLM provider
const existingKey = isMistral
? process.env.MISTRAL_API_KEY
: (process.env.OPENAI_API_KEY || config.providers?.find(p => p.id === 'openai')?.apiKey);
const providerLabel = isMistral ? 'Mistral' : 'OpenAI';
const apiKey = await p.text({
message: 'OpenAI API Key (for Whisper transcription)',
placeholder: 'sk-...',
message: `${providerLabel} API Key`,
placeholder: isMistral ? '' : 'sk-...',
initialValue: existingKey || '',
validate: (v) => {
if (!v) return 'API key is required for voice transcription';
@@ -1197,7 +1214,10 @@ function showSummary(config: OnboardConfig): void {
lines.push(`Features: ${features.length > 0 ? features.join(', ') : 'None'}`);
// Transcription
lines.push(`Voice: ${config.transcription.enabled ? 'Enabled (OpenAI Whisper)' : 'Disabled'}`);
const voiceLabel = config.transcription.enabled
? `Enabled (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})`
: 'Disabled';
lines.push(`Voice: ${voiceLabel}`);
// Google
if (config.google.enabled) {
@@ -1243,7 +1263,7 @@ async function reviewLoop(config: OnboardConfig, env: Record<string, string>): P
}
else if (choice === 'channels') await stepChannels(config, env);
else if (choice === 'features') await stepFeatures(config);
else if (choice === 'transcription') await stepTranscription(config);
else if (choice === 'transcription') await stepTranscription(config, true);
else if (choice === 'google') await stepGoogle(config);
}
}
@@ -1473,7 +1493,8 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
},
cron: existingConfig.features?.cron || false,
transcription: {
enabled: !!existingConfig.transcription?.apiKey || !!process.env.OPENAI_API_KEY,
enabled: !!existingConfig.transcription?.apiKey || !!process.env.OPENAI_API_KEY || !!process.env.MISTRAL_API_KEY,
provider: existingConfig.transcription?.provider || 'openai',
apiKey: existingConfig.transcription?.apiKey,
model: existingConfig.transcription?.model,
},
@@ -1639,7 +1660,11 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
}
if (config.transcription.enabled && config.transcription.apiKey) {
env.OPENAI_API_KEY = config.transcription.apiKey;
if (config.transcription.provider === 'mistral') {
env.MISTRAL_API_KEY = config.transcription.apiKey;
} else {
env.OPENAI_API_KEY = config.transcription.apiKey;
}
}
// Helper to format access control status
@@ -1670,7 +1695,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
'Features:',
config.heartbeat.enabled ? ` ✓ Heartbeat (${config.heartbeat.interval}min)` : ' ✗ Heartbeat',
config.cron ? ' ✓ Cron jobs' : ' ✗ Cron jobs',
config.transcription.enabled ? ' ✓ Voice transcription (OpenAI Whisper)' : ' ✗ Voice transcription',
config.transcription.enabled ? ` ✓ Voice transcription (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})` : ' ✗ Voice transcription',
].join('\n');
p.note(summary, 'Configuration Summary');
@@ -1782,7 +1807,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
agents: [agentConfig],
...(config.transcription.enabled && config.transcription.apiKey ? {
transcription: {
provider: 'openai' as const,
provider: config.transcription.provider || 'openai',
apiKey: config.transcription.apiKey,
...(config.transcription.model ? { model: config.transcription.model } : {}),
},

View File

@@ -58,11 +58,14 @@ export async function runSlackWizard(existingConfig?: {
const createdApp = await stepCreateApp();
if (!createdApp) return null;
// Step 2: Install to Workspace + Get Bot Token
// Step 2: Configure App Home (enable DM messaging)
await stepConfigureAppHome();
// Step 3: Install to Workspace + Get Bot Token
const botToken = await stepInstallApp(existingConfig?.botToken);
if (!botToken) return null;
// Step 3: Enable Socket Mode + Get App Token
// Step 4: Enable Socket Mode + Get App Token
const appToken = await stepEnableSocketMode(existingConfig?.appToken);
if (!appToken) return null;
@@ -82,7 +85,7 @@ export async function runSlackWizard(existingConfig?: {
}
async function stepCreateApp(): Promise<boolean> {
p.log.step('Step 1/3: Create Slack App from Manifest');
p.log.step('Step 1/4: Create Slack App from Manifest');
// Inline manifest for Socket Mode configuration
const appName = process.env.SLACK_APP_NAME || process.env.LETTA_AGENT_NAME || 'LettaBot';
@@ -99,6 +102,7 @@ oauth_config:
bot:
- app_mentions:read
- chat:write
- files:read
- im:history
- im:read
- im:write
@@ -117,7 +121,7 @@ settings:
p.note(
'Creates app with everything pre-configured:\n' +
' • Socket Mode enabled\n' +
' • 5 bot scopes (app_mentions:read, chat:write, im:*)\n' +
' • 6 bot scopes (app_mentions:read, chat:write, files:read, im:*)\n' +
' • 2 event subscriptions (app_mention, message.im)\n\n' +
'Just review and click "Create"!',
'One-Click Setup'
@@ -162,7 +166,7 @@ settings:
}
async function stepEnableSocketMode(existingToken?: string): Promise<string | null> {
p.log.step('Step 3/3: Get App-Level Token');
p.log.step('Step 4/4: Get App-Level Token');
p.note(
'1. In the left sidebar, click "Socket Mode"\n' +
@@ -197,6 +201,7 @@ async function stepConfigureScopes(): Promise<boolean> {
'3. Click "Add an OAuth Scope" for each:\n' +
' • app_mentions:read\n' +
' • chat:write\n' +
' • files:read\n' +
' • im:history\n' +
' • im:read\n' +
' • im:write',
@@ -244,7 +249,7 @@ async function stepConfigureEvents(): Promise<boolean> {
}
async function stepConfigureAppHome(): Promise<boolean> {
p.log.step('Step 5/6: Configure App Home');
p.log.step('Step 2/4: Configure App Home');
p.note(
'1. Go to "App Home" in left sidebar\n' +
@@ -267,7 +272,7 @@ async function stepConfigureAppHome(): Promise<boolean> {
}
async function stepInstallApp(existingToken?: string): Promise<string | null> {
p.log.step('Step 6/6: Install to Workspace');
p.log.step('Step 3/4: Install to Workspace');
p.note(
'1. Go to "Install App" in left sidebar\n' +

View File

@@ -1,7 +1,39 @@
/**
* Transcription service
* Transcription service router
*
* Currently supports OpenAI Whisper. Future providers can be added here.
* Delegates to the correct provider based on config.transcription.provider.
* Defaults to OpenAI Whisper for backwards compatibility.
*/
export { transcribeAudio, type TranscriptionResult } from './openai.js';
import { loadConfig } from '../config/index.js';
import type { TranscriptionResult } from './openai.js';
import { transcribeAudio as openaiTranscribe } from './openai.js';
import { transcribeAudio as mistralTranscribe } from './mistral.js';
export type { TranscriptionResult } from './openai.js';
/**
 * Check whether a transcription API key is available for the configured provider.
 * Used by channel handlers to gate voice message processing.
 */
export function isTranscriptionConfigured(): boolean {
  const { transcription } = loadConfig();
  // An explicit config key always wins, regardless of provider.
  if (transcription?.apiKey) return true;
  // Otherwise fall back to the provider-specific environment variable.
  const envKey = (transcription?.provider || 'openai') === 'mistral'
    ? process.env.MISTRAL_API_KEY
    : process.env.OPENAI_API_KEY;
  return !!envKey;
}
/**
 * Transcribe an audio buffer with the provider selected in config.
 * Defaults to OpenAI Whisper when no provider is configured (backwards
 * compatible with pre-Mistral configs).
 *
 * @param audioBuffer - Raw audio file contents
 * @param filename - Original filename; extension hints the audio format
 * @param options.audioPath - Optional saved-audio path echoed back on failure
 * @returns The provider's TranscriptionResult (success flag + text or error)
 */
export async function transcribeAudio(
  audioBuffer: Buffer,
  filename?: string,
  options?: { audioPath?: string }
): Promise<TranscriptionResult> {
  const useMistral = (loadConfig().transcription?.provider || 'openai') === 'mistral';
  const impl = useMistral ? mistralTranscribe : openaiTranscribe;
  return impl(audioBuffer, filename, options);
}

View File

@@ -0,0 +1,244 @@
/**
* Mistral Voxtral transcription service
*
* Uses Voxtral Transcribe 2 via the Mistral REST API.
* Simple multipart POST — no SDK dependency needed.
*/
import { execSync } from 'node:child_process';
import { mkdirSync, readdirSync, readFileSync, rmSync, unlinkSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';

import { loadConfig } from '../config/index.js';
import type { TranscriptionResult } from './openai.js';
// Upload size cap (20MB); buffers above this are split into chunks before upload.
const MAX_FILE_SIZE = 20 * 1024 * 1024;
// Segment length (10 minutes) used when splitting oversized audio with ffmpeg.
const CHUNK_DURATION_SECONDS = 600;
/**
 * Resolve the Mistral API key: config value first, then MISTRAL_API_KEY env var.
 * @throws When neither source provides a key.
 */
function getApiKey(): string {
  const key = loadConfig().transcription?.apiKey || process.env.MISTRAL_API_KEY;
  if (key) return key;
  throw new Error('Mistral API key required for transcription. Set in config (transcription.apiKey) or MISTRAL_API_KEY env var.');
}
/** Model name: config override → TRANSCRIPTION_MODEL env var → Voxtral default. */
function getModel(): string {
  const configured = loadConfig().transcription?.model;
  return configured || process.env.TRANSCRIPTION_MODEL || 'voxtral-mini-latest';
}
/**
 * Map a filename's extension (case-insensitive) to its audio MIME type.
 * Unknown or missing extensions fall back to 'audio/ogg', the most common
 * voice-message container.
 */
function getMimeType(filename: string): string {
  const MIME_BY_EXT: Record<string, string> = {
    ogg: 'audio/ogg',
    oga: 'audio/ogg',
    mp3: 'audio/mpeg',
    mp4: 'audio/mp4',
    m4a: 'audio/mp4',
    wav: 'audio/wav',
    flac: 'audio/flac',
    webm: 'audio/webm',
  };
  const ext = filename.split('.').pop()?.toLowerCase() ?? '';
  return MIME_BY_EXT[ext] ?? 'audio/ogg';
}
// Extensions that transcribeAudio does not send as-is: it first retries them
// under a mapped extension, then falls back to an ffmpeg mp3 conversion.
const NEEDS_CONVERSION = ['aac', 'amr', 'caf', 'x-caf', '3gp', '3gpp'];
// Extension remapping tried before a full ffmpeg conversion.
// NOTE(review): 'opus' is mapped here but absent from NEEDS_CONVERSION, so its
// entry is never consulted by transcribeAudio — confirm whether opus should be
// in NEEDS_CONVERSION or this entry removed.
const FORMAT_MAP: Record<string, string> = {
'aac': 'm4a',
'amr': 'mp3',
'opus': 'ogg',
'x-caf': 'm4a',
'caf': 'm4a',
'3gp': 'mp4',
'3gpp': 'mp4',
};
// Cached result of the ffmpeg availability probe (null = not yet checked).
let ffmpegAvailable: boolean | null = null;

/**
 * Check whether ffmpeg is on PATH, caching the result for the process lifetime.
 * Probes with `ffmpeg -version` instead of `which ffmpeg` so the check also
 * works on platforms that have ffmpeg but no `which` binary (e.g. Windows).
 */
function isFfmpegAvailable(): boolean {
  if (ffmpegAvailable === null) {
    try {
      execSync('ffmpeg -version', { stdio: 'ignore' });
      ffmpegAvailable = true;
    } catch {
      ffmpegAvailable = false;
    }
  }
  return ffmpegAvailable;
}
/**
 * Convert an audio buffer to MP3 by round-tripping it through ffmpeg on disk.
 * Caller is responsible for ensuring ffmpeg is available (isFfmpegAvailable).
 * Temp files are removed in all cases; a failed conversion throws.
 */
function convertAudioToMp3(audioBuffer: Buffer, inputExt: string): Buffer {
  const workDir = join(tmpdir(), 'lettabot-transcription');
  mkdirSync(workDir, { recursive: true });
  const stamp = Date.now();
  const src = join(workDir, `input-${stamp}.${inputExt}`);
  const dst = join(workDir, `output-${stamp}.mp3`);
  try {
    writeFileSync(src, audioBuffer);
    // -q:a 2 ≈ high-quality VBR; stderr suppressed to keep logs clean.
    execSync(`ffmpeg -y -i "${src}" -acodec libmp3lame -q:a 2 "${dst}" 2>/dev/null`, {
      timeout: 30000,
    });
    const converted = readFileSync(dst);
    console.log(`[Transcription] Converted ${audioBuffer.length} bytes → ${converted.length} bytes`);
    return converted;
  } finally {
    try { unlinkSync(src); } catch {}
    try { unlinkSync(dst); } catch {}
  }
}
/**
 * POST a single audio buffer to the Voxtral transcription endpoint and
 * return the transcribed text. Throws on a missing API key or any non-2xx
 * HTTP response.
 */
async function attemptTranscription(audioBuffer: Buffer, filename: string): Promise<string> {
  // Resolve credentials first so a missing key fails before any upload work.
  const apiKey = getApiKey();
  const model = getModel();

  const upload = new File([new Uint8Array(audioBuffer)], filename, {
    type: getMimeType(filename),
  });
  const form = new FormData();
  form.append('model', model);
  form.append('file', upload);

  const response = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
    method: 'POST',
    headers: { 'Authorization': `Bearer ${apiKey}` },
    body: form,
  });
  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Mistral API error (${response.status}): ${errorText}`);
  }

  const payload = await response.json() as { text: string };
  return payload.text;
}
/**
 * Split an oversized audio buffer into CHUNK_DURATION_SECONDS segments with
 * ffmpeg, transcribe each segment in order, and join the results with spaces.
 *
 * @param audioBuffer - Full audio contents (caller checked > MAX_FILE_SIZE)
 * @param ext - Extension for the temp input file (drives ffmpeg demuxing)
 * @returns Combined transcription text from all non-empty chunks
 * @throws If ffmpeg is unavailable or segmentation yields no chunk files
 */
async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
  if (!isFfmpegAvailable()) {
    throw new Error('Cannot split large audio files without ffmpeg');
  }
  const tempDir = join(tmpdir(), 'lettabot-transcription', `chunks-${Date.now()}`);
  mkdirSync(tempDir, { recursive: true });
  const inputPath = join(tempDir, `input.${ext}`);
  const outputPattern = join(tempDir, 'chunk-%03d.mp3');
  try {
    writeFileSync(inputPath, audioBuffer);
    // -reset_timestamps 1 makes each segment independently playable/uploadable.
    execSync(
      `ffmpeg -y -i "${inputPath}" -f segment -segment_time ${CHUNK_DURATION_SECONDS} -reset_timestamps 1 -acodec libmp3lame -q:a 2 "${outputPattern}" 2>/dev/null`,
      { timeout: 120000 }
    );
    // Sort so chunk-000, chunk-001, … are transcribed in playback order.
    const chunkFiles = readdirSync(tempDir)
      .filter(f => f.startsWith('chunk-') && f.endsWith('.mp3'))
      .sort();
    if (chunkFiles.length === 0) {
      throw new Error('Failed to split audio into chunks');
    }
    console.log(`[Transcription] Split into ${chunkFiles.length} chunks`);
    // Sequential (not parallel) uploads keep the combined text in order and
    // avoid bursting the API with simultaneous requests.
    const transcriptions: string[] = [];
    for (let i = 0; i < chunkFiles.length; i++) {
      const chunkPath = join(tempDir, chunkFiles[i]);
      const chunkBuffer = readFileSync(chunkPath);
      console.log(`[Transcription] Transcribing chunk ${i + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
      const text = await attemptTranscription(chunkBuffer, chunkFiles[i]);
      if (text.trim()) {
        transcriptions.push(text.trim());
      }
    }
    const combined = transcriptions.join(' ');
    console.log(`[Transcription] Combined ${transcriptions.length} chunks into ${combined.length} chars`);
    return combined;
  } finally {
    // Portable cleanup: rmSync replaces the previous unlink-loop plus
    // shelled-out `rmdir`, removing the directory and any leftover files in
    // one call, on all platforms. Best-effort — never masks the real error.
    try { rmSync(tempDir, { recursive: true, force: true }); } catch {}
  }
}
/**
 * Transcribe audio using Mistral Voxtral API
 *
 * Voxtral supports: wav, mp3, flac, ogg, webm
 * Telegram voice messages (OGG/Opus) work natively.
 *
 * For extensions in NEEDS_CONVERSION the function first retries the upload
 * under a mapped extension (no re-encoding), then falls back to an ffmpeg
 * mp3 conversion when available. Buffers over MAX_FILE_SIZE are delegated
 * to transcribeInChunks.
 *
 * @param audioBuffer - Raw audio file contents
 * @param filename - Original filename; its extension drives format handling
 * @param options.audioPath - Optional saved-audio path echoed back in failures
 * @returns Success with transcribed text, or failure with an error message
 *          (this function never throws; all errors become failed results)
 */
export async function transcribeAudio(
  audioBuffer: Buffer,
  filename: string = 'audio.ogg',
  options?: { audioPath?: string }
): Promise<TranscriptionResult> {
  const ext = filename.split('.').pop()?.toLowerCase() || '';
  try {
    let finalBuffer = audioBuffer;
    let finalFilename = filename;
    // Convert unsupported formats via ffmpeg
    if (NEEDS_CONVERSION.includes(ext)) {
      const mapped = FORMAT_MAP[ext];
      if (mapped) {
        // First, cheap attempt: upload unchanged bytes under the mapped
        // extension; a failure here falls through to a real conversion.
        console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`);
        finalFilename = filename.replace(/\.[^.]+$/, `.${mapped}`);
        try {
          const text = await attemptTranscription(finalBuffer, finalFilename);
          return { success: true, text };
        } catch {
          console.log(`[Transcription] Rename approach failed for .${ext}`);
        }
      }
      if (isFfmpegAvailable()) {
        console.log(`[Transcription] Converting .${ext} → .mp3 with ffmpeg`);
        finalBuffer = convertAudioToMp3(audioBuffer, ext);
        finalFilename = filename.replace(/\.[^.]+$/, '.mp3');
      } else {
        // No ffmpeg and the rename shortcut failed (or had no mapping):
        // return an actionable error rather than uploading a rejected format.
        return {
          success: false,
          error: `Cannot transcribe .${ext} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, flac).`,
          audioPath: options?.audioPath,
        };
      }
    }
    // Check file size and chunk if needed
    if (finalBuffer.length > MAX_FILE_SIZE) {
      // Use the post-conversion extension so chunking sees the real format.
      const finalExt = finalFilename.split('.').pop()?.toLowerCase() || 'ogg';
      console.log(`[Transcription] File too large (${(finalBuffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
      const text = await transcribeInChunks(finalBuffer, finalExt);
      return { success: true, text };
    }
    const text = await attemptTranscription(finalBuffer, finalFilename);
    return { success: true, text };
  } catch (error) {
    // Surface every failure as a result object so channel handlers can
    // relay the message to the user instead of crashing.
    const errorMsg = error instanceof Error ? error.message : String(error);
    return {
      success: false,
      error: errorMsg,
      audioPath: options?.audioPath,
    };
  }
}