feat: add Mistral Voxtral transcription support (#228)
This commit is contained in:
committed by
GitHub
parent
6bda859559
commit
cae5b104b3
22
README.md
22
README.md
@@ -109,12 +109,14 @@ That's it! Message your bot on Telegram.
|
||||
|
||||
## Voice Messages
|
||||
|
||||
LettaBot can transcribe voice messages using OpenAI Whisper. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
|
||||
LettaBot can transcribe voice messages using either OpenAI Whisper or Mistral Voxtral. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
|
||||
|
||||
**Supported channels:** Telegram, WhatsApp, Signal, Slack, Discord
|
||||
|
||||
### Configuration
|
||||
|
||||
**Option 1: OpenAI Whisper**
|
||||
|
||||
Add your OpenAI API key to `lettabot.yaml`:
|
||||
|
||||
```yaml
|
||||
@@ -130,7 +132,23 @@ Or set via environment variable:
|
||||
export OPENAI_API_KEY=sk-...
|
||||
```
|
||||
|
||||
If no API key is configured, voice messages are silently ignored.
|
||||
**Option 2: Mistral Voxtral** (2x faster and 2x cheaper than OpenAI Whisper)
|
||||
|
||||
Add your Mistral API key to `lettabot.yaml`:
|
||||
|
||||
```yaml
|
||||
transcription:
|
||||
provider: mistral
|
||||
apiKey: ...
|
||||
```
|
||||
|
||||
Or set via environment variable:
|
||||
|
||||
```bash
|
||||
export MISTRAL_API_KEY=...
|
||||
```
|
||||
|
||||
If no API key is configured, users will receive an error message with a link to this section.
|
||||
|
||||
## Skills
|
||||
LettaBot is compatible with [skills.sh](https://skills.sh) and [Clawdhub](https://clawdhub.com/).
|
||||
|
||||
@@ -19,7 +19,32 @@ brew install signal-cli
|
||||
|
||||
### 2. Register Your Phone Number
|
||||
|
||||
You need a phone number that can receive SMS for verification.
|
||||
You have two options:
|
||||
|
||||
#### Option A: Link as Secondary Device (Recommended)
|
||||
|
||||
Link signal-cli to your existing Signal account without disrupting your phone app:
|
||||
|
||||
```bash
|
||||
# Generate a linking QR code/URI
|
||||
signal-cli link -n "LettaBot"
|
||||
```
|
||||
|
||||
This will display a `sgnl://linkdevice?uuid=...` URI. On your phone:
|
||||
1. Open Signal → Settings (tap your profile)
|
||||
2. Tap "Linked Devices"
|
||||
3. Tap "Link New Device" (+ button)
|
||||
4. Scan the QR code or enter the URI
|
||||
|
||||
**Benefits:**
|
||||
- Your phone's Signal app continues to work normally
|
||||
- Bot runs as a linked device (like Signal Desktop)
|
||||
- Both your phone and the bot receive messages
|
||||
- You can unlink the bot anytime from your phone
|
||||
|
||||
#### Option B: Primary Registration (Dedicated Number Only)
|
||||
|
||||
Register signal-cli as the primary device (requires a dedicated phone number):
|
||||
|
||||
```bash
|
||||
# Request verification code (sent via SMS)
|
||||
@@ -29,7 +54,7 @@ signal-cli -a +1XXXXXXXXXX register
|
||||
signal-cli -a +1XXXXXXXXXX verify CODE
|
||||
```
|
||||
|
||||
**Note:** You can only have one Signal client per number. Registering signal-cli will log out your Signal mobile app. Consider using a secondary number.
|
||||
**Warning:** This will log out your Signal mobile app. Only use this option with a dedicated bot number, not your personal number.
|
||||
|
||||
## Configuration
|
||||
|
||||
|
||||
@@ -48,6 +48,7 @@ Socket Mode lets your bot connect without exposing a public endpoint.
|
||||
|-------|---------|
|
||||
| `app_mentions:read` | React when someone @mentions your bot |
|
||||
| `chat:write` | Send messages |
|
||||
| `files:read` | Download voice message attachments |
|
||||
| `im:history` | Read DM message history |
|
||||
| `im:read` | View DM channel info |
|
||||
| `im:write` | Start DM conversations |
|
||||
|
||||
@@ -180,10 +180,9 @@ Ask the bot owner to approve with:
|
||||
const audioAttachment = message.attachments.find(a => a.contentType?.startsWith('audio/'));
|
||||
if (audioAttachment?.url) {
|
||||
try {
|
||||
const { loadConfig } = await import('../config/index.js');
|
||||
const config = loadConfig();
|
||||
if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
|
||||
await message.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
|
||||
const { isTranscriptionConfigured } = await import('../transcription/index.js');
|
||||
if (!isTranscriptionConfigured()) {
|
||||
await message.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
|
||||
} else {
|
||||
// Download audio
|
||||
const response = await fetch(audioAttachment.url);
|
||||
|
||||
@@ -494,9 +494,9 @@ export async function setupSignal(existing?: any): Promise<any> {
|
||||
|
||||
p.note(
|
||||
'See docs/signal-setup.md for detailed instructions.\n' +
|
||||
'Requires signal-cli registered with your phone number.\n\n' +
|
||||
'⚠️ Security: Has full access to your Signal account.\n' +
|
||||
'Can see all messages and send as you.',
|
||||
'Recommended: Link as secondary device (signal-cli link -n "LettaBot")\n' +
|
||||
'This keeps your phone\'s Signal app working normally.\n\n' +
|
||||
'Requires signal-cli registered or linked with your phone number.',
|
||||
'Signal Setup'
|
||||
);
|
||||
|
||||
|
||||
@@ -623,14 +623,12 @@ This code expires in 1 hour.`;
|
||||
}
|
||||
|
||||
try {
|
||||
const { loadConfig } = await import('../config/index.js');
|
||||
const config = loadConfig();
|
||||
if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
|
||||
const { isTranscriptionConfigured } = await import('../transcription/index.js');
|
||||
if (!isTranscriptionConfigured()) {
|
||||
if (chatId) {
|
||||
const audioInfo = savedAudioPath ? ` Audio saved to: ${savedAudioPath}` : '';
|
||||
await this.sendMessage({
|
||||
chatId,
|
||||
text: `Voice messages require OpenAI API key for transcription.${audioInfo} See: https://github.com/letta-ai/lettabot#voice-messages`
|
||||
text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
|
||||
});
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -60,9 +60,9 @@ export class SlackAdapter implements ChannelAdapter {
|
||||
|
||||
// Handle messages
|
||||
this.app.message(async ({ message, say, client }) => {
|
||||
// Type guard for regular messages
|
||||
if (message.subtype !== undefined) return;
|
||||
if (!('user' in message) || !('text' in message)) return;
|
||||
// Type guard for regular messages (allow file_share for voice messages)
|
||||
if (message.subtype !== undefined && message.subtype !== 'file_share') return;
|
||||
if (!('user' in message)) return;
|
||||
|
||||
const userId = message.user;
|
||||
let text = message.text || '';
|
||||
@@ -74,10 +74,9 @@ export class SlackAdapter implements ChannelAdapter {
|
||||
const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
|
||||
if (audioFile?.url_private_download) {
|
||||
try {
|
||||
const { loadConfig } = await import('../config/index.js');
|
||||
const config = loadConfig();
|
||||
if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
|
||||
await say('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
|
||||
const { isTranscriptionConfigured } = await import('../transcription/index.js');
|
||||
if (!isTranscriptionConfigured()) {
|
||||
await say('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
|
||||
} else {
|
||||
// Download file (requires bot token for auth)
|
||||
const response = await fetch(audioFile.url_private_download, {
|
||||
@@ -173,10 +172,43 @@ export class SlackAdapter implements ChannelAdapter {
|
||||
// Handle app mentions (@bot)
|
||||
this.app.event('app_mention', async ({ event }) => {
|
||||
const userId = event.user || '';
|
||||
const text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention
|
||||
let text = (event.text || '').replace(/<@[A-Z0-9]+>/g, '').trim(); // Remove mention
|
||||
const channelId = event.channel;
|
||||
const threadTs = event.thread_ts || event.ts; // Reply in thread, or start new thread from the mention
|
||||
|
||||
// Handle audio file attachments
|
||||
const files = (event as any).files as Array<{ mimetype?: string; url_private_download?: string; name?: string }> | undefined;
|
||||
const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
|
||||
if (audioFile?.url_private_download) {
|
||||
try {
|
||||
const { isTranscriptionConfigured } = await import('../transcription/index.js');
|
||||
if (!isTranscriptionConfigured()) {
|
||||
await this.sendMessage({ chatId: channelId, text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages', threadId: threadTs });
|
||||
return;
|
||||
}
|
||||
// Download file (requires bot token for auth)
|
||||
const response = await fetch(audioFile.url_private_download, {
|
||||
headers: { 'Authorization': `Bearer ${this.config.botToken}` }
|
||||
});
|
||||
const buffer = Buffer.from(await response.arrayBuffer());
|
||||
|
||||
const { transcribeAudio } = await import('../transcription/index.js');
|
||||
const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
|
||||
const result = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
|
||||
|
||||
if (result.success && result.text) {
|
||||
console.log(`[Slack] Transcribed audio: "${result.text.slice(0, 50)}..."`);
|
||||
text = (text ? text + '\n' : '') + `[Voice message]: ${result.text}`;
|
||||
} else {
|
||||
console.error(`[Slack] Transcription failed: ${result.error}`);
|
||||
text = (text ? text + '\n' : '') + `[Voice message - transcription failed: ${result.error}]`;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[Slack] Error transcribing audio:', error);
|
||||
text = (text ? text + '\n' : '') + `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
|
||||
}
|
||||
}
|
||||
|
||||
if (this.config.allowedUsers && this.config.allowedUsers.length > 0) {
|
||||
if (!userId || !this.config.allowedUsers.includes(userId)) {
|
||||
// Can't use say() in app_mention event the same way
|
||||
|
||||
@@ -346,10 +346,9 @@ export class TelegramAdapter implements ChannelAdapter {
|
||||
const { isGroup, groupName, wasMentioned, isListeningMode } = gating;
|
||||
|
||||
// Check if transcription is configured (config or env)
|
||||
const { loadConfig } = await import('../config/index.js');
|
||||
const config = loadConfig();
|
||||
if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
|
||||
await ctx.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
|
||||
const { isTranscriptionConfigured } = await import('../transcription/index.js');
|
||||
if (!isTranscriptionConfigured()) {
|
||||
await ctx.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -143,18 +143,22 @@ export async function extractInboundMessage(
|
||||
|
||||
// Collect attachments if media present and config provided
|
||||
let attachments: InboundAttachment[] = [];
|
||||
let voiceTranscription: string | undefined;
|
||||
if (preview.hasMedia && attachmentConfig) {
|
||||
const result = await collectAttachments({
|
||||
messageContent,
|
||||
chatId: remoteJid,
|
||||
messageId: messageId || 'unknown',
|
||||
sock,
|
||||
...attachmentConfig,
|
||||
});
|
||||
attachments = result.attachments;
|
||||
voiceTranscription = result.voiceTranscription;
|
||||
}
|
||||
|
||||
// Use caption as fallback text (for media-only messages)
|
||||
const finalBody = body || preview.caption || '';
|
||||
// For voice messages, use transcription if available
|
||||
const finalBody = voiceTranscription || body || preview.caption || '';
|
||||
if (!finalBody && attachments.length === 0) {
|
||||
return null; // Skip messages with no text and no media
|
||||
}
|
||||
|
||||
@@ -55,19 +55,21 @@ export function extractMediaPreview(messageContent: any): { hasMedia: boolean; c
|
||||
* Handles 5 media types: image, video, audio, document, sticker.
|
||||
* Downloads using Baileys' downloadContentFromMessage and saves to disk.
|
||||
* Enforces size limits and supports metadata-only mode.
|
||||
* Transcribes voice messages (ptt: true) using configured transcription provider.
|
||||
*
|
||||
* @param params - Attachment collection parameters
|
||||
* @returns Attachments array and optional caption
|
||||
* @returns Attachments array, optional caption, and optional transcribed text for voice messages
|
||||
*/
|
||||
export async function collectAttachments(params: {
|
||||
messageContent: any;
|
||||
chatId: string;
|
||||
messageId: string;
|
||||
downloadContentFromMessage: (message: any, type: string) => Promise<AsyncIterable<Uint8Array>>;
|
||||
sock: import("@whiskeysockets/baileys").WASocket;
|
||||
attachmentsDir?: string;
|
||||
attachmentsMaxBytes?: number;
|
||||
}): Promise<{ attachments: InboundAttachment[]; caption?: string }> {
|
||||
const { messageContent, chatId, messageId, downloadContentFromMessage, attachmentsDir, attachmentsMaxBytes } = params;
|
||||
}): Promise<{ attachments: InboundAttachment[]; caption?: string; voiceTranscription?: string }> {
|
||||
const { messageContent, chatId, messageId, downloadContentFromMessage, sock, attachmentsDir, attachmentsMaxBytes } = params;
|
||||
const attachments: InboundAttachment[] = [];
|
||||
|
||||
if (!messageContent) return { attachments };
|
||||
@@ -122,6 +124,10 @@ export async function collectAttachments(params: {
|
||||
kind,
|
||||
};
|
||||
|
||||
// Check if this is a voice message (ptt = push-to-talk)
|
||||
const isPttVoiceMessage = mediaType === 'audio' && mediaMessage.ptt === true;
|
||||
let voiceTranscription: string | undefined;
|
||||
|
||||
// Download if attachmentsDir is configured
|
||||
if (attachmentsDir) {
|
||||
// Metadata-only mode (attachmentsMaxBytes = 0)
|
||||
@@ -151,11 +157,54 @@ export async function collectAttachments(params: {
|
||||
}
|
||||
}
|
||||
|
||||
attachments.push(attachment);
|
||||
// Transcribe voice messages
|
||||
if (isPttVoiceMessage) {
|
||||
try {
|
||||
const { isTranscriptionConfigured } = await import('../../../transcription/index.js');
|
||||
if (!isTranscriptionConfigured()) {
|
||||
// Send error message directly to user (matches Telegram/Slack/Discord/Signal behavior)
|
||||
try {
|
||||
await sock.sendMessage(chatId, {
|
||||
text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
|
||||
});
|
||||
} catch (sendError) {
|
||||
console.error('[WhatsApp] Failed to send transcription error message:', sendError);
|
||||
}
|
||||
// Don't forward error to agent - return early
|
||||
const caption = mediaMessage.caption as string | undefined;
|
||||
return { attachments, caption };
|
||||
}
|
||||
|
||||
// Download audio buffer for transcription
|
||||
const stream = await downloadContentFromMessage(mediaMessage, mediaType);
|
||||
const chunks: Uint8Array[] = [];
|
||||
for await (const chunk of stream) {
|
||||
chunks.push(chunk);
|
||||
}
|
||||
const buffer = Buffer.concat(chunks);
|
||||
|
||||
// Transcribe audio
|
||||
const { transcribeAudio } = await import('../../../transcription/index.js');
|
||||
const result = await transcribeAudio(buffer, name);
|
||||
|
||||
if (result.success && result.text) {
|
||||
console.log(`[WhatsApp] Transcribed voice message: "${result.text.slice(0, 50)}..."`);
|
||||
voiceTranscription = `[Voice message]: ${result.text}`;
|
||||
} else {
|
||||
console.error(`[WhatsApp] Transcription failed: ${result.error}`);
|
||||
voiceTranscription = `[Voice message - transcription failed: ${result.error}]`;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[WhatsApp] Error transcribing voice message:', error);
|
||||
voiceTranscription = `[Voice message - error: ${error instanceof Error ? error.message : 'unknown error'}]`;
|
||||
}
|
||||
}
|
||||
|
||||
attachments.push(attachment);
|
||||
const caption = mediaMessage.caption as string | undefined;
|
||||
return { attachments, caption, voiceTranscription };
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract file extension from MIME type.
|
||||
*
|
||||
|
||||
@@ -183,9 +183,9 @@ export interface LettaBotConfig {
|
||||
}
|
||||
|
||||
export interface TranscriptionConfig {
|
||||
provider: 'openai'; // Only OpenAI supported currently
|
||||
apiKey?: string; // Falls back to OPENAI_API_KEY env var
|
||||
model?: string; // Defaults to 'whisper-1'
|
||||
provider: 'openai' | 'mistral';
|
||||
apiKey?: string; // Falls back to OPENAI_API_KEY or MISTRAL_API_KEY env var
|
||||
model?: string; // Defaults to 'whisper-1' (OpenAI) or 'voxtral-mini-latest' (Mistral)
|
||||
}
|
||||
|
||||
export interface PollingYamlConfig {
|
||||
|
||||
@@ -290,7 +290,7 @@ interface OnboardConfig {
|
||||
cron: boolean;
|
||||
|
||||
// Transcription (voice messages)
|
||||
transcription: { enabled: boolean; apiKey?: string; model?: string };
|
||||
transcription: { enabled: boolean; provider?: 'openai' | 'mistral'; apiKey?: string; model?: string };
|
||||
}
|
||||
|
||||
const isPlaceholder = (val?: string) => !val || /^(your_|sk-\.\.\.|placeholder|example)/i.test(val);
|
||||
@@ -665,6 +665,7 @@ async function stepProviders(config: OnboardConfig, env: Record<string, string>)
|
||||
});
|
||||
if (!p.isCancel(enableTranscription) && enableTranscription) {
|
||||
config.transcription.enabled = true;
|
||||
config.transcription.provider = 'openai';
|
||||
config.transcription.apiKey = providerKey;
|
||||
}
|
||||
}
|
||||
@@ -838,23 +839,39 @@ async function stepFeatures(config: OnboardConfig): Promise<void> {
|
||||
// Voice Transcription Setup
|
||||
// ============================================================================
|
||||
|
||||
async function stepTranscription(config: OnboardConfig): Promise<void> {
|
||||
// Skip if already configured from the providers step
|
||||
if (config.transcription.enabled && config.transcription.apiKey) return;
|
||||
async function stepTranscription(config: OnboardConfig, forcePrompt?: boolean): Promise<void> {
|
||||
// Skip if already configured (e.g. from OpenAI shortcut in stepProviders)
|
||||
if (!forcePrompt && config.transcription.enabled && config.transcription.apiKey) return;
|
||||
|
||||
const setupTranscription = await p.confirm({
|
||||
message: 'Enable voice message transcription? (uses OpenAI Whisper)',
|
||||
message: 'Enable voice message transcription?',
|
||||
initialValue: config.transcription.enabled,
|
||||
});
|
||||
if (p.isCancel(setupTranscription)) { p.cancel('Setup cancelled'); process.exit(0); }
|
||||
config.transcription.enabled = setupTranscription;
|
||||
|
||||
if (setupTranscription) {
|
||||
const existingKey = process.env.OPENAI_API_KEY;
|
||||
const providerChoice = await p.select({
|
||||
message: 'Transcription provider',
|
||||
options: [
|
||||
{ value: 'openai', label: 'OpenAI Whisper', hint: 'whisper-1' },
|
||||
{ value: 'mistral', label: 'Mistral Voxtral', hint: 'voxtral-mini-latest' },
|
||||
],
|
||||
initialValue: config.transcription.provider || 'openai',
|
||||
});
|
||||
if (p.isCancel(providerChoice)) { p.cancel('Setup cancelled'); process.exit(0); }
|
||||
config.transcription.provider = providerChoice as 'openai' | 'mistral';
|
||||
|
||||
const isMistral = config.transcription.provider === 'mistral';
|
||||
// Check env vars first, then check if key was already entered for LLM provider
|
||||
const existingKey = isMistral
|
||||
? process.env.MISTRAL_API_KEY
|
||||
: (process.env.OPENAI_API_KEY || config.providers?.find(p => p.id === 'openai')?.apiKey);
|
||||
const providerLabel = isMistral ? 'Mistral' : 'OpenAI';
|
||||
|
||||
const apiKey = await p.text({
|
||||
message: 'OpenAI API Key (for Whisper transcription)',
|
||||
placeholder: 'sk-...',
|
||||
message: `${providerLabel} API Key`,
|
||||
placeholder: isMistral ? '' : 'sk-...',
|
||||
initialValue: existingKey || '',
|
||||
validate: (v) => {
|
||||
if (!v) return 'API key is required for voice transcription';
|
||||
@@ -1197,7 +1214,10 @@ function showSummary(config: OnboardConfig): void {
|
||||
lines.push(`Features: ${features.length > 0 ? features.join(', ') : 'None'}`);
|
||||
|
||||
// Transcription
|
||||
lines.push(`Voice: ${config.transcription.enabled ? 'Enabled (OpenAI Whisper)' : 'Disabled'}`);
|
||||
const voiceLabel = config.transcription.enabled
|
||||
? `Enabled (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})`
|
||||
: 'Disabled';
|
||||
lines.push(`Voice: ${voiceLabel}`);
|
||||
|
||||
// Google
|
||||
if (config.google.enabled) {
|
||||
@@ -1243,7 +1263,7 @@ async function reviewLoop(config: OnboardConfig, env: Record<string, string>): P
|
||||
}
|
||||
else if (choice === 'channels') await stepChannels(config, env);
|
||||
else if (choice === 'features') await stepFeatures(config);
|
||||
else if (choice === 'transcription') await stepTranscription(config);
|
||||
else if (choice === 'transcription') await stepTranscription(config, true);
|
||||
else if (choice === 'google') await stepGoogle(config);
|
||||
}
|
||||
}
|
||||
@@ -1473,7 +1493,8 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
|
||||
},
|
||||
cron: existingConfig.features?.cron || false,
|
||||
transcription: {
|
||||
enabled: !!existingConfig.transcription?.apiKey || !!process.env.OPENAI_API_KEY,
|
||||
enabled: !!existingConfig.transcription?.apiKey || !!process.env.OPENAI_API_KEY || !!process.env.MISTRAL_API_KEY,
|
||||
provider: existingConfig.transcription?.provider || 'openai',
|
||||
apiKey: existingConfig.transcription?.apiKey,
|
||||
model: existingConfig.transcription?.model,
|
||||
},
|
||||
@@ -1639,8 +1660,12 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
|
||||
}
|
||||
|
||||
if (config.transcription.enabled && config.transcription.apiKey) {
|
||||
if (config.transcription.provider === 'mistral') {
|
||||
env.MISTRAL_API_KEY = config.transcription.apiKey;
|
||||
} else {
|
||||
env.OPENAI_API_KEY = config.transcription.apiKey;
|
||||
}
|
||||
}
|
||||
|
||||
// Helper to format access control status
|
||||
const formatAccess = (policy?: string, allowedUsers?: string[]) => {
|
||||
@@ -1670,7 +1695,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
|
||||
'Features:',
|
||||
config.heartbeat.enabled ? ` ✓ Heartbeat (${config.heartbeat.interval}min)` : ' ✗ Heartbeat',
|
||||
config.cron ? ' ✓ Cron jobs' : ' ✗ Cron jobs',
|
||||
config.transcription.enabled ? ' ✓ Voice transcription (OpenAI Whisper)' : ' ✗ Voice transcription',
|
||||
config.transcription.enabled ? ` ✓ Voice transcription (${config.transcription.provider === 'mistral' ? 'Mistral Voxtral' : 'OpenAI Whisper'})` : ' ✗ Voice transcription',
|
||||
].join('\n');
|
||||
|
||||
p.note(summary, 'Configuration Summary');
|
||||
@@ -1782,7 +1807,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
|
||||
agents: [agentConfig],
|
||||
...(config.transcription.enabled && config.transcription.apiKey ? {
|
||||
transcription: {
|
||||
provider: 'openai' as const,
|
||||
provider: config.transcription.provider || 'openai',
|
||||
apiKey: config.transcription.apiKey,
|
||||
...(config.transcription.model ? { model: config.transcription.model } : {}),
|
||||
},
|
||||
|
||||
@@ -58,11 +58,14 @@ export async function runSlackWizard(existingConfig?: {
|
||||
const createdApp = await stepCreateApp();
|
||||
if (!createdApp) return null;
|
||||
|
||||
// Step 2: Install to Workspace + Get Bot Token
|
||||
// Step 2: Configure App Home (enable DM messaging)
|
||||
await stepConfigureAppHome();
|
||||
|
||||
// Step 3: Install to Workspace + Get Bot Token
|
||||
const botToken = await stepInstallApp(existingConfig?.botToken);
|
||||
if (!botToken) return null;
|
||||
|
||||
// Step 3: Enable Socket Mode + Get App Token
|
||||
// Step 4: Enable Socket Mode + Get App Token
|
||||
const appToken = await stepEnableSocketMode(existingConfig?.appToken);
|
||||
if (!appToken) return null;
|
||||
|
||||
@@ -82,7 +85,7 @@ export async function runSlackWizard(existingConfig?: {
|
||||
}
|
||||
|
||||
async function stepCreateApp(): Promise<boolean> {
|
||||
p.log.step('Step 1/3: Create Slack App from Manifest');
|
||||
p.log.step('Step 1/4: Create Slack App from Manifest');
|
||||
|
||||
// Inline manifest for Socket Mode configuration
|
||||
const appName = process.env.SLACK_APP_NAME || process.env.LETTA_AGENT_NAME || 'LettaBot';
|
||||
@@ -99,6 +102,7 @@ oauth_config:
|
||||
bot:
|
||||
- app_mentions:read
|
||||
- chat:write
|
||||
- files:read
|
||||
- im:history
|
||||
- im:read
|
||||
- im:write
|
||||
@@ -117,7 +121,7 @@ settings:
|
||||
p.note(
|
||||
'Creates app with everything pre-configured:\n' +
|
||||
' • Socket Mode enabled\n' +
|
||||
' • 5 bot scopes (app_mentions:read, chat:write, im:*)\n' +
|
||||
' • 6 bot scopes (app_mentions:read, chat:write, files:read, im:*)\n' +
|
||||
' • 2 event subscriptions (app_mention, message.im)\n\n' +
|
||||
'Just review and click "Create"!',
|
||||
'One-Click Setup'
|
||||
@@ -162,7 +166,7 @@ settings:
|
||||
}
|
||||
|
||||
async function stepEnableSocketMode(existingToken?: string): Promise<string | null> {
|
||||
p.log.step('Step 3/3: Get App-Level Token');
|
||||
p.log.step('Step 4/4: Get App-Level Token');
|
||||
|
||||
p.note(
|
||||
'1. In the left sidebar, click "Socket Mode"\n' +
|
||||
@@ -197,6 +201,7 @@ async function stepConfigureScopes(): Promise<boolean> {
|
||||
'3. Click "Add an OAuth Scope" for each:\n' +
|
||||
' • app_mentions:read\n' +
|
||||
' • chat:write\n' +
|
||||
' • files:read\n' +
|
||||
' • im:history\n' +
|
||||
' • im:read\n' +
|
||||
' • im:write',
|
||||
@@ -244,7 +249,7 @@ async function stepConfigureEvents(): Promise<boolean> {
|
||||
}
|
||||
|
||||
async function stepConfigureAppHome(): Promise<boolean> {
|
||||
p.log.step('Step 5/6: Configure App Home');
|
||||
p.log.step('Step 2/4: Configure App Home');
|
||||
|
||||
p.note(
|
||||
'1. Go to "App Home" in left sidebar\n' +
|
||||
@@ -267,7 +272,7 @@ async function stepConfigureAppHome(): Promise<boolean> {
|
||||
}
|
||||
|
||||
async function stepInstallApp(existingToken?: string): Promise<string | null> {
|
||||
p.log.step('Step 6/6: Install to Workspace');
|
||||
p.log.step('Step 3/4: Install to Workspace');
|
||||
|
||||
p.note(
|
||||
'1. Go to "Install App" in left sidebar\n' +
|
||||
|
||||
@@ -1,7 +1,39 @@
|
||||
/**
|
||||
* Transcription service
|
||||
* Transcription service router
|
||||
*
|
||||
* Currently supports OpenAI Whisper. Future providers can be added here.
|
||||
* Delegates to the correct provider based on config.transcription.provider.
|
||||
* Defaults to OpenAI Whisper for backwards compatibility.
|
||||
*/
|
||||
|
||||
export { transcribeAudio, type TranscriptionResult } from './openai.js';
|
||||
import { loadConfig } from '../config/index.js';
|
||||
import type { TranscriptionResult } from './openai.js';
|
||||
import { transcribeAudio as openaiTranscribe } from './openai.js';
|
||||
import { transcribeAudio as mistralTranscribe } from './mistral.js';
|
||||
|
||||
export type { TranscriptionResult } from './openai.js';
|
||||
|
||||
/**
 * Report whether voice transcription can run with the current settings.
 *
 * An `apiKey` under `transcription` in the loaded config is sufficient on its
 * own. Otherwise the environment variable matching the selected provider is
 * consulted: `MISTRAL_API_KEY` for `mistral`, `OPENAI_API_KEY` for anything
 * else (an unset provider is treated as `openai`).
 *
 * @returns `true` when a non-empty key was found, `false` otherwise.
 */
export function isTranscriptionConfigured(): boolean {
  const settings = loadConfig().transcription;

  // A key in the config file wins regardless of which provider is selected.
  if (settings?.apiKey) {
    return true;
  }

  const envVarName = (settings?.provider || 'openai') === 'mistral'
    ? 'MISTRAL_API_KEY'
    : 'OPENAI_API_KEY';
  return Boolean(process.env[envVarName]);
}
|
||||
|
||||
/**
 * Transcribe an audio buffer with whichever provider the config selects.
 *
 * The provider is read from `transcription.provider` in the loaded config;
 * `mistral` routes to the Mistral implementation, every other value
 * (including an unset provider) routes to the OpenAI implementation.
 *
 * @param audioBuffer - Raw audio bytes to transcribe.
 * @param filename - Optional file name, forwarded unchanged to the provider.
 * @param options - Optional extras, forwarded unchanged to the provider.
 * @returns The provider's transcription result.
 */
export async function transcribeAudio(
  audioBuffer: Buffer,
  filename?: string,
  options?: { audioPath?: string }
): Promise<TranscriptionResult> {
  const selectedProvider = loadConfig().transcription?.provider || 'openai';
  const transcribe = selectedProvider === 'mistral' ? mistralTranscribe : openaiTranscribe;
  return transcribe(audioBuffer, filename, options);
}
|
||||
|
||||
244
src/transcription/mistral.ts
Normal file
244
src/transcription/mistral.ts
Normal file
@@ -0,0 +1,244 @@
|
||||
/**
|
||||
* Mistral Voxtral transcription service
|
||||
*
|
||||
* Uses Voxtral Transcribe 2 via the Mistral REST API.
|
||||
* Simple multipart POST — no SDK dependency needed.
|
||||
*/
|
||||
|
||||
import { execFileSync, execSync } from 'node:child_process';
import { writeFileSync, readFileSync, unlinkSync, mkdirSync, mkdtempSync, readdirSync, rmSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { loadConfig } from '../config/index.js';
import type { TranscriptionResult } from './openai.js';
|
||||
|
||||
const MAX_FILE_SIZE = 20 * 1024 * 1024;
|
||||
const CHUNK_DURATION_SECONDS = 600;
|
||||
|
||||
/**
 * Resolve the Mistral API key.
 *
 * Prefers `transcription.apiKey` from the loaded config and falls back to the
 * `MISTRAL_API_KEY` environment variable.
 *
 * @returns The first non-empty key found.
 * @throws Error when neither source provides a key.
 */
function getApiKey(): string {
  const key = loadConfig().transcription?.apiKey || process.env.MISTRAL_API_KEY;
  if (key) {
    return key;
  }
  throw new Error('Mistral API key required for transcription. Set in config (transcription.apiKey) or MISTRAL_API_KEY env var.');
}
|
||||
|
||||
/**
 * Resolve the transcription model name.
 *
 * Lookup order: `transcription.model` from the loaded config, then the
 * `TRANSCRIPTION_MODEL` environment variable, then `voxtral-mini-latest`.
 *
 * @returns The model identifier to send to the API.
 */
function getModel(): string {
  const configuredModel = loadConfig().transcription?.model;
  if (configuredModel) {
    return configuredModel;
  }
  return process.env.TRANSCRIPTION_MODEL || 'voxtral-mini-latest';
}
|
||||
|
||||
/**
 * Audio MIME types keyed by lower-case file extension.
 *
 * Kept in a Map rather than an object literal so that extensions such as
 * `constructor` or `__proto__` cannot resolve to inherited Object.prototype
 * members. Built once at module load instead of on every call.
 */
const AUDIO_MIME_TYPES: ReadonlyMap<string, string> = new Map<string, string>([
  ['ogg', 'audio/ogg'],
  ['oga', 'audio/ogg'],
  ['mp3', 'audio/mpeg'],
  ['mp4', 'audio/mp4'],
  ['m4a', 'audio/mp4'],
  ['wav', 'audio/wav'],
  ['flac', 'audio/flac'],
  ['webm', 'audio/webm'],
]);

/**
 * Map a file name to the audio MIME type used for the upload.
 *
 * The extension is whatever follows the last `.` in the name, compared
 * case-insensitively. A name without a dot is treated as if the whole name
 * were the extension, which simply misses the table.
 *
 * @param filename - File name (or path) of the audio clip.
 * @returns The matching MIME type, or `audio/ogg` when the extension is unknown.
 */
function getMimeType(filename: string): string {
  const ext = filename.split('.').pop()?.toLowerCase() ?? '';
  return AUDIO_MIME_TYPES.get(ext) ?? 'audio/ogg';
}
|
||||
|
||||
const NEEDS_CONVERSION = ['aac', 'amr', 'caf', 'x-caf', '3gp', '3gpp'];
|
||||
|
||||
const FORMAT_MAP: Record<string, string> = {
|
||||
'aac': 'm4a',
|
||||
'amr': 'mp3',
|
||||
'opus': 'ogg',
|
||||
'x-caf': 'm4a',
|
||||
'caf': 'm4a',
|
||||
'3gp': 'mp4',
|
||||
'3gpp': 'mp4',
|
||||
};
|
||||
|
||||
// Cached probe result: null until the first check has run.
let ffmpegAvailable: boolean | null = null;

/**
 * Check whether an `ffmpeg` executable can be located.
 *
 * Runs `which ffmpeg` the first time it is called and remembers the outcome
 * for every later call.
 *
 * @returns `true` when the probe command succeeded, `false` when it threw.
 */
function isFfmpegAvailable(): boolean {
  if (ffmpegAvailable !== null) {
    return ffmpegAvailable;
  }

  let found = true;
  try {
    execSync('which ffmpeg', { stdio: 'ignore' });
  } catch {
    found = false;
  }

  ffmpegAvailable = found;
  return found;
}
|
||||
|
||||
/**
 * Convert an audio buffer to MP3 by running ffmpeg on temporary files.
 *
 * Each call works inside its own freshly created directory under
 * `<tmpdir>/lettabot-transcription`, so concurrent conversions (including
 * ones in other processes) cannot overwrite each other's files. The
 * directory is removed afterwards whether or not the conversion succeeded.
 *
 * ffmpeg is started without a shell and receives its arguments as an array,
 * so nothing in `inputExt` is interpreted as shell syntax.
 *
 * @param audioBuffer - Raw audio bytes in the source format.
 * @param inputExt - Extension of the source format (e.g. `aac`); used only to
 *   name the temporary input file.
 * @returns The MP3-encoded audio.
 * @throws Error when ffmpeg is missing, exits non-zero, or exceeds the
 *   30 second timeout, or when a temporary file cannot be written or read.
 */
function convertAudioToMp3(audioBuffer: Buffer, inputExt: string): Buffer {
  const baseDir = join(tmpdir(), 'lettabot-transcription');
  mkdirSync(baseDir, { recursive: true });

  // Private scratch directory for this call only.
  const workDir = mkdtempSync(join(baseDir, 'convert-'));

  // NOTE(review): inputExt presumably comes from an attachment file name —
  // strip anything that could act as a path separator or odd file-name char.
  const safeExt = inputExt.replace(/[^a-zA-Z0-9-]/g, '') || 'bin';
  const inputPath = join(workDir, `input.${safeExt}`);
  const outputPath = join(workDir, 'output.mp3');

  try {
    writeFileSync(inputPath, audioBuffer);
    execFileSync(
      'ffmpeg',
      ['-y', '-i', inputPath, '-acodec', 'libmp3lame', '-q:a', '2', outputPath],
      // stdio 'ignore' replaces the shell-specific `2>/dev/null` redirect.
      { stdio: 'ignore', timeout: 30000 }
    );
    const converted = readFileSync(outputPath);
    console.log(`[Transcription] Converted ${audioBuffer.length} bytes → ${converted.length} bytes`);
    return converted;
  } finally {
    // Best-effort cleanup; a failure here must not mask the real outcome.
    try { rmSync(workDir, { recursive: true, force: true }); } catch {}
  }
}
|
||||
|
||||
/**
 * Upload one audio buffer to the Mistral transcription endpoint.
 *
 * Posts a multipart form containing the model name and the audio file, then
 * returns the `text` field of the JSON response.
 *
 * @param audioBuffer - Audio bytes to transcribe.
 * @param filename - Name attached to the upload; also selects the MIME type.
 * @returns The transcribed text reported by the API.
 * @throws Error when no API key is available or the API answers with a
 *   non-2xx status (the message carries the status code and response body).
 */
async function attemptTranscription(audioBuffer: Buffer, filename: string): Promise<string> {
  const bearerToken = getApiKey();
  const modelName = getModel();

  const payload = new FormData();
  payload.append('model', modelName);
  payload.append(
    'file',
    new File([new Uint8Array(audioBuffer)], filename, { type: getMimeType(filename) })
  );

  const reply = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
    method: 'POST',
    headers: { 'Authorization': `Bearer ${bearerToken}` },
    body: payload,
  });

  if (reply.ok) {
    const { text } = await reply.json() as { text: string };
    return text;
  }

  throw new Error(`Mistral API error (${reply.status}): ${await reply.text()}`);
}
|
||||
|
||||
/**
 * Split large audio into chunks and transcribe each.
 *
 * The buffer is written to a private temp directory, cut by ffmpeg into MP3
 * segments of `CHUNK_DURATION_SECONDS` each, and the segments are sent to
 * `attemptTranscription` one after another in filename order. Non-empty
 * results are trimmed and joined with single spaces.
 *
 * @param audioBuffer - Bytes of the complete audio file.
 * @param ext - Extension of the audio format; used only to name the temp
 *   input file.
 * @returns The concatenated transcription of all chunks.
 * @throws Error when ffmpeg is unavailable, when ffmpeg produces no chunks,
 *   or when ffmpeg or any chunk transcription fails.
 */
async function transcribeInChunks(audioBuffer: Buffer, ext: string): Promise<string> {
  if (!isFfmpegAvailable()) {
    throw new Error('Cannot split large audio files without ffmpeg');
  }

  // `ext` originates from a caller-supplied filename and ends up inside a
  // shell command line; strip anything that could break out of the quotes.
  const safeExt = ext.replace(/[^a-zA-Z0-9-]/g, '') || 'bin';

  // This function awaits between steps, so two calls can overlap. A random
  // suffix keeps them from sharing (and deleting) each other's directory.
  const runId = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
  const tempDir = join(tmpdir(), 'lettabot-transcription', `chunks-${runId}`);
  mkdirSync(tempDir, { recursive: true });

  const inputPath = join(tempDir, `input.${safeExt}`);
  const outputPattern = join(tempDir, 'chunk-%03d.mp3');

  try {
    writeFileSync(inputPath, audioBuffer);

    // stdio: 'ignore' silences ffmpeg without the POSIX-only `2>/dev/null`.
    execSync(
      `ffmpeg -y -i "${inputPath}" -f segment -segment_time ${CHUNK_DURATION_SECONDS} -reset_timestamps 1 -acodec libmp3lame -q:a 2 "${outputPattern}"`,
      { timeout: 120000, stdio: 'ignore' }
    );

    // Zero-padded names make lexical order equal to playback order.
    const chunkFiles = readdirSync(tempDir)
      .filter(f => f.startsWith('chunk-') && f.endsWith('.mp3'))
      .sort();

    if (chunkFiles.length === 0) {
      throw new Error('Failed to split audio into chunks');
    }

    console.log(`[Transcription] Split into ${chunkFiles.length} chunks`);

    const transcriptions: string[] = [];
    for (const [index, chunkName] of chunkFiles.entries()) {
      const chunkBuffer = readFileSync(join(tempDir, chunkName));
      console.log(`[Transcription] Transcribing chunk ${index + 1}/${chunkFiles.length} (${(chunkBuffer.length / 1024).toFixed(0)}KB)`);
      const text = (await attemptTranscription(chunkBuffer, chunkName)).trim();
      if (text) {
        transcriptions.push(text);
      }
    }

    const combined = transcriptions.join(' ');
    console.log(`[Transcription] Combined ${transcriptions.length} chunks into ${combined.length} chars`);
    return combined;
  } finally {
    // Best-effort cleanup. Each file is removed independently so one failed
    // unlink does not leave the remaining chunks behind.
    try {
      for (const file of readdirSync(tempDir)) {
        try { unlinkSync(join(tempDir, file)); } catch {}
      }
      execSync(`rmdir "${tempDir}"`, { stdio: 'ignore' });
    } catch {}
  }
}
|
||||
|
||||
/**
 * Transcribe audio using Mistral Voxtral API
 *
 * Voxtral supports: wav, mp3, flac, ogg, webm
 * Telegram voice messages (OGG/Opus) work natively.
 *
 * Processing steps:
 *  1. Extensions listed in `NEEDS_CONVERSION` are first uploaded unchanged
 *     under the substitute extension from `FORMAT_MAP`; if that upload fails,
 *     the audio is converted to MP3 with ffmpeg (or an error result is
 *     returned when ffmpeg is missing).
 *  2. Audio larger than `MAX_FILE_SIZE` is split and transcribed in chunks.
 *  3. Everything else is uploaded in a single request.
 *
 * This function does not throw: every failure is reported as
 * `{ success: false, error, audioPath }`.
 *
 * @param audioBuffer - Bytes of the audio to transcribe.
 * @param filename - Name of the audio file; its extension selects the path above.
 * @param options - `audioPath` is echoed back in failure results.
 * @returns `{ success: true, text }` on success, otherwise a failure result.
 */
export async function transcribeAudio(
  audioBuffer: Buffer,
  filename: string = 'audio.ogg',
  options?: { audioPath?: string }
): Promise<TranscriptionResult> {
  const ext = filename.split('.').pop()?.toLowerCase() || '';

  // Nothing to transcribe: fail fast instead of spending an API call (and
  // possibly an ffmpeg run) on an empty upload.
  if (audioBuffer.length === 0) {
    return {
      success: false,
      error: 'Cannot transcribe an empty audio buffer.',
      audioPath: options?.audioPath,
    };
  }

  try {
    let finalBuffer = audioBuffer;
    let finalFilename = filename;

    // Convert unsupported formats via ffmpeg
    if (NEEDS_CONVERSION.includes(ext)) {
      const mapped = FORMAT_MAP[ext];
      if (mapped) {
        // Cheap first attempt: same bytes, different extension.
        console.log(`[Transcription] Trying .${ext} as .${mapped} (no conversion)`);
        finalFilename = filename.replace(/\.[^.]+$/, `.${mapped}`);

        try {
          const text = await attemptTranscription(finalBuffer, finalFilename);
          return { success: true, text };
        } catch (renameError) {
          // Keep the reason in the log; fall through to a real conversion.
          const reason = renameError instanceof Error ? renameError.message : String(renameError);
          console.log(`[Transcription] Rename approach failed for .${ext}: ${reason}`);
        }
      }

      if (!isFfmpegAvailable()) {
        return {
          success: false,
          error: `Cannot transcribe .${ext} format. Install ffmpeg for audio conversion, or send in a supported format (mp3, ogg, wav, flac).`,
          audioPath: options?.audioPath,
        };
      }

      console.log(`[Transcription] Converting .${ext} → .mp3 with ffmpeg`);
      finalBuffer = convertAudioToMp3(audioBuffer, ext);
      finalFilename = filename.replace(/\.[^.]+$/, '.mp3');
    }

    // Check file size and chunk if needed
    if (finalBuffer.length > MAX_FILE_SIZE) {
      const finalExt = finalFilename.split('.').pop()?.toLowerCase() || 'ogg';
      console.log(`[Transcription] File too large (${(finalBuffer.length / 1024 / 1024).toFixed(1)}MB), splitting into chunks`);
      const text = await transcribeInChunks(finalBuffer, finalExt);
      return { success: true, text };
    }

    const text = await attemptTranscription(finalBuffer, finalFilename);
    return { success: true, text };
  } catch (error) {
    const errorMsg = error instanceof Error ? error.message : String(error);
    return {
      success: false,
      error: errorMsg,
      audioPath: options?.audioPath,
    };
  }
}
|
||||
Reference in New Issue
Block a user