Add voice message transcription support (#54)

* Add voice message transcription support (all channels)

  Adds OpenAI Whisper transcription for voice messages across all channels:
  - Telegram: ctx.message.voice
  - WhatsApp: audioMessage via downloadMediaMessage
  - Signal: audio attachments from local files
  - Slack: audio files via url_private_download
  - Discord: audio attachments

  Voice messages are sent to the agent as "[Voice message]: <transcript>".

  Configuration (config takes priority over env):
  - lettabot.yaml: transcription.apiKey, transcription.model
  - Env: OPENAI_API_KEY, TRANSCRIPTION_MODEL

  Closes #47

  Written by Cameron ◯ Letta Code

  "The best interface is no interface - just talk."

* Add voice message documentation to README

  - Add Voice Messages to features list
  - Add configuration section for transcription
  - Document supported channels

  Written by Cameron ◯ Letta Code

* Notify users when voice transcription is not configured

  Instead of silently ignoring voice messages, send a helpful message
  linking to the documentation.

  Written by Cameron ◯ Letta Code

* feat: upgrade to letta-code-sdk main + fix Signal voice transcription

  - Switch from the published SDK (v0.0.3) to the local main branch (file:../letta-code-sdk)
  - Update bot.ts for the new SDK API: createSession(agentId?, options) signature
  - Add conversationId tracking to the store for proper conversation persistence
  - Fix Signal voice transcription: read attachments from ~/.local/share/signal-cli/attachments/
  - Fix Telegram markdown ESM issue: make markdownToTelegramV2 async with a dynamic import
  - Add transcription config to lettabot.yaml
  - Add extensive debug logging for queue and session processing

  Signal voice messages now properly transcribe and send to the agent.

  🐾 Generated with [Letta Code](https://letta.com)

  Co-Authored-By: Letta <noreply@letta.com>

* fix: update Signal CLI message sender to use daemon JSON-RPC API

  - Switch from signal-cli-rest-api to the signal-cli daemon (port 8090)
  - Use the JSON-RPC send method instead of REST /v2/send
  - Support group IDs with the group: prefix
  - Handle 201 responses and empty bodies correctly

  🐾 Generated with [Letta Code](https://letta.com)

  Co-Authored-By: Letta <noreply@letta.com>

* Add placeholder for untranscribed voice messages on Signal

  If a voice-only message arrives and transcription fails or is disabled,
  forward a placeholder so the user knows the message was received.

  Written by Cameron ◯ Letta Code

---------

Co-authored-by: Letta <noreply@letta.com>
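The daemon JSON-RPC switch described above can be sketched as a small payload builder. The `group:` prefix convention, the `send` method, and the `account`/`recipient`/`groupId` params mirror the `sendSignal` diff below; the helper name itself is illustrative, not from the codebase:

```typescript
// Build a JSON-RPC `send` request for the signal-cli daemon.
// Mirrors the routing in sendSignal: `group:`-prefixed chat IDs become
// a groupId param, anything else is treated as a direct recipient.
function buildSignalSendRequest(account: string, chatId: string, text: string) {
  const params: Record<string, unknown> = { account, message: text };
  if (chatId.startsWith('group:')) {
    params.groupId = chatId.slice('group:'.length);
  } else {
    params.recipient = [chatId];
  }
  return { jsonrpc: '2.0', method: 'send', params, id: Date.now() };
}

// A direct message vs. a group message
const direct = buildSignalSendRequest('+15550001111', '+15552223333', 'hi');
const group = buildSignalSendRequest('+15550001111', 'group:abc123', 'hi');
console.log(JSON.stringify(direct.params), JSON.stringify(group.params));
```

The resulting object is what the diff POSTs as the request body to the daemon's RPC endpoint.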
README.md (26 lines changed)
@@ -10,6 +10,7 @@ Your personal AI assistant that remembers everything across **Telegram, Slack, D
 - **Unified Memory** - Single agent remembers everything from all channels
 - **Persistent Memory** - Agent remembers conversations across sessions (days/weeks/months)
 - **Local Tool Execution** - Agent can read files, search code, run commands on your machine
+- **Voice Messages** - Automatic transcription via OpenAI Whisper
 - **Heartbeat** - Periodic check-ins where the agent reviews tasks
 - **Scheduling** - Agent can create one-off reminders and recurring tasks
 - **Streaming Responses** - Real-time message updates as the agent thinks
@@ -97,6 +98,31 @@ That's it! Message your bot on Telegram.
 
 > **Note:** For detailed environment variable reference and multi-channel setup, see [SKILL.md](./SKILL.md)
 
+## Voice Messages
+
+LettaBot can transcribe voice messages using OpenAI Whisper. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
+
+**Supported channels:** Telegram, WhatsApp, Signal, Slack, Discord
+
+### Configuration
+
+Add your OpenAI API key to `lettabot.yaml`:
+
+```yaml
+transcription:
+  provider: openai
+  apiKey: sk-...
+  model: whisper-1 # optional, defaults to whisper-1
+```
+
+Or set it via an environment variable:
+
+```bash
+export OPENAI_API_KEY=sk-...
+```
+
+If no API key is configured, the bot replies with a notice linking to this section instead of transcribing.
+
 ## Skills
 
 LettaBot is compatible with [skills.sh](https://skills.sh) and [Clawdhub](https://clawdhub.com/).
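Across every adapter the transcript is appended to any caption text with the same pattern; a minimal sketch of that composition (the helper name is ours, not from the codebase):

```typescript
// Append a transcript to any existing message text, matching the
// `(text ? text + '\n' : '') + '[Voice message]: ' + transcript`
// pattern used by each channel adapter in this commit.
function withTranscript(existingText: string, transcript: string): string {
  const prefix = existingText ? existingText + '\n' : '';
  return prefix + `[Voice message]: ${transcript}`;
}

// A bare voice note yields just the prefixed transcript; a voice note
// with a caption keeps the caption on its own line above it.
console.log(withTranscript('', 'hello there'));
console.log(withTranscript('check this out', 'hello there'));
```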
package-lock.json (788 lines changed, generated; file diff suppressed because it is too large)
@@ -39,7 +39,7 @@
     "@clack/prompts": "^0.11.0",
     "@hapi/boom": "^10.0.1",
     "@letta-ai/letta-client": "^1.7.6",
-    "@letta-ai/letta-code-sdk": "^0.0.3",
+    "@letta-ai/letta-code-sdk": "file:../letta-code-sdk",
     "@types/express": "^5.0.6",
     "@types/node": "^25.0.10",
     "@types/node-schedule": "^2.1.8",
@@ -50,6 +50,7 @@
     "gray-matter": "^4.0.3",
     "node-schedule": "^2.1.1",
     "open": "^11.0.0",
+    "openai": "^6.17.0",
     "qrcode-terminal": "^0.12.0",
     "telegram-markdown-v2": "^0.0.4",
     "tsx": "^4.21.0",
@@ -132,10 +132,35 @@ Ask the bot owner to approve with:
     this.client.on('messageCreate', async (message) => {
       if (message.author?.bot) return;
 
-      const content = (message.content || '').trim();
+      let content = (message.content || '').trim();
       const userId = message.author?.id;
       if (!userId) return;
 
+      // Handle audio attachments
+      const audioAttachment = message.attachments.find(a => a.contentType?.startsWith('audio/'));
+      if (audioAttachment?.url) {
+        try {
+          const { loadConfig } = await import('../config/index.js');
+          const config = loadConfig();
+          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
+            await message.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+          } else {
+            // Download audio
+            const response = await fetch(audioAttachment.url);
+            const buffer = Buffer.from(await response.arrayBuffer());
+
+            const { transcribeAudio } = await import('../transcription/index.js');
+            const ext = audioAttachment.contentType?.split('/')[1] || 'mp3';
+            const transcript = await transcribeAudio(buffer, audioAttachment.name || `audio.${ext}`);
+
+            console.log(`[Discord] Transcribed audio: "${transcript.slice(0, 50)}..."`);
+            content = (content ? content + '\n' : '') + `[Voice message]: ${transcript}`;
+          }
+        } catch (error) {
+          console.error('[Discord] Error transcribing audio:', error);
+        }
+      }
+
       const access = await this.checkAccess(userId);
       if (access === 'blocked') {
         const ch = message.channel;
@@ -46,6 +46,11 @@ type SignalSseEvent = {
       groupId?: string;
       groupName?: string;
     };
+    attachments?: Array<{
+      contentType?: string;
+      filename?: string;
+      id?: string;
+    }>;
   };
   syncMessage?: {
     sentMessage?: {
@@ -57,6 +62,11 @@ type SignalSseEvent = {
         groupId?: string;
         groupName?: string;
       };
+      attachments?: Array<{
+        contentType?: string;
+        filename?: string;
+        id?: string;
+      }>;
     };
   };
   typingMessage?: {
@@ -444,6 +454,11 @@ This code expires in 1 hour.`;
 
       if (!envelope) return;
 
+      // Debug: log when we receive any message
+      if (envelope.dataMessage || envelope.syncMessage) {
+        console.log('[Signal] Received envelope:', JSON.stringify(envelope, null, 2));
+      }
+
       // Handle incoming data messages (from others)
       const dataMessage = envelope.dataMessage;
@@ -455,23 +470,26 @@ This code expires in 1 hour.`;
       let source: string | undefined;
       let chatId: string | undefined;
       let groupInfo: { groupId?: string; groupName?: string } | undefined;
+      let attachments: Array<{ contentType?: string; filename?: string; id?: string }> | undefined;
 
-      if (dataMessage?.message) {
+      if (dataMessage?.message || dataMessage?.attachments?.length) {
         // Regular incoming message
         messageText = dataMessage.message;
         source = envelope.source || envelope.sourceUuid;
         groupInfo = dataMessage.groupInfo;
+        attachments = dataMessage.attachments;
 
         if (groupInfo?.groupId) {
           chatId = `group:${groupInfo.groupId}`;
         } else {
           chatId = source;
         }
-      } else if (syncMessage?.message) {
+      } else if (syncMessage?.message || syncMessage?.attachments?.length) {
         // Sync message (Note to Self or sent from another device)
         messageText = syncMessage.message;
         source = syncMessage.destination || syncMessage.destinationUuid;
         groupInfo = syncMessage.groupInfo;
+        attachments = syncMessage.attachments;
 
         // For Note to Self, destination is our own number
         const isNoteToSelf = source === this.config.phoneNumber ||
@@ -487,20 +505,73 @@ This code expires in 1 hour.`;
         }
       }
 
-      if (!messageText || !source || !chatId) {
+      // Check if we have a valid message before attachment processing
+      if (!source || !chatId) {
         return;
       }
 
+      // Handle voice message attachments
+      const voiceAttachment = attachments?.find(a => a.contentType?.startsWith('audio/'));
+      if (voiceAttachment?.id) {
+        console.log(`[Signal] Voice attachment detected: ${voiceAttachment.contentType}, id: ${voiceAttachment.id}`);
+        try {
+          const { loadConfig } = await import('../config/index.js');
+          const config = loadConfig();
+          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
+            if (chatId) {
+              await this.sendMessage({
+                chatId,
+                text: 'Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages'
+              });
+            }
+          } else {
+            // Read attachment from signal-cli attachments directory
+            const { readFileSync } = await import('node:fs');
+            const { homedir } = await import('node:os');
+            const { join } = await import('node:path');
+
+            const attachmentPath = join(homedir(), '.local/share/signal-cli/attachments', voiceAttachment.id);
+            console.log(`[Signal] Reading attachment from: ${attachmentPath}`);
+            const buffer = readFileSync(attachmentPath);
+            console.log(`[Signal] Read ${buffer.length} bytes`);
+
+            const { transcribeAudio } = await import('../transcription/index.js');
+            const ext = voiceAttachment.contentType?.split('/')[1] || 'ogg';
+            const transcript = await transcribeAudio(buffer, `voice.${ext}`);
+
+            console.log(`[Signal] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
+            messageText = (messageText ? messageText + '\n' : '') + `[Voice message]: ${transcript}`;
+          }
+        } catch (error) {
+          console.error('[Signal] Error transcribing voice message:', error);
+        }
+      }
+
+      // After processing attachments, check if we have any message content.
+      // If this was a voice-only message and transcription failed/was disabled,
+      // still forward a placeholder so the user knows we got it.
+      if (!messageText && voiceAttachment?.id) {
+        messageText = '[Voice message received]';
+      }
+      if (!messageText) {
+        return;
+      }
+
       // Handle Note to Self - check selfChatMode
+      console.log(`[Signal] Processing message: chatId=${chatId}, source=${source}, selfChatMode=${this.config.selfChatMode}`);
       if (chatId === 'note-to-self') {
         if (!this.config.selfChatMode) {
           // selfChatMode disabled - ignore Note to Self messages
+          console.log('[Signal] Note to Self ignored (selfChatMode disabled)');
           return;
         }
         // selfChatMode enabled - allow the message through
+        console.log('[Signal] Note to Self allowed (selfChatMode enabled)');
       } else {
         // External message - check access control
+        console.log('[Signal] Checking access for external message');
         const access = await this.checkAccess(source);
+        console.log(`[Signal] Access result: ${access}`);
 
         if (access === 'blocked') {
           console.log(`[Signal] Blocked message from unauthorized user: ${source}`);
@@ -44,16 +44,44 @@ export class SlackAdapter implements ChannelAdapter {
     });
 
     // Handle messages
-    this.app.message(async ({ message, say }) => {
+    this.app.message(async ({ message, say, client }) => {
       // Type guard for regular messages
       if (message.subtype !== undefined) return;
       if (!('user' in message) || !('text' in message)) return;
 
       const userId = message.user;
-      const text = message.text || '';
+      let text = message.text || '';
       const channelId = message.channel;
       const threadTs = message.thread_ts || message.ts; // Reply in thread if applicable
 
+      // Handle audio file attachments
+      const files = (message as any).files as Array<{ mimetype?: string; url_private_download?: string; name?: string }> | undefined;
+      const audioFile = files?.find(f => f.mimetype?.startsWith('audio/'));
+      if (audioFile?.url_private_download) {
+        try {
+          const { loadConfig } = await import('../config/index.js');
+          const config = loadConfig();
+          if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
+            await say('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+          } else {
+            // Download file (requires bot token for auth)
+            const response = await fetch(audioFile.url_private_download, {
+              headers: { 'Authorization': `Bearer ${this.config.botToken}` }
+            });
+            const buffer = Buffer.from(await response.arrayBuffer());
+
+            const { transcribeAudio } = await import('../transcription/index.js');
+            const ext = audioFile.mimetype?.split('/')[1] || 'mp3';
+            const transcript = await transcribeAudio(buffer, audioFile.name || `audio.${ext}`);
+
+            console.log(`[Slack] Transcribed audio: "${transcript.slice(0, 50)}..."`);
+            text = (text ? text + '\n' : '') + `[Voice message]: ${transcript}`;
+          }
+        } catch (error) {
+          console.error('[Slack] Error transcribing audio:', error);
+        }
+      }
+
       // Check allowed users
       if (this.config.allowedUsers && this.config.allowedUsers.length > 0) {
         if (!this.config.allowedUsers.includes(userId)) {
@@ -5,18 +5,18 @@
  * Supports: headers, bold, italic, code, links, blockquotes, lists, etc.
  */
 
-import { convert } from 'telegram-markdown-v2';
-
 /**
  * Convert markdown to Telegram MarkdownV2 format.
  * Handles proper escaping of special characters.
  */
-export function markdownToTelegramV2(markdown: string): string {
+export async function markdownToTelegramV2(markdown: string): Promise<string> {
   try {
+    // Dynamic import to avoid ESM/CommonJS compatibility issues
+    const { convert } = await import('telegram-markdown-v2');
     // Use 'keep' strategy to preserve blockquotes (>) and other elements
     return convert(markdown, 'keep');
   } catch (e) {
-    console.error('[Telegram] Markdown conversion failed:', e);
+    console.error('[Telegram] Markdown conversion failed, using fallback:', e);
     // Fallback: escape special characters manually
     return escapeMarkdownV2(markdown);
   }
@@ -167,6 +167,56 @@ export class TelegramAdapter implements ChannelAdapter {
       }
     });
 
+    // Handle voice messages
+    this.bot.on('message:voice', async (ctx) => {
+      const userId = ctx.from?.id;
+      const chatId = ctx.chat.id;
+
+      if (!userId) return;
+
+      // Check if transcription is configured (config or env)
+      const { loadConfig } = await import('../config/index.js');
+      const config = loadConfig();
+      if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
+        await ctx.reply('Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages');
+        return;
+      }
+
+      try {
+        // Get file link
+        const voice = ctx.message.voice;
+        const file = await ctx.api.getFile(voice.file_id);
+        const fileUrl = `https://api.telegram.org/file/bot${this.config.token}/${file.file_path}`;
+
+        // Download audio
+        const response = await fetch(fileUrl);
+        const buffer = Buffer.from(await response.arrayBuffer());
+
+        // Transcribe
+        const { transcribeAudio } = await import('../transcription/index.js');
+        const transcript = await transcribeAudio(buffer, 'voice.ogg');
+
+        console.log(`[Telegram] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
+
+        // Send to agent as text with prefix
+        if (this.onMessage) {
+          await this.onMessage({
+            channel: 'telegram',
+            chatId: String(chatId),
+            userId: String(userId),
+            userName: ctx.from.username || ctx.from.first_name,
+            messageId: String(ctx.message.message_id),
+            text: `[Voice message]: ${transcript}`,
+            timestamp: new Date(),
+          });
+        }
+      } catch (error) {
+        console.error('[Telegram] Error processing voice message:', error);
+        // Optionally notify user
+        await ctx.reply('Sorry, I could not transcribe that voice message.');
+      }
+    });
+
     // Error handler
     this.bot.catch((err) => {
       console.error('[Telegram] Bot error:', err);
@@ -199,7 +249,7 @@ export class TelegramAdapter implements ChannelAdapter {
     const { markdownToTelegramV2 } = await import('./telegram-format.js');
 
     // Convert markdown to Telegram MarkdownV2 format
-    const formatted = markdownToTelegramV2(msg.text);
+    const formatted = await markdownToTelegramV2(msg.text);
 
     const result = await this.bot.api.sendMessage(msg.chatId, formatted, {
       parse_mode: 'MarkdownV2',
@@ -210,7 +260,7 @@ export class TelegramAdapter implements ChannelAdapter {
 
   async editMessage(chatId: string, messageId: string, text: string): Promise<void> {
     const { markdownToTelegramV2 } = await import('./telegram-format.js');
-    const formatted = markdownToTelegramV2(text);
+    const formatted = await markdownToTelegramV2(text);
     await this.bot.api.editMessageText(chatId, Number(messageId), formatted, { parse_mode: 'MarkdownV2' });
   }
@@ -142,6 +142,7 @@ Ask the bot owner to approve with:
       DisconnectReason,
       fetchLatestBaileysVersion,
       makeCacheableSignalKeyStore,
+      downloadMediaMessage,
     } = await import('@whiskeysockets/baileys');
 
     // Load auth state
@@ -253,10 +254,38 @@ Ask the bot owner to approve with:
           this.lidToJid.set(remoteJid, (m.key as any).senderPn);
         }
 
-        // Get message text
-        const text = m.message?.conversation ||
-          m.message?.extendedTextMessage?.text ||
-          '';
+        // Get message text or audio
+        let text = m.message?.conversation ||
+          m.message?.extendedTextMessage?.text ||
+          '';
+
+        // Handle audio/voice messages
+        const audioMessage = m.message?.audioMessage;
+        if (audioMessage) {
+          try {
+            const { loadConfig } = await import('../config/index.js');
+            const config = loadConfig();
+            if (!config.transcription?.apiKey && !process.env.OPENAI_API_KEY) {
+              await this.sock!.sendMessage(remoteJid, {
+                text: 'Voice messages require OpenAI API key for transcription. See: https://github.com/letta-ai/lettabot#voice-messages'
+              });
+              continue;
+            }
+
+            // Download audio
+            const buffer = await downloadMediaMessage(m, 'buffer', {});
+
+            // Transcribe
+            const { transcribeAudio } = await import('../transcription/index.js');
+            const transcript = await transcribeAudio(buffer as Buffer, 'voice.ogg');
+
+            console.log(`[WhatsApp] Transcribed voice message: "${transcript.slice(0, 50)}..."`);
+            text = `[Voice message]: ${transcript}`;
+          } catch (error) {
+            console.error('[WhatsApp] Error transcribing voice message:', error);
+            continue;
+          }
+        }
+
         if (!text) continue;
@@ -100,26 +100,59 @@ async function sendSlack(chatId: string, text: string): Promise<void> {
 }
 
 async function sendSignal(chatId: string, text: string): Promise<void> {
-  const apiUrl = process.env.SIGNAL_CLI_REST_API_URL || 'http://localhost:8080';
+  // We talk to the signal-cli daemon JSON-RPC API (the same daemon the Signal adapter uses).
+  // This is *not* the signal-cli-rest-api container.
+  const apiUrl = process.env.SIGNAL_CLI_REST_API_URL || 'http://127.0.0.1:8090';
   const phoneNumber = process.env.SIGNAL_PHONE_NUMBER;
 
   if (!phoneNumber) {
     throw new Error('SIGNAL_PHONE_NUMBER not set');
   }
 
-  const response = await fetch(`${apiUrl}/v2/send`, {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify({
-      message: text,
-      number: phoneNumber,
-      recipients: [chatId],
-    }),
+  // Support group IDs in the same format we use everywhere else.
+  const params: Record<string, unknown> = {
+    account: phoneNumber,
+    message: text,
+  };
+
+  if (chatId.startsWith('group:')) {
+    params.groupId = chatId.slice('group:'.length);
+  } else {
+    params.recipient = [chatId];
+  }
+
+  const body = JSON.stringify({
+    jsonrpc: '2.0',
+    method: 'send',
+    params,
+    id: Date.now(),
+  });
+
+  const response = await fetch(`${apiUrl}/api/v1/rpc`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body,
   });
 
+  // signal-cli returns status 201 with empty body sometimes.
+  if (response.status === 201) {
+    console.log(`✓ Sent to signal:${chatId}`);
+    return;
+  }
+
+  const textBody = await response.text();
   if (!response.ok) {
-    const error = await response.text();
-    throw new Error(`Signal API error: ${error}`);
+    throw new Error(`Signal API error: ${textBody}`);
   }
 
+  if (!textBody.trim()) {
+    console.log(`✓ Sent to signal:${chatId}`);
+    return;
+  }
+
+  const parsed = JSON.parse(textBody) as { result?: unknown; error?: { code?: number; message?: string } };
+  if (parsed.error) {
+    throw new Error(`Signal RPC ${parsed.error.code ?? 'unknown'}: ${parsed.error.message ?? 'unknown error'}`);
+  }
+
   console.log(`✓ Sent to signal:${chatId}`);
@@ -257,7 +290,7 @@ Environment variables:
   SLACK_BOT_TOKEN          Required for Slack
   DISCORD_BOT_TOKEN        Required for Discord
   SIGNAL_PHONE_NUMBER      Required for Signal
-  SIGNAL_CLI_REST_API_URL  Signal API URL (default: http://localhost:8080)
+  SIGNAL_CLI_REST_API_URL  Signal daemon URL (default: http://127.0.0.1:8090)
 `);
 }
@@ -49,6 +49,15 @@ export interface LettaBotConfig {
   integrations?: {
     google?: GoogleConfig;
   };
+
+  // Transcription (voice messages)
+  transcription?: TranscriptionConfig;
 }
 
+export interface TranscriptionConfig {
+  provider: 'openai'; // Only OpenAI supported currently
+  apiKey?: string;    // Falls back to OPENAI_API_KEY env var
+  model?: string;     // Defaults to 'whisper-1'
+}
+
 export interface ProviderConfig {
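The config-over-env precedence that the commit message describes can be sketched against this interface; the resolver function is illustrative, not part of the diff:

```typescript
interface TranscriptionConfig {
  provider: 'openai';
  apiKey?: string; // falls back to OPENAI_API_KEY
  model?: string;  // defaults to 'whisper-1'
}

// Resolve the effective key and model: explicit config wins, env is the
// fallback, and a missing key means voice messages cannot be transcribed.
function resolveTranscription(
  cfg: TranscriptionConfig | undefined,
  env: Record<string, string | undefined>
): { apiKey: string; model: string } | null {
  const apiKey = cfg?.apiKey ?? env.OPENAI_API_KEY;
  const model = cfg?.model ?? env.TRANSCRIPTION_MODEL ?? 'whisper-1';
  return apiKey ? { apiKey, model } : null;
}
```

This matches the guard used in every adapter: `!config.transcription?.apiKey && !process.env.OPENAI_API_KEY` means no transcription is attempted.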
@@ -121,10 +121,14 @@ export class LettaBot {
 
     // Add to queue
     this.messageQueue.push({ msg, adapter });
+    console.log(`[Queue] Added to queue, length: ${this.messageQueue.length}, processing: ${this.processing}`);
 
     // Process queue if not already processing
     if (!this.processing) {
-      this.processQueue();
+      console.log('[Queue] Starting queue processing');
+      this.processQueue().catch(err => console.error('[Queue] Fatal error in processQueue:', err));
+    } else {
+      console.log('[Queue] Already processing, will process when current message finishes');
     }
   }
@@ -132,12 +136,18 @@ export class LettaBot {
    * Process messages one at a time
    */
   private async processQueue(): Promise<void> {
-    if (this.processing || this.messageQueue.length === 0) return;
+    console.log(`[Queue] processQueue called: processing=${this.processing}, queueLength=${this.messageQueue.length}`);
+    if (this.processing || this.messageQueue.length === 0) {
+      console.log('[Queue] Exiting early: already processing or empty queue');
+      return;
+    }
 
     this.processing = true;
+    console.log('[Queue] Started processing');
 
     while (this.messageQueue.length > 0) {
       const { msg, adapter } = this.messageQueue.shift()!;
+      console.log(`[Queue] Processing message from ${msg.userId} (${this.messageQueue.length} remaining)`);
       try {
         await this.processMessage(msg, adapter);
       } catch (error) {
@@ -145,6 +155,7 @@ export class LettaBot {
       }
     }
 
+    console.log('[Queue] Finished processing all messages');
    this.processing = false;
   }
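The `processing` flag in the hunk above serializes messages so only one is handled at a time; a stripped-down sketch of the same single-consumer queue pattern (class and names simplified, not from the codebase):

```typescript
// Minimal single-flight queue: enqueue() may be called at any time,
// but items are handled strictly one at a time, in arrival order.
class SerialQueue<T> {
  private queue: T[] = [];
  private processing = false;

  constructor(private handler: (item: T) => Promise<void>) {}

  enqueue(item: T): void {
    this.queue.push(item);
    if (!this.processing) {
      // Fire and forget, but surface fatal errors like the diff does.
      this.process().catch(err => console.error('[Queue] Fatal error:', err));
    }
  }

  private async process(): Promise<void> {
    if (this.processing || this.queue.length === 0) return;
    this.processing = true;
    while (this.queue.length > 0) {
      const item = this.queue.shift()!;
      try {
        await this.handler(item);
      } catch (err) {
        // One failing item must not stall the rest of the queue.
        console.error('[Queue] Handler error:', err);
      }
    }
    this.processing = false;
  }
}
```

Items enqueued while a handler is still awaiting are picked up by the same `while` loop, so no second consumer ever runs concurrently.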
@@ -152,6 +163,7 @@ export class LettaBot {
    * Process a single message
    */
   private async processMessage(msg: InboundMessage, adapter: ChannelAdapter): Promise<void> {
+    console.log('[Bot] Starting processMessage');
     // Track when user last sent a message (for heartbeat skip logic)
     this.lastUserMessageTime = new Date();
@@ -163,32 +175,39 @@ export class LettaBot {
       updatedAt: new Date().toISOString(),
     };
 
+    console.log('[Bot] Sending typing indicator');
     // Start typing indicator
     await adapter.sendTypingIndicator(msg.chatId);
+    console.log('[Bot] Typing indicator sent');
 
     // Create or resume session
     let session: Session;
     // Base options for all sessions (model only included for new agents)
     // Note: canUseTool workaround for SDK v0.0.3 bug - can be removed after letta-ai/letta-code-sdk#10 is released
     const baseOptions = {
       permissionMode: 'bypassPermissions' as const,
       allowedTools: this.config.allowedTools,
       cwd: this.config.workingDir,
       systemPrompt: SYSTEM_PROMPT,
       canUseTool: () => ({ allow: true }),
     };
 
+    console.log('[Bot] Creating/resuming session');
     try {
-      if (this.store.agentId) {
+      if (this.store.conversationId) {
+        // Resume the specific conversation we've been using
+        console.log(`[Bot] Resuming conversation: ${this.store.conversationId}`);
+        process.env.LETTA_AGENT_ID = this.store.agentId || undefined;
+        session = resumeSession(this.store.conversationId, baseOptions);
+      } else if (this.store.agentId) {
+        // Agent exists but no conversation - try default conversation
+        console.log(`[Bot] Resuming agent default conversation: ${this.store.agentId}`);
         process.env.LETTA_AGENT_ID = this.store.agentId;
 
         // Don't pass model when resuming - agent already has its model configured
         session = resumeSession(this.store.agentId, baseOptions);
       } else {
-        // Only pass model when creating a new agent
-        session = createSession({ ...baseOptions, model: this.config.model, memory: loadMemoryBlocks(this.config.agentName) });
+        // Create new agent with default conversation
+        console.log('[Bot] Creating new agent');
+        session = createSession(undefined, { ...baseOptions, model: this.config.model, memory: loadMemoryBlocks(this.config.agentName) });
       }
+      console.log('[Bot] Session created/resumed');
 
       const initTimeoutMs = 30000; // 30s timeout
       const withTimeout = async <T>(promise: Promise<T>, label: string): Promise<T> => {
@@ -301,21 +320,26 @@ export class LettaBot {
       }
 
       if (streamMsg.type === 'result') {
-        // Save agent ID and attach ignore tool (only on first message)
+        // Save agent ID and conversation ID
         if (session.agentId && session.agentId !== this.store.agentId) {
           const isNewAgent = !this.store.agentId;
           // Save agent ID along with the current server URL
           const currentBaseUrl = process.env.LETTA_BASE_URL || 'https://api.letta.com';
-          this.store.setAgent(session.agentId, currentBaseUrl);
-          console.log('Saved agent ID:', session.agentId, 'on server:', currentBaseUrl);
+          this.store.setAgent(session.agentId, currentBaseUrl, session.conversationId || undefined);
+          console.log('Saved agent ID:', session.agentId, 'conversation ID:', session.conversationId, 'on server:', currentBaseUrl);
 
           // Setup new agents: set name, install skills
           if (isNewAgent) {
-            if (this.config.agentName) {
+            if (this.config.agentName && session.agentId) {
               updateAgentName(session.agentId, this.config.agentName).catch(() => {});
             }
-            installSkillsToAgent(session.agentId);
+            if (session.agentId) {
+              installSkillsToAgent(session.agentId);
+            }
           }
+        } else if (session.conversationId && session.conversationId !== this.store.conversationId) {
+          // Update conversation ID if it changed
+          this.store.conversationId = session.conversationId;
         }
         break;
       }
@@ -376,22 +400,23 @@ export class LettaBot {
|
||||
_context?: TriggerContext
|
||||
): Promise<string> {
|
||||
// Base options (model only for new agents)
|
||||
// Note: canUseTool workaround for SDK v0.0.3 bug - can be removed after letta-ai/letta-code-sdk#10 is released
|
||||
const baseOptions = {
|
||||
permissionMode: 'bypassPermissions' as const,
|
||||
allowedTools: this.config.allowedTools,
|
||||
cwd: this.config.workingDir,
|
||||
systemPrompt: SYSTEM_PROMPT,
|
||||
canUseTool: () => ({ allow: true }),
|
||||
};
|
||||
|
||||
let session: Session;
|
||||
if (this.store.agentId) {
|
||||
// Don't pass model when resuming - agent already has its model configured
|
||||
if (this.store.conversationId) {
|
||||
// Resume the specific conversation we've been using
|
||||
session = resumeSession(this.store.conversationId, baseOptions);
|
||||
} else if (this.store.agentId) {
|
||||
// Agent exists but no conversation - try default conversation
|
||||
session = resumeSession(this.store.agentId, baseOptions);
|
||||
} else {
|
||||
// Only pass model when creating a new agent
|
||||
session = createSession({ ...baseOptions, model: this.config.model, memory: loadMemoryBlocks(this.config.agentName) });
|
||||
// Create new agent with default conversation
|
||||
session = createSession(undefined, { ...baseOptions, model: this.config.model, memory: loadMemoryBlocks(this.config.agentName) });
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -406,7 +431,9 @@ export class LettaBot {
       if (msg.type === 'result') {
         if (session.agentId && session.agentId !== this.store.agentId) {
           const currentBaseUrl = process.env.LETTA_BASE_URL || 'https://api.letta.com';
-          this.store.setAgent(session.agentId, currentBaseUrl);
+          this.store.setAgent(session.agentId, currentBaseUrl, session.conversationId || undefined);
+        } else if (session.conversationId && session.conversationId !== this.store.conversationId) {
+          this.store.conversationId = session.conversationId;
         }
         break;
       }
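The hunk above applies a simple persistence rule when a `result` message arrives: a new `agentId` wins and carries the `conversationId` along; otherwise a changed `conversationId` alone is stored. A minimal standalone sketch of that rule (hypothetical `StoreLike` shape, not the SDK's actual types):

```typescript
// Hypothetical sketch of the persistence rule for 'result' messages.
interface StoreLike {
  agentId: string | null;
  conversationId: string | null;
}

function persistIds(
  store: StoreLike,
  session: { agentId?: string; conversationId?: string }
): StoreLike {
  // A new agentId takes precedence and carries the conversationId with it
  if (session.agentId && session.agentId !== store.agentId) {
    return {
      agentId: session.agentId,
      conversationId: session.conversationId ?? store.conversationId,
    };
  }
  // Same agent, but the conversation moved on
  if (session.conversationId && session.conversationId !== store.conversationId) {
    return { ...store, conversationId: session.conversationId };
  }
  return store;
}
```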
@@ -53,6 +53,15 @@ export class Store {
     this.save();
   }

+  get conversationId(): string | null {
+    return this.data.conversationId || null;
+  }
+
+  set conversationId(id: string | null) {
+    this.data.conversationId = id;
+    this.save();
+  }
+
   get baseUrl(): string | undefined {
     return this.data.baseUrl;
   }
@@ -65,9 +74,10 @@ export class Store {
   /**
    * Set agent ID and associated server URL together
    */
-  setAgent(id: string | null, baseUrl?: string): void {
+  setAgent(id: string | null, baseUrl?: string, conversationId?: string): void {
     this.data.agentId = id;
     this.data.baseUrl = baseUrl;
+    this.data.conversationId = conversationId || this.data.conversationId;
     this.data.lastUsedAt = new Date().toISOString();
     if (id && !this.data.createdAt) {
       this.data.createdAt = new Date().toISOString();
@@ -101,6 +101,7 @@ export interface LastMessageTarget {
  */
 export interface AgentStore {
   agentId: string | null;
+  conversationId?: string | null; // Current conversation ID
   baseUrl?: string; // Server URL this agent belongs to
   createdAt?: string;
   lastUsedAt?: string;
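The `Store` changes above can be exercised with an in-memory sketch that mirrors the diff: `setAgent` keeps the previous `conversationId` when none is supplied, so resuming a conversation survives agent re-registration. This is a minimal sketch with a no-op `save()` standing in for the real file persistence:

```typescript
// In-memory sketch of the Store additions; save() is a stand-in for disk I/O.
interface AgentStoreData {
  agentId: string | null;
  conversationId?: string | null;
  baseUrl?: string;
  createdAt?: string;
  lastUsedAt?: string;
}

class StoreSketch {
  private data: AgentStoreData = { agentId: null };

  private save(): void {
    // The real Store writes this.data to disk here
  }

  get conversationId(): string | null {
    return this.data.conversationId || null;
  }

  set conversationId(id: string | null) {
    this.data.conversationId = id;
    this.save();
  }

  setAgent(id: string | null, baseUrl?: string, conversationId?: string): void {
    this.data.agentId = id;
    this.data.baseUrl = baseUrl;
    // Keep the previous conversationId when the caller does not supply one
    this.data.conversationId = conversationId || this.data.conversationId;
    this.data.lastUsedAt = new Date().toISOString();
    if (id && !this.data.createdAt) {
      this.data.createdAt = new Date().toISOString();
    }
    this.save();
  }
}
```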
7  src/transcription/index.ts  Normal file
@@ -0,0 +1,7 @@
+/**
+ * Transcription service
+ *
+ * Currently supports OpenAI Whisper. Future providers can be added here.
+ */
+
+export { transcribeAudio } from './openai.js';
69  src/transcription/openai.ts  Normal file
@@ -0,0 +1,69 @@
+/**
+ * OpenAI Whisper transcription service
+ */
+
+import OpenAI from 'openai';
+import { loadConfig } from '../config/index.js';
+
+let openaiClient: OpenAI | null = null;
+
+function getClient(): OpenAI {
+  if (!openaiClient) {
+    const config = loadConfig();
+    // Config takes priority, then env var
+    const apiKey = config.transcription?.apiKey || process.env.OPENAI_API_KEY;
+    if (!apiKey) {
+      throw new Error('OpenAI API key required for transcription. Set in config (transcription.apiKey) or OPENAI_API_KEY env var.');
+    }
+    openaiClient = new OpenAI({ apiKey });
+  }
+  return openaiClient;
+}
+
+function getModel(): string {
+  const config = loadConfig();
+  return config.transcription?.model || process.env.TRANSCRIPTION_MODEL || 'whisper-1';
+}
+
+/**
+ * Transcribe audio using OpenAI Whisper API
+ *
+ * @param audioBuffer - The audio data as a Buffer
+ * @param filename - Filename with extension (e.g., 'voice.ogg')
+ * @returns The transcribed text
+ */
+export async function transcribeAudio(audioBuffer: Buffer, filename: string = 'audio.ogg'): Promise<string> {
+  const client = getClient();
+
+  // Create a File object from the buffer; the OpenAI SDK expects a File-like object.
+  // Convert Buffer to Uint8Array to satisfy the BlobPart type.
+  const file = new File([new Uint8Array(audioBuffer)], filename, {
+    type: getMimeType(filename)
+  });
+
+  const response = await client.audio.transcriptions.create({
+    file,
+    model: getModel(),
+  });
+
+  return response.text;
+}
+
+/**
+ * Get MIME type from filename extension
+ */
+function getMimeType(filename: string): string {
+  const ext = filename.split('.').pop()?.toLowerCase();
+  const mimeTypes: Record<string, string> = {
+    'ogg': 'audio/ogg',
+    'mp3': 'audio/mpeg',
+    'mp4': 'audio/mp4',
+    'm4a': 'audio/mp4',
+    'wav': 'audio/wav',
+    'webm': 'audio/webm',
+    'mpeg': 'audio/mpeg',
+    'mpga': 'audio/mpeg',
+  };
+  return mimeTypes[ext || ''] || 'audio/ogg';
+}