docs: consolidate voice documentation into docs/voice.md (#485)

2026-03-04 16:18:37 -08:00
parent 025fd38d5f
commit bb0ccd65e1
16 changed files with 249 additions and 96 deletions
--- a/README.md
+++ b/README.md
@@ -107,48 +107,13 @@ That's it! Message your bot on Telegram.

 > **Note:** For detailed environment variable reference and multi-channel setup, see [SKILL.md](./SKILL.md)

-## Voice Messages
+## Voice

-LettaBot can transcribe voice messages using either OpenAI Whisper or Mistral Voxtral. Voice messages are automatically converted to text and sent to the agent with a `[Voice message]:` prefix.
+LettaBot can transcribe incoming voice messages (via OpenAI Whisper or Mistral Voxtral) and reply with voice memos (via ElevenLabs or OpenAI TTS). Voice notes render as native bubbles on Telegram and WhatsApp.

 **Supported channels:** Telegram, WhatsApp, Signal, Slack, Discord

-### Configuration
-
-**Option 1: OpenAI Whisper**
-
-Add your OpenAI API key to `lettabot.yaml`:
-
-```yaml
-transcription:
-  provider: openai
-  apiKey: sk-...
-  model: whisper-1  # optional, defaults to whisper-1
-```
-
-Or set via environment variable:
-
-```bash
-export OPENAI_API_KEY=sk-...
-```
-
-**Option 2: Mistral Voxtral** (2x faster, 2x cheaper)
-
-Add your Mistral API key to `lettabot.yaml`:
-
-```yaml
-transcription:
-  provider: mistral
-  apiKey: ...
-```
-
-Or set via environment variable:
-
-```bash
-export MISTRAL_API_KEY=...
-```
-
-If no API key is configured, users will receive an error message with a link to this section.
+See [docs/voice.md](./docs/voice.md) for full setup, configuration, and troubleshooting.

 ## Skills
 LettaBot is compatible with [skills.sh](https://skills.sh) and [Clawdhub](https://clawdhub.com/). 
--- a/docs/cli-tools.md
+++ b/docs/cli-tools.md
@@ -11,7 +11,7 @@ Send a message to the most recent chat, or target a specific channel/chat.
 lettabot-message send --text "Hello from a background task"
 lettabot-message send --text "Hello" --channel slack --chat C123456
 lettabot-message send --file /tmp/report.pdf --text "Report attached" --channel discord --chat 123456789
-lettabot-message send --file /tmp/voice.ogg --voice    # Send as native voice note
+lettabot-message send --file /tmp/voice.ogg --voice    # Send as native voice note (see voice.md)
 ```

 ## lettabot-react
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -809,44 +809,30 @@ The top-level `polling` section takes priority if both are present.

 ## Transcription Configuration

-Voice message transcription via OpenAI Whisper:
+Voice message transcription (OpenAI Whisper or Mistral Voxtral):

 ```yaml
 transcription:
-  provider: openai
-  apiKey: sk-...       # Optional: uses OPENAI_API_KEY env var
-  model: whisper-1     # Default
+  provider: openai       # "openai" (default) or "mistral"
+  apiKey: sk-...         # Optional: falls back to OPENAI_API_KEY / MISTRAL_API_KEY env var
+  model: whisper-1       # Default (OpenAI) or voxtral-mini-latest (Mistral)
 ```

+See [voice.md](./voice.md) for provider details, supported formats, and troubleshooting.
+
 ## Text-to-Speech (TTS) Configuration

-Voice memo generation via the `<voice>` directive. The agent can reply with voice notes on Telegram and WhatsApp:
+Voice memo generation via the `<voice>` directive (ElevenLabs or OpenAI):

 ```yaml
 tts:
  provider: elevenlabs    # "elevenlabs" (default) or "openai"
  apiKey: sk_475a...      # Provider API key
-  voiceId: 21m00Tcm4TlvDq8ikWAM  # Voice selection (see below)
+  voiceId: onwK4e9ZLuTAKqWW03F9   # Voice selection
  model: eleven_multilingual_v2   # Optional model override
 ```

-**ElevenLabs** (default):
- `voiceId` is an ElevenLabs voice ID. Default: `21m00Tcm4TlvDq8ikWAM` (Rachel). Browse voices at [elevenlabs.io/voice-library](https://elevenlabs.io/voice-library).
- `model` defaults to `eleven_multilingual_v2`.
-
-**OpenAI**:
- `voiceId` is one of: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`. Default: `alloy`.
- `model` defaults to `tts-1`. Use `tts-1-hd` for higher quality.
-
-The agent uses the `<voice>` directive in responses:
-
-```xml
-<actions>
-  <voice>Hey, here's a quick voice reply!</voice>
-</actions>
-```
-
-The `lettabot-tts` CLI tool is also available for background tasks (heartbeats, cron).
+See [voice.md](./voice.md) for provider options, channel support, and CLI tools.

 ## Attachments Configuration

@@ -987,7 +973,7 @@ Reference:
 | `LETTABOT_WORKING_DIR` | Agent working directory (overridden by per-agent `workingDir`) |
 | `TTS_PROVIDER` | TTS backend: `elevenlabs` (default) or `openai` |
 | `ELEVENLABS_API_KEY` | API key for ElevenLabs TTS |
-| `ELEVENLABS_VOICE_ID` | ElevenLabs voice ID (default: `21m00Tcm4TlvDq8ikWAM` / Rachel) |
+| `ELEVENLABS_VOICE_ID` | ElevenLabs voice ID (default: `onwK4e9ZLuTAKqWW03F9`) |
 | `ELEVENLABS_MODEL_ID` | ElevenLabs model (default: `eleven_multilingual_v2`) |
 | `OPENAI_TTS_VOICE` | OpenAI TTS voice (default: `alloy`) |
 | `OPENAI_TTS_MODEL` | OpenAI TTS model (default: `tts-1`) |
--- a/docs/directives.md
+++ b/docs/directives.md
@@ -72,7 +72,7 @@ Generates speech from text via TTS and sends it as a native voice note. No tool
 <voice>Hey, here's a quick voice reply!</voice>
 ```

-The text content is sent to the configured TTS provider (see [TTS Configuration](./configuration.md#text-to-speech-tts-configuration)), converted to audio, and delivered as a voice note. Audio is automatically cleaned up after sending.
+The text content is sent to the configured TTS provider, converted to audio, and delivered as a voice note. Audio is automatically cleaned up after sending. See [voice.md](./voice.md) for full setup and provider options.

 - Requires `tts` to be configured in `lettabot.yaml`
 - Renders as native voice bubbles on Telegram and WhatsApp
@@ -111,7 +111,7 @@ Backslash-escaped quotes (common when LLMs generate XML inside a JSON context) a
 | Slack     | Yes | Yes | Audio attachment | Reactions use Slack emoji names (`:thumbsup:` style). |
 | Discord   | Yes | Yes | Audio attachment | Custom server emoji not yet supported. |
 | WhatsApp  | No  | Yes | Voice note (PTT) | Sent with `ptt: true` for native voice bubble. |
-| Signal    | No  | No  | No | Directive skipped with a warning. |
+| Signal    | No  | Yes | Audio attachment | Sent as a file attachment. |

 When a channel doesn't implement `addReaction`, the directive is silently skipped and a warning is logged. This never blocks message delivery.

--- a/docs/telegram-setup.md
+++ b/docs/telegram-setup.md
@@ -114,16 +114,7 @@ LettaBot responds to these Telegram commands:

 ### Voice Messages

-Send a voice message to have it transcribed and processed:
-
-1. Requires `OPENAI_API_KEY` for transcription
-2. The bot transcribes and responds to the content
-3. Configure in `lettabot.yaml`:
-   ```yaml
-   transcription:
-     provider: openai
-     apiKey: sk-...  # Optional: uses OPENAI_API_KEY env var
-   ```
+Send a voice message to have it transcribed and processed. See [voice.md](./voice.md) for transcription setup (OpenAI Whisper or Mistral Voxtral).

 ### Attachments

@@ -184,9 +175,7 @@ First responses may take longer as the agent "wakes up".

 ### Voice messages not working

-1. Make sure `OPENAI_API_KEY` is set
-2. Check the logs for transcription errors
-3. Verify your OpenAI account has API access
+See [voice.md troubleshooting](./voice.md#troubleshooting) for common issues.

 ### Rate limiting

--- a/docs/voice.md
+++ b/docs/voice.md
@@ -0,0 +1,194 @@
+# Voice
+
+LettaBot has full voice support: it can receive voice messages (transcribed to text) and reply with voice memos (generated via TTS). Both features work across Telegram, WhatsApp, Signal, Discord, and Slack.
+
+## Voice Transcription (Receiving Voice Messages)
+
+When a user sends a voice message, LettaBot downloads the audio, transcribes it via the configured provider, and delivers the text to the agent with a `[Voice message]:` prefix.
+
+### Providers
+
+**OpenAI Whisper** (default):
+
+```yaml
+transcription:
+  provider: openai
+  apiKey: sk-...       # Optional: falls back to OPENAI_API_KEY env var
+  model: whisper-1     # Default
+```
+
+**Mistral Voxtral** (faster, lower cost):
+
+```yaml
+transcription:
+  provider: mistral
+  apiKey: ...          # Optional: falls back to MISTRAL_API_KEY env var
+  model: voxtral-mini-latest  # Default
+```
+
+Or supply the API key via an environment variable instead of `apiKey`:
+
+```bash
+# OpenAI (default provider when no config is set)
+export OPENAI_API_KEY=sk-...
+
+# Mistral (also requires `transcription.provider: mistral` in lettabot.yaml)
+export MISTRAL_API_KEY=...
+```
+
+If no API key is configured, users who send voice messages will receive an error message with a setup link.
+
+### Supported Audio Formats
+
+These formats are sent directly to the transcription API (some with a filename remap):
+
+`flac`, `m4a`, `mp3`, `mp4`, `mpeg`, `mpga`, `oga`, `ogg`, `opus`, `wav`, `webm`
+
+These formats are automatically converted to MP3 via ffmpeg (if installed):
+
+`aac`, `amr`, `caf`, `3gp`, `3gpp`
+
+Files over 20MB are automatically split into 10-minute chunks before transcription.
+
+### Channel Support
+
+| Channel   | Format received | Notes |
+|-----------|----------------|-------|
+| Telegram  | OGG/Opus       | Native voice messages |
+| WhatsApp  | OGG/Opus       | Push-to-talk voice messages |
+| Signal    | Various        | Voice attachments |
+| Discord   | Various        | Audio file attachments |
+| Slack     | Various        | Audio file uploads |
+
+## Voice Memos (Sending Voice Notes)
+
+The agent can reply with voice notes using the `<voice>` directive. The text is sent to a TTS provider, converted to OGG Opus audio, and delivered as a native voice bubble (on Telegram and WhatsApp) or an audio attachment (playable inline on Discord and Slack; a file attachment on Signal).
+
+### How It Works
+
+The agent includes a `<voice>` tag in its response:
+
+```xml
+<actions>
+  <voice>Hey, here's a quick update on that thing we discussed.</voice>
+</actions>
+```
+
+This can be combined with text -- anything after the `</actions>` block is sent as a normal message alongside the voice note:
+
+```xml
+<actions>
+  <voice>Here's the summary as audio.</voice>
+</actions>
+And here it is in text form too!
+```
+
+See [directives.md](./directives.md) for the full directive reference.
+
+### Providers
+
+**ElevenLabs** (default):
+
+```yaml
+tts:
+  provider: elevenlabs
+  apiKey: sk_475a...                    # Or ELEVENLABS_API_KEY env var
+  voiceId: onwK4e9ZLuTAKqWW03F9         # Or ELEVENLABS_VOICE_ID env var
+  model: eleven_multilingual_v2         # Or ELEVENLABS_MODEL_ID
+```
+
+Browse voices at [elevenlabs.io/voice-library](https://elevenlabs.io/voice-library).
+
+**OpenAI**:
+
+```yaml
+tts:
+  provider: openai
+  apiKey: sk-...                        # Or OPENAI_API_KEY env var
+  voiceId: alloy                        # Or OPENAI_TTS_VOICE (options: alloy, echo, fable, onyx, nova, shimmer)
+  model: tts-1                          # Or OPENAI_TTS_MODEL (use tts-1-hd for higher quality)
+```
+
+### Channel Support
+
+| Channel   | Delivery | Notes |
+|-----------|----------|-------|
+| Telegram  | Native voice bubble | Falls back to audio file if user has voice message privacy enabled (Telegram Premium). Users can allow via Settings > Privacy and Security > Voice Messages. |
+| WhatsApp  | Native voice bubble | Sent with push-to-talk (`ptt: true`) for native rendering. |
+| Discord   | Audio attachment | Playable inline. |
+| Slack     | Audio attachment | Playable inline. |
+| Signal    | Audio attachment | Sent as a file attachment. |
+
+### When to Use Voice
+
+- User sent a voice message and a voice reply feels natural
+- User explicitly asks for a voice/audio response
+- Short, conversational responses (under ~30 seconds of speech)
+
+### When NOT to Use Voice
+
+- Code snippets, file paths, URLs, or structured data -- these should be text
+- Long responses (keep voice under ~30 seconds)
+- When the user has indicated a preference for text
+
+## CLI Tools
+
+### `lettabot-tts`
+
+Generate audio from the command line:
+
+```bash
+lettabot-tts "Hello, this is a test"           # Outputs file path to stdout
+lettabot-tts "Hello" /tmp/output.ogg            # Explicit output path
+```
+
+Output files are written to `data/outbound/` by default and auto-cleaned after 1 hour.
+
+### `lettabot-message --voice`
+
+Send a voice note from a background task (heartbeat, cron):
+
+```bash
+# Generate + send in one step
+OUTPUT=$(lettabot-tts "Your message here") || exit 1
+lettabot-message send --file "$OUTPUT" --voice
+
+# Send to a specific channel
+lettabot-message send --file "$OUTPUT" --voice --channel telegram --chat 123456
+```
+
+## Environment Variable Reference
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| **Transcription** | | |
+| `OPENAI_API_KEY` | OpenAI API key (Whisper transcription + OpenAI TTS) | -- |
+| `MISTRAL_API_KEY` | Mistral API key (Voxtral transcription) | -- |
+| `TRANSCRIPTION_MODEL` | Override transcription model | `whisper-1` / `voxtral-mini-latest` |
+| **Text-to-Speech** | | |
+| `TTS_PROVIDER` | TTS backend | `elevenlabs` |
+| `ELEVENLABS_API_KEY` | ElevenLabs API key | -- |
+| `ELEVENLABS_VOICE_ID` | ElevenLabs voice ID | `onwK4e9ZLuTAKqWW03F9` |
+| `ELEVENLABS_MODEL_ID` | ElevenLabs model | `eleven_multilingual_v2` |
+| `OPENAI_TTS_VOICE` | OpenAI TTS voice name | `alloy` |
+| `OPENAI_TTS_MODEL` | OpenAI TTS model | `tts-1` |
+
+All environment variables can be overridden by the equivalent YAML config fields (see above).
+
+## Troubleshooting
+
+### Voice messages not transcribing
+
+1. Check that an API key is configured -- either in `lettabot.yaml` under `transcription.apiKey` or via the `OPENAI_API_KEY` / `MISTRAL_API_KEY` environment variable (Mistral also requires `transcription.provider: mistral`)
+2. Check the logs for transcription errors
+3. If using an unsupported audio format, install `ffmpeg` for automatic conversion
+
+### Voice memos not generating
+
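+1. Check that a TTS provider is configured -- either in `lettabot.yaml` under `tts` or via `ELEVENLABS_API_KEY` / `OPENAI_API_KEY` (OpenAI also requires `tts.provider: openai` or `TTS_PROVIDER=openai`, since ElevenLabs is the default)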
+2. Check that `jq` and `curl` are installed (required by the `lettabot-tts` script)
+3. Check logs for TTS API errors (HTTP status codes, rate limits)
+
+### Telegram voice privacy
+
+If the bot sends audio files instead of voice bubbles on Telegram, the recipient has voice message privacy enabled (Telegram Premium feature). They can allow voice messages via Settings > Privacy and Security > Voice Messages.
--- a/docs/whatsapp-setup.md
+++ b/docs/whatsapp-setup.md
@@ -141,7 +141,7 @@ This uses your personal WhatsApp account:
 LettaBot supports receiving images, documents, and voice messages:

 - **Images**: Downloaded and shown to the agent (agent can view using Read tool)
- **Voice messages**: Automatically transcribed via OpenAI Whisper
+- **Voice messages**: Automatically transcribed (see [voice.md](./voice.md) for setup)
 - **Documents**: Downloaded with metadata shown to agent

 Configure attachment handling in `lettabot.yaml`:
--- a/e2e/bot.e2e.test.ts
+++ b/e2e/bot.e2e.test.ts
@@ -16,6 +16,18 @@ import { join } from 'node:path';

 // Skip if no API key (local dev without secrets)
 const SKIP_E2E = !process.env.LETTA_API_KEY || !process.env.LETTA_E2E_AGENT_ID;
+const DEFAULT_MESSAGE_TIMEOUT_MS = 120000;
+
+function parseTimeoutMs(envName: string, fallback: number): number {
+  const value = process.env[envName];
+  if (!value) return fallback;
+
+  const parsed = Number.parseInt(value, 10);
+  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
+}
+
+const E2E_MESSAGE_TIMEOUT_MS = parseTimeoutMs('LETTA_E2E_MESSAGE_TIMEOUT_MS', DEFAULT_MESSAGE_TIMEOUT_MS);
+const E2E_MULTI_TURN_TIMEOUT_MS = parseTimeoutMs('LETTA_E2E_MULTI_TURN_TIMEOUT_MS', E2E_MESSAGE_TIMEOUT_MS * 2);

 describe.skipIf(SKIP_E2E)('e2e: LettaBot with Letta API', () => {
  let bot: LettaBot;
@@ -52,13 +64,15 @@ describe.skipIf(SKIP_E2E)('e2e: LettaBot with Letta API', () => {
  });

  it('responds to a simple message', async () => {
-    const response = await mockAdapter.simulateMessage('Say "E2E TEST OK" and nothing else.');
+    const response = await mockAdapter.simulateMessage('Say "E2E TEST OK" and nothing else.', {
+      timeoutMs: E2E_MESSAGE_TIMEOUT_MS,
+    });
    
    expect(response).toBeTruthy();
    expect(response.length).toBeGreaterThan(0);
    // The agent should respond with something containing our test phrase
    expect(response.toUpperCase()).toContain('E2E TEST OK');
-  }, 60000); // 60s timeout
+  }, E2E_MESSAGE_TIMEOUT_MS + 10000);

  it('handles /status command', async () => {
    const response = await mockAdapter.simulateMessage('/status');
@@ -78,14 +92,18 @@ describe.skipIf(SKIP_E2E)('e2e: LettaBot with Letta API', () => {

  it('maintains conversation context', async () => {
    // First message - set context
-    await mockAdapter.simulateMessage('Remember this number: 42424242');
+    await mockAdapter.simulateMessage('Remember this number: 42424242', {
+      timeoutMs: E2E_MESSAGE_TIMEOUT_MS,
+    });
    
    // Clear messages but keep session
    mockAdapter.clearMessages();
    
    // Second message - recall context
-    const response = await mockAdapter.simulateMessage('What number did I just tell you to remember?');
+    const response = await mockAdapter.simulateMessage('What number did I just tell you to remember?', {
+      timeoutMs: E2E_MULTI_TURN_TIMEOUT_MS,
+    });
    
    expect(response).toContain('42424242');
-  }, 120000); // 2 min timeout for multi-turn
+  }, E2E_MULTI_TURN_TIMEOUT_MS + 10000);
 });
--- a/skills/voice-memo/lettabot-tts
+++ b/skills/voice-memo/lettabot-tts
@@ -8,7 +8,7 @@
 #
 #   ElevenLabs:
 #     ELEVENLABS_API_KEY   - Required. API key.
-#     ELEVENLABS_VOICE_ID  - Optional. Voice ID (default: 21m00Tcm4TlvDq8ikWAM / Rachel).
+#     ELEVENLABS_VOICE_ID  - Optional. Voice ID (default: onwK4e9ZLuTAKqWW03F9).
 #     ELEVENLABS_MODEL_ID  - Optional. Model ID (default: eleven_multilingual_v2).
 #
 #   OpenAI:
--- a/src/api/server.ts
+++ b/src/api/server.ts
@@ -268,7 +268,7 @@ export function createApiServer(deliverer: AgentRouter, options: ServerOptions):
          return;
        }

-        console.log(`[API] Async chat request for agent "${resolvedName}": ${chatReq.message.slice(0, 100)}...`);
+        log.info(`Async chat request for agent "${resolvedName}": ${chatReq.message.slice(0, 100)}...`);

        // Return 202 immediately
        const asyncRes: AsyncChatResponse = {
@@ -282,10 +282,10 @@ export function createApiServer(deliverer: AgentRouter, options: ServerOptions):
        // Process in background (detached promise)
        const context = { type: 'webhook' as const, outputMode: 'silent' as const };
        deliverer.sendToAgent(agentName, chatReq.message, context).catch((error: any) => {
-          console.error(`[API] Async chat background error for agent "${resolvedName}":`, error);
+          log.error(`Async chat background error for agent "${resolvedName}":`, error);
        });
      } catch (error: any) {
-        console.error('[API] Async chat error:', error);
+        log.error('Async chat error:', error);
        const asyncRes: AsyncChatResponse = {
          success: false,
          status: 'error',
@@ -816,4 +816,3 @@ if (apiKey) init();
 </script>
 </body>
 </html>`;
-
--- a/src/channels/discord.ts
+++ b/src/channels/discord.ts
@@ -188,7 +188,7 @@ Ask the bot owner to approve with:
        try {
          const { isTranscriptionConfigured } = await import('../transcription/index.js');
          if (!isTranscriptionConfigured()) {
-            await message.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
+            await message.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice');
          } else {
            // Download audio
            const response = await fetch(audioAttachment.url);
--- a/src/channels/signal.ts
+++ b/src/channels/signal.ts
@@ -702,7 +702,7 @@ This code expires in 1 hour.`;
            if (chatId) {
              await this.sendMessage({
                chatId,
-                text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
+                text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice'
              });
            }
          } else {
--- a/src/channels/slack.ts
+++ b/src/channels/slack.ts
@@ -81,7 +81,7 @@ export class SlackAdapter implements ChannelAdapter {
        try {
          const { isTranscriptionConfigured } = await import('../transcription/index.js');
          if (!isTranscriptionConfigured()) {
-            await say('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
+            await say('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice');
          } else {
            // Download file (requires bot token for auth)
            const response = await fetch(audioFile.url_private_download, {
@@ -198,7 +198,7 @@ export class SlackAdapter implements ChannelAdapter {
        try {
          const { isTranscriptionConfigured } = await import('../transcription/index.js');
          if (!isTranscriptionConfigured()) {
-            await this.sendMessage({ chatId: channelId, text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages', threadId: threadTs });
+            await this.sendMessage({ chatId: channelId, text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice', threadId: threadTs });
            return;
          }
          // Download file (requires bot token for auth)
--- a/src/channels/telegram.ts
+++ b/src/channels/telegram.ts
@@ -383,7 +383,7 @@ export class TelegramAdapter implements ChannelAdapter {
      // Check if transcription is configured (config or env)
      const { isTranscriptionConfigured } = await import('../transcription/index.js');
      if (!isTranscriptionConfigured()) {
-        await ctx.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages');
+        await ctx.reply('Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice');
        return;
      }

--- a/src/channels/whatsapp/inbound/media.ts
+++ b/src/channels/whatsapp/inbound/media.ts
@@ -168,7 +168,7 @@ export async function collectAttachments(params: {
        // Send error message directly to user (matches Telegram/Slack/Discord/Signal behavior)
        try {
          await sock.sendMessage(chatId, {
-            text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice-messages'
+            text: 'Voice messages require a transcription API key. See: https://github.com/letta-ai/lettabot#voice'
          });
        } catch (sendError) {
          log.error('Failed to send transcription error message:', sendError);
--- a/src/test/mock-channel.ts
+++ b/src/test/mock-channel.ts
@@ -69,6 +69,7 @@ export class MockChannelAdapter implements ChannelAdapter {
      userId?: string;
      chatId?: string;
      userName?: string;
+      timeoutMs?: number;
    } = {}
  ): Promise<string> {
    if (!this.onMessage) {
@@ -110,8 +111,9 @@ export class MockChannelAdapter implements ChannelAdapter {
    });
    
    // Wait for response with timeout
+    const timeoutMs = options.timeoutMs ?? 60000;
    const timeoutPromise = new Promise<never>((_, reject) => {
-      setTimeout(() => reject(new Error('Response timeout (60s)')), 60000);
+      setTimeout(() => reject(new Error(`Response timeout (${Math.round(timeoutMs / 1000)}s)`)), timeoutMs);
    });
    
    const response = await Promise.race([responsePromise, timeoutPromise]);