diff --git a/.env.example b/.env.example index 77435e1..c31a4d5 100644 --- a/.env.example +++ b/.env.example @@ -141,6 +141,21 @@ TELEGRAM_BOT_TOKEN=your_telegram_bot_token # GMAIL_REFRESH_TOKEN=your_refresh_token # GMAIL_TELEGRAM_USER=123456789 +# ============================================ +# Voice Memos / TTS (optional) +# ============================================ +# TTS provider: "elevenlabs" (default) or "openai" +# TTS_PROVIDER=elevenlabs + +# ElevenLabs (default provider) +# ELEVENLABS_API_KEY=sk_your_elevenlabs_key +# ELEVENLABS_VOICE_ID=21m00Tcm4TlvDq8ikWAM +# ELEVENLABS_MODEL_ID=eleven_multilingual_v2 + +# OpenAI TTS (uses OPENAI_API_KEY from above) +# OPENAI_TTS_VOICE=alloy +# OPENAI_TTS_MODEL=tts-1 + # ============================================ # API Server (for Docker/CLI integration) # ============================================ diff --git a/docs/cli-tools.md b/docs/cli-tools.md index d923479..22ac7fa 100644 --- a/docs/cli-tools.md +++ b/docs/cli-tools.md @@ -11,6 +11,7 @@ Send a message to the most recent chat, or target a specific channel/chat. lettabot-message send --text "Hello from a background task" lettabot-message send --text "Hello" --channel slack --chat C123456 lettabot-message send --file /tmp/report.pdf --text "Report attached" --channel discord --chat 123456789 +lettabot-message send --file /tmp/voice.ogg --voice # Send as native voice note ``` ## lettabot-react diff --git a/docs/configuration.md b/docs/configuration.md index 5fc0e3c..2f7a461 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -675,6 +675,36 @@ transcription: model: whisper-1 # Default ``` +## Text-to-Speech (TTS) Configuration + +Voice memo generation via the `<voice>` directive. The agent can reply with voice notes on Telegram and WhatsApp: + +```yaml +tts: + provider: elevenlabs # "elevenlabs" (default) or "openai" + apiKey: sk_475a... 
# Provider API key + voiceId: 21m00Tcm4TlvDq8ikWAM # Voice selection (see below) + model: eleven_multilingual_v2 # Optional model override +``` + +**ElevenLabs** (default): +- `voiceId` is an ElevenLabs voice ID. Default: `21m00Tcm4TlvDq8ikWAM` (Rachel). Browse voices at [elevenlabs.io/voice-library](https://elevenlabs.io/voice-library). +- `model` defaults to `eleven_multilingual_v2`. + +**OpenAI**: +- `voiceId` is one of: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`. Default: `alloy`. +- `model` defaults to `tts-1`. Use `tts-1-hd` for higher quality. + +The agent uses the `<voice>` directive in responses: + +```xml +<actions> + <voice>Hey, here's a quick voice reply!</voice> +</actions> +``` + +The `lettabot-tts` CLI tool is also available for background tasks (heartbeats, cron). + ## Attachments Configuration ```yaml @@ -807,5 +837,11 @@ Environment variables override config file values: | `LOG_LEVEL` | `server.logLevel` (fatal/error/warn/info/debug/trace). Overrides config. | | `LETTABOT_LOG_LEVEL` | Alias for `LOG_LEVEL` | | `LOG_FORMAT` | Set to `json` for structured JSON output (recommended for Railway/Docker) | +| `TTS_PROVIDER` | TTS backend: `elevenlabs` (default) or `openai` | +| `ELEVENLABS_API_KEY` | API key for ElevenLabs TTS | +| `ELEVENLABS_VOICE_ID` | ElevenLabs voice ID (default: `21m00Tcm4TlvDq8ikWAM` / Rachel) | +| `ELEVENLABS_MODEL_ID` | ElevenLabs model (default: `eleven_multilingual_v2`) | +| `OPENAI_TTS_VOICE` | OpenAI TTS voice (default: `alloy`) | +| `OPENAI_TTS_MODEL` | OpenAI TTS model (default: `tts-1`) | See [SKILL.md](../SKILL.md) for complete environment variable reference. diff --git a/docs/directives.md b/docs/directives.md index 78b13b3..118fc7d 100644 --- a/docs/directives.md +++ b/docs/directives.md @@ -48,13 +48,14 @@ Sends a file or image to the same channel/chat as the triggering message. 
```xml + ``` **Attributes:** - `path` / `file` (required) -- Local file path on the LettaBot server - `caption` / `text` (optional) -- Caption text for the file -- `kind` (optional) -- `image` or `file` (defaults to auto-detect based on extension) +- `kind` (optional) -- `image`, `file`, or `audio` (defaults to auto-detect based on extension). Audio files (.ogg, .opus, .mp3, .m4a, .wav, .aac, .flac) are auto-detected as `audio`. - `cleanup` (optional) -- `true` to delete the file after sending (default: false) **Security:** @@ -63,6 +64,22 @@ Sends a file or image to the same channel/chat as the triggering message. - File size is limited to `sendFileMaxSize` (default: 50MB). - The `cleanup` attribute only works when `sendFileCleanup: true` is set in the agent's features config (disabled by default). +### `` + +Generates speech from text via TTS and sends it as a native voice note. No tool calls needed. + +```xml +Hey, here's a quick voice reply! +``` + +The text content is sent to the configured TTS provider (see [TTS Configuration](./configuration.md#text-to-speech-tts-configuration)), converted to audio, and delivered as a voice note. Audio is automatically cleaned up after sending. + +- Requires `tts` to be configured in `lettabot.yaml` +- Renders as native voice bubbles on Telegram and WhatsApp +- Discord and Slack receive a playable audio attachment +- On Telegram, falls back to audio file if voice messages are restricted by Premium privacy settings +- Can be combined with text: any text after the `` block is sent as a normal message alongside the voice note + ### `` Suppresses response delivery entirely. The agent's text is discarded. 
@@ -88,13 +105,13 @@ Backslash-escaped quotes (common when LLMs generate XML inside a JSON context) a ## Channel Support -| Channel | `addReaction` | `send-file` | Notes | -|-----------|:---:|:---:|-------| -| Telegram | Yes | Yes | Reactions limited to Telegram's [allowed reaction set](https://core.telegram.org/bots/api#reactiontype). | -| Slack | Yes | Yes | Reactions use Slack emoji names (`:thumbsup:` style). | -| Discord | Yes | Yes | Custom server emoji not yet supported. | -| WhatsApp | No | Yes | Reactions skipped with a warning. | -| Signal | No | No | Directive skipped with a warning. | +| Channel | `addReaction` | `send-file` | `kind="audio"` | Notes | +|-----------|:---:|:---:|:---:|-------| +| Telegram | Yes | Yes | Voice note (`sendVoice`) | Falls back to `sendAudio` if voice messages are restricted by Telegram Premium privacy settings. | +| Slack | Yes | Yes | Audio attachment | Reactions use Slack emoji names (`:thumbsup:` style). | +| Discord | Yes | Yes | Audio attachment | Custom server emoji not yet supported. | +| WhatsApp | No | Yes | Voice note (PTT) | Sent with `ptt: true` for native voice bubble. | +| Signal | No | No | No | Directive skipped with a warning. | When a channel doesn't implement `addReaction`, the directive is silently skipped and a warning is logged. This never blocks message delivery. diff --git a/skills/voice-memo/SKILL.md b/skills/voice-memo/SKILL.md new file mode 100644 index 0000000..5c3608a --- /dev/null +++ b/skills/voice-memo/SKILL.md @@ -0,0 +1,58 @@ +--- +name: voice-memo +description: Reply with voice memos using text-to-speech. Use when the user sends a voice message, asks for an audio reply, or when a voice response would be more natural. +--- + +# Voice Memo Responses + +Generate voice memos using TTS and send them as native voice notes. + +## Usage + +Use the `` directive to send voice memos. No tool calls needed: + +``` + + Hey, here's a quick update on that thing we discussed. 
+ +``` + +With accompanying text: + +``` + + Here's the summary as audio. + +And here it is in text form too! +``` + +### Silent mode (heartbeats, cron) + +For background tasks that need to send voice without a user message context: + +```bash +OUTPUT=$(lettabot-tts "Your message here") || exit 1 +lettabot-message send --file "$OUTPUT" --voice +``` + +## When to Use Voice + +- User sent a voice message and a voice reply feels natural +- User explicitly asks for a voice/audio response +- Short, conversational responses (voice is awkward for long technical content) + +## When NOT to Use Voice + +- Code snippets, file paths, URLs, or structured data (these should be text) +- Long responses -- keep voice memos under ~30 seconds of speech +- When the user has indicated a preference for text +- When `ELEVENLABS_API_KEY` is not set + +## Notes + +- Audio format is OGG Opus, which renders as native voice bubbles on Telegram and WhatsApp +- Discord and Slack will show it as a playable audio attachment +- Use `cleanup="true"` to delete the audio file after sending +- The `data/outbound/` directory is the default allowed path for send-file directives +- The script uses `$LETTABOT_WORKING_DIR` to output files to the correct directory +- On Telegram, if the user has voice message privacy enabled (Telegram Premium), the bot falls back to sending as an audio file instead of a voice bubble. Users can allow voice messages via Settings > Privacy and Security > Voice Messages. diff --git a/skills/voice-memo/lettabot-tts b/skills/voice-memo/lettabot-tts new file mode 100755 index 0000000..45e8015 --- /dev/null +++ b/skills/voice-memo/lettabot-tts @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# lettabot-tts - Generate speech audio via configurable TTS provider +# +# Usage: lettabot-tts [output_path] +# +# Environment: +# TTS_PROVIDER - Optional. "elevenlabs" (default) or "openai". +# +# ElevenLabs: +# ELEVENLABS_API_KEY - Required. API key. +# ELEVENLABS_VOICE_ID - Optional. 
Voice ID (default: 21m00Tcm4TlvDq8ikWAM / Rachel). +# ELEVENLABS_MODEL_ID - Optional. Model ID (default: eleven_multilingual_v2). +# +# OpenAI: +# OPENAI_API_KEY - Required. API key. +# OPENAI_TTS_VOICE - Optional. Voice name (default: alloy). +# OPENAI_TTS_MODEL - Optional. Model (default: tts-1). + +set -euo pipefail + +TEXT="${1:?Usage: lettabot-tts <text> [output_path]}" + +# The session subprocess CWD is set to workingDir (bot.ts:642), which is the +# same base directory that <send-file> directives resolve from. This means +# $(pwd) and LETTABOT_WORKING_DIR produce paths in the correct coordinate space. +OUTBOUND_DIR="${LETTABOT_WORKING_DIR:-$(pwd)}/data/outbound" + +PROVIDER="${TTS_PROVIDER:-elevenlabs}" + +# Ensure output directory exists +mkdir -p "$OUTBOUND_DIR" + +# Use collision-safe random filenames when output path is not explicitly provided. +if [ -n "${2:-}" ]; then + OUTPUT="$2" +else + # Clean stale voice files older than 1 hour + find "$OUTBOUND_DIR" -name 'voice-*.ogg' -mmin +60 -delete 2>/dev/null || true + OUTPUT=$(mktemp "${OUTBOUND_DIR}/voice-XXXXXXXXXX.ogg") +fi + +# --------------------------------------------------------------------------- +# Provider: ElevenLabs +# --------------------------------------------------------------------------- +tts_elevenlabs() { + if [ -z "${ELEVENLABS_API_KEY:-}" ]; then + echo "Error: ELEVENLABS_API_KEY is not set" >&2 + exit 1 + fi + + local voice_id="${ELEVENLABS_VOICE_ID:-21m00Tcm4TlvDq8ikWAM}" + local model_id="${ELEVENLABS_MODEL_ID:-eleven_multilingual_v2}" + + local http_code + http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \ + "https://api.elevenlabs.io/v1/text-to-speech/${voice_id}" \ + -H "xi-api-key: ${ELEVENLABS_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "$(jq -n \ + --arg text "$TEXT" \ + --arg model "$model_id" \ + '{ + text: $text, + model_id: $model, + output_format: "ogg_opus" + }' + )") + + if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then + echo "Error: ElevenLabs API returned 
HTTP $http_code" >&2 + if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then + cat "$OUTPUT" >&2 + fi + rm -f "$OUTPUT" + exit 1 + fi +} + +# --------------------------------------------------------------------------- +# Provider: OpenAI +# --------------------------------------------------------------------------- +tts_openai() { + if [ -z "${OPENAI_API_KEY:-}" ]; then + echo "Error: OPENAI_API_KEY is not set" >&2 + exit 1 + fi + + local voice="${OPENAI_TTS_VOICE:-alloy}" + local model="${OPENAI_TTS_MODEL:-tts-1}" + + local http_code + http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \ + "https://api.openai.com/v1/audio/speech" \ + -H "Authorization: Bearer ${OPENAI_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "$(jq -n \ + --arg text "$TEXT" \ + --arg model "$model" \ + --arg voice "$voice" \ + '{ + model: $model, + input: $text, + voice: $voice, + response_format: "opus" + }' + )") + + if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then + echo "Error: OpenAI TTS API returned HTTP $http_code" >&2 + if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then + cat "$OUTPUT" >&2 + fi + rm -f "$OUTPUT" + exit 1 + fi +} + +# --------------------------------------------------------------------------- +# Dispatch +# --------------------------------------------------------------------------- +case "$PROVIDER" in + elevenlabs) tts_elevenlabs ;; + openai) tts_openai ;; + *) + echo "Error: Unknown TTS_PROVIDER: $PROVIDER (supported: elevenlabs, openai)" >&2 + exit 1 + ;; +esac + +echo "$OUTPUT" diff --git a/src/api/server.ts b/src/api/server.ts index 2778ab5..7fc9761 100644 --- a/src/api/server.ts +++ b/src/api/server.ts @@ -104,7 +104,7 @@ export function createApiServer(deliverer: AgentRouter, options: ServerOptions): { text: fields.text, filePath: file?.tempPath, - kind: fields.kind as 'image' | 'file' | undefined, + kind: fields.kind as 'image' | 'file' | 'audio' | undefined, } ); diff --git a/src/api/types.ts b/src/api/types.ts index 6a3bbbc..b9b3051 
100644 --- a/src/api/types.ts +++ b/src/api/types.ts @@ -23,7 +23,7 @@ export interface SendFileRequest { chatId: string; filePath: string; // Temporary file path on server caption?: string; - kind?: 'image' | 'file'; + kind?: 'image' | 'file' | 'audio'; threadId?: string; } diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts index a3a2564..5a8de18 100644 --- a/src/channels/telegram.ts +++ b/src/channels/telegram.ts @@ -565,6 +565,21 @@ export class TelegramAdapter implements ChannelAdapter { return { messageId: String(result.message_id) }; } + if (file.kind === 'audio') { + try { + const result = await this.bot.api.sendVoice(file.chatId, input, { caption }); + return { messageId: String(result.message_id) }; + } catch (err: any) { + // Fall back to sendAudio if voice messages are restricted (Telegram Premium privacy setting) + if (err?.description?.includes('VOICE_MESSAGES_FORBIDDEN')) { + log.warn('sendVoice forbidden, falling back to sendAudio'); + const result = await this.bot.api.sendAudio(file.chatId, new InputFile(file.filePath), { caption }); + return { messageId: String(result.message_id) }; + } + throw err; + } + } + const result = await this.bot.api.sendDocument(file.chatId, input, { caption }); return { messageId: String(result.message_id) }; } diff --git a/src/channels/whatsapp/outbound.test.ts b/src/channels/whatsapp/outbound.test.ts new file mode 100644 index 0000000..bbb4930 --- /dev/null +++ b/src/channels/whatsapp/outbound.test.ts @@ -0,0 +1,36 @@ +import { describe, it, expect, vi } from 'vitest'; +import { sendWhatsAppFile, type LidMapper } from './outbound.js'; + +describe('sendWhatsAppFile', () => { + it('sends audio as native voice note payload', async () => { + const sock = { + sendMessage: vi.fn(async () => ({ key: { id: '' } })), + } as any; + + const lidMapper: LidMapper = { + selfChatLid: '', + myNumber: '', + lidToJid: new Map(), + }; + + await sendWhatsAppFile( + sock, + { + chatId: '12345@s.whatsapp.net', + filePath: 
'/tmp/voice.ogg', + caption: 'hello', + kind: 'audio', + }, + lidMapper, + new Set(), + ); + + expect(sock.sendMessage).toHaveBeenCalledWith( + '12345@s.whatsapp.net', + { + audio: { url: '/tmp/voice.ogg' }, + ptt: true, + }, + ); + }); +}); diff --git a/src/channels/whatsapp/outbound.ts b/src/channels/whatsapp/outbound.ts index a9e4638..04bef13 100644 --- a/src/channels/whatsapp/outbound.ts +++ b/src/channels/whatsapp/outbound.ts @@ -244,10 +244,14 @@ export async function sendWhatsAppFile( const caption = file.caption || undefined; const fileName = basename(file.filePath); - const payload = - file.kind === "image" - ? { image: { url: file.filePath }, caption } - : { document: { url: file.filePath }, mimetype: "application/octet-stream", caption, fileName }; + let payload; + if (file.kind === "image") { + payload = { image: { url: file.filePath }, caption }; + } else if (file.kind === "audio") { + payload = { audio: { url: file.filePath }, ptt: true }; + } else { + payload = { document: { url: file.filePath }, mimetype: "application/octet-stream", caption, fileName }; + } try { // Send file diff --git a/src/cli/message.ts b/src/cli/message.ts index 6598fca..c4459cf 100644 --- a/src/cli/message.ts +++ b/src/cli/message.ts @@ -147,7 +147,7 @@ async function sendViaApi( options: { text?: string; filePath?: string; - kind?: 'image' | 'file'; + kind?: 'image' | 'file' | 'audio'; } ): Promise { const apiUrl = process.env.LETTABOT_API_URL || 'http://localhost:8080'; @@ -249,7 +249,7 @@ async function sendToChannel(channel: string, chatId: string, text: string): Pro async function sendCommand(args: string[]): Promise { let text = ''; let filePath = ''; - let kind: 'image' | 'file' | undefined = undefined; + let kind: 'image' | 'file' | 'audio' | undefined = undefined; let channel = ''; let chatId = ''; const fileCapableChannels = new Set(['telegram', 'slack', 'discord', 'whatsapp']); @@ -267,6 +267,8 @@ async function sendCommand(args: string[]): Promise { i++; } else if 
(arg === '--image') { kind = 'image'; + } else if (arg === '--voice') { + kind = 'audio'; } else if ((arg === '--channel' || arg === '-c' || arg === '-C') && next) { channel = next; i++; @@ -332,6 +334,7 @@ Send options: --text, -t Message text (or caption when used with --file) --file, -f File path (optional, for file messages) --image Treat file as image (vs document) + --voice Treat file as voice note (sends as native voice memo) --channel, -c Channel: telegram, slack, whatsapp, discord (default: last used) --chat, --to Chat/conversation ID (default: last messaged) @@ -348,6 +351,9 @@ Examples: # Send to specific WhatsApp chat lettabot-message send --file report.pdf --text "Report attached" --channel whatsapp --chat "+1555@s.whatsapp.net" + # Send voice note + lettabot-message send --file voice.ogg --voice + # Short form lettabot-message send -t "Done!" -f doc.pdf -c telegram diff --git a/src/config/io.ts b/src/config/io.ts index 00f957c..cd5f9c7 100644 --- a/src/config/io.ts +++ b/src/config/io.ts @@ -352,6 +352,36 @@ export function configToEnv(config: LettaBotConfig): Record { env.ATTACHMENTS_MAX_AGE_DAYS = String(config.attachments.maxAgeDays); } + // TTS (text-to-speech for voice memos) + if (config.tts?.provider) { + env.TTS_PROVIDER = config.tts.provider; + } + if (config.tts?.apiKey) { + // Set the provider-specific key based on provider + const provider = config.tts.provider || 'elevenlabs'; + if (provider === 'elevenlabs') { + env.ELEVENLABS_API_KEY = config.tts.apiKey; + } else if (provider === 'openai') { + env.OPENAI_API_KEY = config.tts.apiKey; + } + } + if (config.tts?.voiceId) { + const provider = config.tts.provider || 'elevenlabs'; + if (provider === 'elevenlabs') { + env.ELEVENLABS_VOICE_ID = config.tts.voiceId; + } else if (provider === 'openai') { + env.OPENAI_TTS_VOICE = config.tts.voiceId; + } + } + if (config.tts?.model) { + const provider = config.tts.provider || 'elevenlabs'; + if (provider === 'elevenlabs') { + env.ELEVENLABS_MODEL_ID 
= config.tts.model; + } else if (provider === 'openai') { + env.OPENAI_TTS_MODEL = config.tts.model; + } + } + // API server (server.api is canonical, top-level api is deprecated fallback) const apiConfig = config.server.api ?? config.api; if (apiConfig?.port !== undefined) { diff --git a/src/config/types.ts b/src/config/types.ts index 4b6f342..fe532d8 100644 --- a/src/config/types.ts +++ b/src/config/types.ts @@ -177,9 +177,12 @@ export interface LettaBotConfig { google?: GoogleConfig; }; - // Transcription (voice messages) + // Transcription (inbound voice messages) transcription?: TranscriptionConfig; + // Text-to-speech (outbound voice memos) + tts?: TtsConfig; + // Attachment handling attachments?: { maxMB?: number; @@ -195,6 +198,13 @@ export interface LettaBotConfig { }; } +export interface TtsConfig { + provider?: 'elevenlabs' | 'openai'; // Default: 'elevenlabs' + apiKey?: string; // Falls back to ELEVENLABS_API_KEY or OPENAI_API_KEY env var + voiceId?: string; // ElevenLabs voice ID or OpenAI voice name + model?: string; // Model ID (provider-specific defaults) +} + export interface TranscriptionConfig { provider: 'openai' | 'mistral'; apiKey?: string; // Falls back to OPENAI_API_KEY or MISTRAL_API_KEY env var diff --git a/src/core/bot.ts b/src/core/bot.ts index 03c9215..1b9396b 100644 --- a/src/core/bot.ts +++ b/src/core/bot.ts @@ -5,15 +5,16 @@ */ import { createAgent, createSession, resumeSession, imageFromFile, imageFromURL, type Session, type MessageContentItem, type SendMessage, type CanUseToolCallback } from '@letta-ai/letta-code-sdk'; -import { mkdirSync } from 'node:fs'; +import { mkdirSync, existsSync } from 'node:fs'; import { access, unlink, realpath, stat, constants } from 'node:fs/promises'; +import { execFile } from 'node:child_process'; import { extname, resolve, join } from 'node:path'; import type { ChannelAdapter } from '../channels/types.js'; import type { BotConfig, InboundMessage, TriggerContext } from './types.js'; import type { 
AgentSession } from './interfaces.js'; import { Store } from './store.js'; import { updateAgentName, getPendingApprovals, rejectApproval, cancelRuns, recoverOrphanedConversationApproval, getLatestRunError } from '../tools/letta-api.js'; -import { installSkillsToAgent } from '../skills/loader.js'; +import { installSkillsToAgent, withAgentSkillsOnPath, getAgentSkillExecutableDirs, isVoiceMemoConfigured } from '../skills/loader.js'; import { formatMessageEnvelope, formatGroupBatchEnvelope, type SessionContextOptions } from './formatter.js'; import type { GroupBatcher } from './group-batcher.js'; import { loadMemoryBlocks } from './memory.js'; @@ -117,10 +118,16 @@ const IMAGE_FILE_EXTENSIONS = new Set([ '.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', ]); -/** Infer whether a file is an image or generic file based on extension. */ -export function inferFileKind(filePath: string): 'image' | 'file' { +const AUDIO_FILE_EXTENSIONS = new Set([ + '.ogg', '.opus', '.mp3', '.m4a', '.wav', '.aac', '.flac', +]); + +/** Infer whether a file is an image, audio, or generic file based on extension. */ +export function inferFileKind(filePath: string): 'image' | 'file' | 'audio' { const ext = extname(filePath).toLowerCase(); - return IMAGE_FILE_EXTENSIONS.has(ext) ? 'image' : 'file'; + if (IMAGE_FILE_EXTENSIONS.has(ext)) return 'image'; + if (AUDIO_FILE_EXTENSIONS.has(ext)) return 'audio'; + return 'file'; } /** @@ -776,6 +783,59 @@ export class LettaBot implements AgentSession { console.warn('[Bot] Directive send-file failed:', err instanceof Error ? 
err.message : err); } } + + if (directive.type === 'voice') { + if (!isVoiceMemoConfigured()) { + log.warn('Directive voice skipped: no TTS credentials configured'); + continue; + } + if (typeof adapter.sendFile !== 'function') { + log.warn(`Directive voice skipped: ${adapter.name} does not support sendFile`); + continue; + } + + // Find lettabot-tts in agent's skill dirs + const agentId = this.store.agentId; + const skillDirs = agentId ? getAgentSkillExecutableDirs(agentId) : []; + const ttsPath = skillDirs + .map(dir => join(dir, 'lettabot-tts')) + .find(p => existsSync(p)); + + if (!ttsPath) { + log.warn('Directive voice skipped: lettabot-tts not found in skill dirs'); + continue; + } + + try { + const outputPath = await new Promise((resolve, reject) => { + execFile(ttsPath, [directive.text], { + cwd: this.config.workingDir, + env: { ...process.env, LETTABOT_WORKING_DIR: this.config.workingDir }, + timeout: 30_000, + }, (err, stdout, stderr) => { + if (err) { + reject(new Error(stderr?.trim() || err.message)); + } else { + resolve(stdout.trim()); + } + }); + }); + + await adapter.sendFile({ + chatId, + filePath: outputPath, + kind: 'audio', + threadId, + }); + acted = true; + log.info(`Directive: sent voice memo (${directive.text.length} chars)`); + + // Clean up generated file + try { await unlink(outputPath); } catch {} + } catch (err) { + log.warn('Directive voice failed:', err instanceof Error ? err.message : err); + } + } } return acted; } @@ -864,6 +924,7 @@ export class LettaBot implements AgentSession { const opts = this.baseSessionOptions(this.sessionCanUseTool); let session: Session; + let sessionAgentId: string | undefined; // In per-channel mode, look up per-key conversation ID. // In shared mode (key === "shared"), use the legacy single conversationId. 
@@ -873,9 +934,15 @@ export class LettaBot implements AgentSession { if (convId) { process.env.LETTA_AGENT_ID = this.store.agentId || undefined; + if (this.store.agentId) { + installSkillsToAgent(this.store.agentId, this.config.skills); + sessionAgentId = this.store.agentId; + } session = resumeSession(convId, opts); } else if (this.store.agentId) { process.env.LETTA_AGENT_ID = this.store.agentId; + installSkillsToAgent(this.store.agentId, this.config.skills); + sessionAgentId = this.store.agentId; session = createSession(this.store.agentId, opts); } else { // Create new agent -- persist immediately so we don't orphan it on later failures @@ -893,6 +960,7 @@ export class LettaBot implements AgentSession { updateAgentName(newAgentId, this.config.agentName).catch(() => {}); } installSkillsToAgent(newAgentId, this.config.skills); + sessionAgentId = newAgentId; session = createSession(newAgentId, opts); } @@ -900,7 +968,14 @@ export class LettaBot implements AgentSession { // Initialize eagerly so the subprocess is ready before the first send() log.info(`Initializing session subprocess (key=${key})...`); try { - await this.withSessionTimeout(session.initialize(), `Session initialize (key=${key})`); + if (sessionAgentId) { + await withAgentSkillsOnPath( + sessionAgentId, + () => this.withSessionTimeout(session.initialize(), `Session initialize (key=${key})`), + ); + } else { + await this.withSessionTimeout(session.initialize(), `Session initialize (key=${key})`); + } log.info(`Session subprocess ready (key=${key})`); } catch (error) { // Close immediately so failed initialization cannot leak a subprocess. 
@@ -1680,7 +1755,11 @@ export class LettaBot implements AgentSession { msgTypeCounts[streamMsg.type] = (msgTypeCounts[streamMsg.type] || 0) + 1; const preview = JSON.stringify(streamMsg).slice(0, 300); - log.info(`type=${streamMsg.type} ${preview}`); + if (streamMsg.type === 'reasoning' || streamMsg.type === 'assistant') { + log.debug(`type=${streamMsg.type} ${preview}`); + } else { + log.info(`type=${streamMsg.type} ${preview}`); + } // stream_event is a low-level streaming primitive (partial deltas), not a // semantic type change. Skip it for type-transition logic so it doesn't @@ -1694,6 +1773,7 @@ export class LettaBot implements AgentSession { // Flush reasoning buffer when type changes away from reasoning if (isSemanticType && lastMsgType === 'reasoning' && streamMsg.type !== 'reasoning' && reasoningBuffer.trim()) { + log.info(`Reasoning: ${reasoningBuffer.trim()}`); if (this.config.display?.showReasoning && !suppressDelivery) { try { const reasoning = this.formatReasoningDisplay(reasoningBuffer, adapter.id); @@ -2180,7 +2260,7 @@ export class LettaBot implements AgentSession { options: { text?: string; filePath?: string; - kind?: 'image' | 'file'; + kind?: 'image' | 'file' | 'audio'; } ): Promise { const adapter = this.channels.get(channelId); diff --git a/src/core/directives.test.ts b/src/core/directives.test.ts index 008496a..58d2503 100644 --- a/src/core/directives.test.ts +++ b/src/core/directives.test.ts @@ -77,6 +77,14 @@ describe('parseDirectives', () => { ]); }); + it('parses send-file directive with audio kind', () => { + const result = parseDirectives(''); + expect(result.cleanText).toBe(''); + expect(result.directives).toEqual([ + { type: 'send-file', path: 'voice.ogg', kind: 'audio' }, + ]); + }); + it('parses send-file directive with cleanup attribute', () => { const result = parseDirectives(''); expect(result.cleanText).toBe(''); @@ -149,6 +157,57 @@ describe('parseDirectives', () => { expect(result.cleanText).toBe(''); 
expect(result.directives).toEqual([]); }); + + it('parses voice directive with text content', () => { + const result = parseDirectives('Hello from a voice memo'); + expect(result.cleanText).toBe(''); + expect(result.directives).toEqual([{ type: 'voice', text: 'Hello from a voice memo' }]); + }); + + it('parses voice directive with text after actions block', () => { + const result = parseDirectives('Here is a voice note\nHere\'s the audio!'); + expect(result.cleanText).toBe("Here's the audio!"); + expect(result.directives).toEqual([{ type: 'voice', text: 'Here is a voice note' }]); + }); + + it('parses voice directive with multiline text', () => { + const result = parseDirectives('Line one.\nLine two.'); + expect(result.cleanText).toBe(''); + expect(result.directives).toEqual([{ type: 'voice', text: 'Line one.\nLine two.' }]); + }); + + it('ignores empty voice directive', () => { + const result = parseDirectives(' '); + expect(result.cleanText).toBe(''); + expect(result.directives).toEqual([]); + }); + + it('parses voice and react directives together', () => { + const result = parseDirectives('Check this out'); + expect(result.directives).toHaveLength(2); + expect(result.directives[0]).toEqual({ type: 'react', emoji: '🎤' }); + expect(result.directives[1]).toEqual({ type: 'voice', text: 'Check this out' }); + }); + + it('preserves order when voice appears before react', () => { + const result = parseDirectives('First'); + expect(result.directives).toEqual([ + { type: 'voice', text: 'First' }, + { type: 'react', emoji: '🎤' }, + ]); + }); + + it('preserves mixed directive order across voice and self-closing tags', () => { + const result = parseDirectives( + 'OneTwo', + ); + expect(result.directives).toEqual([ + { type: 'send-file', path: 'a.pdf' }, + { type: 'voice', text: 'One' }, + { type: 'react', emoji: '👍' }, + { type: 'voice', text: 'Two' }, + ]); + }); }); describe('stripActionsBlock', () => { diff --git a/src/core/directives.ts b/src/core/directives.ts index 
42d0eaa..e26b284 100644 --- a/src/core/directives.ts +++ b/src/core/directives.ts @@ -26,12 +26,17 @@ export interface SendFileDirective { type: 'send-file'; path: string; caption?: string; - kind?: 'image' | 'file'; + kind?: 'image' | 'file' | 'audio'; cleanup?: boolean; } +export interface VoiceDirective { + type: 'voice'; + text: string; +} + // Union type — extend with more directive types later -export type Directive = ReactDirective | SendFileDirective; +export type Directive = ReactDirective | SendFileDirective | VoiceDirective; export interface ParseResult { cleanText: string; @@ -45,10 +50,11 @@ export interface ParseResult { const ACTIONS_BLOCK_REGEX = /^\s*<actions>([\s\S]*?)<\/actions>/; /** - * Match self-closing child directive tags inside the actions block. - * Captures the tag name and the full attributes string. + * Match supported directive tags inside the actions block in source order. + * - Self-closing: <react ... />, <send-file ... /> + * - Content-bearing: <voice>...</voice> */ -const CHILD_DIRECTIVE_REGEX = /<(react|send-file)\b([^>]*)\/>/g; +const DIRECTIVE_TOKEN_REGEX = /<(react|send-file)\b([^>]*)\/>|<voice>([\s\S]*?)<\/voice>/g; /** * Parse a single attribute string like: emoji="eyes" message="123" @@ -73,13 +79,21 @@ function parseChildDirectives(block: string): Directive[] { const normalizedBlock = block.replace(/\\(['"])/g, '$1'); // Reset regex state (global flag) - CHILD_DIRECTIVE_REGEX.lastIndex = 0; + DIRECTIVE_TOKEN_REGEX.lastIndex = 0; - while ((match = CHILD_DIRECTIVE_REGEX.exec(normalizedBlock)) !== null) { - const [, tagName, attrString] = match; + while ((match = DIRECTIVE_TOKEN_REGEX.exec(normalizedBlock)) !== null) { + const [, tagName, attrString, voiceText] = match; + + if (voiceText !== undefined) { + const text = voiceText.trim(); + if (text) { + directives.push({ type: 'voice', text }); + } + continue; + } if (tagName === 'react') { - const attrs = parseAttributes(attrString); + const attrs = parseAttributes(attrString || ''); if (attrs.emoji) { directives.push({ type: 'react', 
@@ -91,11 +105,11 @@ function parseChildDirectives(block: string): Directive[] { } if (tagName === 'send-file') { - const attrs = parseAttributes(attrString); + const attrs = parseAttributes(attrString || ''); const path = attrs.path || attrs.file; if (!path) continue; const caption = attrs.caption || attrs.text; - const kind = attrs.kind === 'image' || attrs.kind === 'file' + const kind = attrs.kind === 'image' || attrs.kind === 'file' || attrs.kind === 'audio' ? attrs.kind : undefined; const cleanup = attrs.cleanup === 'true'; diff --git a/src/core/formatter.ts b/src/core/formatter.ts index aa5cd25..eb835d0 100644 --- a/src/core/formatter.ts +++ b/src/core/formatter.ts @@ -356,6 +356,7 @@ export function formatMessageEnvelope( `- To skip replying: \`\``, `- To perform actions: wrap in \`\` at the start of your response`, ` Example: \`Your text here\``, + `- To send a voice memo: \`Your message here\``, ]; sections.push(`## Response Directives\n${directiveLines.join('\n')}`); diff --git a/src/core/gateway.ts b/src/core/gateway.ts index 89a59f6..e48c33b 100644 --- a/src/core/gateway.ts +++ b/src/core/gateway.ts @@ -114,7 +114,7 @@ export class LettaGateway implements AgentRouter { async deliverToChannel( channelId: string, chatId: string, - options: { text?: string; filePath?: string; kind?: 'image' | 'file' } + options: { text?: string; filePath?: string; kind?: 'image' | 'file' | 'audio' } ): Promise { // Try each agent until one owns the channel for (const [name, session] of this.agents) { diff --git a/src/core/interfaces.ts b/src/core/interfaces.ts index 0bec35d..9929d46 100644 --- a/src/core/interfaces.ts +++ b/src/core/interfaces.ts @@ -37,7 +37,7 @@ export interface AgentSession { deliverToChannel(channelId: string, chatId: string, options: { text?: string; filePath?: string; - kind?: 'image' | 'file'; + kind?: 'image' | 'file' | 'audio'; }): Promise; /** Get agent status */ @@ -67,7 +67,7 @@ export interface MessageDeliverer { deliverToChannel(channelId: 
string, chatId: string, options: { text?: string; filePath?: string; - kind?: 'image' | 'file'; + kind?: 'image' | 'file' | 'audio'; }): Promise; } diff --git a/src/core/send-file.test.ts b/src/core/send-file.test.ts index 43dfb6e..c68ed40 100644 --- a/src/core/send-file.test.ts +++ b/src/core/send-file.test.ts @@ -23,10 +23,21 @@ describe('inferFileKind', () => { expect(inferFileKind('/tmp/script.ts')).toBe('file'); }); + it('returns audio for common audio extensions', () => { + expect(inferFileKind('/tmp/voice.ogg')).toBe('audio'); + expect(inferFileKind('/tmp/voice.opus')).toBe('audio'); + expect(inferFileKind('/tmp/voice.mp3')).toBe('audio'); + expect(inferFileKind('/tmp/voice.m4a')).toBe('audio'); + expect(inferFileKind('/tmp/voice.wav')).toBe('audio'); + expect(inferFileKind('/tmp/voice.aac')).toBe('audio'); + expect(inferFileKind('/tmp/voice.flac')).toBe('audio'); + }); + it('is case insensitive', () => { expect(inferFileKind('/tmp/PHOTO.PNG')).toBe('image'); expect(inferFileKind('/tmp/photo.JPG')).toBe('image'); expect(inferFileKind('/tmp/photo.Jpeg')).toBe('image'); + expect(inferFileKind('/tmp/VOICE.OGG')).toBe('audio'); }); it('returns file for extensionless paths', () => { diff --git a/src/core/system-prompt.ts b/src/core/system-prompt.ts index 8878b75..8c9fa1a 100644 --- a/src/core/system-prompt.ts +++ b/src/core/system-prompt.ts @@ -36,6 +36,9 @@ lettabot-message send --file /path/to/image.jpg --text "Check this out!" # Send file without text (treated as image) lettabot-message send --file photo.png --image +# Send voice note +lettabot-message send --file voice.ogg --voice + # Send to specific channel and chat lettabot-message send --text "Hello!" --channel telegram --chat 123456789 @@ -104,6 +107,8 @@ This sends "Great idea!" and reacts with thumbsup. - \`\` -- react to the message you are responding to. Use the actual emoji character (👀, 👍, ❤️, 🔥, 🎉, 👏). - \`\` -- react to a specific message by ID. 
- \`\` -- send a file or image to the same channel/chat. File paths are restricted to the configured send-file directory (default: \`data/outbound/\` in the working directory). Paths outside this directory are blocked. +- \`\` -- send a voice note. Audio files (.ogg, .mp3, etc.) are sent as native voice memos on Telegram and WhatsApp. Use \`cleanup="true"\` to delete the file after sending. +- \`Your message here\` -- generate and send a voice memo. The text is converted to speech via TTS and sent as a native voice note. No tool calls needed. Use for short conversational replies, responding to voice messages, or when the user asks for audio. ### Actions-only response diff --git a/src/core/types.ts b/src/core/types.ts index 697eb06..6da3435 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -108,7 +108,7 @@ export interface OutboundFile { filePath: string; caption?: string; threadId?: string; - kind?: 'image' | 'file'; + kind?: 'image' | 'file' | 'audio'; } /** @@ -117,6 +117,7 @@ export interface OutboundFile { export interface SkillsConfig { cronEnabled?: boolean; googleEnabled?: boolean; + ttsEnabled?: boolean; additionalSkills?: string[]; } diff --git a/src/main.ts b/src/main.ts index 2487a77..72794a5 100644 --- a/src/main.ts +++ b/src/main.ts @@ -178,6 +178,7 @@ import { CronService } from './cron/service.js'; import { HeartbeatService } from './cron/heartbeat.js'; import { PollingService, parseGmailAccounts } from './polling/service.js'; import { agentExists, findAgentByName, ensureNoToolApprovals } from './tools/letta-api.js'; +import { isVoiceMemoConfigured } from './skills/loader.js'; // Skills are now installed to agent-scoped location after agent creation (see bot.ts) // Check if config exists (skip in Railway/Docker where env vars are used directly) @@ -523,6 +524,7 @@ async function main() { } log.info(`Data directory: ${dataDir}`); log.info(`Working directory: ${globalConfig.workingDir}`); + process.env.LETTABOT_WORKING_DIR = 
globalConfig.workingDir; // Normalize config to agents array const agents = normalizeAgents(yamlConfig); @@ -551,6 +553,7 @@ async function main() { } const gateway = new LettaGateway(); + const voiceMemoEnabled = isVoiceMemoConfigured(); const services: { cronServices: CronService[], heartbeatServices: HeartbeatService[], @@ -590,6 +593,7 @@ async function main() { skills: { cronEnabled: agentConfig.features?.cron ?? globalConfig.cronEnabled, googleEnabled: !!agentConfig.integrations?.google?.enabled || !!agentConfig.polling?.gmail?.enabled, + ttsEnabled: voiceMemoEnabled, }, }); @@ -771,7 +775,9 @@ async function main() { }, }; }); - printStartupBanner(bannerAgents); + if (!process.env.LETTABOT_NO_BANNER) { + printStartupBanner(bannerAgents); + } // Shutdown const shutdown = async () => { diff --git a/src/onboard.ts b/src/onboard.ts index 71e3298..e04be52 100644 --- a/src/onboard.ts +++ b/src/onboard.ts @@ -1528,7 +1528,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise {}); } + const ttsEnv = { ...process.env, ...env }; installSkillsToAgent(agentId, { cronEnabled: config.cron, googleEnabled: config.google.enabled, + ttsEnabled: isVoiceMemoConfigured(ttsEnv), }); // Disable tool approvals diff --git a/src/skills/loader.test.ts b/src/skills/loader.test.ts index 3693ebe..7a1d29b 100644 --- a/src/skills/loader.test.ts +++ b/src/skills/loader.test.ts @@ -2,13 +2,14 @@ * Skills Loader Tests */ -import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; import { mkdtempSync, rmSync, mkdirSync, writeFileSync, existsSync, readdirSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { getAgentSkillsDir, FEATURE_SKILLS, + isVoiceMemoConfigured, } from './loader.js'; describe('skills loader', () => { @@ -52,6 +53,23 @@ describe('skills loader', () => { expect(FEATURE_SKILLS.google).toContain('gog'); 
expect(FEATURE_SKILLS.google).toContain('google'); }); + + it('has tts feature with voice-memo skill', () => { + expect(FEATURE_SKILLS.tts).toBeDefined(); + expect(FEATURE_SKILLS.tts).toContain('voice-memo'); + }); + }); + + describe('isVoiceMemoConfigured', () => { + it('defaults to elevenlabs and requires ELEVENLABS_API_KEY', () => { + expect(isVoiceMemoConfigured({})).toBe(false); + expect(isVoiceMemoConfigured({ ELEVENLABS_API_KEY: 'test' })).toBe(true); + }); + + it('supports openai provider and requires OPENAI_API_KEY', () => { + expect(isVoiceMemoConfigured({ TTS_PROVIDER: 'openai' })).toBe(false); + expect(isVoiceMemoConfigured({ TTS_PROVIDER: 'openai', OPENAI_API_KEY: 'test' })).toBe(true); + }); }); describe('installSkillsToAgent', () => { @@ -145,4 +163,53 @@ describe('skills loader', () => { expect(content).toBe('target version'); }); }); + + describe('loadAllSkills precedence', () => { + it('prefers global skills over bundled skills for the same name', async () => { + const originalHome = process.env.HOME; + const originalUserProfile = process.env.USERPROFILE; + const originalCwd = process.cwd(); + const tempHome = mkdtempSync(join(tmpdir(), 'lettabot-home-test-')); + const tempProject = mkdtempSync(join(tmpdir(), 'lettabot-project-test-')); + + try { + process.env.HOME = tempHome; + process.env.USERPROFILE = tempHome; + process.chdir(tempProject); + + const globalVoiceMemoDir = join(tempHome, '.letta', 'skills', 'voice-memo'); + mkdirSync(globalVoiceMemoDir, { recursive: true }); + writeFileSync( + join(globalVoiceMemoDir, 'SKILL.md'), + [ + '---', + 'name: voice-memo', + 'description: global override', + '---', + '', + '# Global override', + '', + ].join('\n'), + ); + + vi.resetModules(); + const mod = await import('./loader.js'); + const skills = mod.loadAllSkills(); + const voiceMemo = skills.find((skill: any) => skill.name === 'voice-memo'); + const expectedPath = join(tempHome, '.letta', 'skills', 'voice-memo', 'SKILL.md'); + + 
expect(voiceMemo).toBeDefined(); + expect(voiceMemo!.description).toBe('global override'); + expect(voiceMemo!.filePath).toContain(expectedPath); + } finally { + process.chdir(originalCwd); + if (originalHome === undefined) delete process.env.HOME; + else process.env.HOME = originalHome; + if (originalUserProfile === undefined) delete process.env.USERPROFILE; + else process.env.USERPROFILE = originalUserProfile; + rmSync(tempHome, { recursive: true, force: true }); + rmSync(tempProject, { recursive: true, force: true }); + } + }); + }); }); diff --git a/src/skills/loader.ts b/src/skills/loader.ts index b4112bd..d5be20c 100644 --- a/src/skills/loader.ts +++ b/src/skills/loader.ts @@ -4,7 +4,7 @@ import { existsSync, readdirSync, readFileSync, mkdirSync, cpSync } from 'node:fs'; import { execSync } from 'node:child_process'; -import { join, resolve } from 'node:path'; +import { join, resolve, delimiter } from 'node:path'; import matter from 'gray-matter'; import type { SkillEntry, ClawdbotMetadata } from './types.js'; @@ -30,6 +30,91 @@ export function getAgentSkillsDir(agentId: string): string { return join(HOME, '.letta', 'agents', agentId, 'skills'); } +/** + * Resolve subdirectories that contain executable skill files. + */ +function resolveSkillExecutableDirs(skillsDir: string): string[] { + // Only add dirs that contain at least one executable (non-.md) file + return readdirSync(skillsDir, { withFileTypes: true }) + .filter(d => d.isDirectory()) + .map(d => join(skillsDir, d.name)) + .filter(dir => { + try { + return readdirSync(dir).some(f => !f.endsWith('.md')); + } catch { return false; } + }); +} + +/** + * Get executable skill directories for a specific agent. 
+ */ +export function getAgentSkillExecutableDirs(agentId: string): string[] { + const skillsDir = getAgentSkillsDir(agentId); + if (!existsSync(skillsDir)) return []; + return resolveSkillExecutableDirs(skillsDir); +} + +/** + * Temporarily prepend agent skill directories to PATH for one async operation. + * + * PATH is process-global, so serialize PATH mutations to avoid races when + * multiple sessions initialize concurrently. + */ +let _pathMutationQueue: Promise = Promise.resolve(); +async function withPathMutationLock(fn: () => Promise): Promise { + const previous = _pathMutationQueue; + let release!: () => void; + _pathMutationQueue = new Promise((resolve) => { + release = resolve; + }); + + await previous; + try { + return await fn(); + } finally { + release(); + } +} + +export async function withAgentSkillsOnPath(agentId: string, fn: () => Promise): Promise { + const skillDirs = getAgentSkillExecutableDirs(agentId); + if (skillDirs.length === 0) { + return fn(); + } + + return withPathMutationLock(async () => { + const originalPath = process.env.PATH || ''; + const originalParts = originalPath.split(delimiter).filter(Boolean); + const existing = new Set(originalParts); + const prepend = skillDirs.filter((dir) => !existing.has(dir)); + + if (prepend.length > 0) { + process.env.PATH = [...prepend, ...originalParts].join(delimiter); + log.info(`Added ${prepend.length} skill dir(s) to PATH: ${prepend.join(', ')}`); + } + + try { + return await fn(); + } finally { + process.env.PATH = originalPath; + } + }); +} + +/** + * Whether TTS credentials are configured enough to use voice-memo skill. 
+ */ +export function isVoiceMemoConfigured(env: NodeJS.ProcessEnv = process.env): boolean { + const provider = (env.TTS_PROVIDER || 'elevenlabs').toLowerCase(); + if (provider === 'openai') { + return !!env.OPENAI_API_KEY; + } + if (provider === 'elevenlabs') { + return !!env.ELEVENLABS_API_KEY; + } + return false; +} + /** * Check if a binary exists on PATH */ @@ -158,7 +243,10 @@ export function loadAllSkills(agentId?: string | null): SkillEntry[] { // skills.sh global installs (lowest priority) dirs.push(SKILLS_SH_DIR); - // Global skills + // Bundled skills (ship with the project in skills/) + dirs.push(BUNDLED_SKILLS_DIR); + + // Global skills (override bundled defaults) dirs.push(GLOBAL_SKILLS_DIR); // Agent-scoped skills (middle priority) @@ -208,6 +296,7 @@ function installSkillsFromDir(sourceDir: string, targetDir: string): string[] { export const FEATURE_SKILLS: Record = { cron: ['scheduling'], // Scheduling handles both one-off reminders and recurring cron jobs google: ['gog', 'google'], // Installed when Google/Gmail is configured + tts: ['voice-memo'], // Voice memo replies via lettabot-tts helper }; /** @@ -242,6 +331,7 @@ function installSpecificSkills( export interface SkillsInstallConfig { cronEnabled?: boolean; googleEnabled?: boolean; // Gmail polling or Google integration + ttsEnabled?: boolean; // Voice memo replies via TTS providers additionalSkills?: string[]; // Explicitly enabled skills } @@ -261,22 +351,29 @@ export function installSkillsToWorkingDir(workingDir: string, config: SkillsInst mkdirSync(targetDir, { recursive: true }); // Collect skills to install based on enabled features - const skillsToInstall: string[] = []; + const requestedSkills: string[] = []; // Cron skills (always if cron is enabled) if (config.cronEnabled) { - skillsToInstall.push(...FEATURE_SKILLS.cron); + requestedSkills.push(...FEATURE_SKILLS.cron); } // Google skills (if Gmail polling or Google is configured) if (config.googleEnabled) { - 
skillsToInstall.push(...FEATURE_SKILLS.google); + requestedSkills.push(...FEATURE_SKILLS.google); + } + + // Voice memo skill (if TTS is configured) + if (config.ttsEnabled) { + requestedSkills.push(...FEATURE_SKILLS.tts); } // Additional explicitly enabled skills if (config.additionalSkills?.length) { - skillsToInstall.push(...config.additionalSkills); + requestedSkills.push(...config.additionalSkills); } + + const skillsToInstall = Array.from(new Set(requestedSkills)); if (skillsToInstall.length === 0) { log.info('No feature-gated skills to install'); @@ -310,22 +407,29 @@ export function installSkillsToAgent(agentId: string, config: SkillsInstallConfi mkdirSync(targetDir, { recursive: true }); // Collect skills to install based on enabled features - const skillsToInstall: string[] = []; + const requestedSkills: string[] = []; // Cron skills (always if cron is enabled) if (config.cronEnabled) { - skillsToInstall.push(...FEATURE_SKILLS.cron); + requestedSkills.push(...FEATURE_SKILLS.cron); } // Google skills (if Gmail polling or Google is configured) if (config.googleEnabled) { - skillsToInstall.push(...FEATURE_SKILLS.google); + requestedSkills.push(...FEATURE_SKILLS.google); + } + + // Voice memo skill (if TTS is configured) + if (config.ttsEnabled) { + requestedSkills.push(...FEATURE_SKILLS.tts); } // Additional explicitly enabled skills if (config.additionalSkills?.length) { - skillsToInstall.push(...config.additionalSkills); + requestedSkills.push(...config.additionalSkills); } + + const skillsToInstall = Array.from(new Set(requestedSkills)); if (skillsToInstall.length === 0) { return; // No skills to install - silent return