feat: add voice memo responses via TTS (#394)
This commit is contained in:
15
.env.example
15
.env.example
@@ -141,6 +141,21 @@ TELEGRAM_BOT_TOKEN=your_telegram_bot_token
|
|||||||
# GMAIL_REFRESH_TOKEN=your_refresh_token
|
# GMAIL_REFRESH_TOKEN=your_refresh_token
|
||||||
# GMAIL_TELEGRAM_USER=123456789
|
# GMAIL_TELEGRAM_USER=123456789
|
||||||
|
|
||||||
|
# ============================================
|
||||||
|
# Voice Memos / TTS (optional)
|
||||||
|
# ============================================
|
||||||
|
# TTS provider: "elevenlabs" (default) or "openai"
|
||||||
|
# TTS_PROVIDER=elevenlabs
|
||||||
|
|
||||||
|
# ElevenLabs (default provider)
|
||||||
|
# ELEVENLABS_API_KEY=sk_your_elevenlabs_key
|
||||||
|
# ELEVENLABS_VOICE_ID=21m00Tcm4TlvDq8ikWAM
|
||||||
|
# ELEVENLABS_MODEL_ID=eleven_multilingual_v2
|
||||||
|
|
||||||
|
# OpenAI TTS (uses OPENAI_API_KEY from above)
|
||||||
|
# OPENAI_TTS_VOICE=alloy
|
||||||
|
# OPENAI_TTS_MODEL=tts-1
|
||||||
|
|
||||||
# ============================================
|
# ============================================
|
||||||
# API Server (for Docker/CLI integration)
|
# API Server (for Docker/CLI integration)
|
||||||
# ============================================
|
# ============================================
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ Send a message to the most recent chat, or target a specific channel/chat.
|
|||||||
lettabot-message send --text "Hello from a background task"
|
lettabot-message send --text "Hello from a background task"
|
||||||
lettabot-message send --text "Hello" --channel slack --chat C123456
|
lettabot-message send --text "Hello" --channel slack --chat C123456
|
||||||
lettabot-message send --file /tmp/report.pdf --text "Report attached" --channel discord --chat 123456789
|
lettabot-message send --file /tmp/report.pdf --text "Report attached" --channel discord --chat 123456789
|
||||||
|
lettabot-message send --file /tmp/voice.ogg --voice # Send as native voice note
|
||||||
```
|
```
|
||||||
|
|
||||||
## lettabot-react
|
## lettabot-react
|
||||||
|
|||||||
@@ -675,6 +675,36 @@ transcription:
|
|||||||
model: whisper-1 # Default
|
model: whisper-1 # Default
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Text-to-Speech (TTS) Configuration
|
||||||
|
|
||||||
|
Voice memo generation via the `<voice>` directive. The agent can reply with voice notes on Telegram and WhatsApp:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
tts:
|
||||||
|
provider: elevenlabs # "elevenlabs" (default) or "openai"
|
||||||
|
apiKey: sk_your_elevenlabs_key # Provider API key (placeholder -- never commit a real key)
|
||||||
|
voiceId: 21m00Tcm4TlvDq8ikWAM # Voice selection (see below)
|
||||||
|
model: eleven_multilingual_v2 # Optional model override
|
||||||
|
```
|
||||||
|
|
||||||
|
**ElevenLabs** (default):
|
||||||
|
- `voiceId` is an ElevenLabs voice ID. Default: `21m00Tcm4TlvDq8ikWAM` (Rachel). Browse voices at [elevenlabs.io/voice-library](https://elevenlabs.io/voice-library).
|
||||||
|
- `model` defaults to `eleven_multilingual_v2`.
|
||||||
|
|
||||||
|
**OpenAI**:
|
||||||
|
- `voiceId` is one of: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`. Default: `alloy`.
|
||||||
|
- `model` defaults to `tts-1`. Use `tts-1-hd` for higher quality.
|
||||||
|
|
||||||
|
The agent uses the `<voice>` directive in responses:
|
||||||
|
|
||||||
|
```xml
|
||||||
|
<actions>
|
||||||
|
<voice>Hey, here's a quick voice reply!</voice>
|
||||||
|
</actions>
|
||||||
|
```
|
||||||
|
|
||||||
|
The `lettabot-tts` CLI tool is also available for background tasks (heartbeats, cron).
|
||||||
|
|
||||||
## Attachments Configuration
|
## Attachments Configuration
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
@@ -807,5 +837,11 @@ Environment variables override config file values:
|
|||||||
| `LOG_LEVEL` | `server.logLevel` (fatal/error/warn/info/debug/trace). Overrides config. |
|
| `LOG_LEVEL` | `server.logLevel` (fatal/error/warn/info/debug/trace). Overrides config. |
|
||||||
| `LETTABOT_LOG_LEVEL` | Alias for `LOG_LEVEL` |
|
| `LETTABOT_LOG_LEVEL` | Alias for `LOG_LEVEL` |
|
||||||
| `LOG_FORMAT` | Set to `json` for structured JSON output (recommended for Railway/Docker) |
|
| `LOG_FORMAT` | Set to `json` for structured JSON output (recommended for Railway/Docker) |
|
||||||
|
| `TTS_PROVIDER` | TTS backend: `elevenlabs` (default) or `openai` |
|
||||||
|
| `ELEVENLABS_API_KEY` | API key for ElevenLabs TTS |
|
||||||
|
| `ELEVENLABS_VOICE_ID` | ElevenLabs voice ID (default: `21m00Tcm4TlvDq8ikWAM` / Rachel) |
|
||||||
|
| `ELEVENLABS_MODEL_ID` | ElevenLabs model (default: `eleven_multilingual_v2`) |
|
||||||
|
| `OPENAI_TTS_VOICE` | OpenAI TTS voice (default: `alloy`) |
|
||||||
|
| `OPENAI_TTS_MODEL` | OpenAI TTS model (default: `tts-1`) |
|
||||||
|
|
||||||
See [SKILL.md](../SKILL.md) for complete environment variable reference.
|
See [SKILL.md](../SKILL.md) for complete environment variable reference.
|
||||||
|
|||||||
@@ -48,13 +48,14 @@ Sends a file or image to the same channel/chat as the triggering message.
|
|||||||
```xml
|
```xml
|
||||||
<send-file path="/tmp/report.pdf" caption="Report attached" />
|
<send-file path="/tmp/report.pdf" caption="Report attached" />
|
||||||
<send-file path="/tmp/photo.png" kind="image" caption="Look!" />
|
<send-file path="/tmp/photo.png" kind="image" caption="Look!" />
|
||||||
|
<send-file path="/tmp/voice.ogg" kind="audio" cleanup="true" />
|
||||||
<send-file path="/tmp/temp-export.csv" cleanup="true" />
|
<send-file path="/tmp/temp-export.csv" cleanup="true" />
|
||||||
```
|
```
|
||||||
|
|
||||||
**Attributes:**
|
**Attributes:**
|
||||||
- `path` / `file` (required) -- Local file path on the LettaBot server
|
- `path` / `file` (required) -- Local file path on the LettaBot server
|
||||||
- `caption` / `text` (optional) -- Caption text for the file
|
- `caption` / `text` (optional) -- Caption text for the file
|
||||||
- `kind` (optional) -- `image` or `file` (defaults to auto-detect based on extension)
|
- `kind` (optional) -- `image`, `file`, or `audio` (defaults to auto-detect based on extension). Audio files (.ogg, .opus, .mp3, .m4a, .wav, .aac, .flac) are auto-detected as `audio`.
|
||||||
- `cleanup` (optional) -- `true` to delete the file after sending (default: false)
|
- `cleanup` (optional) -- `true` to delete the file after sending (default: false)
|
||||||
|
|
||||||
**Security:**
|
**Security:**
|
||||||
@@ -63,6 +64,22 @@ Sends a file or image to the same channel/chat as the triggering message.
|
|||||||
- File size is limited to `sendFileMaxSize` (default: 50MB).
|
- File size is limited to `sendFileMaxSize` (default: 50MB).
|
||||||
- The `cleanup` attribute only works when `sendFileCleanup: true` is set in the agent's features config (disabled by default).
|
- The `cleanup` attribute only works when `sendFileCleanup: true` is set in the agent's features config (disabled by default).
|
||||||
|
|
||||||
|
### `<voice>`
|
||||||
|
|
||||||
|
Generates speech from text via TTS and sends it as a native voice note. No tool calls needed.
|
||||||
|
|
||||||
|
```xml
|
||||||
|
<voice>Hey, here's a quick voice reply!</voice>
|
||||||
|
```
|
||||||
|
|
||||||
|
The text content is sent to the configured TTS provider (see [TTS Configuration](./configuration.md#text-to-speech-tts-configuration)), converted to audio, and delivered as a voice note. Audio is automatically cleaned up after sending.
|
||||||
|
|
||||||
|
- Requires `tts` to be configured in `lettabot.yaml`
|
||||||
|
- Renders as native voice bubbles on Telegram and WhatsApp
|
||||||
|
- Discord and Slack receive a playable audio attachment
|
||||||
|
- On Telegram, falls back to audio file if voice messages are restricted by Premium privacy settings
|
||||||
|
- Can be combined with text: any text after the `</actions>` block is sent as a normal message alongside the voice note
|
||||||
|
|
||||||
### `<no-reply/>`
|
### `<no-reply/>`
|
||||||
|
|
||||||
Suppresses response delivery entirely. The agent's text is discarded.
|
Suppresses response delivery entirely. The agent's text is discarded.
|
||||||
@@ -88,13 +105,13 @@ Backslash-escaped quotes (common when LLMs generate XML inside a JSON context) a
|
|||||||
|
|
||||||
## Channel Support
|
## Channel Support
|
||||||
|
|
||||||
| Channel | `addReaction` | `send-file` | Notes |
|
| Channel | `addReaction` | `send-file` | `kind="audio"` | Notes |
|
||||||
|-----------|:---:|:---:|-------|
|
|-----------|:---:|:---:|:---:|-------|
|
||||||
| Telegram | Yes | Yes | Reactions limited to Telegram's [allowed reaction set](https://core.telegram.org/bots/api#reactiontype). |
|
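| Telegram | Yes | Yes | Voice note (`sendVoice`) | Reactions limited to Telegram's [allowed reaction set](https://core.telegram.org/bots/api#reactiontype). Voice notes fall back to `sendAudio` if voice messages are restricted by Telegram Premium privacy settings. |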
|
||||||
| Slack | Yes | Yes | Reactions use Slack emoji names (`:thumbsup:` style). |
|
| Slack | Yes | Yes | Audio attachment | Reactions use Slack emoji names (`:thumbsup:` style). |
|
||||||
| Discord | Yes | Yes | Custom server emoji not yet supported. |
|
| Discord | Yes | Yes | Audio attachment | Custom server emoji not yet supported. |
|
||||||
| WhatsApp | No | Yes | Reactions skipped with a warning. |
|
| WhatsApp | No | Yes | Voice note (PTT) | Reactions skipped with a warning. Audio is sent with `ptt: true` for a native voice bubble. |
|
||||||
| Signal | No | No | Directive skipped with a warning. |
|
| Signal | No | No | No | Directive skipped with a warning. |
|
||||||
|
|
||||||
When a channel doesn't implement `addReaction`, the directive is silently skipped and a warning is logged. This never blocks message delivery.
|
When a channel doesn't implement `addReaction`, the directive is silently skipped and a warning is logged. This never blocks message delivery.
|
||||||
|
|
||||||
|
|||||||
58
skills/voice-memo/SKILL.md
Normal file
58
skills/voice-memo/SKILL.md
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
---
|
||||||
|
name: voice-memo
|
||||||
|
description: Reply with voice memos using text-to-speech. Use when the user sends a voice message, asks for an audio reply, or when a voice response would be more natural.
|
||||||
|
---
|
||||||
|
|
||||||
|
# Voice Memo Responses
|
||||||
|
|
||||||
|
Generate voice memos using TTS and send them as native voice notes.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Use the `<voice>` directive to send voice memos. No tool calls needed:
|
||||||
|
|
||||||
|
```
|
||||||
|
<actions>
|
||||||
|
<voice>Hey, here's a quick update on that thing we discussed.</voice>
|
||||||
|
</actions>
|
||||||
|
```
|
||||||
|
|
||||||
|
With accompanying text:
|
||||||
|
|
||||||
|
```
|
||||||
|
<actions>
|
||||||
|
<voice>Here's the summary as audio.</voice>
|
||||||
|
</actions>
|
||||||
|
And here it is in text form too!
|
||||||
|
```
|
||||||
|
|
||||||
|
### Silent mode (heartbeats, cron)
|
||||||
|
|
||||||
|
For background tasks that need to send voice without a user message context:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
OUTPUT=$(lettabot-tts "Your message here") || exit 1
|
||||||
|
lettabot-message send --file "$OUTPUT" --voice
|
||||||
|
```
|
||||||
|
|
||||||
|
## When to Use Voice
|
||||||
|
|
||||||
|
- User sent a voice message and a voice reply feels natural
|
||||||
|
- User explicitly asks for a voice/audio response
|
||||||
|
- Short, conversational responses (voice is awkward for long technical content)
|
||||||
|
|
||||||
|
## When NOT to Use Voice
|
||||||
|
|
||||||
|
- Code snippets, file paths, URLs, or structured data (these should be text)
|
||||||
|
- Long responses -- keep voice memos under ~30 seconds of speech
|
||||||
|
- When the user has indicated a preference for text
|
||||||
|
- When no TTS credentials are configured for the selected provider (`ELEVENLABS_API_KEY` for ElevenLabs, `OPENAI_API_KEY` for OpenAI)
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Audio format is OGG Opus, which renders as native voice bubbles on Telegram and WhatsApp
|
||||||
|
- Discord and Slack will show it as a playable audio attachment
|
||||||
|
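- When sending a pre-generated audio file with `<send-file>`, use `cleanup="true"` to delete it after sending (only honored when `sendFileCleanup: true` is set in the agent's features config). Audio generated by the `<voice>` directive is cleaned up automatically.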
|
||||||
|
- The `data/outbound/` directory is the default allowed path for send-file directives
|
||||||
|
- The script uses `$LETTABOT_WORKING_DIR` to output files to the correct directory
|
||||||
|
- On Telegram, if the user has voice message privacy enabled (Telegram Premium), the bot falls back to sending as an audio file instead of a voice bubble. Users can allow voice messages via Settings > Privacy and Security > Voice Messages.
|
||||||
130
skills/voice-memo/lettabot-tts
Executable file
130
skills/voice-memo/lettabot-tts
Executable file
@@ -0,0 +1,130 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# lettabot-tts - Generate speech audio via configurable TTS provider
|
||||||
|
#
|
||||||
|
# Usage: lettabot-tts <text> [output_path]
|
||||||
|
#
|
||||||
|
# Environment:
|
||||||
|
# TTS_PROVIDER - Optional. "elevenlabs" (default) or "openai".
|
||||||
|
#
|
||||||
|
# ElevenLabs:
|
||||||
|
# ELEVENLABS_API_KEY - Required. API key.
|
||||||
|
# ELEVENLABS_VOICE_ID - Optional. Voice ID (default: 21m00Tcm4TlvDq8ikWAM / Rachel).
|
||||||
|
# ELEVENLABS_MODEL_ID - Optional. Model ID (default: eleven_multilingual_v2).
|
||||||
|
#
|
||||||
|
# OpenAI:
|
||||||
|
# OPENAI_API_KEY - Required. API key.
|
||||||
|
# OPENAI_TTS_VOICE - Optional. Voice name (default: alloy).
|
||||||
|
# OPENAI_TTS_MODEL - Optional. Model (default: tts-1).
|
||||||
|
|
||||||
|
set -euo pipefail

# First positional argument is the text to synthesize; abort with usage if missing.
TEXT="${1:?Usage: lettabot-tts <text> [output_path]}"

# The session subprocess CWD is set to workingDir, which is the same base
# directory that <send-file> directives resolve from. This means $(pwd) and
# LETTABOT_WORKING_DIR produce paths in the correct coordinate space.
OUTBOUND_DIR="${LETTABOT_WORKING_DIR:-$(pwd)}/data/outbound"

# Provider selection; the dispatch at the bottom of the script validates it.
PROVIDER="${TTS_PROVIDER:-elevenlabs}"

# Ensure output directory exists
mkdir -p "$OUTBOUND_DIR"

# Use collision-safe random filenames when output path is not explicitly provided.
if [ -n "${2:-}" ]; then
  OUTPUT="$2"
else
  # Clean stale voice files older than 1 hour (best effort; errors are ignored).
  find "$OUTBOUND_DIR" -name 'voice-*.ogg' -mmin +60 -delete 2>/dev/null || true

  # BSD/macOS mktemp only expands X's at the very end of the template, so a
  # template ending in ".ogg" would produce a literal, non-unique filename
  # there. Create the unique file without a suffix, then rename it so the
  # result still matches the 'voice-*.ogg' cleanup pattern above.
  tmp_base=$(mktemp "${OUTBOUND_DIR}/voice-XXXXXXXXXX")
  OUTPUT="${tmp_base}.ogg"
  mv "$tmp_base" "$OUTPUT"
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Provider: ElevenLabs
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Synthesize $TEXT with the ElevenLabs text-to-speech API and write the audio
# to $OUTPUT.
#
# Reads:
#   ELEVENLABS_API_KEY   - required
#   ELEVENLABS_VOICE_ID  - optional, defaults to 21m00Tcm4TlvDq8ikWAM (Rachel)
#   ELEVENLABS_MODEL_ID  - optional, defaults to eleven_multilingual_v2
#
# Exits 1 when the key is missing, when the request cannot be completed, or
# when the API answers with a non-2xx status. In the latter two cases $OUTPUT
# is removed so no partial/error payload is left behind.
tts_elevenlabs() {
  if [ -z "${ELEVENLABS_API_KEY:-}" ]; then
    echo "Error: ELEVENLABS_API_KEY is not set" >&2
    exit 1
  fi

  # Keep this default in sync with the documented default voice (Rachel).
  local voice_id="${ELEVENLABS_VOICE_ID:-21m00Tcm4TlvDq8ikWAM}"
  local model_id="${ELEVENLABS_MODEL_ID:-eleven_multilingual_v2}"

  # Build the JSON body up front so a jq failure aborts the script (set -e)
  # instead of silently sending an empty body.
  # NOTE(review): ElevenLabs documents output_format as a query parameter --
  # confirm the body field below is honored and that "ogg_opus" is accepted.
  local payload
  payload=$(jq -n \
    --arg text "$TEXT" \
    --arg model "$model_id" \
    '{
      text: $text,
      model_id: $model,
      output_format: "ogg_opus"
    }')

  # -sS: stay quiet on success but still print curl's own error message.
  # The explicit failure branch is needed because, under `set -e`, a failing
  # command substitution would otherwise abort without cleaning up $OUTPUT.
  local http_code
  if ! http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
    "https://api.elevenlabs.io/v1/text-to-speech/${voice_id}" \
    -H "xi-api-key: ${ELEVENLABS_API_KEY}" \
    -H "Content-Type: application/json" \
    -d "$payload"); then
    echo "Error: ElevenLabs request failed before a response was received" >&2
    rm -f "$OUTPUT"
    exit 1
  fi

  if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
    echo "Error: ElevenLabs API returned HTTP $http_code" >&2
    # Error responses are usually textual; surface them, but never dump
    # binary data to the terminal.
    if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
      cat "$OUTPUT" >&2
    fi
    rm -f "$OUTPUT"
    exit 1
  fi
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Provider: OpenAI
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Synthesize $TEXT with the OpenAI speech API and write Opus audio to $OUTPUT.
#
# Reads:
#   OPENAI_API_KEY    - required
#   OPENAI_TTS_VOICE  - optional, defaults to alloy
#   OPENAI_TTS_MODEL  - optional, defaults to tts-1
#
# Exits 1 when the key is missing, when the request cannot be completed, or
# when the API answers with a non-2xx status. In the latter two cases $OUTPUT
# is removed so no partial/error payload is left behind.
tts_openai() {
  if [ -z "${OPENAI_API_KEY:-}" ]; then
    echo "Error: OPENAI_API_KEY is not set" >&2
    exit 1
  fi

  local voice="${OPENAI_TTS_VOICE:-alloy}"
  local model="${OPENAI_TTS_MODEL:-tts-1}"

  # Build the JSON body up front so a jq failure aborts the script (set -e)
  # instead of silently sending an empty body.
  local payload
  payload=$(jq -n \
    --arg text "$TEXT" \
    --arg model "$model" \
    --arg voice "$voice" \
    '{
      model: $model,
      input: $text,
      voice: $voice,
      response_format: "opus"
    }')

  # -sS: stay quiet on success but still print curl's own error message.
  # The explicit failure branch is needed because, under `set -e`, a failing
  # command substitution would otherwise abort without cleaning up $OUTPUT.
  local http_code
  if ! http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
    "https://api.openai.com/v1/audio/speech" \
    -H "Authorization: Bearer ${OPENAI_API_KEY}" \
    -H "Content-Type: application/json" \
    -d "$payload"); then
    echo "Error: OpenAI TTS request failed before a response was received" >&2
    rm -f "$OUTPUT"
    exit 1
  fi

  if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
    echo "Error: OpenAI TTS API returned HTTP $http_code" >&2
    # Error responses are usually textual; surface them, but never dump
    # binary data to the terminal.
    if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
      cat "$OUTPUT" >&2
    fi
    rm -f "$OUTPUT"
    exit 1
  fi
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Dispatch
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Route to the selected provider. Each provider function either writes the
# audio to $OUTPUT or terminates the script with a non-zero status.
if [ "$PROVIDER" = "elevenlabs" ]; then
  tts_elevenlabs
elif [ "$PROVIDER" = "openai" ]; then
  tts_openai
else
  echo "Error: Unknown TTS_PROVIDER: $PROVIDER (supported: elevenlabs, openai)" >&2
  exit 1
fi

# Emit the generated file path on stdout so callers can capture it.
echo "$OUTPUT"
|
||||||
@@ -104,7 +104,7 @@ export function createApiServer(deliverer: AgentRouter, options: ServerOptions):
|
|||||||
{
|
{
|
||||||
text: fields.text,
|
text: fields.text,
|
||||||
filePath: file?.tempPath,
|
filePath: file?.tempPath,
|
||||||
kind: fields.kind as 'image' | 'file' | undefined,
|
kind: fields.kind as 'image' | 'file' | 'audio' | undefined,
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ export interface SendFileRequest {
|
|||||||
chatId: string;
|
chatId: string;
|
||||||
filePath: string; // Temporary file path on server
|
filePath: string; // Temporary file path on server
|
||||||
caption?: string;
|
caption?: string;
|
||||||
kind?: 'image' | 'file';
|
kind?: 'image' | 'file' | 'audio';
|
||||||
threadId?: string;
|
threadId?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -565,6 +565,21 @@ export class TelegramAdapter implements ChannelAdapter {
|
|||||||
return { messageId: String(result.message_id) };
|
return { messageId: String(result.message_id) };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (file.kind === 'audio') {
|
||||||
|
try {
|
||||||
|
const result = await this.bot.api.sendVoice(file.chatId, input, { caption });
|
||||||
|
return { messageId: String(result.message_id) };
|
||||||
|
} catch (err: any) {
|
||||||
|
// Fall back to sendAudio if voice messages are restricted (Telegram Premium privacy setting)
|
||||||
|
if (err?.description?.includes('VOICE_MESSAGES_FORBIDDEN')) {
|
||||||
|
log.warn('sendVoice forbidden, falling back to sendAudio');
|
||||||
|
const result = await this.bot.api.sendAudio(file.chatId, new InputFile(file.filePath), { caption });
|
||||||
|
return { messageId: String(result.message_id) };
|
||||||
|
}
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const result = await this.bot.api.sendDocument(file.chatId, input, { caption });
|
const result = await this.bot.api.sendDocument(file.chatId, input, { caption });
|
||||||
return { messageId: String(result.message_id) };
|
return { messageId: String(result.message_id) };
|
||||||
}
|
}
|
||||||
|
|||||||
36
src/channels/whatsapp/outbound.test.ts
Normal file
36
src/channels/whatsapp/outbound.test.ts
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
import { describe, it, expect, vi } from 'vitest';
|
||||||
|
import { sendWhatsAppFile, type LidMapper } from './outbound.js';
|
||||||
|
|
||||||
|
describe('sendWhatsAppFile', () => {
  it('sends audio as native voice note payload', async () => {
    const chatJid = '12345@s.whatsapp.net';
    const voicePath = '/tmp/voice.ogg';

    // Minimal socket stub: only sendMessage is exercised here.
    const sendMessage = vi.fn(async () => ({ key: { id: '' } }));
    const sock = { sendMessage } as any;

    const lidMapper: LidMapper = {
      selfChatLid: '',
      myNumber: '',
      lidToJid: new Map(),
    };

    const request = {
      chatId: chatJid,
      filePath: voicePath,
      caption: 'hello',
      kind: 'audio' as const,
    };

    await sendWhatsAppFile(sock, request, lidMapper, new Set<string>());

    // The expected payload is push-to-talk audio only: the caption supplied
    // in the request is not part of what gets sent for voice notes.
    const expectedPayload = {
      audio: { url: voicePath },
      ptt: true,
    };
    expect(sock.sendMessage).toHaveBeenCalledWith(chatJid, expectedPayload);
  });
});
|
||||||
@@ -244,10 +244,14 @@ export async function sendWhatsAppFile(
|
|||||||
const caption = file.caption || undefined;
|
const caption = file.caption || undefined;
|
||||||
const fileName = basename(file.filePath);
|
const fileName = basename(file.filePath);
|
||||||
|
|
||||||
const payload =
|
let payload;
|
||||||
file.kind === "image"
|
if (file.kind === "image") {
|
||||||
? { image: { url: file.filePath }, caption }
|
payload = { image: { url: file.filePath }, caption };
|
||||||
: { document: { url: file.filePath }, mimetype: "application/octet-stream", caption, fileName };
|
} else if (file.kind === "audio") {
|
||||||
|
payload = { audio: { url: file.filePath }, ptt: true };
|
||||||
|
} else {
|
||||||
|
payload = { document: { url: file.filePath }, mimetype: "application/octet-stream", caption, fileName };
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Send file
|
// Send file
|
||||||
|
|||||||
@@ -147,7 +147,7 @@ async function sendViaApi(
|
|||||||
options: {
|
options: {
|
||||||
text?: string;
|
text?: string;
|
||||||
filePath?: string;
|
filePath?: string;
|
||||||
kind?: 'image' | 'file';
|
kind?: 'image' | 'file' | 'audio';
|
||||||
}
|
}
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const apiUrl = process.env.LETTABOT_API_URL || 'http://localhost:8080';
|
const apiUrl = process.env.LETTABOT_API_URL || 'http://localhost:8080';
|
||||||
@@ -249,7 +249,7 @@ async function sendToChannel(channel: string, chatId: string, text: string): Pro
|
|||||||
async function sendCommand(args: string[]): Promise<void> {
|
async function sendCommand(args: string[]): Promise<void> {
|
||||||
let text = '';
|
let text = '';
|
||||||
let filePath = '';
|
let filePath = '';
|
||||||
let kind: 'image' | 'file' | undefined = undefined;
|
let kind: 'image' | 'file' | 'audio' | undefined = undefined;
|
||||||
let channel = '';
|
let channel = '';
|
||||||
let chatId = '';
|
let chatId = '';
|
||||||
const fileCapableChannels = new Set(['telegram', 'slack', 'discord', 'whatsapp']);
|
const fileCapableChannels = new Set(['telegram', 'slack', 'discord', 'whatsapp']);
|
||||||
@@ -267,6 +267,8 @@ async function sendCommand(args: string[]): Promise<void> {
|
|||||||
i++;
|
i++;
|
||||||
} else if (arg === '--image') {
|
} else if (arg === '--image') {
|
||||||
kind = 'image';
|
kind = 'image';
|
||||||
|
} else if (arg === '--voice') {
|
||||||
|
kind = 'audio';
|
||||||
} else if ((arg === '--channel' || arg === '-c' || arg === '-C') && next) {
|
} else if ((arg === '--channel' || arg === '-c' || arg === '-C') && next) {
|
||||||
channel = next;
|
channel = next;
|
||||||
i++;
|
i++;
|
||||||
@@ -332,6 +334,7 @@ Send options:
|
|||||||
--text, -t <text> Message text (or caption when used with --file)
|
--text, -t <text> Message text (or caption when used with --file)
|
||||||
--file, -f <path> File path (optional, for file messages)
|
--file, -f <path> File path (optional, for file messages)
|
||||||
--image Treat file as image (vs document)
|
--image Treat file as image (vs document)
|
||||||
|
--voice Treat file as voice note (sends as native voice memo)
|
||||||
--channel, -c <name> Channel: telegram, slack, whatsapp, discord (default: last used)
|
--channel, -c <name> Channel: telegram, slack, whatsapp, discord (default: last used)
|
||||||
--chat, --to <id> Chat/conversation ID (default: last messaged)
|
--chat, --to <id> Chat/conversation ID (default: last messaged)
|
||||||
|
|
||||||
@@ -348,6 +351,9 @@ Examples:
|
|||||||
# Send to specific WhatsApp chat
|
# Send to specific WhatsApp chat
|
||||||
lettabot-message send --file report.pdf --text "Report attached" --channel whatsapp --chat "+1555@s.whatsapp.net"
|
lettabot-message send --file report.pdf --text "Report attached" --channel whatsapp --chat "+1555@s.whatsapp.net"
|
||||||
|
|
||||||
|
# Send voice note
|
||||||
|
lettabot-message send --file voice.ogg --voice
|
||||||
|
|
||||||
# Short form
|
# Short form
|
||||||
lettabot-message send -t "Done!" -f doc.pdf -c telegram
|
lettabot-message send -t "Done!" -f doc.pdf -c telegram
|
||||||
|
|
||||||
|
|||||||
@@ -352,6 +352,36 @@ export function configToEnv(config: LettaBotConfig): Record<string, string> {
|
|||||||
env.ATTACHMENTS_MAX_AGE_DAYS = String(config.attachments.maxAgeDays);
|
env.ATTACHMENTS_MAX_AGE_DAYS = String(config.attachments.maxAgeDays);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TTS (text-to-speech for voice memos)
|
||||||
|
if (config.tts?.provider) {
|
||||||
|
env.TTS_PROVIDER = config.tts.provider;
|
||||||
|
}
|
||||||
|
if (config.tts?.apiKey) {
|
||||||
|
// Set the provider-specific key based on provider
|
||||||
|
const provider = config.tts.provider || 'elevenlabs';
|
||||||
|
if (provider === 'elevenlabs') {
|
||||||
|
env.ELEVENLABS_API_KEY = config.tts.apiKey;
|
||||||
|
} else if (provider === 'openai') {
|
||||||
|
env.OPENAI_API_KEY = config.tts.apiKey;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (config.tts?.voiceId) {
|
||||||
|
const provider = config.tts.provider || 'elevenlabs';
|
||||||
|
if (provider === 'elevenlabs') {
|
||||||
|
env.ELEVENLABS_VOICE_ID = config.tts.voiceId;
|
||||||
|
} else if (provider === 'openai') {
|
||||||
|
env.OPENAI_TTS_VOICE = config.tts.voiceId;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (config.tts?.model) {
|
||||||
|
const provider = config.tts.provider || 'elevenlabs';
|
||||||
|
if (provider === 'elevenlabs') {
|
||||||
|
env.ELEVENLABS_MODEL_ID = config.tts.model;
|
||||||
|
} else if (provider === 'openai') {
|
||||||
|
env.OPENAI_TTS_MODEL = config.tts.model;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// API server (server.api is canonical, top-level api is deprecated fallback)
|
// API server (server.api is canonical, top-level api is deprecated fallback)
|
||||||
const apiConfig = config.server.api ?? config.api;
|
const apiConfig = config.server.api ?? config.api;
|
||||||
if (apiConfig?.port !== undefined) {
|
if (apiConfig?.port !== undefined) {
|
||||||
|
|||||||
@@ -177,9 +177,12 @@ export interface LettaBotConfig {
|
|||||||
google?: GoogleConfig;
|
google?: GoogleConfig;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Transcription (voice messages)
|
// Transcription (inbound voice messages)
|
||||||
transcription?: TranscriptionConfig;
|
transcription?: TranscriptionConfig;
|
||||||
|
|
||||||
|
// Text-to-speech (outbound voice memos)
|
||||||
|
tts?: TtsConfig;
|
||||||
|
|
||||||
// Attachment handling
|
// Attachment handling
|
||||||
attachments?: {
|
attachments?: {
|
||||||
maxMB?: number;
|
maxMB?: number;
|
||||||
@@ -195,6 +198,13 @@ export interface LettaBotConfig {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Settings for outbound text-to-speech (voice memo) generation. */
export interface TtsConfig {
  /** TTS backend. Default: 'elevenlabs'. */
  provider?: 'elevenlabs' | 'openai';
  /** Provider API key. Falls back to ELEVENLABS_API_KEY or OPENAI_API_KEY env var. */
  apiKey?: string;
  /** ElevenLabs voice ID or OpenAI voice name. */
  voiceId?: string;
  /** Model ID (provider-specific defaults). */
  model?: string;
}
|
||||||
|
|
||||||
export interface TranscriptionConfig {
|
export interface TranscriptionConfig {
|
||||||
provider: 'openai' | 'mistral';
|
provider: 'openai' | 'mistral';
|
||||||
apiKey?: string; // Falls back to OPENAI_API_KEY or MISTRAL_API_KEY env var
|
apiKey?: string; // Falls back to OPENAI_API_KEY or MISTRAL_API_KEY env var
|
||||||
|
|||||||
@@ -5,15 +5,16 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { createAgent, createSession, resumeSession, imageFromFile, imageFromURL, type Session, type MessageContentItem, type SendMessage, type CanUseToolCallback } from '@letta-ai/letta-code-sdk';
|
import { createAgent, createSession, resumeSession, imageFromFile, imageFromURL, type Session, type MessageContentItem, type SendMessage, type CanUseToolCallback } from '@letta-ai/letta-code-sdk';
|
||||||
import { mkdirSync } from 'node:fs';
|
import { mkdirSync, existsSync } from 'node:fs';
|
||||||
import { access, unlink, realpath, stat, constants } from 'node:fs/promises';
|
import { access, unlink, realpath, stat, constants } from 'node:fs/promises';
|
||||||
|
import { execFile } from 'node:child_process';
|
||||||
import { extname, resolve, join } from 'node:path';
|
import { extname, resolve, join } from 'node:path';
|
||||||
import type { ChannelAdapter } from '../channels/types.js';
|
import type { ChannelAdapter } from '../channels/types.js';
|
||||||
import type { BotConfig, InboundMessage, TriggerContext } from './types.js';
|
import type { BotConfig, InboundMessage, TriggerContext } from './types.js';
|
||||||
import type { AgentSession } from './interfaces.js';
|
import type { AgentSession } from './interfaces.js';
|
||||||
import { Store } from './store.js';
|
import { Store } from './store.js';
|
||||||
import { updateAgentName, getPendingApprovals, rejectApproval, cancelRuns, recoverOrphanedConversationApproval, getLatestRunError } from '../tools/letta-api.js';
|
import { updateAgentName, getPendingApprovals, rejectApproval, cancelRuns, recoverOrphanedConversationApproval, getLatestRunError } from '../tools/letta-api.js';
|
||||||
import { installSkillsToAgent } from '../skills/loader.js';
|
import { installSkillsToAgent, withAgentSkillsOnPath, getAgentSkillExecutableDirs, isVoiceMemoConfigured } from '../skills/loader.js';
|
||||||
import { formatMessageEnvelope, formatGroupBatchEnvelope, type SessionContextOptions } from './formatter.js';
|
import { formatMessageEnvelope, formatGroupBatchEnvelope, type SessionContextOptions } from './formatter.js';
|
||||||
import type { GroupBatcher } from './group-batcher.js';
|
import type { GroupBatcher } from './group-batcher.js';
|
||||||
import { loadMemoryBlocks } from './memory.js';
|
import { loadMemoryBlocks } from './memory.js';
|
||||||
@@ -117,10 +118,16 @@ const IMAGE_FILE_EXTENSIONS = new Set([
|
|||||||
'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff',
|
'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff',
|
||||||
]);
|
]);
|
||||||
|
|
||||||
/** Infer whether a file is an image or generic file based on extension. */
|
const AUDIO_FILE_EXTENSIONS = new Set([
|
||||||
export function inferFileKind(filePath: string): 'image' | 'file' {
|
'.ogg', '.opus', '.mp3', '.m4a', '.wav', '.aac', '.flac',
|
||||||
|
]);
|
||||||
|
|
||||||
|
/** Infer whether a file is an image, audio, or generic file based on extension. */
|
||||||
|
export function inferFileKind(filePath: string): 'image' | 'file' | 'audio' {
|
||||||
const ext = extname(filePath).toLowerCase();
|
const ext = extname(filePath).toLowerCase();
|
||||||
return IMAGE_FILE_EXTENSIONS.has(ext) ? 'image' : 'file';
|
if (IMAGE_FILE_EXTENSIONS.has(ext)) return 'image';
|
||||||
|
if (AUDIO_FILE_EXTENSIONS.has(ext)) return 'audio';
|
||||||
|
return 'file';
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -776,6 +783,59 @@ export class LettaBot implements AgentSession {
|
|||||||
console.warn('[Bot] Directive send-file failed:', err instanceof Error ? err.message : err);
|
console.warn('[Bot] Directive send-file failed:', err instanceof Error ? err.message : err);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (directive.type === 'voice') {
|
||||||
|
if (!isVoiceMemoConfigured()) {
|
||||||
|
log.warn('Directive voice skipped: no TTS credentials configured');
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (typeof adapter.sendFile !== 'function') {
|
||||||
|
log.warn(`Directive voice skipped: ${adapter.name} does not support sendFile`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find lettabot-tts in agent's skill dirs
|
||||||
|
const agentId = this.store.agentId;
|
||||||
|
const skillDirs = agentId ? getAgentSkillExecutableDirs(agentId) : [];
|
||||||
|
const ttsPath = skillDirs
|
||||||
|
.map(dir => join(dir, 'lettabot-tts'))
|
||||||
|
.find(p => existsSync(p));
|
||||||
|
|
||||||
|
if (!ttsPath) {
|
||||||
|
log.warn('Directive voice skipped: lettabot-tts not found in skill dirs');
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const outputPath = await new Promise<string>((resolve, reject) => {
|
||||||
|
execFile(ttsPath, [directive.text], {
|
||||||
|
cwd: this.config.workingDir,
|
||||||
|
env: { ...process.env, LETTABOT_WORKING_DIR: this.config.workingDir },
|
||||||
|
timeout: 30_000,
|
||||||
|
}, (err, stdout, stderr) => {
|
||||||
|
if (err) {
|
||||||
|
reject(new Error(stderr?.trim() || err.message));
|
||||||
|
} else {
|
||||||
|
resolve(stdout.trim());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
await adapter.sendFile({
|
||||||
|
chatId,
|
||||||
|
filePath: outputPath,
|
||||||
|
kind: 'audio',
|
||||||
|
threadId,
|
||||||
|
});
|
||||||
|
acted = true;
|
||||||
|
log.info(`Directive: sent voice memo (${directive.text.length} chars)`);
|
||||||
|
|
||||||
|
// Clean up generated file
|
||||||
|
try { await unlink(outputPath); } catch {}
|
||||||
|
} catch (err) {
|
||||||
|
log.warn('Directive voice failed:', err instanceof Error ? err.message : err);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return acted;
|
return acted;
|
||||||
}
|
}
|
||||||
@@ -864,6 +924,7 @@ export class LettaBot implements AgentSession {
|
|||||||
|
|
||||||
const opts = this.baseSessionOptions(this.sessionCanUseTool);
|
const opts = this.baseSessionOptions(this.sessionCanUseTool);
|
||||||
let session: Session;
|
let session: Session;
|
||||||
|
let sessionAgentId: string | undefined;
|
||||||
|
|
||||||
// In per-channel mode, look up per-key conversation ID.
|
// In per-channel mode, look up per-key conversation ID.
|
||||||
// In shared mode (key === "shared"), use the legacy single conversationId.
|
// In shared mode (key === "shared"), use the legacy single conversationId.
|
||||||
@@ -873,9 +934,15 @@ export class LettaBot implements AgentSession {
|
|||||||
|
|
||||||
if (convId) {
|
if (convId) {
|
||||||
process.env.LETTA_AGENT_ID = this.store.agentId || undefined;
|
process.env.LETTA_AGENT_ID = this.store.agentId || undefined;
|
||||||
|
if (this.store.agentId) {
|
||||||
|
installSkillsToAgent(this.store.agentId, this.config.skills);
|
||||||
|
sessionAgentId = this.store.agentId;
|
||||||
|
}
|
||||||
session = resumeSession(convId, opts);
|
session = resumeSession(convId, opts);
|
||||||
} else if (this.store.agentId) {
|
} else if (this.store.agentId) {
|
||||||
process.env.LETTA_AGENT_ID = this.store.agentId;
|
process.env.LETTA_AGENT_ID = this.store.agentId;
|
||||||
|
installSkillsToAgent(this.store.agentId, this.config.skills);
|
||||||
|
sessionAgentId = this.store.agentId;
|
||||||
session = createSession(this.store.agentId, opts);
|
session = createSession(this.store.agentId, opts);
|
||||||
} else {
|
} else {
|
||||||
// Create new agent -- persist immediately so we don't orphan it on later failures
|
// Create new agent -- persist immediately so we don't orphan it on later failures
|
||||||
@@ -893,6 +960,7 @@ export class LettaBot implements AgentSession {
|
|||||||
updateAgentName(newAgentId, this.config.agentName).catch(() => {});
|
updateAgentName(newAgentId, this.config.agentName).catch(() => {});
|
||||||
}
|
}
|
||||||
installSkillsToAgent(newAgentId, this.config.skills);
|
installSkillsToAgent(newAgentId, this.config.skills);
|
||||||
|
sessionAgentId = newAgentId;
|
||||||
|
|
||||||
session = createSession(newAgentId, opts);
|
session = createSession(newAgentId, opts);
|
||||||
}
|
}
|
||||||
@@ -900,7 +968,14 @@ export class LettaBot implements AgentSession {
|
|||||||
// Initialize eagerly so the subprocess is ready before the first send()
|
// Initialize eagerly so the subprocess is ready before the first send()
|
||||||
log.info(`Initializing session subprocess (key=${key})...`);
|
log.info(`Initializing session subprocess (key=${key})...`);
|
||||||
try {
|
try {
|
||||||
await this.withSessionTimeout(session.initialize(), `Session initialize (key=${key})`);
|
if (sessionAgentId) {
|
||||||
|
await withAgentSkillsOnPath(
|
||||||
|
sessionAgentId,
|
||||||
|
() => this.withSessionTimeout(session.initialize(), `Session initialize (key=${key})`),
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
await this.withSessionTimeout(session.initialize(), `Session initialize (key=${key})`);
|
||||||
|
}
|
||||||
log.info(`Session subprocess ready (key=${key})`);
|
log.info(`Session subprocess ready (key=${key})`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
// Close immediately so failed initialization cannot leak a subprocess.
|
// Close immediately so failed initialization cannot leak a subprocess.
|
||||||
@@ -1680,7 +1755,11 @@ export class LettaBot implements AgentSession {
|
|||||||
msgTypeCounts[streamMsg.type] = (msgTypeCounts[streamMsg.type] || 0) + 1;
|
msgTypeCounts[streamMsg.type] = (msgTypeCounts[streamMsg.type] || 0) + 1;
|
||||||
|
|
||||||
const preview = JSON.stringify(streamMsg).slice(0, 300);
|
const preview = JSON.stringify(streamMsg).slice(0, 300);
|
||||||
log.info(`type=${streamMsg.type} ${preview}`);
|
if (streamMsg.type === 'reasoning' || streamMsg.type === 'assistant') {
|
||||||
|
log.debug(`type=${streamMsg.type} ${preview}`);
|
||||||
|
} else {
|
||||||
|
log.info(`type=${streamMsg.type} ${preview}`);
|
||||||
|
}
|
||||||
|
|
||||||
// stream_event is a low-level streaming primitive (partial deltas), not a
|
// stream_event is a low-level streaming primitive (partial deltas), not a
|
||||||
// semantic type change. Skip it for type-transition logic so it doesn't
|
// semantic type change. Skip it for type-transition logic so it doesn't
|
||||||
@@ -1694,6 +1773,7 @@ export class LettaBot implements AgentSession {
|
|||||||
|
|
||||||
// Flush reasoning buffer when type changes away from reasoning
|
// Flush reasoning buffer when type changes away from reasoning
|
||||||
if (isSemanticType && lastMsgType === 'reasoning' && streamMsg.type !== 'reasoning' && reasoningBuffer.trim()) {
|
if (isSemanticType && lastMsgType === 'reasoning' && streamMsg.type !== 'reasoning' && reasoningBuffer.trim()) {
|
||||||
|
log.info(`Reasoning: ${reasoningBuffer.trim()}`);
|
||||||
if (this.config.display?.showReasoning && !suppressDelivery) {
|
if (this.config.display?.showReasoning && !suppressDelivery) {
|
||||||
try {
|
try {
|
||||||
const reasoning = this.formatReasoningDisplay(reasoningBuffer, adapter.id);
|
const reasoning = this.formatReasoningDisplay(reasoningBuffer, adapter.id);
|
||||||
@@ -2180,7 +2260,7 @@ export class LettaBot implements AgentSession {
|
|||||||
options: {
|
options: {
|
||||||
text?: string;
|
text?: string;
|
||||||
filePath?: string;
|
filePath?: string;
|
||||||
kind?: 'image' | 'file';
|
kind?: 'image' | 'file' | 'audio';
|
||||||
}
|
}
|
||||||
): Promise<string | undefined> {
|
): Promise<string | undefined> {
|
||||||
const adapter = this.channels.get(channelId);
|
const adapter = this.channels.get(channelId);
|
||||||
|
|||||||
@@ -77,6 +77,14 @@ describe('parseDirectives', () => {
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('parses send-file directive with audio kind', () => {
|
||||||
|
const result = parseDirectives('<actions><send-file path="voice.ogg" kind="audio" /></actions>');
|
||||||
|
expect(result.cleanText).toBe('');
|
||||||
|
expect(result.directives).toEqual([
|
||||||
|
{ type: 'send-file', path: 'voice.ogg', kind: 'audio' },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
it('parses send-file directive with cleanup attribute', () => {
|
it('parses send-file directive with cleanup attribute', () => {
|
||||||
const result = parseDirectives('<actions><send-file path="/tmp/report.pdf" cleanup="true" /></actions>');
|
const result = parseDirectives('<actions><send-file path="/tmp/report.pdf" cleanup="true" /></actions>');
|
||||||
expect(result.cleanText).toBe('');
|
expect(result.cleanText).toBe('');
|
||||||
@@ -149,6 +157,57 @@ describe('parseDirectives', () => {
|
|||||||
expect(result.cleanText).toBe('');
|
expect(result.cleanText).toBe('');
|
||||||
expect(result.directives).toEqual([]);
|
expect(result.directives).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('parses voice directive with text content', () => {
|
||||||
|
const result = parseDirectives('<actions><voice>Hello from a voice memo</voice></actions>');
|
||||||
|
expect(result.cleanText).toBe('');
|
||||||
|
expect(result.directives).toEqual([{ type: 'voice', text: 'Hello from a voice memo' }]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('parses voice directive with text after actions block', () => {
|
||||||
|
const result = parseDirectives('<actions><voice>Here is a voice note</voice></actions>\nHere\'s the audio!');
|
||||||
|
expect(result.cleanText).toBe("Here's the audio!");
|
||||||
|
expect(result.directives).toEqual([{ type: 'voice', text: 'Here is a voice note' }]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('parses voice directive with multiline text', () => {
|
||||||
|
const result = parseDirectives('<actions><voice>Line one.\nLine two.</voice></actions>');
|
||||||
|
expect(result.cleanText).toBe('');
|
||||||
|
expect(result.directives).toEqual([{ type: 'voice', text: 'Line one.\nLine two.' }]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('ignores empty voice directive', () => {
|
||||||
|
const result = parseDirectives('<actions><voice> </voice></actions>');
|
||||||
|
expect(result.cleanText).toBe('');
|
||||||
|
expect(result.directives).toEqual([]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('parses voice and react directives together', () => {
|
||||||
|
const result = parseDirectives('<actions><react emoji="🎤" /><voice>Check this out</voice></actions>');
|
||||||
|
expect(result.directives).toHaveLength(2);
|
||||||
|
expect(result.directives[0]).toEqual({ type: 'react', emoji: '🎤' });
|
||||||
|
expect(result.directives[1]).toEqual({ type: 'voice', text: 'Check this out' });
|
||||||
|
});
|
||||||
|
|
||||||
|
it('preserves order when voice appears before react', () => {
|
||||||
|
const result = parseDirectives('<actions><voice>First</voice><react emoji="🎤" /></actions>');
|
||||||
|
expect(result.directives).toEqual([
|
||||||
|
{ type: 'voice', text: 'First' },
|
||||||
|
{ type: 'react', emoji: '🎤' },
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('preserves mixed directive order across voice and self-closing tags', () => {
|
||||||
|
const result = parseDirectives(
|
||||||
|
'<actions><send-file path="a.pdf" /><voice>One</voice><react emoji="👍" /><voice>Two</voice></actions>',
|
||||||
|
);
|
||||||
|
expect(result.directives).toEqual([
|
||||||
|
{ type: 'send-file', path: 'a.pdf' },
|
||||||
|
{ type: 'voice', text: 'One' },
|
||||||
|
{ type: 'react', emoji: '👍' },
|
||||||
|
{ type: 'voice', text: 'Two' },
|
||||||
|
]);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('stripActionsBlock', () => {
|
describe('stripActionsBlock', () => {
|
||||||
|
|||||||
@@ -26,12 +26,17 @@ export interface SendFileDirective {
|
|||||||
type: 'send-file';
|
type: 'send-file';
|
||||||
path: string;
|
path: string;
|
||||||
caption?: string;
|
caption?: string;
|
||||||
kind?: 'image' | 'file';
|
kind?: 'image' | 'file' | 'audio';
|
||||||
cleanup?: boolean;
|
cleanup?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface VoiceDirective {
|
||||||
|
type: 'voice';
|
||||||
|
text: string;
|
||||||
|
}
|
||||||
|
|
||||||
// Union type — extend with more directive types later
|
// Union type — extend with more directive types later
|
||||||
export type Directive = ReactDirective | SendFileDirective;
|
export type Directive = ReactDirective | SendFileDirective | VoiceDirective;
|
||||||
|
|
||||||
export interface ParseResult {
|
export interface ParseResult {
|
||||||
cleanText: string;
|
cleanText: string;
|
||||||
@@ -45,10 +50,11 @@ export interface ParseResult {
|
|||||||
const ACTIONS_BLOCK_REGEX = /^\s*<actions>([\s\S]*?)<\/actions>/;
|
const ACTIONS_BLOCK_REGEX = /^\s*<actions>([\s\S]*?)<\/actions>/;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Match self-closing child directive tags inside the actions block.
|
* Match supported directive tags inside the actions block in source order.
|
||||||
* Captures the tag name and the full attributes string.
|
* - Self-closing: <react ... />, <send-file ... />
|
||||||
|
* - Content-bearing: <voice>...</voice>
|
||||||
*/
|
*/
|
||||||
const CHILD_DIRECTIVE_REGEX = /<(react|send-file)\b([^>]*)\/>/g;
|
const DIRECTIVE_TOKEN_REGEX = /<(react|send-file)\b([^>]*)\/>|<voice>([\s\S]*?)<\/voice>/g;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse a single attribute string like: emoji="eyes" message="123"
|
* Parse a single attribute string like: emoji="eyes" message="123"
|
||||||
@@ -73,13 +79,21 @@ function parseChildDirectives(block: string): Directive[] {
|
|||||||
const normalizedBlock = block.replace(/\\(['"])/g, '$1');
|
const normalizedBlock = block.replace(/\\(['"])/g, '$1');
|
||||||
|
|
||||||
// Reset regex state (global flag)
|
// Reset regex state (global flag)
|
||||||
CHILD_DIRECTIVE_REGEX.lastIndex = 0;
|
DIRECTIVE_TOKEN_REGEX.lastIndex = 0;
|
||||||
|
|
||||||
while ((match = CHILD_DIRECTIVE_REGEX.exec(normalizedBlock)) !== null) {
|
while ((match = DIRECTIVE_TOKEN_REGEX.exec(normalizedBlock)) !== null) {
|
||||||
const [, tagName, attrString] = match;
|
const [, tagName, attrString, voiceText] = match;
|
||||||
|
|
||||||
|
if (voiceText !== undefined) {
|
||||||
|
const text = voiceText.trim();
|
||||||
|
if (text) {
|
||||||
|
directives.push({ type: 'voice', text });
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (tagName === 'react') {
|
if (tagName === 'react') {
|
||||||
const attrs = parseAttributes(attrString);
|
const attrs = parseAttributes(attrString || '');
|
||||||
if (attrs.emoji) {
|
if (attrs.emoji) {
|
||||||
directives.push({
|
directives.push({
|
||||||
type: 'react',
|
type: 'react',
|
||||||
@@ -91,11 +105,11 @@ function parseChildDirectives(block: string): Directive[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (tagName === 'send-file') {
|
if (tagName === 'send-file') {
|
||||||
const attrs = parseAttributes(attrString);
|
const attrs = parseAttributes(attrString || '');
|
||||||
const path = attrs.path || attrs.file;
|
const path = attrs.path || attrs.file;
|
||||||
if (!path) continue;
|
if (!path) continue;
|
||||||
const caption = attrs.caption || attrs.text;
|
const caption = attrs.caption || attrs.text;
|
||||||
const kind = attrs.kind === 'image' || attrs.kind === 'file'
|
const kind = attrs.kind === 'image' || attrs.kind === 'file' || attrs.kind === 'audio'
|
||||||
? attrs.kind
|
? attrs.kind
|
||||||
: undefined;
|
: undefined;
|
||||||
const cleanup = attrs.cleanup === 'true';
|
const cleanup = attrs.cleanup === 'true';
|
||||||
|
|||||||
@@ -356,6 +356,7 @@ export function formatMessageEnvelope(
|
|||||||
`- To skip replying: \`<no-reply/>\``,
|
`- To skip replying: \`<no-reply/>\``,
|
||||||
`- To perform actions: wrap in \`<actions>\` at the start of your response`,
|
`- To perform actions: wrap in \`<actions>\` at the start of your response`,
|
||||||
` Example: \`<actions><react emoji="thumbsup" /></actions>Your text here\``,
|
` Example: \`<actions><react emoji="thumbsup" /></actions>Your text here\``,
|
||||||
|
`- To send a voice memo: \`<actions><voice>Your message here</voice></actions>\``,
|
||||||
];
|
];
|
||||||
sections.push(`## Response Directives\n${directiveLines.join('\n')}`);
|
sections.push(`## Response Directives\n${directiveLines.join('\n')}`);
|
||||||
|
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ export class LettaGateway implements AgentRouter {
|
|||||||
async deliverToChannel(
|
async deliverToChannel(
|
||||||
channelId: string,
|
channelId: string,
|
||||||
chatId: string,
|
chatId: string,
|
||||||
options: { text?: string; filePath?: string; kind?: 'image' | 'file' }
|
options: { text?: string; filePath?: string; kind?: 'image' | 'file' | 'audio' }
|
||||||
): Promise<string | undefined> {
|
): Promise<string | undefined> {
|
||||||
// Try each agent until one owns the channel
|
// Try each agent until one owns the channel
|
||||||
for (const [name, session] of this.agents) {
|
for (const [name, session] of this.agents) {
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ export interface AgentSession {
|
|||||||
deliverToChannel(channelId: string, chatId: string, options: {
|
deliverToChannel(channelId: string, chatId: string, options: {
|
||||||
text?: string;
|
text?: string;
|
||||||
filePath?: string;
|
filePath?: string;
|
||||||
kind?: 'image' | 'file';
|
kind?: 'image' | 'file' | 'audio';
|
||||||
}): Promise<string | undefined>;
|
}): Promise<string | undefined>;
|
||||||
|
|
||||||
/** Get agent status */
|
/** Get agent status */
|
||||||
@@ -67,7 +67,7 @@ export interface MessageDeliverer {
|
|||||||
deliverToChannel(channelId: string, chatId: string, options: {
|
deliverToChannel(channelId: string, chatId: string, options: {
|
||||||
text?: string;
|
text?: string;
|
||||||
filePath?: string;
|
filePath?: string;
|
||||||
kind?: 'image' | 'file';
|
kind?: 'image' | 'file' | 'audio';
|
||||||
}): Promise<string | undefined>;
|
}): Promise<string | undefined>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -23,10 +23,21 @@ describe('inferFileKind', () => {
|
|||||||
expect(inferFileKind('/tmp/script.ts')).toBe('file');
|
expect(inferFileKind('/tmp/script.ts')).toBe('file');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('returns audio for common audio extensions', () => {
|
||||||
|
expect(inferFileKind('/tmp/voice.ogg')).toBe('audio');
|
||||||
|
expect(inferFileKind('/tmp/voice.opus')).toBe('audio');
|
||||||
|
expect(inferFileKind('/tmp/voice.mp3')).toBe('audio');
|
||||||
|
expect(inferFileKind('/tmp/voice.m4a')).toBe('audio');
|
||||||
|
expect(inferFileKind('/tmp/voice.wav')).toBe('audio');
|
||||||
|
expect(inferFileKind('/tmp/voice.aac')).toBe('audio');
|
||||||
|
expect(inferFileKind('/tmp/voice.flac')).toBe('audio');
|
||||||
|
});
|
||||||
|
|
||||||
it('is case insensitive', () => {
|
it('is case insensitive', () => {
|
||||||
expect(inferFileKind('/tmp/PHOTO.PNG')).toBe('image');
|
expect(inferFileKind('/tmp/PHOTO.PNG')).toBe('image');
|
||||||
expect(inferFileKind('/tmp/photo.JPG')).toBe('image');
|
expect(inferFileKind('/tmp/photo.JPG')).toBe('image');
|
||||||
expect(inferFileKind('/tmp/photo.Jpeg')).toBe('image');
|
expect(inferFileKind('/tmp/photo.Jpeg')).toBe('image');
|
||||||
|
expect(inferFileKind('/tmp/VOICE.OGG')).toBe('audio');
|
||||||
});
|
});
|
||||||
|
|
||||||
it('returns file for extensionless paths', () => {
|
it('returns file for extensionless paths', () => {
|
||||||
|
|||||||
@@ -36,6 +36,9 @@ lettabot-message send --file /path/to/image.jpg --text "Check this out!"
|
|||||||
# Send file without text (treated as image)
|
# Send file without text (treated as image)
|
||||||
lettabot-message send --file photo.png --image
|
lettabot-message send --file photo.png --image
|
||||||
|
|
||||||
|
# Send voice note
|
||||||
|
lettabot-message send --file voice.ogg --voice
|
||||||
|
|
||||||
# Send to specific channel and chat
|
# Send to specific channel and chat
|
||||||
lettabot-message send --text "Hello!" --channel telegram --chat 123456789
|
lettabot-message send --text "Hello!" --channel telegram --chat 123456789
|
||||||
|
|
||||||
@@ -104,6 +107,8 @@ This sends "Great idea!" and reacts with thumbsup.
|
|||||||
- \`<react emoji="👀" />\` -- react to the message you are responding to. Use the actual emoji character (👀, 👍, ❤️, 🔥, 🎉, 👏).
|
- \`<react emoji="👀" />\` -- react to the message you are responding to. Use the actual emoji character (👀, 👍, ❤️, 🔥, 🎉, 👏).
|
||||||
- \`<react emoji="🔥" message="123" />\` -- react to a specific message by ID.
|
- \`<react emoji="🔥" message="123" />\` -- react to a specific message by ID.
|
||||||
- \`<send-file path="/path/to/file.png" kind="image" caption="..." />\` -- send a file or image to the same channel/chat. File paths are restricted to the configured send-file directory (default: \`data/outbound/\` in the working directory). Paths outside this directory are blocked.
|
- \`<send-file path="/path/to/file.png" kind="image" caption="..." />\` -- send a file or image to the same channel/chat. File paths are restricted to the configured send-file directory (default: \`data/outbound/\` in the working directory). Paths outside this directory are blocked.
|
||||||
|
- \`<send-file path="/path/to/voice.ogg" kind="audio" cleanup="true" />\` -- send a voice note. Audio files (.ogg, .mp3, etc.) are sent as native voice memos on Telegram and WhatsApp. Use \`cleanup="true"\` to delete the file after sending.
|
||||||
|
- \`<voice>Your message here</voice>\` -- generate and send a voice memo. The text is converted to speech via TTS and sent as a native voice note. No tool calls needed. Use for short conversational replies, responding to voice messages, or when the user asks for audio.
|
||||||
|
|
||||||
### Actions-only response
|
### Actions-only response
|
||||||
|
|
||||||
|
|||||||
@@ -108,7 +108,7 @@ export interface OutboundFile {
|
|||||||
filePath: string;
|
filePath: string;
|
||||||
caption?: string;
|
caption?: string;
|
||||||
threadId?: string;
|
threadId?: string;
|
||||||
kind?: 'image' | 'file';
|
kind?: 'image' | 'file' | 'audio';
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -117,6 +117,7 @@ export interface OutboundFile {
|
|||||||
export interface SkillsConfig {
|
export interface SkillsConfig {
|
||||||
cronEnabled?: boolean;
|
cronEnabled?: boolean;
|
||||||
googleEnabled?: boolean;
|
googleEnabled?: boolean;
|
||||||
|
ttsEnabled?: boolean;
|
||||||
additionalSkills?: string[];
|
additionalSkills?: string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -178,6 +178,7 @@ import { CronService } from './cron/service.js';
|
|||||||
import { HeartbeatService } from './cron/heartbeat.js';
|
import { HeartbeatService } from './cron/heartbeat.js';
|
||||||
import { PollingService, parseGmailAccounts } from './polling/service.js';
|
import { PollingService, parseGmailAccounts } from './polling/service.js';
|
||||||
import { agentExists, findAgentByName, ensureNoToolApprovals } from './tools/letta-api.js';
|
import { agentExists, findAgentByName, ensureNoToolApprovals } from './tools/letta-api.js';
|
||||||
|
import { isVoiceMemoConfigured } from './skills/loader.js';
|
||||||
// Skills are now installed to agent-scoped location after agent creation (see bot.ts)
|
// Skills are now installed to agent-scoped location after agent creation (see bot.ts)
|
||||||
|
|
||||||
// Check if config exists (skip in Railway/Docker where env vars are used directly)
|
// Check if config exists (skip in Railway/Docker where env vars are used directly)
|
||||||
@@ -523,6 +524,7 @@ async function main() {
|
|||||||
}
|
}
|
||||||
log.info(`Data directory: ${dataDir}`);
|
log.info(`Data directory: ${dataDir}`);
|
||||||
log.info(`Working directory: ${globalConfig.workingDir}`);
|
log.info(`Working directory: ${globalConfig.workingDir}`);
|
||||||
|
process.env.LETTABOT_WORKING_DIR = globalConfig.workingDir;
|
||||||
|
|
||||||
// Normalize config to agents array
|
// Normalize config to agents array
|
||||||
const agents = normalizeAgents(yamlConfig);
|
const agents = normalizeAgents(yamlConfig);
|
||||||
@@ -551,6 +553,7 @@ async function main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const gateway = new LettaGateway();
|
const gateway = new LettaGateway();
|
||||||
|
const voiceMemoEnabled = isVoiceMemoConfigured();
|
||||||
const services: {
|
const services: {
|
||||||
cronServices: CronService[],
|
cronServices: CronService[],
|
||||||
heartbeatServices: HeartbeatService[],
|
heartbeatServices: HeartbeatService[],
|
||||||
@@ -590,6 +593,7 @@ async function main() {
|
|||||||
skills: {
|
skills: {
|
||||||
cronEnabled: agentConfig.features?.cron ?? globalConfig.cronEnabled,
|
cronEnabled: agentConfig.features?.cron ?? globalConfig.cronEnabled,
|
||||||
googleEnabled: !!agentConfig.integrations?.google?.enabled || !!agentConfig.polling?.gmail?.enabled,
|
googleEnabled: !!agentConfig.integrations?.google?.enabled || !!agentConfig.polling?.gmail?.enabled,
|
||||||
|
ttsEnabled: voiceMemoEnabled,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -771,7 +775,9 @@ async function main() {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
printStartupBanner(bannerAgents);
|
if (!process.env.LETTABOT_NO_BANNER) {
|
||||||
|
printStartupBanner(bannerAgents);
|
||||||
|
}
|
||||||
|
|
||||||
// Shutdown
|
// Shutdown
|
||||||
const shutdown = async () => {
|
const shutdown = async () => {
|
||||||
|
|||||||
@@ -1528,7 +1528,7 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
|
|||||||
if (config.agentChoice === 'new' && !config.agentId) {
|
if (config.agentChoice === 'new' && !config.agentId) {
|
||||||
const { createAgent } = await import('@letta-ai/letta-code-sdk');
|
const { createAgent } = await import('@letta-ai/letta-code-sdk');
|
||||||
const { updateAgentName, ensureNoToolApprovals } = await import('./tools/letta-api.js');
|
const { updateAgentName, ensureNoToolApprovals } = await import('./tools/letta-api.js');
|
||||||
const { installSkillsToAgent } = await import('./skills/loader.js');
|
const { installSkillsToAgent, isVoiceMemoConfigured } = await import('./skills/loader.js');
|
||||||
const { loadMemoryBlocks } = await import('./core/memory.js');
|
const { loadMemoryBlocks } = await import('./core/memory.js');
|
||||||
const { SYSTEM_PROMPT } = await import('./core/system-prompt.js');
|
const { SYSTEM_PROMPT } = await import('./core/system-prompt.js');
|
||||||
|
|
||||||
@@ -1545,9 +1545,11 @@ export async function onboard(options?: { nonInteractive?: boolean }): Promise<v
|
|||||||
if (config.agentName) {
|
if (config.agentName) {
|
||||||
await updateAgentName(agentId, config.agentName).catch(() => {});
|
await updateAgentName(agentId, config.agentName).catch(() => {});
|
||||||
}
|
}
|
||||||
|
const ttsEnv = { ...process.env, ...env };
|
||||||
installSkillsToAgent(agentId, {
|
installSkillsToAgent(agentId, {
|
||||||
cronEnabled: config.cron,
|
cronEnabled: config.cron,
|
||||||
googleEnabled: config.google.enabled,
|
googleEnabled: config.google.enabled,
|
||||||
|
ttsEnabled: isVoiceMemoConfigured(ttsEnv),
|
||||||
});
|
});
|
||||||
|
|
||||||
// Disable tool approvals
|
// Disable tool approvals
|
||||||
|
|||||||
@@ -2,13 +2,14 @@
|
|||||||
* Skills Loader Tests
|
* Skills Loader Tests
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
||||||
import { mkdtempSync, rmSync, mkdirSync, writeFileSync, existsSync, readdirSync } from 'node:fs';
|
import { mkdtempSync, rmSync, mkdirSync, writeFileSync, existsSync, readdirSync } from 'node:fs';
|
||||||
import { tmpdir } from 'node:os';
|
import { tmpdir } from 'node:os';
|
||||||
import { join } from 'node:path';
|
import { join } from 'node:path';
|
||||||
import {
|
import {
|
||||||
getAgentSkillsDir,
|
getAgentSkillsDir,
|
||||||
FEATURE_SKILLS,
|
FEATURE_SKILLS,
|
||||||
|
isVoiceMemoConfigured,
|
||||||
} from './loader.js';
|
} from './loader.js';
|
||||||
|
|
||||||
describe('skills loader', () => {
|
describe('skills loader', () => {
|
||||||
@@ -52,6 +53,23 @@ describe('skills loader', () => {
|
|||||||
expect(FEATURE_SKILLS.google).toContain('gog');
|
expect(FEATURE_SKILLS.google).toContain('gog');
|
||||||
expect(FEATURE_SKILLS.google).toContain('google');
|
expect(FEATURE_SKILLS.google).toContain('google');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('has tts feature with voice-memo skill', () => {
|
||||||
|
expect(FEATURE_SKILLS.tts).toBeDefined();
|
||||||
|
expect(FEATURE_SKILLS.tts).toContain('voice-memo');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('isVoiceMemoConfigured', () => {
|
||||||
|
it('defaults to elevenlabs and requires ELEVENLABS_API_KEY', () => {
|
||||||
|
expect(isVoiceMemoConfigured({})).toBe(false);
|
||||||
|
expect(isVoiceMemoConfigured({ ELEVENLABS_API_KEY: 'test' })).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('supports openai provider and requires OPENAI_API_KEY', () => {
|
||||||
|
expect(isVoiceMemoConfigured({ TTS_PROVIDER: 'openai' })).toBe(false);
|
||||||
|
expect(isVoiceMemoConfigured({ TTS_PROVIDER: 'openai', OPENAI_API_KEY: 'test' })).toBe(true);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('installSkillsToAgent', () => {
|
describe('installSkillsToAgent', () => {
|
||||||
@@ -145,4 +163,53 @@ describe('skills loader', () => {
|
|||||||
expect(content).toBe('target version');
|
expect(content).toBe('target version');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('loadAllSkills precedence', () => {
|
||||||
|
it('prefers global skills over bundled skills for the same name', async () => {
|
||||||
|
const originalHome = process.env.HOME;
|
||||||
|
const originalUserProfile = process.env.USERPROFILE;
|
||||||
|
const originalCwd = process.cwd();
|
||||||
|
const tempHome = mkdtempSync(join(tmpdir(), 'lettabot-home-test-'));
|
||||||
|
const tempProject = mkdtempSync(join(tmpdir(), 'lettabot-project-test-'));
|
||||||
|
|
||||||
|
try {
|
||||||
|
process.env.HOME = tempHome;
|
||||||
|
process.env.USERPROFILE = tempHome;
|
||||||
|
process.chdir(tempProject);
|
||||||
|
|
||||||
|
const globalVoiceMemoDir = join(tempHome, '.letta', 'skills', 'voice-memo');
|
||||||
|
mkdirSync(globalVoiceMemoDir, { recursive: true });
|
||||||
|
writeFileSync(
|
||||||
|
join(globalVoiceMemoDir, 'SKILL.md'),
|
||||||
|
[
|
||||||
|
'---',
|
||||||
|
'name: voice-memo',
|
||||||
|
'description: global override',
|
||||||
|
'---',
|
||||||
|
'',
|
||||||
|
'# Global override',
|
||||||
|
'',
|
||||||
|
].join('\n'),
|
||||||
|
);
|
||||||
|
|
||||||
|
vi.resetModules();
|
||||||
|
const mod = await import('./loader.js');
|
||||||
|
const skills = mod.loadAllSkills();
|
||||||
|
const voiceMemo = skills.find((skill: any) => skill.name === 'voice-memo');
|
||||||
|
const expectedPath = join(tempHome, '.letta', 'skills', 'voice-memo', 'SKILL.md');
|
||||||
|
|
||||||
|
expect(voiceMemo).toBeDefined();
|
||||||
|
expect(voiceMemo!.description).toBe('global override');
|
||||||
|
expect(voiceMemo!.filePath).toContain(expectedPath);
|
||||||
|
} finally {
|
||||||
|
process.chdir(originalCwd);
|
||||||
|
if (originalHome === undefined) delete process.env.HOME;
|
||||||
|
else process.env.HOME = originalHome;
|
||||||
|
if (originalUserProfile === undefined) delete process.env.USERPROFILE;
|
||||||
|
else process.env.USERPROFILE = originalUserProfile;
|
||||||
|
rmSync(tempHome, { recursive: true, force: true });
|
||||||
|
rmSync(tempProject, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
import { existsSync, readdirSync, readFileSync, mkdirSync, cpSync } from 'node:fs';
|
import { existsSync, readdirSync, readFileSync, mkdirSync, cpSync } from 'node:fs';
|
||||||
import { execSync } from 'node:child_process';
|
import { execSync } from 'node:child_process';
|
||||||
import { join, resolve } from 'node:path';
|
import { join, resolve, delimiter } from 'node:path';
|
||||||
import matter from 'gray-matter';
|
import matter from 'gray-matter';
|
||||||
import type { SkillEntry, ClawdbotMetadata } from './types.js';
|
import type { SkillEntry, ClawdbotMetadata } from './types.js';
|
||||||
|
|
||||||
@@ -30,6 +30,91 @@ export function getAgentSkillsDir(agentId: string): string {
|
|||||||
return join(HOME, '.letta', 'agents', agentId, 'skills');
|
return join(HOME, '.letta', 'agents', agentId, 'skills');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolve subdirectories of `skillsDir` that contain executable skill files.
 *
 * A subdirectory qualifies when it holds at least one entry that is neither a
 * nested directory nor a Markdown (`.md`) file. Discovery is best-effort: a
 * root or subdirectory that cannot be listed (missing, not a directory, or
 * unreadable) contributes nothing instead of throwing.
 *
 * @param skillsDir - Directory whose immediate subdirectories are inspected.
 * @returns Paths of the qualifying subdirectories, each joined onto `skillsDir`.
 */
function resolveSkillExecutableDirs(skillsDir: string): string[] {
  // List a directory, treating any filesystem error as "no entries".
  const listEntries = (dir: string) => {
    try {
      return readdirSync(dir, { withFileTypes: true });
    } catch {
      return [];
    }
  };

  return listEntries(skillsDir)
    .filter((entry) => entry.isDirectory())
    .map((entry) => join(skillsDir, entry.name))
    .filter((dir) =>
      // Nested directories are skipped: only entries directly inside `dir`
      // that are not Markdown docs make it worth exposing.
      listEntries(dir).some((child) => !child.isDirectory() && !child.name.endsWith('.md')),
    );
}
|
||||||
|
|
||||||
|
/**
 * List the executable skill directories belonging to one agent.
 *
 * @param agentId - Agent whose skills directory is looked up.
 * @returns The agent's executable skill directories, or an empty array when
 *   the agent's skills directory does not exist.
 */
export function getAgentSkillExecutableDirs(agentId: string): string[] {
  const agentSkillsRoot = getAgentSkillsDir(agentId);
  return existsSync(agentSkillsRoot) ? resolveSkillExecutableDirs(agentSkillsRoot) : [];
}
|
||||||
|
|
||||||
|
/**
 * Tail of the PATH-mutation queue: settles once the most recently queued
 * operation has finished.
 *
 * PATH is process-global, so PATH mutations are serialized through this queue
 * to avoid races when multiple sessions initialize concurrently.
 */
let _pathMutationQueue: Promise<void> = Promise.resolve();

/**
 * Run `fn` after every previously queued operation has finished, and make
 * later callers wait until `fn` itself has settled.
 *
 * The queue slot is released in a `finally`, so a rejecting `fn` does not
 * block the operations queued behind it.
 *
 * @param fn - Async operation to run while holding the lock.
 * @returns Whatever `fn` resolves to; rejects with whatever `fn` rejects with.
 */
async function withPathMutationLock<T>(fn: () => Promise<T>): Promise<T> {
  const previous = _pathMutationQueue;
  let release!: () => void;
  // Publish our own slot before awaiting, so callers arriving in the meantime
  // line up behind this operation rather than behind `previous`.
  _pathMutationQueue = new Promise<void>((unlock) => {
    release = unlock;
  });

  await previous;
  try {
    return await fn();
  } finally {
    release();
  }
}
|
||||||
|
|
||||||
|
/**
 * Temporarily prepend an agent's executable skill directories to PATH for one
 * async operation, then restore PATH to its prior state.
 *
 * PATH is process-global, so the mutation runs under the PATH-mutation lock to
 * avoid races when multiple sessions initialize concurrently. When the agent
 * has no executable skill directories, `fn` runs immediately without taking
 * the lock or touching PATH.
 *
 * @param agentId - Agent whose skill directories should be exposed.
 * @param fn - Async operation to run while the directories are on PATH.
 * @returns Whatever `fn` resolves to; rejects with whatever `fn` rejects with.
 */
export async function withAgentSkillsOnPath<T>(agentId: string, fn: () => Promise<T>): Promise<T> {
  const skillDirs = getAgentSkillExecutableDirs(agentId);
  if (skillDirs.length === 0) {
    return fn();
  }

  return withPathMutationLock(async () => {
    // Keep the raw value (possibly undefined) so that an unset PATH is
    // restored as unset rather than as an empty string.
    const savedPath = process.env.PATH;
    const originalParts = (savedPath ?? '').split(delimiter).filter(Boolean);
    const existing = new Set(originalParts);
    // Skip directories that are already on PATH.
    const prepend = skillDirs.filter((dir) => !existing.has(dir));

    if (prepend.length > 0) {
      process.env.PATH = [...prepend, ...originalParts].join(delimiter);
      log.info(`Added ${prepend.length} skill dir(s) to PATH: ${prepend.join(', ')}`);
    }

    try {
      return await fn();
    } finally {
      // Assigning undefined to a process.env key would store the string
      // "undefined", so the variable has to be deleted instead.
      if (savedPath === undefined) {
        delete process.env.PATH;
      } else {
        process.env.PATH = savedPath;
      }
    }
  });
}
|
||||||
|
|
||||||
|
/**
 * Report whether the selected TTS provider has an API key configured, which
 * is the minimum needed to use the voice-memo skill.
 *
 * The provider comes from `TTS_PROVIDER` (case-insensitive) and falls back to
 * `elevenlabs` when that variable is unset or empty. Providers other than
 * `openai` and `elevenlabs` are reported as not configured.
 *
 * @param env - Environment to inspect; defaults to `process.env`.
 * @returns `true` when the selected provider's API key is a non-empty value.
 */
export function isVoiceMemoConfigured(env: NodeJS.ProcessEnv = process.env): boolean {
  const providerName = (env.TTS_PROVIDER || 'elevenlabs').toLowerCase();

  switch (providerName) {
    case 'openai':
      return Boolean(env.OPENAI_API_KEY);
    case 'elevenlabs':
      return Boolean(env.ELEVENLABS_API_KEY);
    default:
      return false;
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if a binary exists on PATH
|
* Check if a binary exists on PATH
|
||||||
*/
|
*/
|
||||||
@@ -158,7 +243,10 @@ export function loadAllSkills(agentId?: string | null): SkillEntry[] {
|
|||||||
// skills.sh global installs (lowest priority)
|
// skills.sh global installs (lowest priority)
|
||||||
dirs.push(SKILLS_SH_DIR);
|
dirs.push(SKILLS_SH_DIR);
|
||||||
|
|
||||||
// Global skills
|
// Bundled skills (ship with the project in skills/)
|
||||||
|
dirs.push(BUNDLED_SKILLS_DIR);
|
||||||
|
|
||||||
|
// Global skills (override bundled defaults)
|
||||||
dirs.push(GLOBAL_SKILLS_DIR);
|
dirs.push(GLOBAL_SKILLS_DIR);
|
||||||
|
|
||||||
// Agent-scoped skills (middle priority)
|
// Agent-scoped skills (middle priority)
|
||||||
@@ -208,6 +296,7 @@ function installSkillsFromDir(sourceDir: string, targetDir: string): string[] {
|
|||||||
export const FEATURE_SKILLS: Record<string, string[]> = {
|
export const FEATURE_SKILLS: Record<string, string[]> = {
|
||||||
cron: ['scheduling'], // Scheduling handles both one-off reminders and recurring cron jobs
|
cron: ['scheduling'], // Scheduling handles both one-off reminders and recurring cron jobs
|
||||||
google: ['gog', 'google'], // Installed when Google/Gmail is configured
|
google: ['gog', 'google'], // Installed when Google/Gmail is configured
|
||||||
|
tts: ['voice-memo'], // Voice memo replies via lettabot-tts helper
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -242,6 +331,7 @@ function installSpecificSkills(
|
|||||||
export interface SkillsInstallConfig {
|
export interface SkillsInstallConfig {
|
||||||
cronEnabled?: boolean;
|
cronEnabled?: boolean;
|
||||||
googleEnabled?: boolean; // Gmail polling or Google integration
|
googleEnabled?: boolean; // Gmail polling or Google integration
|
||||||
|
ttsEnabled?: boolean; // Voice memo replies via TTS providers
|
||||||
additionalSkills?: string[]; // Explicitly enabled skills
|
additionalSkills?: string[]; // Explicitly enabled skills
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -261,22 +351,29 @@ export function installSkillsToWorkingDir(workingDir: string, config: SkillsInst
|
|||||||
mkdirSync(targetDir, { recursive: true });
|
mkdirSync(targetDir, { recursive: true });
|
||||||
|
|
||||||
// Collect skills to install based on enabled features
|
// Collect skills to install based on enabled features
|
||||||
const skillsToInstall: string[] = [];
|
const requestedSkills: string[] = [];
|
||||||
|
|
||||||
// Cron skills (always if cron is enabled)
|
// Cron skills (always if cron is enabled)
|
||||||
if (config.cronEnabled) {
|
if (config.cronEnabled) {
|
||||||
skillsToInstall.push(...FEATURE_SKILLS.cron);
|
requestedSkills.push(...FEATURE_SKILLS.cron);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Google skills (if Gmail polling or Google is configured)
|
// Google skills (if Gmail polling or Google is configured)
|
||||||
if (config.googleEnabled) {
|
if (config.googleEnabled) {
|
||||||
skillsToInstall.push(...FEATURE_SKILLS.google);
|
requestedSkills.push(...FEATURE_SKILLS.google);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Voice memo skill (if TTS is configured)
|
||||||
|
if (config.ttsEnabled) {
|
||||||
|
requestedSkills.push(...FEATURE_SKILLS.tts);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Additional explicitly enabled skills
|
// Additional explicitly enabled skills
|
||||||
if (config.additionalSkills?.length) {
|
if (config.additionalSkills?.length) {
|
||||||
skillsToInstall.push(...config.additionalSkills);
|
requestedSkills.push(...config.additionalSkills);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const skillsToInstall = Array.from(new Set(requestedSkills));
|
||||||
|
|
||||||
if (skillsToInstall.length === 0) {
|
if (skillsToInstall.length === 0) {
|
||||||
log.info('No feature-gated skills to install');
|
log.info('No feature-gated skills to install');
|
||||||
@@ -310,22 +407,29 @@ export function installSkillsToAgent(agentId: string, config: SkillsInstallConfi
|
|||||||
mkdirSync(targetDir, { recursive: true });
|
mkdirSync(targetDir, { recursive: true });
|
||||||
|
|
||||||
// Collect skills to install based on enabled features
|
// Collect skills to install based on enabled features
|
||||||
const skillsToInstall: string[] = [];
|
const requestedSkills: string[] = [];
|
||||||
|
|
||||||
// Cron skills (always if cron is enabled)
|
// Cron skills (always if cron is enabled)
|
||||||
if (config.cronEnabled) {
|
if (config.cronEnabled) {
|
||||||
skillsToInstall.push(...FEATURE_SKILLS.cron);
|
requestedSkills.push(...FEATURE_SKILLS.cron);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Google skills (if Gmail polling or Google is configured)
|
// Google skills (if Gmail polling or Google is configured)
|
||||||
if (config.googleEnabled) {
|
if (config.googleEnabled) {
|
||||||
skillsToInstall.push(...FEATURE_SKILLS.google);
|
requestedSkills.push(...FEATURE_SKILLS.google);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Voice memo skill (if TTS is configured)
|
||||||
|
if (config.ttsEnabled) {
|
||||||
|
requestedSkills.push(...FEATURE_SKILLS.tts);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Additional explicitly enabled skills
|
// Additional explicitly enabled skills
|
||||||
if (config.additionalSkills?.length) {
|
if (config.additionalSkills?.length) {
|
||||||
skillsToInstall.push(...config.additionalSkills);
|
requestedSkills.push(...config.additionalSkills);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const skillsToInstall = Array.from(new Set(requestedSkills));
|
||||||
|
|
||||||
if (skillsToInstall.length === 0) {
|
if (skillsToInstall.length === 0) {
|
||||||
return; // No skills to install - silent return
|
return; // No skills to install - silent return
|
||||||
|
|||||||
Reference in New Issue
Block a user