fix: harden voice memo delivery diagnostics (#536)
Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
@@ -187,7 +187,28 @@ All environment variables can be overridden by the equivalent YAML config fields
|
||||
|
||||
1. Check that a TTS provider is configured -- either in `lettabot.yaml` under `tts` or via `ELEVENLABS_API_KEY` / `OPENAI_API_KEY`
|
||||
2. Check that `jq` and `curl` are installed (required by the `lettabot-tts` script)
|
||||
3. Check logs for TTS API errors (HTTP status codes, rate limits)
|
||||
3. Check logs for voice pipeline events:
|
||||
- `[Bot] Directive voice: generating memo (...)`
|
||||
- `[Bot] Directive voice: generated file ...`
|
||||
- `[Bot] Directive voice failed: ...`
|
||||
- `[Telegram] sendVoice failed, falling back to sendAudio: ...`
|
||||
4. Check logs for TTS API errors (HTTP status codes, rate limits)
|
||||
|
||||
### Docker checklist for voice
|
||||
|
||||
For container images, ensure these binaries are available:
|
||||
|
||||
- `bash` (required by `lettabot-tts` shebang)
|
||||
- `curl` and `jq` (required for TTS API calls)
|
||||
- `ffmpeg` (recommended for full inbound voice transcription compatibility)
|
||||
- `ca-certificates` (required for HTTPS API calls)
|
||||
|
||||
Quick runtime validation from inside the container:
|
||||
|
||||
```bash
|
||||
which bash curl jq ffmpeg
|
||||
lettabot-tts "TTS health check"
|
||||
```
|
||||
|
||||
### Telegram voice privacy
|
||||
|
||||
|
||||
@@ -27,6 +27,20 @@ OUTBOUND_DIR="${LETTABOT_WORKING_DIR:-$(pwd)}/data/outbound"
|
||||
|
||||
PROVIDER="${TTS_PROVIDER:-elevenlabs}"
|
||||
|
||||
# Abort the script unless the named command can be resolved by the shell.
#
# Arguments:
#   $1 - name of the command to look up.
# Outputs:
#   On failure, writes a one-line error to stderr.
# Exit status:
#   Returns 0 when the command is found; otherwise exits the script with 1.
require_cmd() {
  local cmd_name="$1"

  # Happy path first: a resolvable command means there is nothing to report.
  command -v "$cmd_name" >/dev/null 2>&1 && return 0

  echo "Error: Required command '$cmd_name' is not installed or not on PATH" >&2
  exit 1
}
|
||||
|
||||
# Verify that the external tools this script depends on (curl, jq) are
# available before any work is done. Delegates to require_cmd, which exits
# the script with status 1 on the first missing tool.
preflight() {
  local tool
  for tool in curl jq; do
    require_cmd "$tool"
  done
}
|
||||
|
||||
preflight
|
||||
|
||||
# Ensure output directory exists
|
||||
mkdir -p "$OUTBOUND_DIR"
|
||||
|
||||
@@ -52,7 +66,7 @@ tts_elevenlabs() {
|
||||
local model_id="${ELEVENLABS_MODEL_ID:-eleven_multilingual_v2}"
|
||||
|
||||
local http_code
|
||||
http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \
|
||||
http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
|
||||
"https://api.elevenlabs.io/v1/text-to-speech/${voice_id}" \
|
||||
-H "xi-api-key: ${ELEVENLABS_API_KEY}" \
|
||||
-H "Content-Type: application/json" \
|
||||
@@ -67,13 +81,21 @@ tts_elevenlabs() {
|
||||
)")
|
||||
|
||||
if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
|
||||
echo "Error: ElevenLabs API returned HTTP $http_code" >&2
|
||||
if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
|
||||
cat "$OUTPUT" >&2
|
||||
echo "Error: ElevenLabs API returned HTTP $http_code (model=$model_id voice_id=$voice_id)" >&2
|
||||
if [ -s "$OUTPUT" ]; then
|
||||
echo "Error response preview:" >&2
|
||||
head -c 2000 "$OUTPUT" >&2 || true
|
||||
echo >&2
|
||||
fi
|
||||
rm -f "$OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -s "$OUTPUT" ]; then
|
||||
echo "Error: ElevenLabs TTS response was empty" >&2
|
||||
rm -f "$OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -89,7 +111,7 @@ tts_openai() {
|
||||
local model="${OPENAI_TTS_MODEL:-tts-1}"
|
||||
|
||||
local http_code
|
||||
http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \
|
||||
http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
|
||||
"https://api.openai.com/v1/audio/speech" \
|
||||
-H "Authorization: Bearer ${OPENAI_API_KEY}" \
|
||||
-H "Content-Type: application/json" \
|
||||
@@ -106,13 +128,21 @@ tts_openai() {
|
||||
)")
|
||||
|
||||
if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
|
||||
echo "Error: OpenAI TTS API returned HTTP $http_code" >&2
|
||||
if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
|
||||
cat "$OUTPUT" >&2
|
||||
echo "Error: OpenAI TTS API returned HTTP $http_code (model=$model voice=$voice)" >&2
|
||||
if [ -s "$OUTPUT" ]; then
|
||||
echo "Error response preview:" >&2
|
||||
head -c 2000 "$OUTPUT" >&2 || true
|
||||
echo >&2
|
||||
fi
|
||||
rm -f "$OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -s "$OUTPUT" ]; then
|
||||
echo "Error: OpenAI TTS response was empty" >&2
|
||||
rm -f "$OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -32,3 +32,49 @@ describe('TelegramAdapter reactions', () => {
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
// Covers how sendFile reacts when sendVoice rejects: a VOICE_MESSAGES_FORBIDDEN
// rejection is retried through sendAudio, while any other failure is rethrown
// untouched and sendAudio is never called.
describe('TelegramAdapter audio fallback', () => {
  // Builds a fresh copy of the outbound audio payload used by every case.
  const buildVoiceFile = () => ({
    chatId: '123',
    filePath: '/tmp/voice.ogg',
    kind: 'audio' as const,
  });

  afterEach(() => {
    // Undo the API spies so cases cannot leak stubs into each other.
    vi.restoreAllMocks();
  });

  it('falls back to sendAudio for VOICE_MESSAGES_FORBIDDEN errors', async () => {
    const adapter = new TelegramAdapter({ token: 'test-token' });
    const forbidden = { description: 'Bad Request: VOICE_MESSAGES_FORBIDDEN' };
    const sendVoice = vi.spyOn(adapter.getBot().api, 'sendVoice').mockRejectedValue(forbidden as any);
    const sendAudio = vi.spyOn(adapter.getBot().api, 'sendAudio').mockResolvedValue({ message_id: 987 } as any);

    const result = await adapter.sendFile(buildVoiceFile());

    expect(sendVoice).toHaveBeenCalledTimes(1);
    expect(sendAudio).toHaveBeenCalledTimes(1);
    expect(result).toEqual({ messageId: '987' });
  });

  it('does not fall back to sendAudio for non-voice transport failures', async () => {
    const adapter = new TelegramAdapter({ token: 'test-token' });
    const timeoutError = new Error('socket hang up');
    const sendVoice = vi.spyOn(adapter.getBot().api, 'sendVoice').mockRejectedValue(timeoutError);
    const sendAudio = vi.spyOn(adapter.getBot().api, 'sendAudio').mockResolvedValue({ message_id: 999 } as any);

    // The very same error instance must surface to the caller.
    const pending = adapter.sendFile(buildVoiceFile());
    await expect(pending).rejects.toBe(timeoutError);

    expect(sendVoice).toHaveBeenCalledTimes(1);
    expect(sendAudio).not.toHaveBeenCalled();
  });
});
|
||||
|
||||
@@ -26,6 +26,27 @@ import { resolveDailyLimits, checkDailyLimit, type GroupModeConfig } from './gro
|
||||
import { createLogger } from '../logger.js';
|
||||
|
||||
const log = createLogger('Telegram');
|
||||
|
||||
/**
 * Extracts a human-readable failure reason from an arbitrary thrown value.
 *
 * Preference order:
 *  1. a non-blank string `description` property,
 *  2. a non-blank string `message` property,
 *  3. the value's own string coercion,
 *  4. a JSON rendering, when the coercion is only the uninformative
 *     `"[object Object]"` (plain objects without either property).
 *
 * @param err - The caught value; may be anything, including `null`/`undefined`.
 * @returns A string suitable for logging. Never throws.
 */
function getTelegramErrorReason(err: unknown): string {
  if (err && typeof err === 'object') {
    const maybeError = err as { description?: unknown; message?: unknown };
    for (const candidate of [maybeError.description, maybeError.message]) {
      if (typeof candidate === 'string' && candidate.trim().length > 0) {
        return candidate;
      }
    }
  }

  const opaqueObject = '[object Object]';
  let text: string;
  try {
    text = String(err);
  } catch {
    // Null-prototype objects have no toString and make String() throw.
    text = opaqueObject;
  }
  if (text !== opaqueObject) {
    return text;
  }

  // "[object Object]" tells a log reader nothing; show the payload instead.
  try {
    const serialized = JSON.stringify(err);
    if (typeof serialized === 'string') {
      return serialized;
    }
  } catch {
    // Circular references or BigInt values cannot be serialized; keep the
    // generic text rather than failing while reporting a failure.
  }
  return text;
}
|
||||
|
||||
/**
 * Reports whether a failed voice send should be retried as a regular audio
 * message.
 *
 * Only a value carrying a string `description` that contains
 * `VOICE_MESSAGES_FORBIDDEN` qualifies; every other value (primitives,
 * `null`, errors without that description) yields `false`.
 *
 * @param err - The value caught from the failed send.
 * @returns `true` only for the voice-messages-forbidden rejection.
 */
function shouldFallbackToAudio(err: unknown): boolean {
  const isObjectLike = typeof err === 'object' && err !== null;
  if (!isObjectLike) return false;

  const { description } = err as { description?: unknown };
  return typeof description === 'string' && description.includes('VOICE_MESSAGES_FORBIDDEN');
}
|
||||
|
||||
export interface TelegramConfig {
|
||||
token: string;
|
||||
dmPolicy?: DmPolicy; // 'pairing' (default), 'allowlist', or 'open'
|
||||
@@ -593,13 +614,21 @@ export class TelegramAdapter implements ChannelAdapter {
|
||||
const result = await this.bot.api.sendVoice(file.chatId, input, { caption });
|
||||
return { messageId: String(result.message_id) };
|
||||
} catch (err: any) {
|
||||
// Fall back to sendAudio if voice messages are restricted (Telegram Premium privacy setting)
|
||||
if (err?.description?.includes('VOICE_MESSAGES_FORBIDDEN')) {
|
||||
log.warn('sendVoice forbidden, falling back to sendAudio');
|
||||
const reason = getTelegramErrorReason(err);
|
||||
// Only retry with sendAudio for deterministic voice-policy rejections.
|
||||
// For network/timeout errors we rethrow to avoid possible duplicate sends.
|
||||
if (!shouldFallbackToAudio(err)) {
|
||||
throw err;
|
||||
}
|
||||
log.warn('sendVoice failed with VOICE_MESSAGES_FORBIDDEN, falling back to sendAudio:', reason);
|
||||
try {
|
||||
const result = await this.bot.api.sendAudio(file.chatId, new InputFile(file.filePath), { caption });
|
||||
return { messageId: String(result.message_id) };
|
||||
} catch (fallbackErr: any) {
|
||||
const fallbackReason = getTelegramErrorReason(fallbackErr);
|
||||
log.error('sendAudio fallback also failed:', fallbackReason);
|
||||
throw fallbackErr;
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -461,11 +461,24 @@ export class LettaBot implements AgentSession {
|
||||
.map(dir => join(dir, 'lettabot-tts'))
|
||||
.find(p => existsSync(p));
|
||||
|
||||
const ttsProvider = (process.env.TTS_PROVIDER || 'elevenlabs').toLowerCase();
|
||||
const ttsVoice = ttsProvider === 'openai'
|
||||
? (process.env.OPENAI_TTS_VOICE || 'alloy')
|
||||
: (process.env.ELEVENLABS_VOICE_ID || 'onwK4e9ZLuTAKqWW03F9');
|
||||
const ttsModel = ttsProvider === 'openai'
|
||||
? (process.env.OPENAI_TTS_MODEL || 'tts-1')
|
||||
: (process.env.ELEVENLABS_MODEL_ID || 'eleven_multilingual_v2');
|
||||
|
||||
if (!ttsPath) {
|
||||
log.warn('Directive voice skipped: lettabot-tts not found in skill dirs');
|
||||
continue;
|
||||
}
|
||||
|
||||
log.info(
|
||||
`Directive voice: generating memo (provider=${ttsProvider}, model=${ttsModel}, voice=${ttsVoice}, textLen=${directive.text.length})`,
|
||||
);
|
||||
log.info(`Directive voice: helper=${ttsPath}`);
|
||||
|
||||
try {
|
||||
const outputPath = await new Promise<string>((resolve, reject) => {
|
||||
execFile(ttsPath, [directive.text], {
|
||||
@@ -474,13 +487,37 @@ export class LettaBot implements AgentSession {
|
||||
timeout: 30_000,
|
||||
}, (err, stdout, stderr) => {
|
||||
if (err) {
|
||||
reject(new Error(stderr?.trim() || err.message));
|
||||
const execErr = new Error(stderr?.trim() || err.message) as Error & {
|
||||
code?: string | number | null;
|
||||
signal?: NodeJS.Signals;
|
||||
stdout?: string;
|
||||
stderr?: string;
|
||||
};
|
||||
execErr.code = err.code;
|
||||
execErr.signal = err.signal;
|
||||
execErr.stdout = stdout?.trim();
|
||||
execErr.stderr = stderr?.trim();
|
||||
reject(execErr);
|
||||
} else {
|
||||
resolve(stdout.trim());
|
||||
const output = stdout.trim();
|
||||
if (!output) {
|
||||
reject(new Error('lettabot-tts returned an empty output path'));
|
||||
return;
|
||||
}
|
||||
if (stderr?.trim()) {
|
||||
log.warn('Directive voice: lettabot-tts stderr:', stderr.trim());
|
||||
}
|
||||
resolve(output.split('\n').at(-1)?.trim() || output);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
const outputStats = await stat(outputPath);
|
||||
if (!outputStats.isFile()) {
|
||||
throw new Error(`Generated TTS output is not a file: ${outputPath}`);
|
||||
}
|
||||
log.info(`Directive voice: generated file ${outputPath} (${outputStats.size} bytes)`);
|
||||
|
||||
await adapter.sendFile({
|
||||
chatId,
|
||||
filePath: outputPath,
|
||||
@@ -493,7 +530,23 @@ export class LettaBot implements AgentSession {
|
||||
// Clean up generated file
|
||||
try { await unlink(outputPath); } catch {}
|
||||
} catch (err) {
|
||||
log.warn('Directive voice failed:', err instanceof Error ? err.message : err);
|
||||
const execErr = err as Error & {
|
||||
code?: string | number | null;
|
||||
signal?: NodeJS.Signals;
|
||||
stdout?: string;
|
||||
stderr?: string;
|
||||
};
|
||||
log.warn('Directive voice failed:', {
|
||||
message: execErr?.message || String(err),
|
||||
code: execErr?.code,
|
||||
signal: execErr?.signal,
|
||||
stdout: typeof execErr?.stdout === 'string' ? execErr.stdout.slice(0, 300) : undefined,
|
||||
stderr: typeof execErr?.stderr === 'string' ? execErr.stderr.slice(0, 1200) : undefined,
|
||||
provider: ttsProvider,
|
||||
model: ttsModel,
|
||||
voice: ttsVoice,
|
||||
helper: ttsPath,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user