fix: harden voice memo delivery diagnostics (#536)

Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
Cameron
2026-03-09 13:22:26 -07:00
committed by GitHub
parent 7da991206f
commit d23f0f9328
5 changed files with 195 additions and 16 deletions

View File

@@ -187,7 +187,28 @@ All environment variables can be overridden by the equivalent YAML config fields
1. Check that a TTS provider is configured -- either in `lettabot.yaml` under `tts` or via `ELEVENLABS_API_KEY` / `OPENAI_API_KEY`
2. Check that `jq` and `curl` are installed (required by the `lettabot-tts` script)
3. Check logs for TTS API errors (HTTP status codes, rate limits)
3. Check logs for voice pipeline events:
   - `[Bot] Directive voice: generating memo (...)`
   - `[Bot] Directive voice: generated file ...`
   - `[Bot] Directive voice failed: ...`
   - `[Telegram] sendVoice failed, falling back to sendAudio: ...`
4. Check logs for TTS API errors (HTTP status codes, rate limits)
### Docker checklist for voice
For container images, ensure these binaries are available:
- `bash` (required by `lettabot-tts` shebang)
- `curl` and `jq` (required for TTS API calls)
- `ffmpeg` (recommended for full inbound voice transcription compatibility)
- `ca-certificates` (required for HTTPS API calls)
Quick runtime validation from inside the container:
```bash
which bash curl jq ffmpeg
lettabot-tts "TTS health check"
```
### Telegram voice privacy

View File

@@ -27,6 +27,20 @@ OUTBOUND_DIR="${LETTABOT_WORKING_DIR:-$(pwd)}/data/outbound"
PROVIDER="${TTS_PROVIDER:-elevenlabs}"
# Abort the script unless the named command can be resolved by the shell.
#   $1 - command name to look up
# Prints a diagnostic to stderr and exits with status 1 when the lookup fails.
require_cmd() {
  command -v "$1" >/dev/null 2>&1 && return 0
  echo "Error: Required command '$1' is not installed or not on PATH" >&2
  exit 1
}
# Check, in order, that each external tool listed here is available,
# delegating the lookup (and the exit on failure) to require_cmd.
preflight() {
  local tool
  for tool in curl jq; do
    require_cmd "$tool"
  done
}
preflight
# Ensure output directory exists
mkdir -p "$OUTBOUND_DIR"
@@ -52,7 +66,7 @@ tts_elevenlabs() {
local model_id="${ELEVENLABS_MODEL_ID:-eleven_multilingual_v2}"
local http_code
http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \
http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
"https://api.elevenlabs.io/v1/text-to-speech/${voice_id}" \
-H "xi-api-key: ${ELEVENLABS_API_KEY}" \
-H "Content-Type: application/json" \
@@ -67,13 +81,21 @@ tts_elevenlabs() {
)")
if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
echo "Error: ElevenLabs API returned HTTP $http_code" >&2
if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
cat "$OUTPUT" >&2
echo "Error: ElevenLabs API returned HTTP $http_code (model=$model_id voice_id=$voice_id)" >&2
if [ -s "$OUTPUT" ]; then
echo "Error response preview:" >&2
head -c 2000 "$OUTPUT" >&2 || true
echo >&2
fi
rm -f "$OUTPUT"
exit 1
fi
if [ ! -s "$OUTPUT" ]; then
echo "Error: ElevenLabs TTS response was empty" >&2
rm -f "$OUTPUT"
exit 1
fi
}
# ---------------------------------------------------------------------------
@@ -89,7 +111,7 @@ tts_openai() {
local model="${OPENAI_TTS_MODEL:-tts-1}"
local http_code
http_code=$(curl -s -w "%{http_code}" -o "$OUTPUT" \
http_code=$(curl -sS -w "%{http_code}" -o "$OUTPUT" \
"https://api.openai.com/v1/audio/speech" \
-H "Authorization: Bearer ${OPENAI_API_KEY}" \
-H "Content-Type: application/json" \
@@ -106,13 +128,21 @@ tts_openai() {
)")
if [ "$http_code" -lt 200 ] || [ "$http_code" -ge 300 ]; then
echo "Error: OpenAI TTS API returned HTTP $http_code" >&2
if file "$OUTPUT" | grep -q "text\|JSON\|ASCII"; then
cat "$OUTPUT" >&2
echo "Error: OpenAI TTS API returned HTTP $http_code (model=$model voice=$voice)" >&2
if [ -s "$OUTPUT" ]; then
echo "Error response preview:" >&2
head -c 2000 "$OUTPUT" >&2 || true
echo >&2
fi
rm -f "$OUTPUT"
exit 1
fi
if [ ! -s "$OUTPUT" ]; then
echo "Error: OpenAI TTS response was empty" >&2
rm -f "$OUTPUT"
exit 1
fi
}
# ---------------------------------------------------------------------------

View File

@@ -32,3 +32,49 @@ describe('TelegramAdapter reactions', () => {
]);
});
});
describe('TelegramAdapter audio fallback', () => {
  // Issues the same audio-file request in every case so only the mocked API behavior differs.
  const sendVoiceMemo = (adapter: TelegramAdapter) =>
    adapter.sendFile({
      chatId: '123',
      filePath: '/tmp/voice.ogg',
      kind: 'audio',
    });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  it('falls back to sendAudio for VOICE_MESSAGES_FORBIDDEN errors', async () => {
    const adapter = new TelegramAdapter({ token: 'test-token' });

    // sendVoice rejects with a voice-policy description; sendAudio succeeds.
    const voiceSpy = vi.spyOn(adapter.getBot().api, 'sendVoice');
    voiceSpy.mockRejectedValue({ description: 'Bad Request: VOICE_MESSAGES_FORBIDDEN' } as any);
    const audioSpy = vi.spyOn(adapter.getBot().api, 'sendAudio');
    audioSpy.mockResolvedValue({ message_id: 987 } as any);

    const outcome = await sendVoiceMemo(adapter);

    expect(voiceSpy).toHaveBeenCalledTimes(1);
    expect(audioSpy).toHaveBeenCalledTimes(1);
    expect(outcome).toEqual({ messageId: '987' });
  });

  it('does not fall back to sendAudio for non-voice transport failures', async () => {
    const adapter = new TelegramAdapter({ token: 'test-token' });
    const timeoutError = new Error('socket hang up');

    // sendVoice rejects with a generic Error; sendAudio is mocked only to observe that it is never called.
    const voiceSpy = vi.spyOn(adapter.getBot().api, 'sendVoice');
    voiceSpy.mockRejectedValue(timeoutError);
    const audioSpy = vi.spyOn(adapter.getBot().api, 'sendAudio');
    audioSpy.mockResolvedValue({ message_id: 999 } as any);

    // The very same error object must surface to the caller.
    await expect(sendVoiceMemo(adapter)).rejects.toBe(timeoutError);

    expect(voiceSpy).toHaveBeenCalledTimes(1);
    expect(audioSpy).not.toHaveBeenCalled();
  });
});

View File

@@ -26,6 +26,27 @@ import { resolveDailyLimits, checkDailyLimit, type GroupModeConfig } from './gro
import { createLogger } from '../logger.js';
const log = createLogger('Telegram');
/**
 * Derives a loggable reason string from a value caught around a Telegram API call.
 *
 * Resolution order:
 *  1. a non-blank string `description` property on the caught object,
 *  2. a non-blank string `message` property on the caught object,
 *  3. `String(err)` — unless that yields the uninformative `[object Object]`,
 *     in which case the object is serialized with `JSON.stringify` instead.
 *
 * @param err - The caught value; may be of any type.
 * @returns A string describing the failure; never throws for JSON-unserializable objects.
 */
function getTelegramErrorReason(err: unknown): string {
  if (err && typeof err === 'object') {
    const { description, message } = err as { description?: unknown; message?: unknown };
    if (typeof description === 'string' && description.trim().length > 0) {
      return description;
    }
    if (typeof message === 'string' && message.trim().length > 0) {
      return message;
    }
  }

  const fallback = String(err);
  if (fallback !== '[object Object]') {
    return fallback;
  }

  // A plain object without description/message would otherwise log as "[object Object]";
  // expose its fields so the diagnostic is actionable.
  try {
    const serialized: unknown = JSON.stringify(err);
    return typeof serialized === 'string' ? serialized : fallback;
  } catch {
    // Circular structures (or throwing toJSON) cannot be serialized; keep the generic text.
    return fallback;
  }
}
/**
 * Decides whether a failed `sendVoice` call should be retried via `sendAudio`.
 *
 * @param err - The value caught from the failed call.
 * @returns `true` only when `err` is an object whose string `description`
 *          contains `VOICE_MESSAGES_FORBIDDEN`; `false` for everything else.
 */
function shouldFallbackToAudio(err: unknown): boolean {
  const isObject = typeof err === 'object' && err !== null;
  if (!isObject) {
    return false;
  }
  const { description: detail } = err as { description?: string };
  return typeof detail === 'string' && detail.includes('VOICE_MESSAGES_FORBIDDEN');
}
export interface TelegramConfig {
token: string;
dmPolicy?: DmPolicy; // 'pairing' (default), 'allowlist', or 'open'
@@ -593,13 +614,21 @@ export class TelegramAdapter implements ChannelAdapter {
const result = await this.bot.api.sendVoice(file.chatId, input, { caption });
return { messageId: String(result.message_id) };
} catch (err: any) {
// Fall back to sendAudio if voice messages are restricted (Telegram Premium privacy setting)
if (err?.description?.includes('VOICE_MESSAGES_FORBIDDEN')) {
log.warn('sendVoice forbidden, falling back to sendAudio');
const reason = getTelegramErrorReason(err);
// Only retry with sendAudio for deterministic voice-policy rejections.
// For network/timeout errors we rethrow to avoid possible duplicate sends.
if (!shouldFallbackToAudio(err)) {
throw err;
}
log.warn('sendVoice failed with VOICE_MESSAGES_FORBIDDEN, falling back to sendAudio:', reason);
try {
const result = await this.bot.api.sendAudio(file.chatId, new InputFile(file.filePath), { caption });
return { messageId: String(result.message_id) };
} catch (fallbackErr: any) {
const fallbackReason = getTelegramErrorReason(fallbackErr);
log.error('sendAudio fallback also failed:', fallbackReason);
throw fallbackErr;
}
throw err;
}
}

View File

@@ -461,11 +461,24 @@ export class LettaBot implements AgentSession {
.map(dir => join(dir, 'lettabot-tts'))
.find(p => existsSync(p));
const ttsProvider = (process.env.TTS_PROVIDER || 'elevenlabs').toLowerCase();
const ttsVoice = ttsProvider === 'openai'
? (process.env.OPENAI_TTS_VOICE || 'alloy')
: (process.env.ELEVENLABS_VOICE_ID || 'onwK4e9ZLuTAKqWW03F9');
const ttsModel = ttsProvider === 'openai'
? (process.env.OPENAI_TTS_MODEL || 'tts-1')
: (process.env.ELEVENLABS_MODEL_ID || 'eleven_multilingual_v2');
if (!ttsPath) {
log.warn('Directive voice skipped: lettabot-tts not found in skill dirs');
continue;
}
log.info(
`Directive voice: generating memo (provider=${ttsProvider}, model=${ttsModel}, voice=${ttsVoice}, textLen=${directive.text.length})`,
);
log.info(`Directive voice: helper=${ttsPath}`);
try {
const outputPath = await new Promise<string>((resolve, reject) => {
execFile(ttsPath, [directive.text], {
@@ -474,13 +487,37 @@ export class LettaBot implements AgentSession {
timeout: 30_000,
}, (err, stdout, stderr) => {
if (err) {
reject(new Error(stderr?.trim() || err.message));
const execErr = new Error(stderr?.trim() || err.message) as Error & {
code?: string | number | null;
signal?: NodeJS.Signals;
stdout?: string;
stderr?: string;
};
execErr.code = err.code;
execErr.signal = err.signal;
execErr.stdout = stdout?.trim();
execErr.stderr = stderr?.trim();
reject(execErr);
} else {
resolve(stdout.trim());
const output = stdout.trim();
if (!output) {
reject(new Error('lettabot-tts returned an empty output path'));
return;
}
if (stderr?.trim()) {
log.warn('Directive voice: lettabot-tts stderr:', stderr.trim());
}
resolve(output.split('\n').at(-1)?.trim() || output);
}
});
});
const outputStats = await stat(outputPath);
if (!outputStats.isFile()) {
throw new Error(`Generated TTS output is not a file: ${outputPath}`);
}
log.info(`Directive voice: generated file ${outputPath} (${outputStats.size} bytes)`);
await adapter.sendFile({
chatId,
filePath: outputPath,
@@ -493,7 +530,23 @@ export class LettaBot implements AgentSession {
// Clean up generated file
try { await unlink(outputPath); } catch {}
} catch (err) {
log.warn('Directive voice failed:', err instanceof Error ? err.message : err);
const execErr = err as Error & {
code?: string | number | null;
signal?: NodeJS.Signals;
stdout?: string;
stderr?: string;
};
log.warn('Directive voice failed:', {
message: execErr?.message || String(err),
code: execErr?.code,
signal: execErr?.signal,
stdout: typeof execErr?.stdout === 'string' ? execErr.stdout.slice(0, 300) : undefined,
stderr: typeof execErr?.stderr === 'string' ? execErr.stderr.slice(0, 1200) : undefined,
provider: ttsProvider,
model: ttsModel,
voice: ttsVoice,
helper: ttsPath,
});
}
}
}