feat: pass images to the LLM via multimodal API (#184)

feat: pass images to the LLM via multimodal API

When users send images through any channel, the actual image content is now passed to the LLM via the SDK's multimodal API (imageFromFile/imageFromURL) instead of just text metadata.

- Graceful fallback for unsupported MIME types, missing files, and load errors
- Opt-out via features.inlineImages: false in config
- Warns when the model doesn't support vision (detects "[Image omitted]" in the response)
This commit is contained in:
Gabriele Sarti
2026-02-08 23:22:32 -05:00
committed by GitHub
parent 64f12be6cd
commit 110681e979
3 changed files with 62 additions and 5 deletions

View File

@@ -203,10 +203,13 @@ export function configToEnv(config: LettaBotConfig): Record<string, string> {
if (config.features?.heartbeat?.enabled) {
env.HEARTBEAT_INTERVAL_MIN = String(config.features.heartbeat.intervalMin || 30);
}
if (config.features?.inlineImages === false) {
env.INLINE_IMAGES = 'false';
}
if (config.features?.maxToolCalls !== undefined) {
env.MAX_TOOL_CALLS = String(config.features.maxToolCalls);
}
// Polling - top-level polling config (preferred)
if (config.polling?.gmail?.enabled && config.polling.gmail.account) {
env.GMAIL_ACCOUNT = config.polling.gmail.account;

View File

@@ -43,6 +43,7 @@ export interface LettaBotConfig {
enabled: boolean;
intervalMin?: number;
};
inlineImages?: boolean; // Send images directly to the LLM (default: true). Set false to only send file paths.
maxToolCalls?: number; // Abort if agent calls this many tools in one turn (default: 100)
};

View File

@@ -4,7 +4,7 @@
* Single agent, single conversation - chat continues across all channels.
*/
import { createAgent, createSession, resumeSession, type Session } from '@letta-ai/letta-code-sdk';
import { createAgent, createSession, resumeSession, imageFromFile, imageFromURL, type Session, type MessageContentItem, type SendMessage } from '@letta-ai/letta-code-sdk';
import { mkdirSync } from 'node:fs';
import type { ChannelAdapter } from '../channels/types.js';
import type { BotConfig, InboundMessage, TriggerContext } from './types.js';
@@ -33,6 +33,52 @@ function isApprovalConflictError(error: unknown): boolean {
return false;
}
/**
 * Image MIME types accepted for inline (multimodal) delivery.
 * `buildMultimodalMessage` skips image attachments whose declared MIME type
 * is not in this set.
 */
const SUPPORTED_IMAGE_MIMES = new Set<string>([
  'image/png',
  'image/jpeg',
  'image/gif',
  'image/webp',
]);
/**
 * Build the payload handed to `session.send`, inlining image attachments
 * when possible.
 *
 * Returns `formattedText` unchanged when inline images are disabled
 * (`INLINE_IMAGES=false`), when the message carries no usable image
 * attachment, or when every image failed to load. Otherwise returns a content
 * array whose first item is the text and whose remaining items are the
 * loaded images, in attachment order.
 *
 * @param formattedText - Envelope text that is always sent.
 * @param msg - Inbound message whose `attachments` are inspected for images.
 * @returns The plain text or a multimodal content array.
 */
async function buildMultimodalMessage(
  formattedText: string,
  msg: InboundMessage,
): Promise<SendMessage> {
  // Respect opt-out: when INLINE_IMAGES=false, skip multimodal and only send file paths in envelope
  if (process.env.INLINE_IMAGES === 'false') {
    return formattedText;
  }

  const imageAttachments = (msg.attachments ?? []).filter((a) => {
    if (a.kind !== 'image' || !(a.localPath || a.url)) {
      return false;
    }
    // A missing MIME type is given the benefit of the doubt.
    if (!a.mimeType) {
      return true;
    }
    // Media types are case-insensitive and may carry parameters
    // (e.g. "IMAGE/JPEG; charset=binary"), so compare only the bare,
    // lower-cased type/subtype.
    const bareMime = (a.mimeType.split(';', 1)[0] ?? '').trim().toLowerCase();
    return SUPPORTED_IMAGE_MIMES.has(bareMime);
  });

  if (imageAttachments.length === 0) {
    return formattedText;
  }

  const content: MessageContentItem[] = [
    { type: 'text', text: formattedText },
  ];

  for (const attachment of imageAttachments) {
    const label = attachment.name || 'unknown';
    let loaded = false;

    if (attachment.localPath) {
      try {
        content.push(imageFromFile(attachment.localPath));
        loaded = true;
      } catch (err) {
        console.warn(`[Bot] Failed to load image ${label}: ${err instanceof Error ? err.message : String(err)}`);
      }
    }

    // Use the URL when there is no local copy, or when the local copy could
    // not be read, so one bad path does not drop an otherwise reachable image.
    if (!loaded && attachment.url) {
      try {
        content.push(await imageFromURL(attachment.url));
      } catch (err) {
        console.warn(`[Bot] Failed to load image ${label}: ${err instanceof Error ? err.message : String(err)}`);
      }
    }
  }

  // Only the text item is present if every image failed to load.
  if (content.length === 1) {
    return formattedText;
  }

  console.log(`[Bot] Sending ${content.length - 1} inline image(s) to LLM`);
  return content;
}
export class LettaBot {
private store: Store;
private config: BotConfig;
@@ -440,11 +486,12 @@ export class LettaBot {
} : undefined;
// Send message to agent with metadata envelope
const formattedMessage = msg.isBatch && msg.batchedMessages
const formattedText = msg.isBatch && msg.batchedMessages
? formatGroupBatchEnvelope(msg.batchedMessages)
: formatMessageEnvelope(msg);
: formatMessageEnvelope(msg, {}, sessionContext);
const messageToSend = await buildMultimodalMessage(formattedText, msg);
try {
await withTimeout(session.send(formattedMessage), 'Session send');
await withTimeout(session.send(messageToSend), 'Session send');
} catch (sendError) {
// Check for 409 CONFLICT from orphaned approval_request_message
if (!retried && isApprovalConflictError(sendError) && this.store.agentId && this.store.conversationId) {
@@ -658,6 +705,12 @@ export class LettaBot {
response = '';
}
// Detect unsupported multimodal: images were sent but server replaced them
const sentImages = Array.isArray(messageToSend);
if (sentImages && response.includes('[Image omitted]')) {
console.warn('[Bot] Model does not support images — server replaced inline images with "[Image omitted]". Consider using a vision-capable model or setting features.inlineImages: false in config.');
}
// Send final response
if (response.trim()) {
try {