diff --git a/src/config/io.ts b/src/config/io.ts index f4f2d1c..2c13ca4 100644 --- a/src/config/io.ts +++ b/src/config/io.ts @@ -203,10 +203,13 @@ export function configToEnv(config: LettaBotConfig): Record { if (config.features?.heartbeat?.enabled) { env.HEARTBEAT_INTERVAL_MIN = String(config.features.heartbeat.intervalMin || 30); } + if (config.features?.inlineImages === false) { + env.INLINE_IMAGES = 'false'; + } if (config.features?.maxToolCalls !== undefined) { env.MAX_TOOL_CALLS = String(config.features.maxToolCalls); } - + // Polling - top-level polling config (preferred) if (config.polling?.gmail?.enabled && config.polling.gmail.account) { env.GMAIL_ACCOUNT = config.polling.gmail.account; diff --git a/src/config/types.ts b/src/config/types.ts index 0c575ad..e25267f 100644 --- a/src/config/types.ts +++ b/src/config/types.ts @@ -43,6 +43,7 @@ export interface LettaBotConfig { enabled: boolean; intervalMin?: number; }; + inlineImages?: boolean; // Send images directly to the LLM (default: true). Set false to only send file paths. maxToolCalls?: number; // Abort if agent calls this many tools in one turn (default: 100) }; diff --git a/src/core/bot.ts b/src/core/bot.ts index 0a21fc4..980dd5b 100644 --- a/src/core/bot.ts +++ b/src/core/bot.ts @@ -4,7 +4,7 @@ * Single agent, single conversation - chat continues across all channels. 
/** MIME types eligible for inline image delivery. Entries are lowercase. */
const SUPPORTED_IMAGE_MIMES = new Set([
  'image/png', 'image/jpeg', 'image/gif', 'image/webp',
]);

/**
 * Decides whether an attachment's declared MIME type allows inlining.
 *
 * A missing or empty MIME type is accepted, so attachments without a declared
 * type are still attempted. Otherwise the value is compared against
 * SUPPORTED_IMAGE_MIMES after dropping any `;parameter` suffix, trimming
 * whitespace and lowercasing, because media types are case-insensitive and
 * may carry parameters (e.g. `IMAGE/PNG`, `image/jpeg; charset=binary`).
 *
 * @param mimeType - MIME type reported for the attachment, if any.
 * @returns true when the attachment may be sent inline.
 */
function isSupportedImageMime(mimeType?: string | null): boolean {
  if (!mimeType) {
    return true;
  }
  const essence = (mimeType.split(';', 1)[0] ?? '').trim().toLowerCase();
  return SUPPORTED_IMAGE_MIMES.has(essence);
}

/**
 * Builds the payload for a session send: either the plain formatted text, or
 * a content array holding the text followed by one item per loaded image.
 *
 * Plain text is returned when:
 *  - the INLINE_IMAGES environment variable is exactly 'false' (opt-out),
 *  - the message has no image attachment with a localPath or url and an
 *    accepted MIME type, or
 *  - every eligible image failed to load.
 *
 * @param formattedText - Envelope text; always the first content item.
 * @param msg - Inbound message whose `attachments` are inspected.
 * @returns The text itself, or `[text item, ...image items]`.
 */
async function buildMultimodalMessage(
  formattedText: string,
  msg: InboundMessage,
): Promise<SendMessage> {
  // Respect opt-out: when INLINE_IMAGES=false, skip multimodal and only send file paths in envelope
  if (process.env.INLINE_IMAGES === 'false') {
    return formattedText;
  }

  const imageAttachments = (msg.attachments ?? []).filter(
    (a) => a.kind === 'image'
      && (a.localPath || a.url)
      && isSupportedImageMime(a.mimeType),
  );

  if (imageAttachments.length === 0) {
    return formattedText;
  }

  const content: MessageContentItem[] = [
    { type: 'text', text: formattedText },
  ];

  // Load sequentially so image items keep the attachment order.
  for (const attachment of imageAttachments) {
    try {
      // A local file takes precedence; the URL is used only when no path exists.
      if (attachment.localPath) {
        content.push(imageFromFile(attachment.localPath));
      } else if (attachment.url) {
        content.push(await imageFromURL(attachment.url));
      }
    } catch (err) {
      // Best effort: one unreadable image must not block the rest of the message.
      console.warn(`[Bot] Failed to load image ${attachment.name || 'unknown'}: ${err instanceof Error ? err.message : err}`);
    }
  }

  const imageCount = content.length - 1;
  if (imageCount === 0) {
    // Every image failed to load; fall back to the text-only payload.
    return formattedText;
  }

  console.log(`[Bot] Sending ${imageCount} inline image(s) to LLM`);
  return content;
}