feat: pass images to the LLM via multimodal API (#184)

feat: pass images to the LLM via multimodal API

When users send images through any channel, the actual image content is now passed to the LLM via the SDK's multimodal API (imageFromFile/imageFromURL) instead of just text metadata.

- Graceful fallback for unsupported MIME types, missing files, and load errors
- Opt-out via features.inlineImages: false in config
- Warns when the model doesn't support vision (detects [Image omitted] in response)
2026-02-08 23:22:32 -05:00
parent 64f12be6cd
commit 110681e979
3 changed files with 62 additions and 5 deletions
--- a/src/config/io.ts
+++ b/src/config/io.ts
@@ -203,10 +203,13 @@ export function configToEnv(config: LettaBotConfig): Record<string, string> {
  if (config.features?.heartbeat?.enabled) {
    env.HEARTBEAT_INTERVAL_MIN = String(config.features.heartbeat.intervalMin || 30);
  }
+  if (config.features?.inlineImages === false) {
+    env.INLINE_IMAGES = 'false';
+  }
  if (config.features?.maxToolCalls !== undefined) {
    env.MAX_TOOL_CALLS = String(config.features.maxToolCalls);
  }
-  
+
  // Polling - top-level polling config (preferred)
  if (config.polling?.gmail?.enabled && config.polling.gmail.account) {
    env.GMAIL_ACCOUNT = config.polling.gmail.account;
--- a/src/config/types.ts
+++ b/src/config/types.ts
@@ -43,6 +43,7 @@ export interface LettaBotConfig {
      enabled: boolean;
      intervalMin?: number;
    };
+    inlineImages?: boolean;   // Send images directly to the LLM (default: true). Set false to only send file paths.
    maxToolCalls?: number;  // Abort if agent calls this many tools in one turn (default: 100)
  };

--- a/src/core/bot.ts
+++ b/src/core/bot.ts
@@ -4,7 +4,7 @@
 * Single agent, single conversation - chat continues across all channels.
 */

-import { createAgent, createSession, resumeSession, type Session } from '@letta-ai/letta-code-sdk';
+import { createAgent, createSession, resumeSession, imageFromFile, imageFromURL, type Session, type MessageContentItem, type SendMessage } from '@letta-ai/letta-code-sdk';
 import { mkdirSync } from 'node:fs';
 import type { ChannelAdapter } from '../channels/types.js';
 import type { BotConfig, InboundMessage, TriggerContext } from './types.js';
@@ -33,6 +33,52 @@ function isApprovalConflictError(error: unknown): boolean {
  return false;
 }

+const SUPPORTED_IMAGE_MIMES = new Set([ // MIME types accepted for inline images; attachments declaring any other type are filtered out in buildMultimodalMessage.
+  'image/png', 'image/jpeg', 'image/gif', 'image/webp',
+]);
+
+async function buildMultimodalMessage( // Builds the payload to send: the plain text, or the text followed by inline image parts.
+  formattedText: string, // Envelope text; returned unchanged whenever no image ends up attached.
+  msg: InboundMessage, // Inbound message whose attachments are scanned for images.
+): Promise<SendMessage> { // Resolves to either the string itself or an array whose first item is the text part.
+  // Respect opt-out: when INLINE_IMAGES=false, skip multimodal and only send file paths in envelope
+  if (process.env.INLINE_IMAGES === 'false') {
+    return formattedText;
+  }
+  // Keep only image attachments that have a source (localPath or url) and whose mimeType is either absent or in SUPPORTED_IMAGE_MIMES.
+  const imageAttachments = (msg.attachments ?? []).filter(
+    (a) => a.kind === 'image'
+      && (a.localPath || a.url)
+      && (!a.mimeType || SUPPORTED_IMAGE_MIMES.has(a.mimeType))
+  );
+  // Nothing eligible to inline: fall back to the text-only message.
+  if (imageAttachments.length === 0) {
+    return formattedText;
+  }
+  // The text part always comes first; successfully loaded images are appended after it.
+  const content: MessageContentItem[] = [
+    { type: 'text', text: formattedText },
+  ];
+  // A failure on one attachment is logged and skipped, so the remaining images and the text are still sent.
+  for (const attachment of imageAttachments) {
+    try {
+      if (attachment.localPath) { // localPath takes precedence over url when both are set
+        content.push(imageFromFile(attachment.localPath));
+      } else if (attachment.url) {
+        content.push(await imageFromURL(attachment.url)); // awaited here, unlike imageFromFile above
+      }
+    } catch (err) {
+      console.warn(`[Bot] Failed to load image ${attachment.name || 'unknown'}: ${err instanceof Error ? err.message : err}`);
+    }
+  }
+
+  if (content.length > 1) { // content[0] is the text part, so length - 1 is the number of images that loaded
+    console.log(`[Bot] Sending ${content.length - 1} inline image(s) to LLM`);
+  }
+  // If every image failed to load, return the plain string rather than a one-element array.
+  return content.length > 1 ? content : formattedText;
+}
+
 export class LettaBot {
  private store: Store;
  private config: BotConfig;
@@ -440,11 +486,12 @@ export class LettaBot {
      } : undefined;

      // Send message to agent with metadata envelope
-      const formattedMessage = msg.isBatch && msg.batchedMessages
+      const formattedText = msg.isBatch && msg.batchedMessages
        ? formatGroupBatchEnvelope(msg.batchedMessages)
-        : formatMessageEnvelope(msg);
+        : formatMessageEnvelope(msg, {}, sessionContext);
+      const messageToSend = await buildMultimodalMessage(formattedText, msg);
      try {
-        await withTimeout(session.send(formattedMessage), 'Session send');
+        await withTimeout(session.send(messageToSend), 'Session send');
      } catch (sendError) {
        // Check for 409 CONFLICT from orphaned approval_request_message
        if (!retried && isApprovalConflictError(sendError) && this.store.agentId && this.store.conversationId) {
@@ -658,6 +705,12 @@ export class LettaBot {
        response = '';
      }

+      // Detect unsupported multimodal: images were sent but server replaced them
+      const sentImages = Array.isArray(messageToSend);
+      if (sentImages && response.includes('[Image omitted]')) {
+        console.warn('[Bot] Model does not support images — server replaced inline images with "[Image omitted]". Consider using a vision-capable model or setting features.inlineImages: false in config.');
+      }
+
      // Send final response
      if (response.trim()) {
        try {