fix(core): resize images before sending to LLM to prevent llm_api_error (#593)

2026-03-13 14:10:05 -07:00
parent 68056ed21b
commit f5005c33a0
6 changed files with 87 additions and 11 deletions
--- a/package-lock.json
+++ b/package-lock.json
@@ -28,6 +28,7 @@
        "openai": "^6.17.0",
        "pino": "^10.3.1",
        "qrcode-terminal": "^0.12.0",
+        "sharp": "^0.34.1",
        "telegramify-markdown": "^1.0.0",
        "tsx": "^4.21.0",
        "typescript": "^5.9.3",
--- a/package.json
+++ b/package.json
@@ -85,6 +85,7 @@
    "openai": "^6.17.0",
    "pino": "^10.3.1",
    "qrcode-terminal": "^0.12.0",
+    "sharp": "^0.34.1",
    "telegramify-markdown": "^1.0.0",
    "tsx": "^4.21.0",
    "typescript": "^5.9.3",
--- a/src/core/bot.ts
+++ b/src/core/bot.ts
@@ -4,9 +4,10 @@
 * Single agent, single conversation - chat continues across all channels.
 */

-import { imageFromFile, imageFromURL, type Session, type MessageContentItem, type SendMessage, type CanUseToolCallback } from '@letta-ai/letta-code-sdk';
+import { imageFromBase64, type ImageContent, type Session, type MessageContentItem, type SendMessage, type CanUseToolCallback } from '@letta-ai/letta-code-sdk';
 import { mkdirSync, existsSync } from 'node:fs';
-import { access, unlink, realpath, stat, constants } from 'node:fs/promises';
+import { readFile, access, unlink, realpath, stat, constants } from 'node:fs/promises';
+import sharp from 'sharp';
 import { execFile } from 'node:child_process';
 import { extname, resolve, join } from 'node:path';
 import type { ChannelAdapter } from '../channels/types.js';
@@ -49,6 +50,68 @@ const AUDIO_FILE_EXTENSIONS = new Set([
  '.ogg', '.opus', '.mp3', '.m4a', '.wav', '.aac', '.flac',
 ]);

+/** Longest-side pixel limit applied by prepareImage. Anthropic recommends max 1568px on longest side; larger images waste bandwidth for no benefit. */
+const MAX_IMAGE_DIMENSION = 1568;
+
+const MIME_FROM_EXT: Record<string, ImageContent['source']['media_type']> = { // keyed by lowercase extension, dot included
+  '.png': 'image/png',
+  '.gif': 'image/gif',
+  '.webp': 'image/webp',
+  '.jpg': 'image/jpeg',
+  '.jpeg': 'image/jpeg',
+};
+
+/**
+ * Load an image from `source.localPath` (preferred) or `source.url`, shrink it when its longest
+ * side exceeds MAX_IMAGE_DIMENSION, and wrap the base64 payload with `imageFromBase64`.
+ * Returns null when no location is given or the HTTP response is not OK; errors thrown by
+ * `readFile`, `fetch`, or `sharp` propagate to the caller.
+ */
+async function prepareImage(
+  source: { localPath?: string; url?: string; mimeType?: string; name?: string },
+): Promise<ImageContent | null> {
+  let buffer: Buffer;
+  let mediaType: ImageContent['source']['media_type'];
+
+  // The first supported MIME hint wins (parameters such as "; charset=binary" are dropped);
+  // otherwise fall back to the path's extension, then to JPEG.
+  const resolveMime = (hints: Array<string | null | undefined>, path: string): ImageContent['source']['media_type'] => {
+    for (const hint of hints) {
+      const mime = hint?.split(';')[0]?.trim().toLowerCase();
+      if (mime && SUPPORTED_IMAGE_MIMES.has(mime)) return mime as ImageContent['source']['media_type'];
+    }
+    return MIME_FROM_EXT[extname(path).toLowerCase()] ?? 'image/jpeg';
+  };
+
+  if (source.localPath) {
+    buffer = await readFile(source.localPath);
+    mediaType = resolveMime([source.mimeType], source.localPath);
+  } else if (source.url) {
+    const response = await fetch(source.url);
+    if (!response.ok) {
+      log.warn(`Failed to fetch image from ${source.url}: HTTP ${response.status}`);
+      return null;
+    }
+    buffer = Buffer.from(await response.arrayBuffer());
+    // Try the header, then the attachment's own MIME type; take the extension from the URL path so a query string cannot hide it.
+    mediaType = resolveMime([response.headers.get('content-type'), source.mimeType], new URL(source.url).pathname);
+  } else {
+    return null;
+  }
+
+  // Resize if the longest side exceeds the threshold
+  const metadata = await sharp(buffer).metadata();
+  const longest = Math.max(metadata.width ?? 0, metadata.height ?? 0);
+  if (longest > MAX_IMAGE_DIMENSION) {
+    log.info(`Resizing image ${source.name || 'unknown'} from ${metadata.width}x${metadata.height} (max side → ${MAX_IMAGE_DIMENSION}px)`);
+    buffer = await sharp(buffer)
+      .rotate() // bake in EXIF orientation, which the re-encode would otherwise discard
+      .resize({ width: MAX_IMAGE_DIMENSION, height: MAX_IMAGE_DIMENSION, fit: 'inside', withoutEnlargement: true })
+      .toBuffer();
+  }
+  return imageFromBase64(buffer.toString('base64'), mediaType);
+}
+
 type StreamErrorDetail = {
  message: string;
  stopReason: string;
@@ -125,11 +188,8 @@ async function buildMultimodalMessage(

  for (const attachment of imageAttachments) {
    try {
-      if (attachment.localPath) {
-        content.push(imageFromFile(attachment.localPath));
-      } else if (attachment.url) {
-        content.push(await imageFromURL(attachment.url));
-      }
+      const item = await prepareImage(attachment);
+      if (item) content.push(item);
    } catch (err) {
      log.warn(`Failed to load image ${attachment.name || 'unknown'}: ${err instanceof Error ? err.message : err}`);
    }
@@ -1544,6 +1604,7 @@ export class LettaBot implements AgentSession {
                  (!lastErrorDetail || lastErrorDetail.message === 'Agent stopped: error')) {
                const enriched = await getLatestRunError(this.store.agentId, retryConvId);
                if (enriched) {
+                  log.info(`Enriched error detail: ${enriched.message} [${enriched.stopReason}]`);
                  lastErrorDetail = {
                    message: enriched.message,
                    stopReason: enriched.stopReason,
@@ -1875,6 +1936,7 @@ export class LettaBot implements AgentSession {
                    (!lastErrorDetail || lastErrorDetail.message === 'Agent stopped: error')) {
                  const enriched = await getLatestRunError(this.store.agentId, convId);
                  if (enriched) {
+                    log.info(`Enriched error detail: ${enriched.message} [${enriched.stopReason}]`);
                    lastErrorDetail = {
                      message: enriched.message,
                      stopReason: enriched.stopReason,
--- a/src/core/errors.test.ts
+++ b/src/core/errors.test.ts
@@ -101,7 +101,7 @@ describe('formatApiErrorForUser', () => {
      stopReason: 'error',
    });
    expect(msg).toContain('stuck tool approval');
-    expect(msg).toContain('reset-conversation');
+    expect(msg).toContain('/reset');
    // Should NOT match the generic conflict message
    expect(msg).not.toContain('Another request is still processing');
  });
@@ -120,7 +120,7 @@ describe('formatApiErrorForUser', () => {
      stopReason: 'requires_approval',
    });
    expect(msg).toContain('stuck tool approval');
-    expect(msg).toContain('reset-conversation');
+    expect(msg).toContain('/reset');
  });

  it('falls back to sanitized original message when no mapping matches', () => {
--- a/src/core/result-guard.test.ts
+++ b/src/core/result-guard.test.ts
@@ -5,6 +5,19 @@ import { tmpdir } from 'node:os';
 import { LettaBot } from './bot.js';
 import type { InboundMessage, OutboundMessage } from './types.js';

+vi.mock('../tools/letta-api.js', () => ({ // replace the module's exports with the vi.fn() stubs listed below
+  getPendingApprovals: vi.fn(),
+  rejectApproval: vi.fn(),
+  cancelRuns: vi.fn(),
+  cancelConversation: vi.fn(),
+  recoverOrphanedConversationApproval: vi.fn().mockResolvedValue({ recovered: false }),
+  recoverPendingApprovalsForAgent: vi.fn(),
+  isRecoverableConversationId: vi.fn(() => false),
+  getLatestRunError: vi.fn().mockResolvedValue(null), // resolves to null, so bot.ts skips its `if (enriched)` branch
+  getAgentModel: vi.fn(),
+  updateAgentModel: vi.fn(),
+}));
+
 describe('result divergence guard', () => {
  let workDir: string;

--- a/src/core/sdk-session-contract.test.ts
+++ b/src/core/sdk-session-contract.test.ts
@@ -7,8 +7,7 @@ vi.mock('@letta-ai/letta-code-sdk', () => ({
  createAgent: vi.fn(),
  createSession: vi.fn(),
  resumeSession: vi.fn(),
-  imageFromFile: vi.fn(),
-  imageFromURL: vi.fn(),
+  imageFromBase64: vi.fn((_data: string, _type: string) => ({ type: 'image', source: { type: 'base64', media_type: _type, data: _data } })),
 }));

 vi.mock('../tools/letta-api.js', () => ({