feat: add image reading support to Read tool (#603)

Co-authored-by: Letta <noreply@letta.com>
2026-01-20 13:37:18 -08:00
parent e6661e7699
commit d34a65323c
7 changed files with 303 additions and 5 deletions
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -158,6 +158,10 @@ import {
  subscribe as subscribeToSubagents,
 } from "./helpers/subagentState";
 import { getRandomThinkingVerb } from "./helpers/thinkingMessages";
+import {
+  clearQueuedToolImages,
+  getAndClearQueuedToolImages,
+} from "./helpers/toolImageRegistry";
 import {
  isFileEditTool,
  isFileWriteTool,
@@ -3239,6 +3243,9 @@ export default function App({
      // Lock input for async operation (set before any await to prevent queue processing)
      setCommandRunning(true);

+      // Clear any queued tool images from the previous agent context
+      clearQueuedToolImages();
+
      const inputCmd = "/agents";
      const cmdId = uid("cmd");

@@ -3717,9 +3724,44 @@ export default function App({
        // Send all results to server if any
        if (allResults.length > 0) {
          toolResultsInFlightRef.current = true;
-          await processConversation([
+
+          // Check for queued tool images (from Read tool reading image files)
+          const toolImages = getAndClearQueuedToolImages();
+          const input: Array<MessageCreate | ApprovalCreate> = [
            { type: "approval", approvals: allResults },
-          ]);
+          ];
+
+          // If there are queued images, add them as a user message
+          if (toolImages.length > 0) {
+            const imageContentParts: Array<
+              | { type: "text"; text: string }
+              | {
+                  type: "image";
+                  source: { type: "base64"; media_type: string; data: string };
+                }
+            > = [];
+            for (const img of toolImages) {
+              imageContentParts.push({
+                type: "text",
+                text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
+              });
+              imageContentParts.push({
+                type: "image",
+                source: {
+                  type: "base64",
+                  media_type: img.mediaType,
+                  data: img.data,
+                },
+              });
+            }
+            input.push({
+              type: "message",
+              role: "user",
+              content: imageContentParts as unknown as MessageCreate["content"],
+            });
+          }
+
+          await processConversation(input);
          toolResultsInFlightRef.current = false;
        }
      } finally {
@@ -4357,6 +4399,9 @@ export default function App({

          setCommandRunning(true);

+          // Clear any queued tool images from the previous conversation
+          clearQueuedToolImages();
+
          try {
            const client = await getClient();

@@ -5577,7 +5622,37 @@ ${gitContext}
      }

      // Build message content from display value (handles placeholders for text/images)
-      const contentParts = buildMessageContentFromDisplay(msg);
+      let contentParts = buildMessageContentFromDisplay(msg);
+
+      // Prepend any queued tool images (from Read tool reading image files)
+      const queuedToolImages = getAndClearQueuedToolImages();
+      if (queuedToolImages.length > 0) {
+        const imageParts: Array<
+          | { type: "text"; text: string }
+          | {
+              type: "image";
+              source: { type: "base64"; media_type: string; data: string };
+            }
+        > = [];
+        for (const img of queuedToolImages) {
+          // Add system reminder text
+          imageParts.push({
+            type: "text",
+            text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
+          });
+          // Add image content
+          imageParts.push({
+            type: "image",
+            source: {
+              type: "base64",
+              media_type: img.mediaType,
+              data: img.data,
+            },
+          });
+        }
+        // Prepend to contentParts
+        contentParts = [...imageParts, ...contentParts];
+      }

      // Prepend plan mode reminder if in plan mode
      const planModeReminder = getPlanModeReminder();
--- a/src/cli/helpers/toolImageRegistry.ts
+++ b/src/cli/helpers/toolImageRegistry.ts
@@ -0,0 +1,47 @@
+// Registry for images read by tools that need to be sent in the next user message turn.
+// This is needed because tool returns only support string content - we can't return
+// image data directly in tool results to the Letta API.
+
+export interface QueuedToolImage {
+  toolCallId: string; // ID of the tool call that produced the image; Read falls back to "unknown" when no execution context is set
+  filePath: string; // path the image was read from
+  data: string; // base64-encoded image payload
+  mediaType: string; // MIME type sent as media_type in the image content part
+  width: number; // width reported by the resize step (presumably pixels — confirm)
+  height: number; // height reported by the resize step (presumably pixels — confirm)
+}
+
+const queuedImages: QueuedToolImage[] = [];
+
+/**
+ * Append an image to the module-level queue; nothing is sent until the queue is drained.
+ * Called by the Read tool when reading an image file.
+ */
+export function queueToolImage(image: QueuedToolImage): void {
+  queuedImages.push(image);
+}
+
+/**
+ * Drain the queue: returns a copy of every queued image, in queue order, and empties the queue.
+ * Callers attach the returned images to the next outgoing user message; an empty array means nothing was queued.
+ */
+export function getAndClearQueuedToolImages(): QueuedToolImage[] {
+  const images = [...queuedImages];
+  queuedImages.length = 0;
+  return images;
+}
+
+/**
+ * Drop all queued images without returning them.
+ * Called on conversation/agent switch so images queued in the previous context are not attached to a message in the new one.
+ */
+export function clearQueuedToolImages(): void {
+  queuedImages.length = 0;
+}
+
+/**
+ * Report whether at least one image is waiting in the queue; the queue itself is left untouched.
+ */
+export function hasQueuedToolImages(): boolean {
+  return queuedImages.length > 0;
+}
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -31,6 +31,7 @@ import { formatErrorDetails } from "./cli/helpers/errorFormatter";
 import { safeJsonParseOr } from "./cli/helpers/safeJsonParse";
 import { drainStreamWithResume } from "./cli/helpers/stream";
 import { StreamProcessor } from "./cli/helpers/streamProcessor";
+import { getAndClearQueuedToolImages } from "./cli/helpers/toolImageRegistry";
 import { settingsManager } from "./settings-manager";
 import { checkToolPermission } from "./tools/manager";
 import type {
@@ -934,11 +935,42 @@ export async function handleHeadlessCommand(
  // Add user prompt
  messageContent += prompt;

+  // Build content parts (text + any queued tool images from Read tool)
+  type ContentPart =
+    | { type: "text"; text: string }
+    | {
+        type: "image";
+        source: { type: "base64"; media_type: string; data: string };
+      };
+  const contentParts: ContentPart[] = [];
+
+  // Check for queued tool images (from Read tool reading image files)
+  const queuedToolImages = getAndClearQueuedToolImages();
+  if (queuedToolImages.length > 0) {
+    for (const img of queuedToolImages) {
+      contentParts.push({
+        type: "text",
+        text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
+      });
+      contentParts.push({
+        type: "image",
+        source: {
+          type: "base64",
+          media_type: img.mediaType,
+          data: img.data,
+        },
+      });
+    }
+  }
+
+  // Add the text message content
+  contentParts.push({ type: "text", text: messageContent });
+
  // Start with the user message
  let currentInput: Array<MessageCreate | ApprovalCreate> = [
    {
      role: "user",
-      content: [{ type: "text", text: messageContent }],
+      content: contentParts as unknown as MessageCreate["content"],
    },
  ];

@@ -1241,6 +1273,9 @@ export async function handleHeadlessCommand(
        );
        const executedResults = await executeApprovalBatch(decisions);

+        // Check for queued tool images (from Read tool reading image files)
+        const toolImages = getAndClearQueuedToolImages();
+
        // Send all results in one batch
        currentInput = [
          {
@@ -1248,6 +1283,36 @@ export async function handleHeadlessCommand(
            approvals: executedResults as ApprovalResult[],
          },
        ];
+
+        // If there are queued images, add them as a user message
+        if (toolImages.length > 0) {
+          const imageContentParts: Array<
+            | { type: "text"; text: string }
+            | {
+                type: "image";
+                source: { type: "base64"; media_type: string; data: string };
+              }
+          > = [];
+          for (const img of toolImages) {
+            imageContentParts.push({
+              type: "text",
+              text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
+            });
+            imageContentParts.push({
+              type: "image",
+              source: {
+                type: "base64",
+                media_type: img.mediaType,
+                data: img.data,
+              },
+            });
+          }
+          currentInput.push({
+            role: "user",
+            content: imageContentParts as unknown as MessageCreate["content"],
+          });
+        }
+
        continue;
      }

--- a/src/tools/descriptions/Read.md
+++ b/src/tools/descriptions/Read.md
@@ -9,6 +9,8 @@ Usage:
 - You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
 - Any lines longer than 2000 characters will be truncated
 - Results are returned using cat -n format, with line numbers starting at 1
+- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file, the contents are presented visually, as Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits.
+- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths.
 - This tool can only read files, not directories. To read a directory, use the ls command via Bash.
 - You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.
 - If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.
--- a/src/tools/impl/Read.ts
+++ b/src/tools/impl/Read.ts
@@ -1,9 +1,78 @@
 import { promises as fs } from "node:fs";
 import * as path from "node:path";
+import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js";
+import { queueToolImage } from "../../cli/helpers/toolImageRegistry.js";
+import { getToolExecutionContext } from "../toolContext.js";
 import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js";
 import { LIMITS } from "./truncation.js";
 import { validateRequiredParams } from "./validation.js";

+// Extensions treated as images; entries are lowercase with the leading dot because lookups use path.extname(...).toLowerCase()
+const IMAGE_EXTENSIONS = new Set([
+  ".png",
+  ".jpg",
+  ".jpeg",
+  ".gif",
+  ".webp",
+  ".bmp",
+]);
+
+/**
+ * Decide whether a path names an image by a case-insensitive extension lookup in IMAGE_EXTENSIONS; file contents are not inspected.
+ */
+function isImageFile(filePath: string): boolean {
+  const ext = path.extname(filePath).toLowerCase();
+  return IMAGE_EXTENSIONS.has(ext);
+}
+
+/**
+ * Map a path's extension (case-insensitive) to an image MIME type; any extension not in the table falls back to "image/png".
+ */
+function getMimeType(filePath: string): string {
+  const ext = path.extname(filePath).toLowerCase();
+  const mimeTypes: Record<string, string> = {
+    ".png": "image/png",
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".gif": "image/gif",
+    ".webp": "image/webp",
+    ".bmp": "image/bmp",
+  };
+  return mimeTypes[ext] || "image/png";
+}
+
+/**
+ * Read an image file, pass it through resizeImageIfNeeded, and queue the result via queueToolImage.
+ * Returns only a text placeholder; the image itself is attached to a later user message by whoever drains the queue.
+ */
+async function readImageFile(filePath: string): Promise<ReadResult> {
+  const buffer = await fs.readFile(filePath);
+  const inputMimeType = getMimeType(filePath); // inferred from the extension only; file contents are not sniffed here
+  const resized = await resizeImageIfNeeded(buffer, inputMimeType);
+
+  // The tool call ID comes from the execution context; "unknown" is used when the context or ID is missing or empty
+  const context = getToolExecutionContext();
+  const toolCallId = context?.toolCallId || "unknown";
+
+  // Queue the resizer's output (data, media type, dimensions) rather than the raw file bytes
+  queueToolImage({
+    toolCallId,
+    filePath,
+    data: resized.data,
+    mediaType: resized.mediaType,
+    width: resized.width,
+    height: resized.height,
+  });
+
+  const resizeNote = resized.resized
+    ? ` (resized to ${resized.width}x${resized.height})`
+    : ` (${resized.width}x${resized.height})`;
+
+  return {
+    content: `[Image: ${filePath}${resizeNote} - queued for display]`,
+  };
+}
+
 interface ReadArgs {
  file_path: string;
  offset?: number;
@@ -145,6 +214,12 @@ export async function read(args: ReadArgs): Promise<ReadResult> {
      throw new Error(
        `File too large: ${stats.size} bytes (max ${maxSize} bytes)`,
      );
+
+    // Handle image files specially - read, resize, and queue for display
+    if (isImageFile(resolvedPath)) {
+      return await readImageFile(resolvedPath);
+    }
+
    if (await isBinaryFile(resolvedPath))
      throw new Error(`Cannot read binary file: ${resolvedPath}`);
    const content = await fs.readFile(resolvedPath, "utf-8");
--- a/src/tools/manager.ts
+++ b/src/tools/manager.ts
@@ -2,6 +2,7 @@ import { getModelInfo } from "../agent/model";
 import { getAllSubagentConfigs } from "../agent/subagents";
 import { INTERRUPTED_BY_USER } from "../constants";
 import { telemetry } from "../telemetry";
+import { setToolExecutionContext } from "./toolContext";
 import { TOOL_DEFINITIONS, type ToolName } from "./toolDefinitions";

 export const TOOL_NAMES = Object.keys(TOOL_DEFINITIONS) as ToolName[];
@@ -754,7 +755,14 @@ export async function executeTool(
      }
    }

-    const result = await tool.fn(enhancedArgs);
+    // Set execution context for tools that need it (e.g., Read for image queuing)
+    setToolExecutionContext({ toolCallId: options?.toolCallId });
+    let result: unknown;
+    try {
+      result = await tool.fn(enhancedArgs);
+    } finally {
+      setToolExecutionContext(null);
+    }
    const duration = Date.now() - startTime;

    // Extract stdout/stderr if present (for bash tools)
--- a/src/tools/toolContext.ts
+++ b/src/tools/toolContext.ts
@@ -0,0 +1,26 @@
+// Tool execution context - lets a tool read metadata about the call that is currently executing it.
+// Separate file to avoid circular dependencies with manager.ts
+
+interface ToolExecutionContext {
+  toolCallId?: string; // optional: the tool call ID may be absent
+}
+
+let currentToolContext: ToolExecutionContext | null = null; // single module-level slot. NOTE(review): overlapping tool executions would overwrite each other's context — confirm tools run one at a time
+
+/**
+ * Return the context most recently passed to setToolExecutionContext, or null when none is set.
+ * Called by tools that need access to execution metadata (e.g., Read for image queuing).
+ */
+export function getToolExecutionContext(): ToolExecutionContext | null {
+  return currentToolContext;
+}
+
+/**
+ * Replace the current tool execution context; pass null to clear it.
+ * Called by manager.ts around each tool invocation: set before the tool runs, reset to null in a finally block.
+ */
+export function setToolExecutionContext(
+  context: ToolExecutionContext | null,
+): void {
+  currentToolContext = context;
+}