feat: add image reading support to Read tool (#614)

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
Charles Packer
2026-01-20 22:38:33 -08:00
committed by GitHub
parent 1168a83716
commit 5635156b51
10 changed files with 231 additions and 18 deletions

View File

@@ -30,7 +30,7 @@
"access": "public"
},
"dependencies": {
"@letta-ai/letta-client": "^1.6.8",
"@letta-ai/letta-client": "^1.7.2",
"glob": "^13.0.0",
"ink-link": "^5.0.0",
"open": "^10.2.0",
@@ -43,6 +43,7 @@
"@types/bun": "latest",
"@types/diff": "^8.0.0",
"@types/picomatch": "^4.0.2",
"@types/react": "^19.2.9",
"diff": "^8.0.2",
"husky": "9.1.7",
"ink": "^5.0.0",

View File

@@ -3,12 +3,32 @@
import * as path from "node:path";
import type {
ApprovalReturn,
TextContent,
ToolReturn,
} from "@letta-ai/letta-client/resources/agents/messages";
import type { ToolReturnMessage } from "@letta-ai/letta-client/resources/tools";
import type { ApprovalRequest } from "../cli/helpers/stream";
import { INTERRUPTED_BY_USER } from "../constants";
import { executeTool, type ToolExecutionResult } from "../tools/manager";
import {
executeTool,
type ToolExecutionResult,
type ToolReturnContent,
} from "../tools/manager";
/**
* Extract displayable text from tool return content (for UI display).
* Multimodal content returns the text parts concatenated.
*/
/**
 * Extract displayable text from tool return content (for UI display).
 * Multimodal content returns the text parts concatenated.
 */
export function getDisplayableToolReturn(content: ToolReturnContent): string {
  if (typeof content !== "string") {
    // Multimodal array: keep only the text parts, newline-separated
    const textParts: string[] = [];
    for (const part of content) {
      if (part.type === "text") {
        textParts.push(part.text);
      }
    }
    return textParts.join("\n");
  }
  return content;
}
/**
* Tools that are safe to execute in parallel (read-only or independent).
@@ -235,13 +255,14 @@ async function executeSingleDecision(
);
// Update UI if callback provided (interactive mode)
// Note: UI display uses text-only version, backend gets full multimodal content
if (onChunk) {
onChunk({
message_type: "tool_return_message",
id: "dummy",
date: new Date().toISOString(),
tool_call_id: decision.approval.toolCallId,
tool_return: toolResult.toolReturn,
tool_return: getDisplayableToolReturn(toolResult.toolReturn),
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,
@@ -251,7 +272,7 @@ async function executeSingleDecision(
return {
type: "tool",
tool_call_id: decision.approval.toolCallId,
tool_return: toolResult.toolReturn,
tool_return: toolResult.toolReturn, // Full multimodal content for backend
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,

View File

@@ -26,6 +26,7 @@ import {
import {
type ApprovalResult,
executeAutoAllowedTools,
getDisplayableToolReturn,
} from "../agent/approval-execution";
import {
buildApprovalRecoveryMessage,
@@ -7333,7 +7334,7 @@ DO NOT respond to these messages or otherwise consider them in your response unl
id: "dummy",
date: new Date().toISOString(),
tool_call_id: approval.toolCallId,
tool_return: toolResult.toolReturn,
tool_return: getDisplayableToolReturn(toolResult.toolReturn),
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,

View File

@@ -594,12 +594,30 @@ export const ToolCallMessage = memo(
}
}
// Check if this is a file read tool - show line count summary
// Check if this is a file read tool - show line count or image summary
if (
isFileReadTool(rawName) &&
line.resultOk !== false &&
line.resultText
) {
// Check if this is an image result (starts with "[Image: filename]")
const isImageResult = line.resultText.startsWith("[Image: ");
if (isImageResult) {
return (
<Box flexDirection="row">
<Box width={prefixWidth} flexShrink={0}>
<Text>{prefix}</Text>
</Box>
<Box flexGrow={1} width={contentWidth}>
<Text>
Read <Text bold>1</Text> image
</Text>
</Box>
</Box>
);
}
// Count lines in the result (the content returned by Read tool)
const lineCount = line.resultText.split("\n").length;
return (
@@ -609,7 +627,8 @@ export const ToolCallMessage = memo(
</Box>
<Box flexGrow={1} width={contentWidth}>
<Text>
Read <Text bold>{lineCount}</Text> lines
Read <Text bold>{lineCount}</Text> line
{lineCount !== 1 ? "s" : ""}
</Text>
</Box>
</Box>

View File

@@ -1,10 +1,30 @@
import type {
ImageContent,
LettaAssistantMessageContentUnion,
LettaUserMessageContentUnion,
Message,
TextContent,
} from "@letta-ai/letta-client/resources/agents/messages";
import type { Buffers } from "./accumulator";
/**
* Extract displayable text from tool return content.
* Multimodal content returns the text parts concatenated.
*/
/**
 * Extract displayable text from tool return content.
 * Multimodal content returns the text parts concatenated.
 */
function getDisplayableToolReturn(
  content: string | Array<TextContent | ImageContent> | undefined,
): string {
  if (typeof content === "string") return content;
  if (!content) return "";
  // Multimodal array: collect just the text parts
  const pieces: string[] = [];
  for (const part of content) {
    if (part.type === "text") {
      pieces.push(part.text);
    }
  }
  return pieces.join("\n");
}
// const PASTE_LINE_THRESHOLD = 5;
// const PASTE_CHAR_THRESHOLD = 500;
const CLIP_CHAR_LIMIT_TEXT = 500;
@@ -238,7 +258,8 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void {
// Update the existing line with the result
// Handle both func_response (streaming) and tool_return (SDK) properties
const resultText =
// tool_return can be multimodal (string or array of content parts)
const rawResult =
("func_response" in toolReturn
? toolReturn.func_response
: undefined) ||
@@ -246,6 +267,7 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void {
? toolReturn.tool_return
: undefined) ||
"";
const resultText = getDisplayableToolReturn(rawResult);
buffers.byId.set(toolCallLineId, {
...existingLine,
resultText,

View File

@@ -9,6 +9,8 @@ Usage:
- You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
- Any lines longer than 2000 characters will be truncated
- Results are returned using cat -n format, with line numbers starting at 1
- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file, the contents are presented visually, since Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits.
- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths.
- This tool can only read files, not directories. To read a directory, use the ls command via Bash.
- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.
- If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.

View File

@@ -1,16 +1,93 @@
import { promises as fs } from "node:fs";
import * as path from "node:path";
import type {
ImageContent,
TextContent,
} from "@letta-ai/letta-client/resources/agents/messages";
import { LETTA_CLOUD_API_URL } from "../../auth/oauth.js";
import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js";
import { settingsManager } from "../../settings-manager.js";
import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js";
import { LIMITS } from "./truncation.js";
import { validateRequiredParams } from "./validation.js";
/**
* Check if the server supports images in tool responses.
* Currently only api.letta.com supports this feature.
*/
/**
 * Check if the server supports images in tool responses.
 * Currently only api.letta.com supports this feature.
 */
function serverSupportsImageToolReturns(): boolean {
  const settings = settingsManager.getSettings();
  // Resolution order: process env wins, then the settings file, then the
  // cloud default. Uses `||` (not `??`) so empty-string values also fall
  // through to the next candidate.
  const resolvedBaseURL =
    process.env.LETTA_BASE_URL ||
    settings.env?.LETTA_BASE_URL ||
    LETTA_CLOUD_API_URL;
  return resolvedBaseURL === LETTA_CLOUD_API_URL;
}
interface ReadArgs {
file_path: string;
offset?: number;
limit?: number;
}
// Tool return content types - either a string or array of content parts
export type ToolReturnContent = string | Array<TextContent | ImageContent>;
interface ReadResult {
content: string;
content: ToolReturnContent;
}
// File extensions the Read tool treats as images
const IMAGE_EXTENSIONS = new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".bmp",
]);

/**
 * True when the path's extension (compared case-insensitively) is one of
 * the supported image formats.
 */
function isImageFile(filePath: string): boolean {
  return IMAGE_EXTENSIONS.has(path.extname(filePath).toLowerCase());
}
/**
 * Map a lowercase file extension to the media type reported for the image.
 * Unknown extensions default to "image/png".
 */
function getMediaType(ext: string): string {
  switch (ext) {
    case ".png":
      return "image/png";
    case ".jpg":
    case ".jpeg":
      return "image/jpeg";
    case ".gif":
      return "image/gif";
    case ".webp":
      return "image/webp";
    case ".bmp":
      // NOTE(review): .bmp is reported as image/png — presumably
      // resizeImageIfNeeded re-encodes BMP to PNG; verify, since the raw
      // bytes are BMP at this point.
      return "image/png";
    default:
      return "image/png";
  }
}
async function readImageFile(
filePath: string,
): Promise<Array<TextContent | ImageContent>> {
const buffer = await fs.readFile(filePath);
const ext = path.extname(filePath).toLowerCase();
const mediaType = getMediaType(ext);
// Use shared image resize utility
const result = await resizeImageIfNeeded(buffer, mediaType);
return [
{
type: "text",
text: `[Image: ${path.basename(filePath)}${result.resized ? " (resized to fit API limits)" : ""}]`,
},
{
type: "image",
source: {
type: "base64",
media_type: result.mediaType,
data: result.data,
},
},
];
}
async function isBinaryFile(filePath: string): Promise<boolean> {
@@ -140,6 +217,28 @@ export async function read(args: ReadArgs): Promise<ReadResult> {
const stats = await fs.stat(resolvedPath);
if (stats.isDirectory())
throw new Error(`Path is a directory, not a file: ${resolvedPath}`);
// Check if this is an image file
if (isImageFile(resolvedPath)) {
// Check if server supports images in tool responses
if (!serverSupportsImageToolReturns()) {
throw new Error(
`This server does not support images in tool responses.`,
);
}
// Images have a higher size limit (20MB raw, will be resized if needed)
const maxImageSize = 20 * 1024 * 1024;
if (stats.size > maxImageSize) {
throw new Error(
`Image file too large: ${stats.size} bytes (max ${maxImageSize} bytes)`,
);
}
const imageContent = await readImageFile(resolvedPath);
return { content: imageContent };
}
// Regular text file handling
const maxSize = 10 * 1024 * 1024; // 10MB
if (stats.size > maxSize)
throw new Error(

View File

@@ -3,7 +3,8 @@
* Uses Gemini's exact schema and description
*/
import { read } from "./Read";
import type { TextContent } from "@letta-ai/letta-client/resources/agents/messages";
import { read, type ToolReturnContent } from "./Read";
interface ReadFileGeminiArgs {
file_path: string;
@@ -11,6 +12,20 @@ interface ReadFileGeminiArgs {
limit?: number;
}
/**
* Extract text from tool return content (for Gemini wrapper)
*/
/**
 * Extract text from tool return content (for Gemini wrapper)
 */
function extractText(content: ToolReturnContent): string {
  if (typeof content === "string") {
    return content;
  }
  // Gemini doesn't support images via this tool, so keep only the text parts
  const texts: string[] = [];
  for (const part of content) {
    if (part.type === "text") {
      texts.push(part.text);
    }
  }
  return texts.join("\n");
}
export async function read_file_gemini(
args: ReadFileGeminiArgs,
): Promise<{ message: string }> {
@@ -24,6 +39,6 @@ export async function read_file_gemini(
const result = await read(lettaArgs);
// Read returns { content: string }
return { message: result.content };
// Read returns { content: ToolReturnContent } - extract text for Gemini
return { message: extractText(result.content) };
}

View File

@@ -2,7 +2,7 @@
* LSP-enhanced Read tool - wraps the base Read tool and adds LSP diagnostics
* This is used when LETTA_ENABLE_LSP is set
*/
import { read as baseRead } from "./Read.js";
import { read as baseRead, type ToolReturnContent } from "./Read.js";
// Format a single diagnostic in opencode style: "ERROR [line:col] message"
function formatDiagnostic(diag: {
@@ -30,7 +30,7 @@ interface ReadLSPArgs {
}
interface ReadLSPResult {
content: string;
content: ToolReturnContent;
}
export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> {
@@ -42,6 +42,11 @@ export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> {
return result;
}
// If content is multimodal (image), skip LSP processing - only applies to text files
if (typeof result.content !== "string") {
return result;
}
// Determine if we should include diagnostics
const lineCount = result.content.split("\n").length;
const shouldInclude =

View File

@@ -210,8 +210,16 @@ interface ToolDefinition {
fn: (args: ToolArgs) => Promise<unknown>;
}
import type {
ImageContent,
TextContent,
} from "@letta-ai/letta-client/resources/agents/messages";
// Tool return content can be a string or array of text/image content parts
export type ToolReturnContent = string | Array<TextContent | ImageContent>;
export type ToolExecutionResult = {
toolReturn: string;
toolReturn: ToolReturnContent;
status: "success" | "error";
stdout?: string[];
stderr?: string[];
@@ -628,7 +636,18 @@ function isStringArray(value: unknown): value is string[] {
);
}
function flattenToolResponse(result: unknown): string {
/**
* Check if an array contains multimodal content (text + images)
*/
/**
 * Check if an array contains multimodal content (text + images).
 * Note: `every` on an empty array is true, so [] is accepted as multimodal.
 */
function isMultimodalContent(
  arr: unknown[],
): arr is Array<TextContent | ImageContent> {
  for (const item of arr) {
    const isContentPart =
      isRecord(item) && (item.type === "text" || item.type === "image");
    if (!isContentPart) {
      return false;
    }
  }
  return true;
}
function flattenToolResponse(result: unknown): ToolReturnContent {
if (result === null || result === undefined) {
return "";
}
@@ -645,6 +664,11 @@ function flattenToolResponse(result: unknown): string {
return result.message;
}
// Check for multimodal content (images) - return as-is without flattening
if (Array.isArray(result.content) && isMultimodalContent(result.content)) {
return result.content;
}
if (typeof result.content === "string") {
return result.content;
}
@@ -770,12 +794,16 @@ export async function executeTool(
// Flatten the response to plain text
const flattenedResponse = flattenToolResponse(result);
// Track tool usage
// Track tool usage (calculate size for multimodal content)
const responseSize =
typeof flattenedResponse === "string"
? flattenedResponse.length
: JSON.stringify(flattenedResponse).length;
telemetry.trackToolUsage(
internalName,
toolStatus === "success",
duration,
flattenedResponse.length,
responseSize,
toolStatus === "error" ? "tool_error" : undefined,
stderr ? stderr.join("\n") : undefined,
);