diff --git a/package.json b/package.json index d869753..3c62b28 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,7 @@ "access": "public" }, "dependencies": { - "@letta-ai/letta-client": "^1.6.8", + "@letta-ai/letta-client": "^1.7.2", "glob": "^13.0.0", "ink-link": "^5.0.0", "open": "^10.2.0", @@ -43,6 +43,7 @@ "@types/bun": "latest", "@types/diff": "^8.0.0", "@types/picomatch": "^4.0.2", + "@types/react": "^19.2.9", "diff": "^8.0.2", "husky": "9.1.7", "ink": "^5.0.0", diff --git a/src/agent/approval-execution.ts b/src/agent/approval-execution.ts index ea132b8..907e70c 100644 --- a/src/agent/approval-execution.ts +++ b/src/agent/approval-execution.ts @@ -3,12 +3,32 @@ import * as path from "node:path"; import type { ApprovalReturn, + TextContent, ToolReturn, } from "@letta-ai/letta-client/resources/agents/messages"; import type { ToolReturnMessage } from "@letta-ai/letta-client/resources/tools"; import type { ApprovalRequest } from "../cli/helpers/stream"; import { INTERRUPTED_BY_USER } from "../constants"; -import { executeTool, type ToolExecutionResult } from "../tools/manager"; +import { + executeTool, + type ToolExecutionResult, + type ToolReturnContent, +} from "../tools/manager"; + +/** + * Extract displayable text from tool return content (for UI display). + * Multimodal content returns the text parts concatenated. + */ +export function getDisplayableToolReturn(content: ToolReturnContent): string { + if (typeof content === "string") { + return content; + } + // Extract text from multimodal content + return content + .filter((part): part is TextContent => part.type === "text") + .map((part) => part.text) + .join("\n"); +} /** * Tools that are safe to execute in parallel (read-only or independent). 
@@ -235,13 +255,14 @@ async function executeSingleDecision( ); // Update UI if callback provided (interactive mode) + // Note: UI display uses text-only version, backend gets full multimodal content if (onChunk) { onChunk({ message_type: "tool_return_message", id: "dummy", date: new Date().toISOString(), tool_call_id: decision.approval.toolCallId, - tool_return: toolResult.toolReturn, + tool_return: getDisplayableToolReturn(toolResult.toolReturn), status: toolResult.status, stdout: toolResult.stdout, stderr: toolResult.stderr, @@ -251,7 +272,7 @@ async function executeSingleDecision( return { type: "tool", tool_call_id: decision.approval.toolCallId, - tool_return: toolResult.toolReturn, + tool_return: toolResult.toolReturn, // Full multimodal content for backend status: toolResult.status, stdout: toolResult.stdout, stderr: toolResult.stderr, diff --git a/src/cli/App.tsx b/src/cli/App.tsx index f5e728e..5c1e792 100644 --- a/src/cli/App.tsx +++ b/src/cli/App.tsx @@ -26,6 +26,7 @@ import { import { type ApprovalResult, executeAutoAllowedTools, + getDisplayableToolReturn, } from "../agent/approval-execution"; import { buildApprovalRecoveryMessage, @@ -7333,7 +7334,7 @@ DO NOT respond to these messages or otherwise consider them in your response unl id: "dummy", date: new Date().toISOString(), tool_call_id: approval.toolCallId, - tool_return: toolResult.toolReturn, + tool_return: getDisplayableToolReturn(toolResult.toolReturn), status: toolResult.status, stdout: toolResult.stdout, stderr: toolResult.stderr, diff --git a/src/cli/components/ToolCallMessageRich.tsx b/src/cli/components/ToolCallMessageRich.tsx index 7bf6d47..acdf6db 100644 --- a/src/cli/components/ToolCallMessageRich.tsx +++ b/src/cli/components/ToolCallMessageRich.tsx @@ -594,12 +594,30 @@ export const ToolCallMessage = memo( } } - // Check if this is a file read tool - show line count summary + // Check if this is a file read tool - show line count or image summary if ( isFileReadTool(rawName) && 
line.resultOk !== false && line.resultText ) { + // Check if this is an image result (starts with "[Image: filename]") + const isImageResult = line.resultText.startsWith("[Image: "); + + if (isImageResult) { + return ( + + + {prefix} + + + + Read 1 image + + + + ); + } + // Count lines in the result (the content returned by Read tool) + const lineCount = line.resultText.split("\n").length; return ( @@ -609,7 +627,8 @@ export const ToolCallMessage = memo( - Read {lineCount} lines + Read {lineCount} line + {lineCount !== 1 ? "s" : ""} diff --git a/src/cli/helpers/backfill.ts b/src/cli/helpers/backfill.ts index 79c2110..e1f0d6f 100644 --- a/src/cli/helpers/backfill.ts +++ b/src/cli/helpers/backfill.ts @@ -1,10 +1,30 @@ import type { + ImageContent, LettaAssistantMessageContentUnion, LettaUserMessageContentUnion, Message, + TextContent, } from "@letta-ai/letta-client/resources/agents/messages"; import type { Buffers } from "./accumulator"; +/** + * Extract displayable text from tool return content. + * Multimodal content returns the text parts concatenated. + */ +function getDisplayableToolReturn( + content: string | Array<TextContent | ImageContent> | undefined, +): string { + if (!content) return ""; + if (typeof content === "string") { + return content; + } + // Extract text from multimodal content + return content + .filter((part): part is TextContent => part.type === "text") + .map((part) => part.text) + .join("\n"); +} + // const PASTE_LINE_THRESHOLD = 5; // const PASTE_CHAR_THRESHOLD = 500; const CLIP_CHAR_LIMIT_TEXT = 500; @@ -238,7 +258,8 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void { // Update the existing line with the result // Handle both func_response (streaming) and tool_return (SDK) properties - const resultText = + // tool_return can be multimodal (string or array of content parts) + const rawResult = ("func_response" in toolReturn ?
toolReturn.func_response : undefined) || @@ -246,6 +267,7 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void { ? toolReturn.tool_return : undefined) || ""; + const resultText = getDisplayableToolReturn(rawResult); buffers.byId.set(toolCallLineId, { ...existingLine, resultText, diff --git a/src/tools/descriptions/Read.md b/src/tools/descriptions/Read.md index d436428..17e1d7f 100644 --- a/src/tools/descriptions/Read.md +++ b/src/tools/descriptions/Read.md @@ -9,6 +9,8 @@ Usage: - You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters - Any lines longer than 2000 characters will be truncated - Results are returned using cat -n format, with line numbers starting at 1 +- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file the contents are presented visually as Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits. +- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths. - This tool can only read files, not directories. To read a directory, use the ls command via Bash. - You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel. - If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents. 
diff --git a/src/tools/impl/Read.ts b/src/tools/impl/Read.ts index 72ffcf1..a14ca4e 100644 --- a/src/tools/impl/Read.ts +++ b/src/tools/impl/Read.ts @@ -1,16 +1,93 @@ import { promises as fs } from "node:fs"; import * as path from "node:path"; +import type { + ImageContent, + TextContent, +} from "@letta-ai/letta-client/resources/agents/messages"; +import { LETTA_CLOUD_API_URL } from "../../auth/oauth.js"; +import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js"; +import { settingsManager } from "../../settings-manager.js"; import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js"; import { LIMITS } from "./truncation.js"; import { validateRequiredParams } from "./validation.js"; +/** + * Check if the server supports images in tool responses. + * Currently only api.letta.com supports this feature. + */ +function serverSupportsImageToolReturns(): boolean { + const settings = settingsManager.getSettings(); + const baseURL = + process.env.LETTA_BASE_URL || + settings.env?.LETTA_BASE_URL || + LETTA_CLOUD_API_URL; + return baseURL === LETTA_CLOUD_API_URL; +} + interface ReadArgs { file_path: string; offset?: number; limit?: number; } + +// Tool return content types - either a string or array of content parts +export type ToolReturnContent = string | Array<TextContent | ImageContent>; + interface ReadResult { - content: string; + content: ToolReturnContent; +} + +// Supported image extensions +const IMAGE_EXTENSIONS = new Set([ + ".png", + ".jpg", + ".jpeg", + ".gif", + ".webp", + ".bmp", +]); + +function isImageFile(filePath: string): boolean { + const ext = path.extname(filePath).toLowerCase(); + return IMAGE_EXTENSIONS.has(ext); +} + +function getMediaType(ext: string): string { + const types: Record<string, string> = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/png", // Convert BMP to PNG + }; + return types[ext] || "image/png"; +} + +async function readImageFile( + filePath: string, +): 
Promise<Array<TextContent | ImageContent>> { + const buffer = await fs.readFile(filePath); + const ext = path.extname(filePath).toLowerCase(); + const mediaType = getMediaType(ext); + + // Use shared image resize utility + const result = await resizeImageIfNeeded(buffer, mediaType); + + return [ + { + type: "text", + text: `[Image: ${path.basename(filePath)}${result.resized ? " (resized to fit API limits)" : ""}]`, + }, + { + type: "image", + source: { + type: "base64", + media_type: result.mediaType, + data: result.data, + }, + }, + ]; } async function isBinaryFile(filePath: string): Promise<boolean> { @@ -140,6 +217,28 @@ export async function read(args: ReadArgs): Promise<ReadResult> { const stats = await fs.stat(resolvedPath); if (stats.isDirectory()) throw new Error(`Path is a directory, not a file: ${resolvedPath}`); + + // Check if this is an image file + if (isImageFile(resolvedPath)) { + // Check if server supports images in tool responses + if (!serverSupportsImageToolReturns()) { + throw new Error( + `This server does not support images in tool responses.`, + ); + } + + // Images have a higher size limit (20MB raw, will be resized if needed) + const maxImageSize = 20 * 1024 * 1024; + if (stats.size > maxImageSize) { + throw new Error( + `Image file too large: ${stats.size} bytes (max ${maxImageSize} bytes)`, + ); + } + const imageContent = await readImageFile(resolvedPath); + return { content: imageContent }; + } + + // Regular text file handling const maxSize = 10 * 1024 * 1024; // 10MB if (stats.size > maxSize) throw new Error( diff --git a/src/tools/impl/ReadFileGemini.ts b/src/tools/impl/ReadFileGemini.ts index 7b84759..4eac9aa 100644 --- a/src/tools/impl/ReadFileGemini.ts +++ b/src/tools/impl/ReadFileGemini.ts @@ -3,7 +3,8 @@ * Uses Gemini's exact schema and description */ -import { read } from "./Read"; +import type { TextContent } from "@letta-ai/letta-client/resources/agents/messages"; +import { read, type ToolReturnContent } from "./Read"; interface ReadFileGeminiArgs { file_path: string; @@ -11,6 
+12,20 @@ interface ReadFileGeminiArgs { limit?: number; } +/** + * Extract text from tool return content (for Gemini wrapper) + */ +function extractText(content: ToolReturnContent): string { + if (typeof content === "string") { + return content; + } + // Extract text from multimodal content (Gemini doesn't support images via this tool) + return content + .filter((part): part is TextContent => part.type === "text") + .map((part) => part.text) + .join("\n"); +} + export async function read_file_gemini( args: ReadFileGeminiArgs, ): Promise<{ message: string }> { @@ -24,6 +39,6 @@ export async function read_file_gemini( const result = await read(lettaArgs); - // Read returns { content: string } - return { message: result.content }; + // Read returns { content: ToolReturnContent } - extract text for Gemini + return { message: extractText(result.content) }; } diff --git a/src/tools/impl/ReadLSP.ts b/src/tools/impl/ReadLSP.ts index 689ef7e..cd603a5 100644 --- a/src/tools/impl/ReadLSP.ts +++ b/src/tools/impl/ReadLSP.ts @@ -2,7 +2,7 @@ * LSP-enhanced Read tool - wraps the base Read tool and adds LSP diagnostics * This is used when LETTA_ENABLE_LSP is set */ -import { read as baseRead } from "./Read.js"; +import { read as baseRead, type ToolReturnContent } from "./Read.js"; // Format a single diagnostic in opencode style: "ERROR [line:col] message" function formatDiagnostic(diag: { @@ -30,7 +30,7 @@ interface ReadLSPArgs { } interface ReadLSPResult { - content: string; + content: ToolReturnContent; } export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> { @@ -42,6 +42,11 @@ return result; } + // If content is multimodal (image), skip LSP processing - only applies to text files + if (typeof result.content !== "string") { + return result; + } + // Determine if we should include diagnostics const lineCount = result.content.split("\n").length; const shouldInclude = diff --git a/src/tools/manager.ts 
b/src/tools/manager.ts index 1229e0d..99df4fc 100644 --- a/src/tools/manager.ts +++ b/src/tools/manager.ts @@ -210,8 +210,16 @@ interface ToolDefinition { fn: (args: ToolArgs) => Promise; } +import type { + ImageContent, + TextContent, +} from "@letta-ai/letta-client/resources/agents/messages"; + +// Tool return content can be a string or array of text/image content parts +export type ToolReturnContent = string | Array<TextContent | ImageContent>; + export type ToolExecutionResult = { - toolReturn: string; + toolReturn: ToolReturnContent; status: "success" | "error"; stdout?: string[]; stderr?: string[]; @@ -628,7 +636,18 @@ function isStringArray(value: unknown): value is string[] { ); } -function flattenToolResponse(result: unknown): string { +/** + * Check if an array contains multimodal content (text + images) + */ +function isMultimodalContent( + arr: unknown[], +): arr is Array<TextContent | ImageContent> { + return arr.every( + (item) => isRecord(item) && (item.type === "text" || item.type === "image"), + ); +} + +function flattenToolResponse(result: unknown): ToolReturnContent { if (result === null || result === undefined) { return ""; } @@ -645,6 +664,11 @@ return result.message; } + + // Check for multimodal content (images) - return as-is without flattening + if (Array.isArray(result.content) && isMultimodalContent(result.content)) { + return result.content; + } + if (typeof result.content === "string") { return result.content; } @@ -770,12 +794,16 @@ export async function executeTool( // Flatten the response to plain text const flattenedResponse = flattenToolResponse(result); - // Track tool usage + // Track tool usage (calculate size for multimodal content) + const responseSize = + typeof flattenedResponse === "string" + ? flattenedResponse.length + : JSON.stringify(flattenedResponse).length; telemetry.trackToolUsage( internalName, toolStatus === "success", duration, - flattenedResponse.length, + responseSize, toolStatus === "error" ? 
"tool_error" : undefined, stderr ? stderr.join("\n") : undefined, );