diff --git a/package.json b/package.json
index d869753..3c62b28 100644
--- a/package.json
+++ b/package.json
@@ -30,7 +30,7 @@
"access": "public"
},
"dependencies": {
- "@letta-ai/letta-client": "^1.6.8",
+ "@letta-ai/letta-client": "^1.7.2",
"glob": "^13.0.0",
"ink-link": "^5.0.0",
"open": "^10.2.0",
@@ -43,6 +43,7 @@
"@types/bun": "latest",
"@types/diff": "^8.0.0",
"@types/picomatch": "^4.0.2",
+ "@types/react": "^19.2.9",
"diff": "^8.0.2",
"husky": "9.1.7",
"ink": "^5.0.0",
diff --git a/src/agent/approval-execution.ts b/src/agent/approval-execution.ts
index ea132b8..907e70c 100644
--- a/src/agent/approval-execution.ts
+++ b/src/agent/approval-execution.ts
@@ -3,12 +3,32 @@
import * as path from "node:path";
import type {
ApprovalReturn,
+ TextContent,
ToolReturn,
} from "@letta-ai/letta-client/resources/agents/messages";
import type { ToolReturnMessage } from "@letta-ai/letta-client/resources/tools";
import type { ApprovalRequest } from "../cli/helpers/stream";
import { INTERRUPTED_BY_USER } from "../constants";
-import { executeTool, type ToolExecutionResult } from "../tools/manager";
+import {
+ executeTool,
+ type ToolExecutionResult,
+ type ToolReturnContent,
+} from "../tools/manager";
+
+/**
+ * Extract displayable text from tool return content (for UI display).
+ * Multimodal content returns the text parts concatenated.
+ */
+export function getDisplayableToolReturn(content: ToolReturnContent): string {
+ if (typeof content === "string") {
+ return content;
+ }
+ // Extract text from multimodal content
+ return content
+ .filter((part): part is TextContent => part.type === "text")
+ .map((part) => part.text)
+ .join("\n");
+}
/**
* Tools that are safe to execute in parallel (read-only or independent).
@@ -235,13 +255,14 @@ async function executeSingleDecision(
);
// Update UI if callback provided (interactive mode)
+ // Note: UI display uses text-only version, backend gets full multimodal content
if (onChunk) {
onChunk({
message_type: "tool_return_message",
id: "dummy",
date: new Date().toISOString(),
tool_call_id: decision.approval.toolCallId,
- tool_return: toolResult.toolReturn,
+ tool_return: getDisplayableToolReturn(toolResult.toolReturn),
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,
@@ -251,7 +272,7 @@ async function executeSingleDecision(
return {
type: "tool",
tool_call_id: decision.approval.toolCallId,
- tool_return: toolResult.toolReturn,
+ tool_return: toolResult.toolReturn, // Full multimodal content for backend
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,
diff --git a/src/cli/App.tsx b/src/cli/App.tsx
index f5e728e..5c1e792 100644
--- a/src/cli/App.tsx
+++ b/src/cli/App.tsx
@@ -26,6 +26,7 @@ import {
import {
type ApprovalResult,
executeAutoAllowedTools,
+ getDisplayableToolReturn,
} from "../agent/approval-execution";
import {
buildApprovalRecoveryMessage,
@@ -7333,7 +7334,7 @@ DO NOT respond to these messages or otherwise consider them in your response unl
id: "dummy",
date: new Date().toISOString(),
tool_call_id: approval.toolCallId,
- tool_return: toolResult.toolReturn,
+ tool_return: getDisplayableToolReturn(toolResult.toolReturn),
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,
diff --git a/src/cli/components/ToolCallMessageRich.tsx b/src/cli/components/ToolCallMessageRich.tsx
index 7bf6d47..acdf6db 100644
--- a/src/cli/components/ToolCallMessageRich.tsx
+++ b/src/cli/components/ToolCallMessageRich.tsx
@@ -594,12 +594,30 @@ export const ToolCallMessage = memo(
}
}
- // Check if this is a file read tool - show line count summary
+ // Check if this is a file read tool - show line count or image summary
if (
isFileReadTool(rawName) &&
line.resultOk !== false &&
line.resultText
) {
+ // Check if this is an image result (starts with "[Image: filename]")
+ const isImageResult = line.resultText.startsWith("[Image: ");
+
+ if (isImageResult) {
+ return (
+
+
+ {prefix}
+
+
+
+ Read 1 image
+
+
+
+ );
+ }
+
// Count lines in the result (the content returned by Read tool)
const lineCount = line.resultText.split("\n").length;
return (
@@ -609,7 +627,8 @@ export const ToolCallMessage = memo(
- Read {lineCount} lines
+ Read {lineCount} line
+ {lineCount !== 1 ? "s" : ""}
diff --git a/src/cli/helpers/backfill.ts b/src/cli/helpers/backfill.ts
index 79c2110..e1f0d6f 100644
--- a/src/cli/helpers/backfill.ts
+++ b/src/cli/helpers/backfill.ts
@@ -1,10 +1,30 @@
import type {
+ ImageContent,
LettaAssistantMessageContentUnion,
LettaUserMessageContentUnion,
Message,
+ TextContent,
} from "@letta-ai/letta-client/resources/agents/messages";
import type { Buffers } from "./accumulator";
+/**
+ * Extract displayable text from tool return content.
+ * Multimodal content returns the text parts concatenated.
+ */
+function getDisplayableToolReturn(
+  content: string | Array<TextContent | ImageContent> | undefined,
+): string {
+ if (!content) return "";
+ if (typeof content === "string") {
+ return content;
+ }
+ // Extract text from multimodal content
+ return content
+ .filter((part): part is TextContent => part.type === "text")
+ .map((part) => part.text)
+ .join("\n");
+}
+
// const PASTE_LINE_THRESHOLD = 5;
// const PASTE_CHAR_THRESHOLD = 500;
const CLIP_CHAR_LIMIT_TEXT = 500;
@@ -238,7 +258,8 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void {
// Update the existing line with the result
// Handle both func_response (streaming) and tool_return (SDK) properties
- const resultText =
+ // tool_return can be multimodal (string or array of content parts)
+ const rawResult =
("func_response" in toolReturn
? toolReturn.func_response
: undefined) ||
@@ -246,6 +267,7 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void {
? toolReturn.tool_return
: undefined) ||
"";
+ const resultText = getDisplayableToolReturn(rawResult);
buffers.byId.set(toolCallLineId, {
...existingLine,
resultText,
diff --git a/src/tools/descriptions/Read.md b/src/tools/descriptions/Read.md
index d436428..17e1d7f 100644
--- a/src/tools/descriptions/Read.md
+++ b/src/tools/descriptions/Read.md
@@ -9,6 +9,8 @@ Usage:
- You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
- Any lines longer than 2000 characters will be truncated
- Results are returned using cat -n format, with line numbers starting at 1
+- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file the contents are presented visually as Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits.
+- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths.
- This tool can only read files, not directories. To read a directory, use the ls command via Bash.
- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.
- If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.
diff --git a/src/tools/impl/Read.ts b/src/tools/impl/Read.ts
index 72ffcf1..a14ca4e 100644
--- a/src/tools/impl/Read.ts
+++ b/src/tools/impl/Read.ts
@@ -1,16 +1,93 @@
import { promises as fs } from "node:fs";
import * as path from "node:path";
+import type {
+ ImageContent,
+ TextContent,
+} from "@letta-ai/letta-client/resources/agents/messages";
+import { LETTA_CLOUD_API_URL } from "../../auth/oauth.js";
+import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js";
+import { settingsManager } from "../../settings-manager.js";
import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js";
import { LIMITS } from "./truncation.js";
import { validateRequiredParams } from "./validation.js";
+/**
+ * Check if the server supports images in tool responses.
+ * Currently only api.letta.com supports this feature.
+ */
+function serverSupportsImageToolReturns(): boolean {
+ const settings = settingsManager.getSettings();
+ const baseURL =
+ process.env.LETTA_BASE_URL ||
+ settings.env?.LETTA_BASE_URL ||
+ LETTA_CLOUD_API_URL;
+ return baseURL === LETTA_CLOUD_API_URL;
+}
+
interface ReadArgs {
file_path: string;
offset?: number;
limit?: number;
}
+
+// Tool return content types - either a string or array of content parts
+export type ToolReturnContent = string | Array<TextContent | ImageContent>;
+
interface ReadResult {
- content: string;
+ content: ToolReturnContent;
+}
+
+// Supported image extensions
+const IMAGE_EXTENSIONS = new Set([
+ ".png",
+ ".jpg",
+ ".jpeg",
+ ".gif",
+ ".webp",
+ ".bmp",
+]);
+
+function isImageFile(filePath: string): boolean {
+ const ext = path.extname(filePath).toLowerCase();
+ return IMAGE_EXTENSIONS.has(ext);
+}
+
+function getMediaType(ext: string): string {
+  const types: Record<string, string> = {
+ ".png": "image/png",
+ ".jpg": "image/jpeg",
+ ".jpeg": "image/jpeg",
+ ".gif": "image/gif",
+ ".webp": "image/webp",
+ ".bmp": "image/png", // Convert BMP to PNG
+ };
+ return types[ext] || "image/png";
+}
+
+async function readImageFile(
+ filePath: string,
+): Promise<Array<TextContent | ImageContent>> {
+ const buffer = await fs.readFile(filePath);
+ const ext = path.extname(filePath).toLowerCase();
+ const mediaType = getMediaType(ext);
+
+ // Use shared image resize utility
+ const result = await resizeImageIfNeeded(buffer, mediaType);
+
+ return [
+ {
+ type: "text",
+ text: `[Image: ${path.basename(filePath)}${result.resized ? " (resized to fit API limits)" : ""}]`,
+ },
+ {
+ type: "image",
+ source: {
+ type: "base64",
+ media_type: result.mediaType,
+ data: result.data,
+ },
+ },
+ ];
}
 async function isBinaryFile(filePath: string): Promise<boolean> {
@@ -140,6 +217,28 @@ export async function read(args: ReadArgs): Promise<ReadResult> {
const stats = await fs.stat(resolvedPath);
if (stats.isDirectory())
throw new Error(`Path is a directory, not a file: ${resolvedPath}`);
+
+ // Check if this is an image file
+ if (isImageFile(resolvedPath)) {
+ // Check if server supports images in tool responses
+ if (!serverSupportsImageToolReturns()) {
+ throw new Error(
+ `This server does not support images in tool responses.`,
+ );
+ }
+
+ // Images have a higher size limit (20MB raw, will be resized if needed)
+ const maxImageSize = 20 * 1024 * 1024;
+ if (stats.size > maxImageSize) {
+ throw new Error(
+ `Image file too large: ${stats.size} bytes (max ${maxImageSize} bytes)`,
+ );
+ }
+ const imageContent = await readImageFile(resolvedPath);
+ return { content: imageContent };
+ }
+
+ // Regular text file handling
const maxSize = 10 * 1024 * 1024; // 10MB
if (stats.size > maxSize)
throw new Error(
diff --git a/src/tools/impl/ReadFileGemini.ts b/src/tools/impl/ReadFileGemini.ts
index 7b84759..4eac9aa 100644
--- a/src/tools/impl/ReadFileGemini.ts
+++ b/src/tools/impl/ReadFileGemini.ts
@@ -3,7 +3,8 @@
* Uses Gemini's exact schema and description
*/
-import { read } from "./Read";
+import type { TextContent } from "@letta-ai/letta-client/resources/agents/messages";
+import { read, type ToolReturnContent } from "./Read";
interface ReadFileGeminiArgs {
file_path: string;
@@ -11,6 +12,20 @@ interface ReadFileGeminiArgs {
limit?: number;
}
+/**
+ * Extract text from tool return content (for Gemini wrapper)
+ */
+function extractText(content: ToolReturnContent): string {
+ if (typeof content === "string") {
+ return content;
+ }
+ // Extract text from multimodal content (Gemini doesn't support images via this tool)
+ return content
+ .filter((part): part is TextContent => part.type === "text")
+ .map((part) => part.text)
+ .join("\n");
+}
+
export async function read_file_gemini(
args: ReadFileGeminiArgs,
): Promise<{ message: string }> {
@@ -24,6 +39,6 @@ export async function read_file_gemini(
const result = await read(lettaArgs);
- // Read returns { content: string }
- return { message: result.content };
+ // Read returns { content: ToolReturnContent } - extract text for Gemini
+ return { message: extractText(result.content) };
}
diff --git a/src/tools/impl/ReadLSP.ts b/src/tools/impl/ReadLSP.ts
index 689ef7e..cd603a5 100644
--- a/src/tools/impl/ReadLSP.ts
+++ b/src/tools/impl/ReadLSP.ts
@@ -2,7 +2,7 @@
* LSP-enhanced Read tool - wraps the base Read tool and adds LSP diagnostics
* This is used when LETTA_ENABLE_LSP is set
*/
-import { read as baseRead } from "./Read.js";
+import { read as baseRead, type ToolReturnContent } from "./Read.js";
// Format a single diagnostic in opencode style: "ERROR [line:col] message"
function formatDiagnostic(diag: {
@@ -30,7 +30,7 @@ interface ReadLSPArgs {
}
interface ReadLSPResult {
- content: string;
+ content: ToolReturnContent;
}
 export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> {
@@ -42,6 +42,11 @@ export async function read_lsp(args: ReadLSPArgs): Promise {
return result;
}
+ // If content is multimodal (image), skip LSP processing - only applies to text files
+ if (typeof result.content !== "string") {
+ return result;
+ }
+
// Determine if we should include diagnostics
const lineCount = result.content.split("\n").length;
const shouldInclude =
diff --git a/src/tools/manager.ts b/src/tools/manager.ts
index 1229e0d..99df4fc 100644
--- a/src/tools/manager.ts
+++ b/src/tools/manager.ts
@@ -210,8 +210,16 @@ interface ToolDefinition {
fn: (args: ToolArgs) => Promise;
}
+import type {
+ ImageContent,
+ TextContent,
+} from "@letta-ai/letta-client/resources/agents/messages";
+
+// Tool return content can be a string or array of text/image content parts
+export type ToolReturnContent = string | Array<TextContent | ImageContent>;
+
export type ToolExecutionResult = {
- toolReturn: string;
+ toolReturn: ToolReturnContent;
status: "success" | "error";
stdout?: string[];
stderr?: string[];
@@ -628,7 +636,18 @@ function isStringArray(value: unknown): value is string[] {
);
}
-function flattenToolResponse(result: unknown): string {
+/**
+ * Check if an array contains multimodal content (text + images)
+ */
+function isMultimodalContent(
+ arr: unknown[],
+): arr is Array<TextContent | ImageContent> {
+ return arr.every(
+ (item) => isRecord(item) && (item.type === "text" || item.type === "image"),
+ );
+}
+
+function flattenToolResponse(result: unknown): ToolReturnContent {
if (result === null || result === undefined) {
return "";
}
@@ -645,6 +664,11 @@ function flattenToolResponse(result: unknown): string {
return result.message;
}
+ // Check for multimodal content (images) - return as-is without flattening
+ if (Array.isArray(result.content) && isMultimodalContent(result.content)) {
+ return result.content;
+ }
+
if (typeof result.content === "string") {
return result.content;
}
@@ -770,12 +794,16 @@ export async function executeTool(
// Flatten the response to plain text
const flattenedResponse = flattenToolResponse(result);
- // Track tool usage
+ // Track tool usage (calculate size for multimodal content)
+ const responseSize =
+ typeof flattenedResponse === "string"
+ ? flattenedResponse.length
+ : JSON.stringify(flattenedResponse).length;
telemetry.trackToolUsage(
internalName,
toolStatus === "success",
duration,
- flattenedResponse.length,
+ responseSize,
toolStatus === "error" ? "tool_error" : undefined,
stderr ? stderr.join("\n") : undefined,
);