feat: add image reading support to Read tool (#603)

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
Charles Packer
2026-01-20 13:37:18 -08:00
committed by GitHub
parent e6661e7699
commit d34a65323c
7 changed files with 303 additions and 5 deletions

View File

@@ -158,6 +158,10 @@ import {
subscribe as subscribeToSubagents,
} from "./helpers/subagentState";
import { getRandomThinkingVerb } from "./helpers/thinkingMessages";
import {
clearQueuedToolImages,
getAndClearQueuedToolImages,
} from "./helpers/toolImageRegistry";
import {
isFileEditTool,
isFileWriteTool,
@@ -3239,6 +3243,9 @@ export default function App({
// Lock input for async operation (set before any await to prevent queue processing)
setCommandRunning(true);
// Clear any queued tool images from the previous agent context
clearQueuedToolImages();
const inputCmd = "/agents";
const cmdId = uid("cmd");
@@ -3717,9 +3724,44 @@ export default function App({
// Send all results to server if any
if (allResults.length > 0) {
toolResultsInFlightRef.current = true;
await processConversation([
// Check for queued tool images (from Read tool reading image files)
const toolImages = getAndClearQueuedToolImages();
const input: Array<MessageCreate | ApprovalCreate> = [
{ type: "approval", approvals: allResults },
]);
];
// If there are queued images, add them as a user message
if (toolImages.length > 0) {
const imageContentParts: Array<
| { type: "text"; text: string }
| {
type: "image";
source: { type: "base64"; media_type: string; data: string };
}
> = [];
for (const img of toolImages) {
imageContentParts.push({
type: "text",
text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
});
imageContentParts.push({
type: "image",
source: {
type: "base64",
media_type: img.mediaType,
data: img.data,
},
});
}
input.push({
type: "message",
role: "user",
content: imageContentParts as unknown as MessageCreate["content"],
});
}
await processConversation(input);
toolResultsInFlightRef.current = false;
}
} finally {
@@ -4357,6 +4399,9 @@ export default function App({
setCommandRunning(true);
// Clear any queued tool images from the previous conversation
clearQueuedToolImages();
try {
const client = await getClient();
@@ -5577,7 +5622,37 @@ ${gitContext}
}
// Build message content from display value (handles placeholders for text/images)
const contentParts = buildMessageContentFromDisplay(msg);
let contentParts = buildMessageContentFromDisplay(msg);
// Prepend any queued tool images (from Read tool reading image files)
const queuedToolImages = getAndClearQueuedToolImages();
if (queuedToolImages.length > 0) {
const imageParts: Array<
| { type: "text"; text: string }
| {
type: "image";
source: { type: "base64"; media_type: string; data: string };
}
> = [];
for (const img of queuedToolImages) {
// Add system reminder text
imageParts.push({
type: "text",
text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
});
// Add image content
imageParts.push({
type: "image",
source: {
type: "base64",
media_type: img.mediaType,
data: img.data,
},
});
}
// Prepend to contentParts
contentParts = [...imageParts, ...contentParts];
}
// Prepend plan mode reminder if in plan mode
const planModeReminder = getPlanModeReminder();

View File

@@ -0,0 +1,47 @@
// Registry for images read by tools that need to be sent in the next user message turn.
// This is needed because tool returns only support string content - we can't return
// image data directly in tool results to the Letta API.
/**
 * An image produced by a tool call that is waiting to be attached to an
 * outgoing message.
 */
export interface QueuedToolImage {
  /** ID of the tool call that produced the image. */
  toolCallId: string;
  /** Path of the image file that was read. */
  filePath: string;
  /** Image bytes, base64-encoded. */
  data: string;
  /** MIME type of `data`. */
  mediaType: string;
  width: number;
  height: number;
}

// Module-private FIFO of images waiting to be sent.
const pendingImages: QueuedToolImage[] = [];

/**
 * Append an image to the pending queue so it can be included in the next
 * outgoing message. The Read tool calls this when it reads an image file.
 */
export function queueToolImage(image: QueuedToolImage): void {
  pendingImages.push(image);
}

/**
 * Drain the queue: returns every pending image (in insertion order) in a new
 * array and leaves the queue empty. Used while building message content.
 */
export function getAndClearQueuedToolImages(): QueuedToolImage[] {
  // splice(0) removes all elements and hands them back as a fresh array.
  return pendingImages.splice(0);
}

/**
 * Discard every pending image. Invoked when switching conversation/agent so
 * stale images are neither sent to the wrong context nor kept in memory.
 */
export function clearQueuedToolImages(): void {
  pendingImages.splice(0);
}

/**
 * Report whether at least one image is waiting in the queue.
 */
export function hasQueuedToolImages(): boolean {
  return pendingImages.length !== 0;
}

View File

@@ -31,6 +31,7 @@ import { formatErrorDetails } from "./cli/helpers/errorFormatter";
import { safeJsonParseOr } from "./cli/helpers/safeJsonParse";
import { drainStreamWithResume } from "./cli/helpers/stream";
import { StreamProcessor } from "./cli/helpers/streamProcessor";
import { getAndClearQueuedToolImages } from "./cli/helpers/toolImageRegistry";
import { settingsManager } from "./settings-manager";
import { checkToolPermission } from "./tools/manager";
import type {
@@ -934,11 +935,42 @@ export async function handleHeadlessCommand(
// Add user prompt
messageContent += prompt;
// Build content parts (text + any queued tool images from Read tool)
type ContentPart =
| { type: "text"; text: string }
| {
type: "image";
source: { type: "base64"; media_type: string; data: string };
};
const contentParts: ContentPart[] = [];
// Check for queued tool images (from Read tool reading image files)
const queuedToolImages = getAndClearQueuedToolImages();
if (queuedToolImages.length > 0) {
for (const img of queuedToolImages) {
contentParts.push({
type: "text",
text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
});
contentParts.push({
type: "image",
source: {
type: "base64",
media_type: img.mediaType,
data: img.data,
},
});
}
}
// Add the text message content
contentParts.push({ type: "text", text: messageContent });
// Start with the user message
let currentInput: Array<MessageCreate | ApprovalCreate> = [
{
role: "user",
content: [{ type: "text", text: messageContent }],
content: contentParts as unknown as MessageCreate["content"],
},
];
@@ -1241,6 +1273,9 @@ export async function handleHeadlessCommand(
);
const executedResults = await executeApprovalBatch(decisions);
// Check for queued tool images (from Read tool reading image files)
const toolImages = getAndClearQueuedToolImages();
// Send all results in one batch
currentInput = [
{
@@ -1248,6 +1283,36 @@ export async function handleHeadlessCommand(
approvals: executedResults as ApprovalResult[],
},
];
// If there are queued images, add them as a user message
if (toolImages.length > 0) {
const imageContentParts: Array<
| { type: "text"; text: string }
| {
type: "image";
source: { type: "base64"; media_type: string; data: string };
}
> = [];
for (const img of toolImages) {
imageContentParts.push({
type: "text",
text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
});
imageContentParts.push({
type: "image",
source: {
type: "base64",
media_type: img.mediaType,
data: img.data,
},
});
}
currentInput.push({
role: "user",
content: imageContentParts as unknown as MessageCreate["content"],
});
}
continue;
}

View File

@@ -9,6 +9,8 @@ Usage:
- You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
- Any lines longer than 2000 characters will be truncated
- Results are returned using cat -n format, with line numbers starting at 1
- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file the contents are presented visually as Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits.
- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths.
- This tool can only read files, not directories. To read a directory, use the ls command via Bash.
- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.
- If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.

View File

@@ -1,9 +1,78 @@
import { promises as fs } from "node:fs";
import * as path from "node:path";
import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js";
import { queueToolImage } from "../../cli/helpers/toolImageRegistry.js";
import { getToolExecutionContext } from "../toolContext.js";
import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js";
import { LIMITS } from "./truncation.js";
import { validateRequiredParams } from "./validation.js";
// File extensions (lowercase, with leading dot) that are handled as images.
const IMAGE_EXTENSIONS = new Set<string>([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".bmp",
]);

/**
 * Decide whether a path refers to an image, judged solely by its extension
 * (compared case-insensitively). File contents are never inspected.
 */
function isImageFile(filePath: string): boolean {
  return IMAGE_EXTENSIONS.has(path.extname(filePath).toLowerCase());
}
/**
 * Map a file path to an image MIME type based on its extension
 * (case-insensitive). Unknown or missing extensions yield "image/png".
 */
function getMimeType(filePath: string): string {
  switch (path.extname(filePath).toLowerCase()) {
    case ".jpg":
    case ".jpeg":
      return "image/jpeg";
    case ".gif":
      return "image/gif";
    case ".webp":
      return "image/webp";
    case ".bmp":
      return "image/bmp";
    default:
      // Handles ".png" itself as well as any unrecognized extension.
      return "image/png";
  }
}
/**
 * Load an image from disk, run it through `resizeImageIfNeeded`, and hand the
 * result to `queueToolImage`.
 *
 * The resolved value carries only a textual placeholder (path plus
 * dimensions); the image data itself travels via the queue and is attached to
 * a later message by whoever drains it.
 */
async function readImageFile(filePath: string): Promise<ReadResult> {
  const rawBytes = await fs.readFile(filePath);
  const {
    data,
    mediaType,
    width,
    height,
    resized: wasResized,
  } = await resizeImageIfNeeded(rawBytes, getMimeType(filePath));

  // The tool call ID comes from the ambient execution context; fall back to
  // "unknown" when there is no context or it carries no ID.
  // NOTE(review): the context is read after the awaits above; if tools can run
  // concurrently it may no longer belong to this call — confirm with manager.
  const toolCallId = getToolExecutionContext()?.toolCallId || "unknown";

  // Queue the (possibly resized) image for delivery on the next turn.
  queueToolImage({ toolCallId, filePath, data, mediaType, width, height });

  const dimensions = `${width}x${height}`;
  const sizeNote = wasResized
    ? ` (resized to ${dimensions})`
    : ` (${dimensions})`;
  return { content: `[Image: ${filePath}${sizeNote} - queued for display]` };
}
interface ReadArgs {
file_path: string;
offset?: number;
@@ -145,6 +214,12 @@ export async function read(args: ReadArgs): Promise<ReadResult> {
throw new Error(
`File too large: ${stats.size} bytes (max ${maxSize} bytes)`,
);
// Handle image files specially - read, resize, and queue for display
if (isImageFile(resolvedPath)) {
return await readImageFile(resolvedPath);
}
if (await isBinaryFile(resolvedPath))
throw new Error(`Cannot read binary file: ${resolvedPath}`);
const content = await fs.readFile(resolvedPath, "utf-8");

View File

@@ -2,6 +2,7 @@ import { getModelInfo } from "../agent/model";
import { getAllSubagentConfigs } from "../agent/subagents";
import { INTERRUPTED_BY_USER } from "../constants";
import { telemetry } from "../telemetry";
import { setToolExecutionContext } from "./toolContext";
import { TOOL_DEFINITIONS, type ToolName } from "./toolDefinitions";
export const TOOL_NAMES = Object.keys(TOOL_DEFINITIONS) as ToolName[];
@@ -754,7 +755,14 @@ export async function executeTool(
}
}
const result = await tool.fn(enhancedArgs);
// Set execution context for tools that need it (e.g., Read for image queuing)
setToolExecutionContext({ toolCallId: options?.toolCallId });
let result: unknown;
try {
result = await tool.fn(enhancedArgs);
} finally {
setToolExecutionContext(null);
}
const duration = Date.now() - startTime;
// Extract stdout/stderr if present (for bash tools)

26
src/tools/toolContext.ts Normal file
View File

@@ -0,0 +1,26 @@
// Tool execution context - allows tools to access execution metadata
// Separate file to avoid circular dependencies with manager.ts
/** Metadata describing the tool invocation that is currently running. */
interface ToolExecutionContext {
  /** ID of the tool call being executed, when the caller supplied one. */
  toolCallId?: string;
}

// Single module-level slot; `null` means no context is currently set.
let activeContext: ToolExecutionContext | null = null;

/**
 * Return the context most recently stored with `setToolExecutionContext`, or
 * `null` when none is set. Tools use this to read execution metadata such as
 * the tool call ID (e.g. Read, when queuing an image).
 */
export function getToolExecutionContext(): ToolExecutionContext | null {
  return activeContext;
}

/**
 * Replace the stored context. Pass `null` to clear it.
 * manager.ts sets this before running a tool and clears it afterwards.
 */
export function setToolExecutionContext(
  context: ToolExecutionContext | null,
): void {
  activeContext = context;
}