feat: add image reading support to Read tool (#603)
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -158,6 +158,10 @@ import {
|
||||
subscribe as subscribeToSubagents,
|
||||
} from "./helpers/subagentState";
|
||||
import { getRandomThinkingVerb } from "./helpers/thinkingMessages";
|
||||
import {
|
||||
clearQueuedToolImages,
|
||||
getAndClearQueuedToolImages,
|
||||
} from "./helpers/toolImageRegistry";
|
||||
import {
|
||||
isFileEditTool,
|
||||
isFileWriteTool,
|
||||
@@ -3239,6 +3243,9 @@ export default function App({
|
||||
// Lock input for async operation (set before any await to prevent queue processing)
|
||||
setCommandRunning(true);
|
||||
|
||||
// Clear any queued tool images from the previous agent context
|
||||
clearQueuedToolImages();
|
||||
|
||||
const inputCmd = "/agents";
|
||||
const cmdId = uid("cmd");
|
||||
|
||||
@@ -3717,9 +3724,44 @@ export default function App({
|
||||
// Send all results to server if any
|
||||
if (allResults.length > 0) {
|
||||
toolResultsInFlightRef.current = true;
|
||||
await processConversation([
|
||||
|
||||
// Check for queued tool images (from Read tool reading image files)
|
||||
const toolImages = getAndClearQueuedToolImages();
|
||||
const input: Array<MessageCreate | ApprovalCreate> = [
|
||||
{ type: "approval", approvals: allResults },
|
||||
]);
|
||||
];
|
||||
|
||||
// If there are queued images, add them as a user message
|
||||
if (toolImages.length > 0) {
|
||||
const imageContentParts: Array<
|
||||
| { type: "text"; text: string }
|
||||
| {
|
||||
type: "image";
|
||||
source: { type: "base64"; media_type: string; data: string };
|
||||
}
|
||||
> = [];
|
||||
for (const img of toolImages) {
|
||||
imageContentParts.push({
|
||||
type: "text",
|
||||
text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
|
||||
});
|
||||
imageContentParts.push({
|
||||
type: "image",
|
||||
source: {
|
||||
type: "base64",
|
||||
media_type: img.mediaType,
|
||||
data: img.data,
|
||||
},
|
||||
});
|
||||
}
|
||||
input.push({
|
||||
type: "message",
|
||||
role: "user",
|
||||
content: imageContentParts as unknown as MessageCreate["content"],
|
||||
});
|
||||
}
|
||||
|
||||
await processConversation(input);
|
||||
toolResultsInFlightRef.current = false;
|
||||
}
|
||||
} finally {
|
||||
@@ -4357,6 +4399,9 @@ export default function App({
|
||||
|
||||
setCommandRunning(true);
|
||||
|
||||
// Clear any queued tool images from the previous conversation
|
||||
clearQueuedToolImages();
|
||||
|
||||
try {
|
||||
const client = await getClient();
|
||||
|
||||
@@ -5577,7 +5622,37 @@ ${gitContext}
|
||||
}
|
||||
|
||||
// Build message content from display value (handles placeholders for text/images)
|
||||
const contentParts = buildMessageContentFromDisplay(msg);
|
||||
let contentParts = buildMessageContentFromDisplay(msg);
|
||||
|
||||
// Prepend any queued tool images (from Read tool reading image files)
|
||||
const queuedToolImages = getAndClearQueuedToolImages();
|
||||
if (queuedToolImages.length > 0) {
|
||||
const imageParts: Array<
|
||||
| { type: "text"; text: string }
|
||||
| {
|
||||
type: "image";
|
||||
source: { type: "base64"; media_type: string; data: string };
|
||||
}
|
||||
> = [];
|
||||
for (const img of queuedToolImages) {
|
||||
// Add system reminder text
|
||||
imageParts.push({
|
||||
type: "text",
|
||||
text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
|
||||
});
|
||||
// Add image content
|
||||
imageParts.push({
|
||||
type: "image",
|
||||
source: {
|
||||
type: "base64",
|
||||
media_type: img.mediaType,
|
||||
data: img.data,
|
||||
},
|
||||
});
|
||||
}
|
||||
// Prepend to contentParts
|
||||
contentParts = [...imageParts, ...contentParts];
|
||||
}
|
||||
|
||||
// Prepend plan mode reminder if in plan mode
|
||||
const planModeReminder = getPlanModeReminder();
|
||||
|
||||
47
src/cli/helpers/toolImageRegistry.ts
Normal file
47
src/cli/helpers/toolImageRegistry.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
// Holding area for images read by tools. Tool returns only support string
// content, so image data cannot travel back to the Letta API inside a tool
// result; it waits here until it can be attached to the next user message turn.

/** An image captured by a tool call, waiting to be attached to a user message. */
export interface QueuedToolImage {
  /** ID of the tool call that produced the image. */
  toolCallId: string;
  /** Path of the file the image was read from. */
  filePath: string;
  /** Image bytes, base64-encoded. */
  data: string;
  /** MIME type of `data`. */
  mediaType: string;
  width: number;
  height: number;
}

// Module-level FIFO of images not yet attached to a message.
const pendingImages: QueuedToolImage[] = [];

/**
 * Append an image to the queue so it is sent with the next user message.
 * Called by the Read tool when reading an image file.
 */
export function queueToolImage(image: QueuedToolImage): void {
  pendingImages.push(image);
}

/**
 * Drain the queue.
 * Called when building the user message content.
 *
 * @returns The queued images in insertion order, as a fresh array that later
 *   queue operations do not affect.
 */
export function getAndClearQueuedToolImages(): QueuedToolImage[] {
  // splice removes every element in place and hands them back as a new array.
  return pendingImages.splice(0, pendingImages.length);
}

/**
 * Discard every queued image without returning it.
 * Called on conversation/agent switch to prevent memory leaks.
 */
export function clearQueuedToolImages(): void {
  pendingImages.splice(0, pendingImages.length);
}

/**
 * Report whether at least one image is waiting in the queue.
 */
export function hasQueuedToolImages(): boolean {
  return pendingImages.length !== 0;
}
|
||||
@@ -31,6 +31,7 @@ import { formatErrorDetails } from "./cli/helpers/errorFormatter";
|
||||
import { safeJsonParseOr } from "./cli/helpers/safeJsonParse";
|
||||
import { drainStreamWithResume } from "./cli/helpers/stream";
|
||||
import { StreamProcessor } from "./cli/helpers/streamProcessor";
|
||||
import { getAndClearQueuedToolImages } from "./cli/helpers/toolImageRegistry";
|
||||
import { settingsManager } from "./settings-manager";
|
||||
import { checkToolPermission } from "./tools/manager";
|
||||
import type {
|
||||
@@ -934,11 +935,42 @@ export async function handleHeadlessCommand(
|
||||
// Add user prompt
|
||||
messageContent += prompt;
|
||||
|
||||
// Build content parts (text + any queued tool images from Read tool)
|
||||
type ContentPart =
|
||||
| { type: "text"; text: string }
|
||||
| {
|
||||
type: "image";
|
||||
source: { type: "base64"; media_type: string; data: string };
|
||||
};
|
||||
const contentParts: ContentPart[] = [];
|
||||
|
||||
// Check for queued tool images (from Read tool reading image files)
|
||||
const queuedToolImages = getAndClearQueuedToolImages();
|
||||
if (queuedToolImages.length > 0) {
|
||||
for (const img of queuedToolImages) {
|
||||
contentParts.push({
|
||||
type: "text",
|
||||
text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
|
||||
});
|
||||
contentParts.push({
|
||||
type: "image",
|
||||
source: {
|
||||
type: "base64",
|
||||
media_type: img.mediaType,
|
||||
data: img.data,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Add the text message content
|
||||
contentParts.push({ type: "text", text: messageContent });
|
||||
|
||||
// Start with the user message
|
||||
let currentInput: Array<MessageCreate | ApprovalCreate> = [
|
||||
{
|
||||
role: "user",
|
||||
content: [{ type: "text", text: messageContent }],
|
||||
content: contentParts as unknown as MessageCreate["content"],
|
||||
},
|
||||
];
|
||||
|
||||
@@ -1241,6 +1273,9 @@ export async function handleHeadlessCommand(
|
||||
);
|
||||
const executedResults = await executeApprovalBatch(decisions);
|
||||
|
||||
// Check for queued tool images (from Read tool reading image files)
|
||||
const toolImages = getAndClearQueuedToolImages();
|
||||
|
||||
// Send all results in one batch
|
||||
currentInput = [
|
||||
{
|
||||
@@ -1248,6 +1283,36 @@ export async function handleHeadlessCommand(
|
||||
approvals: executedResults as ApprovalResult[],
|
||||
},
|
||||
];
|
||||
|
||||
// If there are queued images, add them as a user message
|
||||
if (toolImages.length > 0) {
|
||||
const imageContentParts: Array<
|
||||
| { type: "text"; text: string }
|
||||
| {
|
||||
type: "image";
|
||||
source: { type: "base64"; media_type: string; data: string };
|
||||
}
|
||||
> = [];
|
||||
for (const img of toolImages) {
|
||||
imageContentParts.push({
|
||||
type: "text",
|
||||
text: `<system-reminder>Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):</system-reminder>`,
|
||||
});
|
||||
imageContentParts.push({
|
||||
type: "image",
|
||||
source: {
|
||||
type: "base64",
|
||||
media_type: img.mediaType,
|
||||
data: img.data,
|
||||
},
|
||||
});
|
||||
}
|
||||
currentInput.push({
|
||||
role: "user",
|
||||
content: imageContentParts as unknown as MessageCreate["content"],
|
||||
});
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@@ -9,6 +9,8 @@ Usage:
|
||||
- You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
|
||||
- Any lines longer than 2000 characters will be truncated
|
||||
- Results are returned using cat -n format, with line numbers starting at 1
|
||||
- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file, the contents are presented visually, since Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits.
|
||||
- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths.
|
||||
- This tool can only read files, not directories. To read a directory, use the ls command via Bash.
|
||||
- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.
|
||||
- If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.
|
||||
|
||||
@@ -1,9 +1,78 @@
|
||||
import { promises as fs } from "node:fs";
|
||||
import * as path from "node:path";
|
||||
import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js";
|
||||
import { queueToolImage } from "../../cli/helpers/toolImageRegistry.js";
|
||||
import { getToolExecutionContext } from "../toolContext.js";
|
||||
import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js";
|
||||
import { LIMITS } from "./truncation.js";
|
||||
import { validateRequiredParams } from "./validation.js";
|
||||
|
||||
// Single source of truth for supported image types: lowercase file extension
// (including the leading dot) -> MIME type. Built once at module load so the
// extension check and the MIME lookup cannot drift apart.
const IMAGE_MIME_TYPES: ReadonlyMap<string, string> = new Map([
  [".png", "image/png"],
  [".jpg", "image/jpeg"],
  [".jpeg", "image/jpeg"],
  [".gif", "image/gif"],
  [".webp", "image/webp"],
  [".bmp", "image/bmp"],
]);

// Supported image extensions (lowercase), derived from the table above.
const IMAGE_EXTENSIONS = new Set(IMAGE_MIME_TYPES.keys());

/**
 * Check if a file path is an image based on its extension.
 *
 * The comparison is case-insensitive; file contents are not inspected.
 *
 * @param filePath - Path whose extension is examined.
 * @returns `true` when the extension is one of the supported image extensions.
 */
function isImageFile(filePath: string): boolean {
  const ext = path.extname(filePath).toLowerCase();
  return IMAGE_EXTENSIONS.has(ext);
}

/**
 * Get the MIME type for a file path from its extension.
 *
 * @param filePath - Path whose extension is examined (case-insensitive).
 * @returns The MIME type for a supported image extension, or `"image/png"`
 *   when the extension is missing or not recognised.
 */
function getMimeType(filePath: string): string {
  const ext = path.extname(filePath).toLowerCase();
  return IMAGE_MIME_TYPES.get(ext) ?? "image/png";
}
|
||||
|
||||
/**
 * Read an image file, resize it if needed, and queue it for display.
 *
 * The image data is not returned to the caller: it is handed to
 * `queueToolImage`, and a short text placeholder is returned instead - the
 * actual image is sent in the next user message.
 *
 * @param filePath - Path of the image file to read.
 * @returns A result whose `content` is a placeholder naming the file and its
 *   (possibly resized) dimensions.
 * @throws Propagates any rejection from `fs.readFile` or `resizeImageIfNeeded`.
 */
async function readImageFile(filePath: string): Promise<ReadResult> {
  // Capture the tool call ID before the first await. The execution context is
  // read from a shared getter, and reading it only after yielding to the event
  // loop risks seeing a context that has since been cleared or replaced.
  // NOTE(review): callers may already have awaited before reaching this point,
  // so this narrows the window rather than closing it - confirm whether tool
  // executions can overlap.
  const toolCallId = getToolExecutionContext()?.toolCallId || "unknown";

  const buffer = await fs.readFile(filePath);
  const resized = await resizeImageIfNeeded(buffer, getMimeType(filePath));

  // Queue for next turn
  queueToolImage({
    toolCallId,
    filePath,
    data: resized.data,
    mediaType: resized.mediaType,
    width: resized.width,
    height: resized.height,
  });

  const dimensions = `${resized.width}x${resized.height}`;
  const resizeNote = resized.resized
    ? ` (resized to ${dimensions})`
    : ` (${dimensions})`;

  return {
    content: `[Image: ${filePath}${resizeNote} - queued for display]`,
  };
}
|
||||
|
||||
interface ReadArgs {
|
||||
file_path: string;
|
||||
offset?: number;
|
||||
@@ -145,6 +214,12 @@ export async function read(args: ReadArgs): Promise<ReadResult> {
|
||||
throw new Error(
|
||||
`File too large: ${stats.size} bytes (max ${maxSize} bytes)`,
|
||||
);
|
||||
|
||||
// Handle image files specially - read, resize, and queue for display
|
||||
if (isImageFile(resolvedPath)) {
|
||||
return await readImageFile(resolvedPath);
|
||||
}
|
||||
|
||||
if (await isBinaryFile(resolvedPath))
|
||||
throw new Error(`Cannot read binary file: ${resolvedPath}`);
|
||||
const content = await fs.readFile(resolvedPath, "utf-8");
|
||||
|
||||
@@ -2,6 +2,7 @@ import { getModelInfo } from "../agent/model";
|
||||
import { getAllSubagentConfigs } from "../agent/subagents";
|
||||
import { INTERRUPTED_BY_USER } from "../constants";
|
||||
import { telemetry } from "../telemetry";
|
||||
import { setToolExecutionContext } from "./toolContext";
|
||||
import { TOOL_DEFINITIONS, type ToolName } from "./toolDefinitions";
|
||||
|
||||
export const TOOL_NAMES = Object.keys(TOOL_DEFINITIONS) as ToolName[];
|
||||
@@ -754,7 +755,14 @@ export async function executeTool(
|
||||
}
|
||||
}
|
||||
|
||||
const result = await tool.fn(enhancedArgs);
|
||||
// Set execution context for tools that need it (e.g., Read for image queuing)
|
||||
setToolExecutionContext({ toolCallId: options?.toolCallId });
|
||||
let result: unknown;
|
||||
try {
|
||||
result = await tool.fn(enhancedArgs);
|
||||
} finally {
|
||||
setToolExecutionContext(null);
|
||||
}
|
||||
const duration = Date.now() - startTime;
|
||||
|
||||
// Extract stdout/stderr if present (for bash tools)
|
||||
|
||||
26
src/tools/toolContext.ts
Normal file
26
src/tools/toolContext.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
// Tool execution context: metadata about the tool call currently being run,
// made available to the tool implementation itself.
// Kept in its own file to avoid circular dependencies with manager.ts.

/** Metadata describing the tool call that is currently executing. */
interface ToolExecutionContext {
  toolCallId?: string;
}

// Single module-level slot; `null` means no context has been set.
// NOTE(review): one shared slot cannot distinguish overlapping tool
// executions - confirm callers never run tools concurrently.
let activeContext: ToolExecutionContext | null = null;

/**
 * Return the context most recently stored by `setToolExecutionContext`.
 * Called by tools that need access to execution metadata (e.g., Read for image queuing).
 *
 * @returns The stored context, or `null` when none is set.
 */
export function getToolExecutionContext(): ToolExecutionContext | null {
  return activeContext;
}

/**
 * Replace the stored tool execution context.
 * Called by manager.ts before executing a tool.
 *
 * @param context - The context to store, or `null` to clear it.
 */
export function setToolExecutionContext(
  context: ToolExecutionContext | null,
): void {
  activeContext = context;
}
|
||||
Reference in New Issue
Block a user