feat: add image reading support to Read tool (#614)

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
Charles Packer
2026-01-20 22:38:33 -08:00
committed by GitHub
parent 1168a83716
commit 5635156b51
10 changed files with 231 additions and 18 deletions

View File

@@ -30,7 +30,7 @@
"access": "public"
},
"dependencies": {
"@letta-ai/letta-client": "^1.6.8",
"@letta-ai/letta-client": "^1.7.2",
"glob": "^13.0.0",
"ink-link": "^5.0.0",
"open": "^10.2.0",
@@ -43,6 +43,7 @@
"@types/bun": "latest",
"@types/diff": "^8.0.0",
"@types/picomatch": "^4.0.2",
"@types/react": "^19.2.9",
"diff": "^8.0.2",
"husky": "9.1.7",
"ink": "^5.0.0",

View File

@@ -3,12 +3,32 @@
import * as path from "node:path";
import type {
ApprovalReturn,
TextContent,
ToolReturn,
} from "@letta-ai/letta-client/resources/agents/messages";
import type { ToolReturnMessage } from "@letta-ai/letta-client/resources/tools";
import type { ApprovalRequest } from "../cli/helpers/stream";
import { INTERRUPTED_BY_USER } from "../constants";
import { executeTool, type ToolExecutionResult } from "../tools/manager";
import {
executeTool,
type ToolExecutionResult,
type ToolReturnContent,
} from "../tools/manager";
/**
* Extract displayable text from tool return content (for UI display).
* Multimodal content returns the text parts concatenated.
*/
/**
 * Extract displayable text from tool return content (for UI display).
 * Multimodal content returns the text parts concatenated.
 */
export function getDisplayableToolReturn(content: ToolReturnContent): string {
  if (typeof content !== "string") {
    // Multimodal array: keep only the text parts, newline-separated
    const textParts: string[] = [];
    for (const part of content) {
      if (part.type === "text") {
        textParts.push(part.text);
      }
    }
    return textParts.join("\n");
  }
  return content;
}
/**
* Tools that are safe to execute in parallel (read-only or independent).
@@ -235,13 +255,14 @@ async function executeSingleDecision(
);
// Update UI if callback provided (interactive mode)
// Note: UI display uses text-only version, backend gets full multimodal content
if (onChunk) {
onChunk({
message_type: "tool_return_message",
id: "dummy",
date: new Date().toISOString(),
tool_call_id: decision.approval.toolCallId,
tool_return: toolResult.toolReturn,
tool_return: getDisplayableToolReturn(toolResult.toolReturn),
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,
@@ -251,7 +272,7 @@ async function executeSingleDecision(
return {
type: "tool",
tool_call_id: decision.approval.toolCallId,
tool_return: toolResult.toolReturn,
tool_return: toolResult.toolReturn, // Full multimodal content for backend
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,

View File

@@ -26,6 +26,7 @@ import {
import {
type ApprovalResult,
executeAutoAllowedTools,
getDisplayableToolReturn,
} from "../agent/approval-execution";
import {
buildApprovalRecoveryMessage,
@@ -7333,7 +7334,7 @@ DO NOT respond to these messages or otherwise consider them in your response unl
id: "dummy",
date: new Date().toISOString(),
tool_call_id: approval.toolCallId,
tool_return: toolResult.toolReturn,
tool_return: getDisplayableToolReturn(toolResult.toolReturn),
status: toolResult.status,
stdout: toolResult.stdout,
stderr: toolResult.stderr,

View File

@@ -594,12 +594,30 @@ export const ToolCallMessage = memo(
}
}
// Check if this is a file read tool - show line count summary
// Check if this is a file read tool - show line count or image summary
if (
isFileReadTool(rawName) &&
line.resultOk !== false &&
line.resultText
) {
// Check if this is an image result (starts with "[Image: filename]")
const isImageResult = line.resultText.startsWith("[Image: ");
if (isImageResult) {
return (
<Box flexDirection="row">
<Box width={prefixWidth} flexShrink={0}>
<Text>{prefix}</Text>
</Box>
<Box flexGrow={1} width={contentWidth}>
<Text>
Read <Text bold>1</Text> image
</Text>
</Box>
</Box>
);
}
// Count lines in the result (the content returned by Read tool)
const lineCount = line.resultText.split("\n").length;
return (
@@ -609,7 +627,8 @@ export const ToolCallMessage = memo(
</Box>
<Box flexGrow={1} width={contentWidth}>
<Text>
Read <Text bold>{lineCount}</Text> lines
Read <Text bold>{lineCount}</Text> line
{lineCount !== 1 ? "s" : ""}
</Text>
</Box>
</Box>

View File

@@ -1,10 +1,30 @@
import type {
ImageContent,
LettaAssistantMessageContentUnion,
LettaUserMessageContentUnion,
Message,
TextContent,
} from "@letta-ai/letta-client/resources/agents/messages";
import type { Buffers } from "./accumulator";
/**
* Extract displayable text from tool return content.
* Multimodal content returns the text parts concatenated.
*/
/**
 * Extract displayable text from tool return content.
 * Multimodal content returns the text parts concatenated.
 */
function getDisplayableToolReturn(
  content: string | Array<TextContent | ImageContent> | undefined,
): string {
  if (typeof content === "string") return content;
  if (!content) return "";
  // Multimodal array: collect just the text parts
  const pieces: string[] = [];
  for (const part of content) {
    if (part.type === "text") {
      pieces.push(part.text);
    }
  }
  return pieces.join("\n");
}
// const PASTE_LINE_THRESHOLD = 5;
// const PASTE_CHAR_THRESHOLD = 500;
const CLIP_CHAR_LIMIT_TEXT = 500;
@@ -238,7 +258,8 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void {
// Update the existing line with the result
// Handle both func_response (streaming) and tool_return (SDK) properties
const resultText =
// tool_return can be multimodal (string or array of content parts)
const rawResult =
("func_response" in toolReturn
? toolReturn.func_response
: undefined) ||
@@ -246,6 +267,7 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void {
? toolReturn.tool_return
: undefined) ||
"";
const resultText = getDisplayableToolReturn(rawResult);
buffers.byId.set(toolCallLineId, {
...existingLine,
resultText,

View File

@@ -9,6 +9,8 @@ Usage:
- You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
- Any lines longer than 2000 characters will be truncated
- Results are returned using cat -n format, with line numbers starting at 1
- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file, the contents are presented visually, since Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits.
- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths.
- This tool can only read files, not directories. To read a directory, use the ls command via Bash.
- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.
- If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.

View File

@@ -1,16 +1,93 @@
import { promises as fs } from "node:fs";
import * as path from "node:path";
import type {
ImageContent,
TextContent,
} from "@letta-ai/letta-client/resources/agents/messages";
import { LETTA_CLOUD_API_URL } from "../../auth/oauth.js";
import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js";
import { settingsManager } from "../../settings-manager.js";
import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js";
import { LIMITS } from "./truncation.js";
import { validateRequiredParams } from "./validation.js";
/**
* Check if the server supports images in tool responses.
* Currently only api.letta.com supports this feature.
*/
/**
 * Check if the server supports images in tool responses.
 * Currently only api.letta.com supports this feature.
 */
function serverSupportsImageToolReturns(): boolean {
  const settings = settingsManager.getSettings();
  // Resolution order: process env wins, then the settings file, then the
  // cloud default. Uses `||` (not `??`) so empty-string values also fall
  // through to the next candidate.
  const resolvedBaseURL =
    process.env.LETTA_BASE_URL ||
    settings.env?.LETTA_BASE_URL ||
    LETTA_CLOUD_API_URL;
  return resolvedBaseURL === LETTA_CLOUD_API_URL;
}
interface ReadArgs {
file_path: string;
offset?: number;
limit?: number;
}
// Tool return content types - either a string or array of content parts
export type ToolReturnContent = string | Array<TextContent | ImageContent>;
interface ReadResult {
content: string;
content: ToolReturnContent;
}
// File extensions the Read tool treats as images
const IMAGE_EXTENSIONS = new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".bmp",
]);

/**
 * True when the path's extension (compared case-insensitively) is one of
 * the supported image formats.
 */
function isImageFile(filePath: string): boolean {
  return IMAGE_EXTENSIONS.has(path.extname(filePath).toLowerCase());
}
/**
 * Map a lowercase file extension to the media type reported for the image.
 * Unknown extensions default to "image/png".
 */
function getMediaType(ext: string): string {
  switch (ext) {
    case ".png":
      return "image/png";
    case ".jpg":
    case ".jpeg":
      return "image/jpeg";
    case ".gif":
      return "image/gif";
    case ".webp":
      return "image/webp";
    case ".bmp":
      // NOTE(review): .bmp is reported as image/png — presumably
      // resizeImageIfNeeded re-encodes BMP to PNG; verify, since the raw
      // bytes are BMP at this point.
      return "image/png";
    default:
      return "image/png";
  }
}
async function readImageFile(
filePath: string,
): Promise<Array<TextContent | ImageContent>> {
const buffer = await fs.readFile(filePath);
const ext = path.extname(filePath).toLowerCase();
const mediaType = getMediaType(ext);
// Use shared image resize utility
const result = await resizeImageIfNeeded(buffer, mediaType);
return [
{
type: "text",
text: `[Image: ${path.basename(filePath)}${result.resized ? " (resized to fit API limits)" : ""}]`,
},
{
type: "image",
source: {
type: "base64",
media_type: result.mediaType,
data: result.data,
},
},
];
}
async function isBinaryFile(filePath: string): Promise<boolean> {
@@ -140,6 +217,28 @@ export async function read(args: ReadArgs): Promise<ReadResult> {
const stats = await fs.stat(resolvedPath);
if (stats.isDirectory())
throw new Error(`Path is a directory, not a file: ${resolvedPath}`);
// Check if this is an image file
if (isImageFile(resolvedPath)) {
// Check if server supports images in tool responses
if (!serverSupportsImageToolReturns()) {
throw new Error(
`This server does not support images in tool responses.`,
);
}
// Images have a higher size limit (20MB raw, will be resized if needed)
const maxImageSize = 20 * 1024 * 1024;
if (stats.size > maxImageSize) {
throw new Error(
`Image file too large: ${stats.size} bytes (max ${maxImageSize} bytes)`,
);
}
const imageContent = await readImageFile(resolvedPath);
return { content: imageContent };
}
// Regular text file handling
const maxSize = 10 * 1024 * 1024; // 10MB
if (stats.size > maxSize)
throw new Error(

View File

@@ -3,7 +3,8 @@
* Uses Gemini's exact schema and description
*/
import { read } from "./Read";
import type { TextContent } from "@letta-ai/letta-client/resources/agents/messages";
import { read, type ToolReturnContent } from "./Read";
interface ReadFileGeminiArgs {
file_path: string;
@@ -11,6 +12,20 @@ interface ReadFileGeminiArgs {
limit?: number;
}
/**
* Extract text from tool return content (for Gemini wrapper)
*/
/**
 * Extract text from tool return content (for Gemini wrapper)
 */
function extractText(content: ToolReturnContent): string {
  if (typeof content === "string") {
    return content;
  }
  // Gemini doesn't support images via this tool, so keep only the text parts
  const texts: string[] = [];
  for (const part of content) {
    if (part.type === "text") {
      texts.push(part.text);
    }
  }
  return texts.join("\n");
}
export async function read_file_gemini(
args: ReadFileGeminiArgs,
): Promise<{ message: string }> {
@@ -24,6 +39,6 @@ export async function read_file_gemini(
const result = await read(lettaArgs);
// Read returns { content: string }
return { message: result.content };
// Read returns { content: ToolReturnContent } - extract text for Gemini
return { message: extractText(result.content) };
}

View File

@@ -2,7 +2,7 @@
* LSP-enhanced Read tool - wraps the base Read tool and adds LSP diagnostics
* This is used when LETTA_ENABLE_LSP is set
*/
import { read as baseRead } from "./Read.js";
import { read as baseRead, type ToolReturnContent } from "./Read.js";
// Format a single diagnostic in opencode style: "ERROR [line:col] message"
function formatDiagnostic(diag: {
@@ -30,7 +30,7 @@ interface ReadLSPArgs {
}
interface ReadLSPResult {
content: string;
content: ToolReturnContent;
}
export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> {
@@ -42,6 +42,11 @@ export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> {
return result;
}
// If content is multimodal (image), skip LSP processing - only applies to text files
if (typeof result.content !== "string") {
return result;
}
// Determine if we should include diagnostics
const lineCount = result.content.split("\n").length;
const shouldInclude =

View File

@@ -210,8 +210,16 @@ interface ToolDefinition {
fn: (args: ToolArgs) => Promise<unknown>;
}
import type {
ImageContent,
TextContent,
} from "@letta-ai/letta-client/resources/agents/messages";
// Tool return content can be a string or array of text/image content parts
export type ToolReturnContent = string | Array<TextContent | ImageContent>;
export type ToolExecutionResult = {
toolReturn: string;
toolReturn: ToolReturnContent;
status: "success" | "error";
stdout?: string[];
stderr?: string[];
@@ -628,7 +636,18 @@ function isStringArray(value: unknown): value is string[] {
);
}
function flattenToolResponse(result: unknown): string {
/**
* Check if an array contains multimodal content (text + images)
*/
/**
 * Check if an array contains multimodal content (text + images).
 * Note: `every` on an empty array is true, so [] is accepted as multimodal.
 */
function isMultimodalContent(
  arr: unknown[],
): arr is Array<TextContent | ImageContent> {
  for (const item of arr) {
    const isContentPart =
      isRecord(item) && (item.type === "text" || item.type === "image");
    if (!isContentPart) {
      return false;
    }
  }
  return true;
}
function flattenToolResponse(result: unknown): ToolReturnContent {
if (result === null || result === undefined) {
return "";
}
@@ -645,6 +664,11 @@ function flattenToolResponse(result: unknown): string {
return result.message;
}
// Check for multimodal content (images) - return as-is without flattening
if (Array.isArray(result.content) && isMultimodalContent(result.content)) {
return result.content;
}
if (typeof result.content === "string") {
return result.content;
}
@@ -770,12 +794,16 @@ export async function executeTool(
// Flatten the response to plain text
const flattenedResponse = flattenToolResponse(result);
// Track tool usage
// Track tool usage (calculate size for multimodal content)
const responseSize =
typeof flattenedResponse === "string"
? flattenedResponse.length
: JSON.stringify(flattenedResponse).length;
telemetry.trackToolUsage(
internalName,
toolStatus === "success",
duration,
flattenedResponse.length,
responseSize,
toolStatus === "error" ? "tool_error" : undefined,
stderr ? stderr.join("\n") : undefined,
);