feat: add image reading support to Read tool (#614)
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -30,7 +30,7 @@
|
||||
"access": "public"
|
||||
},
|
||||
"dependencies": {
|
||||
"@letta-ai/letta-client": "^1.6.8",
|
||||
"@letta-ai/letta-client": "^1.7.2",
|
||||
"glob": "^13.0.0",
|
||||
"ink-link": "^5.0.0",
|
||||
"open": "^10.2.0",
|
||||
@@ -43,6 +43,7 @@
|
||||
"@types/bun": "latest",
|
||||
"@types/diff": "^8.0.0",
|
||||
"@types/picomatch": "^4.0.2",
|
||||
"@types/react": "^19.2.9",
|
||||
"diff": "^8.0.2",
|
||||
"husky": "9.1.7",
|
||||
"ink": "^5.0.0",
|
||||
|
||||
@@ -3,12 +3,32 @@
|
||||
import * as path from "node:path";
|
||||
import type {
|
||||
ApprovalReturn,
|
||||
TextContent,
|
||||
ToolReturn,
|
||||
} from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import type { ToolReturnMessage } from "@letta-ai/letta-client/resources/tools";
|
||||
import type { ApprovalRequest } from "../cli/helpers/stream";
|
||||
import { INTERRUPTED_BY_USER } from "../constants";
|
||||
import { executeTool, type ToolExecutionResult } from "../tools/manager";
|
||||
import {
|
||||
executeTool,
|
||||
type ToolExecutionResult,
|
||||
type ToolReturnContent,
|
||||
} from "../tools/manager";
|
||||
|
||||
/**
 * Extract displayable text from tool return content (for UI display).
 *
 * @param content - Either a plain string or an array of content parts.
 * @returns The string unchanged, or, for an array, the `text` of every part
 *   whose `type` is `"text"`, joined with newlines. Parts of any other type
 *   are dropped, so an array without text parts yields an empty string.
 */
export function getDisplayableToolReturn(content: ToolReturnContent): string {
  if (typeof content === "string") {
    return content;
  }
  // Multimodal content: only the text parts can be shown as text.
  return content
    .filter((part): part is TextContent => part.type === "text")
    .map((part) => part.text)
    .join("\n");
}
|
||||
|
||||
/**
|
||||
* Tools that are safe to execute in parallel (read-only or independent).
|
||||
@@ -235,13 +255,14 @@ async function executeSingleDecision(
|
||||
);
|
||||
|
||||
// Update UI if callback provided (interactive mode)
|
||||
// Note: UI display uses text-only version, backend gets full multimodal content
|
||||
if (onChunk) {
|
||||
onChunk({
|
||||
message_type: "tool_return_message",
|
||||
id: "dummy",
|
||||
date: new Date().toISOString(),
|
||||
tool_call_id: decision.approval.toolCallId,
|
||||
tool_return: toolResult.toolReturn,
|
||||
tool_return: getDisplayableToolReturn(toolResult.toolReturn),
|
||||
status: toolResult.status,
|
||||
stdout: toolResult.stdout,
|
||||
stderr: toolResult.stderr,
|
||||
@@ -251,7 +272,7 @@ async function executeSingleDecision(
|
||||
return {
|
||||
type: "tool",
|
||||
tool_call_id: decision.approval.toolCallId,
|
||||
tool_return: toolResult.toolReturn,
|
||||
tool_return: toolResult.toolReturn, // Full multimodal content for backend
|
||||
status: toolResult.status,
|
||||
stdout: toolResult.stdout,
|
||||
stderr: toolResult.stderr,
|
||||
|
||||
@@ -26,6 +26,7 @@ import {
|
||||
import {
|
||||
type ApprovalResult,
|
||||
executeAutoAllowedTools,
|
||||
getDisplayableToolReturn,
|
||||
} from "../agent/approval-execution";
|
||||
import {
|
||||
buildApprovalRecoveryMessage,
|
||||
@@ -7333,7 +7334,7 @@ DO NOT respond to these messages or otherwise consider them in your response unl
|
||||
id: "dummy",
|
||||
date: new Date().toISOString(),
|
||||
tool_call_id: approval.toolCallId,
|
||||
tool_return: toolResult.toolReturn,
|
||||
tool_return: getDisplayableToolReturn(toolResult.toolReturn),
|
||||
status: toolResult.status,
|
||||
stdout: toolResult.stdout,
|
||||
stderr: toolResult.stderr,
|
||||
|
||||
@@ -594,12 +594,30 @@ export const ToolCallMessage = memo(
|
||||
}
|
||||
}
|
||||
|
||||
// Check if this is a file read tool - show line count summary
|
||||
// Check if this is a file read tool - show line count or image summary
|
||||
if (
|
||||
isFileReadTool(rawName) &&
|
||||
line.resultOk !== false &&
|
||||
line.resultText
|
||||
) {
|
||||
// Check if this is an image result (starts with "[Image: filename]")
|
||||
const isImageResult = line.resultText.startsWith("[Image: ");
|
||||
|
||||
if (isImageResult) {
|
||||
return (
|
||||
<Box flexDirection="row">
|
||||
<Box width={prefixWidth} flexShrink={0}>
|
||||
<Text>{prefix}</Text>
|
||||
</Box>
|
||||
<Box flexGrow={1} width={contentWidth}>
|
||||
<Text>
|
||||
Read <Text bold>1</Text> image
|
||||
</Text>
|
||||
</Box>
|
||||
</Box>
|
||||
);
|
||||
}
|
||||
|
||||
// Count lines in the result (the content returned by Read tool)
|
||||
const lineCount = line.resultText.split("\n").length;
|
||||
return (
|
||||
@@ -609,7 +627,8 @@ export const ToolCallMessage = memo(
|
||||
</Box>
|
||||
<Box flexGrow={1} width={contentWidth}>
|
||||
<Text>
|
||||
Read <Text bold>{lineCount}</Text> lines
|
||||
Read <Text bold>{lineCount}</Text> line
|
||||
{lineCount !== 1 ? "s" : ""}
|
||||
</Text>
|
||||
</Box>
|
||||
</Box>
|
||||
|
||||
@@ -1,10 +1,30 @@
|
||||
import type {
|
||||
ImageContent,
|
||||
LettaAssistantMessageContentUnion,
|
||||
LettaUserMessageContentUnion,
|
||||
Message,
|
||||
TextContent,
|
||||
} from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import type { Buffers } from "./accumulator";
|
||||
|
||||
/**
 * Extract displayable text from tool return content.
 *
 * @param content - A plain string, an array of text/image content parts, or
 *   `undefined`.
 * @returns `""` for any falsy input (`undefined` or the empty string), the
 *   string unchanged when a non-empty string is given, or, for an array, the
 *   `text` of every `"text"` part joined with newlines (image parts are
 *   dropped).
 */
function getDisplayableToolReturn(
  content: string | Array<TextContent | ImageContent> | undefined,
): string {
  if (!content) return "";
  if (typeof content === "string") {
    return content;
  }
  // Multimodal content: only the text parts can be shown as text.
  return content
    .filter((part): part is TextContent => part.type === "text")
    .map((part) => part.text)
    .join("\n");
}
|
||||
|
||||
// const PASTE_LINE_THRESHOLD = 5;
|
||||
// const PASTE_CHAR_THRESHOLD = 500;
|
||||
const CLIP_CHAR_LIMIT_TEXT = 500;
|
||||
@@ -238,7 +258,8 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void {
|
||||
|
||||
// Update the existing line with the result
|
||||
// Handle both func_response (streaming) and tool_return (SDK) properties
|
||||
const resultText =
|
||||
// tool_return can be multimodal (string or array of content parts)
|
||||
const rawResult =
|
||||
("func_response" in toolReturn
|
||||
? toolReturn.func_response
|
||||
: undefined) ||
|
||||
@@ -246,6 +267,7 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void {
|
||||
? toolReturn.tool_return
|
||||
: undefined) ||
|
||||
"";
|
||||
const resultText = getDisplayableToolReturn(rawResult);
|
||||
buffers.byId.set(toolCallLineId, {
|
||||
...existingLine,
|
||||
resultText,
|
||||
|
||||
@@ -9,6 +9,8 @@ Usage:
|
||||
- You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
|
||||
- Any lines longer than 2000 characters will be truncated
|
||||
- Results are returned using cat -n format, with line numbers starting at 1
|
||||
- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file the contents are presented visually as Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits.
|
||||
- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths.
|
||||
- This tool can only read files, not directories. To read a directory, use the ls command via Bash.
|
||||
- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.
|
||||
- If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.
|
||||
|
||||
@@ -1,16 +1,93 @@
|
||||
import { promises as fs } from "node:fs";
|
||||
import * as path from "node:path";
|
||||
import type {
|
||||
ImageContent,
|
||||
TextContent,
|
||||
} from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import { LETTA_CLOUD_API_URL } from "../../auth/oauth.js";
|
||||
import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js";
|
||||
import { settingsManager } from "../../settings-manager.js";
|
||||
import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js";
|
||||
import { LIMITS } from "./truncation.js";
|
||||
import { validateRequiredParams } from "./validation.js";
|
||||
|
||||
/**
 * Check if the server supports images in tool responses.
 * Currently only api.letta.com supports this feature.
 *
 * The base URL is taken from the first non-empty value of
 * `process.env.LETTA_BASE_URL`, the `LETTA_BASE_URL` entry of the settings
 * `env` map, and finally `LETTA_CLOUD_API_URL` as the default.
 *
 * @returns `true` when the resolved base URL is the cloud API URL. Trailing
 *   slashes are ignored on both sides so that `https://host/` and
 *   `https://host` are treated as the same server.
 */
function serverSupportsImageToolReturns(): boolean {
  const settings = settingsManager.getSettings();
  // `||` (not `??`) on purpose: an empty-string override falls through.
  const baseURL =
    process.env.LETTA_BASE_URL ||
    settings.env?.LETTA_BASE_URL ||
    LETTA_CLOUD_API_URL;
  const stripTrailingSlashes = (url: string): string =>
    url.replace(/\/+$/, "");
  return (
    stripTrailingSlashes(baseURL) === stripTrailingSlashes(LETTA_CLOUD_API_URL)
  );
}
|
||||
|
||||
/** Arguments accepted by the Read tool. */
interface ReadArgs {
  /** Path of the file to read. */
  file_path: string;
  /** Optional line offset (the tool's usage notes describe it as a line offset for long files). */
  offset?: number;
  /** Optional limit that accompanies `offset` (see the tool's usage notes). */
  limit?: number;
}
|
||||
|
||||
// Tool return content types - either a string or array of content parts
export type ToolReturnContent = string | Array<TextContent | ImageContent>;

/**
 * Result of the Read tool.
 *
 * `content` is a single property typed as `ToolReturnContent`: a string, or
 * an array of text/image parts when an image file was read.
 */
interface ReadResult {
  content: ToolReturnContent;
}
|
||||
|
||||
// Supported image extensions (lower-case, including the leading dot)
const IMAGE_EXTENSIONS = new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".bmp",
]);

/**
 * Report whether a path names a supported image file.
 *
 * The decision is based solely on the file extension, compared
 * case-insensitively against `IMAGE_EXTENSIONS`; the file is not opened.
 *
 * @param filePath - Path whose extension is inspected.
 * @returns `true` when the lower-cased extension is in `IMAGE_EXTENSIONS`.
 */
function isImageFile(filePath: string): boolean {
  const ext = path.extname(filePath).toLowerCase();
  return IMAGE_EXTENSIONS.has(ext);
}
|
||||
|
||||
/**
 * Map a file extension to the media type reported for the image.
 *
 * @param ext - Extension including the leading dot (for example `".jpg"`).
 *   Matching is case-insensitive.
 * @returns The mapped media type, or `"image/png"` for any extension that is
 *   not in the table.
 */
function getMediaType(ext: string): string {
  // A Map (rather than an object literal) so that lookups can never resolve
  // to an inherited Object.prototype member.
  const types = new Map<string, string>([
    [".png", "image/png"],
    [".jpg", "image/jpeg"],
    [".jpeg", "image/jpeg"],
    [".gif", "image/gif"],
    [".webp", "image/webp"],
    [".bmp", "image/png"], // Convert BMP to PNG
  ]);
  return types.get(ext.toLowerCase()) ?? "image/png";
}
|
||||
|
||||
/**
 * Read an image file and package it as multimodal tool-return content.
 *
 * The file is read fully into memory, its media type is derived from the
 * extension, and the bytes are passed to `resizeImageIfNeeded`.
 *
 * @param filePath - Path of the image file to read.
 * @returns A two-element array: a text part labelling the image with its
 *   base name (and noting when `resizeImageIfNeeded` reports a resize),
 *   followed by a base64 image part built from the resize result.
 * @throws Whatever `fs.readFile` or `resizeImageIfNeeded` rejects with.
 */
async function readImageFile(
  filePath: string,
): Promise<Array<TextContent | ImageContent>> {
  const buffer = await fs.readFile(filePath);
  const ext = path.extname(filePath).toLowerCase();
  const mediaType = getMediaType(ext);

  // Use shared image resize utility
  const result = await resizeImageIfNeeded(buffer, mediaType);

  return [
    {
      type: "text",
      // The "[Image: " prefix matters: the tool-call UI checks for it to
      // recognise an image result.
      text: `[Image: ${path.basename(filePath)}${result.resized ? " (resized to fit API limits)" : ""}]`,
    },
    {
      type: "image",
      source: {
        type: "base64",
        // Take the media type from the resize result, not the extension.
        media_type: result.mediaType,
        data: result.data,
      },
    },
  ];
}
|
||||
|
||||
async function isBinaryFile(filePath: string): Promise<boolean> {
|
||||
@@ -140,6 +217,28 @@ export async function read(args: ReadArgs): Promise<ReadResult> {
|
||||
const stats = await fs.stat(resolvedPath);
|
||||
if (stats.isDirectory())
|
||||
throw new Error(`Path is a directory, not a file: ${resolvedPath}`);
|
||||
|
||||
// Check if this is an image file
|
||||
if (isImageFile(resolvedPath)) {
|
||||
// Check if server supports images in tool responses
|
||||
if (!serverSupportsImageToolReturns()) {
|
||||
throw new Error(
|
||||
`This server does not support images in tool responses.`,
|
||||
);
|
||||
}
|
||||
|
||||
// Images have a higher size limit (20MB raw, will be resized if needed)
|
||||
const maxImageSize = 20 * 1024 * 1024;
|
||||
if (stats.size > maxImageSize) {
|
||||
throw new Error(
|
||||
`Image file too large: ${stats.size} bytes (max ${maxImageSize} bytes)`,
|
||||
);
|
||||
}
|
||||
const imageContent = await readImageFile(resolvedPath);
|
||||
return { content: imageContent };
|
||||
}
|
||||
|
||||
// Regular text file handling
|
||||
const maxSize = 10 * 1024 * 1024; // 10MB
|
||||
if (stats.size > maxSize)
|
||||
throw new Error(
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
* Uses Gemini's exact schema and description
|
||||
*/
|
||||
|
||||
import { read } from "./Read";
|
||||
import type { TextContent } from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import { read, type ToolReturnContent } from "./Read";
|
||||
|
||||
interface ReadFileGeminiArgs {
|
||||
file_path: string;
|
||||
@@ -11,6 +12,20 @@ interface ReadFileGeminiArgs {
|
||||
limit?: number;
|
||||
}
|
||||
|
||||
/**
 * Extract text from tool return content (for Gemini wrapper).
 *
 * @param content - Either a plain string or an array of content parts.
 * @returns The string unchanged, or, for an array, the `text` of every
 *   `"text"` part joined with newlines; non-text parts are dropped.
 */
function extractText(content: ToolReturnContent): string {
  if (typeof content === "string") {
    return content;
  }
  // Extract text from multimodal content (Gemini doesn't support images via this tool)
  return content
    .filter((part): part is TextContent => part.type === "text")
    .map((part) => part.text)
    .join("\n");
}
|
||||
|
||||
export async function read_file_gemini(
|
||||
args: ReadFileGeminiArgs,
|
||||
): Promise<{ message: string }> {
|
||||
@@ -24,6 +39,6 @@ export async function read_file_gemini(
|
||||
|
||||
const result = await read(lettaArgs);
|
||||
|
||||
// Read returns { content: string }
|
||||
return { message: result.content };
|
||||
// Read returns { content: ToolReturnContent } - extract text for Gemini
|
||||
return { message: extractText(result.content) };
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
* LSP-enhanced Read tool - wraps the base Read tool and adds LSP diagnostics
|
||||
* This is used when LETTA_ENABLE_LSP is set
|
||||
*/
|
||||
import { read as baseRead } from "./Read.js";
|
||||
import { read as baseRead, type ToolReturnContent } from "./Read.js";
|
||||
|
||||
// Format a single diagnostic in opencode style: "ERROR [line:col] message"
|
||||
function formatDiagnostic(diag: {
|
||||
@@ -30,7 +30,7 @@ interface ReadLSPArgs {
|
||||
}
|
||||
|
||||
/**
 * Result of the LSP-enhanced Read tool.
 *
 * `content` is a single property typed as `ToolReturnContent`: a string, or
 * an array of content parts when the base Read tool returned multimodal
 * content.
 */
interface ReadLSPResult {
  content: ToolReturnContent;
}
|
||||
|
||||
export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> {
|
||||
@@ -42,6 +42,11 @@ export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> {
|
||||
return result;
|
||||
}
|
||||
|
||||
// If content is multimodal (image), skip LSP processing - only applies to text files
|
||||
if (typeof result.content !== "string") {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Determine if we should include diagnostics
|
||||
const lineCount = result.content.split("\n").length;
|
||||
const shouldInclude =
|
||||
|
||||
@@ -210,8 +210,16 @@ interface ToolDefinition {
|
||||
fn: (args: ToolArgs) => Promise<unknown>;
|
||||
}
|
||||
|
||||
import type {
|
||||
ImageContent,
|
||||
TextContent,
|
||||
} from "@letta-ai/letta-client/resources/agents/messages";
|
||||
|
||||
// Tool return content can be a string or array of text/image content parts
export type ToolReturnContent = string | Array<TextContent | ImageContent>;
|
||||
|
||||
export type ToolExecutionResult = {
|
||||
toolReturn: string;
|
||||
toolReturn: ToolReturnContent;
|
||||
status: "success" | "error";
|
||||
stdout?: string[];
|
||||
stderr?: string[];
|
||||
@@ -628,7 +636,18 @@ function isStringArray(value: unknown): value is string[] {
|
||||
);
|
||||
}
|
||||
|
||||
function flattenToolResponse(result: unknown): string {
|
||||
/**
 * Check if an array contains multimodal content (text + images).
 *
 * @param arr - Array of unknown values to inspect.
 * @returns `true` when every element passes `isRecord` and has a `type` of
 *   `"text"` or `"image"`. Only the `type` tag is checked; other fields of
 *   each part are not validated.
 *
 * NOTE(review): `every` is vacuously true for an empty array, so `[]` is
 * accepted as multimodal content — confirm callers expect that.
 */
function isMultimodalContent(
  arr: unknown[],
): arr is Array<TextContent | ImageContent> {
  return arr.every(
    (item) => isRecord(item) && (item.type === "text" || item.type === "image"),
  );
}
|
||||
|
||||
function flattenToolResponse(result: unknown): ToolReturnContent {
|
||||
if (result === null || result === undefined) {
|
||||
return "";
|
||||
}
|
||||
@@ -645,6 +664,11 @@ function flattenToolResponse(result: unknown): string {
|
||||
return result.message;
|
||||
}
|
||||
|
||||
// Check for multimodal content (images) - return as-is without flattening
|
||||
if (Array.isArray(result.content) && isMultimodalContent(result.content)) {
|
||||
return result.content;
|
||||
}
|
||||
|
||||
if (typeof result.content === "string") {
|
||||
return result.content;
|
||||
}
|
||||
@@ -770,12 +794,16 @@ export async function executeTool(
|
||||
// Flatten the response to plain text
|
||||
const flattenedResponse = flattenToolResponse(result);
|
||||
|
||||
// Track tool usage
|
||||
// Track tool usage (calculate size for multimodal content)
|
||||
const responseSize =
|
||||
typeof flattenedResponse === "string"
|
||||
? flattenedResponse.length
|
||||
: JSON.stringify(flattenedResponse).length;
|
||||
telemetry.trackToolUsage(
|
||||
internalName,
|
||||
toolStatus === "success",
|
||||
duration,
|
||||
flattenedResponse.length,
|
||||
responseSize,
|
||||
toolStatus === "error" ? "tool_error" : undefined,
|
||||
stderr ? stderr.join("\n") : undefined,
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user