diff --git a/package.json b/package.json index d869753..3c62b28 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,7 @@ "access": "public" }, "dependencies": { - "@letta-ai/letta-client": "^1.6.8", + "@letta-ai/letta-client": "^1.7.2", "glob": "^13.0.0", "ink-link": "^5.0.0", "open": "^10.2.0", @@ -43,6 +43,7 @@ "@types/bun": "latest", "@types/diff": "^8.0.0", "@types/picomatch": "^4.0.2", + "@types/react": "^19.2.9", "diff": "^8.0.2", "husky": "9.1.7", "ink": "^5.0.0", diff --git a/src/agent/approval-execution.ts b/src/agent/approval-execution.ts index ea132b8..907e70c 100644 --- a/src/agent/approval-execution.ts +++ b/src/agent/approval-execution.ts @@ -3,12 +3,32 @@ import * as path from "node:path"; import type { ApprovalReturn, + TextContent, ToolReturn, } from "@letta-ai/letta-client/resources/agents/messages"; import type { ToolReturnMessage } from "@letta-ai/letta-client/resources/tools"; import type { ApprovalRequest } from "../cli/helpers/stream"; import { INTERRUPTED_BY_USER } from "../constants"; -import { executeTool, type ToolExecutionResult } from "../tools/manager"; +import { + executeTool, + type ToolExecutionResult, + type ToolReturnContent, +} from "../tools/manager"; + +/** + * Extract displayable text from tool return content (for UI display). + * Multimodal content returns the text parts concatenated. + */ +export function getDisplayableToolReturn(content: ToolReturnContent): string { + if (typeof content === "string") { + return content; + } + // Extract text from multimodal content + return content + .filter((part): part is TextContent => part.type === "text") + .map((part) => part.text) + .join("\n"); +} /** * Tools that are safe to execute in parallel (read-only or independent). 
@@ -235,13 +255,14 @@ async function executeSingleDecision( ); // Update UI if callback provided (interactive mode) + // Note: UI display uses text-only version, backend gets full multimodal content if (onChunk) { onChunk({ message_type: "tool_return_message", id: "dummy", date: new Date().toISOString(), tool_call_id: decision.approval.toolCallId, - tool_return: toolResult.toolReturn, + tool_return: getDisplayableToolReturn(toolResult.toolReturn), status: toolResult.status, stdout: toolResult.stdout, stderr: toolResult.stderr, @@ -251,7 +272,7 @@ async function executeSingleDecision( return { type: "tool", tool_call_id: decision.approval.toolCallId, - tool_return: toolResult.toolReturn, + tool_return: toolResult.toolReturn, // Full multimodal content for backend status: toolResult.status, stdout: toolResult.stdout, stderr: toolResult.stderr, diff --git a/src/cli/App.tsx b/src/cli/App.tsx index f5e728e..5c1e792 100644 --- a/src/cli/App.tsx +++ b/src/cli/App.tsx @@ -26,6 +26,7 @@ import { import { type ApprovalResult, executeAutoAllowedTools, + getDisplayableToolReturn, } from "../agent/approval-execution"; import { buildApprovalRecoveryMessage, @@ -7333,7 +7334,7 @@ DO NOT respond to these messages or otherwise consider them in your response unl id: "dummy", date: new Date().toISOString(), tool_call_id: approval.toolCallId, - tool_return: toolResult.toolReturn, + tool_return: getDisplayableToolReturn(toolResult.toolReturn), status: toolResult.status, stdout: toolResult.stdout, stderr: toolResult.stderr, diff --git a/src/cli/components/ToolCallMessageRich.tsx b/src/cli/components/ToolCallMessageRich.tsx index 7bf6d47..acdf6db 100644 --- a/src/cli/components/ToolCallMessageRich.tsx +++ b/src/cli/components/ToolCallMessageRich.tsx @@ -594,12 +594,30 @@ export const ToolCallMessage = memo( } } - // Check if this is a file read tool - show line count summary + // Check if this is a file read tool - show line count or image summary if ( isFileReadTool(rawName) && 
line.resultOk !== false && line.resultText ) { + // Check if this is an image result (starts with "[Image: filename]") + const isImageResult = line.resultText.startsWith("[Image: "); + + if (isImageResult) { + return ( + + + {prefix} + + + + Read 1 image + + + + ); + } + // Count lines in the result (the content returned by Read tool) + const lineCount = line.resultText.split("\n").length; return ( @@ -609,7 +627,8 @@ export const ToolCallMessage = memo( - Read {lineCount} lines + Read {lineCount} line + {lineCount !== 1 ? "s" : ""} diff --git a/src/cli/helpers/backfill.ts b/src/cli/helpers/backfill.ts index 79c2110..e1f0d6f 100644 --- a/src/cli/helpers/backfill.ts +++ b/src/cli/helpers/backfill.ts @@ -1,10 +1,30 @@ import type { + ImageContent, LettaAssistantMessageContentUnion, LettaUserMessageContentUnion, Message, + TextContent, } from "@letta-ai/letta-client/resources/agents/messages"; import type { Buffers } from "./accumulator"; +/** + * Extract displayable text from tool return content. + * Multimodal content returns the text parts concatenated. + */ +function getDisplayableToolReturn( + content: string | Array<TextContent | ImageContent> | undefined, +): string { + if (!content) return ""; + if (typeof content === "string") { + return content; + } + // Extract text from multimodal content + return content + .filter((part): part is TextContent => part.type === "text") + .map((part) => part.text) + .join("\n"); +} + // const PASTE_LINE_THRESHOLD = 5; // const PASTE_CHAR_THRESHOLD = 500; const CLIP_CHAR_LIMIT_TEXT = 500; @@ -238,7 +258,8 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void { // Update the existing line with the result // Handle both func_response (streaming) and tool_return (SDK) properties - const resultText = + // tool_return can be multimodal (string or array of content parts) + const rawResult = ("func_response" in toolReturn ?
toolReturn.func_response : undefined) || @@ -246,6 +267,7 @@ export function backfillBuffers(buffers: Buffers, history: Message[]): void { ? toolReturn.tool_return : undefined) || ""; + const resultText = getDisplayableToolReturn(rawResult); buffers.byId.set(toolCallLineId, { ...existingLine, resultText, diff --git a/src/tools/descriptions/Read.md b/src/tools/descriptions/Read.md index d436428..17e1d7f 100644 --- a/src/tools/descriptions/Read.md +++ b/src/tools/descriptions/Read.md @@ -9,6 +9,8 @@ Usage: - You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters - Any lines longer than 2000 characters will be truncated - Results are returned using cat -n format, with line numbers starting at 1 +- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file the contents are presented visually as Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits. +- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths. - This tool can only read files, not directories. To read a directory, use the ls command via Bash. - You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel. - If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents. 
diff --git a/src/tools/impl/Read.ts b/src/tools/impl/Read.ts index 72ffcf1..a14ca4e 100644 --- a/src/tools/impl/Read.ts +++ b/src/tools/impl/Read.ts @@ -1,16 +1,93 @@ import { promises as fs } from "node:fs"; import * as path from "node:path"; +import type { + ImageContent, + TextContent, +} from "@letta-ai/letta-client/resources/agents/messages"; +import { LETTA_CLOUD_API_URL } from "../../auth/oauth.js"; +import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js"; +import { settingsManager } from "../../settings-manager.js"; import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js"; import { LIMITS } from "./truncation.js"; import { validateRequiredParams } from "./validation.js"; +/** + * Check if the server supports images in tool responses. + * Currently only api.letta.com supports this feature. + */ +function serverSupportsImageToolReturns(): boolean { + const settings = settingsManager.getSettings(); + const baseURL = + process.env.LETTA_BASE_URL || + settings.env?.LETTA_BASE_URL || + LETTA_CLOUD_API_URL; + return baseURL === LETTA_CLOUD_API_URL; +} + interface ReadArgs { file_path: string; offset?: number; limit?: number; } + +// Tool return content types - either a string or array of content parts +export type ToolReturnContent = string | Array<TextContent | ImageContent>; + interface ReadResult { - content: string; + content: ToolReturnContent; +} + +// Supported image extensions +const IMAGE_EXTENSIONS = new Set([ + ".png", + ".jpg", + ".jpeg", + ".gif", + ".webp", + ".bmp", +]); + +function isImageFile(filePath: string): boolean { + const ext = path.extname(filePath).toLowerCase(); + return IMAGE_EXTENSIONS.has(ext); +} + +function getMediaType(ext: string): string { + const types: Record<string, string> = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/png", // Convert BMP to PNG + }; + return types[ext] || "image/png"; +} + +async function readImageFile( + filePath: string, +): 
Promise<Array<TextContent | ImageContent>> { + const buffer = await fs.readFile(filePath); + const ext = path.extname(filePath).toLowerCase(); + const mediaType = getMediaType(ext); + + // Use shared image resize utility + const result = await resizeImageIfNeeded(buffer, mediaType); + + return [ + { + type: "text", + text: `[Image: ${path.basename(filePath)}${result.resized ? " (resized to fit API limits)" : ""}]`, + }, + { + type: "image", + source: { + type: "base64", + media_type: result.mediaType, + data: result.data, + }, + }, + ]; } async function isBinaryFile(filePath: string): Promise<boolean> { @@ -140,6 +217,28 @@ export async function read(args: ReadArgs): Promise<ReadResult> { const stats = await fs.stat(resolvedPath); if (stats.isDirectory()) throw new Error(`Path is a directory, not a file: ${resolvedPath}`); + + // Check if this is an image file + if (isImageFile(resolvedPath)) { + // Check if server supports images in tool responses + if (!serverSupportsImageToolReturns()) { + throw new Error( + `This server does not support images in tool responses.`, + ); + } + + // Images have a higher size limit (20MB raw, will be resized if needed) + const maxImageSize = 20 * 1024 * 1024; + if (stats.size > maxImageSize) { + throw new Error( + `Image file too large: ${stats.size} bytes (max ${maxImageSize} bytes)`, + ); + } + const imageContent = await readImageFile(resolvedPath); + return { content: imageContent }; + } + + // Regular text file handling const maxSize = 10 * 1024 * 1024; // 10MB if (stats.size > maxSize) throw new Error( diff --git a/src/tools/impl/ReadFileGemini.ts b/src/tools/impl/ReadFileGemini.ts index 7b84759..4eac9aa 100644 --- a/src/tools/impl/ReadFileGemini.ts +++ b/src/tools/impl/ReadFileGemini.ts @@ -3,7 +3,8 @@ * Uses Gemini's exact schema and description */ -import { read } from "./Read"; +import type { TextContent } from "@letta-ai/letta-client/resources/agents/messages"; +import { read, type ToolReturnContent } from "./Read"; interface ReadFileGeminiArgs { file_path: string; @@ -11,6 
+12,20 @@ interface ReadFileGeminiArgs { limit?: number; } +/** + * Extract text from tool return content (for Gemini wrapper) + */ +function extractText(content: ToolReturnContent): string { + if (typeof content === "string") { + return content; + } + // Extract text from multimodal content (Gemini doesn't support images via this tool) + return content + .filter((part): part is TextContent => part.type === "text") + .map((part) => part.text) + .join("\n"); +} + export async function read_file_gemini( args: ReadFileGeminiArgs, ): Promise<{ message: string }> { @@ -24,6 +39,6 @@ export async function read_file_gemini( const result = await read(lettaArgs); - // Read returns { content: string } - return { message: result.content }; + // Read returns { content: ToolReturnContent } - extract text for Gemini + return { message: extractText(result.content) }; } diff --git a/src/tools/impl/ReadLSP.ts b/src/tools/impl/ReadLSP.ts index 689ef7e..cd603a5 100644 --- a/src/tools/impl/ReadLSP.ts +++ b/src/tools/impl/ReadLSP.ts @@ -2,7 +2,7 @@ * LSP-enhanced Read tool - wraps the base Read tool and adds LSP diagnostics * This is used when LETTA_ENABLE_LSP is set */ -import { read as baseRead } from "./Read.js"; +import { read as baseRead, type ToolReturnContent } from "./Read.js"; // Format a single diagnostic in opencode style: "ERROR [line:col] message" function formatDiagnostic(diag: { @@ -30,7 +30,7 @@ interface ReadLSPArgs { } interface ReadLSPResult { - content: string; + content: ToolReturnContent; } export async function read_lsp(args: ReadLSPArgs): Promise<ReadLSPResult> { @@ -42,6 +42,11 @@ return result; } + // If content is multimodal (image), skip LSP processing - only applies to text files + if (typeof result.content !== "string") { + return result; + } + // Determine if we should include diagnostics const lineCount = result.content.split("\n").length; const shouldInclude = diff --git a/src/tools/manager.ts 
b/src/tools/manager.ts index 1229e0d..99df4fc 100644 --- a/src/tools/manager.ts +++ b/src/tools/manager.ts @@ -210,8 +210,16 @@ interface ToolDefinition { fn: (args: ToolArgs) => Promise; } +import type { + ImageContent, + TextContent, +} from "@letta-ai/letta-client/resources/agents/messages"; + +// Tool return content can be a string or array of text/image content parts +export type ToolReturnContent = string | Array<TextContent | ImageContent>; + export type ToolExecutionResult = { - toolReturn: string; + toolReturn: ToolReturnContent; status: "success" | "error"; stdout?: string[]; stderr?: string[]; @@ -628,7 +636,18 @@ function isStringArray(value: unknown): value is string[] { ); } -function flattenToolResponse(result: unknown): string { +/** + * Check if an array contains multimodal content (text + images) + */ +function isMultimodalContent( + arr: unknown[], +): arr is Array<TextContent | ImageContent> { + return arr.every( + (item) => isRecord(item) && (item.type === "text" || item.type === "image"), + ); +} + +function flattenToolResponse(result: unknown): ToolReturnContent { if (result === null || result === undefined) { return ""; } @@ -645,6 +664,11 @@ return result.message; } + + // Check for multimodal content (images) - return as-is without flattening + if (Array.isArray(result.content) && isMultimodalContent(result.content)) { + return result.content; + } + if (typeof result.content === "string") { return result.content; } @@ -770,12 +794,16 @@ export async function executeTool( // Flatten the response to plain text const flattenedResponse = flattenToolResponse(result); - // Track tool usage + // Track tool usage (calculate size for multimodal content) + const responseSize = + typeof flattenedResponse === "string" + ? flattenedResponse.length + : JSON.stringify(flattenedResponse).length; telemetry.trackToolUsage( internalName, toolStatus === "success", duration, - flattenedResponse.length, + responseSize, toolStatus === "error" ? 
"tool_error" : undefined, stderr ? stderr.join("\n") : undefined, );