diff --git a/src/cli/App.tsx b/src/cli/App.tsx index 64b9088..e98c0ff 100644 --- a/src/cli/App.tsx +++ b/src/cli/App.tsx @@ -158,6 +158,10 @@ import { subscribe as subscribeToSubagents, } from "./helpers/subagentState"; import { getRandomThinkingVerb } from "./helpers/thinkingMessages"; +import { + clearQueuedToolImages, + getAndClearQueuedToolImages, +} from "./helpers/toolImageRegistry"; import { isFileEditTool, isFileWriteTool, @@ -3239,6 +3243,9 @@ export default function App({ // Lock input for async operation (set before any await to prevent queue processing) setCommandRunning(true); + // Clear any queued tool images from the previous agent context + clearQueuedToolImages(); + const inputCmd = "/agents"; const cmdId = uid("cmd"); @@ -3717,9 +3724,44 @@ export default function App({ // Send all results to server if any if (allResults.length > 0) { toolResultsInFlightRef.current = true; - await processConversation([ + + // Check for queued tool images (from Read tool reading image files) + const toolImages = getAndClearQueuedToolImages(); + const input: Array = [ { type: "approval", approvals: allResults }, - ]); + ]; + + // If there are queued images, add them as a user message + if (toolImages.length > 0) { + const imageContentParts: Array< + | { type: "text"; text: string } + | { + type: "image"; + source: { type: "base64"; media_type: string; data: string }; + } + > = []; + for (const img of toolImages) { + imageContentParts.push({ + type: "text", + text: `Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):`, + }); + imageContentParts.push({ + type: "image", + source: { + type: "base64", + media_type: img.mediaType, + data: img.data, + }, + }); + } + input.push({ + type: "message", + role: "user", + content: imageContentParts as unknown as MessageCreate["content"], + }); + } + + await processConversation(input); toolResultsInFlightRef.current = false; } } finally { @@ -4357,6 +4399,9 @@ export default function App({ setCommandRunning(true); + // Clear any queued tool images from the previous conversation + clearQueuedToolImages(); + try { const client = await getClient(); @@ -5577,7 +5622,37 @@ ${gitContext} } // Build message content from display value (handles placeholders for text/images) - const contentParts = buildMessageContentFromDisplay(msg); + let contentParts = buildMessageContentFromDisplay(msg); + + // Prepend any queued tool images (from Read tool reading image files) + const queuedToolImages = getAndClearQueuedToolImages(); + if (queuedToolImages.length > 0) { + const imageParts: Array< + | { type: "text"; text: string } + | { + type: "image"; + source: { type: "base64"; media_type: string; data: string }; + } + > = []; + for (const img of queuedToolImages) { + // Add system reminder text + imageParts.push({ + type: "text", + text: `Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):`, + }); + // Add image content + imageParts.push({ + type: "image", + source: { + type: "base64", + media_type: img.mediaType, + data: img.data, + }, + }); + } + // Prepend to contentParts + contentParts = [...imageParts, ...contentParts]; + } // Prepend plan mode reminder if in plan mode const planModeReminder = getPlanModeReminder(); diff --git a/src/cli/helpers/toolImageRegistry.ts b/src/cli/helpers/toolImageRegistry.ts new file mode 100644 index 0000000..fee46de --- /dev/null +++ b/src/cli/helpers/toolImageRegistry.ts @@ -0,0 +1,47 @@ +// Registry for images read by tools that need to be sent in the next user message turn. +// This is needed because tool returns only support string content - we can't return +// image data directly in tool results to the Letta API. + +export interface QueuedToolImage { + toolCallId: string; + filePath: string; + data: string; // base64 + mediaType: string; + width: number; + height: number; +} + +const queuedImages: QueuedToolImage[] = []; + +/** + * Queue an image to be sent in the next user message. + * Called by the Read tool when reading an image file. + */ +export function queueToolImage(image: QueuedToolImage): void { + queuedImages.push(image); +} + +/** + * Get and clear all queued images. + * Called when building the user message content. + */ +export function getAndClearQueuedToolImages(): QueuedToolImage[] { + const images = [...queuedImages]; + queuedImages.length = 0; + return images; +} + +/** + * Clear all queued images without returning them. + * Called on conversation/agent switch to prevent memory leaks. + */ +export function clearQueuedToolImages(): void { + queuedImages.length = 0; +} + +/** + * Check if there are any queued images. + */ +export function hasQueuedToolImages(): boolean { + return queuedImages.length > 0; +} diff --git a/src/headless.ts b/src/headless.ts index 5961076..c5df126 100644 --- a/src/headless.ts +++ b/src/headless.ts @@ -31,6 +31,7 @@ import { formatErrorDetails } from "./cli/helpers/errorFormatter"; import { safeJsonParseOr } from "./cli/helpers/safeJsonParse"; import { drainStreamWithResume } from "./cli/helpers/stream"; import { StreamProcessor } from "./cli/helpers/streamProcessor"; +import { getAndClearQueuedToolImages } from "./cli/helpers/toolImageRegistry"; import { settingsManager } from "./settings-manager"; import { checkToolPermission } from "./tools/manager"; import type { @@ -934,11 +935,42 @@ export async function handleHeadlessCommand( // Add user prompt messageContent += prompt; + // Build content parts (text + any queued tool images from Read tool) + type ContentPart = + | { type: "text"; text: string } + | { + type: "image"; + source: { type: "base64"; media_type: string; data: string }; + }; + const contentParts: ContentPart[] = []; + + // Check for queued tool images (from Read tool reading image files) + const queuedToolImages = getAndClearQueuedToolImages(); + if (queuedToolImages.length > 0) { + for (const img of queuedToolImages) { + contentParts.push({ + type: "text", + text: `Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):`, + }); + contentParts.push({ + type: "image", + source: { + type: "base64", + media_type: img.mediaType, + data: img.data, + }, + }); + } + } + + // Add the text message content + contentParts.push({ type: "text", text: messageContent }); + // Start with the user message let currentInput: Array = [ { role: "user", - content: [{ type: "text", text: messageContent }], + content: contentParts as unknown as MessageCreate["content"], }, ]; @@ -1241,6 +1273,9 @@ export async function handleHeadlessCommand( ); const executedResults = await executeApprovalBatch(decisions); + // Check for queued tool images (from Read tool reading image files) + const toolImages = getAndClearQueuedToolImages(); + // Send all results in one batch currentInput = [ { @@ -1248,6 +1283,36 @@ export async function handleHeadlessCommand( approvals: executedResults as ApprovalResult[], }, ]; + + // If there are queued images, add them as a user message + if (toolImages.length > 0) { + const imageContentParts: Array< + | { type: "text"; text: string } + | { + type: "image"; + source: { type: "base64"; media_type: string; data: string }; + } + > = []; + for (const img of toolImages) { + imageContentParts.push({ + type: "text", + text: `Image read from ${img.filePath} (Read tool call: ${img.toolCallId}):`, + }); + imageContentParts.push({ + type: "image", + source: { + type: "base64", + media_type: img.mediaType, + data: img.data, + }, + }); + } + currentInput.push({ + role: "user", + content: imageContentParts as unknown as MessageCreate["content"], + }); + } + continue; } diff --git a/src/tools/descriptions/Read.md b/src/tools/descriptions/Read.md index d436428..17e1d7f 100644 --- a/src/tools/descriptions/Read.md +++ b/src/tools/descriptions/Read.md @@ -9,6 +9,8 @@ Usage: - You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters - Any lines longer than 2000 characters will be truncated - Results are returned using cat -n format, with line numbers starting at 1 +- This tool allows Letta Code to read images (PNG, JPG, JPEG, GIF, WEBP, BMP). When reading an image file the contents are presented visually as Letta Code is a multimodal LLM. Large images are automatically resized to fit within API limits. +- You will regularly be asked to read screenshots. If the user provides a path to a screenshot, ALWAYS use this tool to view the file at the path. This tool will work with all temporary file paths. - This tool can only read files, not directories. To read a directory, use the ls command via Bash. - You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel. - If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents. diff --git a/src/tools/impl/Read.ts b/src/tools/impl/Read.ts index 72ffcf1..8c0e9cd 100644 --- a/src/tools/impl/Read.ts +++ b/src/tools/impl/Read.ts @@ -1,9 +1,78 @@ import { promises as fs } from "node:fs"; import * as path from "node:path"; +import { resizeImageIfNeeded } from "../../cli/helpers/imageResize.js"; +import { queueToolImage } from "../../cli/helpers/toolImageRegistry.js"; +import { getToolExecutionContext } from "../toolContext.js"; import { OVERFLOW_CONFIG, writeOverflowFile } from "./overflow.js"; import { LIMITS } from "./truncation.js"; import { validateRequiredParams } from "./validation.js"; +// Supported image extensions (lowercase) +const IMAGE_EXTENSIONS = new Set([ + ".png", + ".jpg", + ".jpeg", + ".gif", + ".webp", + ".bmp", +]); + +/** + * Check if a file path is an image based on extension. + */ +function isImageFile(filePath: string): boolean { + const ext = path.extname(filePath).toLowerCase(); + return IMAGE_EXTENSIONS.has(ext); +} + +/** + * Get MIME type from file extension. + */ +function getMimeType(filePath: string): string { + const ext = path.extname(filePath).toLowerCase(); + const mimeTypes: Record = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", + }; + return mimeTypes[ext] || "image/png"; +} + +/** + * Read an image file, resize if needed, and queue for display. + * Returns a placeholder message - actual image is sent in the next user message. + */ +async function readImageFile(filePath: string): Promise { + const buffer = await fs.readFile(filePath); + const inputMimeType = getMimeType(filePath); + const resized = await resizeImageIfNeeded(buffer, inputMimeType); + + // Get tool call ID from execution context + const context = getToolExecutionContext(); + const toolCallId = context?.toolCallId || "unknown"; + + // Queue for next turn + queueToolImage({ + toolCallId, + filePath, + data: resized.data, + mediaType: resized.mediaType, + width: resized.width, + height: resized.height, + }); + + const resizeNote = resized.resized + ? ` (resized to ${resized.width}x${resized.height})` + : ` (${resized.width}x${resized.height})`; + + return { + content: `[Image: ${filePath}${resizeNote} - queued for display]`, + }; +} + interface ReadArgs { file_path: string; offset?: number; @@ -145,6 +214,12 @@ export async function read(args: ReadArgs): Promise { throw new Error( `File too large: ${stats.size} bytes (max ${maxSize} bytes)`, ); + + // Handle image files specially - read, resize, and queue for display + if (isImageFile(resolvedPath)) { + return await readImageFile(resolvedPath); + } + if (await isBinaryFile(resolvedPath)) throw new Error(`Cannot read binary file: ${resolvedPath}`); const content = await fs.readFile(resolvedPath, "utf-8"); diff --git a/src/tools/manager.ts b/src/tools/manager.ts index 1229e0d..6b6a5b0 100644 --- a/src/tools/manager.ts +++ b/src/tools/manager.ts @@ -2,6 +2,7 @@ import { getModelInfo } from "../agent/model"; import { getAllSubagentConfigs } from "../agent/subagents"; import { INTERRUPTED_BY_USER } from "../constants"; import { telemetry } from "../telemetry"; +import { setToolExecutionContext } from "./toolContext"; import { TOOL_DEFINITIONS, type ToolName } from "./toolDefinitions"; export const TOOL_NAMES = Object.keys(TOOL_DEFINITIONS) as ToolName[]; @@ -754,7 +755,14 @@ export async function executeTool( } } - const result = await tool.fn(enhancedArgs); + // Set execution context for tools that need it (e.g., Read for image queuing) + setToolExecutionContext({ toolCallId: options?.toolCallId }); + let result: unknown; + try { + result = await tool.fn(enhancedArgs); + } finally { + setToolExecutionContext(null); + } const duration = Date.now() - startTime; // Extract stdout/stderr if present (for bash tools) diff --git a/src/tools/toolContext.ts b/src/tools/toolContext.ts new file mode 100644 index 0000000..f0729c4 --- /dev/null +++ b/src/tools/toolContext.ts @@ -0,0 +1,26 @@ +// Tool execution context - allows tools to access execution metadata +// Separate file to avoid circular dependencies with manager.ts + +interface ToolExecutionContext { + toolCallId?: string; +} + +let currentToolContext: ToolExecutionContext | null = null; + +/** + * Get the current tool execution context. + * Called by tools that need access to execution metadata (e.g., Read for image queuing). + */ +export function getToolExecutionContext(): ToolExecutionContext | null { + return currentToolContext; +} + +/** + * Set the current tool execution context. + * Called by manager.ts before executing a tool. + */ +export function setToolExecutionContext( + context: ToolExecutionContext | null, +): void { + currentToolContext = context; +}