From 5f5c0df18e1998683fec9b1a054507fe61b4e544 Mon Sep 17 00:00:00 2001 From: Charles Packer Date: Sat, 17 Jan 2026 16:19:30 -0800 Subject: [PATCH] feat: reduce time-to-boot, remove default eager approval checks on inputs, auto-cancel stale approvals (#579) Co-authored-by: Letta --- scripts/latency-benchmark.ts | 341 +++++++++++++++++++++++ src/agent/approval-recovery.ts | 18 ++ src/agent/check-approval.ts | 3 +- src/agent/create.ts | 28 +- src/agent/skills.ts | 122 ++++++++ src/cli/App.tsx | 160 ++++++++++- src/headless.ts | 97 ++++--- src/index.ts | 66 ++--- src/tests/approval-recovery.test.ts | 249 +++++++++++++++++ src/tests/lazy-approval-recovery.test.ts | 256 +++++++++++++++++ src/tools/impl/Skill.ts | 9 +- src/types/protocol.ts | 14 + src/utils/timing.ts | 106 +++++++ 13 files changed, 1376 insertions(+), 93 deletions(-) create mode 100644 scripts/latency-benchmark.ts create mode 100644 src/tests/approval-recovery.test.ts create mode 100644 src/tests/lazy-approval-recovery.test.ts diff --git a/scripts/latency-benchmark.ts b/scripts/latency-benchmark.ts new file mode 100644 index 0000000..b6aec9f --- /dev/null +++ b/scripts/latency-benchmark.ts @@ -0,0 +1,341 @@ +#!/usr/bin/env bun +/** + * Latency Benchmark Script for Letta Code CLI + * + * Runs headless mode with LETTA_DEBUG_TIMINGS=1 and parses the output + * to measure latency breakdown at different stages. + * + * Usage: + * bun scripts/latency-benchmark.ts + * bun scripts/latency-benchmark.ts --scenario fresh-agent + * bun scripts/latency-benchmark.ts --iterations 5 + * + * Requires: LETTA_API_KEY environment variable + */ + +import { spawn } from "node:child_process"; + +interface ApiCall { + method: string; + path: string; + durationMs: number; + status?: number; +} + +interface Milestone { + name: string; + offsetMs: number; +} + +interface BenchmarkResult { + scenario: string; + totalMs: number; + milestones: Milestone[]; + apiCalls: ApiCall[]; + exitCode: number; +} + +interface ScenarioConfig { + name: string; + description: string; + args: string[]; +} + +// Define benchmark scenarios +const SCENARIOS: ScenarioConfig[] = [ + { + name: "fresh-agent", + description: "Create new agent and send simple prompt", + args: [ + "-p", + "What is 2+2? Reply with just the number.", + "--new-agent", + "--yolo", + "--output-format", + "json", + ], + }, + { + name: "resume-agent", + description: "Resume last agent and send simple prompt", + args: [ + "-p", + "What is 3+3? Reply with just the number.", + "--continue", + "--yolo", + "--output-format", + "json", + ], + }, + { + name: "minimal-math", + description: "Simple math question (no tool calls)", + args: [ + "-p", + "What is 5+5? Reply with just the number.", + "--continue", + "--yolo", + "--output-format", + "json", + ], + }, +]; + +/** + * Parse timing logs from stderr output + */ +function parseTimingLogs(stderr: string): { + milestones: Milestone[]; + apiCalls: ApiCall[]; +} { + const milestones: Milestone[] = []; + const apiCalls: ApiCall[] = []; + + const lines = stderr.split("\n"); + + for (const line of lines) { + // Parse milestones: [timing] MILESTONE CLI_START at +0ms (12:34:56.789) + const milestoneMatch = line.match( + /\[timing\] MILESTONE (\S+) at \+(\d+(?:\.\d+)?)(ms|s)/, + ); + if (milestoneMatch) { + const name = milestoneMatch[1]!; + let offsetMs = parseFloat(milestoneMatch[2]!); + if (milestoneMatch[3] === "s") { + offsetMs *= 1000; + } + milestones.push({ name, offsetMs }); + continue; + } + + // Parse API calls: [timing] GET /v1/agents/... -> 245ms (status: 200) + const apiMatch = line.match( + /\[timing\] (GET|POST|PUT|DELETE|PATCH) (\S+) -> (\d+(?:\.\d+)?)(ms|s)(?: \(status: (\d+)\))?/, + ); + if (apiMatch) { + const method = apiMatch[1]!; + const path = apiMatch[2]!; + let durationMs = parseFloat(apiMatch[3]!); + if (apiMatch[4] === "s") { + durationMs *= 1000; + } + const status = apiMatch[5] ? parseInt(apiMatch[5], 10) : undefined; + apiCalls.push({ method, path, durationMs, status }); + } + } + + return { milestones, apiCalls }; +} + +/** + * Run a single benchmark scenario + */ +async function runBenchmark(scenario: ScenarioConfig): Promise { + const start = performance.now(); + + return new Promise((resolve) => { + const proc = spawn("bun", ["run", "dev", ...scenario.args], { + env: { ...process.env, LETTA_DEBUG_TIMINGS: "1" }, + stdio: ["pipe", "pipe", "pipe"], + }); + + let stdout = ""; + let stderr = ""; + + proc.stdout.on("data", (data) => { + stdout += data.toString(); + }); + + proc.stderr.on("data", (data) => { + stderr += data.toString(); + }); + + proc.on("close", (code) => { + const totalMs = performance.now() - start; + const { milestones, apiCalls } = parseTimingLogs(stderr); + + resolve({ + scenario: scenario.name, + totalMs, + milestones, + apiCalls, + exitCode: code ?? 1, + }); + }); + + // Timeout after 2 minutes + setTimeout(() => { + proc.kill("SIGTERM"); + }, 120000); + }); +} + +/** + * Format duration for display + */ +function formatMs(ms: number): string { + if (ms < 1000) return `${Math.round(ms)}ms`; + return `${(ms / 1000).toFixed(2)}s`; +} + +/** + * Print benchmark results + */ +function printResults(results: BenchmarkResult[]): void { + console.log("\n" + "=".repeat(70)); + console.log("LATENCY BENCHMARK RESULTS"); + console.log("=".repeat(70) + "\n"); + + for (const result of results) { + const scenario = SCENARIOS.find((s) => s.name === result.scenario); + console.log(`Scenario: ${result.scenario}`); + console.log(` ${scenario?.description || ""}`); + console.log(` Exit code: ${result.exitCode}`); + console.log(` Total wall time: ${formatMs(result.totalMs)}`); + console.log(""); + + // Print milestones + if (result.milestones.length > 0) { + console.log(" Milestones:"); + let prevMs = 0; + for (const milestone of result.milestones) { + const delta = milestone.offsetMs - prevMs; + const deltaStr = prevMs === 0 ? "" : ` (+${formatMs(delta)})`; + console.log( + ` +${formatMs(milestone.offsetMs).padStart(8)} ${milestone.name}${deltaStr}`, + ); + prevMs = milestone.offsetMs; + } + console.log(""); + } + + // Print API calls summary + if (result.apiCalls.length > 0) { + console.log(" API Calls:"); + const totalApiMs = result.apiCalls.reduce((sum, c) => sum + c.durationMs, 0); + + // Group by path pattern + const grouped: Record = {}; + for (const call of result.apiCalls) { + // Normalize paths (remove UUIDs) + const normalizedPath = call.path.replace( + /[a-f0-9-]{36}/g, + "{id}", + ); + const key = `${call.method} ${normalizedPath}`; + if (!grouped[key]) { + grouped[key] = { count: 0, totalMs: 0 }; + } + grouped[key].count++; + grouped[key].totalMs += call.durationMs; + } + + // Sort by total time + const sorted = Object.entries(grouped).sort( + (a, b) => b[1].totalMs - a[1].totalMs, + ); + + for (const [endpoint, stats] of sorted) { + const countStr = stats.count > 1 ? ` (x${stats.count})` : ""; + console.log( + ` ${formatMs(stats.totalMs).padStart(8)} ${endpoint}${countStr}`, + ); + } + + console.log(` ${"─".repeat(50)}`); + console.log(` ${formatMs(totalApiMs).padStart(8)} Total API time`); + console.log( + ` ${formatMs(result.totalMs - totalApiMs).padStart(8)} CLI overhead (non-API)`, + ); + } + + console.log("\n" + "-".repeat(70) + "\n"); + } + + // Summary table + console.log("SUMMARY"); + console.log("-".repeat(70)); + console.log( + "Scenario".padEnd(20) + + "Total".padStart(12) + + "API Time".padStart(12) + + "CLI Overhead".padStart(14), + ); + console.log("-".repeat(70)); + + for (const result of results) { + const totalApiMs = result.apiCalls.reduce((sum, c) => sum + c.durationMs, 0); + const cliOverhead = result.totalMs - totalApiMs; + console.log( + result.scenario.padEnd(20) + + formatMs(result.totalMs).padStart(12) + + formatMs(totalApiMs).padStart(12) + + formatMs(cliOverhead).padStart(14), + ); + } + console.log("-".repeat(70)); +} + +async function main(): Promise { + // Parse args + const args = process.argv.slice(2); + let scenarioFilter: string | null = null; + let iterations = 1; + + for (let i = 0; i < args.length; i++) { + if (args[i] === "--scenario" && args[i + 1]) { + scenarioFilter = args[++i]!; + } else if (args[i] === "--iterations" && args[i + 1]) { + iterations = parseInt(args[++i]!, 10); + } + } + + // Check prereqs + if (!process.env.LETTA_API_KEY) { + console.error("Error: LETTA_API_KEY environment variable is required"); + process.exit(1); + } + + // Filter scenarios + const scenariosToRun = scenarioFilter + ? SCENARIOS.filter((s) => s.name === scenarioFilter) + : SCENARIOS; + + if (scenariosToRun.length === 0) { + console.error(`Error: Unknown scenario "${scenarioFilter}"`); + console.error(`Available scenarios: ${SCENARIOS.map((s) => s.name).join(", ")}`); + process.exit(1); + } + + console.log("Running latency benchmarks..."); + console.log(`Scenarios: ${scenariosToRun.map((s) => s.name).join(", ")}`); + console.log(`Iterations: ${iterations}`); + console.log(""); + + const allResults: BenchmarkResult[] = []; + + for (let iter = 0; iter < iterations; iter++) { + if (iterations > 1) { + console.log(`\n--- Iteration ${iter + 1} of ${iterations} ---`); + } + + for (const scenario of scenariosToRun) { + console.log(`Running: ${scenario.name}...`); + const result = await runBenchmark(scenario); + allResults.push(result); + + if (result.exitCode !== 0) { + console.warn(` Warning: ${scenario.name} exited with code ${result.exitCode}`); + } else { + console.log(` Completed in ${formatMs(result.totalMs)}`); + } + } + } + + printResults(allResults); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/src/agent/approval-recovery.ts b/src/agent/approval-recovery.ts index caae481..f420a96 100644 --- a/src/agent/approval-recovery.ts +++ b/src/agent/approval-recovery.ts @@ -2,9 +2,14 @@ import type { MessageCreate } from "@letta-ai/letta-client/resources/agents/agen import { getClient } from "./client"; import { APPROVAL_RECOVERY_PROMPT } from "./promptAssets"; +// Error when trying to SEND approval but server has no pending approval const APPROVAL_RECOVERY_DETAIL_FRAGMENT = "no tool call is currently awaiting approval"; +// Error when trying to SEND message but server has pending approval waiting +// This is the CONFLICT error - opposite of desync +const APPROVAL_PENDING_DETAIL_FRAGMENT = "cannot send a new message"; + type RunErrorMetadata = | { error_type?: string; @@ -20,6 +25,19 @@ export function isApprovalStateDesyncError(detail: unknown): boolean { return detail.toLowerCase().includes(APPROVAL_RECOVERY_DETAIL_FRAGMENT); } +/** + * Check if error indicates there's a pending approval blocking new messages. + * This is the CONFLICT error from the backend when trying to send a user message + * while the agent is waiting for approval on a tool call. + * + * Error format: + * { detail: "CONFLICT: Cannot send a new message: The agent is waiting for approval..." } + */ +export function isApprovalPendingError(detail: unknown): boolean { + if (typeof detail !== "string") return false; + return detail.toLowerCase().includes(APPROVAL_PENDING_DETAIL_FRAGMENT); +} + export async function fetchRunErrorDetail( runId: string | null | undefined, ): Promise { diff --git a/src/agent/check-approval.ts b/src/agent/check-approval.ts index b2e684e..5fdc268 100644 --- a/src/agent/check-approval.ts +++ b/src/agent/check-approval.ts @@ -29,8 +29,9 @@ export interface ResumeData { /** * Extract approval requests from an approval_request_message. + * Exported for testing parallel tool call handling. */ -function extractApprovals(messageToCheck: Message): { +export function extractApprovals(messageToCheck: Message): { pendingApproval: ApprovalRequest | null; pendingApprovals: ApprovalRequest[]; } { diff --git a/src/agent/create.ts b/src/agent/create.ts index b292d6f..4542c7a 100644 --- a/src/agent/create.ts +++ b/src/agent/create.ts @@ -271,26 +271,14 @@ export async function createAgent( // Track provenance: which blocks were created // Note: We no longer reuse shared blocks - each agent gets fresh blocks const blockProvenance: BlockProvenance[] = []; - const blockIds: string[] = []; - // Create all blocks fresh for the new agent + // Mark new blocks for provenance tracking (actual creation happens in agents.create) for (const block of filteredMemoryBlocks) { - try { - const createdBlock = await client.blocks.create(block); - if (!createdBlock.id) { - throw new Error(`Created block ${block.label} has no ID`); - } - blockIds.push(createdBlock.id); - blockProvenance.push({ label: block.label, source: "new" }); - } catch (error) { - console.error(`Failed to create block ${block.label}:`, error); - throw error; - } + blockProvenance.push({ label: block.label, source: "new" }); } - // Add any referenced block IDs (existing blocks to attach) + // Mark referenced blocks for provenance tracking for (const blockId of referencedBlockIds) { - blockIds.push(blockId); blockProvenance.push({ label: blockId, source: "shared" }); } @@ -314,7 +302,9 @@ export async function createAgent( systemPromptContent = `${systemPromptContent}\n\n${options.systemPromptAppend}`; } - // Create agent with all block IDs (existing + newly created) + // Create agent with inline memory blocks (LET-7101: single API call instead of N+1) + // - memory_blocks: new blocks to create inline + // - block_ids: references to existing blocks (for shared memory) const tags = ["origin:letta-code"]; if (process.env.LETTA_CODE_AGENT_ROLE === "subagent") { tags.push("role:subagent"); @@ -332,7 +322,11 @@ export async function createAgent( model: modelHandle, context_window_limit: contextWindow, tools: toolNames, - block_ids: blockIds, + // New blocks created inline with agent (saves ~2s of sequential API calls) + memory_blocks: + filteredMemoryBlocks.length > 0 ? filteredMemoryBlocks : undefined, + // Referenced block IDs (existing blocks to attach) + block_ids: referencedBlockIds.length > 0 ? referencedBlockIds : undefined, tags, // should be default off, but just in case include_base_tools: false, diff --git a/src/agent/skills.ts b/src/agent/skills.ts index 1845945..1029790 100644 --- a/src/agent/skills.ts +++ b/src/agent/skills.ts @@ -486,3 +486,125 @@ export function formatSkillsForMemory( // Otherwise fall back to compact tree format return formatSkillsAsTree(skills, skillsDirectory); } + +// ============================================================================ +// Skills Sync with Hash-Based Caching (Phase 2.5 - LET-7101) +// ============================================================================ + +import { createHash } from "node:crypto"; +import { mkdir, writeFile } from "node:fs/promises"; + +/** + * Get the project-local skills hash file path. + * Uses .letta/skills-hash.json in the current working directory + * because the skills block content depends on the project's .skills/ folder. + */ +function getSkillsHashFilePath(): string { + return join(process.cwd(), ".letta", "skills-hash.json"); +} + +interface SkillsHashCache { + hash: string; + timestamp: string; +} + +/** + * Compute a hash of the formatted skills content + */ +function computeSkillsHash(content: string): string { + return createHash("sha256").update(content).digest("hex").slice(0, 16); +} + +/** + * Get the cached skills hash (if any) + */ +async function getCachedSkillsHash(): Promise { + try { + const hashFile = getSkillsHashFilePath(); + const data = await readFile(hashFile, "utf-8"); + const cache: SkillsHashCache = JSON.parse(data); + return cache.hash; + } catch { + return null; + } +} + +/** + * Set the cached skills hash + */ +async function setCachedSkillsHash(hash: string): Promise { + try { + const hashFile = getSkillsHashFilePath(); + // Ensure project .letta directory exists + const lettaDir = join(process.cwd(), ".letta"); + await mkdir(lettaDir, { recursive: true }); + + const cache: SkillsHashCache = { + hash, + timestamp: new Date().toISOString(), + }; + await writeFile(hashFile, JSON.stringify(cache, null, 2)); + } catch { + // Ignore cache write failures - not critical + } +} + +/** + * Sync skills to an agent's memory block. + * Discovers skills from filesystem and updates the skills block. + * + * @param client - Letta client + * @param agentId - Agent ID to update + * @param skillsDirectory - Path to project skills directory + * @param options - Optional settings + * @returns Object indicating if sync occurred and discovered skills + */ +export async function syncSkillsToAgent( + client: import("@letta-ai/letta-client").default, + agentId: string, + skillsDirectory: string, + options?: { skipIfUnchanged?: boolean }, +): Promise<{ synced: boolean; skills: Skill[] }> { + // Discover skills from filesystem + const { skills, errors } = await discoverSkills(skillsDirectory); + + if (errors.length > 0) { + for (const error of errors) { + console.warn(`[skills] Discovery error: ${error.path}: ${error.message}`); + } + } + + // Format skills for memory block + const formattedSkills = formatSkillsForMemory(skills, skillsDirectory); + + // Check if we can skip the update + if (options?.skipIfUnchanged) { + const newHash = computeSkillsHash(formattedSkills); + const cachedHash = await getCachedSkillsHash(); + + if (newHash === cachedHash) { + return { synced: false, skills }; + } + + // Update the block and cache the new hash + await client.agents.blocks.update("skills", { + agent_id: agentId, + value: formattedSkills, + }); + await setCachedSkillsHash(newHash); + + return { synced: true, skills }; + } + + // No skip option - always update + await client.agents.blocks.update("skills", { + agent_id: agentId, + value: formattedSkills, + }); + + // Update hash cache for future runs + const newHash = computeSkillsHash(formattedSkills); + await setCachedSkillsHash(newHash); + + return { synced: true, skills }; +} diff --git a/src/cli/App.tsx b/src/cli/App.tsx index 40bbd50..c24bfbf 100644 --- a/src/cli/App.tsx +++ b/src/cli/App.tsx @@ -30,6 +30,7 @@ import { import { buildApprovalRecoveryMessage, fetchRunErrorDetail, + isApprovalPendingError, isApprovalStateDesyncError, } from "../agent/approval-recovery"; import { prefetchAvailableModelHandles } from "../agent/available-models"; @@ -175,10 +176,10 @@ import { useTerminalRows, useTerminalWidth } from "./hooks/useTerminalWidth"; const CLEAR_SCREEN_AND_HOME = "\u001B[2J\u001B[H"; const MIN_RESIZE_DELTA = 2; -// Feature flag: Check for pending approvals before sending messages -// This prevents infinite thinking state when there's an orphaned approval -// Can be disabled if the latency check adds too much overhead -const CHECK_PENDING_APPROVALS_BEFORE_SEND = true; +// Eager approval checking is now CONDITIONAL (LET-7101): +// - Enabled when resuming a session (--resume, --continue, or startupApprovals exist) +// - Disabled for normal messages (lazy recovery handles edge cases) +// This saves ~2s latency per message in the common case. // Feature flag: Eagerly cancel streams client-side when user presses ESC // When true (default), immediately abort the stream after calling .cancel() @@ -678,6 +679,12 @@ export default function App({ >(null); const toolAbortControllerRef = useRef(null); + // Eager approval checking: only enabled when resuming a session (LET-7101) + // After first successful message, we disable it since any new approvals are from our own turn + const [needsEagerApprovalCheck, setNeedsEagerApprovalCheck] = useState( + () => resumedExistingConversation || startupApprovals.length > 0, + ); + // Track auto-handled results to combine with user decisions const [autoHandledResults, setAutoHandledResults] = useState< Array<{ @@ -1941,6 +1948,12 @@ export default function App({ setStreaming(false); llmApiErrorRetriesRef.current = 0; // Reset retry counter on success + // Disable eager approval check after first successful message (LET-7101) + // Any new approvals from here on are from our own turn, not orphaned + if (needsEagerApprovalCheck) { + setNeedsEagerApprovalCheck(false); + } + // Send desktop notification when turn completes // and we're not about to auto-send another queued message if (!waitingForQueueCancelRef.current) { @@ -2552,6 +2565,112 @@ export default function App({ return; } + // Check for approval pending error (sent user message while approval waiting) + // This is the lazy recovery path for when needsEagerApprovalCheck is false + const approvalPendingDetected = + isApprovalPendingError(detailFromRun) || + isApprovalPendingError(latestErrorText); + + if ( + !hasApprovalInPayload && + approvalPendingDetected && + llmApiErrorRetriesRef.current < LLM_API_ERROR_MAX_RETRIES + ) { + llmApiErrorRetriesRef.current += 1; + + // Log for debugging (visible in transcripts) + const statusId = uid("status"); + buffersRef.current.byId.set(statusId, { + kind: "status", + id: statusId, + lines: [ + "[LAZY RECOVERY] Detected CONFLICT: server has pending approval", + "[LAZY RECOVERY] Fetching stale approvals to auto-deny...", + ], + }); + buffersRef.current.order.push(statusId); + refreshDerived(); + + try { + // Fetch pending approvals and auto-deny them + const client = await getClient(); + const agent = await client.agents.retrieve(agentIdRef.current); + const { pendingApprovals: existingApprovals } = + await getResumeData(client, agent, conversationIdRef.current); + + if (existingApprovals && existingApprovals.length > 0) { + // Update status with details + buffersRef.current.byId.set(statusId, { + kind: "status", + id: statusId, + lines: [ + "[LAZY RECOVERY] Detected CONFLICT: server has pending approval", + `[LAZY RECOVERY] Found ${existingApprovals.length} stale approval(s):`, + ...existingApprovals.map( + (a) => + ` - ${a.toolName} (${a.toolCallId.slice(0, 8)}...)`, + ), + "[LAZY RECOVERY] Auto-denying and batching with user message...", + ], + }); + refreshDerived(); + + // Create denial results for all stale approvals + // Use the same format as handleCancelApprovals (lines 6390-6395) + const denialResults = existingApprovals.map((approval) => ({ + type: "approval" as const, + tool_call_id: approval.toolCallId, + approve: false, + reason: + "Auto-denied: stale approval from interrupted session", + })); + + // Prepend approval denials to the current input (keeps user message) + const approvalPayload: ApprovalCreate = { + type: "approval", + approvals: denialResults, + }; + currentInput.unshift(approvalPayload); + } else { + // No approvals found - server state may have cleared + buffersRef.current.byId.set(statusId, { + kind: "status", + id: statusId, + lines: [ + "[LAZY RECOVERY] Detected CONFLICT but no pending approvals found", + "[LAZY RECOVERY] Retrying message...", + ], + }); + refreshDerived(); + } + } catch (_recoveryError) { + // If we can't fetch approvals, just retry the original message + buffersRef.current.byId.set(statusId, { + kind: "status", + id: statusId, + lines: [ + "[LAZY RECOVERY] Failed to fetch pending approvals", + "[LAZY RECOVERY] Retrying message anyway...", + ], + }); + refreshDerived(); + } + + // Brief pause so user can see the status + await new Promise((resolve) => setTimeout(resolve, 500)); + + // Remove the transient status + buffersRef.current.byId.delete(statusId); + buffersRef.current.order = buffersRef.current.order.filter( + (id) => id !== statusId, + ); + refreshDerived(); + + // Reset interrupted flag so retry stream chunks are processed + buffersRef.current.interrupted = false; + continue; + } + // Check if this is a retriable error (transient LLM API error) const retriable = await isRetriableError( stopReasonToHandle, @@ -2757,6 +2876,7 @@ export default function App({ setStreaming, currentModelId, updateStreamingOutput, + needsEagerApprovalCheck, ], ); @@ -3278,7 +3398,8 @@ export default function App({ const checkPendingApprovalsForSlashCommand = useCallback(async (): Promise< { blocked: true } | { blocked: false } > => { - if (!CHECK_PENDING_APPROVALS_BEFORE_SEND) { + // Only check eagerly when resuming a session (LET-7101) + if (!needsEagerApprovalCheck) { return { blocked: false }; } @@ -3425,7 +3546,13 @@ export default function App({ // If check fails, proceed anyway (don't block user) return { blocked: false }; } - }, [agentId, processConversation, refreshDerived, updateStreamingOutput]); + }, [ + agentId, + processConversation, + refreshDerived, + updateStreamingOutput, + needsEagerApprovalCheck, + ]); // biome-ignore lint/correctness/useExhaustiveDependencies: refs read .current dynamically, complex callback with intentional deps const onSubmit = useCallback( @@ -5372,7 +5499,20 @@ DO NOT respond to these messages or otherwise consider them in your response unl // Check for pending approvals before sending message (skip if we already have // a queued approval response to send first). - if (CHECK_PENDING_APPROVALS_BEFORE_SEND && !queuedApprovalResults) { + // Only do eager check when resuming a session (LET-7101) - otherwise lazy recovery handles it + if (needsEagerApprovalCheck && !queuedApprovalResults) { + // Log for debugging + const eagerStatusId = uid("status"); + buffersRef.current.byId.set(eagerStatusId, { + kind: "status", + id: eagerStatusId, + lines: [ + "[EAGER CHECK] Checking for pending approvals (resume mode)...", + ], + }); + buffersRef.current.order.push(eagerStatusId); + refreshDerived(); + try { const client = await getClient(); // Fetch fresh agent state to check for pending approvals with accurate in-context messages @@ -5383,6 +5523,12 @@ DO NOT respond to these messages or otherwise consider them in your response unl conversationIdRef.current, ); + // Remove eager check status + buffersRef.current.byId.delete(eagerStatusId); + buffersRef.current.order = buffersRef.current.order.filter( + (id) => id !== eagerStatusId, + ); + // Check if user cancelled while we were fetching approval state if ( userCancelledRef.current || diff --git a/src/headless.ts b/src/headless.ts index f197f27..996d25e 100644 --- a/src/headless.ts +++ b/src/headless.ts @@ -11,6 +11,7 @@ import type { ApprovalResult } from "./agent/approval-execution"; import { buildApprovalRecoveryMessage, fetchRunErrorDetail, + isApprovalPendingError, isApprovalStateDesyncError, } from "./agent/approval-recovery"; import { getClient } from "./agent/client"; @@ -40,11 +41,17 @@ import type { ControlResponse, ErrorMessage, MessageWire, + RecoveryMessage, ResultMessage, RetryMessage, StreamEvent, SystemInitMessage, } from "./types/protocol"; +import { + markMilestone, + measureSinceMilestone, + reportAllMilestones, +} from "./utils/timing"; // Maximum number of times to retry a turn when the backend // reports an `llm_api_error` stop reason. This helps smooth @@ -169,6 +176,7 @@ export async function handleHeadlessCommand( } const client = await getClient(); + markMilestone("HEADLESS_CLIENT_READY"); // Check for --resume flag (interactive only) if (values.resume) { @@ -462,6 +470,7 @@ export async function handleHeadlessCommand( console.error("No agent found. Use --new-agent to create a new agent."); process.exit(1); } + markMilestone("HEADLESS_AGENT_RESOLVED"); // Check if we're resuming an existing agent (not creating a new one) const isResumingAgent = !!( @@ -567,6 +576,7 @@ export async function handleHeadlessCommand( }); conversationId = conversation.id; } + markMilestone("HEADLESS_CONVERSATION_READY"); // Save session (agent + conversation) to both project and global settings // Skip for subagents - they shouldn't pollute the LRU settings @@ -591,41 +601,33 @@ export async function handleHeadlessCommand( // Set agent context for tools that need it (e.g., Skill tool, Task tool) setAgentContext(agent.id, skillsDirectory); - await initializeLoadedSkillsFlag(); - // Re-discover skills and update the skills memory block + // Fire-and-forget: Initialize loaded skills flag (LET-7101) + // Don't await - this is just for the skill unload reminder + initializeLoadedSkillsFlag().catch(() => { + // Ignore errors - not critical + }); + + // Fire-and-forget: Sync skills in background (LET-7101) // This ensures new skills added after agent creation are available - try { - const { discoverSkills, formatSkillsForMemory, SKILLS_DIR } = await import( - "./agent/skills" - ); - const { join } = await import("node:path"); + // Don't await - proceed to message sending immediately + (async () => { + try { + const { syncSkillsToAgent, SKILLS_DIR } = await import("./agent/skills"); + const { join } = await import("node:path"); - const resolvedSkillsDirectory = - skillsDirectory || join(process.cwd(), SKILLS_DIR); - const { skills, errors } = await discoverSkills(resolvedSkillsDirectory); + const resolvedSkillsDirectory = + skillsDirectory || join(process.cwd(), SKILLS_DIR); - if (errors.length > 0) { - console.warn("Errors encountered during skill discovery:"); - for (const error of errors) { - console.warn(` ${error.path}: ${error.message}`); - } + await syncSkillsToAgent(client, agent.id, resolvedSkillsDirectory, { + skipIfUnchanged: true, + }); + } catch (error) { + console.warn( + `[skills] Background sync failed: ${error instanceof Error ? error.message : String(error)}`, + ); } - - // Update the skills memory block with freshly discovered skills - const formattedSkills = formatSkillsForMemory( - skills, - resolvedSkillsDirectory, - ); - await client.agents.blocks.update("skills", { - agent_id: agent.id, - value: formattedSkills, - }); - } catch (error) { - console.warn( - `Failed to update skills: ${error instanceof Error ? error.message : String(error)}`, - ); - } + })(); // Validate output format const outputFormat = @@ -830,8 +832,11 @@ export async function handleHeadlessCommand( } }; - // Clear any pending approvals before starting a new turn - await resolveAllPendingApprovals(); + // Clear any pending approvals before starting a new turn - ONLY when resuming (LET-7101) + // For new agents/conversations, lazy recovery handles any edge cases + if (isResumingAgent) { + await resolveAllPendingApprovals(); + } // Build message content with reminders (plan mode first, then skill unload) const { permissionMode } = await import("./permissions/mode"); @@ -865,6 +870,9 @@ export async function handleHeadlessCommand( let lastKnownRunId: string | null = null; let llmApiErrorRetries = 0; + markMilestone("HEADLESS_FIRST_STREAM_START"); + measureSinceMilestone("headless-setup-total", "HEADLESS_CLIENT_READY"); + try { while (true) { const stream = await sendMessageStream(conversationId, currentInput); @@ -922,8 +930,25 @@ export async function handleHeadlessCommand( } // Detect server conflict due to pending approval; handle it and retry - if (errorInfo?.message?.includes("Cannot send a new message")) { - // Don't emit this error; clear approvals and retry outer loop + // Check both detail and message fields since error formats vary + if ( + isApprovalPendingError(errorInfo?.detail) || + isApprovalPendingError(errorInfo?.message) + ) { + // Emit recovery message for stream-json mode (enables testing) + if (outputFormat === "stream-json") { + const recoveryMsg: RecoveryMessage = { + type: "recovery", + recovery_type: "approval_pending", + message: + "Detected pending approval conflict; auto-denying stale approval and retrying", + run_id: lastRunId ?? undefined, + session_id: sessionId, + uuid: `recovery-${lastRunId || crypto.randomUUID()}`, + }; + console.log(JSON.stringify(recoveryMsg)); + } + // Clear approvals and retry outer loop await resolveAllPendingApprovals(); // Reset state and restart turn stopReason = "error" as StopReasonType; @@ -1541,6 +1566,10 @@ export async function handleHeadlessCommand( } console.log(resultText); } + + // Report all milestones at the end for latency audit + markMilestone("HEADLESS_COMPLETE"); + reportAllMilestones(); } /** diff --git a/src/index.ts b/src/index.ts index 987ca36..ab77fb9 100755 --- a/src/index.ts +++ b/src/index.ts @@ -17,6 +17,7 @@ import { permissionMode } from "./permissions/mode"; import { settingsManager } from "./settings-manager"; import { telemetry } from "./telemetry"; import { loadTools } from "./tools/manager"; +import { markMilestone } from "./utils/timing"; // Stable empty array constants to prevent new references on every render // These are used as fallbacks when resumeData is null, avoiding the React @@ -335,9 +336,12 @@ async function getPinnedAgentNames(): Promise<{ id: string; name: string }[]> { } async function main(): Promise { + markMilestone("CLI_START"); + // Initialize settings manager (loads settings once into memory) await settingsManager.initialize(); const settings = await settingsManager.getSettingsWithSecureTokens(); + markMilestone("SETTINGS_LOADED"); // Initialize LSP infrastructure for type checking if (process.env.LETTA_ENABLE_LSP) { @@ -728,6 +732,7 @@ async function main(): Promise { // Validate credentials by checking health endpoint const { validateCredentials } = await import("./auth/oauth"); const isValid = await validateCredentials(baseURL, apiKey ?? ""); + markMilestone("CREDENTIALS_VALIDATED"); if (!isValid) { // For headless mode, error out with helpful message @@ -829,18 +834,22 @@ async function main(): Promise { } if (isHeadless) { + markMilestone("HEADLESS_MODE_START"); // For headless mode, load tools synchronously (respecting model/toolset when provided) const modelForTools = getModelForToolLoading( specifiedModel, specifiedToolset as "codex" | "default" | undefined, ); await loadTools(modelForTools); + markMilestone("TOOLS_LOADED"); const { handleHeadlessCommand } = await import("./headless"); await handleHeadlessCommand(process.argv, specifiedModel, skillsDirectory); return; } + markMilestone("TUI_MODE_START"); + // Enable enhanced key reporting (Shift+Enter, etc.) BEFORE Ink initializes. // In VS Code/xterm.js this typically requires a short handshake (query + enable). try { @@ -853,6 +862,7 @@ async function main(): Promise { } // Interactive: lazy-load React/Ink + App + markMilestone("REACT_IMPORT_START"); const React = await import("react"); const { render } = await import("ink"); const { useState, useEffect } = React; @@ -1441,42 +1451,35 @@ async function main(): Promise { // Set agent context for tools that need it (e.g., Skill tool) setAgentContext(agent.id, skillsDirectory); - await initializeLoadedSkillsFlag(); - // Re-discover skills and update the skills memory block + // Fire-and-forget: Initialize loaded skills flag (LET-7101) + // Don't await - this is just for the skill unload reminder + initializeLoadedSkillsFlag().catch(() => { + // Ignore errors - not critical + }); + + // Fire-and-forget: Sync skills in background (LET-7101) // This ensures new skills added after agent creation are available - try { - const { discoverSkills, formatSkillsForMemory, SKILLS_DIR } = - await import("./agent/skills"); - const { join } = await import("node:path"); + // Don't await - user can start typing immediately + (async () => { + try { + const { syncSkillsToAgent, SKILLS_DIR } = await import( + "./agent/skills" + ); + const { join } = await import("node:path"); - const resolvedSkillsDirectory = - skillsDirectory || join(process.cwd(), SKILLS_DIR); - const { skills, errors } = await discoverSkills( - resolvedSkillsDirectory, - ); + const resolvedSkillsDirectory = + skillsDirectory || join(process.cwd(), SKILLS_DIR); - if (errors.length > 0) { - console.warn("Errors encountered during skill discovery:"); - for (const error of errors) { - console.warn(` ${error.path}: ${error.message}`); - } + await syncSkillsToAgent(client, agent.id, resolvedSkillsDirectory, { + skipIfUnchanged: true, + }); + } catch (error) { + console.warn( + `[skills] Background sync failed: ${error instanceof Error ? error.message : String(error)}`, + ); } - - // Update the skills memory block with freshly discovered skills - const formattedSkills = formatSkillsForMemory( - skills, - resolvedSkillsDirectory, - ); - await client.agents.blocks.update("skills", { - agent_id: agent.id, - value: formattedSkills, - }); - } catch (error) { - console.warn( - `Failed to update skills: ${error instanceof Error ? error.message : String(error)}`, - ); - } + })(); // Check if we're resuming an existing agent // We're resuming if: @@ -1797,6 +1800,7 @@ async function main(): Promise { }); } + markMilestone("REACT_RENDER_START"); render( React.createElement(LoadingApp, { continueSession: shouldContinue, diff --git a/src/tests/approval-recovery.test.ts b/src/tests/approval-recovery.test.ts new file mode 100644 index 0000000..ab8c68a --- /dev/null +++ b/src/tests/approval-recovery.test.ts @@ -0,0 +1,249 @@ +import { describe, expect, test } from "bun:test"; +import type { Message } from "@letta-ai/letta-client/resources/agents/messages"; +import { + isApprovalPendingError, + isApprovalStateDesyncError, +} from "../agent/approval-recovery"; +import { extractApprovals } from "../agent/check-approval"; + +/** + * Tests for approval error detection helpers (LET-7101). + * + * These functions detect two opposite error conditions: + * 1. isApprovalStateDesyncError: Sent approval, but server has no pending approval + * 2. isApprovalPendingError: Sent user message, but server has pending approval waiting + */ + +describe("isApprovalStateDesyncError", () => { + test("detects desync error in detail string", () => { + const detail = "No tool call is currently awaiting approval"; + expect(isApprovalStateDesyncError(detail)).toBe(true); + }); + + test("detects desync error case-insensitively", () => { + const detail = "NO TOOL CALL IS CURRENTLY AWAITING APPROVAL"; + expect(isApprovalStateDesyncError(detail)).toBe(true); + }); + + test("detects desync error in longer message", () => { + const detail = + "Error: No tool call is currently awaiting approval. The approval request may have expired."; + expect(isApprovalStateDesyncError(detail)).toBe(true); + }); + + test("returns false for unrelated errors", () => { + expect(isApprovalStateDesyncError("Connection timeout")).toBe(false); + expect(isApprovalStateDesyncError("Internal server error")).toBe(false); + }); + + test("returns false for non-string input", () => { + expect(isApprovalStateDesyncError(null)).toBe(false); + expect(isApprovalStateDesyncError(undefined)).toBe(false); + expect(isApprovalStateDesyncError(123)).toBe(false); + expect(isApprovalStateDesyncError({ error: "test" })).toBe(false); + }); +}); + +describe("isApprovalPendingError", () => { + // This is the actual error format from the Letta backend (screenshot from LET-7101) + const REAL_ERROR_DETAIL = + "CONFLICT: Cannot send a new message: The agent is waiting for approval on a tool call. Please approve or deny the pending request before continuing."; + + test("detects approval pending error in real error format", () => { + expect(isApprovalPendingError(REAL_ERROR_DETAIL)).toBe(true); + }); + + test("detects approval pending error case-insensitively", () => { + expect(isApprovalPendingError("CANNOT SEND A NEW MESSAGE")).toBe(true); + expect(isApprovalPendingError("cannot send a new message")).toBe(true); + }); + + test("detects partial match in longer message", () => { + const detail = "Error occurred: Cannot send a new message while processing"; + expect(isApprovalPendingError(detail)).toBe(true); + }); + + test("returns false for desync errors (opposite case)", () => { + // These are the OPPOSITE error - when we send approval but there's nothing pending + expect( + isApprovalPendingError("No tool call is currently awaiting approval"), + ).toBe(false); + }); + + test("returns false for unrelated errors", () => { + expect(isApprovalPendingError("Connection timeout")).toBe(false); + expect(isApprovalPendingError("Rate limit exceeded")).toBe(false); + expect(isApprovalPendingError("Invalid API key")).toBe(false); + }); + + test("returns false for non-string input", () => { + expect(isApprovalPendingError(null)).toBe(false); + expect(isApprovalPendingError(undefined)).toBe(false); + expect(isApprovalPendingError(123)).toBe(false); + expect(isApprovalPendingError({ detail: REAL_ERROR_DETAIL })).toBe(false); + }); +}); + +/** + * Tests for parallel tool call approval extraction. + * Ensures lazy recovery handles multiple simultaneous tool calls correctly. + */ +describe("extractApprovals", () => { + // Helper to create a minimal Message-like object for testing + // We use 'as Message' cast because the real Message type is complex + const createMessage = (overrides: { + tool_calls?: Array<{ + tool_call_id?: string; + name?: string; + arguments?: string; + }>; + tool_call?: { + tool_call_id?: string; + name?: string; + arguments?: string; + }; + }): Message => + ({ + id: "test-msg-id", + date: new Date().toISOString(), + message_type: "approval_request_message", + ...overrides, + }) as unknown as Message; + + test("extracts single tool call from tool_calls array", () => { + const msg = createMessage({ + tool_calls: [ + { + tool_call_id: "call-1", + name: "Bash", + arguments: '{"command": "echo hello"}', + }, + ], + }); + + const result = extractApprovals(msg); + + expect(result.pendingApprovals).toHaveLength(1); + expect(result.pendingApprovals[0]!.toolCallId).toBe("call-1"); + expect(result.pendingApprovals[0]!.toolName).toBe("Bash"); + expect(result.pendingApproval?.toolCallId).toBe("call-1"); + }); + + test("extracts multiple parallel tool calls", () => { + const msg = createMessage({ + tool_calls: [ + { + tool_call_id: "call-1", + name: "Bash", + arguments: '{"command": "echo hello"}', + }, + { + tool_call_id: "call-2", + name: "web_search", + arguments: '{"query": "test"}', + }, + { + tool_call_id: "call-3", + name: "Read", + arguments: '{"file_path": "/tmp/test.txt"}', + }, + ], + }); + + const result = extractApprovals(msg); + + expect(result.pendingApprovals).toHaveLength(3); + expect(result.pendingApprovals[0]!.toolCallId).toBe("call-1"); + expect(result.pendingApprovals[0]!.toolName).toBe("Bash"); + expect(result.pendingApprovals[1]!.toolCallId).toBe("call-2"); + expect(result.pendingApprovals[1]!.toolName).toBe("web_search"); + expect(result.pendingApprovals[2]!.toolCallId).toBe("call-3"); + expect(result.pendingApprovals[2]!.toolName).toBe("Read"); + // pendingApproval is deprecated, should be first item + expect(result.pendingApproval?.toolCallId).toBe("call-1"); + }); + + test("handles deprecated single tool_call field", () => { + const msg = createMessage({ + tool_call: { + tool_call_id: "call-legacy", + name: "Write", + arguments: '{"file_path": "/tmp/out.txt"}', + }, + }); + + const result = extractApprovals(msg); + + expect(result.pendingApprovals).toHaveLength(1); + expect(result.pendingApprovals[0]!.toolCallId).toBe("call-legacy"); + expect(result.pendingApprovals[0]!.toolName).toBe("Write"); + }); + + test("prefers tool_calls array over deprecated tool_call", () => { + const msg = createMessage({ + tool_calls: [{ tool_call_id: "call-new", name: "Bash", arguments: "{}" }], + tool_call: { + tool_call_id: "call-old", + name: "Write", + arguments: "{}", + }, + }); + + const result = extractApprovals(msg); + + // Should use tool_calls, not tool_call + expect(result.pendingApprovals).toHaveLength(1); + expect(result.pendingApprovals[0]!.toolCallId).toBe("call-new"); + }); + + test("filters out tool calls without tool_call_id", () => { + const msg = createMessage({ + tool_calls: [ + { tool_call_id: "call-valid", name: "Bash", arguments: "{}" }, + { name: "Invalid", arguments: "{}" }, // Missing tool_call_id + { tool_call_id: "", name: "Empty", arguments: "{}" }, // Empty tool_call_id + { tool_call_id: "call-valid-2", name: "Read", arguments: "{}" }, + ], + }); + + const result = extractApprovals(msg); + + // Should only include entries with valid tool_call_id + expect(result.pendingApprovals).toHaveLength(2); + expect(result.pendingApprovals[0]!.toolCallId).toBe("call-valid"); + expect(result.pendingApprovals[1]!.toolCallId).toBe("call-valid-2"); + }); + + test("returns empty array when no tool calls present", () => { + const msg = createMessage({}); + + const result = extractApprovals(msg); + + expect(result.pendingApprovals).toHaveLength(0); + expect(result.pendingApproval).toBeNull(); + }); + + test("handles missing name and arguments gracefully", () => { + const msg = createMessage({ + tool_calls: [{ tool_call_id: "call-minimal" }], + }); + + const result = extractApprovals(msg); + + expect(result.pendingApprovals).toHaveLength(1); + expect(result.pendingApprovals[0]!.toolCallId).toBe("call-minimal"); + expect(result.pendingApprovals[0]!.toolName).toBe(""); + expect(result.pendingApprovals[0]!.toolArgs).toBe(""); + }); +}); + +/** + * Note: Full integration testing of lazy approval recovery requires: + * 1. Starting CLI without --yolo + * 2. Sending a prompt that triggers a tool call requiring approval + * 3. Instead of approving, sending another user message + * 4. Verifying the CONFLICT error is detected and recovery happens + * + * This is complex to automate reliably in unit tests. + * Manual testing or a dedicated integration test suite is recommended. + */ diff --git a/src/tests/lazy-approval-recovery.test.ts b/src/tests/lazy-approval-recovery.test.ts new file mode 100644 index 0000000..3273227 --- /dev/null +++ b/src/tests/lazy-approval-recovery.test.ts @@ -0,0 +1,256 @@ +import { describe, expect, test } from "bun:test"; +import { spawn } from "node:child_process"; + +/** + * Integration test for lazy approval recovery (LET-7101). + * + * NOTE: The lazy approval recovery is primarily designed for TUI mode where: + * 1. User has a session with pending approvals (e.g., from a previous run) + * 2. User sends a new message before responding to the approval + * 3. Server returns CONFLICT error + * 4. CLI recovers by auto-denying stale approvals and retrying + * + * In bidirectional mode, messages sent during permission wait are dropped + * (see headless.ts line 1710-1714), so we can't directly test the CONFLICT + * scenario here. This test validates that the flow doesn't crash when + * messages are sent while approvals are pending. + * + * The RecoveryMessage emission can be tested by: + * 1. Manual testing in TUI mode (start session with orphaned approval) + * 2. Or by modifying headless mode to not drop messages during permission wait + */ + +// Prompt that will trigger a Bash tool call requiring approval +const BASH_TRIGGER_PROMPT = + "Run this exact bash command: echo test123. Do not use any other tools."; + +// Second message to send while approval is pending +const INTERRUPT_MESSAGE = + "Actually, just say OK instead. Do not call any tools."; + +interface StreamMessage { + type: string; + subtype?: string; + message_type?: string; + stop_reason?: string; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + [key: string]: any; +} + +/** + * Run bidirectional test with custom message handling. + * Allows sending messages at specific points in the flow. + */ +async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{ + messages: StreamMessage[]; + success: boolean; + errorSeen: boolean; +}> { + return new Promise((resolve, reject) => { + const proc = spawn( + "bun", + [ + "run", + "dev", + "-p", + "--input-format", + "stream-json", + "--output-format", + "stream-json", + "--new-agent", + "-m", + "haiku", + // NOTE: No --yolo flag - approvals are required + ], + { + cwd: process.cwd(), + env: { ...process.env }, + }, + ); + + const messages: StreamMessage[] = []; + let buffer = ""; + let initReceived = false; + let approvalSeen = false; + let interruptSent = false; + let errorSeen = false; + let resultCount = 0; + let closing = false; + + const timeout = setTimeout(() => { + if (!closing) { + proc.kill(); + reject(new Error(`Test timed out after ${timeoutMs}ms`)); + } + }, timeoutMs); + + const cleanup = () => { + closing = true; + clearTimeout(timeout); + setTimeout(() => { + proc.stdin?.end(); + proc.kill(); + }, 500); + }; + + const processLine = (line: string) => { + if (!line.trim()) return; + try { + const msg: StreamMessage = JSON.parse(line); + messages.push(msg); + + // Debug output + if (process.env.DEBUG_TEST) { + console.log("MSG:", JSON.stringify(msg, null, 2)); + } + + // Step 1: Wait for init, then send bash trigger prompt + if (msg.type === "system" && msg.subtype === "init" && !initReceived) { + initReceived = true; + const userMsg = JSON.stringify({ + type: "user", + message: { role: "user", content: BASH_TRIGGER_PROMPT }, + }); + proc.stdin?.write(`${userMsg}\n`); + return; + } + + // Step 2: When we see approval request, send another user message instead + if ( + msg.type === "message" && + msg.message_type === "approval_request_message" && + !approvalSeen + ) { + approvalSeen = true; + // Wait a moment, then send interrupt message (NOT an approval) + setTimeout(() => { + if (!interruptSent) { + interruptSent = true; + const userMsg = JSON.stringify({ + type: "user", + message: { role: "user", content: INTERRUPT_MESSAGE }, + }); + proc.stdin?.write(`${userMsg}\n`); + } + }, 500); + return; + } + + // Track recovery messages - this is the key signal that lazy recovery worked + if ( + msg.type === "recovery" && + msg.recovery_type === "approval_pending" + ) { + errorSeen = true; // reusing this flag to mean "recovery message seen" + } + + // Also track raw errors (shouldn't see these if recovery works properly) + if ( + msg.type === "error" || + (msg.type === "message" && msg.message_type === "error_message") + ) { + const detail = msg.detail || msg.message || ""; + if (detail.toLowerCase().includes("cannot send a new message")) { + // Raw error leaked through - recovery may have failed + console.log( + "WARNING: Raw CONFLICT error seen (recovery may have failed)", + ); + } + } + + // Track results - we need 2 (one for each user message, though first may fail) + if (msg.type === "result") { + resultCount++; + // After second result (or after seeing error + result), we're done + if (resultCount >= 2 || (errorSeen && resultCount >= 1)) { + cleanup(); + resolve({ messages, success: true, errorSeen }); + } + } + } catch { + // Not valid JSON, ignore + } + }; + + proc.stdout?.on("data", (data) => { + buffer += data.toString(); + const lines = buffer.split("\n"); + buffer = lines.pop() || ""; + for (const line of lines) { + processLine(line); + } + }); + + let _stderr = ""; + proc.stderr?.on("data", (data) => { + _stderr += data.toString(); + }); + + proc.on("close", (_code) => { + clearTimeout(timeout); + // Process any remaining buffer + if (buffer.trim()) { + processLine(buffer); + } + + if (!closing) { + // If we got here without resolving, check what we have + resolve({ + messages, + success: resultCount > 0, + errorSeen, + }); + } + }); + + proc.on("error", (err) => { + clearTimeout(timeout); + reject(err); + }); + }); +} + +describe("lazy approval recovery", () => { + test("handles concurrent message while approval is pending", async () => { + const result = await runLazyRecoveryTest(); + + // Log messages for debugging if test fails + if (!result.success) { + console.log("All messages received:"); + for (const msg of result.messages) { + console.log(JSON.stringify(msg, null, 2)); + } + } + + // We should have seen the approval request (proves tool requiring approval was called) + const approvalRequest = result.messages.find( + (m) => m.message_type === "approval_request_message", + ); + expect(approvalRequest).toBeDefined(); + + // The test should complete successfully + expect(result.success).toBe(true); + + // Count results - we should get at least 1 (the second message should always complete) + const resultCount = result.messages.filter( + (m) => m.type === "result", + ).length; + expect(resultCount).toBeGreaterThanOrEqual(1); + + // KEY ASSERTION: Check if we saw the recovery message + // This proves the lazy recovery mechanism was triggered + const recoveryMessage = result.messages.find( + (m) => m.type === "recovery" && m.recovery_type === "approval_pending", + ); + if (recoveryMessage) { + console.log("Recovery message detected - lazy recovery worked correctly"); + expect(result.errorSeen).toBe(true); // Should have been set when we saw recovery + } else { + // Recovery might not be triggered if approval was auto-handled before second message + // This can happen due to timing - the test still validates the flow works + console.log( + "Note: No recovery message seen - approval may have been handled before conflict", + ); + } + }, 180000); // 3 minute timeout for CI +}); diff --git a/src/tools/impl/Skill.ts b/src/tools/impl/Skill.ts index f57ca20..238d903 100644 --- a/src/tools/impl/Skill.ts +++ b/src/tools/impl/Skill.ts @@ -149,7 +149,7 @@ async function readSkillContent( try { const content = await readFile(projectSkillPath, "utf-8"); return { content, path: projectSkillPath }; - } catch (primaryError) { + } catch { // Fallback: check for bundled skills in a repo-level skills directory (legacy) try { const bundledSkillsDir = join(process.cwd(), "skills", "skills"); @@ -157,8 +157,11 @@ async function readSkillContent( const content = await readFile(bundledSkillPath, "utf-8"); return { content, path: bundledSkillPath }; } catch { - // If all fallbacks fail, rethrow the original error - throw primaryError; + // If all fallbacks fail, throw a helpful error message (LET-7101) + // Suggest refresh in case skills sync is still running in background + throw new Error( + `Skill "${skillId}" not found. If you recently added this skill, try Skill({ command: "refresh" }) to re-scan the skills directory.`, + ); } } } diff --git a/src/types/protocol.ts b/src/types/protocol.ts index 2b9da39..96e5e5f 100644 --- a/src/types/protocol.ts +++ b/src/types/protocol.ts @@ -198,6 +198,19 @@ export interface RetryMessage extends MessageEnvelope { run_id?: string; } +/** + * Recovery message emitted when the CLI detects and recovers from errors. + * Used for approval state conflicts and other recoverable errors. + */ +export interface RecoveryMessage extends MessageEnvelope { + type: "recovery"; + /** Type of recovery performed */ + recovery_type: "approval_pending" | "approval_desync"; + /** Human-readable description of what happened */ + message: string; + run_id?: string; +} + // ═══════════════════════════════════════════════════════════════ // RESULT // ═══════════════════════════════════════════════════════════════ @@ -329,6 +342,7 @@ export type WireMessage = | AutoApprovalMessage | ErrorMessage | RetryMessage + | RecoveryMessage | ResultMessage | ControlResponse | ControlRequest; // CLI → SDK control requests (e.g., can_use_tool) diff --git a/src/utils/timing.ts b/src/utils/timing.ts index 1a75e8b..934a6b1 100644 --- a/src/utils/timing.ts +++ b/src/utils/timing.ts @@ -34,6 +34,112 @@ export function logTiming(message: string): void { } } +// ============================================================================ +// Milestone tracking for latency audits +// ============================================================================ + +// Store milestones with their timestamps (ms since process start via performance.now()) +const milestones: Map = new Map(); + +// Reference time for relative measurements (set on first milestone) +let firstMilestoneTime: number | null = null; + +/** + * Mark a named milestone in the boot/execution sequence. + * Call this at key points to track where time is spent. + * + * @param name - Descriptive name like "SETTINGS_LOADED" or "AGENT_RESOLVED" + */ +export function markMilestone(name: string): void { + const now = performance.now(); + milestones.set(name, now); + + if (firstMilestoneTime === null) { + firstMilestoneTime = now; + } + + if (isTimingsEnabled()) { + const relative = now - firstMilestoneTime; + console.error( + `[timing] MILESTONE ${name} at +${formatDuration(relative)} (${formatTimestamp(new Date())})`, + ); + } +} + +/** + * Measure time elapsed since a previous milestone. + * + * @param label - Description of what we're measuring (e.g., "tool loading") + * @param fromMilestone - Name of the starting milestone + */ +export function measureSinceMilestone( + label: string, + fromMilestone: string, +): void { + if (!isTimingsEnabled()) return; + + const startTime = milestones.get(fromMilestone); + if (startTime === undefined) { + console.error( + `[timing] WARNING: milestone "${fromMilestone}" not found for measurement "${label}"`, + ); + return; + } + + const duration = performance.now() - startTime; + console.error(`[timing] ${label}: ${formatDuration(duration)}`); +} + +/** + * Get the duration between two milestones in milliseconds. + * Returns null if either milestone doesn't exist. + */ +export function getMilestoneDuration( + fromMilestone: string, + toMilestone: string, +): number | null { + const start = milestones.get(fromMilestone); + const end = milestones.get(toMilestone); + if (start === undefined || end === undefined) return null; + return end - start; +} + +/** + * Print a summary of all milestones with relative timestamps. + * Useful at the end of a benchmark run. + */ +export function reportAllMilestones(): void { + if (!isTimingsEnabled() || milestones.size === 0) return; + + const first = firstMilestoneTime ?? 0; + + console.error(`[timing] ======== MILESTONE SUMMARY ========`); + + // Sort by timestamp + const sorted = [...milestones.entries()].sort((a, b) => a[1] - b[1]); + + let prevTime = first; + for (const [name, time] of sorted) { + const relativeToStart = time - first; + const delta = time - prevTime; + const deltaStr = prevTime === first ? "" : ` (+${formatDuration(delta)})`; + console.error( + `[timing] +${formatDuration(relativeToStart).padStart(8)} ${name}${deltaStr}`, + ); + prevTime = time; + } + + console.error(`[timing] =====================================`); +} + +/** + * Clear all milestones (useful for running multiple benchmarks in sequence). + */ +export function clearMilestones(): void { + milestones.clear(); + firstMilestoneTime = null; +} + // Simple fetch type that matches the SDK's expected signature type SimpleFetch = ( input: string | URL | Request,