feat: reduce time-to-boot, remove default eager approval checks on inputs, auto-cancel stale approvals (#579)
Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
249
src/tests/approval-recovery.test.ts
Normal file
249
src/tests/approval-recovery.test.ts
Normal file
@@ -0,0 +1,249 @@
|
||||
import { describe, expect, test } from "bun:test";
|
||||
import type { Message } from "@letta-ai/letta-client/resources/agents/messages";
|
||||
import {
|
||||
isApprovalPendingError,
|
||||
isApprovalStateDesyncError,
|
||||
} from "../agent/approval-recovery";
|
||||
import { extractApprovals } from "../agent/check-approval";
|
||||
|
||||
/**
|
||||
* Tests for approval error detection helpers (LET-7101).
|
||||
*
|
||||
* These functions detect two opposite error conditions:
|
||||
* 1. isApprovalStateDesyncError: Sent approval, but server has no pending approval
|
||||
* 2. isApprovalPendingError: Sent user message, but server has pending approval waiting
|
||||
*/
|
||||
|
||||
describe("isApprovalStateDesyncError", () => {
|
||||
test("detects desync error in detail string", () => {
|
||||
const detail = "No tool call is currently awaiting approval";
|
||||
expect(isApprovalStateDesyncError(detail)).toBe(true);
|
||||
});
|
||||
|
||||
test("detects desync error case-insensitively", () => {
|
||||
const detail = "NO TOOL CALL IS CURRENTLY AWAITING APPROVAL";
|
||||
expect(isApprovalStateDesyncError(detail)).toBe(true);
|
||||
});
|
||||
|
||||
test("detects desync error in longer message", () => {
|
||||
const detail =
|
||||
"Error: No tool call is currently awaiting approval. The approval request may have expired.";
|
||||
expect(isApprovalStateDesyncError(detail)).toBe(true);
|
||||
});
|
||||
|
||||
test("returns false for unrelated errors", () => {
|
||||
expect(isApprovalStateDesyncError("Connection timeout")).toBe(false);
|
||||
expect(isApprovalStateDesyncError("Internal server error")).toBe(false);
|
||||
});
|
||||
|
||||
test("returns false for non-string input", () => {
|
||||
expect(isApprovalStateDesyncError(null)).toBe(false);
|
||||
expect(isApprovalStateDesyncError(undefined)).toBe(false);
|
||||
expect(isApprovalStateDesyncError(123)).toBe(false);
|
||||
expect(isApprovalStateDesyncError({ error: "test" })).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isApprovalPendingError", () => {
|
||||
// This is the actual error format from the Letta backend (screenshot from LET-7101)
|
||||
const REAL_ERROR_DETAIL =
|
||||
"CONFLICT: Cannot send a new message: The agent is waiting for approval on a tool call. Please approve or deny the pending request before continuing.";
|
||||
|
||||
test("detects approval pending error in real error format", () => {
|
||||
expect(isApprovalPendingError(REAL_ERROR_DETAIL)).toBe(true);
|
||||
});
|
||||
|
||||
test("detects approval pending error case-insensitively", () => {
|
||||
expect(isApprovalPendingError("CANNOT SEND A NEW MESSAGE")).toBe(true);
|
||||
expect(isApprovalPendingError("cannot send a new message")).toBe(true);
|
||||
});
|
||||
|
||||
test("detects partial match in longer message", () => {
|
||||
const detail = "Error occurred: Cannot send a new message while processing";
|
||||
expect(isApprovalPendingError(detail)).toBe(true);
|
||||
});
|
||||
|
||||
test("returns false for desync errors (opposite case)", () => {
|
||||
// These are the OPPOSITE error - when we send approval but there's nothing pending
|
||||
expect(
|
||||
isApprovalPendingError("No tool call is currently awaiting approval"),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
test("returns false for unrelated errors", () => {
|
||||
expect(isApprovalPendingError("Connection timeout")).toBe(false);
|
||||
expect(isApprovalPendingError("Rate limit exceeded")).toBe(false);
|
||||
expect(isApprovalPendingError("Invalid API key")).toBe(false);
|
||||
});
|
||||
|
||||
test("returns false for non-string input", () => {
|
||||
expect(isApprovalPendingError(null)).toBe(false);
|
||||
expect(isApprovalPendingError(undefined)).toBe(false);
|
||||
expect(isApprovalPendingError(123)).toBe(false);
|
||||
expect(isApprovalPendingError({ detail: REAL_ERROR_DETAIL })).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
/**
|
||||
* Tests for parallel tool call approval extraction.
|
||||
* Ensures lazy recovery handles multiple simultaneous tool calls correctly.
|
||||
*/
|
||||
describe("extractApprovals", () => {
|
||||
// Helper to create a minimal Message-like object for testing
|
||||
// We use 'as Message' cast because the real Message type is complex
|
||||
const createMessage = (overrides: {
|
||||
tool_calls?: Array<{
|
||||
tool_call_id?: string;
|
||||
name?: string;
|
||||
arguments?: string;
|
||||
}>;
|
||||
tool_call?: {
|
||||
tool_call_id?: string;
|
||||
name?: string;
|
||||
arguments?: string;
|
||||
};
|
||||
}): Message =>
|
||||
({
|
||||
id: "test-msg-id",
|
||||
date: new Date().toISOString(),
|
||||
message_type: "approval_request_message",
|
||||
...overrides,
|
||||
}) as unknown as Message;
|
||||
|
||||
test("extracts single tool call from tool_calls array", () => {
|
||||
const msg = createMessage({
|
||||
tool_calls: [
|
||||
{
|
||||
tool_call_id: "call-1",
|
||||
name: "Bash",
|
||||
arguments: '{"command": "echo hello"}',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const result = extractApprovals(msg);
|
||||
|
||||
expect(result.pendingApprovals).toHaveLength(1);
|
||||
expect(result.pendingApprovals[0]!.toolCallId).toBe("call-1");
|
||||
expect(result.pendingApprovals[0]!.toolName).toBe("Bash");
|
||||
expect(result.pendingApproval?.toolCallId).toBe("call-1");
|
||||
});
|
||||
|
||||
test("extracts multiple parallel tool calls", () => {
|
||||
const msg = createMessage({
|
||||
tool_calls: [
|
||||
{
|
||||
tool_call_id: "call-1",
|
||||
name: "Bash",
|
||||
arguments: '{"command": "echo hello"}',
|
||||
},
|
||||
{
|
||||
tool_call_id: "call-2",
|
||||
name: "web_search",
|
||||
arguments: '{"query": "test"}',
|
||||
},
|
||||
{
|
||||
tool_call_id: "call-3",
|
||||
name: "Read",
|
||||
arguments: '{"file_path": "/tmp/test.txt"}',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const result = extractApprovals(msg);
|
||||
|
||||
expect(result.pendingApprovals).toHaveLength(3);
|
||||
expect(result.pendingApprovals[0]!.toolCallId).toBe("call-1");
|
||||
expect(result.pendingApprovals[0]!.toolName).toBe("Bash");
|
||||
expect(result.pendingApprovals[1]!.toolCallId).toBe("call-2");
|
||||
expect(result.pendingApprovals[1]!.toolName).toBe("web_search");
|
||||
expect(result.pendingApprovals[2]!.toolCallId).toBe("call-3");
|
||||
expect(result.pendingApprovals[2]!.toolName).toBe("Read");
|
||||
// pendingApproval is deprecated, should be first item
|
||||
expect(result.pendingApproval?.toolCallId).toBe("call-1");
|
||||
});
|
||||
|
||||
test("handles deprecated single tool_call field", () => {
|
||||
const msg = createMessage({
|
||||
tool_call: {
|
||||
tool_call_id: "call-legacy",
|
||||
name: "Write",
|
||||
arguments: '{"file_path": "/tmp/out.txt"}',
|
||||
},
|
||||
});
|
||||
|
||||
const result = extractApprovals(msg);
|
||||
|
||||
expect(result.pendingApprovals).toHaveLength(1);
|
||||
expect(result.pendingApprovals[0]!.toolCallId).toBe("call-legacy");
|
||||
expect(result.pendingApprovals[0]!.toolName).toBe("Write");
|
||||
});
|
||||
|
||||
test("prefers tool_calls array over deprecated tool_call", () => {
|
||||
const msg = createMessage({
|
||||
tool_calls: [{ tool_call_id: "call-new", name: "Bash", arguments: "{}" }],
|
||||
tool_call: {
|
||||
tool_call_id: "call-old",
|
||||
name: "Write",
|
||||
arguments: "{}",
|
||||
},
|
||||
});
|
||||
|
||||
const result = extractApprovals(msg);
|
||||
|
||||
// Should use tool_calls, not tool_call
|
||||
expect(result.pendingApprovals).toHaveLength(1);
|
||||
expect(result.pendingApprovals[0]!.toolCallId).toBe("call-new");
|
||||
});
|
||||
|
||||
test("filters out tool calls without tool_call_id", () => {
|
||||
const msg = createMessage({
|
||||
tool_calls: [
|
||||
{ tool_call_id: "call-valid", name: "Bash", arguments: "{}" },
|
||||
{ name: "Invalid", arguments: "{}" }, // Missing tool_call_id
|
||||
{ tool_call_id: "", name: "Empty", arguments: "{}" }, // Empty tool_call_id
|
||||
{ tool_call_id: "call-valid-2", name: "Read", arguments: "{}" },
|
||||
],
|
||||
});
|
||||
|
||||
const result = extractApprovals(msg);
|
||||
|
||||
// Should only include entries with valid tool_call_id
|
||||
expect(result.pendingApprovals).toHaveLength(2);
|
||||
expect(result.pendingApprovals[0]!.toolCallId).toBe("call-valid");
|
||||
expect(result.pendingApprovals[1]!.toolCallId).toBe("call-valid-2");
|
||||
});
|
||||
|
||||
test("returns empty array when no tool calls present", () => {
|
||||
const msg = createMessage({});
|
||||
|
||||
const result = extractApprovals(msg);
|
||||
|
||||
expect(result.pendingApprovals).toHaveLength(0);
|
||||
expect(result.pendingApproval).toBeNull();
|
||||
});
|
||||
|
||||
test("handles missing name and arguments gracefully", () => {
|
||||
const msg = createMessage({
|
||||
tool_calls: [{ tool_call_id: "call-minimal" }],
|
||||
});
|
||||
|
||||
const result = extractApprovals(msg);
|
||||
|
||||
expect(result.pendingApprovals).toHaveLength(1);
|
||||
expect(result.pendingApprovals[0]!.toolCallId).toBe("call-minimal");
|
||||
expect(result.pendingApprovals[0]!.toolName).toBe("");
|
||||
expect(result.pendingApprovals[0]!.toolArgs).toBe("");
|
||||
});
|
||||
});
|
||||
|
||||
/**
|
||||
* Note: Full integration testing of lazy approval recovery requires:
|
||||
* 1. Starting CLI without --yolo
|
||||
* 2. Sending a prompt that triggers a tool call requiring approval
|
||||
* 3. Instead of approving, sending another user message
|
||||
* 4. Verifying the CONFLICT error is detected and recovery happens
|
||||
*
|
||||
* This is complex to automate reliably in unit tests.
|
||||
* Manual testing or a dedicated integration test suite is recommended.
|
||||
*/
|
||||
256
src/tests/lazy-approval-recovery.test.ts
Normal file
256
src/tests/lazy-approval-recovery.test.ts
Normal file
@@ -0,0 +1,256 @@
|
||||
import { describe, expect, test } from "bun:test";
|
||||
import { spawn } from "node:child_process";
|
||||
|
||||
/**
|
||||
* Integration test for lazy approval recovery (LET-7101).
|
||||
*
|
||||
* NOTE: The lazy approval recovery is primarily designed for TUI mode where:
|
||||
* 1. User has a session with pending approvals (e.g., from a previous run)
|
||||
* 2. User sends a new message before responding to the approval
|
||||
* 3. Server returns CONFLICT error
|
||||
* 4. CLI recovers by auto-denying stale approvals and retrying
|
||||
*
|
||||
* In bidirectional mode, messages sent during permission wait are dropped
|
||||
* (see headless.ts line 1710-1714), so we can't directly test the CONFLICT
|
||||
* scenario here. This test validates that the flow doesn't crash when
|
||||
* messages are sent while approvals are pending.
|
||||
*
|
||||
* The RecoveryMessage emission can be tested by:
|
||||
* 1. Manual testing in TUI mode (start session with orphaned approval)
|
||||
* 2. Or by modifying headless mode to not drop messages during permission wait
|
||||
*/
|
||||
|
||||
// Prompt that will trigger a Bash tool call requiring approval
|
||||
const BASH_TRIGGER_PROMPT =
|
||||
"Run this exact bash command: echo test123. Do not use any other tools.";
|
||||
|
||||
// Second message to send while approval is pending
|
||||
const INTERRUPT_MESSAGE =
|
||||
"Actually, just say OK instead. Do not call any tools.";
|
||||
|
||||
interface StreamMessage {
|
||||
type: string;
|
||||
subtype?: string;
|
||||
message_type?: string;
|
||||
stop_reason?: string;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
[key: string]: any;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run bidirectional test with custom message handling.
|
||||
* Allows sending messages at specific points in the flow.
|
||||
*/
|
||||
async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{
|
||||
messages: StreamMessage[];
|
||||
success: boolean;
|
||||
errorSeen: boolean;
|
||||
}> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const proc = spawn(
|
||||
"bun",
|
||||
[
|
||||
"run",
|
||||
"dev",
|
||||
"-p",
|
||||
"--input-format",
|
||||
"stream-json",
|
||||
"--output-format",
|
||||
"stream-json",
|
||||
"--new-agent",
|
||||
"-m",
|
||||
"haiku",
|
||||
// NOTE: No --yolo flag - approvals are required
|
||||
],
|
||||
{
|
||||
cwd: process.cwd(),
|
||||
env: { ...process.env },
|
||||
},
|
||||
);
|
||||
|
||||
const messages: StreamMessage[] = [];
|
||||
let buffer = "";
|
||||
let initReceived = false;
|
||||
let approvalSeen = false;
|
||||
let interruptSent = false;
|
||||
let errorSeen = false;
|
||||
let resultCount = 0;
|
||||
let closing = false;
|
||||
|
||||
const timeout = setTimeout(() => {
|
||||
if (!closing) {
|
||||
proc.kill();
|
||||
reject(new Error(`Test timed out after ${timeoutMs}ms`));
|
||||
}
|
||||
}, timeoutMs);
|
||||
|
||||
const cleanup = () => {
|
||||
closing = true;
|
||||
clearTimeout(timeout);
|
||||
setTimeout(() => {
|
||||
proc.stdin?.end();
|
||||
proc.kill();
|
||||
}, 500);
|
||||
};
|
||||
|
||||
const processLine = (line: string) => {
|
||||
if (!line.trim()) return;
|
||||
try {
|
||||
const msg: StreamMessage = JSON.parse(line);
|
||||
messages.push(msg);
|
||||
|
||||
// Debug output
|
||||
if (process.env.DEBUG_TEST) {
|
||||
console.log("MSG:", JSON.stringify(msg, null, 2));
|
||||
}
|
||||
|
||||
// Step 1: Wait for init, then send bash trigger prompt
|
||||
if (msg.type === "system" && msg.subtype === "init" && !initReceived) {
|
||||
initReceived = true;
|
||||
const userMsg = JSON.stringify({
|
||||
type: "user",
|
||||
message: { role: "user", content: BASH_TRIGGER_PROMPT },
|
||||
});
|
||||
proc.stdin?.write(`${userMsg}\n`);
|
||||
return;
|
||||
}
|
||||
|
||||
// Step 2: When we see approval request, send another user message instead
|
||||
if (
|
||||
msg.type === "message" &&
|
||||
msg.message_type === "approval_request_message" &&
|
||||
!approvalSeen
|
||||
) {
|
||||
approvalSeen = true;
|
||||
// Wait a moment, then send interrupt message (NOT an approval)
|
||||
setTimeout(() => {
|
||||
if (!interruptSent) {
|
||||
interruptSent = true;
|
||||
const userMsg = JSON.stringify({
|
||||
type: "user",
|
||||
message: { role: "user", content: INTERRUPT_MESSAGE },
|
||||
});
|
||||
proc.stdin?.write(`${userMsg}\n`);
|
||||
}
|
||||
}, 500);
|
||||
return;
|
||||
}
|
||||
|
||||
// Track recovery messages - this is the key signal that lazy recovery worked
|
||||
if (
|
||||
msg.type === "recovery" &&
|
||||
msg.recovery_type === "approval_pending"
|
||||
) {
|
||||
errorSeen = true; // reusing this flag to mean "recovery message seen"
|
||||
}
|
||||
|
||||
// Also track raw errors (shouldn't see these if recovery works properly)
|
||||
if (
|
||||
msg.type === "error" ||
|
||||
(msg.type === "message" && msg.message_type === "error_message")
|
||||
) {
|
||||
const detail = msg.detail || msg.message || "";
|
||||
if (detail.toLowerCase().includes("cannot send a new message")) {
|
||||
// Raw error leaked through - recovery may have failed
|
||||
console.log(
|
||||
"WARNING: Raw CONFLICT error seen (recovery may have failed)",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Track results - we need 2 (one for each user message, though first may fail)
|
||||
if (msg.type === "result") {
|
||||
resultCount++;
|
||||
// After second result (or after seeing error + result), we're done
|
||||
if (resultCount >= 2 || (errorSeen && resultCount >= 1)) {
|
||||
cleanup();
|
||||
resolve({ messages, success: true, errorSeen });
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// Not valid JSON, ignore
|
||||
}
|
||||
};
|
||||
|
||||
proc.stdout?.on("data", (data) => {
|
||||
buffer += data.toString();
|
||||
const lines = buffer.split("\n");
|
||||
buffer = lines.pop() || "";
|
||||
for (const line of lines) {
|
||||
processLine(line);
|
||||
}
|
||||
});
|
||||
|
||||
let _stderr = "";
|
||||
proc.stderr?.on("data", (data) => {
|
||||
_stderr += data.toString();
|
||||
});
|
||||
|
||||
proc.on("close", (_code) => {
|
||||
clearTimeout(timeout);
|
||||
// Process any remaining buffer
|
||||
if (buffer.trim()) {
|
||||
processLine(buffer);
|
||||
}
|
||||
|
||||
if (!closing) {
|
||||
// If we got here without resolving, check what we have
|
||||
resolve({
|
||||
messages,
|
||||
success: resultCount > 0,
|
||||
errorSeen,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
proc.on("error", (err) => {
|
||||
clearTimeout(timeout);
|
||||
reject(err);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
describe("lazy approval recovery", () => {
|
||||
test("handles concurrent message while approval is pending", async () => {
|
||||
const result = await runLazyRecoveryTest();
|
||||
|
||||
// Log messages for debugging if test fails
|
||||
if (!result.success) {
|
||||
console.log("All messages received:");
|
||||
for (const msg of result.messages) {
|
||||
console.log(JSON.stringify(msg, null, 2));
|
||||
}
|
||||
}
|
||||
|
||||
// We should have seen the approval request (proves tool requiring approval was called)
|
||||
const approvalRequest = result.messages.find(
|
||||
(m) => m.message_type === "approval_request_message",
|
||||
);
|
||||
expect(approvalRequest).toBeDefined();
|
||||
|
||||
// The test should complete successfully
|
||||
expect(result.success).toBe(true);
|
||||
|
||||
// Count results - we should get at least 1 (the second message should always complete)
|
||||
const resultCount = result.messages.filter(
|
||||
(m) => m.type === "result",
|
||||
).length;
|
||||
expect(resultCount).toBeGreaterThanOrEqual(1);
|
||||
|
||||
// KEY ASSERTION: Check if we saw the recovery message
|
||||
// This proves the lazy recovery mechanism was triggered
|
||||
const recoveryMessage = result.messages.find(
|
||||
(m) => m.type === "recovery" && m.recovery_type === "approval_pending",
|
||||
);
|
||||
if (recoveryMessage) {
|
||||
console.log("Recovery message detected - lazy recovery worked correctly");
|
||||
expect(result.errorSeen).toBe(true); // Should have been set when we saw recovery
|
||||
} else {
|
||||
// Recovery might not be triggered if approval was auto-handled before second message
|
||||
// This can happen due to timing - the test still validates the flow works
|
||||
console.log(
|
||||
"Note: No recovery message seen - approval may have been handled before conflict",
|
||||
);
|
||||
}
|
||||
}, 180000); // 3 minute timeout for CI
|
||||
});
|
||||
Reference in New Issue
Block a user