ci(test): split unit vs API integration tests (#797)

This commit is contained in:
paulbettner
2026-02-03 19:09:40 -06:00
committed by GitHub
parent 55a31adae7
commit d175b0e155
6 changed files with 292 additions and 240 deletions

View File

@@ -1,491 +0,0 @@
import { describe, expect, test } from "bun:test";
import { spawn } from "node:child_process";
import type {
ControlResponse,
ErrorMessage,
ResultMessage,
StreamEvent,
SystemInitMessage,
WireMessage,
} from "../types/protocol";
/**
* Tests for --input-format stream-json bidirectional communication.
* These verify the CLI's wire format for bidirectional communication.
*/
/**
 * Prescriptive prompt intended to produce a single-step response without tool use.
 * Used as the user message content by the single-turn tests in this file.
 */
const FAST_PROMPT =
"This is a test. Do not call any tools. Just respond with the word OK and nothing else.";
/**
* Helper to run bidirectional commands with stdin input.
* Event-driven: waits for init message before sending input, waits for result before closing.
*/
/**
 * Spawn the CLI (`bun run dev -p ...`) in bidirectional stream-json mode and
 * drive it over stdin.
 *
 * The helper is event-driven:
 * - the first input is written only after the `system`/`init` message arrives;
 * - each following input is written shortly after the previous one has been
 *   answered (a `result` or a `control_response`);
 * - stdin is closed once every input was sent and every expected response
 *   (one `result` per `user` input, one `control_response` per
 *   `control_request` input) has been received, or as soon as an `error`
 *   message arrives when one of the inputs is not valid JSON.
 *
 * @param inputs - Raw lines to write to the child's stdin, in order.
 * @param extraArgs - Extra CLI flags appended after the fixed argument list.
 * @param timeoutMs - Safety timeout; when it elapses the child is killed and
 *   the promise rejects.
 * @returns Every JSON value parsed from the child's stdout, in arrival order.
 * @throws Rejects when the child cannot be spawned, exits non-zero before the
 *   expected responses arrived, or the timeout elapses.
 */
async function runBidirectional(
  inputs: string[],
  extraArgs: string[] = [],
  timeoutMs = 180000, // 180s timeout - CI can be very slow
): Promise<object[]> {
  return new Promise((resolve, reject) => {
    const proc = spawn(
      "bun",
      [
        "run",
        "dev",
        "-p",
        "--input-format",
        "stream-json",
        "--output-format",
        "stream-json",
        "--new-agent",
        "-m",
        "haiku",
        "--yolo",
        ...extraArgs,
      ],
      {
        cwd: process.cwd(),
        // Mark as subagent to prevent polluting user's LRU settings
        env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
      },
    );

    const objects: object[] = [];
    let buffer = "";
    let stderr = "";
    let inputIndex = 0;
    let initReceived = false;
    let closing = false;

    // Classify inputs up front so we know how many responses to wait for.
    const inputTypes = inputs.map((i) => {
      try {
        const parsed = JSON.parse(i);
        return parsed.type;
      } catch {
        return "invalid"; // Invalid JSON
      }
    });
    const expectedUserResults = inputTypes.filter((t) => t === "user").length;
    const expectedControlResponses = inputTypes.filter(
      (t) => t === "control_request",
    ).length;
    const hasInvalidInput = inputTypes.includes("invalid");

    let userResultsReceived = 0;
    let controlResponsesReceived = 0;

    // Safety timeout: kill the child and fail with a progress summary.
    const timeout = setTimeout(() => {
      proc.kill();
      reject(
        new Error(
          `Timeout after ${timeoutMs}ms. Received ${objects.length} objects, init: ${initReceived}, userResults: ${userResultsReceived}/${expectedUserResults}, controlResponses: ${controlResponsesReceived}/${expectedControlResponses}`,
        ),
      );
    }, timeoutMs);

    /** Write the next pending input line, if any. */
    const sendNextInput = () => {
      if (inputIndex < inputs.length) {
        proc.stdin?.write(`${inputs[inputIndex]}\n`);
        inputIndex++;
      }
    };

    /** Close stdin (after a short grace period) once nothing more is expected. */
    const maybeClose = () => {
      if (closing) return;
      const allUserResultsDone =
        expectedUserResults === 0 || userResultsReceived >= expectedUserResults;
      const allControlResponsesDone =
        expectedControlResponses === 0 ||
        controlResponsesReceived >= expectedControlResponses;
      const allInputsSent = inputIndex >= inputs.length;
      if (allInputsSent && allUserResultsDone && allControlResponsesDone) {
        closing = true;
        setTimeout(() => proc.stdin?.end(), 500);
      }
    };

    /** Parse one stdout line and advance the send/close state machine. */
    const processLine = (line: string) => {
      if (!line.trim()) return;
      try {
        const obj = JSON.parse(line);
        objects.push(obj);

        // Init message is the signal to start sending inputs.
        if (obj.type === "system" && obj.subtype === "init" && !initReceived) {
          initReceived = true;
          sendNextInput();
        }

        if (obj.type === "control_response") {
          controlResponsesReceived++;
          // A control request produces no `result`, so the next queued input
          // must be released here; otherwise inputs queued behind a control
          // request would never be written and the run would time out.
          if (inputIndex < inputs.length) {
            setTimeout(sendNextInput, 200);
          }
          maybeClose();
        }

        if (obj.type === "result") {
          userResultsReceived++;
          // Brief delay gives the CLI time to be ready for the next input.
          if (inputIndex < inputs.length) {
            setTimeout(sendNextInput, 200);
          }
          // Always check if we should close (might have received all expected results)
          maybeClose();
        }

        // For the invalid-JSON scenario the error message is the final response.
        if (obj.type === "error" && hasInvalidInput) {
          closing = true;
          setTimeout(() => proc.stdin?.end(), 500);
        }
      } catch {
        // Not valid JSON, ignore
      }
    };

    proc.stdout?.on("data", (data) => {
      buffer += data.toString();
      const lines = buffer.split("\n");
      buffer = lines.pop() || ""; // Keep incomplete line in buffer
      for (const line of lines) {
        processLine(line);
      }
    });

    proc.stderr?.on("data", (data) => {
      stderr += data.toString();
    });

    // Spawn failures are reported via the 'error' event; without a listener
    // they would surface as an uncaught exception instead of a rejection.
    proc.on("error", (err) => {
      clearTimeout(timeout);
      reject(err);
    });

    proc.on("close", (code) => {
      clearTimeout(timeout);

      // Process any remaining buffer
      if (buffer.trim()) {
        processLine(buffer);
      }

      const gotExpectedResults =
        userResultsReceived >= expectedUserResults &&
        controlResponsesReceived >= expectedControlResponses;

      if (objects.length === 0 && code !== 0) {
        reject(
          new Error(
            `Process exited with code ${code}, no output received. stderr: ${stderr}`,
          ),
        );
      } else if (!gotExpectedResults && code !== 0) {
        reject(
          new Error(
            `Process exited with code ${code} before all results received. ` +
              `Got ${userResultsReceived}/${expectedUserResults} user results, ` +
              `${controlResponsesReceived}/${expectedControlResponses} control responses. ` +
              `inputIndex: ${inputIndex}, initReceived: ${initReceived}. stderr: ${stderr}`,
          ),
        );
      } else {
        resolve(objects);
      }
    });
  });
}
// End-to-end checks of the bidirectional stream-json wire format. Every test
// spawns the CLI through runBidirectional and inspects the parsed messages.
describe("input-format stream-json", () => {
  /** Serialize a wire-format user message carrying `content`. */
  const userLine = (content: string): string =>
    JSON.stringify({
      type: "user",
      message: { role: "user", content },
    });

  /** Serialize a wire-format control request with the given id and subtype. */
  const controlLine = (requestId: string, subtype: string): string =>
    JSON.stringify({
      type: "control_request",
      request_id: requestId,
      request: { subtype },
    });

  const isInit = (m: WireMessage): m is SystemInitMessage =>
    m.type === "system" && "subtype" in m && m.subtype === "init";

  const isResult = (m: WireMessage): m is ResultMessage => m.type === "result";

  const isControlResponseFor =
    (requestId: string) =>
    (m: WireMessage): m is ControlResponse =>
      m.type === "control_response" && m.response?.request_id === requestId;

  /** True for the message types that carry content (and therefore a uuid). */
  const isContentType = (kind: unknown): boolean =>
    kind === "reasoning_message" || kind === "assistant_message";

  test(
    "initialize control request returns session info",
    async () => {
      const wire = (await runBidirectional([
        controlLine("init_1", "initialize"),
      ])) as WireMessage[];

      // The init event describes the session.
      const init = wire.find(isInit);
      expect(init).toBeDefined();
      expect(init?.agent_id).toBeDefined();
      expect(init?.session_id).toBeDefined();
      expect(init?.model).toBeDefined();
      expect(init?.tools).toBeInstanceOf(Array);

      // The request itself is answered with a successful control_response.
      const ack = wire.find(
        (m): m is ControlResponse => m.type === "control_response",
      );
      expect(ack).toBeDefined();
      expect(ack?.response.subtype).toBe("success");
      expect(ack?.response.request_id).toBe("init_1");
      if (ack?.response.subtype === "success") {
        const payload = ack.response.response as
          | { agent_id?: string }
          | undefined;
        expect(payload?.agent_id).toBeDefined();
      }
    },
    { timeout: 200000 },
  );

  test(
    "user message returns assistant response and result",
    async () => {
      const wire = (await runBidirectional([
        userLine(FAST_PROMPT),
      ])) as WireMessage[];

      expect(wire.find(isInit)).toBeDefined();

      const messages = wire.filter(
        (m): m is WireMessage & { type: "message" } => m.type === "message",
      );
      expect(messages.length).toBeGreaterThan(0);

      // Every message carries session_id; uuid is only present on content
      // messages (reasoning, assistant), not on meta messages
      // (stop_reason, usage_statistics).
      for (const m of messages) {
        expect(m.session_id).toBeDefined();
      }
      const withContent = messages.filter(
        (m) => "message_type" in m && isContentType(m.message_type),
      );
      for (const m of withContent) {
        expect(m.uuid).toBeDefined();
      }

      const outcome = wire.find(isResult);
      expect(outcome).toBeDefined();
      expect(outcome?.subtype).toBe("success");
      expect(outcome?.session_id).toBeDefined();
      expect(outcome?.agent_id).toBeDefined();
      expect(outcome?.duration_ms).toBeGreaterThan(0);
    },
    { timeout: 200000 },
  );

  test(
    "multi-turn conversation maintains context",
    async () => {
      // Two sequential LLM calls, so the helper gets a longer (300s) budget.
      const turns = [userLine("Say hello"), userLine("Say goodbye")];
      const wire = (await runBidirectional(turns, [], 300000)) as WireMessage[];

      // At least one result per turn, all successful.
      const outcomes = wire.filter(isResult);
      expect(outcomes.length).toBeGreaterThanOrEqual(2);
      for (const outcome of outcomes) {
        expect(outcome.subtype).toBe("success");
        expect(outcome.session_id).toBeDefined();
        expect(outcome.agent_id).toBeDefined();
      }

      // Same agent across turns => same session_id on first and last result.
      const first = outcomes[0];
      const last = outcomes[outcomes.length - 1];
      expect(first).toBeDefined();
      expect(last).toBeDefined();
      if (first && last) {
        expect(first.session_id).toBe(last.session_id);
      }
    },
    { timeout: 320000 },
  );

  test(
    "interrupt control request is acknowledged",
    async () => {
      const wire = (await runBidirectional([
        controlLine("int_1", "interrupt"),
      ])) as WireMessage[];

      const ack = wire.find(isControlResponseFor("int_1"));
      expect(ack).toBeDefined();
      expect(ack?.response.subtype).toBe("success");
    },
    { timeout: 200000 },
  );

  test(
    "--include-partial-messages emits stream_event in bidirectional mode",
    async () => {
      const wire = (await runBidirectional(
        [userLine(FAST_PROMPT)],
        ["--include-partial-messages"],
      )) as WireMessage[];

      // Chunks must arrive as stream_event (not just "message").
      const events = wire.filter(
        (m): m is StreamEvent => m.type === "stream_event",
      );
      expect(events.length).toBeGreaterThan(0);

      // Every stream_event has a payload and session_id; uuid is only present
      // on content events, not on meta events (stop_reason, usage_statistics).
      for (const e of events) {
        expect(e.event).toBeDefined();
        expect(e.session_id).toBeDefined();
      }
      const withContent = events.filter(
        (e) => "message_type" in e.event && isContentType(e.event.message_type),
      );
      for (const e of withContent) {
        expect(e.uuid).toBeDefined();
      }

      // A result is still emitted at the end of the turn.
      const outcome = wire.find(isResult);
      expect(outcome).toBeDefined();
      expect(outcome?.subtype).toBe("success");
    },
    { timeout: 200000 },
  );

  test(
    "unknown control request returns error",
    async () => {
      const wire = (await runBidirectional([
        controlLine("unknown_1", "unknown_subtype"),
      ])) as WireMessage[];

      const ack = wire.find(isControlResponseFor("unknown_1"));
      expect(ack).toBeDefined();
      expect(ack?.response.subtype).toBe("error");
    },
    { timeout: 200000 },
  );

  test(
    "invalid JSON input returns error message",
    async () => {
      // A raw, non-JSON line is written to stdin on purpose.
      const wire = (await runBidirectional([
        "not valid json",
      ])) as WireMessage[];

      const failure = wire.find((m): m is ErrorMessage => m.type === "error");
      expect(failure).toBeDefined();
      expect(failure?.message).toContain("Invalid JSON");
    },
    { timeout: 200000 },
  );

  test(
    "Task tool with explore subagent works",
    async () => {
      // Prescriptive prompt so the Task tool is actually used.
      const taskPrompt =
        "You MUST use the Task tool with subagent_type='explore' to find TypeScript files (*.ts) in the src directory. " +
        "Return only the subagent's report, nothing else.";

      // 5 min helper timeout - subagent spawn + execution can be slow.
      const wire = (await runBidirectional(
        [userLine(taskPrompt)],
        [],
        300000,
      )) as WireMessage[];

      const outcome = wire.find(isResult);
      expect(outcome).toBeDefined();
      expect(outcome?.subtype).toBe("success");

      // The Task tool is auto-approved via --yolo, which shows up as
      // auto_approval events.
      const approvals = wire.filter((m) => m.type === "auto_approval");
      expect(approvals.length).toBeGreaterThan(0);
    },
    { timeout: 320000 },
  );
});

View File

@@ -1,223 +0,0 @@
import { describe, expect, test } from "bun:test";
import { spawn } from "node:child_process";
import type {
ResultMessage,
StreamEvent,
SystemInitMessage,
} from "../types/protocol";
/**
* Tests for stream-json output format.
* These verify the message structure matches the wire format types.
*/
/**
 * Run the CLI once in headless mode (`bun run dev --new-agent -p <prompt>
 * --output-format stream-json --yolo -m haiku`) and collect its output.
 *
 * @param prompt - Prompt passed to the CLI via `-p`.
 * @param extraArgs - Extra CLI flags appended after the fixed argument list.
 * @param timeoutMs - Safety timeout; when it elapses the child is killed and
 *   the promise rejects with the stderr captured so far.
 * @returns The non-empty stdout lines that parse as JSON, in output order.
 * @throws Rejects when the child cannot be spawned, times out, or exits
 *   non-zero without having printed a result message.
 */
async function runHeadlessCommand(
  prompt: string,
  extraArgs: string[] = [],
  timeoutMs = 180000, // 180s timeout - CI can be very slow
): Promise<string[]> {
  return new Promise((resolve, reject) => {
    const proc = spawn(
      "bun",
      [
        "run",
        "dev",
        "--new-agent",
        "-p",
        prompt,
        "--output-format",
        "stream-json",
        "--yolo",
        "-m",
        "haiku",
        ...extraArgs,
      ],
      {
        cwd: process.cwd(),
        // Mark as subagent to prevent polluting user's LRU settings
        env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
      },
    );

    let stdout = "";
    let stderr = "";
    proc.stdout.on("data", (data) => {
      stdout += data.toString();
    });
    proc.stderr.on("data", (data) => {
      stderr += data.toString();
    });

    // Safety timeout for CI
    const timeout = setTimeout(() => {
      proc.kill();
      reject(new Error(`Process timeout after ${timeoutMs}ms: ${stderr}`));
    }, timeoutMs);

    // Spawn failures are reported via the 'error' event; without a listener
    // they would surface as an uncaught exception instead of a rejection.
    proc.on("error", (err) => {
      clearTimeout(timeout);
      reject(err);
    });

    proc.on("close", (code) => {
      clearTimeout(timeout);
      // A non-zero exit is tolerated as long as a result message was printed.
      if (code !== 0 && !stdout.includes('"type":"result"')) {
        reject(new Error(`Process exited with code ${code}: ${stderr}`));
        return;
      }
      // Keep only the line-delimited JSON; anything else on stdout is dropped.
      const lines = stdout
        .split("\n")
        .filter((line) => line.trim())
        .filter((line) => {
          try {
            JSON.parse(line);
            return true;
          } catch {
            return false;
          }
        });
      resolve(lines);
    });
  });
}
/**
 * Prescriptive prompt intended to produce a single-step response without tool use.
 * Passed as the headless prompt by every test in this file.
 */
const FAST_PROMPT =
"This is a test. Do not call any tools. Just respond with the word OK and nothing else.";
// Structural checks on the line-delimited JSON printed in headless
// stream-json mode. Each test runs the CLI once via runHeadlessCommand.
describe("stream-json format", () => {
  /** First output line whose parsed `type` equals `type`, if any. */
  const firstLineOfType = (lines: string[], type: string) =>
    lines.find((line) => JSON.parse(line).type === type);

  /** All output lines whose parsed `type` equals `type`. */
  const linesOfType = (lines: string[], type: string) =>
    lines.filter((line) => JSON.parse(line).type === type);

  test(
    "init message has type 'system' with subtype 'init'",
    async () => {
      const output = await runHeadlessCommand(FAST_PROMPT);
      const rawInit = output.find((line) => {
        const parsed = JSON.parse(line);
        return parsed.type === "system" && parsed.subtype === "init";
      });
      expect(rawInit).toBeDefined();
      if (!rawInit) throw new Error("initLine not found");

      const init = JSON.parse(rawInit) as SystemInitMessage;
      expect(init.type).toBe("system");
      expect(init.subtype).toBe("init");
      expect(init.agent_id).toBeDefined();
      // session_id should equal agent_id
      expect(init.session_id).toBe(init.agent_id);
      expect(init.model).toBeDefined();
      expect(init.tools).toBeInstanceOf(Array);
      expect(init.cwd).toBeDefined();
      expect(init.uuid).toBe(`init-${init.agent_id}`);
    },
    { timeout: 200000 },
  );

  test(
    "messages have session_id and uuid",
    async () => {
      const output = await runHeadlessCommand(FAST_PROMPT);
      const rawMessage = firstLineOfType(output, "message");
      expect(rawMessage).toBeDefined();
      if (!rawMessage) throw new Error("messageLine not found");

      const message = JSON.parse(rawMessage) as {
        session_id: string;
        uuid: string;
      };
      expect(message.session_id).toBeDefined();
      expect(message.uuid).toBeDefined();
      // uuid should be otid or id from the Letta SDK chunk
      expect(message.uuid).toBeTruthy();
    },
    { timeout: 200000 },
  );

  test(
    "result message has correct format",
    async () => {
      const output = await runHeadlessCommand(FAST_PROMPT);
      const rawResult = firstLineOfType(output, "result");
      expect(rawResult).toBeDefined();
      if (!rawResult) throw new Error("resultLine not found");

      const outcome = JSON.parse(rawResult) as ResultMessage & { uuid: string };
      expect(outcome.type).toBe("result");
      expect(outcome.subtype).toBe("success");
      expect(outcome.session_id).toBeDefined();
      expect(outcome.agent_id).toBeDefined();
      expect(outcome.session_id).toBe(outcome.agent_id);
      expect(outcome.duration_ms).toBeGreaterThan(0);
      expect(outcome.uuid).toContain("result-");
      expect(outcome.result).toBeDefined();
    },
    { timeout: 200000 },
  );

  test(
    "--include-partial-messages wraps chunks in stream_event",
    async () => {
      const output = await runHeadlessCommand(FAST_PROMPT, [
        "--include-partial-messages",
      ]);
      const rawEvent = firstLineOfType(output, "stream_event");
      expect(rawEvent).toBeDefined();
      if (!rawEvent) throw new Error("streamEventLine not found");

      const wrapped = JSON.parse(rawEvent) as StreamEvent;
      expect(wrapped.type).toBe("stream_event");
      expect(wrapped.event).toBeDefined();
      expect(wrapped.session_id).toBeDefined();
      expect(wrapped.uuid).toBeDefined();
      // The event should contain the original Letta SDK chunk
      expect("message_type" in wrapped.event).toBe(true);
    },
    { timeout: 200000 },
  );

  test(
    "without --include-partial-messages, messages are type 'message'",
    async () => {
      const output = await runHeadlessCommand(FAST_PROMPT);
      const plainCount = linesOfType(output, "message").length;
      const wrappedCount = linesOfType(output, "stream_event").length;

      // Normally some message lines are present (reasoning, assistant,
      // stop_reason, ...). With very fast responses only init + result may
      // appear, so the shape is asserted only when any content was emitted.
      const sawContent = plainCount > 0 || wrappedCount > 0;
      if (sawContent) {
        expect(plainCount).toBeGreaterThan(0);
        expect(wrappedCount).toBe(0);
      }

      // A result line is required regardless.
      expect(firstLineOfType(output, "result")).toBeDefined();
    },
    { timeout: 200000 },
  );
});

View File

@@ -1,257 +0,0 @@
import { describe, expect, test } from "bun:test";
import { spawn } from "node:child_process";
/**
* Integration test for lazy approval recovery (LET-7101).
*
* NOTE: The lazy approval recovery is primarily designed for TUI mode where:
* 1. User has a session with pending approvals (e.g., from a previous run)
* 2. User sends a new message before responding to the approval
* 3. Server returns CONFLICT error
* 4. CLI recovers by auto-denying stale approvals and retrying
*
* In bidirectional mode, messages sent during permission wait are dropped
* (see headless.ts line 1710-1714), so we can't directly test the CONFLICT
* scenario here. This test validates that the flow doesn't crash when
* messages are sent while approvals are pending.
*
* The RecoveryMessage emission can be tested by:
* 1. Manual testing in TUI mode (start session with orphaned approval)
* 2. Or by modifying headless mode to not drop messages during permission wait
*/
/**
 * Prompt meant to trigger a Bash tool call that requires approval.
 * Sent as the first user message, right after the init message is received.
 */
const BASH_TRIGGER_PROMPT =
"Run this exact bash command: echo test123. Do not use any other tools.";
/**
 * Second user message, sent while the approval is still pending
 * (written to stdin shortly after the approval request is observed).
 */
const INTERRUPT_MESSAGE =
"Actually, just say OK instead. Do not call any tools.";
/**
 * Loose shape of one JSON message parsed from the CLI's stdout.
 * Only the fields this test inspects are named; everything else is reachable
 * through the index signature.
 */
interface StreamMessage {
// Top-level discriminator; this file checks for "system", "message", "recovery", "error" and "result".
type: string;
// Secondary discriminator; checked against "init" on system messages.
subtype?: string;
// Kind of a "message"; checked against "approval_request_message" and "error_message".
message_type?: string;
// NOTE(review): declared but not read anywhere in this file - confirm it is still needed.
stop_reason?: string;
// biome-ignore lint/suspicious/noExplicitAny: index signature for arbitrary JSON fields
[key: string]: any;
}
/**
* Run bidirectional test with custom message handling.
* Allows sending messages at specific points in the flow.
*/
/**
 * Drive the CLI in bidirectional stream-json mode WITHOUT `--yolo`, so that
 * tool calls require approval, and send a second user message while an
 * approval is pending.
 *
 * Flow:
 * 1. On the init message, send {@link BASH_TRIGGER_PROMPT}.
 * 2. On the first approval request, wait 500ms and send
 *    {@link INTERRUPT_MESSAGE} instead of an approval.
 * 3. Finish after two results, or after one result once a recovery message
 *    was seen.
 *
 * @param timeoutMs - Safety timeout; when it elapses the child is killed and
 *   the promise rejects (including the stderr captured so far).
 * @returns `messages`: every parsed stdout message; `success`: whether the
 *   flow finished (or, if the child exited early, whether at least one result
 *   was seen); `errorSeen`: despite its name, whether a
 *   `recovery`/`approval_pending` message was seen.
 * @throws Rejects on timeout or when the child cannot be spawned.
 */
async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{
  messages: StreamMessage[];
  success: boolean;
  errorSeen: boolean;
}> {
  return new Promise((resolve, reject) => {
    const proc = spawn(
      "bun",
      [
        "run",
        "dev",
        "-p",
        "--input-format",
        "stream-json",
        "--output-format",
        "stream-json",
        "--new-agent",
        "-m",
        "haiku",
        // NOTE: No --yolo flag - approvals are required
      ],
      {
        cwd: process.cwd(),
        // Mark as subagent to prevent polluting user's LRU settings
        env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
      },
    );

    const messages: StreamMessage[] = [];
    let buffer = "";
    let stderr = "";
    let initReceived = false;
    let approvalSeen = false;
    let interruptSent = false;
    let errorSeen = false;
    let resultCount = 0;
    let closing = false;

    const timeout = setTimeout(() => {
      if (!closing) {
        proc.kill();
        // Include captured stderr so CI timeouts are diagnosable.
        reject(
          new Error(`Test timed out after ${timeoutMs}ms. stderr: ${stderr}`),
        );
      }
    }, timeoutMs);

    /** Mark the run as finished and shut the child down after a grace period. */
    const cleanup = () => {
      closing = true;
      clearTimeout(timeout);
      setTimeout(() => {
        proc.stdin?.end();
        proc.kill();
      }, 500);
    };

    /** Write one user message line to the child's stdin. */
    const sendUserMessage = (content: string) => {
      const userMsg = JSON.stringify({
        type: "user",
        message: { role: "user", content },
      });
      proc.stdin?.write(`${userMsg}\n`);
    };

    const processLine = (line: string) => {
      if (!line.trim()) return;
      try {
        const msg: StreamMessage = JSON.parse(line);
        messages.push(msg);

        // Debug output
        if (process.env.DEBUG_TEST) {
          console.log("MSG:", JSON.stringify(msg, null, 2));
        }

        // Step 1: Wait for init, then send bash trigger prompt
        if (msg.type === "system" && msg.subtype === "init" && !initReceived) {
          initReceived = true;
          sendUserMessage(BASH_TRIGGER_PROMPT);
          return;
        }

        // Step 2: When we see approval request, send another user message instead
        if (
          msg.type === "message" &&
          msg.message_type === "approval_request_message" &&
          !approvalSeen
        ) {
          approvalSeen = true;
          // Wait a moment, then send interrupt message (NOT an approval)
          setTimeout(() => {
            if (!interruptSent) {
              interruptSent = true;
              sendUserMessage(INTERRUPT_MESSAGE);
            }
          }, 500);
          return;
        }

        // Track recovery messages - this is the key signal that lazy recovery worked
        if (
          msg.type === "recovery" &&
          msg.recovery_type === "approval_pending"
        ) {
          errorSeen = true; // reusing this flag to mean "recovery message seen"
        }

        // Also track raw errors (shouldn't see these if recovery works properly)
        if (
          msg.type === "error" ||
          (msg.type === "message" && msg.message_type === "error_message")
        ) {
          // Coerce to string: the fields are untyped JSON, and a non-string
          // value would otherwise throw and be swallowed by the catch below.
          const detail = String(msg.detail || msg.message || "");
          if (detail.toLowerCase().includes("cannot send a new message")) {
            // Raw error leaked through - recovery may have failed
            console.log(
              "WARNING: Raw CONFLICT error seen (recovery may have failed)",
            );
          }
        }

        // Track results - we need 2 (one for each user message, though first may fail)
        if (msg.type === "result") {
          resultCount++;
          // After second result (or after seeing error + result), we're done
          if (resultCount >= 2 || (errorSeen && resultCount >= 1)) {
            cleanup();
            resolve({ messages, success: true, errorSeen });
          }
        }
      } catch {
        // Not valid JSON, ignore
      }
    };

    proc.stdout?.on("data", (data) => {
      buffer += data.toString();
      const lines = buffer.split("\n");
      buffer = lines.pop() || ""; // Keep incomplete line in buffer
      for (const line of lines) {
        processLine(line);
      }
    });

    proc.stderr?.on("data", (data) => {
      stderr += data.toString();
    });

    proc.on("close", (_code) => {
      clearTimeout(timeout);
      // Process any remaining buffer
      if (buffer.trim()) {
        processLine(buffer);
      }
      if (!closing) {
        // Child exited before the flow completed: report what was collected.
        resolve({
          messages,
          success: resultCount > 0,
          errorSeen,
        });
      }
    });

    proc.on("error", (err) => {
      clearTimeout(timeout);
      reject(err);
    });
  });
}
// Exercises the "user sends a message while an approval is pending" flow via
// runLazyRecoveryTest and checks that it completes.
describe("lazy approval recovery", () => {
  test("handles concurrent message while approval is pending", async () => {
    const { messages, success, errorSeen } = await runLazyRecoveryTest();

    // Dump the transcript when the run did not succeed, to aid debugging.
    if (!success) {
      console.log("All messages received:");
      messages.forEach((entry) => {
        console.log(JSON.stringify(entry, null, 2));
      });
    }

    // An approval request proves a tool requiring approval was called.
    const pendingApproval = messages.find(
      (entry) => entry.message_type === "approval_request_message",
    );
    expect(pendingApproval).toBeDefined();

    // The flow itself must have completed.
    expect(success).toBe(true);

    // At least one result is expected (the second message should always complete).
    const completedTurns = messages.filter(
      (entry) => entry.type === "result",
    ).length;
    expect(completedTurns).toBeGreaterThanOrEqual(1);

    // KEY ASSERTION: a recovery message shows the lazy recovery mechanism ran.
    const recovery = messages.find(
      (entry) =>
        entry.type === "recovery" && entry.recovery_type === "approval_pending",
    );
    if (!recovery) {
      // Timing-dependent: the approval may be handled before the second
      // message conflicts. The flow is still validated in that case.
      console.log(
        "Note: No recovery message seen - approval may have been handled before conflict",
      );
      return;
    }
    console.log("Recovery message detected - lazy recovery worked correctly");
    // The helper sets this flag when it sees the recovery message.
    expect(errorSeen).toBe(true);
  }, 180000); // 3 minute timeout for CI
});

View File

@@ -2,22 +2,14 @@ import { describe, expect, test } from "bun:test";
import { spawn } from "node:child_process";
/**
* Integration tests for CLI startup flows.
* Startup flow tests that validate flag conflict handling.
*
* These tests verify the boot flow decision tree:
* - Flag conflict detection
* - --conversation: derives agent from conversation
* - --agent: uses specified agent
* - --new-agent: creates new agent
* - Error messages for invalid inputs
*
* Note: Tests that depend on settings files (.letta/) are harder to isolate
* because the CLI uses process.cwd(). For now, we focus on flag-based tests.
* These must remain runnable in fork PR CI (no secrets), so they should not
* require a working Letta server or LETTA_API_KEY.
*/
const projectRoot = process.cwd();
// Helper to run CLI and capture output
async function runCli(
args: string[],
options: {
@@ -74,10 +66,6 @@ async function runCli(
});
}
// ============================================================================
// Flag Conflict Tests (fast, no API calls needed)
// ============================================================================
describe("Startup Flow - Flag Conflicts", () => {
test("--conversation conflicts with --agent", async () => {
const result = await runCli(
@@ -136,227 +124,3 @@ describe("Startup Flow - Flag Conflicts", () => {
);
});
});
// ============================================================================
// Invalid Input Tests (require API calls but fail fast)
// ============================================================================
// Each case passes a reference that does not exist and expects the CLI to
// exit with code 1 and mention "not found" on stderr.
describe("Startup Flow - Invalid Inputs", () => {
  test(
    "--agent with nonexistent ID shows error",
    async () => {
      const cliArgs = [
        "--agent",
        "agent-definitely-does-not-exist-12345",
        "-p",
        "test",
      ];
      const { stderr } = await runCli(cliArgs, {
        expectExit: 1,
        timeoutMs: 60000,
      });
      expect(stderr).toContain("not found");
    },
    { timeout: 70000 },
  );

  test(
    "--conversation with nonexistent ID shows error",
    async () => {
      const cliArgs = [
        "--conversation",
        "conversation-definitely-does-not-exist-12345",
        "-p",
        "test",
      ];
      const { stderr } = await runCli(cliArgs, {
        expectExit: 1,
        timeoutMs: 60000,
      });
      expect(stderr).toContain("not found");
    },
    { timeout: 70000 },
  );

  test("--from-af with nonexistent file shows error", async () => {
    const cliArgs = ["--from-af", "/nonexistent/path/agent.af", "-p", "test"];
    const { stderr } = await runCli(cliArgs, { expectExit: 1 });
    expect(stderr).toContain("not found");
  });
});
// ============================================================================
// Integration Tests (require API access, create real agents)
// ============================================================================
// Creates real agents and conversations through the CLI and checks that the
// startup flags select them as expected.
describe("Startup Flow - Integration", () => {
  // Agent created by the first test and reused by the later ones.
  let testAgentId: string | null = null;

  const MODEL_ARGS = ["-m", "haiku"];
  const JSON_OUTPUT_ARGS = ["--output-format", "json"];

  /**
   * stdout includes the bun invocation line before the payload, so parsing
   * starts at the first "{".
   */
  const parseJsonOutput = (stdout: string) =>
    JSON.parse(stdout.slice(stdout.indexOf("{")));

  test(
    "--new-agent creates agent and responds",
    async () => {
      const run = await runCli(
        [
          "--new-agent",
          ...MODEL_ARGS,
          "-p",
          "Say OK and nothing else",
          ...JSON_OUTPUT_ARGS,
        ],
        { timeoutMs: 120000 },
      );
      expect(run.exitCode).toBe(0);

      const payload = parseJsonOutput(run.stdout);
      expect(payload.agent_id).toBeDefined();
      expect(payload.result).toBeDefined();

      // Remember the agent for the tests that follow.
      testAgentId = payload.agent_id;
    },
    { timeout: 130000 },
  );

  test(
    "--agent with valid ID uses that agent",
    async () => {
      // Nothing to do when the previous test did not create an agent.
      if (!testAgentId) {
        console.log("Skipping: no test agent available");
        return;
      }

      const run = await runCli(
        [
          "--agent",
          testAgentId,
          ...MODEL_ARGS,
          "-p",
          "Say OK",
          ...JSON_OUTPUT_ARGS,
        ],
        { timeoutMs: 120000 },
      );
      expect(run.exitCode).toBe(0);

      const payload = parseJsonOutput(run.stdout);
      expect(payload.agent_id).toBe(testAgentId);
    },
    { timeout: 130000 },
  );

  test(
    "--conversation with valid ID derives agent and uses conversation",
    async () => {
      // Nothing to do when the first test did not create an agent.
      if (!testAgentId) {
        console.log("Skipping: no test agent available");
        return;
      }

      // Create a real conversation with --new (since --new-agent uses "default").
      const creation = await runCli(
        [
          "--agent",
          testAgentId,
          "--new",
          ...MODEL_ARGS,
          "-p",
          "Say CREATED",
          ...JSON_OUTPUT_ARGS,
        ],
        { timeoutMs: 120000 },
      );
      expect(creation.exitCode).toBe(0);

      const conversationId = parseJsonOutput(creation.stdout).conversation_id;
      expect(conversationId).toBeDefined();
      expect(conversationId).not.toBe("default");

      // --conversation alone must be enough to derive the owning agent.
      const run = await runCli(
        [
          "--conversation",
          conversationId,
          ...MODEL_ARGS,
          "-p",
          "Say OK",
          ...JSON_OUTPUT_ARGS,
        ],
        { timeoutMs: 120000 },
      );
      expect(run.exitCode).toBe(0);

      const payload = parseJsonOutput(run.stdout);
      // Same agent that owns the conversation...
      expect(payload.agent_id).toBe(testAgentId);
      // ...and the conversation that was asked for.
      expect(payload.conversation_id).toBe(conversationId);
    },
    { timeout: 180000 },
  );

  test(
    "--new-agent with --init-blocks none creates minimal agent",
    async () => {
      const run = await runCli(
        [
          "--new-agent",
          "--init-blocks",
          "none",
          ...MODEL_ARGS,
          "-p",
          "Say OK",
          ...JSON_OUTPUT_ARGS,
        ],
        { timeoutMs: 120000 },
      );
      expect(run.exitCode).toBe(0);

      const payload = parseJsonOutput(run.stdout);
      expect(payload.agent_id).toBeDefined();
    },
    { timeout: 130000 },
  );
});
// ============================================================================
// --continue Tests (depend on LRU state, harder to isolate)
// ============================================================================
// --continue depends on whatever LRU state the working directory has, so the
// single test here accepts both outcomes.
describe("Startup Flow - Continue Flag", () => {
  test(
    "--continue with no LRU shows error",
    async () => {
      // Ideally this runs in a directory with no .letta/ settings. In
      // practice the project's own .letta/ may provide an LRU, so a
      // successful run is accepted as well as the specific error.
      const cliArgs = ["--continue", "-p", "Say OK", "--output-format", "json"];
      const { exitCode, stderr } = await runCli(cliArgs, { timeoutMs: 60000 });

      // Success means the test environment has an LRU; nothing more to check.
      if (exitCode === 0) return;

      // Otherwise the failure must be the "no session" error.
      expect(stderr).toContain("No recent session found");
    },
    { timeout: 70000 },
  );
});