ci(test): split unit vs API integration tests (#797)

2026-02-03 19:09:40 -06:00
parent 55a31adae7
commit d175b0e155
6 changed files with 292 additions and 240 deletions
--- a/src/tests/headless-input-format.test.ts
+++ b/src/tests/headless-input-format.test.ts
@@ -1,491 +0,0 @@
-import { describe, expect, test } from "bun:test";
-import { spawn } from "node:child_process";
-import type {
-  ControlResponse,
-  ErrorMessage,
-  ResultMessage,
-  StreamEvent,
-  SystemInitMessage,
-  WireMessage,
-} from "../types/protocol";
-
-/**
- * Tests for --input-format stream-json bidirectional communication.
- * These verify the CLI's wire format for bidirectional communication.
- */
-
-// Prescriptive prompt to ensure single-step response without tool use
-const FAST_PROMPT =
-  "This is a test. Do not call any tools. Just respond with the word OK and nothing else.";
-
-/**
- * Helper to run bidirectional commands with stdin input.
- * Event-driven: waits for init message before sending input, waits for result before closing.
- */
-async function runBidirectional(
-  inputs: string[],
-  extraArgs: string[] = [],
-  timeoutMs = 180000, // 180s timeout - CI can be very slow
-): Promise<object[]> {
-  return new Promise((resolve, reject) => {
-    const proc = spawn(
-      "bun",
-      [
-        "run",
-        "dev",
-        "-p",
-        "--input-format",
-        "stream-json",
-        "--output-format",
-        "stream-json",
-        "--new-agent",
-        "-m",
-        "haiku",
-        "--yolo",
-        ...extraArgs,
-      ],
-      {
-        cwd: process.cwd(),
-        // Mark as subagent to prevent polluting user's LRU settings
-        env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
-      },
-    );
-
-    const objects: object[] = [];
-    let buffer = "";
-    let inputIndex = 0;
-    let initReceived = false;
-    let closing = false;
-
-    // Count expected responses based on input types
-    const inputTypes = inputs.map((i) => {
-      try {
-        const parsed = JSON.parse(i);
-        return parsed.type;
-      } catch {
-        return "invalid"; // Invalid JSON
-      }
-    });
-    const expectedUserResults = inputTypes.filter((t) => t === "user").length;
-    const expectedControlResponses = inputTypes.filter(
-      (t) => t === "control_request",
-    ).length;
-    const hasInvalidInput = inputTypes.includes("invalid");
-
-    let userResultsReceived = 0;
-    let controlResponsesReceived = 0;
-
-    const maybeClose = () => {
-      if (closing) return;
-
-      // For invalid input, close after receiving error
-      // For control requests only, close after all control_responses
-      // For user messages, close after all results
-      // For mixed, close when we have all expected responses
-
-      const allUserResultsDone =
-        expectedUserResults === 0 || userResultsReceived >= expectedUserResults;
-      const allControlResponsesDone =
-        expectedControlResponses === 0 ||
-        controlResponsesReceived >= expectedControlResponses;
-      const allInputsSent = inputIndex >= inputs.length;
-
-      if (allInputsSent && allUserResultsDone && allControlResponsesDone) {
-        closing = true;
-        setTimeout(() => proc.stdin?.end(), 500);
-      }
-    };
-
-    const processLine = (line: string) => {
-      if (!line.trim()) return;
-      try {
-        const obj = JSON.parse(line);
-        objects.push(obj);
-
-        // Check for init message - signal to start sending inputs
-        if (obj.type === "system" && obj.subtype === "init" && !initReceived) {
-          initReceived = true;
-          sendNextInput();
-        }
-
-        // Check for control_response
-        if (obj.type === "control_response") {
-          controlResponsesReceived++;
-          maybeClose();
-        }
-
-        // Check for result message
-        if (obj.type === "result") {
-          userResultsReceived++;
-          // If more inputs to send, send next after a brief delay
-          // This gives the CLI time to be ready for the next input
-          if (inputIndex < inputs.length) {
-            setTimeout(sendNextInput, 200);
-          }
-          // Always check if we should close (might have received all expected results)
-          maybeClose();
-        }
-
-        // Check for error message (for invalid JSON input test)
-        if (obj.type === "error" && hasInvalidInput) {
-          closing = true;
-          setTimeout(() => proc.stdin?.end(), 500);
-        }
-      } catch {
-        // Not valid JSON, ignore
-      }
-    };
-
-    const sendNextInput = () => {
-      if (inputIndex < inputs.length) {
-        proc.stdin?.write(`${inputs[inputIndex]}\n`);
-        inputIndex++;
-      }
-    };
-
-    proc.stdout?.on("data", (data) => {
-      buffer += data.toString();
-      const lines = buffer.split("\n");
-      buffer = lines.pop() || ""; // Keep incomplete line in buffer
-      for (const line of lines) {
-        processLine(line);
-      }
-    });
-
-    let stderr = "";
-    proc.stderr?.on("data", (data) => {
-      stderr += data.toString();
-    });
-
-    proc.on("close", (code) => {
-      // Process any remaining buffer
-      if (buffer.trim()) {
-        processLine(buffer);
-      }
-
-      // Check if we got enough results
-      const gotExpectedResults =
-        userResultsReceived >= expectedUserResults &&
-        controlResponsesReceived >= expectedControlResponses;
-
-      if (objects.length === 0 && code !== 0) {
-        reject(
-          new Error(
-            `Process exited with code ${code}, no output received. stderr: ${stderr}`,
-          ),
-        );
-      } else if (!gotExpectedResults && code !== 0) {
-        reject(
-          new Error(
-            `Process exited with code ${code} before all results received. ` +
-              `Got ${userResultsReceived}/${expectedUserResults} user results, ` +
-              `${controlResponsesReceived}/${expectedControlResponses} control responses. ` +
-              `inputIndex: ${inputIndex}, initReceived: ${initReceived}. stderr: ${stderr}`,
-          ),
-        );
-      } else {
-        resolve(objects);
-      }
-    });
-
-    // Safety timeout
-    const timeout = setTimeout(() => {
-      proc.kill();
-      reject(
-        new Error(
-          `Timeout after ${timeoutMs}ms. Received ${objects.length} objects, init: ${initReceived}, userResults: ${userResultsReceived}/${expectedUserResults}, controlResponses: ${controlResponsesReceived}/${expectedControlResponses}`,
-        ),
-      );
-    }, timeoutMs);
-
-    proc.on("close", () => clearTimeout(timeout));
-  });
-}
-
-describe("input-format stream-json", () => {
-  test(
-    "initialize control request returns session info",
-    async () => {
-      const objects = (await runBidirectional([
-        JSON.stringify({
-          type: "control_request",
-          request_id: "init_1",
-          request: { subtype: "initialize" },
-        }),
-      ])) as WireMessage[];
-
-      // Should have init event
-      const initEvent = objects.find(
-        (o): o is SystemInitMessage =>
-          o.type === "system" && "subtype" in o && o.subtype === "init",
-      );
-      expect(initEvent).toBeDefined();
-      expect(initEvent?.agent_id).toBeDefined();
-      expect(initEvent?.session_id).toBeDefined();
-      expect(initEvent?.model).toBeDefined();
-      expect(initEvent?.tools).toBeInstanceOf(Array);
-
-      // Should have control_response
-      const controlResponse = objects.find(
-        (o): o is ControlResponse => o.type === "control_response",
-      );
-      expect(controlResponse).toBeDefined();
-      expect(controlResponse?.response.subtype).toBe("success");
-      expect(controlResponse?.response.request_id).toBe("init_1");
-      if (controlResponse?.response.subtype === "success") {
-        const initResponse = controlResponse.response.response as
-          | { agent_id?: string }
-          | undefined;
-        expect(initResponse?.agent_id).toBeDefined();
-      }
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "user message returns assistant response and result",
-    async () => {
-      const objects = (await runBidirectional([
-        JSON.stringify({
-          type: "user",
-          message: { role: "user", content: FAST_PROMPT },
-        }),
-      ])) as WireMessage[];
-
-      // Should have init event
-      const initEvent = objects.find(
-        (o): o is SystemInitMessage =>
-          o.type === "system" && "subtype" in o && o.subtype === "init",
-      );
-      expect(initEvent).toBeDefined();
-
-      // Should have message events
-      const messageEvents = objects.filter(
-        (o): o is WireMessage & { type: "message" } => o.type === "message",
-      );
-      expect(messageEvents.length).toBeGreaterThan(0);
-
-      // All messages should have session_id
-      // uuid is present on content messages (reasoning, assistant) but not meta messages (stop_reason, usage_statistics)
-      for (const msg of messageEvents) {
-        expect(msg.session_id).toBeDefined();
-      }
-
-      // Content messages should have uuid
-      const contentMessages = messageEvents.filter(
-        (m) =>
-          "message_type" in m &&
-          (m.message_type === "reasoning_message" ||
-            m.message_type === "assistant_message"),
-      );
-      for (const msg of contentMessages) {
-        expect(msg.uuid).toBeDefined();
-      }
-
-      // Should have result
-      const result = objects.find(
-        (o): o is ResultMessage => o.type === "result",
-      );
-      expect(result).toBeDefined();
-      expect(result?.subtype).toBe("success");
-      expect(result?.session_id).toBeDefined();
-      expect(result?.agent_id).toBeDefined();
-      expect(result?.duration_ms).toBeGreaterThan(0);
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "multi-turn conversation maintains context",
-    async () => {
-      // Multi-turn test needs 2 sequential LLM calls, so allow more time
-      const objects = (await runBidirectional(
-        [
-          JSON.stringify({
-            type: "user",
-            message: {
-              role: "user",
-              content: "Say hello",
-            },
-          }),
-          JSON.stringify({
-            type: "user",
-            message: {
-              role: "user",
-              content: "Say goodbye",
-            },
-          }),
-        ],
-        [], // no extra args
-        300000, // 300s for 2 sequential LLM calls - CI can be very slow
-      )) as WireMessage[];
-
-      // Should have at least two results (one per turn)
-      const results = objects.filter(
-        (o): o is ResultMessage => o.type === "result",
-      );
-      expect(results.length).toBeGreaterThanOrEqual(2);
-
-      // Both results should be successful
-      for (const result of results) {
-        expect(result.subtype).toBe("success");
-        expect(result.session_id).toBeDefined();
-        expect(result.agent_id).toBeDefined();
-      }
-
-      // The session_id should be consistent across turns (same agent)
-      const firstResult = results[0];
-      const lastResult = results[results.length - 1];
-      expect(firstResult).toBeDefined();
-      expect(lastResult).toBeDefined();
-      if (firstResult && lastResult) {
-        expect(firstResult.session_id).toBe(lastResult.session_id);
-      }
-    },
-    { timeout: 320000 },
-  );
-
-  test(
-    "interrupt control request is acknowledged",
-    async () => {
-      const objects = (await runBidirectional([
-        JSON.stringify({
-          type: "control_request",
-          request_id: "int_1",
-          request: { subtype: "interrupt" },
-        }),
-      ])) as WireMessage[];
-
-      // Should have control_response for interrupt
-      const controlResponse = objects.find(
-        (o): o is ControlResponse =>
-          o.type === "control_response" && o.response?.request_id === "int_1",
-      );
-      expect(controlResponse).toBeDefined();
-      expect(controlResponse?.response.subtype).toBe("success");
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "--include-partial-messages emits stream_event in bidirectional mode",
-    async () => {
-      const objects = (await runBidirectional(
-        [
-          JSON.stringify({
-            type: "user",
-            message: { role: "user", content: FAST_PROMPT },
-          }),
-        ],
-        ["--include-partial-messages"],
-      )) as WireMessage[];
-
-      // Should have stream_event messages (not just "message" type)
-      const streamEvents = objects.filter(
-        (o): o is StreamEvent => o.type === "stream_event",
-      );
-      expect(streamEvents.length).toBeGreaterThan(0);
-
-      // Each stream_event should have the event payload and session_id
-      // uuid is present on content events but not meta events (stop_reason, usage_statistics)
-      for (const event of streamEvents) {
-        expect(event.event).toBeDefined();
-        expect(event.session_id).toBeDefined();
-      }
-
-      // Content events should have uuid
-      const contentEvents = streamEvents.filter(
-        (e) =>
-          "message_type" in e.event &&
-          (e.event.message_type === "reasoning_message" ||
-            e.event.message_type === "assistant_message"),
-      );
-      for (const event of contentEvents) {
-        expect(event.uuid).toBeDefined();
-      }
-
-      // Should still have result
-      const result = objects.find(
-        (o): o is ResultMessage => o.type === "result",
-      );
-      expect(result).toBeDefined();
-      expect(result?.subtype).toBe("success");
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "unknown control request returns error",
-    async () => {
-      const objects = (await runBidirectional([
-        JSON.stringify({
-          type: "control_request",
-          request_id: "unknown_1",
-          request: { subtype: "unknown_subtype" },
-        }),
-      ])) as WireMessage[];
-
-      // Should have control_response with error
-      const controlResponse = objects.find(
-        (o): o is ControlResponse =>
-          o.type === "control_response" &&
-          o.response?.request_id === "unknown_1",
-      );
-      expect(controlResponse).toBeDefined();
-      expect(controlResponse?.response.subtype).toBe("error");
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "invalid JSON input returns error message",
-    async () => {
-      // Use raw string instead of JSON
-      const objects = (await runBidirectional([
-        "not valid json",
-      ])) as WireMessage[];
-
-      // Should have error message
-      const errorMsg = objects.find(
-        (o): o is ErrorMessage => o.type === "error",
-      );
-      expect(errorMsg).toBeDefined();
-      expect(errorMsg?.message).toContain("Invalid JSON");
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "Task tool with explore subagent works",
-    async () => {
-      // Prescriptive prompt to ensure Task tool is used
-      const objects = (await runBidirectional(
-        [
-          JSON.stringify({
-            type: "user",
-            message: {
-              role: "user",
-              content:
-                "You MUST use the Task tool with subagent_type='explore' to find TypeScript files (*.ts) in the src directory. " +
-                "Return only the subagent's report, nothing else.",
-            },
-          }),
-        ],
-        [],
-        300000, // 5 min timeout - subagent spawn + execution can be slow
-      )) as WireMessage[];
-
-      // Should have a successful result
-      const result = objects.find(
-        (o): o is ResultMessage => o.type === "result",
-      );
-      expect(result).toBeDefined();
-      expect(result?.subtype).toBe("success");
-
-      // Should have auto_approval events (Task tool was auto-approved via --yolo)
-      const autoApprovals = objects.filter((o) => o.type === "auto_approval");
-      expect(autoApprovals.length).toBeGreaterThan(0);
-    },
-    { timeout: 320000 },
-  );
-});
--- a/src/tests/headless-stream-json-format.test.ts
+++ b/src/tests/headless-stream-json-format.test.ts
@@ -1,223 +0,0 @@
-import { describe, expect, test } from "bun:test";
-import { spawn } from "node:child_process";
-import type {
-  ResultMessage,
-  StreamEvent,
-  SystemInitMessage,
-} from "../types/protocol";
-
-/**
- * Tests for stream-json output format.
- * These verify the message structure matches the wire format types.
- */
-
-async function runHeadlessCommand(
-  prompt: string,
-  extraArgs: string[] = [],
-  timeoutMs = 180000, // 180s timeout - CI can be very slow
-): Promise<string[]> {
-  return new Promise((resolve, reject) => {
-    const proc = spawn(
-      "bun",
-      [
-        "run",
-        "dev",
-        "--new-agent",
-        "-p",
-        prompt,
-        "--output-format",
-        "stream-json",
-        "--yolo",
-        "-m",
-        "haiku",
-        ...extraArgs,
-      ],
-      {
-        cwd: process.cwd(),
-        // Mark as subagent to prevent polluting user's LRU settings
-        env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
-      },
-    );
-
-    let stdout = "";
-    let stderr = "";
-
-    proc.stdout.on("data", (data) => {
-      stdout += data.toString();
-    });
-
-    proc.stderr.on("data", (data) => {
-      stderr += data.toString();
-    });
-
-    // Safety timeout for CI
-    const timeout = setTimeout(() => {
-      proc.kill();
-      reject(new Error(`Process timeout after ${timeoutMs}ms: ${stderr}`));
-    }, timeoutMs);
-
-    proc.on("close", (code) => {
-      clearTimeout(timeout);
-      if (code !== 0 && !stdout.includes('"type":"result"')) {
-        reject(new Error(`Process exited with code ${code}: ${stderr}`));
-      } else {
-        // Parse line-delimited JSON
-        const lines = stdout
-          .split("\n")
-          .filter((line) => line.trim())
-          .filter((line) => {
-            try {
-              JSON.parse(line);
-              return true;
-            } catch {
-              return false;
-            }
-          });
-        resolve(lines);
-      }
-    });
-  });
-}
-
-// Prescriptive prompt to ensure single-step response without tool use
-const FAST_PROMPT =
-  "This is a test. Do not call any tools. Just respond with the word OK and nothing else.";
-
-describe("stream-json format", () => {
-  test(
-    "init message has type 'system' with subtype 'init'",
-    async () => {
-      const lines = await runHeadlessCommand(FAST_PROMPT);
-      const initLine = lines.find((line) => {
-        const obj = JSON.parse(line);
-        return obj.type === "system" && obj.subtype === "init";
-      });
-
-      expect(initLine).toBeDefined();
-      if (!initLine) throw new Error("initLine not found");
-
-      const init = JSON.parse(initLine) as SystemInitMessage;
-      expect(init.type).toBe("system");
-      expect(init.subtype).toBe("init");
-      expect(init.agent_id).toBeDefined();
-      expect(init.session_id).toBe(init.agent_id); // session_id should equal agent_id
-      expect(init.model).toBeDefined();
-      expect(init.tools).toBeInstanceOf(Array);
-      expect(init.cwd).toBeDefined();
-      expect(init.uuid).toBe(`init-${init.agent_id}`);
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "messages have session_id and uuid",
-    async () => {
-      const lines = await runHeadlessCommand(FAST_PROMPT);
-
-      // Find a message line
-      const messageLine = lines.find((line) => {
-        const obj = JSON.parse(line);
-        return obj.type === "message";
-      });
-
-      expect(messageLine).toBeDefined();
-      if (!messageLine) throw new Error("messageLine not found");
-
-      const msg = JSON.parse(messageLine) as {
-        session_id: string;
-        uuid: string;
-      };
-      expect(msg.session_id).toBeDefined();
-      expect(msg.uuid).toBeDefined();
-      // uuid should be otid or id from the Letta SDK chunk
-      expect(msg.uuid).toBeTruthy();
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "result message has correct format",
-    async () => {
-      const lines = await runHeadlessCommand(FAST_PROMPT);
-      const resultLine = lines.find((line) => {
-        const obj = JSON.parse(line);
-        return obj.type === "result";
-      });
-
-      expect(resultLine).toBeDefined();
-      if (!resultLine) throw new Error("resultLine not found");
-
-      const result = JSON.parse(resultLine) as ResultMessage & { uuid: string };
-      expect(result.type).toBe("result");
-      expect(result.subtype).toBe("success");
-      expect(result.session_id).toBeDefined();
-      expect(result.agent_id).toBeDefined();
-      expect(result.session_id).toBe(result.agent_id);
-      expect(result.duration_ms).toBeGreaterThan(0);
-      expect(result.uuid).toContain("result-");
-      expect(result.result).toBeDefined();
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "--include-partial-messages wraps chunks in stream_event",
-    async () => {
-      const lines = await runHeadlessCommand(FAST_PROMPT, [
-        "--include-partial-messages",
-      ]);
-
-      // Find a stream_event line
-      const streamEventLine = lines.find((line) => {
-        const obj = JSON.parse(line);
-        return obj.type === "stream_event";
-      });
-
-      expect(streamEventLine).toBeDefined();
-      if (!streamEventLine) throw new Error("streamEventLine not found");
-
-      const event = JSON.parse(streamEventLine) as StreamEvent;
-      expect(event.type).toBe("stream_event");
-      expect(event.event).toBeDefined();
-      expect(event.session_id).toBeDefined();
-      expect(event.uuid).toBeDefined();
-      // The event should contain the original Letta SDK chunk
-      expect("message_type" in event.event).toBe(true);
-    },
-    { timeout: 200000 },
-  );
-
-  test(
-    "without --include-partial-messages, messages are type 'message'",
-    async () => {
-      const lines = await runHeadlessCommand(FAST_PROMPT);
-
-      // Should have message lines, not stream_event
-      const messageLines = lines.filter((line) => {
-        const obj = JSON.parse(line);
-        return obj.type === "message";
-      });
-
-      const streamEventLines = lines.filter((line) => {
-        const obj = JSON.parse(line);
-        return obj.type === "stream_event";
-      });
-
-      // We should have some message lines (reasoning, assistant, stop_reason, etc.)
-      // In rare cases with very fast responses, we might only get init + result
-      // So check that IF we have content, it's "message" not "stream_event"
-      if (messageLines.length > 0 || streamEventLines.length > 0) {
-        expect(messageLines.length).toBeGreaterThan(0);
-        expect(streamEventLines.length).toBe(0);
-      }
-
-      // Always should have a result
-      const resultLine = lines.find((line) => {
-        const obj = JSON.parse(line);
-        return obj.type === "result";
-      });
-      expect(resultLine).toBeDefined();
-    },
-    { timeout: 200000 },
-  );
-});
--- a/src/tests/lazy-approval-recovery.test.ts
+++ b/src/tests/lazy-approval-recovery.test.ts
@@ -1,257 +0,0 @@
-import { describe, expect, test } from "bun:test";
-import { spawn } from "node:child_process";
-
-/**
- * Integration test for lazy approval recovery (LET-7101).
- *
- * NOTE: The lazy approval recovery is primarily designed for TUI mode where:
- * 1. User has a session with pending approvals (e.g., from a previous run)
- * 2. User sends a new message before responding to the approval
- * 3. Server returns CONFLICT error
- * 4. CLI recovers by auto-denying stale approvals and retrying
- *
- * In bidirectional mode, messages sent during permission wait are dropped
- * (see headless.ts line 1710-1714), so we can't directly test the CONFLICT
- * scenario here. This test validates that the flow doesn't crash when
- * messages are sent while approvals are pending.
- *
- * The RecoveryMessage emission can be tested by:
- * 1. Manual testing in TUI mode (start session with orphaned approval)
- * 2. Or by modifying headless mode to not drop messages during permission wait
- */
-
-// Prompt that will trigger a Bash tool call requiring approval
-const BASH_TRIGGER_PROMPT =
-  "Run this exact bash command: echo test123. Do not use any other tools.";
-
-// Second message to send while approval is pending
-const INTERRUPT_MESSAGE =
-  "Actually, just say OK instead. Do not call any tools.";
-
-interface StreamMessage {
-  type: string;
-  subtype?: string;
-  message_type?: string;
-  stop_reason?: string;
-  // biome-ignore lint/suspicious/noExplicitAny: index signature for arbitrary JSON fields
-  [key: string]: any;
-}
-
-/**
- * Run bidirectional test with custom message handling.
- * Allows sending messages at specific points in the flow.
- */
-async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{
-  messages: StreamMessage[];
-  success: boolean;
-  errorSeen: boolean;
-}> {
-  return new Promise((resolve, reject) => {
-    const proc = spawn(
-      "bun",
-      [
-        "run",
-        "dev",
-        "-p",
-        "--input-format",
-        "stream-json",
-        "--output-format",
-        "stream-json",
-        "--new-agent",
-        "-m",
-        "haiku",
-        // NOTE: No --yolo flag - approvals are required
-      ],
-      {
-        cwd: process.cwd(),
-        // Mark as subagent to prevent polluting user's LRU settings
-        env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
-      },
-    );
-
-    const messages: StreamMessage[] = [];
-    let buffer = "";
-    let initReceived = false;
-    let approvalSeen = false;
-    let interruptSent = false;
-    let errorSeen = false;
-    let resultCount = 0;
-    let closing = false;
-
-    const timeout = setTimeout(() => {
-      if (!closing) {
-        proc.kill();
-        reject(new Error(`Test timed out after ${timeoutMs}ms`));
-      }
-    }, timeoutMs);
-
-    const cleanup = () => {
-      closing = true;
-      clearTimeout(timeout);
-      setTimeout(() => {
-        proc.stdin?.end();
-        proc.kill();
-      }, 500);
-    };
-
-    const processLine = (line: string) => {
-      if (!line.trim()) return;
-      try {
-        const msg: StreamMessage = JSON.parse(line);
-        messages.push(msg);
-
-        // Debug output
-        if (process.env.DEBUG_TEST) {
-          console.log("MSG:", JSON.stringify(msg, null, 2));
-        }
-
-        // Step 1: Wait for init, then send bash trigger prompt
-        if (msg.type === "system" && msg.subtype === "init" && !initReceived) {
-          initReceived = true;
-          const userMsg = JSON.stringify({
-            type: "user",
-            message: { role: "user", content: BASH_TRIGGER_PROMPT },
-          });
-          proc.stdin?.write(`${userMsg}\n`);
-          return;
-        }
-
-        // Step 2: When we see approval request, send another user message instead
-        if (
-          msg.type === "message" &&
-          msg.message_type === "approval_request_message" &&
-          !approvalSeen
-        ) {
-          approvalSeen = true;
-          // Wait a moment, then send interrupt message (NOT an approval)
-          setTimeout(() => {
-            if (!interruptSent) {
-              interruptSent = true;
-              const userMsg = JSON.stringify({
-                type: "user",
-                message: { role: "user", content: INTERRUPT_MESSAGE },
-              });
-              proc.stdin?.write(`${userMsg}\n`);
-            }
-          }, 500);
-          return;
-        }
-
-        // Track recovery messages - this is the key signal that lazy recovery worked
-        if (
-          msg.type === "recovery" &&
-          msg.recovery_type === "approval_pending"
-        ) {
-          errorSeen = true; // reusing this flag to mean "recovery message seen"
-        }
-
-        // Also track raw errors (shouldn't see these if recovery works properly)
-        if (
-          msg.type === "error" ||
-          (msg.type === "message" && msg.message_type === "error_message")
-        ) {
-          const detail = msg.detail || msg.message || "";
-          if (detail.toLowerCase().includes("cannot send a new message")) {
-            // Raw error leaked through - recovery may have failed
-            console.log(
-              "WARNING: Raw CONFLICT error seen (recovery may have failed)",
-            );
-          }
-        }
-
-        // Track results - we need 2 (one for each user message, though first may fail)
-        if (msg.type === "result") {
-          resultCount++;
-          // After second result (or after seeing error + result), we're done
-          if (resultCount >= 2 || (errorSeen && resultCount >= 1)) {
-            cleanup();
-            resolve({ messages, success: true, errorSeen });
-          }
-        }
-      } catch {
-        // Not valid JSON, ignore
-      }
-    };
-
-    proc.stdout?.on("data", (data) => {
-      buffer += data.toString();
-      const lines = buffer.split("\n");
-      buffer = lines.pop() || "";
-      for (const line of lines) {
-        processLine(line);
-      }
-    });
-
-    let _stderr = "";
-    proc.stderr?.on("data", (data) => {
-      _stderr += data.toString();
-    });
-
-    proc.on("close", (_code) => {
-      clearTimeout(timeout);
-      // Process any remaining buffer
-      if (buffer.trim()) {
-        processLine(buffer);
-      }
-
-      if (!closing) {
-        // If we got here without resolving, check what we have
-        resolve({
-          messages,
-          success: resultCount > 0,
-          errorSeen,
-        });
-      }
-    });
-
-    proc.on("error", (err) => {
-      clearTimeout(timeout);
-      reject(err);
-    });
-  });
-}
-
-describe("lazy approval recovery", () => {
-  test("handles concurrent message while approval is pending", async () => {
-    const result = await runLazyRecoveryTest();
-
-    // Log messages for debugging if test fails
-    if (!result.success) {
-      console.log("All messages received:");
-      for (const msg of result.messages) {
-        console.log(JSON.stringify(msg, null, 2));
-      }
-    }
-
-    // We should have seen the approval request (proves tool requiring approval was called)
-    const approvalRequest = result.messages.find(
-      (m) => m.message_type === "approval_request_message",
-    );
-    expect(approvalRequest).toBeDefined();
-
-    // The test should complete successfully
-    expect(result.success).toBe(true);
-
-    // Count results - we should get at least 1 (the second message should always complete)
-    const resultCount = result.messages.filter(
-      (m) => m.type === "result",
-    ).length;
-    expect(resultCount).toBeGreaterThanOrEqual(1);
-
-    // KEY ASSERTION: Check if we saw the recovery message
-    // This proves the lazy recovery mechanism was triggered
-    const recoveryMessage = result.messages.find(
-      (m) => m.type === "recovery" && m.recovery_type === "approval_pending",
-    );
-    if (recoveryMessage) {
-      console.log("Recovery message detected - lazy recovery worked correctly");
-      expect(result.errorSeen).toBe(true); // Should have been set when we saw recovery
-    } else {
-      // Recovery might not be triggered if approval was auto-handled before second message
-      // This can happen due to timing - the test still validates the flow works
-      console.log(
-        "Note: No recovery message seen - approval may have been handled before conflict",
-      );
-    }
-  }, 180000); // 3 minute timeout for CI
-});
--- a/src/tests/startup-flow.test.ts
+++ b/src/tests/startup-flow.test.ts
@@ -2,22 +2,14 @@ import { describe, expect, test } from "bun:test";
 import { spawn } from "node:child_process";

 /**
- * Integration tests for CLI startup flows.
+ * Startup flow tests that validate flag conflict handling.
 *
- * These tests verify the boot flow decision tree:
- * - Flag conflict detection
- * - --conversation: derives agent from conversation
- * - --agent: uses specified agent
- * - --new-agent: creates new agent
- * - Error messages for invalid inputs
- *
- * Note: Tests that depend on settings files (.letta/) are harder to isolate
- * because the CLI uses process.cwd(). For now, we focus on flag-based tests.
+ * These must remain runnable in fork PR CI (no secrets), so they should not
+ * require a working Letta server or LETTA_API_KEY.
 */

 const projectRoot = process.cwd();

-// Helper to run CLI and capture output
 async function runCli(
  args: string[],
  options: {
@@ -74,10 +66,6 @@ async function runCli(
  });
 }

-// ============================================================================
-// Flag Conflict Tests (fast, no API calls needed)
-// ============================================================================
-
 describe("Startup Flow - Flag Conflicts", () => {
  test("--conversation conflicts with --agent", async () => {
    const result = await runCli(
@@ -136,227 +124,3 @@ describe("Startup Flow - Flag Conflicts", () => {
    );
  });
 });
-
-// ============================================================================
-// Invalid Input Tests (require API calls but fail fast)
-// ============================================================================
-
-describe("Startup Flow - Invalid Inputs", () => {
-  test(
-    "--agent with nonexistent ID shows error",
-    async () => {
-      const result = await runCli(
-        ["--agent", "agent-definitely-does-not-exist-12345", "-p", "test"],
-        { expectExit: 1, timeoutMs: 60000 },
-      );
-      expect(result.stderr).toContain("not found");
-    },
-    { timeout: 70000 },
-  );
-
-  test(
-    "--conversation with nonexistent ID shows error",
-    async () => {
-      const result = await runCli(
-        [
-          "--conversation",
-          "conversation-definitely-does-not-exist-12345",
-          "-p",
-          "test",
-        ],
-        { expectExit: 1, timeoutMs: 60000 },
-      );
-      expect(result.stderr).toContain("not found");
-    },
-    { timeout: 70000 },
-  );
-
-  test("--from-af with nonexistent file shows error", async () => {
-    const result = await runCli(
-      ["--from-af", "/nonexistent/path/agent.af", "-p", "test"],
-      { expectExit: 1 },
-    );
-    expect(result.stderr).toContain("not found");
-  });
-});
-
-// ============================================================================
-// Integration Tests (require API access, create real agents)
-// ============================================================================
-
-describe("Startup Flow - Integration", () => {
-  // Store created agent/conversation IDs for cleanup and reuse
-  let testAgentId: string | null = null;
-
-  test(
-    "--new-agent creates agent and responds",
-    async () => {
-      const result = await runCli(
-        [
-          "--new-agent",
-          "-m",
-          "haiku",
-          "-p",
-          "Say OK and nothing else",
-          "--output-format",
-          "json",
-        ],
-        { timeoutMs: 120000 },
-      );
-
-      expect(result.exitCode).toBe(0);
-      // stdout includes the bun invocation line, extract just the JSON
-      const jsonStart = result.stdout.indexOf("{");
-      const output = JSON.parse(result.stdout.slice(jsonStart));
-      expect(output.agent_id).toBeDefined();
-      expect(output.result).toBeDefined();
-
-      // Save for later tests
-      testAgentId = output.agent_id;
-    },
-    { timeout: 130000 },
-  );
-
-  test(
-    "--agent with valid ID uses that agent",
-    async () => {
-      // Skip if previous test didn't create an agent
-      if (!testAgentId) {
-        console.log("Skipping: no test agent available");
-        return;
-      }
-
-      const result = await runCli(
-        [
-          "--agent",
-          testAgentId,
-          "-m",
-          "haiku",
-          "-p",
-          "Say OK",
-          "--output-format",
-          "json",
-        ],
-        { timeoutMs: 120000 },
-      );
-
-      expect(result.exitCode).toBe(0);
-      const jsonStart = result.stdout.indexOf("{");
-      const output = JSON.parse(result.stdout.slice(jsonStart));
-      expect(output.agent_id).toBe(testAgentId);
-    },
-    { timeout: 130000 },
-  );
-
-  test(
-    "--conversation with valid ID derives agent and uses conversation",
-    async () => {
-      // Skip if previous test didn't create an agent
-      if (!testAgentId) {
-        console.log("Skipping: no test agent available");
-        return;
-      }
-
-      // First, create a real conversation with --new (since --new-agent uses "default")
-      const createResult = await runCli(
-        [
-          "--agent",
-          testAgentId,
-          "--new",
-          "-m",
-          "haiku",
-          "-p",
-          "Say CREATED",
-          "--output-format",
-          "json",
-        ],
-        { timeoutMs: 120000 },
-      );
-      expect(createResult.exitCode).toBe(0);
-      const createJsonStart = createResult.stdout.indexOf("{");
-      const createOutput = JSON.parse(
-        createResult.stdout.slice(createJsonStart),
-      );
-      const realConversationId = createOutput.conversation_id;
-      expect(realConversationId).toBeDefined();
-      expect(realConversationId).not.toBe("default");
-
-      // Now test that --conversation can derive the agent from this conversation
-      const result = await runCli(
-        [
-          "--conversation",
-          realConversationId,
-          "-m",
-          "haiku",
-          "-p",
-          "Say OK",
-          "--output-format",
-          "json",
-        ],
-        { timeoutMs: 120000 },
-      );
-
-      expect(result.exitCode).toBe(0);
-      const jsonStart = result.stdout.indexOf("{");
-      const output = JSON.parse(result.stdout.slice(jsonStart));
-      // Should use the same agent that owns the conversation
-      expect(output.agent_id).toBe(testAgentId);
-      // Should use the specified conversation
-      expect(output.conversation_id).toBe(realConversationId);
-    },
-    { timeout: 180000 },
-  );
-
-  test(
-    "--new-agent with --init-blocks none creates minimal agent",
-    async () => {
-      const result = await runCli(
-        [
-          "--new-agent",
-          "--init-blocks",
-          "none",
-          "-m",
-          "haiku",
-          "-p",
-          "Say OK",
-          "--output-format",
-          "json",
-        ],
-        { timeoutMs: 120000 },
-      );
-
-      expect(result.exitCode).toBe(0);
-      // stdout includes the bun invocation line, extract just the JSON
-      const jsonStart = result.stdout.indexOf("{");
-      const output = JSON.parse(result.stdout.slice(jsonStart));
-      expect(output.agent_id).toBeDefined();
-    },
-    { timeout: 130000 },
-  );
-});
-
-// ============================================================================
-// --continue Tests (depend on LRU state, harder to isolate)
-// ============================================================================
-
-describe("Startup Flow - Continue Flag", () => {
-  test(
-    "--continue with no LRU shows error",
-    async () => {
-      // This test relies on running in a directory with no .letta/ settings
-      // In practice, this might use the project's .letta/ which has an LRU
-      // So we check for either success (if LRU exists) or error (if not)
-      const result = await runCli(
-        ["--continue", "-p", "Say OK", "--output-format", "json"],
-        { timeoutMs: 60000 },
-      );
-
-      // Either succeeds (LRU exists) or fails with specific error
-      if (result.exitCode !== 0) {
-        expect(result.stderr).toContain("No recent session found");
-      }
-      // If it succeeds, that's also valid (test env has LRU)
-    },
-    { timeout: 70000 },
-  );
-});