test(integration): reduce flaky startup/headless timeout failures (#1109)

Co-authored-by: Letta <noreply@letta.com>
2026-02-23 17:40:41 -08:00
parent c58d5f1e07
commit 75c7dd793b
10 changed files with 315 additions and 72 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -168,7 +168,7 @@ jobs:
        if: ${{ runner.os == 'Windows' && (github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)) }}
        env:
          LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
-        run: bun run src/tests/headless-windows.ts --model haiku
+        run: bun run src/tests/headless-windows.ts --model sonnet-4.6-low
      - name: Publish dry-run
        if: ${{ github.event_name == 'push' }}
@@ -223,7 +223,7 @@ jobs:
      fail-fast: false
      matrix:
        # Note: gemini-3-flash / glm-4.7 temporarily disabled due to instability
-        model: [gpt-5-minimal, gpt-4.1, sonnet-4.5, gemini-pro, haiku]
+        model: [gpt-5-minimal, gpt-4.1, sonnet-4.5, gemini-pro, sonnet-4.6-low]
    steps:
      - name: Checkout
        uses: actions/checkout@v6
--- a/src/agent/check-approval.ts
+++ b/src/agent/check-approval.ts
@@ -58,6 +58,14 @@ export interface ResumeData {
  messageHistory: Message[];
 }
 /**
 * Optional settings accepted by `getResumeData`.
 */
 export interface GetResumeDataOptions {
  /**
   * Controls whether backfill message history should be fetched.
   * Defaults to true to preserve existing /resume behavior.
   * When false, the backfill fetches are skipped even if backfill is
   * otherwise enabled, so callers that only need pending-approval state
   * can avoid the extra message-history requests.
   */
  includeMessageHistory?: boolean;
 }
 /**
 * Extract approval requests from an approval_request_message.
 * Exported for testing parallel tool call handling.
@@ -327,8 +335,10 @@ export async function getResumeData(
  client: Letta,
  agent: AgentState,
  conversationId?: string,
  options: GetResumeDataOptions = {},
 ): Promise<ResumeData> {
  try {
    const includeMessageHistory = options.includeMessageHistory ?? true;
    let inContextMessageIds: string[] | null | undefined;
    let messages: Message[] = [];
@@ -352,7 +362,7 @@ export async function getResumeData(
          "check-approval",
          "No in-context messages - no pending approvals",
        );
-        if (isBackfillEnabled()) {
+        if (includeMessageHistory && isBackfillEnabled()) {
          try {
            const backfill = await fetchConversationBackfillMessages(
              client,
@@ -389,7 +399,7 @@ export async function getResumeData(
      // Fetch message history separately for backfill (desc then reverse for last N chronological)
      // Wrapped in try/catch so backfill failures don't crash the CLI
-      if (isBackfillEnabled()) {
+      if (includeMessageHistory && isBackfillEnabled()) {
        try {
          messages = await fetchConversationBackfillMessages(
            client,
@@ -473,7 +483,7 @@ export async function getResumeData(
      // This filters to only the default conversation's messages (like the ADE does)
      // Wrapped in try/catch so backfill failures don't crash the CLI (e.g., older servers
      // may not support conversation_id filter)
-      if (isBackfillEnabled()) {
+      if (includeMessageHistory && isBackfillEnabled()) {
        try {
          const messagesPage = await client.agents.messages.list(agent.id, {
            limit: BACKFILL_PAGE_LIMIT,
--- a/src/headless.ts
+++ b/src/headless.ts
@@ -2742,6 +2742,33 @@ async function runBidirectionalMode(
        console.log(JSON.stringify(registerResponse));
      } else if (subtype === "bootstrap_session_state") {
        const bootstrapReq = message.request as BootstrapSessionStateRequest;
        const { getResumeData } = await import("./agent/check-approval");
        let hasPendingApproval = false;
        try {
          // Re-fetch for parity with approval checks elsewhere in headless mode.
          const freshAgent = await client.agents.retrieve(agent.id);
          const resume = await getResumeData(
            client,
            freshAgent,
            conversationId,
            {
              includeMessageHistory: false,
            },
          );
          hasPendingApproval = (resume.pendingApprovals?.length ?? 0) > 0;
        } catch (error) {
          // Keep bootstrap non-fatal if approval probe fails on stale resources.
          if (
            !(error instanceof APIError) ||
            (error.status !== 404 && error.status !== 422)
          ) {
            console.warn(
              `[bootstrap] pending-approval probe failed: ${error instanceof Error ? error.message : String(error)}`,
            );
          }
        }
        const bootstrapResp = await handleBootstrapSessionState({
          bootstrapReq,
          sessionContext: {
@@ -2754,7 +2781,7 @@ async function runBidirectionalMode(
          },
          requestId: requestId ?? "",
          client,
-          hasPendingApproval: false, // TODO: wire approval state when available
+          hasPendingApproval,
        });
        console.log(JSON.stringify(bootstrapResp));
      } else if (subtype === "list_messages") {
--- a/src/integration-tests/headless-input-format.test.ts
+++ b/src/integration-tests/headless-input-format.test.ts
@@ -40,7 +40,7 @@ async function runBidirectional(
        "stream-json",
        "--new-agent",
        "-m",
-        "haiku",
+        "sonnet-4.6-low",
        "--yolo",
        ...extraArgs,
      ],
@@ -202,6 +202,31 @@ async function runBidirectional(
  });
 }
 /**
 * Calls `runBidirectional` and re-runs it when an attempt rejects with a
 * timeout error (an `Error` whose message contains "Timeout after").
 *
 * Any other failure is rethrown immediately. Once `retryOnTimeouts` retries
 * have been used, the last timeout error is rethrown as well.
 *
 * NOTE(review): each attempt is given the full `timeoutMs`, so the worst-case
 * wall time is roughly `timeoutMs * (retryOnTimeouts + 1)`; the enclosing
 * test's own timeout presumably needs to allow for that - confirm at call sites.
 *
 * @param inputs - Lines forwarded unchanged to `runBidirectional`.
 * @param extraArgs - Extra CLI arguments forwarded to `runBidirectional`.
 * @param timeoutMs - Per-attempt timeout forwarded to `runBidirectional`.
 * @param retryOnTimeouts - Maximum number of additional attempts after a timeout.
 * @returns The objects produced by the first attempt that succeeds.
 */
 async function runBidirectionalWithRetry(
  inputs: string[],
  extraArgs: string[] = [],
  timeoutMs = 180000,
  retryOnTimeouts = 1,
 ): Promise<object[]> {
  for (let retriesUsed = 0; ; retriesUsed += 1) {
    try {
      return await runBidirectional(inputs, extraArgs, timeoutMs);
    } catch (failure) {
      const timedOut =
        failure instanceof Error && failure.message.includes("Timeout after");
      // Only timeouts are considered transient; everything else fails fast.
      if (!timedOut) {
        throw failure;
      }
      // Retry budget spent: surface the timeout to the caller.
      if (retriesUsed >= retryOnTimeouts) {
        throw failure;
      }
      // CI API latency can cause occasional long-tail timeouts.
      console.warn(
        `[headless-input-format] retrying after timeout (${retriesUsed + 1}/${retryOnTimeouts})`,
      );
    }
  }
 }
 describe("input-format stream-json", () => {
  test(
    "initialize control request returns session info",
@@ -299,7 +324,7 @@ describe("input-format stream-json", () => {
    "multi-turn conversation maintains context",
    async () => {
      // Multi-turn test needs 2 sequential LLM calls, so allow more time
-      const objects = (await runBidirectional(
+      const objects = (await runBidirectionalWithRetry(
        [
          JSON.stringify({
            type: "user",
@@ -318,6 +343,7 @@ describe("input-format stream-json", () => {
        ],
        [], // no extra args
        300000, // 300s for 2 sequential LLM calls - CI can be very slow
        1, // one retry for transient API slowness
      )) as WireMessage[];
      // Should have at least two results (one per turn)
--- a/src/integration-tests/headless-stream-json-format.test.ts
+++ b/src/integration-tests/headless-stream-json-format.test.ts
@@ -29,7 +29,7 @@ async function runHeadlessCommand(
        "stream-json",
        "--yolo",
        "-m",
-        "haiku",
+        "sonnet-4.6-low",
        ...extraArgs,
      ],
      {
--- a/src/integration-tests/lazy-approval-recovery.test.ts
+++ b/src/integration-tests/lazy-approval-recovery.test.ts
@@ -43,7 +43,7 @@ interface StreamMessage {
 * Run bidirectional test with custom message handling.
 * Allows sending messages at specific points in the flow.
 */
-async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{
+async function runLazyRecoveryTest(timeoutMs = 300000): Promise<{
  messages: StreamMessage[];
  success: boolean;
  errorSeen: boolean;
@@ -61,7 +61,7 @@ async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{
        "stream-json",
        "--new-agent",
        "-m",
-        "haiku",
+        "sonnet-4.6-low",
        // NOTE: No --yolo flag - approvals are required
      ],
      {
@@ -291,7 +291,12 @@ async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{
 describe("lazy approval recovery", () => {
  test("handles concurrent message while approval is pending", async () => {
-    const result = await runLazyRecoveryTest();
+    let result = await runLazyRecoveryTest();
    if (!result.success) {
      // Transient API/tool timing can occasionally miss the approval window;
      // retry once before failing.
      result = await runLazyRecoveryTest();
    }
    // Log messages for debugging if test fails
    if (!result.success) {
@@ -333,5 +338,5 @@ describe("lazy approval recovery", () => {
        "Note: No recovery message seen - approval may have been handled before conflict",
      );
    }
-  }, 180000); // 3 minute timeout for CI
+  }, 320000); // 5+ minute timeout for slow CI runners
 });
--- a/src/integration-tests/prestream-approval-recovery.test.ts
+++ b/src/integration-tests/prestream-approval-recovery.test.ts
@@ -51,7 +51,7 @@ async function startPendingApprovalSession(
        "--new-agent",
        "--new",
        "-m",
-        "haiku",
+        "sonnet-4.6-low",
      ],
      {
        cwd: process.cwd(),
--- a/src/integration-tests/startup-flow.integration.test.ts
+++ b/src/integration-tests/startup-flow.integration.test.ts
@@ -15,55 +15,77 @@ async function runCli(
  options: {
    timeoutMs?: number;
    expectExit?: number;
    retryOnTimeouts?: number;
  } = {},
 ): Promise<{ stdout: string; stderr: string; exitCode: number | null }> {
-  const { timeoutMs = 30000, expectExit } = options;
+  const { timeoutMs = 30000, expectExit, retryOnTimeouts = 1 } = options;
-  return new Promise((resolve, reject) => {
+  const runOnce = () =>
-    const proc = spawn("bun", ["run", "dev", ...args], {
+    new Promise<{ stdout: string; stderr: string; exitCode: number | null }>(
-      cwd: projectRoot,
+      (resolve, reject) => {
-      // Mark as subagent to prevent polluting user's LRU settings
+        const proc = spawn("bun", ["run", "dev", ...args], {
-      env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
+          cwd: projectRoot,
-    });
+          // Mark as subagent to prevent polluting user's LRU settings
          env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
        });
-    let stdout = "";
+        let stdout = "";
-    let stderr = "";
+        let stderr = "";
-    proc.stdout?.on("data", (data) => {
+        proc.stdout?.on("data", (data) => {
-      stdout += data.toString();
+          stdout += data.toString();
-    });
+        });
-    proc.stderr?.on("data", (data) => {
+        proc.stderr?.on("data", (data) => {
-      stderr += data.toString();
+          stderr += data.toString();
-    });
+        });
-    const timeout = setTimeout(() => {
+        const timeout = setTimeout(() => {
-      proc.kill();
+          proc.kill();
-      reject(
+          reject(
-        new Error(
+            new Error(
-          `Timeout after ${timeoutMs}ms. stdout: ${stdout}, stderr: ${stderr}`,
+              `Timeout after ${timeoutMs}ms. stdout: ${stdout}, stderr: ${stderr}`,
-        ),
+            ),
-      );
+          );
-    }, timeoutMs);
+        }, timeoutMs);
-    proc.on("close", (code) => {
+        proc.on("close", (code) => {
-      clearTimeout(timeout);
+          clearTimeout(timeout);
-      if (expectExit !== undefined && code !== expectExit) {
+          if (expectExit !== undefined && code !== expectExit) {
-        reject(
+            reject(
-          new Error(
+              new Error(
-            `Expected exit code ${expectExit}, got ${code}. stdout: ${stdout}, stderr: ${stderr}`,
+                `Expected exit code ${expectExit}, got ${code}. stdout: ${stdout}, stderr: ${stderr}`,
-          ),
+              ),
-        );
+            );
-      } else {
+          } else {
-        resolve({ stdout, stderr, exitCode: code });
+            resolve({ stdout, stderr, exitCode: code });
          }
        });
        proc.on("error", (err) => {
          clearTimeout(timeout);
          reject(err);
        });
      },
    );
  let attempt = 0;
  while (true) {
    try {
      return await runOnce();
    } catch (error) {
      const isTimeoutError =
        error instanceof Error && error.message.includes("Timeout after");
      if (!isTimeoutError || attempt >= retryOnTimeouts) {
        throw error;
      }
-    });
+      attempt += 1;
-
+      // CI API calls can be transiently slow; retry once to reduce flakiness.
-    proc.on("error", (err) => {
+      console.warn(
-      clearTimeout(timeout);
+        `[startup-flow] retrying after timeout (${attempt}/${retryOnTimeouts}) args=${args.join(" ")}`,
-      reject(err);
+      );
-    });
+    }
-  });
+  }
 }
 // ============================================================================
@@ -123,13 +145,13 @@ describe("Startup Flow - Integration", () => {
        [
          "--new-agent",
          "-m",
-          "haiku",
+          "sonnet-4.6-low",
          "-p",
          "Say OK and nothing else",
          "--output-format",
          "json",
        ],
-        { timeoutMs: 120000 },
+        { timeoutMs: 180000 },
      );
      expect(result.exitCode).toBe(0);
@@ -141,7 +163,7 @@ describe("Startup Flow - Integration", () => {
      testAgentId = output.agent_id;
    },
-    { timeout: 130000 },
+    { timeout: 190000 },
  );
  test(
@@ -157,13 +179,13 @@ describe("Startup Flow - Integration", () => {
          "--agent",
          testAgentId,
          "-m",
-          "haiku",
+          "sonnet-4.6-low",
          "-p",
          "Say OK",
          "--output-format",
          "json",
        ],
-        { timeoutMs: 120000 },
+        { timeoutMs: 180000 },
      );
      expect(result.exitCode).toBe(0);
@@ -171,7 +193,7 @@ describe("Startup Flow - Integration", () => {
      const output = JSON.parse(result.stdout.slice(jsonStart));
      expect(output.agent_id).toBe(testAgentId);
    },
-    { timeout: 130000 },
+    { timeout: 190000 },
  );
  test(
@@ -189,13 +211,13 @@ describe("Startup Flow - Integration", () => {
          testAgentId,
          "--new",
          "-m",
-          "haiku",
+          "sonnet-4.6-low",
          "-p",
          "Say CREATED",
          "--output-format",
          "json",
        ],
-        { timeoutMs: 120000 },
+        { timeoutMs: 180000 },
      );
      expect(createResult.exitCode).toBe(0);
      const createJsonStart = createResult.stdout.indexOf("{");
@@ -211,13 +233,13 @@ describe("Startup Flow - Integration", () => {
          "--conversation",
          realConversationId,
          "-m",
-          "haiku",
+          "sonnet-4.6-low",
          "-p",
          "Say OK",
          "--output-format",
          "json",
        ],
-        { timeoutMs: 120000 },
+        { timeoutMs: 180000 },
      );
      expect(result.exitCode).toBe(0);
@@ -238,13 +260,13 @@ describe("Startup Flow - Integration", () => {
          [
            "--new-agent",
            "-m",
-            "haiku",
+            "sonnet-4.6-low",
            "-p",
            "Say OK",
            "--output-format",
            "json",
          ],
-          { timeoutMs: 120000 },
+          { timeoutMs: 180000 },
        );
        expect(bootstrapResult.exitCode).toBe(0);
        const bootstrapJsonStart = bootstrapResult.stdout.indexOf("{");
@@ -262,13 +284,13 @@ describe("Startup Flow - Integration", () => {
          "--conversation",
          "default",
          "-m",
-          "haiku",
+          "sonnet-4.6-low",
          "-p",
          "Say OK",
          "--output-format",
          "json",
        ],
-        { timeoutMs: 120000 },
+        { timeoutMs: 180000 },
      );
      expect(result.exitCode).toBe(0);
@@ -277,7 +299,7 @@ describe("Startup Flow - Integration", () => {
      expect(output.agent_id).toBe(agentIdForTest);
      expect(output.conversation_id).toBe("default");
    },
-    { timeout: 130000 },
+    { timeout: 190000 },
  );
  test(
@@ -289,13 +311,13 @@ describe("Startup Flow - Integration", () => {
          "--init-blocks",
          "none",
          "-m",
-          "haiku",
+          "sonnet-4.6-low",
          "-p",
          "Say OK",
          "--output-format",
          "json",
        ],
-        { timeoutMs: 120000 },
+        { timeoutMs: 180000 },
      );
      expect(result.exitCode).toBe(0);
@@ -303,7 +325,7 @@ describe("Startup Flow - Integration", () => {
      const output = JSON.parse(result.stdout.slice(jsonStart));
      expect(output.agent_id).toBeDefined();
    },
-    { timeout: 130000 },
+    { timeout: 190000 },
  );
 });
--- a/src/tests/agent/getResumeData.test.ts
+++ b/src/tests/agent/getResumeData.test.ts
@@ -0,0 +1,130 @@
 import { describe, expect, mock, test } from "bun:test";
 import type Letta from "@letta-ai/letta-client";
 import type { AgentState } from "@letta-ai/letta-client/resources/agents/agents";
 import type { Message } from "@letta-ai/letta-client/resources/agents/messages";
 import { getResumeData } from "../../agent/check-approval";
 /**
 * Builds a minimal agent fixture for these tests.
 *
 * The baseline has id "agent-test" and a single message id "msg-last";
 * any field supplied in `overrides` replaces the baseline value.
 */
 function makeAgent(overrides: Partial<AgentState> = {}): AgentState {
  const baseline = { id: "agent-test", message_ids: ["msg-last"] };
  // Only the fields above are populated; the cast stands in for the rest.
  return { ...baseline, ...overrides } as AgentState;
 }
 /**
 * Builds an `approval_request_message` fixture carrying one "Bash" tool call.
 *
 * @param id - Message id to assign; defaults to "msg-last".
 */
 function makeApprovalMessage(id = "msg-last"): Message {
  const bashToolCall = {
    tool_call_id: "tool-1",
    name: "Bash",
    arguments: '{"command":"echo hi"}',
  };
  const fixture = {
    id,
    date: new Date().toISOString(),
    message_type: "approval_request_message",
    tool_calls: [bashToolCall],
  };
  // Partial shape on purpose, hence the double cast.
  return fixture as unknown as Message;
 }
 /**
 * Builds a bare `user_message` fixture stamped with the current time.
 *
 * @param id - Message id to assign; defaults to "msg-last".
 */
 function makeUserMessage(id = "msg-last"): Message {
  const timestamp = new Date().toISOString();
  return { id, date: timestamp, message_type: "user_message" } as Message;
 }
 // Unit tests for getResumeData's `includeMessageHistory` option. Each test
 // builds a hand-rolled client whose methods are bun `mock` functions so that
 // call counts can be asserted; only the methods listed in the literal exist.
 describe("getResumeData", () => {
  // Explicit conversation id ("conv-abc") with history disabled: the pending
  // "Bash" approval is still reported, but the conversation message listing
  // is never called and messageHistory stays empty.
  test("includeMessageHistory=false still computes pending approvals without backfill (conversation path)", async () => {
    const conversationsRetrieve = mock(async () => ({
      in_context_message_ids: ["msg-last"],
    }));
    const conversationsList = mock(async () => ({
      getPaginatedItems: () => [],
    }));
    const agentsList = mock(async () => ({ items: [] }));
    // The retrieved message is an approval request, so one approval is pending.
    const messagesRetrieve = mock(async () => [makeApprovalMessage()]);
    const client = {
      conversations: {
        retrieve: conversationsRetrieve,
        messages: { list: conversationsList },
      },
      agents: { messages: { list: agentsList } },
      messages: { retrieve: messagesRetrieve },
    } as unknown as Letta;
    const resume = await getResumeData(client, makeAgent(), "conv-abc", {
      includeMessageHistory: false,
    });
    expect(conversationsRetrieve).toHaveBeenCalledTimes(1);
    expect(messagesRetrieve).toHaveBeenCalledTimes(1);
    expect(conversationsList).toHaveBeenCalledTimes(0);
    expect(resume.pendingApprovals).toHaveLength(1);
    expect(resume.pendingApprovals[0]?.toolName).toBe("Bash");
    expect(resume.messageHistory).toEqual([]);
  });
  // "default" conversation with history disabled: the agent-level message
  // listing used for backfill must not be called.
  test("includeMessageHistory=false skips default-conversation backfill calls", async () => {
    const conversationsRetrieve = mock(async () => ({
      in_context_message_ids: ["msg-last"],
    }));
    const conversationsList = mock(async () => ({
      getPaginatedItems: () => [],
    }));
    const agentsList = mock(async () => ({ items: [] }));
    const messagesRetrieve = mock(async () => [makeApprovalMessage()]);
    const client = {
      conversations: {
        retrieve: conversationsRetrieve,
        messages: { list: conversationsList },
      },
      agents: { messages: { list: agentsList } },
      messages: { retrieve: messagesRetrieve },
    } as unknown as Letta;
    const resume = await getResumeData(
      client,
      makeAgent({ message_ids: ["msg-last"] }),
      "default",
      { includeMessageHistory: false },
    );
    expect(messagesRetrieve).toHaveBeenCalledTimes(1);
    expect(agentsList).toHaveBeenCalledTimes(0);
    expect(resume.pendingApprovals).toHaveLength(1);
    expect(resume.messageHistory).toEqual([]);
  });
  // Options omitted: backfill is expected to run (one agent-level listing,
  // non-empty history). The last message is a plain user message, so no
  // approvals are pending.
  // NOTE(review): presumably relies on isBackfillEnabled() being true in the
  // test environment - confirm, since that helper is not mocked here.
  test("default behavior keeps backfill enabled when options are omitted", async () => {
    const conversationsRetrieve = mock(async () => ({
      in_context_message_ids: ["msg-last"],
    }));
    const conversationsList = mock(async () => ({
      getPaginatedItems: () => [],
    }));
    const agentsList = mock(async () => ({
      items: [makeUserMessage("msg-a"), makeUserMessage("msg-b")],
    }));
    const messagesRetrieve = mock(async () => [makeUserMessage()]);
    const client = {
      conversations: {
        retrieve: conversationsRetrieve,
        messages: { list: conversationsList },
      },
      agents: { messages: { list: agentsList } },
      messages: { retrieve: messagesRetrieve },
    } as unknown as Letta;
    const resume = await getResumeData(client, makeAgent(), "default");
    expect(messagesRetrieve).toHaveBeenCalledTimes(1);
    expect(agentsList).toHaveBeenCalledTimes(1);
    expect(resume.pendingApprovals).toHaveLength(0);
    expect(resume.messageHistory.length).toBeGreaterThan(0);
  });
 });
--- a/src/tests/headless/bootstrap-pending-approval-wiring.test.ts
+++ b/src/tests/headless/bootstrap-pending-approval-wiring.test.ts
@@ -0,0 +1,23 @@
 import { describe, expect, test } from "bun:test";
 import { readFileSync } from "node:fs";
 import { fileURLToPath } from "node:url";
 // Source-text wiring check: headless.ts is read from disk and matched against
 // literal snippets rather than executed. The assertions are exact substring
 // matches, so reformatting or renaming those lines in headless.ts will fail
 // this test even if behavior is unchanged.
 describe("bootstrap pending-approval wiring", () => {
  test("bootstrap_session_state probes approvals via getResumeData without backfill", () => {
    // Resolve headless.ts relative to this test file.
    const headlessPath = fileURLToPath(
      new URL("../../headless.ts", import.meta.url),
    );
    const source = readFileSync(headlessPath, "utf-8");
    expect(source).toContain(
      'const { getResumeData } = await import("./agent/check-approval");',
    );
    expect(source).toContain("includeMessageHistory: false");
    expect(source).toContain(
      "hasPendingApproval = (resume.pendingApprovals?.length ?? 0) > 0;",
    );
    // Guards against the old hard-coded placeholder being reintroduced.
    expect(source).not.toContain(
      "hasPendingApproval: false, // TODO: wire approval state when available",
    );
  });
 });