diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 699d229..d764cc9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -168,7 +168,7 @@ jobs: if: ${{ runner.os == 'Windows' && (github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)) }} env: LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }} - run: bun run src/tests/headless-windows.ts --model haiku + run: bun run src/tests/headless-windows.ts --model sonnet-4.6-low - name: Publish dry-run if: ${{ github.event_name == 'push' }} @@ -223,7 +223,7 @@ jobs: fail-fast: false matrix: # Note: gemini-3-flash / glm-4.7 temporarily disabled due to instability - model: [gpt-5-minimal, gpt-4.1, sonnet-4.5, gemini-pro, haiku] + model: [gpt-5-minimal, gpt-4.1, sonnet-4.5, gemini-pro, sonnet-4.6-low] steps: - name: Checkout uses: actions/checkout@v6 diff --git a/src/agent/check-approval.ts b/src/agent/check-approval.ts index 6bcc060..6661e6d 100644 --- a/src/agent/check-approval.ts +++ b/src/agent/check-approval.ts @@ -58,6 +58,14 @@ export interface ResumeData { messageHistory: Message[]; } +export interface GetResumeDataOptions { + /** + * Controls whether backfill message history should be fetched. + * Defaults to true to preserve existing /resume behavior. + */ + includeMessageHistory?: boolean; +} + /** * Extract approval requests from an approval_request_message. * Exported for testing parallel tool call handling. @@ -327,8 +335,10 @@ export async function getResumeData( client: Letta, agent: AgentState, conversationId?: string, + options: GetResumeDataOptions = {}, ): Promise { try { + const includeMessageHistory = options.includeMessageHistory ?? true; let inContextMessageIds: string[] | null | undefined; let messages: Message[] = []; @@ -352,7 +362,7 @@ export async function getResumeData( "check-approval", "No in-context messages - no pending approvals", ); - if (isBackfillEnabled()) { + if (includeMessageHistory && isBackfillEnabled()) { try { const backfill = await fetchConversationBackfillMessages( client, @@ -389,7 +399,7 @@ export async function getResumeData( // Fetch message history separately for backfill (desc then reverse for last N chronological) // Wrapped in try/catch so backfill failures don't crash the CLI - if (isBackfillEnabled()) { + if (includeMessageHistory && isBackfillEnabled()) { try { messages = await fetchConversationBackfillMessages( client, @@ -473,7 +483,7 @@ export async function getResumeData( // This filters to only the default conversation's messages (like the ADE does) // Wrapped in try/catch so backfill failures don't crash the CLI (e.g., older servers // may not support conversation_id filter) - if (isBackfillEnabled()) { + if (includeMessageHistory && isBackfillEnabled()) { try { const messagesPage = await client.agents.messages.list(agent.id, { limit: BACKFILL_PAGE_LIMIT, diff --git a/src/headless.ts b/src/headless.ts index 10f3ef5..0c51d26 100644 --- a/src/headless.ts +++ b/src/headless.ts @@ -2742,6 +2742,33 @@ async function runBidirectionalMode( console.log(JSON.stringify(registerResponse)); } else if (subtype === "bootstrap_session_state") { const bootstrapReq = message.request as BootstrapSessionStateRequest; + const { getResumeData } = await import("./agent/check-approval"); + let hasPendingApproval = false; + + try { + // Re-fetch for parity with approval checks elsewhere in headless mode. + const freshAgent = await client.agents.retrieve(agent.id); + const resume = await getResumeData( + client, + freshAgent, + conversationId, + { + includeMessageHistory: false, + }, + ); + hasPendingApproval = (resume.pendingApprovals?.length ?? 0) > 0; + } catch (error) { + // Keep bootstrap non-fatal if approval probe fails on stale resources. + if ( + !(error instanceof APIError) || + (error.status !== 404 && error.status !== 422) + ) { + console.warn( + `[bootstrap] pending-approval probe failed: ${error instanceof Error ? error.message : String(error)}`, + ); + } + } + const bootstrapResp = await handleBootstrapSessionState({ bootstrapReq, sessionContext: { @@ -2754,7 +2781,7 @@ async function runBidirectionalMode( }, requestId: requestId ?? "", client, - hasPendingApproval: false, // TODO: wire approval state when available + hasPendingApproval, }); console.log(JSON.stringify(bootstrapResp)); } else if (subtype === "list_messages") { diff --git a/src/integration-tests/headless-input-format.test.ts b/src/integration-tests/headless-input-format.test.ts index 7c3045e..61f82fc 100644 --- a/src/integration-tests/headless-input-format.test.ts +++ b/src/integration-tests/headless-input-format.test.ts @@ -40,7 +40,7 @@ async function runBidirectional( "stream-json", "--new-agent", "-m", - "haiku", + "sonnet-4.6-low", "--yolo", ...extraArgs, ], @@ -202,6 +202,31 @@ async function runBidirectional( }); } +async function runBidirectionalWithRetry( + inputs: string[], + extraArgs: string[] = [], + timeoutMs = 180000, + retryOnTimeouts = 1, +): Promise { + let attempt = 0; + while (true) { + try { + return await runBidirectional(inputs, extraArgs, timeoutMs); + } catch (error) { + const isTimeoutError = + error instanceof Error && error.message.includes("Timeout after"); + if (!isTimeoutError || attempt >= retryOnTimeouts) { + throw error; + } + attempt += 1; + // CI API latency can cause occasional long-tail timeouts. + console.warn( + `[headless-input-format] retrying after timeout (${attempt}/${retryOnTimeouts})`, + ); + } + } +} + describe("input-format stream-json", () => { test( "initialize control request returns session info", @@ -299,7 +324,7 @@ describe("input-format stream-json", () => { "multi-turn conversation maintains context", async () => { // Multi-turn test needs 2 sequential LLM calls, so allow more time - const objects = (await runBidirectional( + const objects = (await runBidirectionalWithRetry( [ JSON.stringify({ type: "user", @@ -318,6 +343,7 @@ describe("input-format stream-json", () => { ], [], // no extra args 300000, // 300s for 2 sequential LLM calls - CI can be very slow + 1, // one retry for transient API slowness )) as WireMessage[]; // Should have at least two results (one per turn) diff --git a/src/integration-tests/headless-stream-json-format.test.ts b/src/integration-tests/headless-stream-json-format.test.ts index 231ce70..988a159 100644 --- a/src/integration-tests/headless-stream-json-format.test.ts +++ b/src/integration-tests/headless-stream-json-format.test.ts @@ -29,7 +29,7 @@ async function runHeadlessCommand( "stream-json", "--yolo", "-m", - "haiku", + "sonnet-4.6-low", ...extraArgs, ], { diff --git a/src/integration-tests/lazy-approval-recovery.test.ts b/src/integration-tests/lazy-approval-recovery.test.ts index 3820820..5155dc2 100644 --- a/src/integration-tests/lazy-approval-recovery.test.ts +++ b/src/integration-tests/lazy-approval-recovery.test.ts @@ -43,7 +43,7 @@ interface StreamMessage { * Run bidirectional test with custom message handling. * Allows sending messages at specific points in the flow. */ -async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{ +async function runLazyRecoveryTest(timeoutMs = 300000): Promise<{ messages: StreamMessage[]; success: boolean; errorSeen: boolean; @@ -61,7 +61,7 @@ async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{ "stream-json", "--new-agent", "-m", - "haiku", + "sonnet-4.6-low", // NOTE: No --yolo flag - approvals are required ], { @@ -291,7 +291,12 @@ async function runLazyRecoveryTest(timeoutMs = 180000): Promise<{ describe("lazy approval recovery", () => { test("handles concurrent message while approval is pending", async () => { - const result = await runLazyRecoveryTest(); + let result = await runLazyRecoveryTest(); + if (!result.success) { + // Transient API/tool timing can occasionally miss the approval window; + // retry once before failing. + result = await runLazyRecoveryTest(); + } // Log messages for debugging if test fails if (!result.success) { @@ -333,5 +338,5 @@ describe("lazy approval recovery", () => { "Note: No recovery message seen - approval may have been handled before conflict", ); } - }, 180000); // 3 minute timeout for CI + }, 320000); // 5+ minute timeout for slow CI runners }); diff --git a/src/integration-tests/prestream-approval-recovery.test.ts b/src/integration-tests/prestream-approval-recovery.test.ts index c06c466..93a8de4 100644 --- a/src/integration-tests/prestream-approval-recovery.test.ts +++ b/src/integration-tests/prestream-approval-recovery.test.ts @@ -51,7 +51,7 @@ async function startPendingApprovalSession( "--new-agent", "--new", "-m", - "haiku", + "sonnet-4.6-low", ], { cwd: process.cwd(), diff --git a/src/integration-tests/startup-flow.integration.test.ts b/src/integration-tests/startup-flow.integration.test.ts index 76a1bdf..33ebbb2 100644 --- a/src/integration-tests/startup-flow.integration.test.ts +++ b/src/integration-tests/startup-flow.integration.test.ts @@ -15,55 +15,77 @@ async function runCli( options: { timeoutMs?: number; expectExit?: number; + retryOnTimeouts?: number; } = {}, ): Promise<{ stdout: string; stderr: string; exitCode: number | null }> { - const { timeoutMs = 30000, expectExit } = options; + const { timeoutMs = 30000, expectExit, retryOnTimeouts = 1 } = options; - return new Promise((resolve, reject) => { - const proc = spawn("bun", ["run", "dev", ...args], { - cwd: projectRoot, - // Mark as subagent to prevent polluting user's LRU settings - env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" }, - }); + const runOnce = () => + new Promise<{ stdout: string; stderr: string; exitCode: number | null }>( + (resolve, reject) => { + const proc = spawn("bun", ["run", "dev", ...args], { + cwd: projectRoot, + // Mark as subagent to prevent polluting user's LRU settings + env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" }, + }); - let stdout = ""; - let stderr = ""; + let stdout = ""; + let stderr = ""; - proc.stdout?.on("data", (data) => { - stdout += data.toString(); - }); + proc.stdout?.on("data", (data) => { + stdout += data.toString(); + }); - proc.stderr?.on("data", (data) => { - stderr += data.toString(); - }); + proc.stderr?.on("data", (data) => { + stderr += data.toString(); + }); - const timeout = setTimeout(() => { - proc.kill(); - reject( - new Error( - `Timeout after ${timeoutMs}ms. stdout: ${stdout}, stderr: ${stderr}`, - ), - ); - }, timeoutMs); + const timeout = setTimeout(() => { + proc.kill(); + reject( + new Error( + `Timeout after ${timeoutMs}ms. stdout: ${stdout}, stderr: ${stderr}`, + ), + ); + }, timeoutMs); - proc.on("close", (code) => { - clearTimeout(timeout); - if (expectExit !== undefined && code !== expectExit) { - reject( - new Error( - `Expected exit code ${expectExit}, got ${code}. stdout: ${stdout}, stderr: ${stderr}`, - ), - ); - } else { - resolve({ stdout, stderr, exitCode: code }); + proc.on("close", (code) => { + clearTimeout(timeout); + if (expectExit !== undefined && code !== expectExit) { + reject( + new Error( + `Expected exit code ${expectExit}, got ${code}. stdout: ${stdout}, stderr: ${stderr}`, + ), + ); + } else { + resolve({ stdout, stderr, exitCode: code }); + } + }); + + proc.on("error", (err) => { + clearTimeout(timeout); + reject(err); + }); + }, + ); + + let attempt = 0; + while (true) { + try { + return await runOnce(); + } catch (error) { + const isTimeoutError = + error instanceof Error && error.message.includes("Timeout after"); + if (!isTimeoutError || attempt >= retryOnTimeouts) { + throw error; } - }); - - proc.on("error", (err) => { - clearTimeout(timeout); - reject(err); - }); - }); + attempt += 1; + // CI API calls can be transiently slow; retry once to reduce flakiness. + console.warn( + `[startup-flow] retrying after timeout (${attempt}/${retryOnTimeouts}) args=${args.join(" ")}`, + ); + } + } } // ============================================================================ @@ -123,13 +145,13 @@ describe("Startup Flow - Integration", () => { [ "--new-agent", "-m", - "haiku", + "sonnet-4.6-low", "-p", "Say OK and nothing else", "--output-format", "json", ], - { timeoutMs: 120000 }, + { timeoutMs: 180000 }, ); expect(result.exitCode).toBe(0); @@ -141,7 +163,7 @@ describe("Startup Flow - Integration", () => { testAgentId = output.agent_id; }, - { timeout: 130000 }, + { timeout: 190000 }, ); test( @@ -157,13 +179,13 @@ describe("Startup Flow - Integration", () => { "--agent", testAgentId, "-m", - "haiku", + "sonnet-4.6-low", "-p", "Say OK", "--output-format", "json", ], - { timeoutMs: 120000 }, + { timeoutMs: 180000 }, ); expect(result.exitCode).toBe(0); @@ -171,7 +193,7 @@ describe("Startup Flow - Integration", () => { const output = JSON.parse(result.stdout.slice(jsonStart)); expect(output.agent_id).toBe(testAgentId); }, - { timeout: 130000 }, + { timeout: 190000 }, ); test( @@ -189,13 +211,13 @@ describe("Startup Flow - Integration", () => { testAgentId, "--new", "-m", - "haiku", + "sonnet-4.6-low", "-p", "Say CREATED", "--output-format", "json", ], - { timeoutMs: 120000 }, + { timeoutMs: 180000 }, ); expect(createResult.exitCode).toBe(0); const createJsonStart = createResult.stdout.indexOf("{"); @@ -211,13 +233,13 @@ describe("Startup Flow - Integration", () => { "--conversation", realConversationId, "-m", - "haiku", + "sonnet-4.6-low", "-p", "Say OK", "--output-format", "json", ], - { timeoutMs: 120000 }, + { timeoutMs: 180000 }, ); expect(result.exitCode).toBe(0); @@ -238,13 +260,13 @@ describe("Startup Flow - Integration", () => { [ "--new-agent", "-m", - "haiku", + "sonnet-4.6-low", "-p", "Say OK", "--output-format", "json", ], - { timeoutMs: 120000 }, + { timeoutMs: 180000 }, ); expect(bootstrapResult.exitCode).toBe(0); const bootstrapJsonStart = bootstrapResult.stdout.indexOf("{"); @@ -262,13 +284,13 @@ describe("Startup Flow - Integration", () => { "--conversation", "default", "-m", - "haiku", + "sonnet-4.6-low", "-p", "Say OK", "--output-format", "json", ], - { timeoutMs: 120000 }, + { timeoutMs: 180000 }, ); expect(result.exitCode).toBe(0); @@ -277,7 +299,7 @@ describe("Startup Flow - Integration", () => { expect(output.agent_id).toBe(agentIdForTest); expect(output.conversation_id).toBe("default"); }, - { timeout: 130000 }, + { timeout: 190000 }, ); test( @@ -289,13 +311,13 @@ describe("Startup Flow - Integration", () => { "--init-blocks", "none", "-m", - "haiku", + "sonnet-4.6-low", "-p", "Say OK", "--output-format", "json", ], - { timeoutMs: 120000 }, + { timeoutMs: 180000 }, ); expect(result.exitCode).toBe(0); @@ -303,7 +325,7 @@ describe("Startup Flow - Integration", () => { const output = JSON.parse(result.stdout.slice(jsonStart)); expect(output.agent_id).toBeDefined(); }, - { timeout: 130000 }, + { timeout: 190000 }, ); }); diff --git a/src/tests/agent/getResumeData.test.ts b/src/tests/agent/getResumeData.test.ts new file mode 100644 index 0000000..03e3053 --- /dev/null +++ b/src/tests/agent/getResumeData.test.ts @@ -0,0 +1,130 @@ +import { describe, expect, mock, test } from "bun:test"; +import type Letta from "@letta-ai/letta-client"; +import type { AgentState } from "@letta-ai/letta-client/resources/agents/agents"; +import type { Message } from "@letta-ai/letta-client/resources/agents/messages"; +import { getResumeData } from "../../agent/check-approval"; + +function makeAgent(overrides: Partial = {}): AgentState { + return { + id: "agent-test", + message_ids: ["msg-last"], + ...overrides, + } as AgentState; +} + +function makeApprovalMessage(id = "msg-last"): Message { + return { + id, + date: new Date().toISOString(), + message_type: "approval_request_message", + tool_calls: [ + { + tool_call_id: "tool-1", + name: "Bash", + arguments: '{"command":"echo hi"}', + }, + ], + } as unknown as Message; +} + +function makeUserMessage(id = "msg-last"): Message { + return { + id, + date: new Date().toISOString(), + message_type: "user_message", + } as Message; +} + +describe("getResumeData", () => { + test("includeMessageHistory=false still computes pending approvals without backfill (conversation path)", async () => { + const conversationsRetrieve = mock(async () => ({ + in_context_message_ids: ["msg-last"], + })); + const conversationsList = mock(async () => ({ + getPaginatedItems: () => [], + })); + const agentsList = mock(async () => ({ items: [] })); + const messagesRetrieve = mock(async () => [makeApprovalMessage()]); + + const client = { + conversations: { + retrieve: conversationsRetrieve, + messages: { list: conversationsList }, + }, + agents: { messages: { list: agentsList } }, + messages: { retrieve: messagesRetrieve }, + } as unknown as Letta; + + const resume = await getResumeData(client, makeAgent(), "conv-abc", { + includeMessageHistory: false, + }); + + expect(conversationsRetrieve).toHaveBeenCalledTimes(1); + expect(messagesRetrieve).toHaveBeenCalledTimes(1); + expect(conversationsList).toHaveBeenCalledTimes(0); + expect(resume.pendingApprovals).toHaveLength(1); + expect(resume.pendingApprovals[0]?.toolName).toBe("Bash"); + expect(resume.messageHistory).toEqual([]); + }); + + test("includeMessageHistory=false skips default-conversation backfill calls", async () => { + const conversationsRetrieve = mock(async () => ({ + in_context_message_ids: ["msg-last"], + })); + const conversationsList = mock(async () => ({ + getPaginatedItems: () => [], + })); + const agentsList = mock(async () => ({ items: [] })); + const messagesRetrieve = mock(async () => [makeApprovalMessage()]); + + const client = { + conversations: { + retrieve: conversationsRetrieve, + messages: { list: conversationsList }, + }, + agents: { messages: { list: agentsList } }, + messages: { retrieve: messagesRetrieve }, + } as unknown as Letta; + + const resume = await getResumeData( + client, + makeAgent({ message_ids: ["msg-last"] }), + "default", + { includeMessageHistory: false }, + ); + + expect(messagesRetrieve).toHaveBeenCalledTimes(1); + expect(agentsList).toHaveBeenCalledTimes(0); + expect(resume.pendingApprovals).toHaveLength(1); + expect(resume.messageHistory).toEqual([]); + }); + + test("default behavior keeps backfill enabled when options are omitted", async () => { + const conversationsRetrieve = mock(async () => ({ + in_context_message_ids: ["msg-last"], + })); + const conversationsList = mock(async () => ({ + getPaginatedItems: () => [], + })); + const agentsList = mock(async () => ({ + items: [makeUserMessage("msg-a"), makeUserMessage("msg-b")], + })); + const messagesRetrieve = mock(async () => [makeUserMessage()]); + + const client = { + conversations: { + retrieve: conversationsRetrieve, + messages: { list: conversationsList }, + }, + agents: { messages: { list: agentsList } }, + messages: { retrieve: messagesRetrieve }, + } as unknown as Letta; + + const resume = await getResumeData(client, makeAgent(), "default"); + + expect(messagesRetrieve).toHaveBeenCalledTimes(1); + expect(agentsList).toHaveBeenCalledTimes(1); + expect(resume.pendingApprovals).toHaveLength(0); + expect(resume.messageHistory.length).toBeGreaterThan(0); + }); +}); diff --git a/src/tests/headless/bootstrap-pending-approval-wiring.test.ts b/src/tests/headless/bootstrap-pending-approval-wiring.test.ts new file mode 100644 index 0000000..f3d316b --- /dev/null +++ b/src/tests/headless/bootstrap-pending-approval-wiring.test.ts @@ -0,0 +1,23 @@ +import { describe, expect, test } from "bun:test"; +import { readFileSync } from "node:fs"; +import { fileURLToPath } from "node:url"; + +describe("bootstrap pending-approval wiring", () => { + test("bootstrap_session_state probes approvals via getResumeData without backfill", () => { + const headlessPath = fileURLToPath( + new URL("../../headless.ts", import.meta.url), + ); + const source = readFileSync(headlessPath, "utf-8"); + + expect(source).toContain( + 'const { getResumeData } = await import("./agent/check-approval");', + ); + expect(source).toContain("includeMessageHistory: false"); + expect(source).toContain( + "hasPendingApproval = (resume.pendingApprovals?.length ?? 0) > 0;", + ); + expect(source).not.toContain( + "hasPendingApproval: false, // TODO: wire approval state when available", + ); + }); +});