From 3c084b52a68f7ce1f9b87ac6cc56907083822e9b Mon Sep 17 00:00:00 2001
From: Charles Packer
Date: Sun, 9 Nov 2025 12:34:25 -0800
Subject: [PATCH] feat: add headless matrix to ci (#86)

---
 .github/workflows/ci.yml       |  35 +++-
 src/tests/headless-scenario.ts | 182 ++++++++++++++++++
 .../tests/test-image-send.ts   |  22 ++-
 3 files changed, 232 insertions(+), 7 deletions(-)
 create mode 100644 src/tests/headless-scenario.ts
 rename test-image-send.ts => src/tests/test-image-send.ts (65%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e65ee34..0ed6172 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -66,8 +66,8 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: bun install
 
-      - name: Run tests
-        run: bun test
+      - name: Run tests (extended timeout)
+        run: bun test --timeout 15000
 
       - name: Build bundle
         run: bun run build
@@ -111,3 +111,34 @@ jobs:
       - name: Pack (no auth available)
         if: ${{ github.event_name != 'push' }}
         run: bun pm pack
+
+  headless:
+    needs: check
+    name: Headless / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        model: [gpt-5-minimal, gpt-4.1, default, gemini-pro, glm-4.6, haiku]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.3.0
+
+      - name: Install dependencies
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: bun install
+
+      - name: Run headless scenario (all outputs)
+        if: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) }}
+        env:
+          LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
+        run: |
+          bun run src/tests/headless-scenario.ts --model "${{ matrix.model }}" --output text --parallel on
+          bun run src/tests/headless-scenario.ts --model "${{ matrix.model }}" --output json --parallel on
+          bun run src/tests/headless-scenario.ts --model "${{ matrix.model }}" --output stream-json --parallel on
diff --git a/src/tests/headless-scenario.ts b/src/tests/headless-scenario.ts
new file mode 100644
index 0000000..7e91e6a
--- /dev/null
+++ b/src/tests/headless-scenario.ts
@@ -0,0 +1,182 @@
+#!/usr/bin/env bun
+/**
+ * Headless scenario test runner
+ *
+ * Runs a single multi-step scenario against the Letta Code CLI (headless) for a given
+ * model and output format. Intended for CI matrix usage.
+ *
+ * Usage:
+ *   bun run src/tests/headless-scenario.ts --model gpt-4.1 --output stream-json --parallel on
+ */
+
+const OUTPUT_FORMATS = ["text", "json", "stream-json"] as const;
+const PARALLEL_MODES = ["on", "off", "hybrid"] as const;
+
+type Args = {
+  model: string;
+  output: (typeof OUTPUT_FORMATS)[number];
+  parallel: (typeof PARALLEL_MODES)[number];
+};
+
+/** Type guard: true when value is one of the allowed literals. */
+function isOneOf<T extends string>(
+  allowed: readonly T[],
+  value: string | undefined,
+): value is T {
+  return value !== undefined && (allowed as readonly string[]).includes(value);
+}
+
+/**
+ * Parses --model, --output and --parallel from argv.
+ * --output defaults to "text" and --parallel to "on"; other arguments are ignored.
+ * Throws when --model is missing or a value is outside its allowed set.
+ */
+function parseArgs(argv: string[]): Args {
+  let model: string | undefined;
+  let output: string | undefined = "text";
+  let parallel: string | undefined = "on";
+  for (let i = 0; i < argv.length; i++) {
+    const v = argv[i];
+    if (v === "--model") model = argv[++i];
+    else if (v === "--output") output = argv[++i];
+    else if (v === "--parallel") parallel = argv[++i];
+  }
+  if (!model) throw new Error("Missing --model");
+  if (!isOneOf(OUTPUT_FORMATS, output))
+    throw new Error(`Invalid --output ${output}`);
+  if (!isOneOf(PARALLEL_MODES, parallel))
+    throw new Error(`Invalid --parallel ${parallel}`);
+  return { model, output, parallel };
+}
+
+// Tests run against Letta Cloud; only LETTA_API_KEY is required.
+async function ensurePrereqs(_model: string): Promise<"ok" | "skip"> {
+  if (!process.env.LETTA_API_KEY) {
+    console.log("SKIP: Missing env LETTA_API_KEY");
+    return "skip";
+  }
+  return "ok";
+}
+
+function scenarioPrompt(): string {
+  return (
+    "I want to test your tool calling abilities (do not ask for any clarifications, this is an automated test suite inside a CI runner, there is no human to assist you). " +
+    "First, call a single web_search to get the weather in SF. " +
+    "Then, try calling two web_searches in parallel. " +
+    "Then, try calling the bash tool to output an echo. " +
+    "Then, try calling three copies of the bash tool in parallel to do 3 parallel echos: echo 'Test1', echo 'Test2', echo 'Test3'. " +
+    "Then finally, try calling 2 bash tools and 1 web_search, in parallel, so three parallel tools. " +
+    "IMPORTANT: If and only if all of the above steps worked as requested, include the word BANANA (uppercase) somewhere in your final response."
+  );
+}
+
+/**
+ * Runs the CLI once via "bun run dev" with the scenario prompt.
+ * Returns the captured stdout and the exit code; on a non-zero exit, stderr (or stdout
+ * when stderr is empty) is logged.
+ */
+async function runCLI(
+  model: string,
+  output: Args["output"],
+): Promise<{ stdout: string; code: number }> {
+  const cmd = [
+    "bun",
+    "run",
+    "dev",
+    "-p",
+    scenarioPrompt(),
+    "--yolo",
+    "--new",
+    "--output-format",
+    output,
+    "-m",
+    model,
+  ];
+  const proc = Bun.spawn(cmd, { stdout: "pipe", stderr: "pipe" });
+  // Drain both pipes concurrently; reading them one after the other can stall the child
+  // if it fills the pipe that is not being read yet.
+  const [out, err, code] = await Promise.all([
+    new Response(proc.stdout).text(),
+    new Response(proc.stderr).text(),
+    proc.exited,
+  ]);
+  if (code !== 0) {
+    console.error("CLI failed:", err || out);
+  }
+  return { stdout: out, code };
+}
+
+function assertContainsAll(hay: string, needles: string[]) {
+  for (const n of needles) {
+    if (!hay.includes(n)) throw new Error(`Missing expected output: ${n}`);
+  }
+}
+
+/**
+ * Returns the text that must contain the success marker for the given output mode.
+ * Throws when the output is not valid JSON ("json") or has no result event ("stream-json").
+ */
+function extractResult(stdout: string, output: Args["output"]): string {
+  if (output === "text") return stdout;
+  if (output === "json") {
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(stdout);
+    } catch (e) {
+      // Only parse failures are reported as invalid JSON; a missing marker is not.
+      const reason = e instanceof Error ? e.message : String(e);
+      throw new Error(`Invalid JSON output: ${reason}`);
+    }
+    return String((parsed as { result?: unknown } | null)?.result ?? "");
+  }
+  // stream-json prints one JSON object per line; scan backwards for the final result event
+  const lines = stdout.split(/\r?\n/).filter(Boolean);
+  for (const line of lines.reverse()) {
+    try {
+      const evt = JSON.parse(line);
+      if (evt?.type === "result") return String(evt?.result ?? "");
+    } catch {
+      // Not a JSON line; keep scanning
+    }
+  }
+  throw new Error("No final result event in stream-json");
+}
+
+async function main() {
+  // NOTE: --parallel is validated by parseArgs but is not forwarded to the CLI here.
+  const { model, output } = parseArgs(process.argv.slice(2));
+  const prereq = await ensurePrereqs(model);
+  if (prereq === "skip") return;
+
+  const { stdout, code } = await runCLI(model, output);
+  if (code !== 0) {
+    process.exit(code);
+  }
+
+  try {
+    assertContainsAll(extractResult(stdout, output), ["BANANA"]);
+    console.log(`OK: ${model} / ${output}`);
+  } catch (e) {
+    // Dump full stdout to aid debugging
+    console.error(`\n===== BEGIN STDOUT (${model} / ${output}) =====`);
+    console.error(stdout);
+    console.error(`===== END STDOUT (${model} / ${output}) =====\n`);
+
+    if (output === "stream-json") {
+      const lines = stdout.split(/\r?\n/).filter(Boolean);
+      const tail = lines.slice(-50).join("\n");
+      console.error(
+        "----- stream-json tail (last 50 lines) -----\n" +
+          tail +
+          "\n---------------------------------------------",
+      );
+    }
+
+    throw e;
+  }
+}
+
+main().catch((e) => {
+  console.error(String(e?.stack || e));
+  process.exit(1);
+});
diff --git a/test-image-send.ts b/src/tests/test-image-send.ts
similarity index 65%
rename from test-image-send.ts
rename to src/tests/test-image-send.ts
index dd00c42..e1d4ad1 100644
--- a/test-image-send.ts
+++ b/src/tests/test-image-send.ts
@@ -1,10 +1,11 @@
-import { createAgent } from "./src/agent/create";
-import { sendMessageStream } from "./src/agent/message";
 import { readFileSync, writeFileSync } from "node:fs";
+import { createAgent } from "../agent/create";
+import { sendMessageStream } from "../agent/message";
 
 async function main() {
   // Create a simple test image (1x1 red PNG)
-  const testImageBase64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==";
+  const testImageBase64 =
+    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==";
   const testImagePath = "/tmp/test.png";
   writeFileSync(testImagePath, Buffer.from(testImageBase64, "base64"));
   console.log("Created test image at", testImagePath);
@@ -44,8 +45,19 @@ async function main() {
   let fullResponse = "";
   for await (const chunk of stream) {
     if (chunk.message_type === "assistant_message" && chunk.content) {
-      fullResponse += chunk.content;
-      process.stdout.write(chunk.content);
+      // Handle both string and array content
+      let contentText = "";
+      if (typeof chunk.content === "string") {
+        contentText = chunk.content;
+      } else if (Array.isArray(chunk.content)) {
+        // Extract text from content array
+        contentText = chunk.content
+          .filter((item) => item.type === "text")
+          .map((item) => ("text" in item ? item.text : ""))
+          .join("");
+      }
+      fullResponse += contentText;
+      process.stdout.write(contentText);
     }
   }
   if (!fullResponse) {