From 3c084b52a68f7ce1f9b87ac6cc56907083822e9b Mon Sep 17 00:00:00 2001
From: Charles Packer <packercharles@gmail.com>
Date: Sun, 9 Nov 2025 12:34:25 -0800
Subject: [PATCH] feat: add headless matrix to ci (#86)

---
 .github/workflows/ci.yml                      |  35 +++-
 src/tests/headless-scenario.ts                | 151 ++++++++++++++++++
 .../tests/test-image-send.ts                  |  22 ++-
 3 files changed, 201 insertions(+), 7 deletions(-)
 create mode 100644 src/tests/headless-scenario.ts
 rename test-image-send.ts => src/tests/test-image-send.ts (65%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e65ee34..0ed6172 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -66,8 +66,8 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: bun install
 
-      - name: Run tests
-        run: bun test
+      - name: Run tests (extended timeout)
+        run: bun test --timeout 15000
 
       - name: Build bundle
         run: bun run build
@@ -111,3 +111,34 @@ jobs:
       - name: Pack (no auth available)
         if: ${{ github.event_name != 'push' }}
         run: bun pm pack
+
+  headless:
+    needs: check # start only after the `check` job has succeeded
+    name: Headless / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false # one failing model must not cancel the other matrix entries
+      matrix:
+        model: [gpt-5-minimal, gpt-4.1, default, gemini-pro, glm-4.6, haiku]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.3.0
+
+      - name: Install dependencies
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: bun install
+
+      - name: Run headless scenario (all outputs) # pushes and same-repo PRs only; presumably fork PRs cannot read the secret
+        if: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) }}
+        env:
+          LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }} # when empty, the script logs SKIP and exits 0
+        run: |
+          bun run src/tests/headless-scenario.ts --model "${{ matrix.model }}" --output text --parallel on
+          bun run src/tests/headless-scenario.ts --model "${{ matrix.model }}" --output json --parallel on
+          bun run src/tests/headless-scenario.ts --model "${{ matrix.model }}" --output stream-json --parallel on
diff --git a/src/tests/headless-scenario.ts b/src/tests/headless-scenario.ts
new file mode 100644
index 0000000..7e91e6a
--- /dev/null
+++ b/src/tests/headless-scenario.ts
@@ -0,0 +1,151 @@
+#!/usr/bin/env bun
+/**
+ * Headless scenario test runner
+ *
+ * Runs a single multi-step scenario against the Letta Code CLI (headless) for a given
+ * model and output format. Intended for CI matrix usage.
+ *
+ * Usage:
+ *   bun run src/tests/headless-scenario.ts --model gpt-4.1 --output stream-json --parallel on
+ */
+
+interface Args {
+  model: string; // --model value; required
+  output: "text" | "json" | "stream-json"; // --output value; "text" when omitted
+  parallel: "on" | "off" | "hybrid"; // --parallel value; "on" when omitted
+}
+
+// Reads --model/--output/--parallel (each followed by its value); throws on a missing model or bad value.
+function parseArgs(argv: string[]): Args {
+  const parsed: any = { output: "text", parallel: "on" };
+  for (let idx = 0; idx < argv.length; idx++) {
+    const flag = argv[idx];
+    if (flag === "--model" || flag === "--output" || flag === "--parallel")
+      parsed[flag.slice(2)] = argv[++idx];
+  }
+  if (!parsed.model) throw new Error("Missing --model");
+  if (!["text", "json", "stream-json"].includes(parsed.output))
+    throw new Error(`Invalid --output ${parsed.output}`);
+  if (!["on", "off", "hybrid"].includes(parsed.parallel))
+    throw new Error(`Invalid --parallel ${parsed.parallel}`);
+  return parsed as Args;
+}
+
+// Tests run against Letta Cloud; only LETTA_API_KEY is required.
+// Resolves "skip" (after logging why) when that variable is unset or empty, "ok" otherwise.
+// The `_model` argument is accepted but not consulted.
+async function ensurePrereqs(_model: string): Promise<"ok" | "skip"> {
+  if (process.env.LETTA_API_KEY) return "ok";
+  console.log("SKIP: Missing env LETTA_API_KEY");
+  return "skip";
+}
+
+function scenarioPrompt(): string {
+  return [
+    "I want to test your tool calling abilities (do not ask for any clarifications, this is an automated test suite inside a CI runner, there is no human to assist you). ",
+    "First, call a single web_search to get the weather in SF. ",
+    "Then, try calling two web_searches in parallel. ",
+    "Then, try calling the bash tool to output an echo. ",
+    "Then, try calling three copies of the bash tool in parallel to do 3 parallel echos: echo 'Test1', echo 'Test2', echo 'Test3'. ",
+    "Then finally, try calling 2 bash tools and 1 web_search, in parallel, so three parallel tools. ",
+    "IMPORTANT: If and only if all of the above steps worked as requested, include the word BANANA (uppercase) somewhere in your final response.",
+  ].join(""); // fragments carry their own trailing spaces; BANANA is the success marker
+}
+
+// Runs `bun run dev` headlessly with the scenario prompt; resolves with its stdout and exit code.
+async function runCLI(
+  model: string,
+  output: Args["output"],
+): Promise<{ stdout: string; code: number }> {
+  const cmd = [
+    "bun",
+    "run",
+    "dev",
+    "-p",
+    scenarioPrompt(),
+    "--yolo",
+    "--new",
+    "--output-format",
+    output,
+    "-m",
+    model,
+  ];
+  const proc = Bun.spawn(cmd, { stdout: "pipe", stderr: "pipe" });
+  // Start draining stderr before awaiting stdout so a full stderr pipe cannot stall the child.
+  const stderrText = new Response(proc.stderr).text();
+  const out = await new Response(proc.stdout).text();
+  const [err, code] = await Promise.all([stderrText, proc.exited]);
+  if (code !== 0) console.error("CLI failed:", err || out);
+  return { stdout: out, code };
+}
+
+// Throws "Missing expected output: <needle>" for the first needle that `hay` does not contain.
+function assertContainsAll(hay: string, needles: string[]) {
+  const miss = needles.find((needle) => !hay.includes(needle));
+  if (miss !== undefined) throw new Error(`Missing expected output: ${miss}`);
+}
+
+// Entry point: parses the CLI flags, returns early when prerequisites are missing, runs the
+// scenario once, and checks the final response for BANANA according to the output mode.
+async function main() {
+  const { model, output } = parseArgs(process.argv.slice(2));
+  if ((await ensurePrereqs(model)) === "skip") return;
+
+  const { stdout, code } = await runCLI(model, output);
+  if (code !== 0) process.exit(code);
+
+  try {
+    if (output === "text") {
+      assertContainsAll(stdout, ["BANANA"]);
+    } else if (output === "json") {
+      let obj: { result?: unknown } | null = null;
+      try {
+        obj = JSON.parse(stdout);
+      } catch (e) {
+        throw new Error(`Invalid JSON output: ${(e as Error).message}`);
+      }
+      // Checked outside the parse guard so a missing BANANA is not reported as invalid JSON.
+      assertContainsAll(String(obj?.result ?? ""), ["BANANA"]);
+    } else if (output === "stream-json") {
+      // stream-json prints one JSON object per line; use the first event typed "result".
+      let evt: { result?: unknown } | undefined;
+      for (const line of stdout.split(/\r?\n/).filter(Boolean)) {
+        try {
+          const parsed = JSON.parse(line);
+          if (parsed?.type === "result") {
+            evt = parsed;
+            break;
+          }
+        } catch {
+          // Lines that are not valid JSON are skipped.
+        }
+      }
+      if (!evt) throw new Error("No final result event in stream-json");
+      assertContainsAll(String(evt.result ?? ""), ["BANANA"]);
+    }
+
+    console.log(`OK: ${model} / ${output}`);
+  } catch (e) {
+    // Dump full stdout to aid debugging
+    console.error(`\n===== BEGIN STDOUT (${model} / ${output}) =====`);
+    console.error(stdout);
+    console.error(`===== END STDOUT (${model} / ${output}) =====\n`);
+
+    if (output === "stream-json") {
+      const lines = stdout.split(/\r?\n/).filter(Boolean);
+      const tail = lines.slice(-50).join("\n");
+      console.error(
+        "----- stream-json tail (last 50 lines) -----\n" +
+          tail +
+          "\n---------------------------------------------",
+      );
+    }
+
+    throw e;
+  }
+}
+
+main().catch((failure) => {
+  console.error(String(failure?.stack || failure)); // prefer the stack trace when present
+  process.exit(1); // non-zero exit so the calling CI step fails
+});
diff --git a/test-image-send.ts b/src/tests/test-image-send.ts
similarity index 65%
rename from test-image-send.ts
rename to src/tests/test-image-send.ts
index dd00c42..e1d4ad1 100644
--- a/test-image-send.ts
+++ b/src/tests/test-image-send.ts
@@ -1,10 +1,11 @@
-import { createAgent } from "./src/agent/create";
-import { sendMessageStream } from "./src/agent/message";
 import { readFileSync, writeFileSync } from "node:fs";
+import { createAgent } from "../agent/create";
+import { sendMessageStream } from "../agent/message";
 
 async function main() {
   // Create a simple test image (1x1 red PNG)
-  const testImageBase64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==";
+  const testImageBase64 =
+    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==";
   const testImagePath = "/tmp/test.png";
   writeFileSync(testImagePath, Buffer.from(testImageBase64, "base64"));
   console.log("Created test image at", testImagePath);
@@ -44,8 +45,19 @@ async function main() {
   let fullResponse = "";
   for await (const chunk of stream) {
     if (chunk.message_type === "assistant_message" && chunk.content) {
-      fullResponse += chunk.content;
-      process.stdout.write(chunk.content);
+      // Handle both string and array content
+      let contentText = "";
+      if (typeof chunk.content === "string") {
+        contentText = chunk.content;
+      } else if (Array.isArray(chunk.content)) {
+        // Extract text from content array
+        contentText = chunk.content
+          .filter((item) => item.type === "text")
+          .map((item) => ("text" in item ? item.text : ""))
+          .join("");
+      }
+      fullResponse += contentText;
+      process.stdout.write(contentText);
     }
   }
   if (!fullResponse) {