letta-code/src/tests/headless-scenario.ts

#!/usr/bin/env bun
/**
 * Headless scenario test runner
 *
 * Runs a single multi-step scenario against the LeTTA Code CLI (headless) for a given
 * model and output format. Intended for CI matrix usage.
 *
 * Usage:
 *   bun tsx src/tests/headless-scenario.ts --model gpt-4.1 --output stream-json --parallel on
 */

type Args = {
  model: string;
  output: "text" | "json" | "stream-json";
  parallel: "on" | "off" | "hybrid";
};

function parseArgs(argv: string[]): Args {
  const args: {
    model?: string;
    output: Args["output"];
    parallel: Args["parallel"];
  } = {
    output: "text",
    parallel: "on",
  };
  for (let i = 0; i < argv.length; i++) {
    const v = argv[i];
    if (v === "--model") args.model = argv[++i];
    else if (v === "--output") args.output = argv[++i] as Args["output"];
    else if (v === "--parallel") args.parallel = argv[++i] as Args["parallel"];
  }
  if (!args.model) throw new Error("Missing --model");
  if (!["text", "json", "stream-json"].includes(args.output))
    throw new Error(`Invalid --output ${args.output}`);
  if (!["on", "off", "hybrid"].includes(args.parallel))
    throw new Error(`Invalid --parallel ${args.parallel}`);
  return args as Args;
}

// Tests run against Letta Cloud; only LETTA_API_KEY is required.
async function ensurePrereqs(_model: string): Promise<"ok" | "skip"> {
  if (!process.env.LETTA_API_KEY) {
    console.log("SKIP: Missing env LETTA_API_KEY");
    return "skip";
  }
  return "ok";
}

function scenarioPrompt(): string {
  return (
    "I want to test your tool calling abilities (do not ask for any clarifications, this is an automated test suite inside a CI runner, there is no human to assist you). " +
    "First, call a single conversation_search to search for 'hello'. " +
    "Then, try calling two conversation_searches in parallel (search for 'test' and 'hello'). " +
    "Then, try running a shell command to output an echo (use whatever shell/bash tool is available). " +
    "Then, try running three shell commands in parallel to do 3 parallel echos: echo 'Test1', echo 'Test2', echo 'Test3'. " +
    "Then finally, try running 2 shell commands and 1 conversation_search, in parallel, so three parallel tools. " +
    "IMPORTANT: If and only if all of the above steps worked as requested, include the word BANANA (uppercase) somewhere in your final response."
  );
}

async function runCLI(
  model: string,
  output: Args["output"],
): Promise<{ stdout: string; code: number }> {
  const cmd = [
    "bun",
    "run",
    "dev",
    "-p",
    scenarioPrompt(),
    "--yolo",
    "--new-agent",
    "--base-tools",
    "memory,web_search,fetch_webpage,conversation_search",
    "--output-format",
    output,
    "-m",
    model,
  ];
  // Mark as subagent to prevent polluting user's LRU settings
  const proc = Bun.spawn(cmd, {
    stdout: "pipe",
    stderr: "pipe",
    env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
  });
  const out = await new Response(proc.stdout).text();
  const err = await new Response(proc.stderr).text();
  const code = await proc.exited;
  if (code !== 0) {
    console.error("CLI failed:", err || out);
  }
  return { stdout: out, code };
}

const REQUIRED_MARKERS = ["BANANA"];
const MAX_ATTEMPTS = 2;

function assertContainsAll(hay: string, needles: string[]) {
  for (const n of needles) {
    if (!hay.includes(n)) throw new Error(`Missing expected output: ${n}`);
  }
}

function extractStreamJsonAssistantText(stdout: string): string {
  const parts: string[] = [];
  for (const line of stdout.split(/\r?\n/)) {
    if (!line.trim()) continue;
    try {
      const event = JSON.parse(line) as {
        type?: string;
        message_type?: string;
        content?: unknown;
        result?: unknown;
      };
      if (
        event.type === "message" &&
        event.message_type === "assistant_message" &&
        typeof event.content === "string"
      ) {
        parts.push(event.content);
      }
      if (event.type === "result" && typeof event.result === "string") {
        parts.push(event.result);
      }
    } catch {
      // Ignore malformed lines; validation will fail if we never find the marker.
    }
  }
  return parts.join("");
}

function validateOutput(stdout: string, output: Args["output"]) {
  if (output === "text") {
    assertContainsAll(stdout, REQUIRED_MARKERS);
    return;
  }

  if (output === "json") {
    try {
      const obj = JSON.parse(stdout);
      const result = String(obj?.result ?? "");
      assertContainsAll(result, REQUIRED_MARKERS);
      return;
    } catch (e) {
      throw new Error(`Invalid JSON output: ${(e as Error).message}`);
    }
  }

  const streamText = extractStreamJsonAssistantText(stdout);
  if (!streamText) {
    throw new Error("No assistant/result content found in stream-json output");
  }
  assertContainsAll(streamText, REQUIRED_MARKERS);
}

async function main() {
  const { model, output } = parseArgs(process.argv.slice(2));
  const prereq = await ensurePrereqs(model);
  if (prereq === "skip") return;

  let stdout = "";
  let code = 0;
  let lastError: Error | null = null;

  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt += 1) {
    ({ stdout, code } = await runCLI(model, output));
    if (code !== 0) {
      lastError = new Error(`CLI exited with code ${code}`);
    } else {
      try {
        validateOutput(stdout, output);
        console.log(`OK: ${model} / ${output}`);
        return;
      } catch (error) {
        lastError = error as Error;
      }
    }

    if (attempt < MAX_ATTEMPTS) {
      console.error(
        `[headless-scenario] attempt ${attempt}/${MAX_ATTEMPTS} failed for ${model} / ${output}: ${lastError?.message ?? "unknown error"}`,
      );
      await Bun.sleep(500);
    }
  }

  try {
    if (code !== 0) {
      process.exit(code);
    }
    if (lastError) {
      throw lastError;
    }
  } catch (e) {
    // Dump full stdout to aid debugging
    console.error(`\n===== BEGIN STDOUT (${model} / ${output}) =====`);
    console.error(stdout);
    console.error(`===== END STDOUT (${model} / ${output}) =====\n`);

    if (output === "stream-json") {
      const lines = stdout.split(/\r?\n/).filter(Boolean);
      const tail = lines.slice(-50).join("\n");
      console.error(
        "----- stream-json tail (last 50 lines) -----\n" +
          tail +
          "\n---------------------------------------------",
      );
    }

    throw e;
  }
}

main().catch((e) => {
  console.error(String(e?.stack || e));
  process.exit(1);
});