219 lines
6.4 KiB
TypeScript
219 lines
6.4 KiB
TypeScript
#!/usr/bin/env bun
|
|
/**
 * Headless scenario test runner
 *
 * Runs a single multi-step scenario against the Letta Code CLI (headless) for a given
 * model and output format. Intended for CI matrix usage.
 *
 * Usage:
 *   bun tsx src/tests/headless-scenario.ts --model gpt-4.1 --output stream-json --parallel on
 */
|
|
|
|
/** Command-line options accepted by this runner, after validation. */
type Args = {
  /** Value of `--model`; forwarded to the CLI via `-m`. */
  model: string;
  /** Value of `--output`; forwarded to the CLI via `--output-format`. */
  output:
    | "text"
    | "json"
    | "stream-json";
  /** Value of `--parallel`. */
  parallel:
    | "on"
    | "off"
    | "hybrid";
};
|
|
|
|
/**
 * Parses the runner's command-line flags.
 *
 * Recognised flags are `--model <name>` (required), `--output <format>`
 * (defaults to "text") and `--parallel <mode>` (defaults to "on"). Any other
 * token is ignored.
 *
 * @param argv Arguments after the script path (e.g. `process.argv.slice(2)`).
 * @returns The validated options.
 * @throws Error when `--model` is absent or has no value, or when `--output` /
 *   `--parallel` is given without one of its allowed values.
 */
function parseArgs(argv: string[]): Args {
  let model: string | undefined;
  let output: string | undefined = "text";
  let parallel: string | undefined = "on";

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    if (flag !== "--model" && flag !== "--output" && flag !== "--parallel") {
      continue;
    }

    // A following token that is itself a flag is not a value. Leaving it
    // unconsumed means `--model --output json` reports the missing model
    // instead of silently using "--output" as the model name.
    const next = argv[i + 1];
    const value =
      next !== undefined && !next.startsWith("--") ? next : undefined;
    if (value !== undefined) i++;

    if (flag === "--model") model = value;
    else if (flag === "--output") output = value;
    else parallel = value;
  }

  if (!model) throw new Error("Missing --model");
  // The comparisons below narrow the plain strings to the literal unions,
  // so no type assertion is needed on the returned object.
  if (output !== "text" && output !== "json" && output !== "stream-json")
    throw new Error(`Invalid --output ${output}`);
  if (parallel !== "on" && parallel !== "off" && parallel !== "hybrid")
    throw new Error(`Invalid --parallel ${parallel}`);

  return { model, output, parallel };
}
|
|
|
|
// Tests run against Letta Cloud; only LETTA_API_KEY is required.
/**
 * Checks that the environment needed to run the scenario is present.
 *
 * @param _model Unused; the check does not depend on the model.
 * @returns "ok" when `LETTA_API_KEY` is set to a non-empty value, otherwise
 *   "skip" (after logging a SKIP line to stdout).
 */
async function ensurePrereqs(_model: string): Promise<"ok" | "skip"> {
  const hasApiKey = Boolean(process.env.LETTA_API_KEY);
  if (hasApiKey) return "ok";

  console.log("SKIP: Missing env LETTA_API_KEY");
  return "skip";
}
|
|
|
|
/**
 * Builds the fixed prompt sent to the CLI.
 *
 * The prompt walks the agent through a series of single and parallel tool
 * calls and asks it to include the word BANANA in its final response only if
 * every step worked.
 *
 * @returns The prompt as one string; each step already ends with its own
 *   trailing space, so the pieces are joined without a separator.
 */
function scenarioPrompt(): string {
  const steps = [
    "I want to test your tool calling abilities (do not ask for any clarifications, this is an automated test suite inside a CI runner, there is no human to assist you). ",
    "First, call a single conversation_search to search for 'hello'. ",
    "Then, try calling two conversation_searches in parallel (search for 'test' and 'hello'). ",
    "Then, try running a shell command to output an echo (use whatever shell/bash tool is available). ",
    "Then, try running three shell commands in parallel to do 3 parallel echos: echo 'Test1', echo 'Test2', echo 'Test3'. ",
    "Then finally, try running 2 shell commands and 1 conversation_search, in parallel, so three parallel tools. ",
    "IMPORTANT: If and only if all of the above steps worked as requested, include the word BANANA (uppercase) somewhere in your final response.",
  ];
  return steps.join("");
}
|
|
|
|
/**
 * Spawns the CLI via `bun run dev` with the scenario prompt and collects its
 * output.
 *
 * @param model Model identifier passed to the CLI with `-m`.
 * @param output Output format passed to the CLI with `--output-format`.
 * @returns The child's full stdout and its exit code. A non-zero exit is
 *   logged to stderr (preferring the child's stderr, falling back to its
 *   stdout) but is not thrown; the caller decides what to do with `code`.
 */
async function runCLI(
  model: string,
  output: Args["output"],
): Promise<{ stdout: string; code: number }> {
  const cmd = [
    "bun",
    "run",
    "dev",
    "-p",
    scenarioPrompt(),
    "--yolo",
    "--new-agent",
    "--base-tools",
    "memory,web_search,fetch_webpage,conversation_search",
    "--output-format",
    output,
    "-m",
    model,
  ];
  // Mark as subagent to prevent polluting user's LRU settings
  const proc = Bun.spawn(cmd, {
    stdout: "pipe",
    stderr: "pipe",
    env: { ...process.env, LETTA_CODE_AGENT_ROLE: "subagent" },
  });

  // Drain both pipes and wait for exit concurrently. Reading stdout to EOF
  // before touching stderr can stall if the child fills the stderr pipe
  // while it is still writing to stdout.
  const [out, err, code] = await Promise.all([
    new Response(proc.stdout).text(),
    new Response(proc.stderr).text(),
    proc.exited,
  ]);

  if (code !== 0) {
    console.error("CLI failed:", err || out);
  }
  return { stdout: out, code };
}
|
|
|
|
/**
 * Strings that must appear in the CLI's final output for a run to pass.
 * The scenario prompt asks the agent to emit BANANA only when every step worked.
 */
const REQUIRED_MARKERS = ["BANANA"];

/** Total number of times the scenario is run before the failure is reported. */
const MAX_ATTEMPTS = 2;
|
|
|
|
/**
 * Verifies that every needle occurs somewhere in the haystack.
 *
 * @param hay Text to search.
 * @param needles Substrings that must all be present.
 * @throws Error naming the first needle (in array order) that is absent.
 */
function assertContainsAll(hay: string, needles: string[]) {
  const firstMissing = needles.find((needle) => !hay.includes(needle));
  if (firstMissing !== undefined) {
    throw new Error(`Missing expected output: ${firstMissing}`);
  }
}
|
|
|
|
/**
 * Collects the assistant-visible text from stream-json output.
 *
 * Each non-blank line is parsed as one JSON event. Text is taken from
 * `content` of events with `type: "message"` and
 * `message_type: "assistant_message"`, and from `result` of events with
 * `type: "result"`, in both cases only when the field is a string.
 *
 * @param stdout Raw CLI stdout, one JSON event per line (LF or CRLF).
 * @returns All collected text concatenated in line order with no separator;
 *   an empty string when nothing matched.
 */
function extractStreamJsonAssistantText(stdout: string): string {
  type StreamEvent = {
    type?: string;
    message_type?: string;
    content?: unknown;
    result?: unknown;
  };

  let collected = "";
  const eventLines = stdout
    .split(/\r?\n/)
    .filter((candidate) => candidate.trim() !== "");

  for (const raw of eventLines) {
    try {
      const evt = JSON.parse(raw) as StreamEvent;
      const isAssistantMessage =
        evt.type === "message" && evt.message_type === "assistant_message";

      if (isAssistantMessage && typeof evt.content === "string") {
        collected += evt.content;
      }
      if (evt.type === "result" && typeof evt.result === "string") {
        collected += evt.result;
      }
    } catch {
      // Ignore malformed lines; validation will fail if we never find the marker.
    }
  }
  return collected;
}
|
|
|
|
/**
 * Checks that the CLI output for one run contains every required marker.
 *
 * - "text": the raw stdout is searched.
 * - "json": stdout is parsed as a single JSON value and its `result` field
 *   (stringified, empty when absent) is searched.
 * - "stream-json": the text gathered by `extractStreamJsonAssistantText` is
 *   searched.
 *
 * @param stdout Raw stdout captured from the CLI.
 * @param output Output format the CLI was asked to produce.
 * @throws Error "Invalid JSON output: …" when "json" output does not parse,
 *   "No assistant/result content found in stream-json output" when the stream
 *   yields no text, or the error from `assertContainsAll` when a marker is
 *   missing.
 */
function validateOutput(stdout: string, output: Args["output"]) {
  if (output === "text") {
    assertContainsAll(stdout, REQUIRED_MARKERS);
    return;
  }

  if (output === "json") {
    let parsed: { result?: unknown } | null;
    // Only the parse is guarded: a missing marker must surface as
    // "Missing expected output", not be relabelled as a JSON syntax problem.
    try {
      parsed = JSON.parse(stdout) as { result?: unknown } | null;
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Invalid JSON output: ${reason}`);
    }
    const result = String(parsed?.result ?? "");
    assertContainsAll(result, REQUIRED_MARKERS);
    return;
  }

  const streamText = extractStreamJsonAssistantText(stdout);
  if (!streamText) {
    throw new Error("No assistant/result content found in stream-json output");
  }
  assertContainsAll(streamText, REQUIRED_MARKERS);
}
|
|
|
|
/**
 * Entry point: runs the scenario up to MAX_ATTEMPTS times and reports the outcome.
 *
 * Returns without doing anything when prerequisites are missing. An attempt
 * passes when the CLI exits with code 0 and its output validates; the first
 * passing attempt logs an OK line and returns. Between failed attempts the
 * failure is logged and the runner waits 500 ms.
 *
 * When every attempt fails: if the last CLI exit code was non-zero the process
 * exits with that code; otherwise the captured stdout is dumped to stderr
 * (plus the last 50 non-empty lines for stream-json) and the last validation
 * error is rethrown.
 *
 * Only `model` and `output` are read from the parsed arguments here.
 */
async function main() {
  const { model, output } = parseArgs(process.argv.slice(2));
  if ((await ensurePrereqs(model)) === "skip") return;

  let stdout = "";
  let code = 0;
  let lastError: Error | null = null;

  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    const run = await runCLI(model, output);
    stdout = run.stdout;
    code = run.code;

    if (code === 0) {
      try {
        validateOutput(stdout, output);
        console.log(`OK: ${model} / ${output}`);
        return;
      } catch (error) {
        lastError = error as Error;
      }
    } else {
      lastError = new Error(`CLI exited with code ${code}`);
    }

    // Nothing to announce or wait for after the final attempt.
    if (attempt >= MAX_ATTEMPTS) break;
    console.error(
      `[headless-scenario] attempt ${attempt}/${MAX_ATTEMPTS} failed for ${model} / ${output}: ${lastError?.message ?? "unknown error"}`,
    );
    await Bun.sleep(500);
  }

  // A non-zero CLI exit is propagated as this process's exit code.
  if (code !== 0) {
    process.exit(code);
  }
  if (!lastError) return;

  // Dump full stdout to aid debugging
  console.error(`\n===== BEGIN STDOUT (${model} / ${output}) =====`);
  console.error(stdout);
  console.error(`===== END STDOUT (${model} / ${output}) =====\n`);

  if (output === "stream-json") {
    const nonEmptyLines = stdout.split(/\r?\n/).filter(Boolean);
    const tail = nonEmptyLines.slice(-50).join("\n");
    console.error(
      "----- stream-json tail (last 50 lines) -----\n" +
        tail +
        "\n---------------------------------------------",
    );
  }

  throw lastError;
}
|
|
|
|
// Run the scenario; on any rejection print the stack (or the value itself
// when there is no stack) and exit with status 1.
main().catch((failure) => {
  const report = failure?.stack || failure;
  console.error(String(report));
  process.exit(1);
});
|