From 6f999fac251f7614e2118f527fbcf1c706caf8f8 Mon Sep 17 00:00:00 2001 From: Devansh Jain <31609257+devanshrj@users.noreply.github.com> Date: Fri, 13 Mar 2026 19:08:24 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20TB=20regression=20=E2=80=94=20full=20ru?= =?UTF-8?q?ns,=20xhigh=20models,=20baselines,=20and=20report=20improvement?= =?UTF-8?q?s=20(#1390)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Letta Code --- .../workflows/terminal-bench-regression.yml | 18 +- benchmarks/terminal_bench/README.md | 55 +++++ benchmarks/terminal_bench/baseline.json | 191 +++++++++++++++++- benchmarks/terminal_bench/letta_code_agent.py | 11 +- .../terminal_bench/regression-tasks.txt | 6 - benchmarks/terminal_bench/report.py | 116 ++++++++--- 6 files changed, 343 insertions(+), 54 deletions(-) create mode 100644 benchmarks/terminal_bench/README.md delete mode 100644 benchmarks/terminal_bench/regression-tasks.txt diff --git a/.github/workflows/terminal-bench-regression.yml b/.github/workflows/terminal-bench-regression.yml index 631d1f8..da245af 100644 --- a/.github/workflows/terminal-bench-regression.yml +++ b/.github/workflows/terminal-bench-regression.yml @@ -2,7 +2,7 @@ name: Terminal-Bench Regression on: schedule: - - cron: "0 8 * * 1,4" # Monday + Thursday 8am UTC + - cron: "0 12 * * 1" # Monday 5am PT (12pm UTC) workflow_dispatch: inputs: model: @@ -10,7 +10,7 @@ on: default: "" concurrency: description: "Max concurrent tasks" - default: "4" + default: "10" env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true @@ -22,7 +22,7 @@ jobs: strategy: fail-fast: false matrix: - model: [sonnet-4.6-low, gpt-5-minimal] + model: [sonnet-4.6-xhigh, gpt-5.3-codex-xhigh] steps: - name: Checkout uses: actions/checkout@v6 @@ -54,21 +54,13 @@ jobs: run: | source .venv/bin/activate - # Build --task-name flags from regression-tasks.txt - TASK_FLAGS="" - while IFS= read -r task; do - [[ "$task" =~ ^#.*$ || -z "$task" ]] && continue - 
TASK_FLAGS="$TASK_FLAGS --task-name $task" - done < benchmarks/terminal_bench/regression-tasks.txt - harbor run \ --dataset terminal-bench@2.0 \ --agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \ --model "${{ matrix.model }}" \ --env modal \ - --n-concurrent ${{ inputs.concurrency || '4' }} \ - --job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" \ - $TASK_FLAGS + --n-concurrent ${{ inputs.concurrency || '10' }} \ + --job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" - name: Upload results artifact if: always() diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md new file mode 100644 index 0000000..ef67389 --- /dev/null +++ b/benchmarks/terminal_bench/README.md @@ -0,0 +1,55 @@ +# Terminal-Bench Regression + +Weekly regression tests for Letta Code on [Terminal-Bench 2.0](https://github.com/laude-institute/terminal-bench-2). + +## How it works + +1. **GitHub Actions** (`.github/workflows/terminal-bench-regression.yml`) runs every Monday at 5am PT +2. **Harbor** orchestrates task execution in **Modal** cloud sandboxes +3. Letta Code is built from source (`main` branch) inside each sandbox +4. Results are compared against `baseline.json` and posted to a GitHub issue +5. 
`@devanshrj` is tagged if any model drops >5% from baseline + +## Models + +| Model | Baseline | +|-------|----------| +| `sonnet-4.6-xhigh` | 38/89 (42.7%) | +| `gpt-5.3-codex-xhigh` | 57/89 (64.0%) | + +## Files + +| File | Description | +|------|-------------| +| `letta_code_agent.py` | Harbor agent — installs and runs Letta Code CLI in sandbox | +| `install-letta-code.sh.j2` | Jinja2 install script (Node.js, Bun, build from source) | +| `baseline.json` | Per-model, per-task pass/fail baselines | +| `report.py` | Parses results, detects regressions, posts GitHub issue | + +## Manual trigger + +```bash +gh workflow run terminal-bench-regression.yml --ref main -f concurrency=10 +``` + +## Required secrets + +- `LETTA_API_KEY` — Letta Cloud API key +- `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` — LLM provider keys +- `MODAL_TOKEN_ID` / `MODAL_TOKEN_SECRET` — Modal sandbox credentials + +## Updating baselines + +Replace `baseline.json` with results from a new run. Format: + +```json +{ + "model-name": { + "pass_rate": 0.427, + "tasks": { + "task-name": true, + ... 
+ } + } +} +``` diff --git a/benchmarks/terminal_bench/baseline.json b/benchmarks/terminal_bench/baseline.json index 9e26dfe..cadcefe 100644 --- a/benchmarks/terminal_bench/baseline.json +++ b/benchmarks/terminal_bench/baseline.json @@ -1 +1,190 @@ -{} \ No newline at end of file +{ + "sonnet-4.6-xhigh": { + "pass_rate": 0.427, + "tasks": { + "adaptive-rejection-sampler": false, + "bn-fit-modify": true, + "break-filter-js-from-html": false, + "build-cython-ext": true, + "build-pmars": true, + "build-pov-ray": true, + "caffe-cifar-10": false, + "cancel-async-tasks": false, + "chess-best-move": false, + "circuit-fibsqrt": false, + "cobol-modernization": false, + "code-from-image": true, + "compile-compcert": true, + "configure-git-webserver": true, + "constraints-scheduling": true, + "count-dataset-tokens": true, + "crack-7z-hash": false, + "custom-memory-heap-crash": false, + "db-wal-recovery": false, + "distribution-search": false, + "dna-assembly": false, + "dna-insert": false, + "extract-elf": true, + "extract-moves-from-video": false, + "feal-differential-cryptanalysis": true, + "feal-linear-cryptanalysis": false, + "filter-js-from-html": false, + "financial-document-processor": false, + "fix-code-vulnerability": true, + "fix-git": true, + "fix-ocaml-gc": true, + "gcode-to-text": true, + "git-leak-recovery": true, + "git-multibranch": true, + "gpt2-codegolf": false, + "headless-terminal": true, + "hf-model-inference": true, + "install-windows-3.11": false, + "kv-store-grpc": true, + "large-scale-text-editing": true, + "largest-eigenval": false, + "llm-inference-batching-scheduler": false, + "log-summary-date-ranges": true, + "mailman": false, + "make-doom-for-mips": false, + "make-mips-interpreter": true, + "mcmc-sampling-stan": true, + "merge-diff-arc-agi-task": true, + "model-extraction-relu-logits": false, + "modernize-scientific-stack": true, + "mteb-leaderboard": false, + "mteb-retrieve": false, + "multi-source-data-merger": true, + "nginx-request-logging": 
true, + "openssl-selfsigned-cert": true, + "overfull-hbox": false, + "password-recovery": true, + "path-tracing-reverse": false, + "path-tracing": false, + "polyglot-c-py": false, + "polyglot-rust-c": false, + "portfolio-optimization": false, + "protein-assembly": false, + "prove-plus-comm": true, + "pypi-server": true, + "pytorch-model-cli": false, + "pytorch-model-recovery": true, + "qemu-alpine-ssh": false, + "qemu-startup": true, + "query-optimize": false, + "raman-fitting": false, + "regex-chess": false, + "regex-log": true, + "reshard-c4-data": false, + "rstan-to-pystan": false, + "sam-cell-seg": false, + "sanitize-git-repo": false, + "schemelike-metacircular-eval": false, + "sparql-university": false, + "sqlite-db-truncate": true, + "sqlite-with-gcov": false, + "torch-pipeline-parallelism": false, + "torch-tensor-parallelism": true, + "train-fasttext": false, + "tune-mjcf": false, + "video-processing": false, + "vulnerable-secret": true, + "winning-avg-corewars": false, + "write-compressor": false + } + }, + "gpt-5.3-codex-xhigh": { + "pass_rate": 0.6404, + "tasks": { + "adaptive-rejection-sampler": false, + "bn-fit-modify": true, + "break-filter-js-from-html": true, + "build-cython-ext": true, + "build-pmars": true, + "build-pov-ray": true, + "caffe-cifar-10": true, + "cancel-async-tasks": false, + "chess-best-move": true, + "circuit-fibsqrt": true, + "cobol-modernization": true, + "code-from-image": true, + "compile-compcert": true, + "configure-git-webserver": true, + "constraints-scheduling": true, + "count-dataset-tokens": true, + "crack-7z-hash": true, + "custom-memory-heap-crash": true, + "db-wal-recovery": false, + "distribution-search": true, + "dna-assembly": false, + "dna-insert": true, + "extract-elf": true, + "extract-moves-from-video": false, + "feal-differential-cryptanalysis": true, + "feal-linear-cryptanalysis": true, + "filter-js-from-html": false, + "financial-document-processor": true, + "fix-code-vulnerability": true, + "fix-git": true, 
+ "fix-ocaml-gc": true, + "gcode-to-text": false, + "git-leak-recovery": true, + "git-multibranch": true, + "gpt2-codegolf": false, + "headless-terminal": true, + "hf-model-inference": true, + "install-windows-3.11": false, + "kv-store-grpc": true, + "large-scale-text-editing": true, + "largest-eigenval": true, + "llm-inference-batching-scheduler": true, + "log-summary-date-ranges": true, + "mailman": false, + "make-doom-for-mips": false, + "make-mips-interpreter": false, + "mcmc-sampling-stan": false, + "merge-diff-arc-agi-task": true, + "model-extraction-relu-logits": true, + "modernize-scientific-stack": true, + "mteb-leaderboard": true, + "mteb-retrieve": false, + "multi-source-data-merger": true, + "nginx-request-logging": true, + "openssl-selfsigned-cert": true, + "overfull-hbox": false, + "password-recovery": true, + "path-tracing-reverse": true, + "path-tracing": true, + "polyglot-c-py": false, + "polyglot-rust-c": false, + "portfolio-optimization": false, + "protein-assembly": true, + "prove-plus-comm": true, + "pypi-server": true, + "pytorch-model-cli": true, + "pytorch-model-recovery": false, + "qemu-alpine-ssh": false, + "qemu-startup": false, + "query-optimize": false, + "raman-fitting": false, + "regex-chess": false, + "regex-log": true, + "reshard-c4-data": false, + "rstan-to-pystan": true, + "sam-cell-seg": false, + "sanitize-git-repo": true, + "schemelike-metacircular-eval": false, + "sparql-university": true, + "sqlite-db-truncate": true, + "sqlite-with-gcov": true, + "torch-pipeline-parallelism": false, + "torch-tensor-parallelism": false, + "train-fasttext": false, + "tune-mjcf": true, + "video-processing": false, + "vulnerable-secret": true, + "winning-avg-corewars": true, + "write-compressor": false + } + } +} diff --git a/benchmarks/terminal_bench/letta_code_agent.py b/benchmarks/terminal_bench/letta_code_agent.py index 0d41cc7..9df6ff4 100644 --- a/benchmarks/terminal_bench/letta_code_agent.py +++ 
b/benchmarks/terminal_bench/letta_code_agent.py @@ -27,6 +27,12 @@ _PROVIDER_SYSTEM_MAP = { } _DEFAULT_SYSTEM = "source-claude" +# Map Letta Code model handles to litellm model names for cost calculation. +_LITELLM_MODEL_MAP: dict[str, str] = { + "sonnet-4.6-xhigh": "anthropic/claude-sonnet-4-6", + "gpt-5.3-codex-xhigh": "openai/gpt-5.3-codex", +} + class LettaCode(BaseInstalledAgent): """Run Letta Code CLI inside a harbor environment.""" @@ -194,9 +200,10 @@ class LettaCode(BaseInstalledAgent): def _populate_usage(self, events_text: str, context: AgentContext) -> None: """Extract usage from events and populate context + write usage.json.""" - model_name = self.model_name or os.environ.get("LETTA_MODEL", "").strip() + raw_model = self.model_name or os.environ.get("LETTA_MODEL", "").strip() + litellm_model = _LITELLM_MODEL_MAP.get(raw_model, raw_model) usage = self._extract_usage_from_events(events_text) - cost = self._calculate_cost(model_name, usage) + cost = self._calculate_cost(litellm_model, usage) context.n_input_tokens = usage["prompt_tokens"] or None context.n_output_tokens = usage["completion_tokens"] or None diff --git a/benchmarks/terminal_bench/regression-tasks.txt b/benchmarks/terminal_bench/regression-tasks.txt deleted file mode 100644 index efe2026..0000000 --- a/benchmarks/terminal_bench/regression-tasks.txt +++ /dev/null @@ -1,6 +0,0 @@ -# Terminal-Bench regression task subset for Letta Code -# These tasks are run on a schedule to detect regressions. -# Criteria: fast (<10 min), diverse capabilities, deterministic. -# Adjust based on known Letta Code pass rates. 
- -cancel-async-tasks diff --git a/benchmarks/terminal_bench/report.py b/benchmarks/terminal_bench/report.py index c3a9900..eefdceb 100644 --- a/benchmarks/terminal_bench/report.py +++ b/benchmarks/terminal_bench/report.py @@ -25,9 +25,9 @@ from datetime import datetime, timezone from pathlib import Path -def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]: - """Parse Harbor job results into {model: {task: passed}}.""" - model_results: dict[str, dict[str, bool]] = {} +def parse_job_results(results_dir: Path) -> dict[str, dict]: + """Parse Harbor job results into {model: {tasks: {task: passed}, cost: {..}}}.""" + model_results: dict[str, dict] = {} for artifact_dir in sorted(results_dir.iterdir()): if not artifact_dir.is_dir(): @@ -41,6 +41,9 @@ def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]: model = dir_name tasks: dict[str, bool] = {} + total_cost = 0.0 + total_prompt_tokens = 0 + total_completion_tokens = 0 # Look for job directories — Harbor puts them under jobs/ jobs_dir = artifact_dir / "jobs" @@ -66,22 +69,40 @@ def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]: try: reward = float(reward_file.read_text().strip()) tasks[task_name] = reward >= 1.0 - continue except (ValueError, OSError): pass - # Fall back to result.json - result_file = trial_dir / "result.json" - if result_file.exists(): + if task_name not in tasks: + # Fall back to result.json + result_file = trial_dir / "result.json" + if result_file.exists(): + try: + result = json.loads(result_file.read_text()) + reward = result.get("reward", result.get("score", 0)) + tasks[task_name] = float(reward) >= 1.0 + except (json.JSONDecodeError, ValueError, OSError): + tasks[task_name] = False + + # Collect cost from usage.json + usage_file = trial_dir / "usage.json" + if usage_file.exists(): try: - result = json.loads(result_file.read_text()) - reward = result.get("reward", result.get("score", 0)) - tasks[task_name] = float(reward) >= 1.0 - except 
(json.JSONDecodeError, ValueError, OSError):
-                tasks[task_name] = False
+                    usage = json.loads(usage_file.read_text())
+                    total_cost += usage.get("cost_usd", 0.0)
+                    total_prompt_tokens += usage.get("prompt_tokens", 0)
+                    total_completion_tokens += usage.get("completion_tokens", 0)
+                except (json.JSONDecodeError, OSError):
+                    pass
 
         if tasks:
-            model_results[model] = tasks
+            model_results[model] = {
+                "tasks": tasks,
+                "cost": {
+                    "cost_usd": round(total_cost, 2),
+                    "prompt_tokens": total_prompt_tokens,
+                    "completion_tokens": total_completion_tokens,
+                },
+            }
 
     return model_results
 
@@ -105,7 +126,7 @@ def load_baseline(baseline_path: Path) -> dict:
 
 
 def build_report(
-    model_results: dict[str, dict[str, bool]],
+    model_results: dict[str, dict],
     baseline: dict,
 ) -> tuple[str, bool]:
     """Build a markdown report and determine if there's a regression.
@@ -120,7 +141,9 @@ def build_report(
 
     has_regression = False
 
-    for model, tasks in sorted(model_results.items()):
+    for model, data in sorted(model_results.items()):
+        tasks = data["tasks"]
+        cost = data.get("cost", {})
         pass_rate = compute_pass_rate(tasks)
         passed = sum(1 for v in tasks.values() if v)
         total = len(tasks)
@@ -133,7 +156,7 @@
         delta_str = ""
         if baseline_rate is not None:
             delta = pass_rate - baseline_rate
-            if delta < -0.10:
+            if delta < -0.05:
                 has_regression = True
                 delta_str = f" | **{delta:+.0%} from baseline** :red_circle:"
             elif delta < 0:
@@ -141,30 +164,55 @@
         elif delta > 0:
             delta_str = f" | {delta:+.0%} from baseline :white_check_mark:"
 
-        lines.append(f"### `{model}` — {passed}/{total} ({pass_rate:.0%}){delta_str}")
+        cost_str = ""
+        cost_usd = cost.get("cost_usd", 0)
+        if cost_usd > 0:
+            cost_str = f" | ${cost_usd:.2f}"
+
+        lines.append(f"<details>")
+        lines.append(f"<summary>{model} — {passed}/{total} ({pass_rate:.0%}){delta_str}{cost_str}</summary>")
         lines.append("")
-        lines.append("| Task | Result | Baseline |")
-        lines.append("|------|--------|----------|")
+
+        # Categorize tasks
+        regressions = []  # was passing, now failing
+        improvements = []  # was failing, now passing
+        new_tasks = []  # not in baseline
 
         for task_name, passed_now in sorted(tasks.items()):
-            result_emoji = ":white_check_mark:" if passed_now else ":x:"
             baseline_val = baseline_tasks.get(task_name)
             if baseline_val is None:
-                baseline_str = "—"
-            elif baseline_val:
-                baseline_str = ":white_check_mark:"
-            else:
-                baseline_str = ":x:"
-
-            # Flag regressions: was passing, now failing
-            regression_marker = ""
-            if baseline_val is True and not passed_now:
-                regression_marker = " **REGRESSION**"
+                new_tasks.append((task_name, passed_now))
+            elif baseline_val and not passed_now:
+                regressions.append(task_name)
                 has_regression = True
+            elif not baseline_val and passed_now:
+                improvements.append(task_name)
 
-            lines.append(f"| {task_name} | {result_emoji}{regression_marker} | {baseline_str} |")
+        if regressions:
+            lines.append(f"**Regressions ({len(regressions)}):**")
+            for t in regressions:
+                lines.append(f"- :red_circle: {t}")
+            lines.append("")
 
+        if improvements:
+            lines.append(f"**Improvements ({len(improvements)}):**")
+            for t in improvements:
+                lines.append(f"- :white_check_mark: {t}")
+            lines.append("")
+
+        if new_tasks:
+            new_passed = sum(1 for _, p in new_tasks if p)
+            lines.append(f"**New tasks ({new_passed}/{len(new_tasks)} passed):**")
+            for t, p in new_tasks:
+                emoji = ":white_check_mark:" if p else ":x:"
+                lines.append(f"- {emoji} {t}")
+            lines.append("")
 
+        if not regressions and not improvements and not new_tasks:
+            lines.append("No changes from baseline.")
+            lines.append("")
+
+        lines.append("</details>")
         lines.append("")
 
     if not model_results:
@@ -179,6 +227,10 @@
     lines.append(f"[Workflow run]({run_url}/{repo}/actions/runs/{run_id})")
     lines.append("")
 
+    if has_regression:
+        lines.append("cc @devanshrj")
+        lines.append("")
+
     return "\n".join(lines), has_regression