From 6f999fac251f7614e2118f527fbcf1c706caf8f8 Mon Sep 17 00:00:00 2001 From: Devansh Jain <31609257+devanshrj@users.noreply.github.com> Date: Fri, 13 Mar 2026 19:08:24 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20TB=20regression=20=E2=80=94=20full=20ru?= =?UTF-8?q?ns,=20xhigh=20models,=20baselines,=20and=20report=20improvement?= =?UTF-8?q?s=20(#1390)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Letta Code --- .../workflows/terminal-bench-regression.yml | 18 +- benchmarks/terminal_bench/README.md | 55 +++++ benchmarks/terminal_bench/baseline.json | 191 +++++++++++++++++- benchmarks/terminal_bench/letta_code_agent.py | 11 +- .../terminal_bench/regression-tasks.txt | 6 - benchmarks/terminal_bench/report.py | 116 ++++++++--- 6 files changed, 343 insertions(+), 54 deletions(-) create mode 100644 benchmarks/terminal_bench/README.md delete mode 100644 benchmarks/terminal_bench/regression-tasks.txt diff --git a/.github/workflows/terminal-bench-regression.yml b/.github/workflows/terminal-bench-regression.yml index 631d1f8..da245af 100644 --- a/.github/workflows/terminal-bench-regression.yml +++ b/.github/workflows/terminal-bench-regression.yml @@ -2,7 +2,7 @@ name: Terminal-Bench Regression on: schedule: - - cron: "0 8 * * 1,4" # Monday + Thursday 8am UTC + - cron: "0 12 * * 1" # Monday 5am PT (12pm UTC) workflow_dispatch: inputs: model: @@ -10,7 +10,7 @@ on: default: "" concurrency: description: "Max concurrent tasks" - default: "4" + default: "10" env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true @@ -22,7 +22,7 @@ jobs: strategy: fail-fast: false matrix: - model: [sonnet-4.6-low, gpt-5-minimal] + model: [sonnet-4.6-xhigh, gpt-5.3-codex-xhigh] steps: - name: Checkout uses: actions/checkout@v6 @@ -54,21 +54,13 @@ jobs: run: | source .venv/bin/activate - # Build --task-name flags from regression-tasks.txt - TASK_FLAGS="" - while IFS= read -r task; do - [[ "$task" =~ ^#.*$ || -z "$task" ]] && continue - 
TASK_FLAGS="$TASK_FLAGS --task-name $task" - done < benchmarks/terminal_bench/regression-tasks.txt - harbor run \ --dataset terminal-bench@2.0 \ --agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \ --model "${{ matrix.model }}" \ --env modal \ - --n-concurrent ${{ inputs.concurrency || '4' }} \ - --job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" \ - $TASK_FLAGS + --n-concurrent ${{ inputs.concurrency || '10' }} \ + --job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" - name: Upload results artifact if: always() diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md new file mode 100644 index 0000000..ef67389 --- /dev/null +++ b/benchmarks/terminal_bench/README.md @@ -0,0 +1,55 @@ +# Terminal-Bench Regression + +Weekly regression tests for Letta Code on [Terminal-Bench 2.0](https://github.com/laude-institute/terminal-bench-2). + +## How it works + +1. **GitHub Actions** (`.github/workflows/terminal-bench-regression.yml`) runs every Monday at 5am PT +2. **Harbor** orchestrates task execution in **Modal** cloud sandboxes +3. Letta Code is built from source (`main` branch) inside each sandbox +4. Results are compared against `baseline.json` and posted to a GitHub issue +5. 
`@devanshrj` is tagged if any model drops >5% from baseline + +## Models + +| Model | Baseline | +|-------|----------| +| `sonnet-4.6-xhigh` | 38/89 (42.7%) | +| `gpt-5.3-codex-xhigh` | 57/89 (64.0%) | + +## Files + +| File | Description | +|------|-------------| +| `letta_code_agent.py` | Harbor agent — installs and runs Letta Code CLI in sandbox | +| `install-letta-code.sh.j2` | Jinja2 install script (Node.js, Bun, build from source) | +| `baseline.json` | Per-model, per-task pass/fail baselines | +| `report.py` | Parses results, detects regressions, posts GitHub issue | + +## Manual trigger + +```bash +gh workflow run terminal-bench-regression.yml --ref main -f concurrency=10 +``` + +## Required secrets + +- `LETTA_API_KEY` — Letta Cloud API key +- `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` — LLM provider keys +- `MODAL_TOKEN_ID` / `MODAL_TOKEN_SECRET` — Modal sandbox credentials + +## Updating baselines + +Replace `baseline.json` with results from a new run. Format: + +```json +{ + "model-name": { + "pass_rate": 0.427, + "tasks": { + "task-name": true, + ... 
+ } + } +} +``` diff --git a/benchmarks/terminal_bench/baseline.json b/benchmarks/terminal_bench/baseline.json index 9e26dfe..cadcefe 100644 --- a/benchmarks/terminal_bench/baseline.json +++ b/benchmarks/terminal_bench/baseline.json @@ -1 +1,190 @@ -{} \ No newline at end of file +{ + "sonnet-4.6-xhigh": { + "pass_rate": 0.427, + "tasks": { + "adaptive-rejection-sampler": false, + "bn-fit-modify": true, + "break-filter-js-from-html": false, + "build-cython-ext": true, + "build-pmars": true, + "build-pov-ray": true, + "caffe-cifar-10": false, + "cancel-async-tasks": false, + "chess-best-move": false, + "circuit-fibsqrt": false, + "cobol-modernization": false, + "code-from-image": true, + "compile-compcert": true, + "configure-git-webserver": true, + "constraints-scheduling": true, + "count-dataset-tokens": true, + "crack-7z-hash": false, + "custom-memory-heap-crash": false, + "db-wal-recovery": false, + "distribution-search": false, + "dna-assembly": false, + "dna-insert": false, + "extract-elf": true, + "extract-moves-from-video": false, + "feal-differential-cryptanalysis": true, + "feal-linear-cryptanalysis": false, + "filter-js-from-html": false, + "financial-document-processor": false, + "fix-code-vulnerability": true, + "fix-git": true, + "fix-ocaml-gc": true, + "gcode-to-text": true, + "git-leak-recovery": true, + "git-multibranch": true, + "gpt2-codegolf": false, + "headless-terminal": true, + "hf-model-inference": true, + "install-windows-3.11": false, + "kv-store-grpc": true, + "large-scale-text-editing": true, + "largest-eigenval": false, + "llm-inference-batching-scheduler": false, + "log-summary-date-ranges": true, + "mailman": false, + "make-doom-for-mips": false, + "make-mips-interpreter": true, + "mcmc-sampling-stan": true, + "merge-diff-arc-agi-task": true, + "model-extraction-relu-logits": false, + "modernize-scientific-stack": true, + "mteb-leaderboard": false, + "mteb-retrieve": false, + "multi-source-data-merger": true, + "nginx-request-logging": 
true, + "openssl-selfsigned-cert": true, + "overfull-hbox": false, + "password-recovery": true, + "path-tracing-reverse": false, + "path-tracing": false, + "polyglot-c-py": false, + "polyglot-rust-c": false, + "portfolio-optimization": false, + "protein-assembly": false, + "prove-plus-comm": true, + "pypi-server": true, + "pytorch-model-cli": false, + "pytorch-model-recovery": true, + "qemu-alpine-ssh": false, + "qemu-startup": true, + "query-optimize": false, + "raman-fitting": false, + "regex-chess": false, + "regex-log": true, + "reshard-c4-data": false, + "rstan-to-pystan": false, + "sam-cell-seg": false, + "sanitize-git-repo": false, + "schemelike-metacircular-eval": false, + "sparql-university": false, + "sqlite-db-truncate": true, + "sqlite-with-gcov": false, + "torch-pipeline-parallelism": false, + "torch-tensor-parallelism": true, + "train-fasttext": false, + "tune-mjcf": false, + "video-processing": false, + "vulnerable-secret": true, + "winning-avg-corewars": false, + "write-compressor": false + } + }, + "gpt-5.3-codex-xhigh": { + "pass_rate": 0.6404, + "tasks": { + "adaptive-rejection-sampler": false, + "bn-fit-modify": true, + "break-filter-js-from-html": true, + "build-cython-ext": true, + "build-pmars": true, + "build-pov-ray": true, + "caffe-cifar-10": true, + "cancel-async-tasks": false, + "chess-best-move": true, + "circuit-fibsqrt": true, + "cobol-modernization": true, + "code-from-image": true, + "compile-compcert": true, + "configure-git-webserver": true, + "constraints-scheduling": true, + "count-dataset-tokens": true, + "crack-7z-hash": true, + "custom-memory-heap-crash": true, + "db-wal-recovery": false, + "distribution-search": true, + "dna-assembly": false, + "dna-insert": true, + "extract-elf": true, + "extract-moves-from-video": false, + "feal-differential-cryptanalysis": true, + "feal-linear-cryptanalysis": true, + "filter-js-from-html": false, + "financial-document-processor": true, + "fix-code-vulnerability": true, + "fix-git": true, 
+ "fix-ocaml-gc": true, + "gcode-to-text": false, + "git-leak-recovery": true, + "git-multibranch": true, + "gpt2-codegolf": false, + "headless-terminal": true, + "hf-model-inference": true, + "install-windows-3.11": false, + "kv-store-grpc": true, + "large-scale-text-editing": true, + "largest-eigenval": true, + "llm-inference-batching-scheduler": true, + "log-summary-date-ranges": true, + "mailman": false, + "make-doom-for-mips": false, + "make-mips-interpreter": false, + "mcmc-sampling-stan": false, + "merge-diff-arc-agi-task": true, + "model-extraction-relu-logits": true, + "modernize-scientific-stack": true, + "mteb-leaderboard": true, + "mteb-retrieve": false, + "multi-source-data-merger": true, + "nginx-request-logging": true, + "openssl-selfsigned-cert": true, + "overfull-hbox": false, + "password-recovery": true, + "path-tracing-reverse": true, + "path-tracing": true, + "polyglot-c-py": false, + "polyglot-rust-c": false, + "portfolio-optimization": false, + "protein-assembly": true, + "prove-plus-comm": true, + "pypi-server": true, + "pytorch-model-cli": true, + "pytorch-model-recovery": false, + "qemu-alpine-ssh": false, + "qemu-startup": false, + "query-optimize": false, + "raman-fitting": false, + "regex-chess": false, + "regex-log": true, + "reshard-c4-data": false, + "rstan-to-pystan": true, + "sam-cell-seg": false, + "sanitize-git-repo": true, + "schemelike-metacircular-eval": false, + "sparql-university": true, + "sqlite-db-truncate": true, + "sqlite-with-gcov": true, + "torch-pipeline-parallelism": false, + "torch-tensor-parallelism": false, + "train-fasttext": false, + "tune-mjcf": true, + "video-processing": false, + "vulnerable-secret": true, + "winning-avg-corewars": true, + "write-compressor": false + } + } +} diff --git a/benchmarks/terminal_bench/letta_code_agent.py b/benchmarks/terminal_bench/letta_code_agent.py index 0d41cc7..9df6ff4 100644 --- a/benchmarks/terminal_bench/letta_code_agent.py +++ 
b/benchmarks/terminal_bench/letta_code_agent.py @@ -27,6 +27,12 @@ _PROVIDER_SYSTEM_MAP = { } _DEFAULT_SYSTEM = "source-claude" +# Map Letta Code model handles to litellm model names for cost calculation. +_LITELLM_MODEL_MAP: dict[str, str] = { + "sonnet-4.6-xhigh": "anthropic/claude-sonnet-4-6", + "gpt-5.3-codex-xhigh": "openai/gpt-5.3-codex", +} + class LettaCode(BaseInstalledAgent): """Run Letta Code CLI inside a harbor environment.""" @@ -194,9 +200,10 @@ class LettaCode(BaseInstalledAgent): def _populate_usage(self, events_text: str, context: AgentContext) -> None: """Extract usage from events and populate context + write usage.json.""" - model_name = self.model_name or os.environ.get("LETTA_MODEL", "").strip() + raw_model = self.model_name or os.environ.get("LETTA_MODEL", "").strip() + litellm_model = _LITELLM_MODEL_MAP.get(raw_model, raw_model) usage = self._extract_usage_from_events(events_text) - cost = self._calculate_cost(model_name, usage) + cost = self._calculate_cost(litellm_model, usage) context.n_input_tokens = usage["prompt_tokens"] or None context.n_output_tokens = usage["completion_tokens"] or None diff --git a/benchmarks/terminal_bench/regression-tasks.txt b/benchmarks/terminal_bench/regression-tasks.txt deleted file mode 100644 index efe2026..0000000 --- a/benchmarks/terminal_bench/regression-tasks.txt +++ /dev/null @@ -1,6 +0,0 @@ -# Terminal-Bench regression task subset for Letta Code -# These tasks are run on a schedule to detect regressions. -# Criteria: fast (<10 min), diverse capabilities, deterministic. -# Adjust based on known Letta Code pass rates. 
- -cancel-async-tasks diff --git a/benchmarks/terminal_bench/report.py b/benchmarks/terminal_bench/report.py index c3a9900..eefdceb 100644 --- a/benchmarks/terminal_bench/report.py +++ b/benchmarks/terminal_bench/report.py @@ -25,9 +25,9 @@ from datetime import datetime, timezone from pathlib import Path -def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]: - """Parse Harbor job results into {model: {task: passed}}.""" - model_results: dict[str, dict[str, bool]] = {} +def parse_job_results(results_dir: Path) -> dict[str, dict]: + """Parse Harbor job results into {model: {tasks: {task: passed}, cost: {..}}}.""" + model_results: dict[str, dict] = {} for artifact_dir in sorted(results_dir.iterdir()): if not artifact_dir.is_dir(): @@ -41,6 +41,9 @@ def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]: model = dir_name tasks: dict[str, bool] = {} + total_cost = 0.0 + total_prompt_tokens = 0 + total_completion_tokens = 0 # Look for job directories — Harbor puts them under jobs/ jobs_dir = artifact_dir / "jobs" @@ -66,22 +69,40 @@ def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]: try: reward = float(reward_file.read_text().strip()) tasks[task_name] = reward >= 1.0 - continue except (ValueError, OSError): pass - # Fall back to result.json - result_file = trial_dir / "result.json" - if result_file.exists(): + if task_name not in tasks: + # Fall back to result.json + result_file = trial_dir / "result.json" + if result_file.exists(): + try: + result = json.loads(result_file.read_text()) + reward = result.get("reward", result.get("score", 0)) + tasks[task_name] = float(reward) >= 1.0 + except (json.JSONDecodeError, ValueError, OSError): + tasks[task_name] = False + + # Collect cost from usage.json + usage_file = trial_dir / "usage.json" + if usage_file.exists(): try: - result = json.loads(result_file.read_text()) - reward = result.get("reward", result.get("score", 0)) - tasks[task_name] = float(reward) >= 1.0 - except 
(json.JSONDecodeError, ValueError, OSError):
-                tasks[task_name] = False
+                    usage = json.loads(usage_file.read_text())
+                    total_cost += usage.get("cost_usd", 0.0)
+                    total_prompt_tokens += usage.get("prompt_tokens", 0)
+                    total_completion_tokens += usage.get("completion_tokens", 0)
+                except (json.JSONDecodeError, OSError):
+                    pass
 
         if tasks:
-            model_results[model] = tasks
+            model_results[model] = {
+                "tasks": tasks,
+                "cost": {
+                    "cost_usd": round(total_cost, 2),
+                    "prompt_tokens": total_prompt_tokens,
+                    "completion_tokens": total_completion_tokens,
+                },
+            }
 
     return model_results
 
@@ -105,7 +126,7 @@ def load_baseline(baseline_path: Path) -> dict:
 
 
 def build_report(
-    model_results: dict[str, dict[str, bool]],
+    model_results: dict[str, dict],
     baseline: dict,
 ) -> tuple[str, bool]:
     """Build a markdown report and determine if there's a regression.
@@ -120,7 +141,9 @@ def build_report(
 
     has_regression = False
 
-    for model, tasks in sorted(model_results.items()):
+    for model, data in sorted(model_results.items()):
+        tasks = data["tasks"]
+        cost = data.get("cost", {})
         pass_rate = compute_pass_rate(tasks)
         passed = sum(1 for v in tasks.values() if v)
         total = len(tasks)
@@ -133,7 +156,7 @@
         delta_str = ""
         if baseline_rate is not None:
             delta = pass_rate - baseline_rate
-            if delta < -0.10:
+            if delta < -0.05:
                 has_regression = True
                 delta_str = f" | **{delta:+.0%} from baseline** :red_circle:"
             elif delta < 0:
@@ -141,30 +164,55 @@
         elif delta > 0:
             delta_str = f" | {delta:+.0%} from baseline :white_check_mark:"
 
-        lines.append(f"### `{model}` — {passed}/{total} ({pass_rate:.0%}){delta_str}")
+        cost_str = ""
+        cost_usd = cost.get("cost_usd", 0)
+        if cost_usd > 0:
+            cost_str = f" | ${cost_usd:.2f}"
+
+        lines.append(f"<details>")
+        lines.append(f"<summary>{model} — {passed}/{total} ({pass_rate:.0%}){delta_str}{cost_str}</summary>")
         lines.append("")
-        lines.append("| Task | Result | Baseline |")
-        lines.append("|------|--------|----------|")
+
+        # Categorize tasks
+        regressions = []  # was passing, now failing
+        improvements = []  # was failing, now passing
+        new_tasks = []  # not in baseline
 
         for task_name, passed_now in sorted(tasks.items()):
-            result_emoji = ":white_check_mark:" if passed_now else ":x:"
             baseline_val = baseline_tasks.get(task_name)
             if baseline_val is None:
-                baseline_str = "—"
-            elif baseline_val:
-                baseline_str = ":white_check_mark:"
-            else:
-                baseline_str = ":x:"
-
-            # Flag regressions: was passing, now failing
-            regression_marker = ""
-            if baseline_val is True and not passed_now:
-                regression_marker = " **REGRESSION**"
+                new_tasks.append((task_name, passed_now))
+            elif baseline_val and not passed_now:
+                regressions.append(task_name)
                 has_regression = True
+            elif not baseline_val and passed_now:
+                improvements.append(task_name)
 
-            lines.append(f"| {task_name} | {result_emoji}{regression_marker} | {baseline_str} |")
+        if regressions:
+            lines.append(f"**Regressions ({len(regressions)}):**")
+            for t in regressions:
+                lines.append(f"- :red_circle: {t}")
+            lines.append("")
 
+        if improvements:
+            lines.append(f"**Improvements ({len(improvements)}):**")
+            for t in improvements:
+                lines.append(f"- :white_check_mark: {t}")
+            lines.append("")
+
+        if new_tasks:
+            new_passed = sum(1 for _, p in new_tasks if p)
+            lines.append(f"**New tasks ({new_passed}/{len(new_tasks)} passed):**")
+            for t, p in new_tasks:
+                emoji = ":white_check_mark:" if p else ":x:"
+                lines.append(f"- {emoji} {t}")
+            lines.append("")
 
+        if not regressions and not improvements and not new_tasks:
+            lines.append("No changes from baseline.")
+            lines.append("")
+
+        lines.append("</details>")
         lines.append("")
 
     if not model_results:
@@ -179,6 +227,10 @@
     lines.append(f"[Workflow run]({run_url}/{repo}/actions/runs/{run_id})")
     lines.append("")
 
+    if has_regression:
+        lines.append("cc @devanshrj")
+        lines.append("")
+
     return "\n".join(lines), has_regression