feat: TB regression — full runs, xhigh models, baselines, and report improvements (#1390)

Co-authored-by: Letta Code <noreply@letta.com>
2026-03-13 19:08:24 -07:00
parent c52656625b
commit 6f999fac25
6 changed files with 343 additions and 54 deletions
--- a/.github/workflows/terminal-bench-regression.yml
+++ b/.github/workflows/terminal-bench-regression.yml
@@ -2,7 +2,7 @@ name: Terminal-Bench Regression

 on:
  schedule:
-    - cron: "0 8 * * 1,4" # Monday + Thursday 8am UTC
+    - cron: "0 12 * * 1" # Monday 12:00 UTC (5am PDT / 4am PST)
  workflow_dispatch:
    inputs:
      model:
@@ -10,7 +10,7 @@ on:
        default: ""
      concurrency:
        description: "Max concurrent tasks"
-        default: "4"
+        default: "10"

 env:
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
@@ -22,7 +22,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        model: [sonnet-4.6-low, gpt-5-minimal]
+        model: [sonnet-4.6-xhigh, gpt-5.3-codex-xhigh]
    steps:
      - name: Checkout
        uses: actions/checkout@v6
@@ -54,21 +54,13 @@ jobs:
        run: |
          source .venv/bin/activate

-          # Build --task-name flags from regression-tasks.txt
-          TASK_FLAGS=""
-          while IFS= read -r task; do
-            [[ "$task" =~ ^#.*$ || -z "$task" ]] && continue
-            TASK_FLAGS="$TASK_FLAGS --task-name $task"
-          done < benchmarks/terminal_bench/regression-tasks.txt
-
          harbor run \
            --dataset terminal-bench@2.0 \
            --agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
            --model "${{ matrix.model }}" \
            --env modal \
-            --n-concurrent ${{ inputs.concurrency || '4' }} \
-            --job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" \
-            $TASK_FLAGS
+            --n-concurrent ${{ inputs.concurrency || '10' }} \
+            --job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)"

      - name: Upload results artifact
        if: always()