feat: TB regression — full runs, xhigh models, baselines, and report improvements (#1390)

Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
Devansh Jain
2026-03-13 19:08:24 -07:00
committed by GitHub
parent c52656625b
commit 6f999fac25
6 changed files with 343 additions and 54 deletions

View File

@@ -2,7 +2,7 @@ name: Terminal-Bench Regression
on:
schedule:
- cron: "0 8 * * 1,4" # Monday + Thursday 8am UTC
- cron: "0 12 * * 1" # Monday 12:00 UTC (5am PDT / 4am PST)
workflow_dispatch:
inputs:
model:
@@ -10,7 +10,7 @@ on:
default: ""
concurrency:
description: "Max concurrent tasks"
default: "4"
default: "10"
env:
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
@@ -22,7 +22,7 @@ jobs:
strategy:
fail-fast: false
matrix:
model: [sonnet-4.6-low, gpt-5-minimal]
model: [sonnet-4.6-xhigh, gpt-5.3-codex-xhigh]
steps:
- name: Checkout
uses: actions/checkout@v6
@@ -54,21 +54,13 @@ jobs:
run: |
source .venv/bin/activate
# Build --task-name flags from regression-tasks.txt
TASK_FLAGS=""
while IFS= read -r task; do
[[ "$task" =~ ^#.*$ || -z "$task" ]] && continue
TASK_FLAGS="$TASK_FLAGS --task-name $task"
done < benchmarks/terminal_bench/regression-tasks.txt
harbor run \
--dataset terminal-bench@2.0 \
--agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
--model "${{ matrix.model }}" \
--env modal \
--n-concurrent ${{ inputs.concurrency || '4' }} \
--job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" \
$TASK_FLAGS
--n-concurrent ${{ inputs.concurrency || '10' }} \
--job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)"
- name: Upload results artifact
if: always()