feat: TB regression — full runs, xhigh models, baselines, and report improvements (#1390)
Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
18
.github/workflows/terminal-bench-regression.yml
vendored
18
.github/workflows/terminal-bench-regression.yml
vendored
@@ -2,7 +2,7 @@ name: Terminal-Bench Regression
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 8 * * 1,4" # Monday + Thursday 8am UTC
|
||||
- cron: "0 12 * * 1" # Monday 12:00 UTC (5am PDT / 4am PST; cron runs in UTC, no DST shift)
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
model:
|
||||
@@ -10,7 +10,7 @@ on:
|
||||
default: ""
|
||||
concurrency:
|
||||
description: "Max concurrent tasks"
|
||||
default: "4"
|
||||
default: "10"
|
||||
|
||||
env:
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
|
||||
@@ -22,7 +22,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
model: [sonnet-4.6-low, gpt-5-minimal]
|
||||
model: [sonnet-4.6-xhigh, gpt-5.3-codex-xhigh]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
@@ -54,21 +54,13 @@ jobs:
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
|
||||
# Build --task-name flags from regression-tasks.txt
|
||||
TASK_FLAGS=""
|
||||
while IFS= read -r task; do
|
||||
[[ "$task" =~ ^#.*$ || -z "$task" ]] && continue
|
||||
TASK_FLAGS="$TASK_FLAGS --task-name $task"
|
||||
done < benchmarks/terminal_bench/regression-tasks.txt
|
||||
|
||||
harbor run \
|
||||
--dataset terminal-bench@2.0 \
|
||||
--agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
|
||||
--model "${{ matrix.model }}" \
|
||||
--env modal \
|
||||
--n-concurrent ${{ inputs.concurrency || '4' }} \
|
||||
--job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" \
|
||||
$TASK_FLAGS
|
||||
--n-concurrent ${{ inputs.concurrency || '10' }} \
|
||||
--job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)"
|
||||
|
||||
- name: Upload results artifact
|
||||
if: always()
|
||||
|
||||
Reference in New Issue
Block a user