# letta-code/.github/workflows/terminal-bench-regression.yml
# Weekly terminal-bench regression run across the default model matrix,
# followed by a report job that aggregates results and updates a GitHub Issue.
name: Terminal-Bench Regression

on:
  schedule:
    - cron: "0 12 * * 1" # Monday 5am PT (12pm UTC; 4am during PDT)
  workflow_dispatch:
    inputs:
      model:
        description: "Override model (blank = run both defaults)"
        default: ""
      concurrency:
        description: "Max concurrent tasks"
        default: "10"

env:
  # Env values are always strings; quote to avoid YAML boolean typing.
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"

jobs:
  regression:
    runs-on: ubuntu-latest
    timeout-minutes: 180
    strategy:
      fail-fast: false
      matrix:
        # Honor a manual "model" override; otherwise run both defaults.
        # On scheduled runs `inputs` is empty, so the default list is used.
        model: ${{ inputs.model != '' && fromJSON(format('["{0}"]', inputs.model)) || fromJSON('["sonnet-4.6-xhigh", "gpt-5.3-codex-xhigh"]') }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Python + uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: false

      - name: Create venv and install deps
        run: |
          uv venv .venv
          source .venv/bin/activate
          uv pip install "harbor>=0.1.45" "litellm>=1.0.0" "modal>=1.3.5"

      - name: Configure Modal
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          printf '[letta]\ntoken_id = "%s"\ntoken_secret = "%s"\nactive = true\nenvironment = "terminal-bench"\nimage_builder_version = "2025.06"\n' \
            "$MODAL_TOKEN_ID" "$MODAL_TOKEN_SECRET" > ~/.modal.toml

      - name: Run regression tasks
        env:
          LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          source .venv/bin/activate
          harbor run \
            --dataset terminal-bench@2.0 \
            --agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
            --model "${{ matrix.model }}" \
            --env modal \
            --n-concurrent "${{ inputs.concurrency || '10' }}" \
            --job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)"

      - name: Upload results artifact
        if: always() # keep partial results even when the run step fails
        uses: actions/upload-artifact@v4
        with:
          name: tb-results-${{ matrix.model }}
          path: jobs/

  report:
    needs: regression
    if: always() # report even when some matrix legs failed
    runs-on: ubuntu-latest
    permissions:
      issues: write
      contents: read
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Download all result artifacts
        uses: actions/download-artifact@v4
        with:
          # No `name`/`pattern`: downloads every tb-results-* artifact,
          # each into its own subdirectory of results/.
          path: results/

      - name: Setup Python + uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: false

      - name: Generate report and update GitHub Issue
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # NOTE(review): the three vars below duplicate runner-provided
          # defaults of the same names; kept in case report.py reads them.
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_SERVER_URL: ${{ github.server_url }}
        run: |
          uv run python benchmarks/terminal_bench/report.py \
            --results-dir results/ \
            --baseline benchmarks/terminal_bench/baseline.json \
            --repo "${{ github.repository }}"