---
# Weekly Terminal-Bench regression run.
# Runs the harbor terminal-bench@2.0 suite against each model in the matrix
# on Modal, uploads per-model result artifacts, then aggregates them into a
# report that updates a GitHub Issue.
name: Terminal-Bench Regression

on:
  schedule:
    - cron: "0 12 * * 1" # Monday 5am PT (12pm UTC)
  workflow_dispatch:
    inputs:
      model:
        description: "Override model (blank = run both defaults)"
        default: ""
      concurrency:
        description: "Max concurrent tasks"
        default: "10"

env:
  # Env values are strings; quote to avoid YAML boolean coercion.
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"

jobs:
  regression:
    runs-on: ubuntu-latest
    timeout-minutes: 180
    strategy:
      fail-fast: false
      matrix:
        # Honor the workflow_dispatch "model" input: a non-blank value runs
        # only that model; blank input (or a scheduled run, where inputs are
        # empty) runs both defaults. NOTE(review): the format() trick assumes
        # the input never contains a double quote.
        model: ${{ inputs.model != '' && fromJSON(format('["{0}"]', inputs.model)) || fromJSON('["sonnet-4.6-xhigh", "gpt-5.3-codex-xhigh"]') }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Python + uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: false

      - name: Create venv and install deps
        run: |
          uv venv .venv
          source .venv/bin/activate
          uv pip install "harbor>=0.1.45" "litellm>=1.0.0" "modal>=1.3.5"

      - name: Configure Modal
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          printf '[letta]\ntoken_id = "%s"\ntoken_secret = "%s"\nactive = true\nenvironment = "terminal-bench"\nimage_builder_version = "2025.06"\n' \
            "$MODAL_TOKEN_ID" "$MODAL_TOKEN_SECRET" > ~/.modal.toml

      - name: Run regression tasks
        env:
          LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Route expression values through env so the shell never sees raw
          # ${{ }} interpolation (hardens against script injection via the
          # workflow_dispatch inputs).
          MODEL: ${{ matrix.model }}
          N_CONCURRENT: ${{ inputs.concurrency || '10' }}
        run: |
          source .venv/bin/activate
          harbor run \
            --dataset terminal-bench@2.0 \
            --agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
            --model "$MODEL" \
            --env modal \
            --n-concurrent "$N_CONCURRENT" \
            --job-name "regression-${MODEL}-$(date +%Y%m%d)"

      - name: Upload results artifact
        if: always() # keep partial results even when some tasks fail
        uses: actions/upload-artifact@v4
        with:
          name: tb-results-${{ matrix.model }}
          path: jobs/

  report:
    needs: regression
    if: always() # report on whatever completed, even after matrix failures
    runs-on: ubuntu-latest
    permissions:
      issues: write # report.py updates a tracking issue via GH_TOKEN
      contents: read
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Download all result artifacts
        uses: actions/download-artifact@v4
        with:
          # Each matrix artifact lands under results/tb-results-<model>/.
          path: results/

      - name: Setup Python + uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: false

      - name: Generate report and update GitHub Issue
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # NOTE(review): the GITHUB_* vars below duplicate default runner
          # env vars; kept in case report.py reads them explicitly.
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_SERVER_URL: ${{ github.server_url }}
        run: |
          uv run python benchmarks/terminal_bench/report.py \
            --results-dir results/ \
            --baseline benchmarks/terminal_bench/baseline.json \
            --repo "${{ github.repository }}"