NOTE(review): line structure restored from a whitespace-collapsed copy of this
patch. Hunk line counts were checked against the @@ headers; leading indentation
of YAML/shell lines is inferred, not recovered -- confirm against the repository
(or apply with `git apply --ignore-whitespace`) before relying on it.

diff --git a/.github/workflows/terminal-bench-regression.yml b/.github/workflows/terminal-bench-regression.yml
index 68d6002..631d1f8 100644
--- a/.github/workflows/terminal-bench-regression.yml
+++ b/.github/workflows/terminal-bench-regression.yml
@@ -12,6 +12,9 @@ on:
         description: "Max concurrent tasks"
         default: "4"
 
+env:
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
+
 jobs:
   regression:
     runs-on: ubuntu-latest
@@ -26,9 +29,14 @@ jobs:
 
       - name: Setup Python + uv
         uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: false
 
-      - name: Install Harbor
-        run: uv pip install --system "harbor>=0.1.45" "litellm>=1.0.0"
+      - name: Create venv and install deps
+        run: |
+          uv venv .venv
+          source .venv/bin/activate
+          uv pip install "harbor>=0.1.45" "litellm>=1.0.0" "modal>=1.3.5"
 
       - name: Configure Modal
         env:
@@ -44,6 +52,8 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         run: |
+          source .venv/bin/activate
+
           # Build --task-name flags from regression-tasks.txt
           TASK_FLAGS=""
           while IFS= read -r task; do
@@ -85,6 +95,8 @@ jobs:
 
       - name: Setup Python + uv
         uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: false
 
       - name: Generate report and update GitHub Issue
         env:
@@ -93,7 +105,7 @@ jobs:
           GITHUB_RUN_ID: ${{ github.run_id }}
           GITHUB_SERVER_URL: ${{ github.server_url }}
         run: |
-          python benchmarks/terminal_bench/report.py \
+          uv run python benchmarks/terminal_bench/report.py \
             --results-dir results/ \
             --baseline benchmarks/terminal_bench/baseline.json \
             --repo "${{ github.repository }}"
diff --git a/benchmarks/terminal_bench/install-letta-code.sh.j2 b/benchmarks/terminal_bench/install-letta-code.sh.j2
index ed8eaee..f35e6fa 100644
--- a/benchmarks/terminal_bench/install-letta-code.sh.j2
+++ b/benchmarks/terminal_bench/install-letta-code.sh.j2
@@ -2,7 +2,7 @@
 set -euo pipefail
 
 apt-get update
-apt-get install -y curl git unzip
+apt-get install -y curl git unzip build-essential
 
 # Install Node.js (required to run the letta CLI)
 curl -fsSL https://deb.nodesource.com/setup_20.x | bash -