From 656895d312a754886a736b71d20298585891e04f Mon Sep 17 00:00:00 2001
From: Devansh Jain <31609257+devanshrj@users.noreply.github.com>
Date: Fri, 13 Mar 2026 14:26:38 -0700
Subject: [PATCH] feat: add Terminal-Bench weekly regression workflow
 [LET-7791] (#1232)

Co-authored-by: Letta
---
 .../workflows/terminal-bench-regression.yml   |  99 +++++
 benchmarks/terminal_bench/__init__.py         |   0
 benchmarks/terminal_bench/baseline.json       |   1 +
 .../terminal_bench/install-letta-code.sh.j2   |  25 ++
 benchmarks/terminal_bench/letta_code_agent.py | 419 ++++++++++++++++++
 .../terminal_bench/regression-tasks.txt       |   6 +
 benchmarks/terminal_bench/report.py           | 271 +++++++++++
 7 files changed, 821 insertions(+)
 create mode 100644 .github/workflows/terminal-bench-regression.yml
 create mode 100644 benchmarks/terminal_bench/__init__.py
 create mode 100644 benchmarks/terminal_bench/baseline.json
 create mode 100644 benchmarks/terminal_bench/install-letta-code.sh.j2
 create mode 100644 benchmarks/terminal_bench/letta_code_agent.py
 create mode 100644 benchmarks/terminal_bench/regression-tasks.txt
 create mode 100644 benchmarks/terminal_bench/report.py

diff --git a/.github/workflows/terminal-bench-regression.yml b/.github/workflows/terminal-bench-regression.yml
new file mode 100644
index 0000000..68d6002
--- /dev/null
+++ b/.github/workflows/terminal-bench-regression.yml
@@ -0,0 +1,99 @@
+name: Terminal-Bench Regression
+
+on:
+  schedule:
+    - cron: "0 8 * * 1,4" # Monday + Thursday 8am UTC
+  workflow_dispatch:
+    inputs:
+      model:
+        description: "Override model (blank = run both defaults)"
+        default: ""
+      concurrency:
+        description: "Max concurrent tasks"
+        default: "4"
+
+jobs:
+  regression:
+    runs-on: ubuntu-latest
+    timeout-minutes: 180
+    strategy:
+      fail-fast: false
+      matrix:
+        model: [sonnet-4.6-low, gpt-5-minimal]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Setup Python + uv
+        uses: astral-sh/setup-uv@v6
+
+      - name: Install Harbor
+        run: uv pip install --system "harbor>=0.1.45" "litellm>=1.0.0"
+
+      - name: Configure Modal
+        env:
+          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+        run: |
+          printf '[letta]\ntoken_id = "%s"\ntoken_secret = "%s"\nactive = true\nenvironment = "terminal-bench"\nimage_builder_version = "2025.06"\n' \
+            "$MODAL_TOKEN_ID" "$MODAL_TOKEN_SECRET" > ~/.modal.toml
+
+      - name: Run regression tasks
+        env:
+          LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          # Build --task-name flags from regression-tasks.txt
+          TASK_FLAGS=""
+          while IFS= read -r task; do
+            [[ "$task" =~ ^#.*$ || -z "$task" ]] && continue
+            TASK_FLAGS="$TASK_FLAGS --task-name $task"
+          done < benchmarks/terminal_bench/regression-tasks.txt
+
+          harbor run \
+            --dataset terminal-bench@2.0 \
+            --agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
+            --model "${{ matrix.model }}" \
+            --env modal \
+            --n-concurrent ${{ inputs.concurrency || '4' }} \
+            --job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" \
+            $TASK_FLAGS
+
+      - name: Upload results artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: tb-results-${{ matrix.model }}
+          path: jobs/
+
+  report:
+    needs: regression
+    if: always()
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      contents: read
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Download all result artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: results/
+
+      - name: Setup Python + uv
+        uses: astral-sh/setup-uv@v6
+
+      - name: Generate report and update GitHub Issue
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_SERVER_URL: ${{ github.server_url }}
+        run: |
+          python benchmarks/terminal_bench/report.py \
+            --results-dir results/ \
+            --baseline benchmarks/terminal_bench/baseline.json \
+            --repo "${{ github.repository }}"
diff --git a/benchmarks/terminal_bench/__init__.py b/benchmarks/terminal_bench/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/terminal_bench/baseline.json b/benchmarks/terminal_bench/baseline.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/benchmarks/terminal_bench/baseline.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/benchmarks/terminal_bench/install-letta-code.sh.j2 b/benchmarks/terminal_bench/install-letta-code.sh.j2
new file mode 100644
index 0000000..ed8eaee
--- /dev/null
+++ b/benchmarks/terminal_bench/install-letta-code.sh.j2
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -euo pipefail
+
+apt-get update
+apt-get install -y curl git unzip
+
+# Install Node.js (required to run the letta CLI)
+curl -fsSL https://deb.nodesource.com/setup_20.x | bash -
+apt-get install -y nodejs
+
+# Install Bun (required to build letta-code from source)
+curl -fsSL https://bun.sh/install | bash
+export BUN_INSTALL="$HOME/.bun"
+export PATH="$BUN_INSTALL/bin:$PATH"
+
+# Build letta-code from source at a pinned branch/ref
+LETTA_CODE_REPO="https://github.com/letta-ai/letta-code.git"
+LETTA_CODE_REF="{{ branch | default(commit | default('main')) }}"
+
+git clone "$LETTA_CODE_REPO" /tmp/letta-code
+cd /tmp/letta-code
+git checkout "$LETTA_CODE_REF"
+bun install
+bun run build
+npm link
diff --git a/benchmarks/terminal_bench/letta_code_agent.py b/benchmarks/terminal_bench/letta_code_agent.py
new file mode 100644
index 0000000..0d41cc7
--- /dev/null
+++ b/benchmarks/terminal_bench/letta_code_agent.py
@@ -0,0 +1,419 @@
+import asyncio
+import json
+import logging
+import os
+import shlex
+import tempfile
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+from harbor.agents.installed.base import BaseInstalledAgent, ExecInput
+from harbor.environments.base import BaseEnvironment
+from harbor.models.agent.context import AgentContext
+
+from litellm import ModelResponse, Usage, completion_cost
+from litellm.types.utils import CompletionTokensDetailsWrapper, PromptTokensDetailsWrapper
+
+logger = logging.getLogger(__name__)
+
+# Keys tried (in order) when extracting agent ID from Letta settings JSON.
+_SETTINGS_AGENT_ID_KEYS = ("agent_id", "default_agent_id", "lastAgent", "last_agent")
+
+# Provider keywords used to select the right system prompt for the CLI.
+_PROVIDER_SYSTEM_MAP = {
+    "source-codex": ("gpt", "o1-", "o3-"),
+    "source-gemini": ("gemini",),
+}
+_DEFAULT_SYSTEM = "source-claude"
+
+
+class LettaCode(BaseInstalledAgent):
+    """Run Letta Code CLI inside a harbor environment."""
+
+    def __init__(self, *args, **kwargs):
+        # Pop letta_code_model before passing to super (which doesn't expect it).
+        self._letta_code_model: str | None = kwargs.pop("letta_code_model", None)
+        super().__init__(*args, **kwargs)
+
+    @staticmethod
+    def name() -> str:
+        return "letta-code"
+
+    @property
+    def _install_agent_template_path(self) -> Path:
+        return Path(__file__).parent / "install-letta-code.sh.j2"
+
+    def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
+        # Unused — we override run() directly — but required by the ABC.
+        return []
+
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _extract_agent_id_from_events(events_text: str) -> str | None:
+        """Scan JSONL *text* for the first ``agent-*`` id."""
+        for line in events_text.splitlines():
+            line = line.strip()
+            if not line.startswith("{"):
+                continue
+            try:
+                event = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            for key in ("agent_id", "session_id"):
+                aid = event.get(key)
+                if isinstance(aid, str) and aid.startswith("agent-"):
+                    return aid
+        return None
+
+    @staticmethod
+    def _extract_agent_id_from_settings(settings_text: str) -> str | None:
+        """Parse Letta ``settings.local.json`` content and return an agent id."""
+        if not settings_text.strip():
+            return None
+        try:
+            json_start = settings_text.find("{")
+            cleaned = settings_text[json_start:] if json_start != -1 else settings_text
+            obj = json.loads(cleaned)
+            if not isinstance(obj, dict):
+                return None
+            for key in _SETTINGS_AGENT_ID_KEYS:
+                val = obj.get(key)
+                if val:
+                    return val
+            # Fallback: first value that looks like an agent id.
+            for val in obj.values():
+                if isinstance(val, str) and val.startswith("agent-"):
+                    return val
+        except Exception:
+            pass
+        return None
+
+    @staticmethod
+    def _build_model_flags(model_name: str) -> str:
+        """Return CLI flags for ``--model`` and ``--system``."""
+        if not model_name:
+            return ""
+        flags = f"--model {shlex.quote(model_name)} "
+        lower = model_name.lower()
+        system = _DEFAULT_SYSTEM
+        for sys_name, keywords in _PROVIDER_SYSTEM_MAP.items():
+            if any(kw in lower for kw in keywords):
+                system = sys_name
+                break
+        flags += f"--system {system} "
+        return flags
+
+    def _find_events_text(self) -> str:
+        """Return events JSONL content from the local logs directory."""
+        logs_dir = Path(self.logs_dir)
+        events_files = sorted(logs_dir.glob("*.events.jsonl"))
+        if not events_files:
+            return ""
+        return events_files[0].read_text()
+
+    # ------------------------------------------------------------------
+    # Usage / cost tracking
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _extract_usage_from_events(events_text: str) -> dict[str, int]:
+        """Extract token usage from Letta Code stream-json events.
+
+        Checks two formats:
+        1. ``message_type == "usage_statistics"`` events (Letta streaming API)
+        2. Last event with ``type == "result"`` containing a ``usage`` field
+        """
+        totals: dict[str, int] = {
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "cached_input_tokens": 0,
+            "cache_write_tokens": 0,
+            "reasoning_tokens": 0,
+        }
+        parsed_events: list[dict] = []
+        found_usage_stats = False
+
+        for line in events_text.splitlines():
+            line = line.strip()
+            if not line.startswith("{"):
+                continue
+            try:
+                event = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            parsed_events.append(event)
+
+            if event.get("message_type") == "usage_statistics":
+                found_usage_stats = True
+                for key in totals:
+                    totals[key] += event.get(key) or 0
+                details = event.get("prompt_tokens_details") or {}
+                totals["cached_input_tokens"] += details.get("cached_tokens") or 0
+                details = event.get("completion_tokens_details") or {}
+                totals["reasoning_tokens"] += details.get("reasoning_tokens") or 0
+
+        # Fallback: last result event
+        if not found_usage_stats and parsed_events:
+            last = parsed_events[-1]
+            if last.get("type") == "result" and "usage" in last:
+                usage = last["usage"]
+                for key in totals:
+                    totals[key] += usage.get(key) or 0
+
+        return totals
+
+    @staticmethod
+    def _calculate_cost(model_name: str, usage: dict[str, int]) -> float:
+        """Calculate cost in USD using litellm's pricing data."""
+        prompt_tokens = usage.get("prompt_tokens", 0)
+        completion_tokens = usage.get("completion_tokens", 0)
+        if not model_name or (prompt_tokens == 0 and completion_tokens == 0):
+            return 0.0
+        resp = ModelResponse()
+        resp.usage = Usage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=PromptTokensDetailsWrapper(
+                cached_tokens=usage.get("cached_input_tokens", 0),
+                cache_creation_tokens=usage.get("cache_write_tokens", 0),
+            ),
+            completion_tokens_details=CompletionTokensDetailsWrapper(
+                reasoning_tokens=usage.get("reasoning_tokens", 0),
+            ),
+        )
+        try:
+            return float(completion_cost(completion_response=resp, model=model_name))
+        except Exception:
+            logger.debug(f"Could not calculate cost for model {model_name}", exc_info=True)
+            return 0.0
+
+    def _populate_usage(self, events_text: str, context: AgentContext) -> None:
+        """Extract usage from events and populate context + write usage.json."""
+        model_name = self.model_name or os.environ.get("LETTA_MODEL", "").strip()
+        usage = self._extract_usage_from_events(events_text)
+        cost = self._calculate_cost(model_name, usage)
+
+        context.n_input_tokens = usage["prompt_tokens"] or None
+        context.n_output_tokens = usage["completion_tokens"] or None
+        context.cost_usd = cost if cost > 0 else None
+
+        # Write usage.json to the task directory (parent of agent logs)
+        usage_data: dict = {
+            "prompt_tokens": usage["prompt_tokens"],
+            "completion_tokens": usage["completion_tokens"],
+            "total_tokens": usage["prompt_tokens"] + usage["completion_tokens"],
+            "cost_usd": round(cost, 6),
+        }
+        for key in ("cached_input_tokens", "cache_write_tokens", "reasoning_tokens"):
+            if usage.get(key, 0) > 0:
+                usage_data[key] = usage[key]
+
+        try:
+            usage_path = Path(self.logs_dir).parent / "usage.json"
+            usage_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(usage_path, "w") as f:
+                json.dump(usage_data, f, indent=2)
+        except Exception as e:
+            logger.warning(f"Failed to save usage.json: {e}")
+
+    # ------------------------------------------------------------------
+    # Harbor lifecycle hooks
+    # ------------------------------------------------------------------
+
+    def populate_context_post_run(self, context: AgentContext) -> None:
+        """Populate agent context from downloaded logs (e.g. after timeout).
+
+        Harbor calls this when ``context.is_empty()`` returns True, which
+        happens when ``run()`` is cancelled by a timeout before it can
+        populate the context itself. Harbor's ``_maybe_download_logs``
+        copies the container's ``/logs/agent/`` directory to
+        ``self.logs_dir`` first, so event files should be available here.
+        """
+        events_text = self._find_events_text()
+        if not events_text.strip():
+            return
+
+        agent_id = self._extract_agent_id_from_events(events_text)
+        if agent_id:
+            (Path(self.logs_dir) / "letta_agent_id_recovered.txt").write_text(agent_id)
+
+        try:
+            self._populate_usage(events_text, context)
+        except Exception as e:
+            logger.warning(f"Failed to extract usage in populate_context_post_run: {e}")
+
+    async def setup(self, environment: BaseEnvironment) -> None:
+        """Install the letta CLI inside the task container."""
+        await super().setup(environment)
+
+    async def run(
+        self,
+        instruction: str,
+        environment: BaseEnvironment,
+        context: AgentContext,
+    ) -> None:
+        """Invoke letta CLI inside the environment with the given instruction."""
+
+        # --- environment variables ----------------------------------------
+        agent_env: dict[str, str] = {}
+        for key in ("LETTA_API_KEY", "LETTA_BASE_URL", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
+            if key in os.environ:
+                agent_env[key] = os.environ[key]
+
+        # Prefer Letta Code model id (bundles reasoning config) over raw handle.
+        # self.model_name (litellm handle) is still used for cost calculation.
+        cli_model = self._letta_code_model or self.model_name or os.environ.get("LETTA_MODEL", "").strip()
+        if cli_model:
+            agent_env["LETTA_MODEL"] = cli_model
+
+        # --- build full instruction with prompt prefix ----------------------
+        prompt_prefix = (
+            "Complete the task. Do NOT ask clarification questions, you have "
+            "enough information to complete the task. Make sure to finish the "
+            "task to the best of your ability and do not stop at an intermediate step."
+        )
+        full_instruction = f"{prompt_prefix}\n\n{instruction}"
+
+        # --- upload instruction -------------------------------------------
+        escaped_instruction = shlex.quote(full_instruction)
+        with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmpf:
+            tmpf.write(full_instruction)
+            local_instr_path = tmpf.name
+        try:
+            await environment.exec("bash -lc 'mkdir -p /installed-agent'", timeout_sec=None)
+            await environment.upload_file(local_instr_path, "/installed-agent/instruction.txt")
+        finally:
+            try:
+                Path(local_instr_path).unlink(missing_ok=True)  # type: ignore[arg-type]
+            except Exception:
+                pass
+
+        # --- build run script ---------------------------------------------
+        ts = datetime.now().strftime("%Y-%m-%d__%H-%M-%S")
+        base = f"/logs/agent/{ts}"
+        model_flag = self._build_model_flags(cli_model)
+
+        run_script = (
+            "#!/usr/bin/env bash\n"
+            "set -eo pipefail\n"
+            "source ~/.bashrc >/dev/null 2>&1 || true\n"
+            "mkdir -p /logs/agent\n"
+            f"letta --new-agent --conv default --no-skills {model_flag}-p {escaped_instruction} "
+            f"--permission-mode bypassPermissions --output-format stream-json "
+            f"2>'{base}.stderr.log' | tee '{base}.events.jsonl'\n"
+        )
+
+        logs_dir = Path(self.logs_dir)
+        logs_dir.mkdir(parents=True, exist_ok=True)
+        run_script_path = logs_dir / "run_script.sh"
+        run_script_path.write_text(run_script)
+
+        # --- execute ------------------------------------------------------
+        result = None
+        run_error: Exception | None = None
+
+        async def _capture_settings_after_delay() -> None:
+            """Snapshot agent ID from settings shortly after the agent starts.
+
+            This is a safety net for timeouts: if run() is cancelled before
+            reaching the post-run log collection, we still have the agent ID.
+            """
+            try:
+                await asyncio.sleep(1.0)
+                out = await environment.exec(
+                    "bash -lc 'cat .letta/settings.local.json 2>/dev/null || true'",
+                    timeout_sec=None,
+                )
+                mid_agent_id = self._extract_agent_id_from_settings(out.stdout or "")
+                if mid_agent_id:
+                    (logs_dir / f"letta_agent_id_{ts}.txt").write_text(mid_agent_id)
+            except Exception:
+                pass
+
+        try:
+            await environment.exec("bash -lc 'mkdir -p /installed-agent'", timeout_sec=None)
+            tmp_script_path = "/installed-agent/run-letta.sh"
+            await environment.upload_file(str(run_script_path), tmp_script_path)
+            await environment.exec(f"bash -lc 'chmod +x {tmp_script_path}'", timeout_sec=None)
+
+            settings_task = asyncio.create_task(_capture_settings_after_delay())  # noqa: F841 — keep a reference so the task isn't garbage-collected before it runs
+
+            result = await environment.exec(
+                f"bash -lc 'bash {tmp_script_path}'",
+                env=agent_env or None,
+                timeout_sec=None,
+            )
+        except Exception as e:
+            run_error = e
+
+        # --- extract agent id & export -------------------------------------
+        # Harbor already downloads /logs/agent/{ts}.* to self.logs_dir,
+        # so we only need to fetch the events in-memory for agent ID extraction.
+        agent_id: str | None = None
+        events_text: str = ""
+        try:
+            events_text = await self._download_file(environment, f"{base}.events.jsonl")
+
+            settings_text = await self._download_file(environment, ".letta/settings.local.json")
+            agent_id = self._extract_agent_id_from_settings(settings_text)
+
+            if not agent_id:
+                agent_id = self._extract_agent_id_from_events(events_text)
+
+            if agent_id:
+                (logs_dir / f"letta_agent_id_{ts}.txt").write_text(agent_id)
+
+            if agent_id and run_error is None:
+                self._export_agent(agent_id, logs_dir, ts)
+        except Exception:
+            pass
+
+        # --- usage / cost -------------------------------------------------
+        try:
+            self._populate_usage(events_text, context)
+        except Exception as e:
+            logger.warning(f"Failed to extract/save usage: {e}")
+
+        # --- populate context ---------------------------------------------
+        context.metadata = {
+            **(context.metadata or {}),
+            "letta_return_code": getattr(result, "return_code", None),
+            "letta_logs_ts": ts,
+        }
+
+        if run_error is not None:
+            raise run_error
+
+    # ------------------------------------------------------------------
+    # Private I/O helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    async def _download_file(environment: BaseEnvironment, remote_path: str) -> str:
+        """Cat a file from the environment, returning '' on failure."""
+        try:
+            out = await environment.exec(
+                f"bash -lc 'cat \"{remote_path}\" 2>/dev/null || true'",
+                timeout_sec=None,
+            )
+            return out.stdout or ""
+        except Exception:
+            return ""
+
+    @staticmethod
+    def _export_agent(agent_id: str, logs_dir: Path, ts: str) -> None:
+        """Download the ``.af`` agent export (best-effort)."""
+        try:
+            base_url = os.environ.get("LETTA_BASE_URL", "https://api.letta.com").rstrip("/")
+            export_url = f"{base_url}/v1/agents/{agent_id}/export"
+            req = urllib.request.Request(export_url, method="GET")
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                agent_bytes = resp.read()
+            (logs_dir / f"letta_agent_export_{ts}.af").write_bytes(agent_bytes)
+        except Exception:
+            pass
diff --git a/benchmarks/terminal_bench/regression-tasks.txt b/benchmarks/terminal_bench/regression-tasks.txt
new file mode 100644
index 0000000..efe2026
--- /dev/null
+++ b/benchmarks/terminal_bench/regression-tasks.txt
@@ -0,0 +1,6 @@
+# Terminal-Bench regression task subset for Letta Code
+# These tasks are run on a schedule to detect regressions.
+# Criteria: fast (<10 min), diverse capabilities, deterministic.
+# Adjust based on known Letta Code pass rates.
+
+cancel-async-tasks
diff --git a/benchmarks/terminal_bench/report.py b/benchmarks/terminal_bench/report.py
new file mode 100644
index 0000000..c3a9900
--- /dev/null
+++ b/benchmarks/terminal_bench/report.py
@@ -0,0 +1,271 @@
+#!/usr/bin/env python3
+"""Parse Harbor job results and report regressions via GitHub Issue.
+
+Usage:
+    python report.py --results-dir results/ --baseline baseline.json --repo owner/repo
+
+Expects Harbor job output structure under results-dir:
+    results/
+      tb-results-<model>/
+        jobs/
+          <job-name>/
+            result.json
+            <task>/
+              result.json        # trial result with reward
+              verifier/
+                reward.txt       # 0.0 or 1.0
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+
+def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]:
+    """Parse Harbor job results into {model: {task: passed}}."""
+    model_results: dict[str, dict[str, bool]] = {}
+
+    for artifact_dir in sorted(results_dir.iterdir()):
+        if not artifact_dir.is_dir():
+            continue
+
+        # Artifact name: tb-results-<model>
+        dir_name = artifact_dir.name
+        if dir_name.startswith("tb-results-"):
+            model = dir_name[len("tb-results-"):]
+        else:
+            model = dir_name
+
+        tasks: dict[str, bool] = {}
+
+        # Look for job directories — Harbor puts them under jobs/
+        jobs_dir = artifact_dir / "jobs"
+        if not jobs_dir.exists():
+            # Artifacts might be flat (just the job contents)
+            jobs_dir = artifact_dir
+
+        for job_dir in sorted(jobs_dir.iterdir()):
+            if not job_dir.is_dir():
+                continue
+
+            # Each subdirectory of the job is a trial (task)
+            for trial_dir in sorted(job_dir.iterdir()):
+                if not trial_dir.is_dir():
+                    continue
+
+                # Skip non-trial dirs like config.json
+                task_name = trial_dir.name
+
+                # Try verifier/reward.txt first
+                reward_file = trial_dir / "verifier" / "reward.txt"
+                if reward_file.exists():
+                    try:
+                        reward = float(reward_file.read_text().strip())
+                        tasks[task_name] = reward >= 1.0
+                        continue
+                    except (ValueError, OSError):
+                        pass
+
+                # Fall back to result.json
+                result_file = trial_dir / "result.json"
+                if result_file.exists():
+                    try:
+                        result = json.loads(result_file.read_text())
+                        reward = result.get("reward", result.get("score", 0))
+                        tasks[task_name] = float(reward) >= 1.0
+                    except (json.JSONDecodeError, ValueError, OSError):
+                        tasks[task_name] = False
+
+        if tasks:
+            model_results[model] = tasks
+
+    return model_results
+
+
+def compute_pass_rate(tasks: dict[str, bool]) -> float:
+    """Compute pass rate from task results."""
+    if not tasks:
+        return 0.0
+    return sum(1 for v in tasks.values() if v) / len(tasks)
+
+
+def load_baseline(baseline_path: Path) -> dict:
+    """Load baseline.json, returning empty dict if missing or empty."""
+    if not baseline_path.exists():
+        return {}
+    try:
+        data = json.loads(baseline_path.read_text())
+        return data if isinstance(data, dict) and data else {}
+    except (json.JSONDecodeError, OSError):
+        return {}
+
+
+def build_report(
+    model_results: dict[str, dict[str, bool]],
+    baseline: dict,
+) -> tuple[str, bool]:
+    """Build a markdown report and determine if there's a regression.
+
+    Returns (markdown_body, has_regression).
+    """
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+    lines = [
+        f"## Terminal-Bench Regression Report — {now}",
+        "",
+    ]
+
+    has_regression = False
+
+    for model, tasks in sorted(model_results.items()):
+        pass_rate = compute_pass_rate(tasks)
+        passed = sum(1 for v in tasks.values() if v)
+        total = len(tasks)
+
+        # Compare to baseline
+        baseline_model = baseline.get(model, {})
+        baseline_rate = baseline_model.get("pass_rate")
+        baseline_tasks = baseline_model.get("tasks", {})
+
+        delta_str = ""
+        if baseline_rate is not None:
+            delta = pass_rate - baseline_rate
+            if delta < -0.10:
+                has_regression = True
+                delta_str = f" | **{delta:+.0%} from baseline** :red_circle:"
+            elif delta < 0:
+                delta_str = f" | {delta:+.0%} from baseline :warning:"
+            elif delta > 0:
+                delta_str = f" | {delta:+.0%} from baseline :white_check_mark:"
+
+        lines.append(f"### `{model}` — {passed}/{total} ({pass_rate:.0%}){delta_str}")
+        lines.append("")
+        lines.append("| Task | Result | Baseline |")
+        lines.append("|------|--------|----------|")
+
+        for task_name, passed_now in sorted(tasks.items()):
+            result_emoji = ":white_check_mark:" if passed_now else ":x:"
+            baseline_val = baseline_tasks.get(task_name)
+
+            if baseline_val is None:
+                baseline_str = "—"
+            elif baseline_val:
+                baseline_str = ":white_check_mark:"
+            else:
+                baseline_str = ":x:"
+
+            # Flag regressions: was passing, now failing
+            regression_marker = ""
+            if baseline_val is True and not passed_now:
+                regression_marker = " **REGRESSION**"
+                has_regression = True
+
+            lines.append(f"| {task_name} | {result_emoji}{regression_marker} | {baseline_str} |")
+
+        lines.append("")
+
+    if not model_results:
+        lines.append("No results found. Check workflow logs.")
+        lines.append("")
+
+    # Add workflow link
+    run_url = os.environ.get("GITHUB_SERVER_URL", "https://github.com")
+    repo = os.environ.get("GITHUB_REPOSITORY", "")
+    run_id = os.environ.get("GITHUB_RUN_ID", "")
+    if repo and run_id:
+        lines.append(f"[Workflow run]({run_url}/{repo}/actions/runs/{run_id})")
+        lines.append("")
+
+    return "\n".join(lines), has_regression
+
+
+def update_github_issue(repo: str, title: str, body: str) -> None:
+    """Create or update a GitHub Issue with the given title and body.
+
+    Uses `gh` CLI which must be authenticated via GH_TOKEN env var.
+    """
+    # Search for existing issue
+    result = subprocess.run(
+        ["gh", "issue", "list", "--repo", repo, "--search", f'"{title}" in:title', "--state", "open", "--json", "number", "--limit", "1"],
+        capture_output=True,
+        text=True,
+    )
+
+    existing_number = None
+    if result.returncode == 0 and result.stdout.strip():
+        try:
+            issues = json.loads(result.stdout)
+            if issues:
+                existing_number = issues[0]["number"]
+        except (json.JSONDecodeError, KeyError, IndexError):
+            pass
+
+    if existing_number:
+        # Update existing issue with a comment
+        subprocess.run(
+            ["gh", "issue", "comment", str(existing_number), "--repo", repo, "--body", body],
+            check=True,
+        )
+        print(f"Updated issue #{existing_number}")
+    else:
+        # Create new issue
+        result = subprocess.run(
+            ["gh", "issue", "create", "--repo", repo, "--title", title, "--body", body, "--label", "benchmark"],
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode == 0:
+            print(f"Created issue: {result.stdout.strip()}")
+        else:
+            # Label might not exist — retry without it
+            subprocess.run(
+                ["gh", "issue", "create", "--repo", repo, "--title", title, "--body", body],
+                check=True,
+            )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Report Terminal-Bench regression results")
+    parser.add_argument("--results-dir", required=True, type=Path, help="Directory with downloaded artifacts")
+    parser.add_argument("--baseline", required=True, type=Path, help="Path to baseline.json")
+    parser.add_argument("--repo", required=True, help="GitHub repo (owner/repo)")
+    args = parser.parse_args()
+
+    model_results = parse_job_results(args.results_dir)
+    baseline = load_baseline(args.baseline)
+
+    if not model_results:
+        print("WARNING: No results parsed from artifacts.")
+        print(f"Contents of {args.results_dir}:")
+        for p in sorted(args.results_dir.rglob("*")):
+            print(f"  {p}")
+        sys.exit(1)
+
+    body, has_regression = build_report(model_results, baseline)
+
+    # Print report to stdout
+    print(body)
+
+    # Update GitHub Issue
+    gh_token = os.environ.get("GH_TOKEN")
+    if gh_token:
+        update_github_issue(
+            repo=args.repo,
+            title="Terminal-Bench Regression Tracker",
+            body=body,
+        )
+    else:
+        print("GH_TOKEN not set — skipping GitHub Issue update")
+
+    if has_regression:
+        print("\n:red_circle: REGRESSION DETECTED — failing workflow")
+        sys.exit(1)
+    else:
+        print("\nNo regressions detected.")
+
+
+if __name__ == "__main__":
+    main()