# Source: letta-code/benchmarks/terminal_bench/report.py (324 lines, 11 KiB, Python)
#!/usr/bin/env python3
"""Parse Harbor job results and report regressions via GitHub Issue.
Usage:
python report.py --results-dir results/ --baseline baseline.json --repo owner/repo
Expects Harbor job output structure under results-dir:
results/
tb-results-<model>/
jobs/
<job-name>/
result.json
<task-name>/
result.json # trial result with reward
verifier/
reward.txt # 0.0 or 1.0
"""
import argparse
import json
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
def parse_job_results(results_dir: Path) -> dict[str, dict]:
    """Parse Harbor job results into {model: {tasks: {task: passed}, cost: {..}}}.

    Walks each artifact directory (named ``tb-results-<model>``) under
    *results_dir*, then each job and trial directory, reading the trial
    outcome from ``verifier/reward.txt`` (preferred) or ``result.json``
    (fallback), and accumulating cost/token usage from ``usage.json``.
    Models that yield no parsed tasks are omitted from the result.
    """
    model_results: dict[str, dict] = {}
    for artifact_dir in sorted(results_dir.iterdir()):
        if not artifact_dir.is_dir():
            continue
        # Artifact name: tb-results-<model>; fall back to the raw dir name.
        model = artifact_dir.name.removeprefix("tb-results-")
        tasks: dict[str, bool] = {}
        total_cost = 0.0
        total_prompt_tokens = 0
        total_completion_tokens = 0
        # Look for job directories — Harbor puts them under jobs/
        jobs_dir = artifact_dir / "jobs"
        if not jobs_dir.exists():
            # Artifacts might be flat (just the job contents)
            jobs_dir = artifact_dir
        for job_dir in sorted(jobs_dir.iterdir()):
            if not job_dir.is_dir():
                continue
            # Each subdirectory of the job is a trial (task)
            for trial_dir in sorted(job_dir.iterdir()):
                if not trial_dir.is_dir():
                    # Skip non-trial entries like config.json
                    continue
                task_name = trial_dir.name
                # Try verifier/reward.txt first (contains 0.0 or 1.0)
                reward_file = trial_dir / "verifier" / "reward.txt"
                if reward_file.exists():
                    try:
                        reward = float(reward_file.read_text().strip())
                        tasks[task_name] = reward >= 1.0
                    except (ValueError, OSError):
                        pass  # unreadable reward — fall through to result.json
                if task_name not in tasks:
                    # Fall back to result.json's "reward" (or "score") field.
                    result_file = trial_dir / "result.json"
                    if result_file.exists():
                        try:
                            result = json.loads(result_file.read_text())
                            reward = result.get("reward", result.get("score", 0))
                            # TypeError included: the JSON value may be null.
                            tasks[task_name] = float(reward) >= 1.0
                        except (json.JSONDecodeError, ValueError, TypeError, OSError):
                            tasks[task_name] = False
                # Collect cost/token usage from usage.json (best-effort)
                usage_file = trial_dir / "usage.json"
                if usage_file.exists():
                    try:
                        usage = json.loads(usage_file.read_text())
                        total_cost += usage.get("cost_usd", 0.0)
                        total_prompt_tokens += usage.get("prompt_tokens", 0)
                        total_completion_tokens += usage.get("completion_tokens", 0)
                    except (json.JSONDecodeError, OSError):
                        pass
        if tasks:
            model_results[model] = {
                "tasks": tasks,
                "cost": {
                    "cost_usd": round(total_cost, 2),
                    "prompt_tokens": total_prompt_tokens,
                    "completion_tokens": total_completion_tokens,
                },
            }
    return model_results
def compute_pass_rate(tasks: dict[str, bool]) -> float:
    """Return the fraction of passing tasks, or 0.0 for an empty mapping."""
    if not tasks:
        # Avoid ZeroDivisionError on an empty result set.
        return 0.0
    return sum(tasks.values()) / len(tasks)
def load_baseline(baseline_path: Path) -> dict:
    """Load baseline.json, returning {} if missing, empty, or invalid.

    Any non-dict or empty-dict payload is normalized to {} so callers can
    treat "no baseline" uniformly.
    """
    if not baseline_path.exists():
        return {}
    try:
        data = json.loads(baseline_path.read_text())
    except (json.JSONDecodeError, OSError):
        return {}
    return data if isinstance(data, dict) and data else {}
def build_report(
    model_results: dict[str, dict],
    baseline: dict,
) -> tuple[str, bool]:
    """Build a markdown report and determine if there's a regression.

    A regression is flagged when a model's overall pass rate drops more
    than 5 points below its baseline rate, or when any individual task
    flips from passing (in baseline) to failing.

    Returns (markdown_body, has_regression).
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    lines = [
        f"## Terminal-Bench Regression Report — {now}",
        "",
    ]
    has_regression = False
    for model, data in sorted(model_results.items()):
        tasks = data["tasks"]
        cost = data.get("cost", {})
        pass_rate = compute_pass_rate(tasks)
        passed = sum(1 for v in tasks.values() if v)
        total = len(tasks)
        # Compare overall pass rate to baseline (if this model has one)
        baseline_model = baseline.get(model, {})
        baseline_rate = baseline_model.get("pass_rate")
        baseline_tasks = baseline_model.get("tasks", {})
        delta_str = ""
        if baseline_rate is not None:
            delta = pass_rate - baseline_rate
            if delta < -0.05:
                # More than 5 points down — treat as a regression
                has_regression = True
                delta_str = f" | **{delta:+.0%} from baseline** :red_circle:"
            elif delta < 0:
                delta_str = f" | {delta:+.0%} from baseline :warning:"
            elif delta > 0:
                delta_str = f" | {delta:+.0%} from baseline :white_check_mark:"
        cost_str = ""
        cost_usd = cost.get("cost_usd", 0)
        if cost_usd > 0:
            cost_str = f" | ${cost_usd:.2f}"
        lines.append("<details>")
        lines.append(
            f"<summary><strong>{model}</strong> — {passed}/{total} ({pass_rate:.0%}){delta_str}{cost_str}</summary>"
        )
        lines.append("")
        # Categorize per-task changes relative to baseline
        regressions = []   # was passing, now failing
        improvements = []  # was failing, now passing
        new_tasks = []     # not in baseline
        for task_name, passed_now in sorted(tasks.items()):
            baseline_val = baseline_tasks.get(task_name)
            if baseline_val is None:
                new_tasks.append((task_name, passed_now))
            elif baseline_val and not passed_now:
                regressions.append(task_name)
                has_regression = True
            elif not baseline_val and passed_now:
                improvements.append(task_name)
        if regressions:
            lines.append(f"**Regressions ({len(regressions)}):**")
            for t in regressions:
                lines.append(f"- :red_circle: {t}")
            lines.append("")
        if improvements:
            lines.append(f"**Improvements ({len(improvements)}):**")
            for t in improvements:
                lines.append(f"- :white_check_mark: {t}")
            lines.append("")
        if new_tasks:
            new_passed = sum(1 for _, p in new_tasks if p)
            lines.append(f"**New tasks ({new_passed}/{len(new_tasks)} passed):**")
            for t, p in new_tasks:
                emoji = ":white_check_mark:" if p else ":x:"
                lines.append(f"- {emoji} {t}")
            lines.append("")
        if not regressions and not improvements and not new_tasks:
            lines.append("No changes from baseline.")
            lines.append("")
        lines.append("</details>")
        lines.append("")
    if not model_results:
        lines.append("No results found. Check workflow logs.")
        lines.append("")
    # Link back to the GitHub Actions run that produced these results
    run_url = os.environ.get("GITHUB_SERVER_URL", "https://github.com")
    repo = os.environ.get("GITHUB_REPOSITORY", "")
    run_id = os.environ.get("GITHUB_RUN_ID", "")
    if repo and run_id:
        lines.append(f"[Workflow run]({run_url}/{repo}/actions/runs/{run_id})")
        lines.append("")
    if has_regression:
        # Ping the owner only when action is needed
        lines.append("cc @devanshrj")
        lines.append("")
    return "\n".join(lines), has_regression
def update_github_issue(repo: str, title: str, body: str) -> None:
    """Create or update a GitHub Issue with the given title and body.

    Uses the `gh` CLI, which must be authenticated via the GH_TOKEN env var.
    If an open issue with a matching title exists, the report is appended as
    a comment; otherwise a new issue is created (retrying without the
    "benchmark" label in case the label doesn't exist in the repo).
    """
    # Search for an existing open issue by title
    result = subprocess.run(
        [
            "gh", "issue", "list",
            "--repo", repo,
            "--search", f'"{title}" in:title',
            "--state", "open",
            "--json", "number",
            "--limit", "1",
        ],
        capture_output=True,
        text=True,
    )
    existing_number = None
    if result.returncode == 0 and result.stdout.strip():
        try:
            issues = json.loads(result.stdout)
            if issues:
                existing_number = issues[0]["number"]
        except (json.JSONDecodeError, KeyError, IndexError):
            pass  # treat unparseable `gh` output as "no existing issue"
    if existing_number:
        # Append the new report as a comment on the existing issue
        subprocess.run(
            ["gh", "issue", "comment", str(existing_number), "--repo", repo, "--body", body],
            check=True,
        )
        print(f"Updated issue #{existing_number}")
    else:
        # Create a new issue, preferring the "benchmark" label
        result = subprocess.run(
            ["gh", "issue", "create", "--repo", repo, "--title", title, "--body", body, "--label", "benchmark"],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            print(f"Created issue: {result.stdout.strip()}")
        else:
            # Label might not exist — retry without it
            subprocess.run(
                ["gh", "issue", "create", "--repo", repo, "--title", title, "--body", body],
                check=True,
            )
def main() -> None:
    """CLI entry point: parse artifacts, print the report, update the issue.

    Exits 1 when no results could be parsed or when a regression was
    detected, so the CI workflow fails visibly.
    """
    parser = argparse.ArgumentParser(description="Report Terminal-Bench regression results")
    parser.add_argument("--results-dir", required=True, type=Path, help="Directory with downloaded artifacts")
    parser.add_argument("--baseline", required=True, type=Path, help="Path to baseline.json")
    parser.add_argument("--repo", required=True, help="GitHub repo (owner/repo)")
    args = parser.parse_args()

    model_results = parse_job_results(args.results_dir)
    baseline = load_baseline(args.baseline)
    if not model_results:
        # Nothing parsed — dump the artifact tree to help debug, then fail
        print("WARNING: No results parsed from artifacts.")
        print(f"Contents of {args.results_dir}:")
        for p in sorted(args.results_dir.rglob("*")):
            print(f"  {p}")
        sys.exit(1)

    body, has_regression = build_report(model_results, baseline)
    # Print report to stdout so it appears in the workflow log
    print(body)

    # Update the GitHub Issue only when a token is available
    gh_token = os.environ.get("GH_TOKEN")
    if gh_token:
        update_github_issue(
            repo=args.repo,
            title="Terminal-Bench Regression Tracker",
            body=body,
        )
    else:
        print("GH_TOKEN not set — skipping GitHub Issue update")

    if has_regression:
        print("\n:red_circle: REGRESSION DETECTED — failing workflow")
        sys.exit(1)
    else:
        print("\nNo regressions detected.")


if __name__ == "__main__":
    main()