feat: TB regression — full runs, xhigh models, baselines, and report improvements (#1390)
Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
18
.github/workflows/terminal-bench-regression.yml
vendored
18
.github/workflows/terminal-bench-regression.yml
vendored
@@ -2,7 +2,7 @@ name: Terminal-Bench Regression
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
schedule:
|
schedule:
|
||||||
- cron: "0 8 * * 1,4" # Monday + Thursday 8am UTC
|
- cron: "0 12 * * 1" # Monday 5am PT (12pm UTC)
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
inputs:
|
inputs:
|
||||||
model:
|
model:
|
||||||
@@ -10,7 +10,7 @@ on:
|
|||||||
default: ""
|
default: ""
|
||||||
concurrency:
|
concurrency:
|
||||||
description: "Max concurrent tasks"
|
description: "Max concurrent tasks"
|
||||||
default: "4"
|
default: "10"
|
||||||
|
|
||||||
env:
|
env:
|
||||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
|
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
|
||||||
@@ -22,7 +22,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
model: [sonnet-4.6-low, gpt-5-minimal]
|
model: [sonnet-4.6-xhigh, gpt-5.3-codex-xhigh]
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v6
|
||||||
@@ -54,21 +54,13 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
source .venv/bin/activate
|
source .venv/bin/activate
|
||||||
|
|
||||||
# Build --task-name flags from regression-tasks.txt
|
|
||||||
TASK_FLAGS=""
|
|
||||||
while IFS= read -r task; do
|
|
||||||
[[ "$task" =~ ^#.*$ || -z "$task" ]] && continue
|
|
||||||
TASK_FLAGS="$TASK_FLAGS --task-name $task"
|
|
||||||
done < benchmarks/terminal_bench/regression-tasks.txt
|
|
||||||
|
|
||||||
harbor run \
|
harbor run \
|
||||||
--dataset terminal-bench@2.0 \
|
--dataset terminal-bench@2.0 \
|
||||||
--agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
|
--agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
|
||||||
--model "${{ matrix.model }}" \
|
--model "${{ matrix.model }}" \
|
||||||
--env modal \
|
--env modal \
|
||||||
--n-concurrent ${{ inputs.concurrency || '4' }} \
|
--n-concurrent ${{ inputs.concurrency || '10' }} \
|
||||||
--job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)" \
|
--job-name "regression-${{ matrix.model }}-$(date +%Y%m%d)"
|
||||||
$TASK_FLAGS
|
|
||||||
|
|
||||||
- name: Upload results artifact
|
- name: Upload results artifact
|
||||||
if: always()
|
if: always()
|
||||||
|
|||||||
55
benchmarks/terminal_bench/README.md
Normal file
55
benchmarks/terminal_bench/README.md
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
# Terminal-Bench Regression
|
||||||
|
|
||||||
|
Weekly regression tests for Letta Code on [Terminal-Bench 2.0](https://github.com/laude-institute/terminal-bench-2).
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
1. **GitHub Actions** (`.github/workflows/terminal-bench-regression.yml`) runs every Monday at 12:00 UTC (5am PDT / 4am PST)
|
||||||
|
2. **Harbor** orchestrates task execution in **Modal** cloud sandboxes
|
||||||
|
3. Letta Code is built from source (`main` branch) inside each sandbox
|
||||||
|
4. Results are compared against `baseline.json` and posted to a GitHub issue
|
||||||
|
5. `@devanshrj` is tagged if any model's pass rate drops more than 5 percentage points below its baseline, or if any task that passed in the baseline now fails
|
||||||
|
|
||||||
|
## Models
|
||||||
|
|
||||||
|
| Model | Baseline |
|
||||||
|
|-------|----------|
|
||||||
|
| `sonnet-4.6-xhigh` | 38/89 (42.7%) |
|
||||||
|
| `gpt-5.3-codex-xhigh` | 57/89 (64.0%) |
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
| File | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `letta_code_agent.py` | Harbor agent — installs and runs Letta Code CLI in sandbox |
|
||||||
|
| `install-letta-code.sh.j2` | Jinja2 install script (Node.js, Bun, build from source) |
|
||||||
|
| `baseline.json` | Per-model, per-task pass/fail baselines |
|
||||||
|
| `report.py` | Parses results, detects regressions, posts GitHub issue |
|
||||||
|
|
||||||
|
## Manual trigger
|
||||||
|
|
||||||
|
```bash
|
||||||
|
gh workflow run terminal-bench-regression.yml --ref main -f concurrency=10
|
||||||
|
```
|
||||||
|
|
||||||
|
## Required secrets
|
||||||
|
|
||||||
|
- `LETTA_API_KEY` — Letta Cloud API key
|
||||||
|
- `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` — LLM provider keys
|
||||||
|
- `MODAL_TOKEN_ID` / `MODAL_TOKEN_SECRET` — Modal sandbox credentials
|
||||||
|
|
||||||
|
## Updating baselines
|
||||||
|
|
||||||
|
Replace `baseline.json` with results from a new run. Format:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model-name": {
|
||||||
|
"pass_rate": 0.427,
|
||||||
|
"tasks": {
|
||||||
|
"task-name": true,
|
||||||
|
...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
@@ -1 +1,190 @@
|
|||||||
{}
|
{
|
||||||
|
"sonnet-4.6-xhigh": {
|
||||||
|
"pass_rate": 0.427,
|
||||||
|
"tasks": {
|
||||||
|
"adaptive-rejection-sampler": false,
|
||||||
|
"bn-fit-modify": true,
|
||||||
|
"break-filter-js-from-html": false,
|
||||||
|
"build-cython-ext": true,
|
||||||
|
"build-pmars": true,
|
||||||
|
"build-pov-ray": true,
|
||||||
|
"caffe-cifar-10": false,
|
||||||
|
"cancel-async-tasks": false,
|
||||||
|
"chess-best-move": false,
|
||||||
|
"circuit-fibsqrt": false,
|
||||||
|
"cobol-modernization": false,
|
||||||
|
"code-from-image": true,
|
||||||
|
"compile-compcert": true,
|
||||||
|
"configure-git-webserver": true,
|
||||||
|
"constraints-scheduling": true,
|
||||||
|
"count-dataset-tokens": true,
|
||||||
|
"crack-7z-hash": false,
|
||||||
|
"custom-memory-heap-crash": false,
|
||||||
|
"db-wal-recovery": false,
|
||||||
|
"distribution-search": false,
|
||||||
|
"dna-assembly": false,
|
||||||
|
"dna-insert": false,
|
||||||
|
"extract-elf": true,
|
||||||
|
"extract-moves-from-video": false,
|
||||||
|
"feal-differential-cryptanalysis": true,
|
||||||
|
"feal-linear-cryptanalysis": false,
|
||||||
|
"filter-js-from-html": false,
|
||||||
|
"financial-document-processor": false,
|
||||||
|
"fix-code-vulnerability": true,
|
||||||
|
"fix-git": true,
|
||||||
|
"fix-ocaml-gc": true,
|
||||||
|
"gcode-to-text": true,
|
||||||
|
"git-leak-recovery": true,
|
||||||
|
"git-multibranch": true,
|
||||||
|
"gpt2-codegolf": false,
|
||||||
|
"headless-terminal": true,
|
||||||
|
"hf-model-inference": true,
|
||||||
|
"install-windows-3.11": false,
|
||||||
|
"kv-store-grpc": true,
|
||||||
|
"large-scale-text-editing": true,
|
||||||
|
"largest-eigenval": false,
|
||||||
|
"llm-inference-batching-scheduler": false,
|
||||||
|
"log-summary-date-ranges": true,
|
||||||
|
"mailman": false,
|
||||||
|
"make-doom-for-mips": false,
|
||||||
|
"make-mips-interpreter": true,
|
||||||
|
"mcmc-sampling-stan": true,
|
||||||
|
"merge-diff-arc-agi-task": true,
|
||||||
|
"model-extraction-relu-logits": false,
|
||||||
|
"modernize-scientific-stack": true,
|
||||||
|
"mteb-leaderboard": false,
|
||||||
|
"mteb-retrieve": false,
|
||||||
|
"multi-source-data-merger": true,
|
||||||
|
"nginx-request-logging": true,
|
||||||
|
"openssl-selfsigned-cert": true,
|
||||||
|
"overfull-hbox": false,
|
||||||
|
"password-recovery": true,
|
||||||
|
"path-tracing-reverse": false,
|
||||||
|
"path-tracing": false,
|
||||||
|
"polyglot-c-py": false,
|
||||||
|
"polyglot-rust-c": false,
|
||||||
|
"portfolio-optimization": false,
|
||||||
|
"protein-assembly": false,
|
||||||
|
"prove-plus-comm": true,
|
||||||
|
"pypi-server": true,
|
||||||
|
"pytorch-model-cli": false,
|
||||||
|
"pytorch-model-recovery": true,
|
||||||
|
"qemu-alpine-ssh": false,
|
||||||
|
"qemu-startup": true,
|
||||||
|
"query-optimize": false,
|
||||||
|
"raman-fitting": false,
|
||||||
|
"regex-chess": false,
|
||||||
|
"regex-log": true,
|
||||||
|
"reshard-c4-data": false,
|
||||||
|
"rstan-to-pystan": false,
|
||||||
|
"sam-cell-seg": false,
|
||||||
|
"sanitize-git-repo": false,
|
||||||
|
"schemelike-metacircular-eval": false,
|
||||||
|
"sparql-university": false,
|
||||||
|
"sqlite-db-truncate": true,
|
||||||
|
"sqlite-with-gcov": false,
|
||||||
|
"torch-pipeline-parallelism": false,
|
||||||
|
"torch-tensor-parallelism": true,
|
||||||
|
"train-fasttext": false,
|
||||||
|
"tune-mjcf": false,
|
||||||
|
"video-processing": false,
|
||||||
|
"vulnerable-secret": true,
|
||||||
|
"winning-avg-corewars": false,
|
||||||
|
"write-compressor": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gpt-5.3-codex-xhigh": {
|
||||||
|
"pass_rate": 0.6404,
|
||||||
|
"tasks": {
|
||||||
|
"adaptive-rejection-sampler": false,
|
||||||
|
"bn-fit-modify": true,
|
||||||
|
"break-filter-js-from-html": true,
|
||||||
|
"build-cython-ext": true,
|
||||||
|
"build-pmars": true,
|
||||||
|
"build-pov-ray": true,
|
||||||
|
"caffe-cifar-10": true,
|
||||||
|
"cancel-async-tasks": false,
|
||||||
|
"chess-best-move": true,
|
||||||
|
"circuit-fibsqrt": true,
|
||||||
|
"cobol-modernization": true,
|
||||||
|
"code-from-image": true,
|
||||||
|
"compile-compcert": true,
|
||||||
|
"configure-git-webserver": true,
|
||||||
|
"constraints-scheduling": true,
|
||||||
|
"count-dataset-tokens": true,
|
||||||
|
"crack-7z-hash": true,
|
||||||
|
"custom-memory-heap-crash": true,
|
||||||
|
"db-wal-recovery": false,
|
||||||
|
"distribution-search": true,
|
||||||
|
"dna-assembly": false,
|
||||||
|
"dna-insert": true,
|
||||||
|
"extract-elf": true,
|
||||||
|
"extract-moves-from-video": false,
|
||||||
|
"feal-differential-cryptanalysis": true,
|
||||||
|
"feal-linear-cryptanalysis": true,
|
||||||
|
"filter-js-from-html": false,
|
||||||
|
"financial-document-processor": true,
|
||||||
|
"fix-code-vulnerability": true,
|
||||||
|
"fix-git": true,
|
||||||
|
"fix-ocaml-gc": true,
|
||||||
|
"gcode-to-text": false,
|
||||||
|
"git-leak-recovery": true,
|
||||||
|
"git-multibranch": true,
|
||||||
|
"gpt2-codegolf": false,
|
||||||
|
"headless-terminal": true,
|
||||||
|
"hf-model-inference": true,
|
||||||
|
"install-windows-3.11": false,
|
||||||
|
"kv-store-grpc": true,
|
||||||
|
"large-scale-text-editing": true,
|
||||||
|
"largest-eigenval": true,
|
||||||
|
"llm-inference-batching-scheduler": true,
|
||||||
|
"log-summary-date-ranges": true,
|
||||||
|
"mailman": false,
|
||||||
|
"make-doom-for-mips": false,
|
||||||
|
"make-mips-interpreter": false,
|
||||||
|
"mcmc-sampling-stan": false,
|
||||||
|
"merge-diff-arc-agi-task": true,
|
||||||
|
"model-extraction-relu-logits": true,
|
||||||
|
"modernize-scientific-stack": true,
|
||||||
|
"mteb-leaderboard": true,
|
||||||
|
"mteb-retrieve": false,
|
||||||
|
"multi-source-data-merger": true,
|
||||||
|
"nginx-request-logging": true,
|
||||||
|
"openssl-selfsigned-cert": true,
|
||||||
|
"overfull-hbox": false,
|
||||||
|
"password-recovery": true,
|
||||||
|
"path-tracing-reverse": true,
|
||||||
|
"path-tracing": true,
|
||||||
|
"polyglot-c-py": false,
|
||||||
|
"polyglot-rust-c": false,
|
||||||
|
"portfolio-optimization": false,
|
||||||
|
"protein-assembly": true,
|
||||||
|
"prove-plus-comm": true,
|
||||||
|
"pypi-server": true,
|
||||||
|
"pytorch-model-cli": true,
|
||||||
|
"pytorch-model-recovery": false,
|
||||||
|
"qemu-alpine-ssh": false,
|
||||||
|
"qemu-startup": false,
|
||||||
|
"query-optimize": false,
|
||||||
|
"raman-fitting": false,
|
||||||
|
"regex-chess": false,
|
||||||
|
"regex-log": true,
|
||||||
|
"reshard-c4-data": false,
|
||||||
|
"rstan-to-pystan": true,
|
||||||
|
"sam-cell-seg": false,
|
||||||
|
"sanitize-git-repo": true,
|
||||||
|
"schemelike-metacircular-eval": false,
|
||||||
|
"sparql-university": true,
|
||||||
|
"sqlite-db-truncate": true,
|
||||||
|
"sqlite-with-gcov": true,
|
||||||
|
"torch-pipeline-parallelism": false,
|
||||||
|
"torch-tensor-parallelism": false,
|
||||||
|
"train-fasttext": false,
|
||||||
|
"tune-mjcf": true,
|
||||||
|
"video-processing": false,
|
||||||
|
"vulnerable-secret": true,
|
||||||
|
"winning-avg-corewars": true,
|
||||||
|
"write-compressor": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -27,6 +27,12 @@ _PROVIDER_SYSTEM_MAP = {
|
|||||||
}
|
}
|
||||||
_DEFAULT_SYSTEM = "source-claude"
|
_DEFAULT_SYSTEM = "source-claude"
|
||||||
|
|
||||||
|
# Map Letta Code model handles to litellm model names for cost calculation.
|
||||||
|
_LITELLM_MODEL_MAP: dict[str, str] = {
|
||||||
|
"sonnet-4.6-xhigh": "anthropic/claude-sonnet-4-6",
|
||||||
|
"gpt-5.3-codex-xhigh": "openai/gpt-5.3-codex",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class LettaCode(BaseInstalledAgent):
|
class LettaCode(BaseInstalledAgent):
|
||||||
"""Run Letta Code CLI inside a harbor environment."""
|
"""Run Letta Code CLI inside a harbor environment."""
|
||||||
@@ -194,9 +200,10 @@ class LettaCode(BaseInstalledAgent):
|
|||||||
|
|
||||||
def _populate_usage(self, events_text: str, context: AgentContext) -> None:
|
def _populate_usage(self, events_text: str, context: AgentContext) -> None:
|
||||||
"""Extract usage from events and populate context + write usage.json."""
|
"""Extract usage from events and populate context + write usage.json."""
|
||||||
model_name = self.model_name or os.environ.get("LETTA_MODEL", "").strip()
|
raw_model = self.model_name or os.environ.get("LETTA_MODEL", "").strip()
|
||||||
|
litellm_model = _LITELLM_MODEL_MAP.get(raw_model, raw_model)
|
||||||
usage = self._extract_usage_from_events(events_text)
|
usage = self._extract_usage_from_events(events_text)
|
||||||
cost = self._calculate_cost(model_name, usage)
|
cost = self._calculate_cost(litellm_model, usage)
|
||||||
|
|
||||||
context.n_input_tokens = usage["prompt_tokens"] or None
|
context.n_input_tokens = usage["prompt_tokens"] or None
|
||||||
context.n_output_tokens = usage["completion_tokens"] or None
|
context.n_output_tokens = usage["completion_tokens"] or None
|
||||||
|
|||||||
@@ -1,6 +0,0 @@
|
|||||||
# Terminal-Bench regression task subset for Letta Code
|
|
||||||
# These tasks are run on a schedule to detect regressions.
|
|
||||||
# Criteria: fast (<10 min), diverse capabilities, deterministic.
|
|
||||||
# Adjust based on known Letta Code pass rates.
|
|
||||||
|
|
||||||
cancel-async-tasks
|
|
||||||
@@ -25,9 +25,9 @@ from datetime import datetime, timezone
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]:
|
def parse_job_results(results_dir: Path) -> dict[str, dict]:
|
||||||
"""Parse Harbor job results into {model: {task: passed}}."""
|
"""Parse Harbor job results into {model: {tasks: {task: passed}, cost: {..}}}."""
|
||||||
model_results: dict[str, dict[str, bool]] = {}
|
model_results: dict[str, dict] = {}
|
||||||
|
|
||||||
for artifact_dir in sorted(results_dir.iterdir()):
|
for artifact_dir in sorted(results_dir.iterdir()):
|
||||||
if not artifact_dir.is_dir():
|
if not artifact_dir.is_dir():
|
||||||
@@ -41,6 +41,9 @@ def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]:
|
|||||||
model = dir_name
|
model = dir_name
|
||||||
|
|
||||||
tasks: dict[str, bool] = {}
|
tasks: dict[str, bool] = {}
|
||||||
|
total_cost = 0.0
|
||||||
|
total_prompt_tokens = 0
|
||||||
|
total_completion_tokens = 0
|
||||||
|
|
||||||
# Look for job directories — Harbor puts them under jobs/
|
# Look for job directories — Harbor puts them under jobs/
|
||||||
jobs_dir = artifact_dir / "jobs"
|
jobs_dir = artifact_dir / "jobs"
|
||||||
@@ -66,10 +69,10 @@ def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]:
|
|||||||
try:
|
try:
|
||||||
reward = float(reward_file.read_text().strip())
|
reward = float(reward_file.read_text().strip())
|
||||||
tasks[task_name] = reward >= 1.0
|
tasks[task_name] = reward >= 1.0
|
||||||
continue
|
|
||||||
except (ValueError, OSError):
|
except (ValueError, OSError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
if task_name not in tasks:
|
||||||
# Fall back to result.json
|
# Fall back to result.json
|
||||||
result_file = trial_dir / "result.json"
|
result_file = trial_dir / "result.json"
|
||||||
if result_file.exists():
|
if result_file.exists():
|
||||||
@@ -80,8 +83,26 @@ def parse_job_results(results_dir: Path) -> dict[str, dict[str, bool]]:
|
|||||||
except (json.JSONDecodeError, ValueError, OSError):
|
except (json.JSONDecodeError, ValueError, OSError):
|
||||||
tasks[task_name] = False
|
tasks[task_name] = False
|
||||||
|
|
||||||
|
# Collect cost from usage.json
|
||||||
|
usage_file = trial_dir / "usage.json"
|
||||||
|
if usage_file.exists():
|
||||||
|
try:
|
||||||
|
usage = json.loads(usage_file.read_text())
|
||||||
|
total_cost += usage.get("cost_usd", 0.0)
|
||||||
|
total_prompt_tokens += usage.get("prompt_tokens", 0)
|
||||||
|
total_completion_tokens += usage.get("completion_tokens", 0)
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
pass
|
||||||
|
|
||||||
if tasks:
|
if tasks:
|
||||||
model_results[model] = tasks
|
model_results[model] = {
|
||||||
|
"tasks": tasks,
|
||||||
|
"cost": {
|
||||||
|
"cost_usd": round(total_cost, 2),
|
||||||
|
"prompt_tokens": total_prompt_tokens,
|
||||||
|
"completion_tokens": total_completion_tokens,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
return model_results
|
return model_results
|
||||||
|
|
||||||
@@ -105,7 +126,7 @@ def load_baseline(baseline_path: Path) -> dict:
|
|||||||
|
|
||||||
|
|
||||||
def build_report(
|
def build_report(
|
||||||
model_results: dict[str, dict[str, bool]],
|
model_results: dict[str, dict],
|
||||||
baseline: dict,
|
baseline: dict,
|
||||||
) -> tuple[str, bool]:
|
) -> tuple[str, bool]:
|
||||||
"""Build a markdown report and determine if there's a regression.
|
"""Build a markdown report and determine if there's a regression.
|
||||||
@@ -120,7 +141,9 @@ def build_report(
|
|||||||
|
|
||||||
has_regression = False
|
has_regression = False
|
||||||
|
|
||||||
for model, tasks in sorted(model_results.items()):
|
for model, data in sorted(model_results.items()):
|
||||||
|
tasks = data["tasks"]
|
||||||
|
cost = data.get("cost", {})
|
||||||
pass_rate = compute_pass_rate(tasks)
|
pass_rate = compute_pass_rate(tasks)
|
||||||
passed = sum(1 for v in tasks.values() if v)
|
passed = sum(1 for v in tasks.values() if v)
|
||||||
total = len(tasks)
|
total = len(tasks)
|
||||||
@@ -133,7 +156,7 @@ def build_report(
|
|||||||
delta_str = ""
|
delta_str = ""
|
||||||
if baseline_rate is not None:
|
if baseline_rate is not None:
|
||||||
delta = pass_rate - baseline_rate
|
delta = pass_rate - baseline_rate
|
||||||
if delta < -0.10:
|
if delta < -0.05:
|
||||||
has_regression = True
|
has_regression = True
|
||||||
delta_str = f" | **{delta:+.0%} from baseline** :red_circle:"
|
delta_str = f" | **{delta:+.0%} from baseline** :red_circle:"
|
||||||
elif delta < 0:
|
elif delta < 0:
|
||||||
@@ -141,30 +164,55 @@ def build_report(
|
|||||||
elif delta > 0:
|
elif delta > 0:
|
||||||
delta_str = f" | {delta:+.0%} from baseline :white_check_mark:"
|
delta_str = f" | {delta:+.0%} from baseline :white_check_mark:"
|
||||||
|
|
||||||
lines.append(f"### `{model}` — {passed}/{total} ({pass_rate:.0%}){delta_str}")
|
cost_str = ""
|
||||||
|
cost_usd = cost.get("cost_usd", 0)
|
||||||
|
if cost_usd > 0:
|
||||||
|
cost_str = f" | ${cost_usd:.2f}"
|
||||||
|
|
||||||
|
lines.append(f"<details>")
|
||||||
|
lines.append(f"<summary><strong>{model}</strong> — {passed}/{total} ({pass_rate:.0%}){delta_str}{cost_str}</summary>")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
lines.append("| Task | Result | Baseline |")
|
|
||||||
lines.append("|------|--------|----------|")
|
# Categorize tasks
|
||||||
|
regressions = [] # was passing, now failing
|
||||||
|
improvements = [] # was failing, now passing
|
||||||
|
new_tasks = [] # not in baseline
|
||||||
|
|
||||||
for task_name, passed_now in sorted(tasks.items()):
|
for task_name, passed_now in sorted(tasks.items()):
|
||||||
result_emoji = ":white_check_mark:" if passed_now else ":x:"
|
|
||||||
baseline_val = baseline_tasks.get(task_name)
|
baseline_val = baseline_tasks.get(task_name)
|
||||||
|
|
||||||
if baseline_val is None:
|
if baseline_val is None:
|
||||||
baseline_str = "—"
|
new_tasks.append((task_name, passed_now))
|
||||||
elif baseline_val:
|
elif baseline_val and not passed_now:
|
||||||
baseline_str = ":white_check_mark:"
|
regressions.append(task_name)
|
||||||
else:
|
|
||||||
baseline_str = ":x:"
|
|
||||||
|
|
||||||
# Flag regressions: was passing, now failing
|
|
||||||
regression_marker = ""
|
|
||||||
if baseline_val is True and not passed_now:
|
|
||||||
regression_marker = " **REGRESSION**"
|
|
||||||
has_regression = True
|
has_regression = True
|
||||||
|
elif not baseline_val and passed_now:
|
||||||
|
improvements.append(task_name)
|
||||||
|
|
||||||
lines.append(f"| {task_name} | {result_emoji}{regression_marker} | {baseline_str} |")
|
if regressions:
|
||||||
|
lines.append(f"**Regressions ({len(regressions)}):**")
|
||||||
|
for t in regressions:
|
||||||
|
lines.append(f"- :red_circle: {t}")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
if improvements:
|
||||||
|
lines.append(f"**Improvements ({len(improvements)}):**")
|
||||||
|
for t in improvements:
|
||||||
|
lines.append(f"- :white_check_mark: {t}")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
if new_tasks:
|
||||||
|
new_passed = sum(1 for _, p in new_tasks if p)
|
||||||
|
lines.append(f"**New tasks ({new_passed}/{len(new_tasks)} passed):**")
|
||||||
|
for t, p in new_tasks:
|
||||||
|
emoji = ":white_check_mark:" if p else ":x:"
|
||||||
|
lines.append(f"- {emoji} {t}")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
if not regressions and not improvements and not new_tasks:
|
||||||
|
lines.append("No changes from baseline.")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
lines.append("</details>")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
if not model_results:
|
if not model_results:
|
||||||
@@ -179,6 +227,10 @@ def build_report(
|
|||||||
lines.append(f"[Workflow run]({run_url}/{repo}/actions/runs/{run_id})")
|
lines.append(f"[Workflow run]({run_url}/{repo}/actions/runs/{run_id})")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
|
if has_regression:
|
||||||
|
lines.append("cc @devanshrj")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
return "\n".join(lines), has_regression
|
return "\n".join(lines), has_regression
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user