feat: TB regression — full runs, xhigh models, baselines, and report improvements (#1390)
Co-authored-by: Letta Code <noreply@letta.com>
This commit is contained in:
@@ -1 +1,190 @@
|
||||
{}
|
||||
{
|
||||
"sonnet-4.6-xhigh": {
|
||||
"pass_rate": 0.427,
|
||||
"tasks": {
|
||||
"adaptive-rejection-sampler": false,
|
||||
"bn-fit-modify": true,
|
||||
"break-filter-js-from-html": false,
|
||||
"build-cython-ext": true,
|
||||
"build-pmars": true,
|
||||
"build-pov-ray": true,
|
||||
"caffe-cifar-10": false,
|
||||
"cancel-async-tasks": false,
|
||||
"chess-best-move": false,
|
||||
"circuit-fibsqrt": false,
|
||||
"cobol-modernization": false,
|
||||
"code-from-image": true,
|
||||
"compile-compcert": true,
|
||||
"configure-git-webserver": true,
|
||||
"constraints-scheduling": true,
|
||||
"count-dataset-tokens": true,
|
||||
"crack-7z-hash": false,
|
||||
"custom-memory-heap-crash": false,
|
||||
"db-wal-recovery": false,
|
||||
"distribution-search": false,
|
||||
"dna-assembly": false,
|
||||
"dna-insert": false,
|
||||
"extract-elf": true,
|
||||
"extract-moves-from-video": false,
|
||||
"feal-differential-cryptanalysis": true,
|
||||
"feal-linear-cryptanalysis": false,
|
||||
"filter-js-from-html": false,
|
||||
"financial-document-processor": false,
|
||||
"fix-code-vulnerability": true,
|
||||
"fix-git": true,
|
||||
"fix-ocaml-gc": true,
|
||||
"gcode-to-text": true,
|
||||
"git-leak-recovery": true,
|
||||
"git-multibranch": true,
|
||||
"gpt2-codegolf": false,
|
||||
"headless-terminal": true,
|
||||
"hf-model-inference": true,
|
||||
"install-windows-3.11": false,
|
||||
"kv-store-grpc": true,
|
||||
"large-scale-text-editing": true,
|
||||
"largest-eigenval": false,
|
||||
"llm-inference-batching-scheduler": false,
|
||||
"log-summary-date-ranges": true,
|
||||
"mailman": false,
|
||||
"make-doom-for-mips": false,
|
||||
"make-mips-interpreter": true,
|
||||
"mcmc-sampling-stan": true,
|
||||
"merge-diff-arc-agi-task": true,
|
||||
"model-extraction-relu-logits": false,
|
||||
"modernize-scientific-stack": true,
|
||||
"mteb-leaderboard": false,
|
||||
"mteb-retrieve": false,
|
||||
"multi-source-data-merger": true,
|
||||
"nginx-request-logging": true,
|
||||
"openssl-selfsigned-cert": true,
|
||||
"overfull-hbox": false,
|
||||
"password-recovery": true,
|
||||
"path-tracing-reverse": false,
|
||||
"path-tracing": false,
|
||||
"polyglot-c-py": false,
|
||||
"polyglot-rust-c": false,
|
||||
"portfolio-optimization": false,
|
||||
"protein-assembly": false,
|
||||
"prove-plus-comm": true,
|
||||
"pypi-server": true,
|
||||
"pytorch-model-cli": false,
|
||||
"pytorch-model-recovery": true,
|
||||
"qemu-alpine-ssh": false,
|
||||
"qemu-startup": true,
|
||||
"query-optimize": false,
|
||||
"raman-fitting": false,
|
||||
"regex-chess": false,
|
||||
"regex-log": true,
|
||||
"reshard-c4-data": false,
|
||||
"rstan-to-pystan": false,
|
||||
"sam-cell-seg": false,
|
||||
"sanitize-git-repo": false,
|
||||
"schemelike-metacircular-eval": false,
|
||||
"sparql-university": false,
|
||||
"sqlite-db-truncate": true,
|
||||
"sqlite-with-gcov": false,
|
||||
"torch-pipeline-parallelism": false,
|
||||
"torch-tensor-parallelism": true,
|
||||
"train-fasttext": false,
|
||||
"tune-mjcf": false,
|
||||
"video-processing": false,
|
||||
"vulnerable-secret": true,
|
||||
"winning-avg-corewars": false,
|
||||
"write-compressor": false
|
||||
}
|
||||
},
|
||||
"gpt-5.3-codex-xhigh": {
|
||||
"pass_rate": 0.6404,
|
||||
"tasks": {
|
||||
"adaptive-rejection-sampler": false,
|
||||
"bn-fit-modify": true,
|
||||
"break-filter-js-from-html": true,
|
||||
"build-cython-ext": true,
|
||||
"build-pmars": true,
|
||||
"build-pov-ray": true,
|
||||
"caffe-cifar-10": true,
|
||||
"cancel-async-tasks": false,
|
||||
"chess-best-move": true,
|
||||
"circuit-fibsqrt": true,
|
||||
"cobol-modernization": true,
|
||||
"code-from-image": true,
|
||||
"compile-compcert": true,
|
||||
"configure-git-webserver": true,
|
||||
"constraints-scheduling": true,
|
||||
"count-dataset-tokens": true,
|
||||
"crack-7z-hash": true,
|
||||
"custom-memory-heap-crash": true,
|
||||
"db-wal-recovery": false,
|
||||
"distribution-search": true,
|
||||
"dna-assembly": false,
|
||||
"dna-insert": true,
|
||||
"extract-elf": true,
|
||||
"extract-moves-from-video": false,
|
||||
"feal-differential-cryptanalysis": true,
|
||||
"feal-linear-cryptanalysis": true,
|
||||
"filter-js-from-html": false,
|
||||
"financial-document-processor": true,
|
||||
"fix-code-vulnerability": true,
|
||||
"fix-git": true,
|
||||
"fix-ocaml-gc": true,
|
||||
"gcode-to-text": false,
|
||||
"git-leak-recovery": true,
|
||||
"git-multibranch": true,
|
||||
"gpt2-codegolf": false,
|
||||
"headless-terminal": true,
|
||||
"hf-model-inference": true,
|
||||
"install-windows-3.11": false,
|
||||
"kv-store-grpc": true,
|
||||
"large-scale-text-editing": true,
|
||||
"largest-eigenval": true,
|
||||
"llm-inference-batching-scheduler": true,
|
||||
"log-summary-date-ranges": true,
|
||||
"mailman": false,
|
||||
"make-doom-for-mips": false,
|
||||
"make-mips-interpreter": false,
|
||||
"mcmc-sampling-stan": false,
|
||||
"merge-diff-arc-agi-task": true,
|
||||
"model-extraction-relu-logits": true,
|
||||
"modernize-scientific-stack": true,
|
||||
"mteb-leaderboard": true,
|
||||
"mteb-retrieve": false,
|
||||
"multi-source-data-merger": true,
|
||||
"nginx-request-logging": true,
|
||||
"openssl-selfsigned-cert": true,
|
||||
"overfull-hbox": false,
|
||||
"password-recovery": true,
|
||||
"path-tracing-reverse": true,
|
||||
"path-tracing": true,
|
||||
"polyglot-c-py": false,
|
||||
"polyglot-rust-c": false,
|
||||
"portfolio-optimization": false,
|
||||
"protein-assembly": true,
|
||||
"prove-plus-comm": true,
|
||||
"pypi-server": true,
|
||||
"pytorch-model-cli": true,
|
||||
"pytorch-model-recovery": false,
|
||||
"qemu-alpine-ssh": false,
|
||||
"qemu-startup": false,
|
||||
"query-optimize": false,
|
||||
"raman-fitting": false,
|
||||
"regex-chess": false,
|
||||
"regex-log": true,
|
||||
"reshard-c4-data": false,
|
||||
"rstan-to-pystan": true,
|
||||
"sam-cell-seg": false,
|
||||
"sanitize-git-repo": true,
|
||||
"schemelike-metacircular-eval": false,
|
||||
"sparql-university": true,
|
||||
"sqlite-db-truncate": true,
|
||||
"sqlite-with-gcov": true,
|
||||
"torch-pipeline-parallelism": false,
|
||||
"torch-tensor-parallelism": false,
|
||||
"train-fasttext": false,
|
||||
"tune-mjcf": true,
|
||||
"video-processing": false,
|
||||
"vulnerable-secret": true,
|
||||
"winning-avg-corewars": true,
|
||||
"write-compressor": false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user