Files
letta-code/benchmarks/terminal_bench/baseline.json

191 lines
6.1 KiB
JSON

{
"sonnet-4.6-xhigh": {
"pass_rate": 0.427,
"tasks": {
"adaptive-rejection-sampler": false,
"bn-fit-modify": true,
"break-filter-js-from-html": false,
"build-cython-ext": true,
"build-pmars": true,
"build-pov-ray": true,
"caffe-cifar-10": false,
"cancel-async-tasks": false,
"chess-best-move": false,
"circuit-fibsqrt": false,
"cobol-modernization": false,
"code-from-image": true,
"compile-compcert": true,
"configure-git-webserver": true,
"constraints-scheduling": true,
"count-dataset-tokens": true,
"crack-7z-hash": false,
"custom-memory-heap-crash": false,
"db-wal-recovery": false,
"distribution-search": false,
"dna-assembly": false,
"dna-insert": false,
"extract-elf": true,
"extract-moves-from-video": false,
"feal-differential-cryptanalysis": true,
"feal-linear-cryptanalysis": false,
"filter-js-from-html": false,
"financial-document-processor": false,
"fix-code-vulnerability": true,
"fix-git": true,
"fix-ocaml-gc": true,
"gcode-to-text": true,
"git-leak-recovery": true,
"git-multibranch": true,
"gpt2-codegolf": false,
"headless-terminal": true,
"hf-model-inference": true,
"install-windows-3.11": false,
"kv-store-grpc": true,
"large-scale-text-editing": true,
"largest-eigenval": false,
"llm-inference-batching-scheduler": false,
"log-summary-date-ranges": true,
"mailman": false,
"make-doom-for-mips": false,
"make-mips-interpreter": true,
"mcmc-sampling-stan": true,
"merge-diff-arc-agi-task": true,
"model-extraction-relu-logits": false,
"modernize-scientific-stack": true,
"mteb-leaderboard": false,
"mteb-retrieve": false,
"multi-source-data-merger": true,
"nginx-request-logging": true,
"openssl-selfsigned-cert": true,
"overfull-hbox": false,
"password-recovery": true,
"path-tracing-reverse": false,
"path-tracing": false,
"polyglot-c-py": false,
"polyglot-rust-c": false,
"portfolio-optimization": false,
"protein-assembly": false,
"prove-plus-comm": true,
"pypi-server": true,
"pytorch-model-cli": false,
"pytorch-model-recovery": true,
"qemu-alpine-ssh": false,
"qemu-startup": true,
"query-optimize": false,
"raman-fitting": false,
"regex-chess": false,
"regex-log": true,
"reshard-c4-data": false,
"rstan-to-pystan": false,
"sam-cell-seg": false,
"sanitize-git-repo": false,
"schemelike-metacircular-eval": false,
"sparql-university": false,
"sqlite-db-truncate": true,
"sqlite-with-gcov": false,
"torch-pipeline-parallelism": false,
"torch-tensor-parallelism": true,
"train-fasttext": false,
"tune-mjcf": false,
"video-processing": false,
"vulnerable-secret": true,
"winning-avg-corewars": false,
"write-compressor": false
}
},
"gpt-5.3-codex-xhigh": {
"pass_rate": 0.6404,
"tasks": {
"adaptive-rejection-sampler": false,
"bn-fit-modify": true,
"break-filter-js-from-html": true,
"build-cython-ext": true,
"build-pmars": true,
"build-pov-ray": true,
"caffe-cifar-10": true,
"cancel-async-tasks": false,
"chess-best-move": true,
"circuit-fibsqrt": true,
"cobol-modernization": true,
"code-from-image": true,
"compile-compcert": true,
"configure-git-webserver": true,
"constraints-scheduling": true,
"count-dataset-tokens": true,
"crack-7z-hash": true,
"custom-memory-heap-crash": true,
"db-wal-recovery": false,
"distribution-search": true,
"dna-assembly": false,
"dna-insert": true,
"extract-elf": true,
"extract-moves-from-video": false,
"feal-differential-cryptanalysis": true,
"feal-linear-cryptanalysis": true,
"filter-js-from-html": false,
"financial-document-processor": true,
"fix-code-vulnerability": true,
"fix-git": true,
"fix-ocaml-gc": true,
"gcode-to-text": false,
"git-leak-recovery": true,
"git-multibranch": true,
"gpt2-codegolf": false,
"headless-terminal": true,
"hf-model-inference": true,
"install-windows-3.11": false,
"kv-store-grpc": true,
"large-scale-text-editing": true,
"largest-eigenval": true,
"llm-inference-batching-scheduler": true,
"log-summary-date-ranges": true,
"mailman": false,
"make-doom-for-mips": false,
"make-mips-interpreter": false,
"mcmc-sampling-stan": false,
"merge-diff-arc-agi-task": true,
"model-extraction-relu-logits": true,
"modernize-scientific-stack": true,
"mteb-leaderboard": true,
"mteb-retrieve": false,
"multi-source-data-merger": true,
"nginx-request-logging": true,
"openssl-selfsigned-cert": true,
"overfull-hbox": false,
"password-recovery": true,
"path-tracing-reverse": true,
"path-tracing": true,
"polyglot-c-py": false,
"polyglot-rust-c": false,
"portfolio-optimization": false,
"protein-assembly": true,
"prove-plus-comm": true,
"pypi-server": true,
"pytorch-model-cli": true,
"pytorch-model-recovery": false,
"qemu-alpine-ssh": false,
"qemu-startup": false,
"query-optimize": false,
"raman-fitting": false,
"regex-chess": false,
"regex-log": true,
"reshard-c4-data": false,
"rstan-to-pystan": true,
"sam-cell-seg": false,
"sanitize-git-repo": true,
"schemelike-metacircular-eval": false,
"sparql-university": true,
"sqlite-db-truncate": true,
"sqlite-with-gcov": true,
"torch-pipeline-parallelism": false,
"torch-tensor-parallelism": false,
"train-fasttext": false,
"tune-mjcf": true,
"video-processing": false,
"vulnerable-secret": true,
"winning-avg-corewars": true,
"write-compressor": false
}
}
}