Files
letta-server/fern/pages/leaderboard/data.yaml
Kian Jones b8e9a80d93 merge this (#4759)
* wait I forgot to comit locally

* cp the entire core directory and then rm the .git subdir
2025-09-17 15:47:40 -07:00

157 lines
4.5 KiB
YAML

- model: claude-3-5-haiku
average: 87.78
total_cost: 4.15
archival_memory_read_benchmark: 96.33
core_memory_write_benchmark: 91.0
core_memory_read_benchmark: 76.0
- model: gemini-2-5-pro
average: 98.22
total_cost: 5.02
archival_memory_read_benchmark: 96.0
core_memory_write_benchmark: 98.67
core_memory_read_benchmark: 100.0
- model: claude-3-7-sonnet-extended
average: 95.78
total_cost: 14.42
archival_memory_read_benchmark: 93.33
core_memory_write_benchmark: 95.67
core_memory_read_benchmark: 98.33
- model: gemini-2-5-flash
average: 94.0
total_cost: 0.55
archival_memory_read_benchmark: 93.0
core_memory_write_benchmark: 92.0
core_memory_read_benchmark: 97.0
- model: openai-gpt-4.1
average: 95.44
total_cost: 7.05
archival_memory_read_benchmark: 89.67
core_memory_write_benchmark: 99.33
core_memory_read_benchmark: 97.33
- model: claude-3-7-sonnet
average: 92.56
total_cost: 17.24
archival_memory_read_benchmark: 88.0
core_memory_write_benchmark: 96.33
core_memory_read_benchmark: 93.33
- model: together-llama-4-scout-17b
average: 78.56
total_cost: 0.77
archival_memory_read_benchmark: 86.33
core_memory_write_benchmark: 56.0
core_memory_read_benchmark: 93.33
- model: together-qwen-2-5-72b
average: 77.44
total_cost: 4.71
archival_memory_read_benchmark: 79.33
core_memory_write_benchmark: 68.33
core_memory_read_benchmark: 84.67
- model: claude-3-5-sonnet
average: 90.0
total_cost: 14.07
archival_memory_read_benchmark: 76.67
core_memory_write_benchmark: 98.33
core_memory_read_benchmark: 95.0
- model: openai-gpt-4o
average: 88.0
total_cost: 8.11
archival_memory_read_benchmark: 69.0
core_memory_write_benchmark: 98.67
core_memory_read_benchmark: 96.33
- model: together-llama-3-1-405b
average: 81.67
total_cost: 9.84
archival_memory_read_benchmark: 60.67
core_memory_write_benchmark: 86.0
core_memory_read_benchmark: 98.33
- model: together-llama-4-maverick-17b
average: 62.33
total_cost: 1.06
archival_memory_read_benchmark: 53.0
core_memory_write_benchmark: 39.33
core_memory_read_benchmark: 94.67
- model: openai-o1
average: 77.11
total_cost: 63.63
archival_memory_read_benchmark: 52.33
core_memory_write_benchmark: 82.0
core_memory_read_benchmark: 97.0
- model: openai-gpt-4.1-mini
average: 78.22
total_cost: 1.35
archival_memory_read_benchmark: 41.0
core_memory_write_benchmark: 95.0
core_memory_read_benchmark: 98.67
- model: together-deepseek-v3
average: 73.33
total_cost: 3.39
archival_memory_read_benchmark: 26.33
core_memory_write_benchmark: 96.0
core_memory_read_benchmark: 97.67
- model: together-llama-3-2-3b
average: 4.67
total_cost: 0.87
archival_memory_read_benchmark: 14.0
core_memory_write_benchmark: 0.0
core_memory_read_benchmark: 0.0
- model: together-llama-3-70b
average: 35.89
total_cost: 1.56
archival_memory_read_benchmark: 13.0
core_memory_write_benchmark: 0.0
core_memory_read_benchmark: 94.67
- model: together-meta-llama-3-1-8b
average: 32.67
total_cost: 0.98
archival_memory_read_benchmark: 8.0
core_memory_write_benchmark: 12.0
core_memory_read_benchmark: 78.0
- model: together-llama-3-3-70b
average: 66.33
total_cost: 2.56
archival_memory_read_benchmark: 6.33
core_memory_write_benchmark: 97.0
core_memory_read_benchmark: 95.67
- model: together-meta-llama-3-1-70b
average: 62.56
total_cost: 2.61
archival_memory_read_benchmark: 6.0
core_memory_write_benchmark: 86.67
core_memory_read_benchmark: 95.0
- model: openai-o3-mini
average: 65.67
total_cost: 3.67
archival_memory_read_benchmark: 5.33
core_memory_write_benchmark: 93.33
core_memory_read_benchmark: 98.33
- model: openai-o4-mini
average: 67.0
total_cost: 3.89
archival_memory_read_benchmark: 4.67
core_memory_write_benchmark: 98.33
core_memory_read_benchmark: 98.0
- model: openai-gpt-4.1-nano
average: 24.0
total_cost: 0.35
archival_memory_read_benchmark: 2.0
core_memory_write_benchmark: 14.0
core_memory_read_benchmark: 56.0
- model: openai-gpt-4o-mini
average: 65.22
total_cost: 0.35
archival_memory_read_benchmark: 1.33
core_memory_write_benchmark: 95.33
core_memory_read_benchmark: 99.0
- model: together-qwen-2-5-7b
average: 16.67
total_cost: 1.23
archival_memory_read_benchmark: 1.0
core_memory_write_benchmark: 36.67
core_memory_read_benchmark: 12.33
- model: openai-gpt-3.5-turbo
average: 21.0
total_cost: 1.71
archival_memory_read_benchmark: 0.67
core_memory_write_benchmark: 10.33
core_memory_read_benchmark: 52.0