Yuxuan-Zhang-Dexter committed on
Commit
9578973
·
1 Parent(s): b25eb61

update news and leaderboard data

Browse files
app.py CHANGED
@@ -914,6 +914,10 @@ def build_app():
914
  label="Comparative Analysis (Group Bar Chart)",
915
  elem_classes="visualization-container"
916
  )
 
 
 
 
917
 
918
 
919
  # Hidden placeholder for group bar visualization (to maintain code references)
@@ -1143,6 +1147,10 @@ def build_app():
1143
  label="Comparative Analysis (Group Bar Chart)",
1144
  elem_classes="visualization-container"
1145
  )
 
 
 
 
1146
 
1147
  # Game selection section
1148
  with gr.Row():
 
914
  label="Comparative Analysis (Group Bar Chart)",
915
  elem_classes="visualization-container"
916
  )
917
+ gr.Markdown(
918
+ "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
919
+ elem_classes="radar-tip"
920
+ )
921
 
922
 
923
  # Hidden placeholder for group bar visualization (to maintain code references)
 
1147
  label="Comparative Analysis (Group Bar Chart)",
1148
  elem_classes="visualization-container"
1149
  )
1150
+ gr.Markdown(
1151
+ "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
1152
+ elem_classes="radar-tip"
1153
+ )
1154
 
1155
  # Game selection section
1156
  with gr.Row():
assets/game_video_link.json CHANGED
@@ -1,7 +1,8 @@
1
  {
2
  "sokoban": "https://www.youtube.com/watch?v=59enV32MBUE",
3
- "super_mario": "https://www.youtube.com/watch?v=nixMIJZYAgg",
4
  "2048": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
5
  "candy": "https://www.youtube.com/watch?v=b-Uyz3W4yIg",
6
- "ace_attorney": "https://www.youtube.com/watch?v=q8PMW870yp8"
 
7
  }
 
1
  {
2
  "sokoban": "https://www.youtube.com/watch?v=59enV32MBUE",
3
+ "super_mario_bros": "https://www.youtube.com/watch?v=nixMIJZYAgg",
4
  "2048": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
5
  "candy": "https://www.youtube.com/watch?v=b-Uyz3W4yIg",
6
+ "ace_attorney": "https://www.youtube.com/watch?v=q8PMW870yp8",
7
+ "tetris": "https://www.youtube.com/watch?v=m6i9L6-pgu4"
8
  }
assets/model_color.json CHANGED
@@ -1,6 +1,5 @@
1
  {
2
  "claude-3-7-sonnet-20250219": "#4A90E2",
3
- "claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
4
  "claude-3-5-haiku-20241022": "#7FB5E6",
5
  "claude-3-5-sonnet-20241022": "#1A4C7C",
6
  "claude-opus-4-20250514": "#3A80D2",
@@ -9,9 +8,8 @@
9
  "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
10
  "gemini-2.5-pro-exp-03-25": "#FF80AB",
11
  "gemini-2.5-flash-preview-04-17": "#F06292",
12
- "gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
13
  "gemini-2.5-flash-preview-05-20": "#F8BBD9",
14
- "gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
15
  "gemini-2.5-pro-preview-06-05": "#EC407A",
16
  "gpt-4o-2024-11-20": "#00BFA5",
17
  "gpt-4.5-preview-2025-02-27": "#00796B",
@@ -23,7 +21,6 @@
23
  "o4-mini-2025-04-16": "#00ACC1",
24
  "grok-3-beta": "#FF7043",
25
  "grok-3-mini-beta": "#FF8A65",
26
- "grok-3-mini-beta (thinking)": "#F57C00",
27
  "deepseek-v3": "#FFC107",
28
  "deepseek-r1-0120": "#FFA000",
29
  "deepseek-r1-0528": "#FFB300",
@@ -31,7 +28,6 @@
31
  "qwen3-235B-A22B-fp8": "#6A1B9A",
32
  "random (x30)": "#9E9E9E",
33
  "gamingagent + claude-3-7-sonnet-20250219": "#4A90E2",
34
- "gamingagent + claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
35
  "gamingagent + claude-3-5-haiku-20241022": "#7FB5E6",
36
  "gamingagent + claude-3-5-sonnet-20241022": "#1A4C7C",
37
  "gamingagent + claude-opus-4-20250514": "#3A80D2",
@@ -40,9 +36,8 @@
40
  "gamingagent + gemini-2.0-flash-thinking-exp-1219": "#C2185B",
41
  "gamingagent + gemini-2.5-pro-exp-03-25": "#FF80AB",
42
  "gamingagent + gemini-2.5-flash-preview-04-17": "#F06292",
43
- "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
44
  "gamingagent + gemini-2.5-flash-preview-05-20": "#F8BBD9",
45
- "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
46
  "gamingagent + gemini-2.5-pro-preview-06-05": "#EC407A",
47
  "gamingagent + gpt-4o-2024-11-20": "#00BFA5",
48
  "gamingagent + gpt-4.5-preview-2025-02-27": "#00796B",
@@ -54,7 +49,6 @@
54
  "gamingagent + o4-mini-2025-04-16": "#00ACC1",
55
  "gamingagent + grok-3-beta": "#FF7043",
56
  "gamingagent + grok-3-mini-beta": "#FF8A65",
57
- "gamingagent + grok-3-mini-beta (thinking)": "#F57C00",
58
  "gamingagent + deepseek-v3": "#FFC107",
59
  "gamingagent + deepseek-r1-0120": "#FFA000",
60
  "gamingagent + deepseek-r1-0528": "#FFB300",
 
1
  {
2
  "claude-3-7-sonnet-20250219": "#4A90E2",
 
3
  "claude-3-5-haiku-20241022": "#7FB5E6",
4
  "claude-3-5-sonnet-20241022": "#1A4C7C",
5
  "claude-opus-4-20250514": "#3A80D2",
 
8
  "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
9
  "gemini-2.5-pro-exp-03-25": "#FF80AB",
10
  "gemini-2.5-flash-preview-04-17": "#F06292",
 
11
  "gemini-2.5-flash-preview-05-20": "#F8BBD9",
12
+ "gemini-2.5-pro-preview-05-06": "#AD1457",
13
  "gemini-2.5-pro-preview-06-05": "#EC407A",
14
  "gpt-4o-2024-11-20": "#00BFA5",
15
  "gpt-4.5-preview-2025-02-27": "#00796B",
 
21
  "o4-mini-2025-04-16": "#00ACC1",
22
  "grok-3-beta": "#FF7043",
23
  "grok-3-mini-beta": "#FF8A65",
 
24
  "deepseek-v3": "#FFC107",
25
  "deepseek-r1-0120": "#FFA000",
26
  "deepseek-r1-0528": "#FFB300",
 
28
  "qwen3-235B-A22B-fp8": "#6A1B9A",
29
  "random (x30)": "#9E9E9E",
30
  "gamingagent + claude-3-7-sonnet-20250219": "#4A90E2",
 
31
  "gamingagent + claude-3-5-haiku-20241022": "#7FB5E6",
32
  "gamingagent + claude-3-5-sonnet-20241022": "#1A4C7C",
33
  "gamingagent + claude-opus-4-20250514": "#3A80D2",
 
36
  "gamingagent + gemini-2.0-flash-thinking-exp-1219": "#C2185B",
37
  "gamingagent + gemini-2.5-pro-exp-03-25": "#FF80AB",
38
  "gamingagent + gemini-2.5-flash-preview-04-17": "#F06292",
 
39
  "gamingagent + gemini-2.5-flash-preview-05-20": "#F8BBD9",
40
+ "gamingagent + gemini-2.5-pro-preview-05-06": "#AD1457",
41
  "gamingagent + gemini-2.5-pro-preview-06-05": "#EC407A",
42
  "gamingagent + gpt-4o-2024-11-20": "#00BFA5",
43
  "gamingagent + gpt-4.5-preview-2025-02-27": "#00796B",
 
49
  "gamingagent + o4-mini-2025-04-16": "#00ACC1",
50
  "gamingagent + grok-3-beta": "#FF7043",
51
  "gamingagent + grok-3-mini-beta": "#FF8A65",
 
52
  "gamingagent + deepseek-v3": "#FFC107",
53
  "gamingagent + deepseek-r1-0120": "#FFA000",
54
  "gamingagent + deepseek-r1-0528": "#FFB300",
assets/news.json CHANGED
@@ -1,10 +1,22 @@
1
  {
2
  "news": [
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "date": "2025-04-28",
5
  "video_link": "https://www.youtube.com/watch?v=OEQRhBKYxIE",
6
  "twitter_text": "Grok-3-mini-beta Joins the Battle: Outperforms Gemini 2.5 Flash, Challenges O3-mini Across Games — Full Grok-3-beta Power Yet to Come. 🚀",
7
- "twitter_link": "https://x.com/haoailab"
8
  },
9
  {
10
  "date": "2025-04-24",
 
1
  {
2
  "news": [
3
+ {
4
+ "date": "2025-06-07",
5
+ "video_link": "https://www.youtube.com/watch?v=4CV-fB5EVJs",
6
+ "twitter_text": "New Benchmark Results: Claude-sonnet-4 & Claude-opus-4 and Join Us in the Discord!",
7
+ "twitter_link": "https://x.com/haoailab/status/1931438794756313530"
8
+ },
9
+ {
10
+ "date": "2025-06-03",
11
+ "video_link": "https://www.youtube.com/watch?v=m6i9L6-pgu4",
12
+ "twitter_text": "New Benchmark Results: How do top open-source models like Deepseek r1 & Qwen 3 perform on games?",
13
+ "twitter_link": "https://x.com/haoailab/status/1929997363407708646"
14
+ },
15
  {
16
  "date": "2025-04-28",
17
  "video_link": "https://www.youtube.com/watch?v=OEQRhBKYxIE",
18
  "twitter_text": "Grok-3-mini-beta Joins the Battle: Outperforms Gemini 2.5 Flash, Challenges O3-mini Across Games — Full Grok-3-beta Power Yet to Come. 🚀",
19
+ "twitter_link": "https://x.com/haoailab/status/1917309598861779021"
20
  },
21
  {
22
  "date": "2025-04-24",
data_visualization.py CHANGED
@@ -314,6 +314,7 @@ def create_group_bar_chart(df, top_n=10):
314
  uniformtext=dict(mode='hide', minsize=8), # Hide text that doesn't fit
315
  legend=dict(
316
  font=dict(size=12),
 
317
  itemsizing='trace',
318
  x=1.1,
319
  y=1,
 
314
  uniformtext=dict(mode='hide', minsize=8), # Hide text that doesn't fit
315
  legend=dict(
316
  font=dict(size=12),
317
+ title="Choose your model 💡 (click / double-click)",
318
  itemsizing='trace',
319
  x=1.1,
320
  y=1,
rank_data_03_25_2025.json CHANGED
@@ -5,67 +5,67 @@
5
  {
6
  "model": "gamingagent + claude-3-5-sonnet-20241022",
7
  "score": 1267.7,
8
- "detail_data":"709,1532,1562",
9
  "progress": "1-1"
10
  },
11
  {
12
- "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
13
  "score": 1418.7,
14
- "detail_data":"2015,709,1532",
15
  "progress": "1-1"
16
  },
17
  {
18
- "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
19
  "score": 1385.0,
20
- "detail_data":"1672,1266,1247",
21
  "progress": "1-1"
22
  },
23
  {
24
- "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
25
  "score": 1498.3,
26
- "detail_data":"1561,1271,1663",
27
  "progress": "1-1"
28
  },
29
  {
30
  "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
31
  "score": 1468.7,
32
- "detail_data":"898,2008,1500",
33
  "progress": "1-1"
34
  },
35
  {
36
  "model": "gamingagent + gpt-4.1-2025-04-14",
37
  "score": 2126.3,
38
- "detail_data":"1531,722,4126",
39
  "progress": "1-1"
40
  },
41
  {
42
  "model": "gamingagent + gpt-4o-2024-11-20",
43
  "score": 2047.3,
44
- "detail_data":"2017,2590,1535",
45
  "progress": "1-1"
46
  },
47
  {
48
  "model": "gamingagent + o1-2024-12-17",
49
  "score": 855,
50
- "detail_data":"855",
51
  "progress": "1-1"
52
  },
53
  {
54
  "model": "gamingagent + o3-2025-04-16",
55
  "score": 3445,
56
- "detail_data":"3445",
57
  "progress": "1-1"
58
  },
59
  {
60
  "model": "gamingagent + o4-mini-2025-04-16",
61
  "score": 1448.0,
62
- "detail_data":"1525,1263,1556",
63
  "progress": "1-1"
64
  },
65
  {
66
  "model": "random (x30)",
67
  "score": 986.97,
68
- "detail_data":"986.97",
69
  "progress": "1-1"
70
  }
71
  ]
@@ -80,7 +80,7 @@
80
  "highest_tail": 256
81
  },
82
  {
83
- "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
84
  "score": 2624,
85
  "details": "2560,3224,2088",
86
  "highest_tail": 256
@@ -92,19 +92,19 @@
92
  "highest_tail": 256
93
  },
94
  {
95
- "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
96
  "score": 1697.33,
97
  "details": "1304,1316,2472",
98
  "highest_tail": 256
99
  },
100
  {
101
- "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
102
  "score": 3586.67,
103
  "details": "5300,2400,3060",
104
  "highest_tail": 512
105
  },
106
  {
107
- "model": "gamingagent + grok-3-mini-beta (thinking)",
108
  "score": 4036,
109
  "details": "6412,2492,3204",
110
  "highest_tail": 512
@@ -192,7 +192,7 @@
192
  "details": "16,14,14"
193
  },
194
  {
195
- "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
196
  "score": 16.3,
197
  "details": "19,15,15"
198
  },
@@ -202,17 +202,17 @@
202
  "details": "15,14,14"
203
  },
204
  {
205
- "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
206
  "score": 16.3,
207
  "details": "20,14,15"
208
  },
209
  {
210
- "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
211
  "score": 23.3,
212
  "details": "23,23,24"
213
  },
214
  {
215
- "model": "gamingagent + grok-3-mini-beta (thinking)",
216
  "score": 21.3,
217
  "details": "20,15,29"
218
  },
@@ -287,7 +287,7 @@
287
  "details": "92,165,61"
288
  },
289
  {
290
- "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
291
  "score": 484,
292
  "details": "535,428,489"
293
  },
@@ -297,17 +297,17 @@
297
  "details": "409,436,497"
298
  },
299
  {
300
- "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
301
  "score": 334.7,
302
  "details": "259,372,373"
303
  },
304
  {
305
- "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
306
  "score": 416.3,
307
  "details": "411,414,424"
308
  },
309
  {
310
- "model": "gamingagent + grok-3-mini-beta (thinking)",
311
  "score": 254,
312
  "details": "299,332,131"
313
  },
@@ -379,85 +379,85 @@
379
  {
380
  "model": "gamingagent + claude-3-5-sonnet-20241022",
381
  "score": 0,
382
- "detail_box_on_target":"0,0,0",
383
  "cracked_levels": "0,0,0"
384
  },
385
  {
386
- "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
387
  "score": 2.33,
388
- "detail_box_on_target":"2,4,1",
389
  "cracked_levels": "1,2,0"
390
  },
391
  {
392
  "model": "gamingagent + deepseek-r1-0120",
393
  "score": 1.33,
394
- "detail_box_on_target":"2,0,2",
395
  "cracked_levels": "1,0,1"
396
  },
397
  {
398
- "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
399
  "score": 1.67,
400
- "detail_box_on_target":"3,0,2",
401
  "cracked_levels": "2,0,1"
402
  },
403
  {
404
- "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
405
  "score": 4.33,
406
- "detail_box_on_target":"4,4,5",
407
  "cracked_levels": "2,2,3"
408
  },
409
  {
410
- "model": "gamingagent + grok-3-mini-beta (thinking)",
411
  "score": 5.67,
412
- "detail_box_on_target":"5,6,6",
413
  "cracked_levels": "3,3,3"
414
  },
415
  {
416
  "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
417
  "score": 0,
418
- "detail_box_on_target":"0,0,0",
419
  "cracked_levels": "0,0,0"
420
  },
421
  {
422
  "model": "gamingagent + gpt-4.1-2025-04-14",
423
  "score": 0,
424
- "detail_box_on_target":"0,0,0",
425
  "cracked_levels": "0,0,0"
426
  },
427
  {
428
  "model": "gamingagent + gpt-4o-2024-11-20",
429
  "score": 0,
430
- "detail_box_on_target":"0,0,0",
431
  "cracked_levels": "0,0,0"
432
  },
433
  {
434
  "model": "gamingagent + o1-2024-12-17",
435
  "score": 2.33,
436
- "detail_box_on_target":"2,2,3",
437
  "cracked_levels": "1,1,2"
438
  },
439
  {
440
  "model": "gamingagent + o1-mini-2024-09-12",
441
  "score": 1.33,
442
- "detail_box_on_target":"1,2,1",
443
  "cracked_levels": "0,1,0"
444
  },
445
  {
446
  "model": "gamingagent + o3-2025-04-16",
447
  "score": 8,
448
- "detail_box_on_target":"10,6",
449
  "cracked_levels": "5,3"
450
  },
451
  {
452
  "model": "gamingagent + o4-mini-2025-04-16",
453
  "score": 5.33,
454
- "detail_box_on_target":"4,6,6",
455
  "cracked_levels": "2,2,3"
456
  },
457
  {
458
  "model": "random (x30)",
459
  "score": 0,
460
- "detail_box_on_target":"0,0,0",
461
  "cracked_levels": "0,0,0"
462
  },
463
  {
@@ -492,7 +492,7 @@
492
  "evaluator result": "1/3"
493
  },
494
  {
495
- "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
496
  "score": 7,
497
  "progress": "2:2/9",
498
  "evaluator result": "5/11"
@@ -504,19 +504,19 @@
504
  "evaluator result": "1/5"
505
  },
506
  {
507
- "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
508
  "score": 4,
509
  "progress": "1:4/5",
510
  "evaluator result": "1/7"
511
  },
512
  {
513
- "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
514
  "score": 7,
515
  "progress": "2:2/9",
516
  "evaluator result": "2/3"
517
  },
518
  {
519
- "model": "gamingagent + grok-3-mini-beta (thinking)",
520
  "score": 0,
521
  "progress": "0",
522
  "evaluator result": "0"
 
5
  {
6
  "model": "gamingagent + claude-3-5-sonnet-20241022",
7
  "score": 1267.7,
8
+ "detail_data": "709,1532,1562",
9
  "progress": "1-1"
10
  },
11
  {
12
+ "model": "gamingagent + claude-3-7-sonnet-20250219",
13
  "score": 1418.7,
14
+ "detail_data": "2015,709,1532",
15
  "progress": "1-1"
16
  },
17
  {
18
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17",
19
  "score": 1385.0,
20
+ "detail_data": "1672,1266,1247",
21
  "progress": "1-1"
22
  },
23
  {
24
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06",
25
  "score": 1498.3,
26
+ "detail_data": "1561,1271,1663",
27
  "progress": "1-1"
28
  },
29
  {
30
  "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
31
  "score": 1468.7,
32
+ "detail_data": "898,2008,1500",
33
  "progress": "1-1"
34
  },
35
  {
36
  "model": "gamingagent + gpt-4.1-2025-04-14",
37
  "score": 2126.3,
38
+ "detail_data": "1531,722,4126",
39
  "progress": "1-1"
40
  },
41
  {
42
  "model": "gamingagent + gpt-4o-2024-11-20",
43
  "score": 2047.3,
44
+ "detail_data": "2017,2590,1535",
45
  "progress": "1-1"
46
  },
47
  {
48
  "model": "gamingagent + o1-2024-12-17",
49
  "score": 855,
50
+ "detail_data": "855",
51
  "progress": "1-1"
52
  },
53
  {
54
  "model": "gamingagent + o3-2025-04-16",
55
  "score": 3445,
56
+ "detail_data": "3445",
57
  "progress": "1-1"
58
  },
59
  {
60
  "model": "gamingagent + o4-mini-2025-04-16",
61
  "score": 1448.0,
62
+ "detail_data": "1525,1263,1556",
63
  "progress": "1-1"
64
  },
65
  {
66
  "model": "random (x30)",
67
  "score": 986.97,
68
+ "detail_data": "986.97",
69
  "progress": "1-1"
70
  }
71
  ]
 
80
  "highest_tail": 256
81
  },
82
  {
83
+ "model": "gamingagent + claude-3-7-sonnet-20250219",
84
  "score": 2624,
85
  "details": "2560,3224,2088",
86
  "highest_tail": 256
 
92
  "highest_tail": 256
93
  },
94
  {
95
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17",
96
  "score": 1697.33,
97
  "details": "1304,1316,2472",
98
  "highest_tail": 256
99
  },
100
  {
101
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06",
102
  "score": 3586.67,
103
  "details": "5300,2400,3060",
104
  "highest_tail": 512
105
  },
106
  {
107
+ "model": "gamingagent + grok-3-mini-beta",
108
  "score": 4036,
109
  "details": "6412,2492,3204",
110
  "highest_tail": 512
 
192
  "details": "16,14,14"
193
  },
194
  {
195
+ "model": "gamingagent + claude-3-7-sonnet-20250219",
196
  "score": 16.3,
197
  "details": "19,15,15"
198
  },
 
202
  "details": "15,14,14"
203
  },
204
  {
205
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17",
206
  "score": 16.3,
207
  "details": "20,14,15"
208
  },
209
  {
210
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06",
211
  "score": 23.3,
212
  "details": "23,23,24"
213
  },
214
  {
215
+ "model": "gamingagent + grok-3-mini-beta",
216
  "score": 21.3,
217
  "details": "20,15,29"
218
  },
 
287
  "details": "92,165,61"
288
  },
289
  {
290
+ "model": "gamingagent + claude-3-7-sonnet-20250219",
291
  "score": 484,
292
  "details": "535,428,489"
293
  },
 
297
  "details": "409,436,497"
298
  },
299
  {
300
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17",
301
  "score": 334.7,
302
  "details": "259,372,373"
303
  },
304
  {
305
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06",
306
  "score": 416.3,
307
  "details": "411,414,424"
308
  },
309
  {
310
+ "model": "gamingagent + grok-3-mini-beta",
311
  "score": 254,
312
  "details": "299,332,131"
313
  },
 
379
  {
380
  "model": "gamingagent + claude-3-5-sonnet-20241022",
381
  "score": 0,
382
+ "detail_box_on_target": "0,0,0",
383
  "cracked_levels": "0,0,0"
384
  },
385
  {
386
+ "model": "gamingagent + claude-3-7-sonnet-20250219",
387
  "score": 2.33,
388
+ "detail_box_on_target": "2,4,1",
389
  "cracked_levels": "1,2,0"
390
  },
391
  {
392
  "model": "gamingagent + deepseek-r1-0120",
393
  "score": 1.33,
394
+ "detail_box_on_target": "2,0,2",
395
  "cracked_levels": "1,0,1"
396
  },
397
  {
398
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17",
399
  "score": 1.67,
400
+ "detail_box_on_target": "3,0,2",
401
  "cracked_levels": "2,0,1"
402
  },
403
  {
404
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06",
405
  "score": 4.33,
406
+ "detail_box_on_target": "4,4,5",
407
  "cracked_levels": "2,2,3"
408
  },
409
  {
410
+ "model": "gamingagent + grok-3-mini-beta",
411
  "score": 5.67,
412
+ "detail_box_on_target": "5,6,6",
413
  "cracked_levels": "3,3,3"
414
  },
415
  {
416
  "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
417
  "score": 0,
418
+ "detail_box_on_target": "0,0,0",
419
  "cracked_levels": "0,0,0"
420
  },
421
  {
422
  "model": "gamingagent + gpt-4.1-2025-04-14",
423
  "score": 0,
424
+ "detail_box_on_target": "0,0,0",
425
  "cracked_levels": "0,0,0"
426
  },
427
  {
428
  "model": "gamingagent + gpt-4o-2024-11-20",
429
  "score": 0,
430
+ "detail_box_on_target": "0,0,0",
431
  "cracked_levels": "0,0,0"
432
  },
433
  {
434
  "model": "gamingagent + o1-2024-12-17",
435
  "score": 2.33,
436
+ "detail_box_on_target": "2,2,3",
437
  "cracked_levels": "1,1,2"
438
  },
439
  {
440
  "model": "gamingagent + o1-mini-2024-09-12",
441
  "score": 1.33,
442
+ "detail_box_on_target": "1,2,1",
443
  "cracked_levels": "0,1,0"
444
  },
445
  {
446
  "model": "gamingagent + o3-2025-04-16",
447
  "score": 8,
448
+ "detail_box_on_target": "10,6",
449
  "cracked_levels": "5,3"
450
  },
451
  {
452
  "model": "gamingagent + o4-mini-2025-04-16",
453
  "score": 5.33,
454
+ "detail_box_on_target": "4,6,6",
455
  "cracked_levels": "2,2,3"
456
  },
457
  {
458
  "model": "random (x30)",
459
  "score": 0,
460
+ "detail_box_on_target": "0,0,0",
461
  "cracked_levels": "0,0,0"
462
  },
463
  {
 
492
  "evaluator result": "1/3"
493
  },
494
  {
495
+ "model": "gamingagent + claude-3-7-sonnet-20250219",
496
  "score": 7,
497
  "progress": "2:2/9",
498
  "evaluator result": "5/11"
 
504
  "evaluator result": "1/5"
505
  },
506
  {
507
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17",
508
  "score": 4,
509
  "progress": "1:4/5",
510
  "evaluator result": "1/7"
511
  },
512
  {
513
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06",
514
  "score": 7,
515
  "progress": "2:2/9",
516
  "evaluator result": "2/3"
517
  },
518
  {
519
+ "model": "gamingagent + grok-3-mini-beta",
520
  "score": 0,
521
  "progress": "0",
522
  "evaluator result": "0"
rank_single_model_03_25_2025.json CHANGED
@@ -9,19 +9,19 @@
9
  "progress": "1-1"
10
  },
11
  {
12
- "model": "claude-3-7-sonnet-20250219 (thinking)",
13
  "score": 1430.0,
14
  "detail_data": "1532,1515,1243",
15
  "progress": "1-1"
16
  },
17
  {
18
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
19
  "score": 1540.7,
20
  "detail_data": "1794,1270,1558",
21
  "progress": "1-1"
22
  },
23
  {
24
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
25
  "score": 1025.3,
26
  "detail_data": "820,1534,722",
27
  "progress": "1-1"
@@ -75,67 +75,67 @@
75
  "results": [
76
  {
77
  "model": "claude-3-5-sonnet-20241022",
78
- "score": 17.0,
79
  "details": "188,20,44",
80
  "highest_tail": 32
81
  },
82
  {
83
- "model": "claude-3-7-sonnet-20250219 (thinking)",
84
- "score": 126.3,
85
  "details": "1596,4256,3008",
86
  "highest_tail": 512
87
  },
88
  {
89
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
90
- "score": 97.7,
91
  "details": "2228,1424,1564",
92
  "highest_tail": 256
93
  },
94
  {
95
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
96
  "score": 120.5,
97
- "details": "5784,3544,3704",
98
  "highest_tail": 512
99
  },
100
  {
101
  "model": "llama-4-maverick-17b-128e-instruct-fp8",
102
- "score": 44.6,
103
  "details": "16,56,12",
104
  "highest_tail": 64
105
  },
106
  {
107
  "model": "gpt-4.1-2025-04-14",
108
- "score": 94.5,
109
  "details": "264,500,2576",
110
  "highest_tail": 256
111
  },
112
  {
113
  "model": "gpt-4o-2024-11-20",
114
- "score": 70.4,
115
  "details": "292,196,40",
116
  "highest_tail": 32
117
  },
118
  {
119
  "model": "o1-2024-12-17",
120
- "score": 128.1,
121
  "details": "7176",
122
  "highest_tail": 512
123
  },
124
  {
125
  "model": "o3-2025-04-16",
126
- "score": 128.2,
127
  "details": "7220",
128
  "highest_tail": 512
129
  },
130
  {
131
  "model": "o4-mini-2025-04-16",
132
- "score": 97.6,
133
  "details": "3004,84,2560",
134
  "highest_tail": 256
135
  },
136
  {
137
  "model": "random (x30)",
138
- "score": 100.4,
139
  "details": "",
140
  "highest_tail": 128
141
  },
@@ -168,17 +168,17 @@
168
  "details": "10,15,12"
169
  },
170
  {
171
- "model": "claude-3-7-sonnet-20250219 (thinking)",
172
  "score": 13.0,
173
  "details": "13,13,13"
174
  },
175
  {
176
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
177
  "score": 19.0,
178
  "details": "15,18,24"
179
  },
180
  {
181
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
182
  "score": 12.3,
183
  "details": "15,9,13"
184
  },
@@ -243,17 +243,17 @@
243
  "details": "15,36,0"
244
  },
245
  {
246
- "model": "claude-3-7-sonnet-20250219 (thinking)",
247
  "score": 126.3,
248
  "details": "148,182,49"
249
  },
250
  {
251
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
252
  "score": 97.7,
253
  "details": "60,101,132"
254
  },
255
  {
256
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
257
  "score": 177.3,
258
  "details": "117,169,246"
259
  },
@@ -319,19 +319,19 @@
319
  "cracked_levels": "0,0,0"
320
  },
321
  {
322
- "model": "claude-3-7-sonnet-20250219 (thinking)",
323
  "score": 0,
324
  "detail_box_on_target": "0,0,0",
325
  "cracked_levels": "0,0,0"
326
  },
327
  {
328
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
329
  "score": 0,
330
  "detail_box_on_target": "0,0,0",
331
  "cracked_levels": "0,0,0"
332
  },
333
  {
334
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
335
  "score": 1,
336
  "detail_box_on_target": "1,1,1",
337
  "cracked_levels": "0,0,0"
@@ -407,17 +407,17 @@
407
  "progress": "1:1/5"
408
  },
409
  {
410
- "model": "claude-3-7-sonnet-20250219 (thinking)",
411
  "score": 3,
412
  "progress": "1:3/5"
413
  },
414
  {
415
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
416
  "score": 1,
417
  "progress": "1:1/5"
418
  },
419
  {
420
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
421
  "score": 8,
422
  "progress": "2:3/9"
423
  },
 
9
  "progress": "1-1"
10
  },
11
  {
12
+ "model": "claude-3-7-sonnet-20250219",
13
  "score": 1430.0,
14
  "detail_data": "1532,1515,1243",
15
  "progress": "1-1"
16
  },
17
  {
18
+ "model": "gemini-2.5-flash-preview-04-17",
19
  "score": 1540.7,
20
  "detail_data": "1794,1270,1558",
21
  "progress": "1-1"
22
  },
23
  {
24
+ "model": "gemini-2.5-pro-preview-05-06",
25
  "score": 1025.3,
26
  "detail_data": "820,1534,722",
27
  "progress": "1-1"
 
75
  "results": [
76
  {
77
  "model": "claude-3-5-sonnet-20241022",
78
+ "score": 84,
79
  "details": "188,20,44",
80
  "highest_tail": 32
81
  },
82
  {
83
+ "model": "claude-3-7-sonnet-20250219",
84
+ "score": 2624,
85
  "details": "1596,4256,3008",
86
  "highest_tail": 512
87
  },
88
  {
89
+ "model": "gemini-2.5-flash-preview-04-17",
90
+ "score": 1738.67,
91
  "details": "2228,1424,1564",
92
  "highest_tail": 256
93
  },
94
  {
95
+ "model": "gemini-2.5-pro-preview-05-06",
96
  "score": 120.5,
97
+ "details": "4344",
98
  "highest_tail": 512
99
  },
100
  {
101
  "model": "llama-4-maverick-17b-128e-instruct-fp8",
102
+ "score": 28,
103
  "details": "16,56,12",
104
  "highest_tail": 64
105
  },
106
  {
107
  "model": "gpt-4.1-2025-04-14",
108
+ "score": 1113.33,
109
  "details": "264,500,2576",
110
  "highest_tail": 256
111
  },
112
  {
113
  "model": "gpt-4o-2024-11-20",
114
+ "score": 176,
115
  "details": "292,196,40",
116
  "highest_tail": 32
117
  },
118
  {
119
  "model": "o1-2024-12-17",
120
+ "score": 7176,
121
  "details": "7176",
122
  "highest_tail": 512
123
  },
124
  {
125
  "model": "o3-2025-04-16",
126
+ "score": 7220,
127
  "details": "7220",
128
  "highest_tail": 512
129
  },
130
  {
131
  "model": "o4-mini-2025-04-16",
132
+ "score": 1882.67,
133
  "details": "3004,84,2560",
134
  "highest_tail": 256
135
  },
136
  {
137
  "model": "random (x30)",
138
+ "score": 1228,
139
  "details": "",
140
  "highest_tail": 128
141
  },
 
168
  "details": "10,15,12"
169
  },
170
  {
171
+ "model": "claude-3-7-sonnet-20250219",
172
  "score": 13.0,
173
  "details": "13,13,13"
174
  },
175
  {
176
+ "model": "gemini-2.5-flash-preview-04-17",
177
  "score": 19.0,
178
  "details": "15,18,24"
179
  },
180
  {
181
+ "model": "gemini-2.5-pro-preview-05-06",
182
  "score": 12.3,
183
  "details": "15,9,13"
184
  },
 
243
  "details": "15,36,0"
244
  },
245
  {
246
+ "model": "claude-3-7-sonnet-20250219",
247
  "score": 126.3,
248
  "details": "148,182,49"
249
  },
250
  {
251
+ "model": "gemini-2.5-flash-preview-04-17",
252
  "score": 97.7,
253
  "details": "60,101,132"
254
  },
255
  {
256
+ "model": "gemini-2.5-pro-preview-05-06",
257
  "score": 177.3,
258
  "details": "117,169,246"
259
  },
 
319
  "cracked_levels": "0,0,0"
320
  },
321
  {
322
+ "model": "claude-3-7-sonnet-20250219",
323
  "score": 0,
324
  "detail_box_on_target": "0,0,0",
325
  "cracked_levels": "0,0,0"
326
  },
327
  {
328
+ "model": "gemini-2.5-flash-preview-04-17",
329
  "score": 0,
330
  "detail_box_on_target": "0,0,0",
331
  "cracked_levels": "0,0,0"
332
  },
333
  {
334
+ "model": "gemini-2.5-pro-preview-05-06",
335
  "score": 1,
336
  "detail_box_on_target": "1,1,1",
337
  "cracked_levels": "0,0,0"
 
407
  "progress": "1:1/5"
408
  },
409
  {
410
+ "model": "claude-3-7-sonnet-20250219",
411
  "score": 3,
412
  "progress": "1:3/5"
413
  },
414
  {
415
+ "model": "gemini-2.5-flash-preview-04-17",
416
  "score": 1,
417
  "progress": "1:1/5"
418
  },
419
  {
420
+ "model": "gemini-2.5-pro-preview-05-06",
421
  "score": 8,
422
  "progress": "2:3/9"
423
  },