Add selimc/bilmecebench dataset and model results

#6
Files changed (35)
  1. data.py +3 -2
  2. data/datasets.json +8 -0
  3. results/zero-shot/CerebrumTech__cere-llama-3-8b-tr.json +6 -0
  4. results/zero-shot/Llama-3.3-70B-Instruct.json +6 -0
  5. results/zero-shot/Ministral-8B-Instruct.json +6 -0
  6. results/zero-shot/Mistral-7B-Instruct-v0.3.json +6 -0
  7. results/zero-shot/Mistral-7B-v0.3.json +6 -0
  8. results/zero-shot/Mixtral-8x7B-Instruct-v0.1.json +6 -0
  9. results/zero-shot/Qwen2.5-0.5B-Instruct.json +6 -0
  10. results/zero-shot/Qwen2.5-0.5B.json +6 -0
  11. results/zero-shot/Qwen2.5-1.5B-Instruct.json +6 -0
  12. results/zero-shot/Qwen2.5-1.5B.json +6 -0
  13. results/zero-shot/Qwen2.5-14B-Instruct.json +6 -0
  14. results/zero-shot/Qwen2.5-14B.json +6 -0
  15. results/zero-shot/Qwen2.5-3B-Instruct.json +6 -0
  16. results/zero-shot/Qwen2.5-3B.json +6 -0
  17. results/zero-shot/Qwen2.5-7B-Instruct.json +6 -0
  18. results/zero-shot/Qwen2.5-7B.json +6 -0
  19. results/zero-shot/aya-23-35B.json +6 -0
  20. results/zero-shot/aya-23-8b.json +6 -0
  21. results/zero-shot/aya-expanse-32b.json +6 -0
  22. results/zero-shot/aya-expanse-8b.json +6 -0
  23. results/zero-shot/aya101.json +6 -0
  24. results/zero-shot/commencis-7b.json +6 -0
  25. results/zero-shot/kanarya-2b.json +6 -0
  26. results/zero-shot/llama-3-8b-instruct.json +6 -1
  27. results/zero-shot/llama-3-8b.json +6 -1
  28. results/zero-shot/llama-3.1-8b-instruct.json +6 -1
  29. results/zero-shot/llama-3.1-8b.json +6 -0
  30. results/zero-shot/llama-3.2-1b.json +6 -1
  31. results/zero-shot/llama-3.2-3b-instruct.json +6 -0
  32. results/zero-shot/llama-3.2-3b.json +6 -1
  33. results/zero-shot/mistral-7b.json +6 -0
  34. results/zero-shot/trendyol-7b.json +6 -0
  35. results/zero-shot/turna.json +6 -0
data.py CHANGED
@@ -49,6 +49,9 @@ DATASET_TASK_DICT = {
     'turkish_plu_next_event_prediction': Tasks.MULTIPLE_CHOICE,
     'turkish_plu_step_inference': Tasks.MULTIPLE_CHOICE,
     'turkish_plu_step_ordering': Tasks.MULTIPLE_CHOICE,
+    'turkce_atasozleri': Tasks.MULTIPLE_CHOICE,
+    'turkishmmlu':Tasks.MULTIPLE_CHOICE,
+    'bilmecebench':Tasks.MULTIPLE_CHOICE,
 
     # fact-checking, not sure whether these are multi-choice
     # 'trclaim19': Tasks.MULTIPLE_CHOICE,
@@ -64,8 +67,6 @@ DATASET_TASK_DICT = {
     # other generation
     'wmt-tr-en-prompt': Tasks.MACHINE_TRANSLATION,
     'gecturk_generation': Tasks.GRAMMATICAL_ERROR_CORRECTION,
-    'turkce_atasozleri': Tasks.MULTIPLE_CHOICE,
-    'turkishmmlu':Tasks.MULTIPLE_CHOICE,
 }
 
 
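Note on the data.py change: the three new entries register turkce_atasozleri, turkishmmlu, and bilmecebench as multiple-choice tasks in DATASET_TASK_DICT (the first two simply move up from the bottom of the dict). As a minimal sketch of how a downstream script could combine that mapping with the Hub dataset, assuming the standard Hugging Face datasets API and a guessed split name (neither is defined by this PR):

```python
# Hedged sketch, not part of the PR: look up the registered task type and pull
# the raw bilmecebench records from the Hugging Face Hub.
from enum import Enum

from datasets import load_dataset  # third-party: huggingface `datasets`


class Tasks(Enum):
    # Assumed shape of the Tasks enum referenced in data.py.
    MULTIPLE_CHOICE = "multiple_choice"


# Mirrors the entries this PR adds to DATASET_TASK_DICT.
DATASET_TASK_DICT = {
    "turkce_atasozleri": Tasks.MULTIPLE_CHOICE,
    "turkishmmlu": Tasks.MULTIPLE_CHOICE,
    "bilmecebench": Tasks.MULTIPLE_CHOICE,
}

if __name__ == "__main__":
    print("bilmecebench task:", DATASET_TASK_DICT["bilmecebench"].value)

    # The split name is a guess; check the dataset card at
    # https://huggingface.co/datasets/selimc/bilmecebench
    riddles = load_dataset("selimc/bilmecebench", split="train")
    print(riddles[0])
```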
data/datasets.json CHANGED
@@ -197,5 +197,13 @@
     "url": "https://huggingface.co/datasets/AYueksel/TurkishMMLU",
     "hf_name": "AYueksel/TurkishMMLU",
     "generative": false
+  },
+  "bilmecebench": {
+    "name": "Riddles",
+    "task": "multiple_choice",
+    "description": "A dataset for Turkish riddles and their answers.",
+    "url": "https://huggingface.co/datasets/selimc/bilmecebench",
+    "hf_name": "selimc/bilmecebench",
+    "generative": false
   }
 }
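The new registry record follows the same schema as the existing entries (name, task, description, url, hf_name, generative). A quick sanity check, sketched under the assumption that data/datasets.json is a flat JSON object keyed by dataset id, as the surrounding diff context suggests:

```python
import json

# Hedged sketch: read the dataset registry and print the newly added entry.
with open("data/datasets.json", encoding="utf-8") as f:
    registry = json.load(f)  # assumed: dict keyed by dataset id

entry = registry["bilmecebench"]
print(entry["name"])        # "Riddles"
print(entry["hf_name"])     # "selimc/bilmecebench"
print(entry["generative"])  # False
```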
results/zero-shot/CerebrumTech__cere-llama-3-8b-tr.json CHANGED
@@ -180,6 +180,12 @@
       "task": "multiple_choice",
       "acc": 0.25555555555555554,
       "acc_norm": 0.25555555555555554
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.3393665158371041,
+      "acc_norm": 0.3393665158371041
     }
   ]
 }
results/zero-shot/Llama-3.3-70B-Instruct.json CHANGED
@@ -182,6 +182,12 @@
       "task": "multiple_choice",
       "acc": 0.646,
       "acc_norm": 0.646
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.7262443438914027,
+      "acc_norm": 0.7262443438914027
     }
   ]
 }
results/zero-shot/Ministral-8B-Instruct.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.2644444444444444,
       "acc_norm": 0.2644444444444444
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.248868778280543,
+      "acc_norm": 0.248868778280543
     }
   ]
 }
results/zero-shot/Mistral-7B-Instruct-v0.3.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.19555555555555557,
       "acc_norm": 0.19555555555555557
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.2149321266968326,
+      "acc_norm": 0.2149321266968326
     }
   ]
 }
results/zero-shot/Mistral-7B-v0.3.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.2688888888888889,
       "acc_norm": 0.2688888888888889
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.23529411764705882,
+      "acc_norm": 0.23529411764705882
     }
   ]
 }
results/zero-shot/Mixtral-8x7B-Instruct-v0.1.json CHANGED
@@ -181,6 +181,12 @@
       "task": "multiple_choice",
       "acc": 0.358,
       "acc_norm": 0.358
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.3416289592760181,
+      "acc_norm": 0.3416289592760181
     }
   ]
 }
results/zero-shot/Qwen2.5-0.5B-Instruct.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.2111111111111111,
       "acc_norm": 0.2111111111111111
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.2420814479638009,
+      "acc_norm": 0.2420814479638009
     }
   ]
 }
results/zero-shot/Qwen2.5-0.5B.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.17888888888888888,
       "acc_norm": 0.17888888888888888
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.251131221719457,
+      "acc_norm": 0.251131221719457
     }
   ]
 }
results/zero-shot/Qwen2.5-1.5B-Instruct.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.28888888888888886,
       "acc_norm": 0.28888888888888886
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.2918552036199095,
+      "acc_norm": 0.2918552036199095
     }
   ]
 }
results/zero-shot/Qwen2.5-1.5B.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.23,
       "acc_norm": 0.23
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.2986425339366516,
+      "acc_norm": 0.2986425339366516
     }
   ]
 }
results/zero-shot/Qwen2.5-14B-Instruct.json CHANGED
@@ -181,6 +181,12 @@
       "task": "multiple_choice",
       "acc": 0.5944444444444444,
       "acc_norm": 0.5944444444444444
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.5701357466063348,
+      "acc_norm": 0.5701357466063348
     }
   ]
 }
results/zero-shot/Qwen2.5-14B.json CHANGED
@@ -181,6 +181,12 @@
       "task": "multiple_choice",
       "acc": 0.5622222222222222,
       "acc_norm": 0.5622222222222222
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.4751131221719457,
+      "acc_norm": 0.4751131221719457
     }
   ]
 }
results/zero-shot/Qwen2.5-3B-Instruct.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.37777777777777777,
       "acc_norm": 0.37777777777777777
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.33031674208144796,
+      "acc_norm": 0.33031674208144796
     }
   ]
 }
results/zero-shot/Qwen2.5-3B.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.22555555555555556,
       "acc_norm": 0.22555555555555556
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.24434389140271492,
+      "acc_norm": 0.24434389140271492
     }
   ]
 }
results/zero-shot/Qwen2.5-7B-Instruct.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.47555555555555556,
       "acc_norm": 0.47555555555555556
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.5203619909502263,
+      "acc_norm": 0.5203619909502263
     }
   ]
 }
results/zero-shot/Qwen2.5-7B.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.49333333333333335,
       "acc_norm": 0.49333333333333335
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.4841628959276018,
+      "acc_norm": 0.4841628959276018
     }
   ]
 }
results/zero-shot/aya-23-35B.json CHANGED
@@ -181,6 +181,12 @@
       "task": "multiple_choice",
       "acc": 0.4533333333333333,
       "acc_norm": 0.4533333333333333
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.34841628959276016,
+      "acc_norm": 0.34841628959276016
     }
   ]
 }
results/zero-shot/aya-23-8b.json CHANGED
@@ -175,6 +175,12 @@
       "task": "multiple_choice",
       "acc": 0.33,
       "acc_norm": 0.33
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.3438914027149321,
+      "acc_norm": 0.3438914027149321
     }
   ]
 }
results/zero-shot/aya-expanse-32b.json CHANGED
@@ -180,6 +180,12 @@
       "task": "multiple_choice",
       "acc": 0.5688888888888889,
       "acc_norm": 0.5688888888888889
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.4117647058823529,
+      "acc_norm": 0.4117647058823529
     }
   ]
 }
results/zero-shot/aya-expanse-8b.json CHANGED
@@ -173,6 +173,12 @@
       "rouge1": 0.011791589892292625,
       "rouge2": 0.005495865074665877,
       "rougeL": 0.009523063183214307
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.48868778280542985,
+      "acc_norm": 0.48868778280542985
     }
   ]
 }
results/zero-shot/aya101.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.374,
       "acc_norm": 0.374
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.47058823529411764,
+      "acc_norm": 0.47058823529411764
     }
   ]
 }
results/zero-shot/commencis-7b.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.24666666666666667,
       "acc_norm": 0.24666666666666667
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.2420814479638009,
+      "acc_norm": 0.2420814479638009
     }
   ]
 }
results/zero-shot/kanarya-2b.json CHANGED
@@ -178,6 +178,12 @@
       "task": "multiple_choice",
       "acc": 0.18,
       "acc_norm": 0.18
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.27149321266968324,
+      "acc_norm": 0.27149321266968324
     }
   ]
 }
results/zero-shot/llama-3-8b-instruct.json CHANGED
@@ -174,7 +174,12 @@
       "task": "multiple_choice",
       "acc": 0.3811111111111111,
       "acc_norm": 0.3811111111111111
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.38461538461538464,
+      "acc_norm": 0.38461538461538464
     }
-
   ]
 }
results/zero-shot/llama-3-8b.json CHANGED
@@ -173,7 +173,12 @@
       "task": "multiple_choice",
       "acc": 0.2544444444444444,
       "acc_norm": 0.2544444444444444
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.29638009049773756,
+      "acc_norm": 0.29638009049773756
     }
-
   ]
 }
results/zero-shot/llama-3.1-8b-instruct.json CHANGED
@@ -173,7 +173,12 @@
       "task": "multiple_choice",
       "acc": 0.3811111111111111,
       "acc_norm": 0.3811111111111111
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.416289592760181,
+      "acc_norm": 0.416289592760181
     }
-
   ]
 }
results/zero-shot/llama-3.1-8b.json CHANGED
@@ -173,6 +173,12 @@
       "task": "multiple_choice",
       "acc": 0.3055555555555556,
       "acc_norm": 0.3055555555555556
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.3212669683257919,
+      "acc_norm": 0.3212669683257919
     }
   ]
 }
results/zero-shot/llama-3.2-1b.json CHANGED
@@ -205,7 +205,12 @@
       "task": "multiple_choice",
       "acc": 0.18888888888888888,
       "acc_norm": 0.18888888888888888
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.2239819004524887,
+      "acc_norm": 0.2239819004524887
     }
-
   ]
 }
results/zero-shot/llama-3.2-3b-instruct.json CHANGED
@@ -205,6 +205,12 @@
       "rouge1": 0.20895844439423625,
       "rouge2": 0.06600479924922747,
       "rougeL": 0.15296503277602666
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.30995475113122173,
+      "acc_norm": 0.30995475113122173
     }
   ]
 }
results/zero-shot/llama-3.2-3b.json CHANGED
@@ -173,7 +173,12 @@
       "task": "multiple_choice",
       "acc": 0.29,
       "acc_norm": 0.29
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.2895927601809955,
+      "acc_norm": 0.2895927601809955
     }
-
   ]
 }
results/zero-shot/mistral-7b.json CHANGED
@@ -179,6 +179,12 @@
       "rouge1": 0.03270500689456488,
       "rouge2": 0.011127489805662754,
       "rougeL": 0.025124899949616367
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.25339366515837103,
+      "acc_norm": 0.25339366515837103
     }
   ]
 }
results/zero-shot/trendyol-7b.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.2477777777777778,
       "acc_norm": 0.2477777777777778
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.23076923076923078,
+      "acc_norm": 0.23076923076923078
     }
   ]
 }
results/zero-shot/turna.json CHANGED
@@ -179,6 +179,12 @@
       "task": "multiple_choice",
       "acc": 0.19333333333333333,
       "acc_norm": 0.19333333333333333
+    },
+    {
+      "name": "bilmecebench",
+      "task": "multiple_choice",
+      "acc": 0.2420814479638009,
+      "acc_norm": 0.2420814479638009
     }
   ]
 }
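Every result file above gains the same kind of record: an object with name, task, acc, and acc_norm appended to the file's results list. For comparing the new zero-shot scores across models, here is a small sketch that assumes each file in results/zero-shot/ carries a top-level "results" array, as the diffs suggest but do not show in full:

```python
import json
from pathlib import Path

# Hedged sketch: collect the zero-shot bilmecebench accuracy reported for each
# model file touched by this PR and print them from best to worst.
scores = {}
for path in sorted(Path("results/zero-shot").glob("*.json")):
    data = json.loads(path.read_text(encoding="utf-8"))
    for record in data.get("results", []):  # top-level key is an assumption
        if record.get("name") == "bilmecebench":
            scores[path.stem] = record["acc"]

for model, acc in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{model}\t{acc:.3f}")
```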