Spaces:
Running
Running
Add selimc/bilmecebench dataset and model results
#6
by
abrek
- opened
- data.py +3 -2
- data/datasets.json +8 -0
- results/zero-shot/CerebrumTech__cere-llama-3-8b-tr.json +6 -0
- results/zero-shot/Llama-3.3-70B-Instruct.json +6 -0
- results/zero-shot/Ministral-8B-Instruct.json +6 -0
- results/zero-shot/Mistral-7B-Instruct-v0.3.json +6 -0
- results/zero-shot/Mistral-7B-v0.3.json +6 -0
- results/zero-shot/Mixtral-8x7B-Instruct-v0.1.json +6 -0
- results/zero-shot/Qwen2.5-0.5B-Instruct.json +6 -0
- results/zero-shot/Qwen2.5-0.5B.json +6 -0
- results/zero-shot/Qwen2.5-1.5B-Instruct.json +6 -0
- results/zero-shot/Qwen2.5-1.5B.json +6 -0
- results/zero-shot/Qwen2.5-14B-Instruct.json +6 -0
- results/zero-shot/Qwen2.5-14B.json +6 -0
- results/zero-shot/Qwen2.5-3B-Instruct.json +6 -0
- results/zero-shot/Qwen2.5-3B.json +6 -0
- results/zero-shot/Qwen2.5-7B-Instruct.json +6 -0
- results/zero-shot/Qwen2.5-7B.json +6 -0
- results/zero-shot/aya-23-35B.json +6 -0
- results/zero-shot/aya-23-8b.json +6 -0
- results/zero-shot/aya-expanse-32b.json +6 -0
- results/zero-shot/aya-expanse-8b.json +6 -0
- results/zero-shot/aya101.json +6 -0
- results/zero-shot/commencis-7b.json +6 -0
- results/zero-shot/kanarya-2b.json +6 -0
- results/zero-shot/llama-3-8b-instruct.json +6 -1
- results/zero-shot/llama-3-8b.json +6 -1
- results/zero-shot/llama-3.1-8b-instruct.json +6 -1
- results/zero-shot/llama-3.1-8b.json +6 -0
- results/zero-shot/llama-3.2-1b.json +6 -1
- results/zero-shot/llama-3.2-3b-instruct.json +6 -0
- results/zero-shot/llama-3.2-3b.json +6 -1
- results/zero-shot/mistral-7b.json +6 -0
- results/zero-shot/trendyol-7b.json +6 -0
- results/zero-shot/turna.json +6 -0
data.py
CHANGED
@@ -49,6 +49,9 @@ DATASET_TASK_DICT = {
|
|
49 |
'turkish_plu_next_event_prediction': Tasks.MULTIPLE_CHOICE,
|
50 |
'turkish_plu_step_inference': Tasks.MULTIPLE_CHOICE,
|
51 |
'turkish_plu_step_ordering': Tasks.MULTIPLE_CHOICE,
|
|
|
|
|
|
|
52 |
|
53 |
# fact-checking, not sure whether these are multi-choice
|
54 |
# 'trclaim19': Tasks.MULTIPLE_CHOICE,
|
@@ -64,8 +67,6 @@ DATASET_TASK_DICT = {
|
|
64 |
# other generation
|
65 |
'wmt-tr-en-prompt': Tasks.MACHINE_TRANSLATION,
|
66 |
'gecturk_generation': Tasks.GRAMMATICAL_ERROR_CORRECTION,
|
67 |
-
'turkce_atasozleri': Tasks.MULTIPLE_CHOICE,
|
68 |
-
'turkishmmlu':Tasks.MULTIPLE_CHOICE,
|
69 |
}
|
70 |
|
71 |
|
|
|
49 |
'turkish_plu_next_event_prediction': Tasks.MULTIPLE_CHOICE,
|
50 |
'turkish_plu_step_inference': Tasks.MULTIPLE_CHOICE,
|
51 |
'turkish_plu_step_ordering': Tasks.MULTIPLE_CHOICE,
|
52 |
+
'turkce_atasozleri': Tasks.MULTIPLE_CHOICE,
|
53 |
+
'turkishmmlu':Tasks.MULTIPLE_CHOICE,
|
54 |
+
'bilmecebench':Tasks.MULTIPLE_CHOICE,
|
55 |
|
56 |
# fact-checking, not sure whether these are multi-choice
|
57 |
# 'trclaim19': Tasks.MULTIPLE_CHOICE,
|
|
|
67 |
# other generation
|
68 |
'wmt-tr-en-prompt': Tasks.MACHINE_TRANSLATION,
|
69 |
'gecturk_generation': Tasks.GRAMMATICAL_ERROR_CORRECTION,
|
|
|
|
|
70 |
}
|
71 |
|
72 |
|
data/datasets.json
CHANGED
@@ -197,5 +197,13 @@
|
|
197 |
"url": "https://huggingface.co/datasets/AYueksel/TurkishMMLU",
|
198 |
"hf_name": "AYueksel/TurkishMMLU",
|
199 |
"generative": false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
}
|
201 |
}
|
|
|
197 |
"url": "https://huggingface.co/datasets/AYueksel/TurkishMMLU",
|
198 |
"hf_name": "AYueksel/TurkishMMLU",
|
199 |
"generative": false
|
200 |
+
},
|
201 |
+
"bilmecebench": {
|
202 |
+
"name": "Riddles",
|
203 |
+
"task": "multiple_choice",
|
204 |
+
"description": "A dataset for Turkish riddles and their answers.",
|
205 |
+
"url": "https://huggingface.co/datasets/selimc/bilmecebench",
|
206 |
+
"hf_name": "selimc/bilmecebench",
|
207 |
+
"generative": false
|
208 |
}
|
209 |
}
|
results/zero-shot/CerebrumTech__cere-llama-3-8b-tr.json
CHANGED
@@ -180,6 +180,12 @@
|
|
180 |
"task": "multiple_choice",
|
181 |
"acc": 0.25555555555555554,
|
182 |
"acc_norm": 0.25555555555555554
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
}
|
184 |
]
|
185 |
}
|
|
|
180 |
"task": "multiple_choice",
|
181 |
"acc": 0.25555555555555554,
|
182 |
"acc_norm": 0.25555555555555554
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"name": "bilmecebench",
|
186 |
+
"task": "multiple_choice",
|
187 |
+
"acc": 0.3393665158371041,
|
188 |
+
"acc_norm": 0.3393665158371041
|
189 |
}
|
190 |
]
|
191 |
}
|
results/zero-shot/Llama-3.3-70B-Instruct.json
CHANGED
@@ -182,6 +182,12 @@
|
|
182 |
"task": "multiple_choice",
|
183 |
"acc": 0.646,
|
184 |
"acc_norm": 0.646
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
}
|
186 |
]
|
187 |
}
|
|
|
182 |
"task": "multiple_choice",
|
183 |
"acc": 0.646,
|
184 |
"acc_norm": 0.646
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"name": "bilmecebench",
|
188 |
+
"task": "multiple_choice",
|
189 |
+
"acc": 0.7262443438914027,
|
190 |
+
"acc_norm": 0.7262443438914027
|
191 |
}
|
192 |
]
|
193 |
}
|
results/zero-shot/Ministral-8B-Instruct.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.2644444444444444,
|
181 |
"acc_norm": 0.2644444444444444
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.2644444444444444,
|
181 |
"acc_norm": 0.2644444444444444
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.248868778280543,
|
187 |
+
"acc_norm": 0.248868778280543
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Mistral-7B-Instruct-v0.3.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.19555555555555557,
|
181 |
"acc_norm": 0.19555555555555557
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.19555555555555557,
|
181 |
"acc_norm": 0.19555555555555557
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.2149321266968326,
|
187 |
+
"acc_norm": 0.2149321266968326
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Mistral-7B-v0.3.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.2688888888888889,
|
181 |
"acc_norm": 0.2688888888888889
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.2688888888888889,
|
181 |
"acc_norm": 0.2688888888888889
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.23529411764705882,
|
187 |
+
"acc_norm": 0.23529411764705882
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Mixtral-8x7B-Instruct-v0.1.json
CHANGED
@@ -181,6 +181,12 @@
|
|
181 |
"task": "multiple_choice",
|
182 |
"acc": 0.358,
|
183 |
"acc_norm": 0.358
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
}
|
185 |
]
|
186 |
}
|
|
|
181 |
"task": "multiple_choice",
|
182 |
"acc": 0.358,
|
183 |
"acc_norm": 0.358
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"name": "bilmecebench",
|
187 |
+
"task": "multiple_choice",
|
188 |
+
"acc": 0.3416289592760181,
|
189 |
+
"acc_norm": 0.3416289592760181
|
190 |
}
|
191 |
]
|
192 |
}
|
results/zero-shot/Qwen2.5-0.5B-Instruct.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.2111111111111111,
|
181 |
"acc_norm": 0.2111111111111111
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.2111111111111111,
|
181 |
"acc_norm": 0.2111111111111111
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.2420814479638009,
|
187 |
+
"acc_norm": 0.2420814479638009
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Qwen2.5-0.5B.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.17888888888888888,
|
181 |
"acc_norm": 0.17888888888888888
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.17888888888888888,
|
181 |
"acc_norm": 0.17888888888888888
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.251131221719457,
|
187 |
+
"acc_norm": 0.251131221719457
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Qwen2.5-1.5B-Instruct.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.28888888888888886,
|
181 |
"acc_norm": 0.28888888888888886
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.28888888888888886,
|
181 |
"acc_norm": 0.28888888888888886
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.2918552036199095,
|
187 |
+
"acc_norm": 0.2918552036199095
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Qwen2.5-1.5B.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.23,
|
181 |
"acc_norm": 0.23
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.23,
|
181 |
"acc_norm": 0.23
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.2986425339366516,
|
187 |
+
"acc_norm": 0.2986425339366516
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Qwen2.5-14B-Instruct.json
CHANGED
@@ -181,6 +181,12 @@
|
|
181 |
"task": "multiple_choice",
|
182 |
"acc": 0.5944444444444444,
|
183 |
"acc_norm": 0.5944444444444444
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
}
|
185 |
]
|
186 |
}
|
|
|
181 |
"task": "multiple_choice",
|
182 |
"acc": 0.5944444444444444,
|
183 |
"acc_norm": 0.5944444444444444
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"name": "bilmecebench",
|
187 |
+
"task": "multiple_choice",
|
188 |
+
"acc": 0.5701357466063348,
|
189 |
+
"acc_norm": 0.5701357466063348
|
190 |
}
|
191 |
]
|
192 |
}
|
results/zero-shot/Qwen2.5-14B.json
CHANGED
@@ -181,6 +181,12 @@
|
|
181 |
"task": "multiple_choice",
|
182 |
"acc": 0.5622222222222222,
|
183 |
"acc_norm": 0.5622222222222222
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
}
|
185 |
]
|
186 |
}
|
|
|
181 |
"task": "multiple_choice",
|
182 |
"acc": 0.5622222222222222,
|
183 |
"acc_norm": 0.5622222222222222
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"name": "bilmecebench",
|
187 |
+
"task": "multiple_choice",
|
188 |
+
"acc": 0.4751131221719457,
|
189 |
+
"acc_norm": 0.4751131221719457
|
190 |
}
|
191 |
]
|
192 |
}
|
results/zero-shot/Qwen2.5-3B-Instruct.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.37777777777777777,
|
181 |
"acc_norm": 0.37777777777777777
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.37777777777777777,
|
181 |
"acc_norm": 0.37777777777777777
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.33031674208144796,
|
187 |
+
"acc_norm": 0.33031674208144796
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Qwen2.5-3B.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.22555555555555556,
|
181 |
"acc_norm": 0.22555555555555556
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.22555555555555556,
|
181 |
"acc_norm": 0.22555555555555556
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.24434389140271492,
|
187 |
+
"acc_norm": 0.24434389140271492
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Qwen2.5-7B-Instruct.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.47555555555555556,
|
181 |
"acc_norm": 0.47555555555555556
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.47555555555555556,
|
181 |
"acc_norm": 0.47555555555555556
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.5203619909502263,
|
187 |
+
"acc_norm": 0.5203619909502263
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/Qwen2.5-7B.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.49333333333333335,
|
181 |
"acc_norm": 0.49333333333333335
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.49333333333333335,
|
181 |
"acc_norm": 0.49333333333333335
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.4841628959276018,
|
187 |
+
"acc_norm": 0.4841628959276018
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/aya-23-35B.json
CHANGED
@@ -181,6 +181,12 @@
|
|
181 |
"task": "multiple_choice",
|
182 |
"acc": 0.4533333333333333,
|
183 |
"acc_norm": 0.4533333333333333
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
}
|
185 |
]
|
186 |
}
|
|
|
181 |
"task": "multiple_choice",
|
182 |
"acc": 0.4533333333333333,
|
183 |
"acc_norm": 0.4533333333333333
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"name": "bilmecebench",
|
187 |
+
"task": "multiple_choice",
|
188 |
+
"acc": 0.34841628959276016,
|
189 |
+
"acc_norm": 0.34841628959276016
|
190 |
}
|
191 |
]
|
192 |
}
|
results/zero-shot/aya-23-8b.json
CHANGED
@@ -175,6 +175,12 @@
|
|
175 |
"task": "multiple_choice",
|
176 |
"acc": 0.33,
|
177 |
"acc_norm": 0.33
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
}
|
179 |
]
|
180 |
}
|
|
|
175 |
"task": "multiple_choice",
|
176 |
"acc": 0.33,
|
177 |
"acc_norm": 0.33
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"name": "bilmecebench",
|
181 |
+
"task": "multiple_choice",
|
182 |
+
"acc": 0.3438914027149321,
|
183 |
+
"acc_norm": 0.3438914027149321
|
184 |
}
|
185 |
]
|
186 |
}
|
results/zero-shot/aya-expanse-32b.json
CHANGED
@@ -180,6 +180,12 @@
|
|
180 |
"task": "multiple_choice",
|
181 |
"acc": 0.5688888888888889,
|
182 |
"acc_norm": 0.5688888888888889
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
}
|
184 |
]
|
185 |
}
|
|
|
180 |
"task": "multiple_choice",
|
181 |
"acc": 0.5688888888888889,
|
182 |
"acc_norm": 0.5688888888888889
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"name": "bilmecebench",
|
186 |
+
"task": "multiple_choice",
|
187 |
+
"acc": 0.4117647058823529,
|
188 |
+
"acc_norm": 0.4117647058823529
|
189 |
}
|
190 |
]
|
191 |
}
|
results/zero-shot/aya-expanse-8b.json
CHANGED
@@ -173,6 +173,12 @@
|
|
173 |
"rouge1": 0.011791589892292625,
|
174 |
"rouge2": 0.005495865074665877,
|
175 |
"rougeL": 0.009523063183214307
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
}
|
177 |
]
|
178 |
}
|
|
|
173 |
"rouge1": 0.011791589892292625,
|
174 |
"rouge2": 0.005495865074665877,
|
175 |
"rougeL": 0.009523063183214307
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"name": "bilmecebench",
|
179 |
+
"task": "multiple_choice",
|
180 |
+
"acc": 0.48868778280542985,
|
181 |
+
"acc_norm": 0.48868778280542985
|
182 |
}
|
183 |
]
|
184 |
}
|
results/zero-shot/aya101.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.374,
|
181 |
"acc_norm": 0.374
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.374,
|
181 |
"acc_norm": 0.374
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.47058823529411764,
|
187 |
+
"acc_norm": 0.47058823529411764
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/commencis-7b.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.24666666666666667,
|
181 |
"acc_norm": 0.24666666666666667
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.24666666666666667,
|
181 |
"acc_norm": 0.24666666666666667
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.2420814479638009,
|
187 |
+
"acc_norm": 0.2420814479638009
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/kanarya-2b.json
CHANGED
@@ -178,6 +178,12 @@
|
|
178 |
"task": "multiple_choice",
|
179 |
"acc": 0.18,
|
180 |
"acc_norm": 0.18
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
}
|
182 |
]
|
183 |
}
|
|
|
178 |
"task": "multiple_choice",
|
179 |
"acc": 0.18,
|
180 |
"acc_norm": 0.18
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"name": "bilmecebench",
|
184 |
+
"task": "multiple_choice",
|
185 |
+
"acc": 0.27149321266968324,
|
186 |
+
"acc_norm": 0.27149321266968324
|
187 |
}
|
188 |
]
|
189 |
}
|
results/zero-shot/llama-3-8b-instruct.json
CHANGED
@@ -174,7 +174,12 @@
|
|
174 |
"task": "multiple_choice",
|
175 |
"acc": 0.3811111111111111,
|
176 |
"acc_norm": 0.3811111111111111
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
}
|
178 |
-
|
179 |
]
|
180 |
}
|
|
|
174 |
"task": "multiple_choice",
|
175 |
"acc": 0.3811111111111111,
|
176 |
"acc_norm": 0.3811111111111111
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"name": "bilmecebench",
|
180 |
+
"task": "multiple_choice",
|
181 |
+
"acc": 0.38461538461538464,
|
182 |
+
"acc_norm": 0.38461538461538464
|
183 |
}
|
|
|
184 |
]
|
185 |
}
|
results/zero-shot/llama-3-8b.json
CHANGED
@@ -173,7 +173,12 @@
|
|
173 |
"task": "multiple_choice",
|
174 |
"acc": 0.2544444444444444,
|
175 |
"acc_norm": 0.2544444444444444
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
}
|
177 |
-
|
178 |
]
|
179 |
}
|
|
|
173 |
"task": "multiple_choice",
|
174 |
"acc": 0.2544444444444444,
|
175 |
"acc_norm": 0.2544444444444444
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"name": "bilmecebench",
|
179 |
+
"task": "multiple_choice",
|
180 |
+
"acc": 0.29638009049773756,
|
181 |
+
"acc_norm": 0.29638009049773756
|
182 |
}
|
|
|
183 |
]
|
184 |
}
|
results/zero-shot/llama-3.1-8b-instruct.json
CHANGED
@@ -173,7 +173,12 @@
|
|
173 |
"task": "multiple_choice",
|
174 |
"acc": 0.3811111111111111,
|
175 |
"acc_norm": 0.3811111111111111
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
}
|
177 |
-
|
178 |
]
|
179 |
}
|
|
|
173 |
"task": "multiple_choice",
|
174 |
"acc": 0.3811111111111111,
|
175 |
"acc_norm": 0.3811111111111111
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"name": "bilmecebench",
|
179 |
+
"task": "multiple_choice",
|
180 |
+
"acc": 0.416289592760181,
|
181 |
+
"acc_norm": 0.416289592760181
|
182 |
}
|
|
|
183 |
]
|
184 |
}
|
results/zero-shot/llama-3.1-8b.json
CHANGED
@@ -173,6 +173,12 @@
|
|
173 |
"task": "multiple_choice",
|
174 |
"acc": 0.3055555555555556,
|
175 |
"acc_norm": 0.3055555555555556
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
}
|
177 |
]
|
178 |
}
|
|
|
173 |
"task": "multiple_choice",
|
174 |
"acc": 0.3055555555555556,
|
175 |
"acc_norm": 0.3055555555555556
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"name": "bilmecebench",
|
179 |
+
"task": "multiple_choice",
|
180 |
+
"acc": 0.3212669683257919,
|
181 |
+
"acc_norm": 0.3212669683257919
|
182 |
}
|
183 |
]
|
184 |
}
|
results/zero-shot/llama-3.2-1b.json
CHANGED
@@ -205,7 +205,12 @@
|
|
205 |
"task": "multiple_choice",
|
206 |
"acc": 0.18888888888888888,
|
207 |
"acc_norm": 0.18888888888888888
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
}
|
209 |
-
|
210 |
]
|
211 |
}
|
|
|
205 |
"task": "multiple_choice",
|
206 |
"acc": 0.18888888888888888,
|
207 |
"acc_norm": 0.18888888888888888
|
208 |
+
},
|
209 |
+
{
|
210 |
+
"name": "bilmecebench",
|
211 |
+
"task": "multiple_choice",
|
212 |
+
"acc": 0.2239819004524887,
|
213 |
+
"acc_norm": 0.2239819004524887
|
214 |
}
|
|
|
215 |
]
|
216 |
}
|
results/zero-shot/llama-3.2-3b-instruct.json
CHANGED
@@ -205,6 +205,12 @@
|
|
205 |
"rouge1": 0.20895844439423625,
|
206 |
"rouge2": 0.06600479924922747,
|
207 |
"rougeL": 0.15296503277602666
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
}
|
209 |
]
|
210 |
}
|
|
|
205 |
"rouge1": 0.20895844439423625,
|
206 |
"rouge2": 0.06600479924922747,
|
207 |
"rougeL": 0.15296503277602666
|
208 |
+
},
|
209 |
+
{
|
210 |
+
"name": "bilmecebench",
|
211 |
+
"task": "multiple_choice",
|
212 |
+
"acc": 0.30995475113122173,
|
213 |
+
"acc_norm": 0.30995475113122173
|
214 |
}
|
215 |
]
|
216 |
}
|
results/zero-shot/llama-3.2-3b.json
CHANGED
@@ -173,7 +173,12 @@
|
|
173 |
"task": "multiple_choice",
|
174 |
"acc": 0.29,
|
175 |
"acc_norm": 0.29
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
}
|
177 |
-
|
178 |
]
|
179 |
}
|
|
|
173 |
"task": "multiple_choice",
|
174 |
"acc": 0.29,
|
175 |
"acc_norm": 0.29
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"name": "bilmecebench",
|
179 |
+
"task": "multiple_choice",
|
180 |
+
"acc": 0.2895927601809955,
|
181 |
+
"acc_norm": 0.2895927601809955
|
182 |
}
|
|
|
183 |
]
|
184 |
}
|
results/zero-shot/mistral-7b.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"rouge1": 0.03270500689456488,
|
180 |
"rouge2": 0.011127489805662754,
|
181 |
"rougeL": 0.025124899949616367
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"rouge1": 0.03270500689456488,
|
180 |
"rouge2": 0.011127489805662754,
|
181 |
"rougeL": 0.025124899949616367
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.25339366515837103,
|
187 |
+
"acc_norm": 0.25339366515837103
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/trendyol-7b.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.2477777777777778,
|
181 |
"acc_norm": 0.2477777777777778
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.2477777777777778,
|
181 |
"acc_norm": 0.2477777777777778
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.23076923076923078,
|
187 |
+
"acc_norm": 0.23076923076923078
|
188 |
}
|
189 |
]
|
190 |
}
|
results/zero-shot/turna.json
CHANGED
@@ -179,6 +179,12 @@
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.19333333333333333,
|
181 |
"acc_norm": 0.19333333333333333
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
}
|
183 |
]
|
184 |
}
|
|
|
179 |
"task": "multiple_choice",
|
180 |
"acc": 0.19333333333333333,
|
181 |
"acc_norm": 0.19333333333333333
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "bilmecebench",
|
185 |
+
"task": "multiple_choice",
|
186 |
+
"acc": 0.2420814479638009,
|
187 |
+
"acc_norm": 0.2420814479638009
|
188 |
}
|
189 |
]
|
190 |
}
|