|
Model,Size,Accuracy/std,Precision_Unsafe/std,Recall_Unsafe/std,Precision_Safe/std,Recall_Safe/std |
|
Yi-1.5-34B-Chat,~30B,66.02/0.22,80.13/0.55,42.82/0.25,60.86/0.16,89.33/0.41 |
|
Qwen2.5-32B-Instruct,~30B,64.33/0.46,62.46/0.44,72.24/0.71,66.91/0.53,56.38/0.18 |
|
Opt-30B,~30B,53.82/0.03,54.42/0.21,48.32/0.20,53.34/0.11,59.34/0.27 |
|
QwQ-32B-Preview,~30B,51.82/0.06,51.04/0.10,94.83/0.28,62.38/0.26,8.61/0.39 |
|
Gemma-3-27B-it,~30B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00 |
|
Qwen3-32B,~30B,49.66/0.00,49.83/0.00,99.03/0.00,22.40/0.00,0.28/0.00 |
|
OpenThinker2-32B,~30B,49.91/0.00,49.95/0.00,98.26/0.00,47.27/0.00,1.56/0.00 |
|
DeepSeek-LLM-67B-Chat,>65B,68.08/0.35,94.80/0.83,38.40/0.43,61.27/0.26,97.88/0.36 |
|
Qwen1.5-72B-Chat,>65B,63.67/0.46,58.27/0.32,96.84/0.13,90.51/0.57,30.34/0.80 |
|
Qwen2.5-72B-Instruct,>65B,63.27/0.52,66.00/0.60,55.09/0.82,61.31/0.46,71.49/0.25 |
|
Qwen2-72B-Instruct,>65B,60.70/0.49,57.90/0.42,79.03/0.63,66.75/0.77,42.28/0.43 |
|
Opt-66B,>65B,59.93/0.41,56.52/0.37,86.87/0.59,71.36/0.78,32.86/0.74 |
|
DeepSeek-R1-Distill-Llama-70B,>65B,47.68/0.64,45.77/1.21,23.85/0.67,48.35/0.46,71.62/0.60 |
|
Llama-3.1-70B-Instruct,>65B,43.68/0.41,36.45/0.84,16.66/0.34,45.83/0.30,70.82/0.48 |
|
Llama3-ChatQA-1.5-70B,>65B,40.41/0.29,33.86/0.75,19.84/0.75,43.13/0.25,61.08/0.37 |
|
Llama-3.3-70B-Instruct,>65B,36.84/0.82,32.02/1.29,23.19/1.13,39.58/0.63,50.55/0.69 |
|
Phi-3-medium-4k-instruct,10B~20B,71.04/0.31,69.74/0.29,74.56/0.97,72.54/0.59,67.49/0.89 |
|
Baichuan2-13B-Chat,10B~20B,70.43/0.39,65.81/0.38,85.34/0.63,79.02/0.63,55.46/0.47 |
|
Phi-3-medium-128k-instruct,10B~20B,68.87/0.81,68.08/0.51,71.32/1.44,69.75/1.17,66.41/0.57 |
|
Mistral-Nemo-Instruct-2407,10B~20B,66.88/0.46,62.56/0.28,84.42/0.90,75.89/1.13,49.26/0.24 |
|
phi-4,10B~20B,62.62/0.32,63.73/0.41,58.98/0.20,61.66/0.31,66.28/0.78 |
|
Qwen1.5-14B-Chat,10B~20B,61.29/0.40,57.02/0.32,92.43/0.55,79.80/1.05,30.02/0.47 |
|
Mistral-Small-24B-Instruct-2501,10B~20B,59.20/0.46,58.32/0.42,65.16/1.08,60.33/0.56,53.22/0.20 |
|
Ziya2-13B-Chat,10B~20B,55.25/0.26,59.24/0.37,34.30/0.11,53.61/0.26,76.29/0.39 |
|
InternLM2-Chat-20B,10B~20B,53.67/0.16,79.00/0.66,10.30/0.60,51.90/0.11,97.25/0.26 |
|
Opt-13B,10B~20B,49.31/0.31,37.77/3.57,1.76/0.16,49.59/0.23,97.08/0.29 |
|
Moonlight-16B-A3B-Instruct,10B~20B,48.92/0.16,3.46/0.57,0.07/0.01,49.40/0.15,98.00/0.08 |
|
Qwen3-14B,10B~20B,48.34/0.00,49.14/0.00,95.13/0.00,24.26/0.00,1.56/0.00 |
|
Gemma-3-12B-it,10B~20B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00 |
|
Gemma-1.1-7B-it,5B~10B,64.32/0.68,59.98/0.58,86.60/0.35,75.70/0.80,41.95/0.93 |
|
Qwen1.5-7B-Chat,5B~10B,62.48/0.54,59.06/0.48,81.92/0.50,70.28/0.65,42.96/0.81 |
|
Phi-3-small-128k-instruct,5B~10B,61.76/0.27,60.47/0.16,68.45/0.61,63.46/0.50,55.05/0.61 |
|
Yi-1.5-9B-Chat,5B~10B,60.35/0.52,79.47/1.37,28.16/0.33,56.22/0.39,92.69/0.59 |
|
Phi-3-small-8k-instruct,5B~10B,59.47/0.39,56.25/0.30,86.06/0.40,70.05/0.85,32.75/0.49 |
|
DeepSeek-LLM-7B-Chat,5B~10B,56.79/0.19,84.83/1.23,16.77/0.09,53.70/0.15,96.99/0.27 |
|
Ministral-8B-Instruct-2410,5B~10B,56.28/0.51,55.10/0.51,68.83/0.58,58.24/0.51,43.66/0.54 |
|
GPT-J-6B,5B~10B,55.98/0.42,80.27/1.42,16.11/0.86,53.26/0.23,96.03/0.20 |
|
Baichuan2-7B-Chat,5B~10B,53.99/0.51,62.89/1.57,19.96/0.88,52.31/0.30,88.18/0.23 |
|
GLM-4-9B-Chat,5B~10B,50.03/0.15,50.07/0.13,99.31/0.22,44.12/9.01,0.52/0.04 |
|
InternLM2-Chat-7B,5B~10B,49.49/0.11,42.16/1.58,2.15/0.31,49.68/0.13,97.06/0.25 |
|
Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55 |
|
Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50 |
|
Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57 |
|
Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00 |
|
Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00 |