File size: 6,857 Bytes
8a254d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"Model Name"	"Link"	"Comment"	"Group"	"Med. Len."	"Med. Resp. Len."	"Parameter Size (B)"	"Type"	"Model Type"	"Think"	"Overall"	"Content Generation"	"Editing"	"Data Analysis"	"Reasoning"	"Hallucination"	"Safety"	"Repetition"	"Summarization"	"Translation"	"Multi-Turn"
"GPT-5 (Reasoning: medium)"	"https://platform.openai.com/docs/models/gpt-5"	"Reasoning: medium"	"GPT"	""	""	""	"Proprietary"	"Think"	"On"	"70.73"	"71.0"	"74.38"	"76.49"	"79.75"	"64.94"	"56.2"	"82.86"	"80.16"	"69.38"	"54.36"
"o3-pro (Reasoning: medium)"	"https://platform.openai.com/docs/models/o3-pro"	"Reasoning: medium"	"GPT"	""	""	""	"Proprietary"	"Think"	"On"	"66.47"	"72.5"	"70.31"	"75.7"	"83.88"	"64.37"	"33.88"	"74.29"	"65.48"	"64.33"	"48.32"
"Claude 4 Opus (20250514) (think)"	"https://www.anthropic.com/claude/opus"	"version: 20250514"	"Claude"	""	""	""	"Proprietary"	"Hybrid"	"On"	"63.29"	"60.75"	"59.69"	"73.31"	"69.83"	"78.74"	"53.72"	"55.71"	"65.48"	"65.45"	"48.99"
"Claude 4.1 Opus (20250805) (think)"	"https://www.anthropic.com/claude/opus"	"version: 20250805"	"Claude"	""	""	""	"Proprietary"	"Hybrid"	"On"	"63.24"	"61.25"	"60.0"	"78.49"	"72.73"	"77.01"	"56.2"	"57.14"	"61.9"	"62.64"	"46.98"
"GPT-5 mini (Reasoning: medium)"	"https://platform.openai.com/docs/models/gpt-5-mini"	"Reasoning: medium"	"GPT"	""	""	""	"Proprietary"	"Think"	"On"	"62.56"	"68.0"	"62.5"	"74.9"	"76.86"	"55.17"	"47.93"	"44.29"	"74.6"	"56.18"	"45.3"
"Claude 4 Sonnet (20250514) (think)"	"https://www.anthropic.com/claude/sonnet"	"version: 20250514"	"Claude"	""	""	""	"Proprietary"	"Hybrid"	"On"	"61.8"	"58.0"	"58.44"	"76.49"	"67.77"	"79.31"	"57.02"	"44.29"	"65.08"	"62.92"	"44.97"
"o3"	"https://platform.openai.com/docs/models/o3"	""	"GPT"	""	""	""	"Proprietary"	"Think"	"On"	"60.91"	"68.75"	"60.0"	"73.31"	"79.34"	"54.02"	"34.71"	"64.29"	"60.71"	"55.06"	"46.98"
"Gemini 2.5 Pro"	"https://deepmind.google/models/gemini/pro/"	""	"Gemini"	""	""	""	"Proprietary"	"Think"	"On"	"59.34"	"54.0"	"60.94"	"78.88"	"73.14"	"63.22"	"17.36"	"52.86"	"67.86"	"53.93"	"52.68"
"Grok-4"	"https://x.ai/news/grok-4"	"temperature: 0.6
top-p: 0.95"	"Grok"	""	""	""	"Proprietary"	"Think"	"On"	"58.74"	"61.0"	"66.25"	"72.51"	"63.22"	"66.09"	"16.53"	"58.57"	"66.27"	"54.21"	"44.3"
"Gemini 2.5 Flash"	"https://deepmind.google/models/gemini/flash/"	""	"Gemini"	""	""	""	"Proprietary"	"Hybrid"	"On"	"58.62"	"57.25"	"62.19"	"70.52"	"72.31"	"56.9"	"28.93"	"47.14"	"68.65"	"55.06"	"46.98"
"o4-mini"	"https://platform.openai.com/docs/models/o4-mini"	""	"GPT"	""	""	""	"Proprietary"	"Think"	"On"	"57.57"	"67.25"	"61.25"	"71.71"	"75.62"	"45.4"	"39.67"	"44.29"	"59.92"	"47.19"	"41.95"
"Qwen3 235B A22B Thinking 2507"	"https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507"	"temperature: 0.6
top-p: 0.95"	"Qwen"	"2404.5"	"423.0"	"235.0"	"Open"	"Think"	"On"	"55.48"	"57.5"	"53.12"	"73.31"	"75.21"	"55.17"	"25.62"	"35.71"	"55.56"	"56.18"	"40.27"
"GPT-5 nano (Reasoning: medium)"	"https://platform.openai.com/docs/models/gpt-5-nano"	"Reasoning: medium"	"GPT"	""	""	""	"Proprietary"	"Think"	"On"	"55.39"	"63.5"	"47.19"	"68.92"	"75.21"	"55.17"	"52.07"	"34.29"	"63.49"	"40.73"	"42.95"
"GLM-4.5 FP8 (think)"	"https://huggingface.co/zai-org/GLM-4.5-FP8"	"temperature: 0.6
top-p: 0.95"	"GLM"	"1442.0"	"604.0"	"355.0"	"Open"	"Hybrid"	"On"	"54.03"	"60.75"	"53.75"	"68.92"	"74.38"	"47.13"	"33.06"	"41.43"	"60.32"	"46.07"	"35.91"
"Qwen3 235B A22B Instruct 2507"	"https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507"	"temperature: 0.7
top-p: 0.8"	"Qwen"	"433.0"	"433.0"	"235.0"	"Open"	"Instruct"	"Off"	"52.94"	"58.0"	"49.69"	"68.13"	"73.97"	"55.17"	"45.45"	"30.0"	"55.95"	"38.48"	"41.61"
"DeepSeek V3.1 (think)"	"https://huggingface.co/deepseek-ai/DeepSeek-V3.1"	"temperature: 0.6
top-p: 0.95"	"DeepSeek"	"710.5"	"356.0"	"671.0"	"Open"	"Hybrid"	"On"	"51.45"	"52.0"	"50.0"	"67.33"	"69.83"	"50.0"	"33.88"	"35.71"	"59.52"	"41.85"	"40.27"
"gpt-oss-120B (Reasoning: medium)"	"https://huggingface.co/openai/gpt-oss-120b"	"Reasoning: medium
temperature: 1.0
top-p: 1.0"	"GPT"	"759.5"	"370.5"	"117.0"	"Open"	"Think"	"On"	"49.11"	"58.5"	"48.44"	"68.92"	"69.83"	"41.38"	"39.67"	"25.71"	"50.79"	"35.67"	"32.21"
"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)"	"https://huggingface.co/deepseek-ai/DeepSeek-R1-0528"	"version: 0528
temperature: 0.6
top-p: 0.95"	"DeepSeek"	"1177.5"	"554.0"	"671.0"	"Open"	"Think"	"On"	"48.79"	"49.75"	"50.0"	"65.34"	"59.09"	"48.85"	"38.02"	"32.86"	"57.94"	"36.52"	"38.93"
"Gauss2.3 Hybrid"	""	""	"Gauss"	"546.0"	"308.0"	""	"Proprietary"	"Hybrid"	"On"	"46.58"	"52.0"	"46.25"	"59.76"	"66.94"	"41.95"	"34.71"	"25.71"	"53.17"	"34.55"	"33.22"
"DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)"	"https://huggingface.co/deepseek-ai/DeepSeek-V3-0324"	"version: 0324
temperature: 1.3
top-p: 0.95"	"DeepSeek"	"408.0"	"408.0"	"671.0"	"Open"	"Instruct"	"Off"	"45.09"	"46.25"	"45.0"	"58.96"	"60.33"	"41.95"	"21.49"	"30.0"	"55.95"	"38.48"	"33.22"
"Qwen3 32B (think)"	"https://huggingface.co/Qwen/Qwen3-32B"	"temperature: 0.6
top-p: 0.95"	"Qwen"	"1113.0"	"390.0"	"32.8"	"Open"	"Hybrid"	"On"	"44.44"	"52.25"	"41.56"	"68.92"	"66.53"	"35.06"	"19.83"	"25.71"	"46.43"	"30.9"	"32.89"
"A.X 4.0"	"https://huggingface.co/skt/A.X-4.0"	""	"SKT"	"412.5"	"412.5"	"71.9"	"Open"	"Instruct"	"Off"	"41.59"	"56.0"	"43.75"	"43.43"	"42.56"	"40.23"	"15.7"	"24.29"	"53.97"	"33.43"	"32.21"
"gpt-oss-20B (Reasoning: medium)"	"https://huggingface.co/openai/gpt-oss-20b"	"Reasoning: medium
temperature: 1.0
top-p: 1.0"	"GPT"	"953.5"	"326.0"	"21.0"	"Open"	"Think"	"On"	"41.18"	"52.0"	"40.0"	"61.35"	"65.7"	"43.1"	"41.32"	"22.86"	"36.51"	"20.51"	"22.82"
"EXAONE 4.0 32B (think)"	"https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B"	"temperature: 0.6
top-p: 0.95"	"Exaone"	"1274.5"	"503.0"	"32.0"	"Open"	"Hybrid"	"On"	"33.82"	"34.25"	"29.38"	"56.97"	"57.44"	"24.71"	"27.27"	"17.14"	"38.49"	"18.54"	"25.5"
"HyperCLOVAX SEED Think 14B (think)"	"https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"	"temperature: 0.5
top-p: 0.6"	"HCX"	"1444.0"	"382.5"	"14.7"	"Open"	"Hybrid"	"On"	"31.84"	"35.0"	"26.56"	"53.78"	"58.68"	"27.59"	"26.45"	"17.14"	"29.76"	"17.13"	"20.47"
"Solar Pro Preview (top_p:0.95, temp: 0.7)"	"https://huggingface.co/upstage/solar-pro-preview-instruct"	"temperature: 0.7
top-p: 0.95"	"Solar"	"260.0"	"260.0"	"22.0"	"Open"	"Instruct"	"Off"	"20.73"	"28.0"	"24.69"	"16.73"	"19.42"	"17.24"	"28.1"	"11.43"	"31.35"	"13.76"	"11.74"
"Mi:dm 2.0 Base Instruct"	"https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct"	"temperature: 0.8
top-p: 0.7"	"kt"	"316.0"	"316.0"	"11.5"	"Open"	"Instruct"	"Off"	"20.25"	"21.75"	"17.5"	"16.73"	"18.6"	"27.59"	"59.5"	"14.29"	"25.4"	"12.64"	"11.41"
"Kanana 1.5 15.7B A3B Instruct"	"https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct"	"temperature: 1.0
top-p: 0.95"	"kakao"	"414.0"	"414.0"	"15.7"	"Open"	"Instruct"	"Off"	"11.71"	"14.25"	"10.62"	"13.55"	"11.16"	"22.41"	"22.31"	"4.29"	"11.9"	"6.74"	"5.37"