Spaces:

SamsungResearch
/

TRUEBench

Running

TRUEBench / ui.py

송종윤/AI Productivity팀(SR)/삼성전자

Initial commit

8a254d6 6 days ago

21.1 kB

	import gradio as gr
	from src.display.formatting import render_leaderboard_html, get_display_model_name
	from src.data_utils import get_length_category_list, get_length_category_df
	import pandas as pd
	import numpy as np

	def render_length_category_html(df, med_len_map=None):
	    """
	    Render the length-category leaderboard as an HTML table.

	    Rows are ordered by "Overall" in descending order. There is no visible Rank
	    column: the Model Name text is coloured gold / silver / bronze for ranks
	    1 / 2 / 3 and is followed by the Type, Model Type and Think badges.
	    Rows whose "Overall" is blank or non-numeric are listed first and get no
	    medal colour.

	    Column order: Model Name, Overall, Med. Len., Med. Resp. Len., the first
	    length category present in the data, then every remaining column
	    (Comment, Group and Link are never shown as columns).

	    Args:
	        df: DataFrame to render; must contain an "Overall" column.
	        med_len_map: Optional mapping of model name -> median length. When
	            given, a "Med. Len." column is filled from it.

	    Returns:
	        An HTML string: the table wrapped in a scrollable container, or
	        "<div>No data available.</div>" when ``df`` is None or empty.
	    """
	    if df is None or df.empty:
	        return "<div>No data available.</div>"

	    # Function-scope imports, resolved once per call instead of once per row.
	    from html import escape
	    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY
	    from src.display.formatting import (
	        get_model_type_badge,
	        get_score_stars,
	        get_think_badge,
	        get_type_badge,
	    )

	    # Order and rank on a numeric view of "Overall" so that blank strings
	    # cannot break the sort; the displayed column itself is left untouched.
	    df = df.copy().reset_index(drop=True)
	    overall_numeric = pd.to_numeric(df["Overall"], errors="coerce")
	    order = overall_numeric.sort_values(
	        ascending=False, kind="mergesort", na_position="first"
	    ).index
	    df = df.loc[order].reset_index(drop=True)
	    overall_numeric = overall_numeric.loc[order].reset_index(drop=True)
	    # Rows without a numeric score get rank 0, i.e. no medal colour.
	    df["Rank_Internal"] = (
	        overall_numeric.rank(method="min", ascending=False).fillna(0).astype(int)
	    )

	    # Badge rendering needs these columns; default them when absent.
	    for badge_col in ("Type", "Model Type", "Think"):
	        if badge_col not in df.columns:
	            df[badge_col] = "unknown"

	    if med_len_map is not None:
	        df["Med. Len."] = df["Model Name"].map(med_len_map)

	    hidden_cols = ("Rank_Internal", "Comment", "Group", "Link")
	    base_cols = [col for col in df.columns if col not in hidden_cols]

	    # The dynamic category column (e.g. "Short", "Long") is the first known
	    # length category that is present in the data.
	    category_cols = [col for col in get_length_category_list() if col in base_cols]
	    category_col = category_cols[0] if category_cols else None

	    leading_cols = ("Model Name", "Overall", "Med. Len.", "Med. Resp. Len.")
	    display_cols = [col for col in leading_cols if col in base_cols]
	    if category_col:
	        display_cols.append(category_col)
	    for col in base_cols:
	        if col not in display_cols:
	            display_cols.append(col)

	    # Header cells; some columns carry an info icon with a tooltip.
	    header_tips = {
	        "Model Name": "Hovering the mouse displays additional details, and clicking the model name navigates to the corresponding page.",
	        "Med. Len.": "Median token length of think and response for the model.",
	        "Med. Resp. Len.": "Median token length of the model's responses (excluding think).",
	    }
	    parts = ['<table class="pretty-leaderboard-table">\n<thead><tr>']
	    for col in display_cols:
	        tip = header_tips.get(col)
	        if tip is None:
	            parts.append(f'<th>{col}</th>')
	        else:
	            parts.append(
	                f'<th>{col}<span class="info-icon" title="{tip}">ⓘ</span></th>'
	            )
	    parts.append('</tr></thead>\n<tbody>\n')

	    def format_leaderboard_cell(cell, col):
	        """Format a plain cell: integer, two decimals, or str() by column."""
	        # NaN / blank values are passed through unchanged.
	        if pd.isna(cell) or (isinstance(cell, str) and cell.strip() == ""):
	            return cell
	        try:
	            if col in NUMERIC_INT_COLS_CATEGORY:
	                return str(int(round(float(cell))))
	            if col in NUMERIC_COLS_CATEGORY:
	                return "{:.2f}".format(float(cell))
	        except (TypeError, ValueError, OverflowError):
	            pass  # not numeric after all: fall back to the plain text form
	        return str(cell)

	    def optional_text(row, name):
	        """Return row[name] as text, or "" if missing, NaN or blank."""
	        if name not in row or pd.isna(row[name]):
	            return ""
	        text = str(row[name])
	        return text if text.strip() != "" else ""

	    medal_styles = {
	        1: "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;",
	        2: "color: #b0b0b0; font-weight: bold;",
	        3: "color: #cd7f32; font-weight: bold;",
	    }
	    default_style = "color: #fff; font-weight: 600;"
	    # Characters replaced by "_" when deriving an id from the model name.
	    id_table = str.maketrans(" -()", "____")

	    for _, row in df.iterrows():
	        parts.append('<tr>')
	        for col in display_cols:
	            cell = row[col]
	            if col == "Model Name":
	                style = medal_styles.get(int(row["Rank_Internal"]), default_style)
	                badge_html = (
	                    get_type_badge(row["Type"])
	                    + get_model_type_badge(row["Model Type"])
	                    + get_think_badge(row["Think"])
	                )
	                display_name = get_display_model_name(str(cell))

	                # Tooltip from the optional Comment column. Escaped because
	                # it is placed inside a double-quoted HTML attribute.
	                comment_value = optional_text(row, "Comment").strip()
	                title_attribute = (
	                    f' title="{escape(comment_value, quote=True)}"'
	                    if comment_value
	                    else ""
	                )

	                # Optional hyperlink around the model name (escaped likewise).
	                link_value = optional_text(row, "Link")
	                if link_value:
	                    clickable_name = (
	                        f'<a href="{escape(link_value, quote=True)}" '
	                        f'target="_blank" style="color:inherit;">{display_name}</a>'
	                    )
	                else:
	                    clickable_name = display_name

	                parts.append(
	                    f'<td><span style="{style}"{title_attribute}>'
	                    f'{clickable_name}</span>{badge_html}</td>'
	                )
	            elif col == "Overall":
	                # Star rendering is best-effort: any failure (non-numeric
	                # score, missing model name, helper error) shows the raw value.
	                try:
	                    unique_id = row.get("Model Name", None).translate(id_table)
	                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
	                except Exception:
	                    cell_html = str(cell)
	                parts.append(f'<td>{cell_html}</td>')
	            else:
	                parts.append(f'<td>{format_leaderboard_cell(cell, col)}</td>')
	        parts.append('</tr>\n')
	    parts.append('</tbody></table>')

	    # Wrap in a scrollable div so the table header can stay sticky.
	    table_html = "".join(parts)
	    return f'<div class="leaderboard-table-container" style="max-height:900px;overflow-y:auto;">{table_html}</div>'

	def render_length_category_table(leaderboard_df=None):
	    """
	    Build a category dropdown plus an HTML table of length stats for the
	    selected category.

	    When ``leaderboard_df`` is given, its "Overall" and per-category scores
	    (and Type / Model Type / Think, when present) are copied onto the length
	    table by model name, and rows lacking either score are dropped.

	    Returns:
	        dict with the created components under "category_selector" and
	        "table_html".
	    """
	    import gradio as gr

	    categories = get_length_category_list()
	    default_category = categories[0] if categories else ""

	    def get_merged_df(selected_category):
	        """Return the length table for a category, enriched from leaderboard_df."""
	        if not selected_category:
	            return None
	        df_cat = get_length_category_df(selected_category)
	        if leaderboard_df is None or df_cat is None:
	            return df_cat

	        model_names = leaderboard_df["Model Name"]
	        # Scores are always copied; badge columns only when the source has them.
	        copied_cols = ["Overall", selected_category]
	        copied_cols += [
	            name
	            for name in ("Type", "Model Type", "Think")
	            if name in leaderboard_df.columns
	        ]

	        merged = df_cat.copy()
	        for name in copied_cols:
	            lookup = dict(zip(model_names, leaderboard_df[name]))
	            merged[name] = merged["Model Name"].map(lookup)

	        # Keep only rows that have both an Overall and a category score.
	        has_scores = merged["Overall"].notna() & merged[selected_category].notna()
	        return merged[has_scores]

	    initial_df = get_merged_df(default_category)

	    # Median lengths are shown only if the leaderboard provides them.
	    med_len_map = None
	    if leaderboard_df is not None and "Med. Len." in leaderboard_df.columns:
	        med_len_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Med. Len."]))

	    if initial_df is None:
	        initial_html = "<div>No data available.</div>"
	    else:
	        initial_html = render_length_category_html(initial_df, med_len_map=med_len_map)

	    with gr.Column():
	        category_selector = gr.Dropdown(
	            choices=categories,
	            value=default_category,
	            label="Select Category for Length Table",
	            interactive=True,
	        )
	        table_html = gr.HTML(value=initial_html, elem_id="length-category-table")

	    def update_table(selected_category):
	        """Re-render the table for the newly selected category."""
	        return render_length_category_html(
	            get_merged_df(selected_category), med_len_map=med_len_map
	        )

	    category_selector.change(
	        fn=update_table,
	        inputs=[category_selector],
	        outputs=[table_html],
	    )

	    return {
	        "category_selector": category_selector,
	        "table_html": table_html,
	    }

	def create_leaderboard_tab(df, key):
	    """
	    Create the widgets of one leaderboard tab: filter checkboxes, a "Sort by"
	    dropdown, the HTML leaderboard, and a CSV download button.

	    Args:
	        df: DataFrame to display; must contain an "Overall" column.
	        key: "Category" or "Language". Selects the set of sortable columns and
	            is passed through to the HTML renderer and the CSV file name.

	    Returns:
	        dict with the created components ("type_selector",
	        "model_type_selector", "think_selector", "leaderboard_html_comp",
	        "sort_col_dropdown", "df_state") and the "unified_filter" callback,
	        or None when ``df`` has no "Overall" column.
	    """
	    if "Overall" not in df.columns:
	        return None

	    def rank_by(frame, column):
	        """Sort by `column` descending (blank strings first) and add "Rank"."""
	        # Blank strings become +inf so that a descending sort lists them first.
	        sort_key = frame[column].replace("", np.inf).astype(float)
	        ranked = (
	            frame.assign(sort_col_tmp=sort_key)
	            .sort_values("sort_col_tmp", ascending=False, kind="mergesort")
	            .drop(columns=["sort_col_tmp"])
	            .reset_index(drop=True)
	        )
	        ranked["Rank"] = ranked.index + 1
	        cols = ranked.columns.tolist()
	        if "Model Name" in cols:
	            # Place "Rank" directly after "Model Name".
	            cols.remove("Rank")
	            cols.insert(cols.index("Model Name") + 1, "Rank")
	            ranked = ranked[cols]
	        return ranked

	    df_state = gr.State(df)

	    # Frame for the initial render: Overall on a 0-100 scale, no Group column.
	    df_badge = df.copy()
	    if df_badge["Overall"].max() <= 1.0:
	        df_badge["Overall"] = df_badge["Overall"] * 100
	    df_badge = df_badge.drop(columns=["Group"], errors="ignore")
	    df_badge = rank_by(df_badge, "Overall")

	    # Sortable columns: fixed metrics plus either two-letter language codes
	    # or the known category names -- always limited to columns that exist.
	    if key == "Language":
	        import re
	        language_columns = [
	            col
	            for col in df_badge.columns
	            if isinstance(col, str) and re.fullmatch(r"[A-Z]{2}", col)
	        ]
	        sort_candidates = [
	            "Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)"
	        ] + language_columns
	    else:
	        sort_candidates = [
	            "Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)",
	            "Content Generation", "Editing", "Data Analysis", "Reasoning",
	            "Hallucination", "Safety", "Repetition", "Summarization",
	            "Translation", "Multi-Turn",
	        ]
	    available_sort_columns = [col for col in sort_candidates if col in df_badge.columns]

	    # NOTE(review): the original indentation was lost; the sort dropdown is
	    # assumed to share the row with the three filter selectors -- confirm.
	    with gr.Row():
	        # Type selector (Open/Proprietary)
	        type_choices = ["Open", "Proprietary"]
	        type_selector = gr.CheckboxGroup(
	            choices=type_choices,
	            value=type_choices,
	            label="Select Type (Open/Proprietary)"
	        )
	        # Model Type selector (Instruct/Think/Hybrid)
	        model_type_choices = ["Instruct", "Think", "Hybrid"]
	        model_type_selector = gr.CheckboxGroup(
	            choices=model_type_choices,
	            value=model_type_choices,
	            label="Select Model Type (Instruct/Think/Hybrid)"
	        )
	        # Think selector (On/Off)
	        think_choices = ["On", "Off"]
	        think_selector = gr.CheckboxGroup(
	            choices=think_choices,
	            value=think_choices,
	            label="Select Think Mode (On/Off)"
	        )
	        # Sort criterion (sorting is always descending)
	        sort_col_dropdown = gr.Dropdown(
	            choices=available_sort_columns,
	            value="Overall",
	            label="Sort by",
	            interactive=True,
	        )

	    leaderboard_html = render_leaderboard_html(df_badge.round(3), overall_col="Overall", key=key)
	    leaderboard_html_comp = gr.HTML(value=leaderboard_html, elem_id="leaderboard-table")

	    def unified_filter(types, model_types, thinks, df, sort_col):
	        """
	        Filter `df` by the selected Type / Model Type / Think values, sort it
	        by `sort_col` (descending) and render it.

	        An empty selection for a filter means "no restriction". Always returns
	        a 3-tuple (html, sort_col, top5_model_names) to match the three outputs
	        this callback is wired to.
	        """
	        filtered = df.copy()
	        if "Overall" not in filtered.columns:
	            html = "<div style='color:red'>No 'Overall' column found in data. Please check your input data.</div>"
	            return html, sort_col, []

	        selections = (("Type", types), ("Model Type", model_types), ("Think", thinks))
	        for column, selected in selections:
	            if column not in filtered.columns:
	                continue
	            filtered[column] = filtered[column].fillna("").astype(str)
	            if not selected:
	                continue  # nothing ticked: keep every row
	            wanted = [str(value).lower().strip() for value in selected]
	            filtered = filtered[filtered[column].str.lower().str.strip().isin(wanted)]

	        # A cleared dropdown or an unknown column falls back to "Overall".
	        if sort_col not in filtered.columns:
	            sort_col = "Overall"

	        filtered = rank_by(filtered, sort_col)
	        filtered = filtered.drop(columns=["Group"], errors="ignore")
	        # Plain attribute on the frame; presumably read by
	        # render_leaderboard_html -- verify before removing.
	        filtered._sort_col = sort_col

	        # The frame is already sorted, so the top five are its first five rows.
	        top5_models = []
	        if "Model Name" in filtered.columns:
	            top5_models = filtered["Model Name"].tolist()[:5]
	        return render_leaderboard_html(filtered, overall_col="Overall", key=key), sort_col, top5_models

	    def dataframe_to_csv(data):
	        """Write `data` to truebench_<key>.csv (display model names) and return the path."""
	        if isinstance(data, pd.DataFrame):
	            frame = data.copy()  # never modify the caller's DataFrame
	        else:
	            frame = pd.DataFrame(data)

	        if "Model Name" in frame.columns:
	            frame["Model Name"] = frame["Model Name"].apply(get_display_model_name)

	        csv_path = f"truebench_{key}.csv"
	        frame.to_csv(csv_path, index=False)
	        return csv_path

	    # Download button, pushed to the right by an empty expanding column.
	    with gr.Row():
	        with gr.Column(scale=1):
	            pass  # Empty space
	        with gr.Column(scale=0):
	            download_btn = gr.DownloadButton(
	                label="📥 Download to CSV",
	                value=dataframe_to_csv,
	                inputs=[df_state],
	                visible=True,
	                elem_classes=["custom-download-btn"]
	            )

	    # Custom CSS for the download button
	    custom_css = """
	    <style>
	    .custom-download-btn >>> a {
	    background: #e3e6f3 !important;
	    color: #222 !important;
	    border: 1px solid rgba(0, 0, 0, 0.1) !important;
	    border-radius: 6px !important;
	    padding: 1px 1px !important;
	    font-size: 13px !important;
	    font-weight: bold !important;
	    text-shadow: 0 1px 1px rgba(0,0,0,0.1) !important;
	    margin: 0 3px 3px 0 !important;
	    }
	    .custom-download-btn:hover {
	    background: #f5f6fa !important;
	    box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1) !important;
	    }
	    </style>
	    """
	    gr.HTML(custom_css)

	    # Every control re-runs the same filter. The third output receives the
	    # top-5 model names in a fresh, otherwise unused State per listener.
	    filter_inputs = [type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown]
	    for control in (sort_col_dropdown, type_selector, model_type_selector, think_selector):
	        control.change(
	            fn=unified_filter,
	            inputs=filter_inputs,
	            outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()]
	        )

	    return {
	        "type_selector": type_selector,
	        "model_type_selector": model_type_selector,
	        "think_selector": think_selector,
	        "leaderboard_html_comp": leaderboard_html_comp,
	        "sort_col_dropdown": sort_col_dropdown,
	        "df_state": df_state,
	        "unified_filter": unified_filter  # Exposed for direct external call
	    }