|
import json |
|
from pathlib import Path |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
|
|
# Page heading markup for the leaderboard.
TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for Minecraft</h1>"""

# Short blurb describing the leaderboard.  A plain string literal: the text has
# no placeholders, so an f-string prefix would only be noise.
DESCRIPTION = """
Evaluation of VLM on Minecraft
"""

# Benchmarks to leave out; currently empty.
BENCHMARKS_TO_SKIP = []
|
|
|
|
|
def get_leaderboard_df(score_path, threshold="20"):
    """Load model scores from a JSON file into a display-ready DataFrame.

    The JSON document maps each model name to a dict of metrics.  A metric
    value is either a scalar, stored under its key unchanged, or a dict keyed
    by threshold, in which case only the entry for ``threshold`` is kept and
    stored under the metric name with underscores replaced by spaces.

    Args:
        score_path: Path of the JSON score file.
        threshold: Key selecting which entry of a nested metric dict is
            reported.  Defaults to ``"20"``, the value that used to be
            hard-coded.

    Returns:
        A DataFrame whose first column is ``"Model"``, followed by one column
        per metric.  Numeric cells are multiplied by 100 and rendered as
        strings with two decimals; any other cell is left as it is.
    """
    with open(score_path, "r", encoding="utf-8") as f:
        scores = json.load(f)

    threshold = str(threshold)  # JSON object keys are always strings

    rows = []
    for model, metrics in scores.items():
        row = {"Model": model}
        for key, value in metrics.items():
            if not isinstance(value, dict):
                row[key] = value
            elif threshold in value:
                row[key.replace("_", " ")] = value[threshold]
        rows.append(row)

    df = pd.DataFrame(rows)

    def as_percent(cell):
        # A metric missing for a model is NaN (a float) and so renders as "nan".
        return f"{cell * 100:.2f}" if isinstance(cell, (int, float)) else cell

    # Replace each metric column wholesale instead of assigning strings through
    # ``df.iloc[:, 1:] = ...``: that form writes into the existing float
    # columns in place (an incompatible-dtype set pandas has deprecated), and
    # ``DataFrame.applymap`` itself is deprecated as well.
    for column in df.columns[1:]:
        df[column] = df[column].map(as_percent)

    return df
|
|
|
|
|
# Loaded once at import time; the table helpers and tabs below work on copies
# of this frame rather than mutating it.
leaderboard_df = get_leaderboard_df(score_path="score.json")
|
|
|
import gradio as gr |
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
# Tab label -> leaderboard columns shown in that tab.  The first column of each
# list is the one the tab's table is sorted by.  Column names must match those
# produced by ``get_leaderboard_df``: scalar metrics keep their JSON key
# verbatim, while nested metrics have underscores replaced by spaces.
TASKS = {
    "VQA": ["VQA"],
    "QA": ["QA"],
    "VQA Reasoning": ["VQA_Reasoning"],
    "Reason": ["Reason"],
    "Embodied Grounding": ["Embodied Grounding"],
    "GUI Grounding": ["Gui Grounding"],
}
|
|
|
|
|
def filter_and_search(search_query: str, task_name: str):
    """Return the ranked table for one task, optionally filtered by model name.

    Args:
        search_query: Text typed into the search box.  Several terms may be
            separated by ``;``; a model is kept when its name contains any of
            them.  Matching is case-insensitive and literal (no regex).
        task_name: Key of ``TASKS`` selecting the columns to show.

    Returns:
        A DataFrame with the ``"Model"`` column plus the task's columns, sorted
        by the task's first column in descending order, with scores that are
        not numeric placed last.
    """
    task_cols = TASKS[task_name]
    score_col = task_cols[0]

    df = leaderboard_df[["Model"] + task_cols].copy()
    df[score_col] = pd.to_numeric(df[score_col], errors="coerce")
    df = df.sort_values(by=score_col, ascending=False, na_position="last")

    # Blank terms (e.g. from a trailing ";") are dropped: as an empty regex
    # alternative they used to match every row and defeat the filter.
    pieces = (part.strip().lower() for part in (search_query or "").split(";"))
    terms = [term for term in pieces if term]
    if not terms:
        return df

    # Match each term as a literal substring.  Joining raw user text into a
    # regex made characters such as "(" or "+" raise ``re.error`` and made "."
    # match any character.
    names = df["Model"].astype(str).str.lower()
    keep = pd.Series(False, index=df.index)
    for term in terms:
        keep |= names.str.contains(term, regex=False)
    return df[keep]
|
|
|
|
|
# First version of the leaderboard UI: one tab per task with a search box and
# the ranked score table.
#
# NOTE: ``demo.launch()`` is deliberately not called here.  ``launch()`` blocks
# the main thread when the file runs as a script, and this file later defines a
# second ``demo`` (the same layout plus a Refresh button) followed by its own
# ``demo.launch()``.  Launching at this point served this older UI and kept the
# later one from starting until the first server was stopped.  This definition
# is rebound by the later block and is a candidate for removal.
with gr.Blocks() as demo:
    gr.HTML("<h2>Leaderboard</h2>")
    with gr.Column():
        gr.Markdown("Search and view results for each task.", elem_classes="markdown-text")

        with gr.Tabs(elem_classes="tabs-buttons") as tabs:
            for task_name, task_cols in TASKS.items():
                with gr.TabItem(task_name):
                    # Initial table: this task's columns, ranked by the first
                    # one, with non-numeric scores last.
                    sub_df = leaderboard_df[["Model"] + task_cols].copy()
                    sub_df[task_cols[0]] = pd.to_numeric(sub_df[task_cols[0]], errors="coerce")
                    sub_df = sub_df.sort_values(by=task_cols[0], ascending=False, na_position="last")

                    with gr.Row():
                        search_bar = gr.Textbox(placeholder="Search model name...", show_label=False)

                    with gr.Group():
                        table = gr.Dataframe(
                            value=sub_df,
                            wrap=True,
                            column_widths=[400] + [110 for _ in task_cols],
                        )

                    # ``t=task_name`` freezes the current loop value; a plain
                    # closure would see only the last task once the loop ends.
                    search_bar.submit(
                        fn=lambda query, t=task_name: filter_and_search(query, t),
                        inputs=search_bar,
                        outputs=table,
                    )

        gr.HTML("Threshold corresponding to the values of GUI and Embodied Grounding: <b>20</b>")
|
|
|
|
|
def filter_and_search(search_query: str, task_name: str):
    """Return the ranked table for one task, optionally filtered by model name.

    Args:
        search_query: Text typed into the search box.  Several terms may be
            separated by ``;``; a model is kept when its name contains any of
            them.  Matching is case-insensitive and literal (no regex).
        task_name: Key of ``TASKS`` selecting the columns to show.

    Returns:
        A DataFrame with the ``"Model"`` column plus the task's columns, sorted
        by the task's first column in descending order, with scores that are
        not numeric placed last.
    """
    task_cols = TASKS[task_name]
    score_col = task_cols[0]

    df = leaderboard_df[["Model"] + task_cols].copy()
    df[score_col] = pd.to_numeric(df[score_col], errors="coerce")
    df = df.sort_values(by=score_col, ascending=False, na_position="last")

    # Blank terms (e.g. from a trailing ";") are dropped: as an empty regex
    # alternative they used to match every row and defeat the filter.
    pieces = (part.strip().lower() for part in (search_query or "").split(";"))
    terms = [term for term in pieces if term]
    if not terms:
        return df

    # Match each term as a literal substring.  Joining raw user text into a
    # regex made characters such as "(" or "+" raise ``re.error`` and made "."
    # match any character.
    names = df["Model"].astype(str).str.lower()
    keep = pd.Series(False, index=df.index)
    for term in terms:
        keep |= names.str.contains(term, regex=False)
    return df[keep]
|
|
|
def get_initial_table(task_name: str):
    """Return the unfiltered, ranked table for one task.

    Args:
        task_name: Key of ``TASKS`` selecting the columns to show.

    Returns:
        A DataFrame with the ``"Model"`` column plus the task's columns, sorted
        by the task's first column in descending order, with scores that are
        not numeric placed last.
    """
    columns = ["Model", *TASKS[task_name]]
    ranking_col = columns[1]

    # Work on a copy of just the needed columns so the shared frame is untouched.
    table = leaderboard_df.loc[:, columns].copy()
    table[ranking_col] = pd.to_numeric(table[ranking_col], errors="coerce")
    return table.sort_values(by=ranking_col, ascending=False, na_position="last")
|
|
|
|
|
# Leaderboard UI: one tab per task, each with a search box, a Refresh button
# and the ranked score table.
with gr.Blocks() as demo:
    gr.HTML("<h2>Leaderboard</h2>")
    with gr.Column():
        gr.Markdown("Search and view results for each task.", elem_classes="markdown-text")

        with gr.Tabs(elem_classes="tabs-buttons") as tabs:
            for task_name, task_cols in TASKS.items():
                with gr.TabItem(task_name):
                    # Reuse the helper that Refresh calls, so the first render
                    # and a refresh cannot drift apart.
                    sub_df = get_initial_table(task_name)

                    with gr.Row():
                        search_bar = gr.Textbox(placeholder="Search model name...", show_label=False)
                        refresh_btn = gr.Button("Refresh")

                    with gr.Group():
                        table = gr.Dataframe(
                            value=sub_df,
                            wrap=True,
                            column_widths=[400] + [110] * len(task_cols),
                        )

                    # ``t=task_name`` / ``task=task_name`` freeze the current
                    # loop value; a plain closure would see only the last task
                    # once the loop has finished.
                    search_bar.submit(
                        fn=lambda query, t=task_name: filter_and_search(query, t),
                        inputs=search_bar,
                        outputs=table,
                    )

                    def refresh(task=task_name):
                        """Clear the search box and restore the full ranked table."""
                        return "", get_initial_table(task)

                    refresh_btn.click(
                        fn=refresh,
                        outputs=[search_bar, table],
                    )

        gr.HTML("Threshold corresponding to the values of GUI and Embodied Grounding: <b>20</b>")

demo.launch()