# LLM Leaderboard for Minecraft

import json
from pathlib import Path

import gradio as gr
import pandas as pd

# Page heading as a raw HTML snippet.
TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for Minecraft</h1>"""

# Short description of the leaderboard. This is a plain string: the text
# contains no placeholders, so an f-string prefix would be misleading.
DESCRIPTION = """
Evaluation of VLM on Minecraft
"""

# Presumably benchmark names to leave out of the leaderboard; it is empty and
# not referenced by the code visible in this file.
BENCHMARKS_TO_SKIP = []


def get_leaderboard_df(score_path):
    """Load per-model scores from a JSON file into a display-ready DataFrame.

    The file is expected to hold a mapping ``{model_name: {metric: value}}``.
    A scalar metric value becomes a column named after the metric key as-is.
    For a dict-valued metric only the entry stored under the key ``"20"`` is
    kept, in a column named after the metric key with underscores replaced by
    spaces; every other entry of that dict is ignored.

    Args:
        score_path: Path of the JSON score file.

    Returns:
        A DataFrame with one row per model. The first column is ``"Model"``.
        In every other column, numeric values are multiplied by 100 and
        rendered as strings with two decimals (a missing value becomes the
        string ``"nan"``); non-numeric values are left unchanged.
    """

    def _format_score(value):
        # Scores are stored as fractions; show them as percentages.
        if isinstance(value, (int, float)):
            return f"{value * 100:.2f}"
        return value

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(score_path, "r", encoding="utf-8") as f:
        scores = json.load(f)

    rows = []
    for model, metrics in scores.items():
        row = {"Model": model}
        for key, value in metrics.items():
            if isinstance(value, dict):
                # Nested metric: keep only the value at threshold key "20".
                if "20" in value:
                    row[key.replace("_", " ")] = value["20"]
            else:
                row[key] = value
        rows.append(row)

    df = pd.DataFrame(rows)
    # Replace each score column wholesale instead of writing strings into
    # numeric columns through ``iloc`` (an incompatible-dtype assignment), and
    # use ``Series.map`` rather than the deprecated ``DataFrame.applymap``.
    for column in df.columns[1:]:
        df[column] = df[column].map(_format_score)

    return df


# Load the scores once at import time. The relative path "score.json" is
# passed straight to open(), so it is resolved against the current working
# directory.
leaderboard_df = get_leaderboard_df("score.json")

import gradio as gr
import pandas as pd

# Example: use a dataframe you already have
# leaderboard_df = pd.read_csv("your_data.csv")

# Task -> column-name lists. The UI code in this file builds one tab per key
# and shows "Model" plus the listed columns; the first column in each list is
# the one the tables are sorted by.
# NOTE(review): these names must match the columns produced by
# get_leaderboard_df, which replaces underscores with spaces only for
# dict-valued metrics — confirm each against score.json.
TASKS = {
    "VQA": ["VQA"],
    "QA": ["QA"],
    "VQA Reasoning": ["VQA_Reasoning"],
    "Reason": ["Reason"],  # TODO: make sure this column name is correct
    "Embodied Grounding": ["Embodied Grounding"],
    "GUI Grounding": ["Gui Grounding"],
}

# Filter function: searches by model-name keywords only.
def filter_and_search(search_query: str, task_name: str) -> pd.DataFrame:
    """Return the table for one task, sorted by score and filtered by name.

    Args:
        search_query: Keywords separated by ``;``. A row is kept when its
            ``"Model"`` value contains at least one keyword, compared
            case-insensitively as literal text. Blank keywords are ignored;
            an empty (or ``None``) query keeps every row.
        task_name: Key into ``TASKS`` selecting the columns to show.

    Returns:
        A DataFrame with ``"Model"`` plus the task's columns, ordered by the
        task's first column in descending numeric order with unparsable or
        missing scores last.

    Raises:
        KeyError: If ``task_name`` is not a key of ``TASKS``.
    """
    task_cols = TASKS[task_name]
    score_col = task_cols[0]

    df = leaderboard_df.copy()
    # Scores are stored as formatted strings; convert so the sort is numeric.
    df[score_col] = pd.to_numeric(df[score_col], errors="coerce")
    df = df.sort_values(by=score_col, ascending=False, na_position="last")

    # Drop blank terms: an empty alternative would match every row.
    terms = [term.strip().lower() for term in (search_query or "").split(";")]
    terms = [term for term in terms if term]

    if terms:
        names = df["Model"].str.lower()
        mask = pd.Series(False, index=df.index)
        for term in terms:
            # Literal matching: user text such as "(" or "." must not be
            # interpreted as regex syntax. na=False keeps the mask boolean.
            mask |= names.str.contains(term, regex=False, na=False)
        df = df[mask]

    return df[["Model"] + task_cols]

# Build the Gradio UI: one tab per entry in TASKS, each with a search box and
# a results table.
# NOTE(review): a second, near-identical UI (adding a Refresh button) is built
# and launched again later in this file, rebinding `demo`. Presumably only one
# of the two is meant to remain — confirm, since launching here may keep the
# later block from being reached while this server is running.
with gr.Blocks() as demo:
    gr.HTML("<h2>Leaderboard</h2>")
    with gr.Column():
        gr.Markdown("Search and view results for each task.", elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tabs-buttons") as tabs:
        for task_name, task_cols in TASKS.items():
            with gr.TabItem(task_name):
                # Initial data: sorted by score, descending (the first task
                # column is converted to numbers; unparsable values sort last).
                sub_df = leaderboard_df[["Model"] + task_cols].copy()
                sub_df[task_cols[0]] = pd.to_numeric(sub_df[task_cols[0]], errors="coerce")
                sub_df = sub_df.sort_values(by=task_cols[0], ascending=False, na_position="last")

                with gr.Row():
                    search_bar = gr.Textbox(placeholder="Search model name...", show_label=False)

                with gr.Group():
                    table = gr.Dataframe(
                        value=sub_df,
                        wrap=True,
                        column_widths=[400] + [110 for _ in task_cols],
                    )

                # Wire up the search logic. `t=task_name` binds the current
                # task as a default argument so each tab's callback keeps its
                # own task rather than the loop's final value.
                search_bar.submit(
                    fn=lambda query, t=task_name: filter_and_search(query, t),
                    inputs=search_bar,
                    outputs=table,
                )

        # Footer note, created once after the per-task loop.
        gr.HTML("Threshold corresponding to the values of GUI and Embodied Grounding: <b>20</b>")

# Start the app as a module-level side effect.
demo.launch()

# Filter function: searches by model-name keywords only.
# NOTE(review): a function with the same name is also defined earlier in this
# file; this later definition rebinds the name when it is executed.
def filter_and_search(search_query: str, task_name: str) -> pd.DataFrame:
    """Return the table for one task, sorted by score and filtered by name.

    Args:
        search_query: Keywords separated by ``;``. A row is kept when its
            ``"Model"`` value contains at least one keyword, compared
            case-insensitively as literal text. Blank keywords are ignored;
            an empty (or ``None``) query keeps every row.
        task_name: Key into ``TASKS`` selecting the columns to show.

    Returns:
        A DataFrame with ``"Model"`` plus the task's columns, ordered by the
        task's first column in descending numeric order with unparsable or
        missing scores last.

    Raises:
        KeyError: If ``task_name`` is not a key of ``TASKS``.
    """
    task_cols = TASKS[task_name]
    score_col = task_cols[0]

    df = leaderboard_df.copy()
    # Scores are stored as formatted strings; convert so the sort is numeric.
    df[score_col] = pd.to_numeric(df[score_col], errors="coerce")
    df = df.sort_values(by=score_col, ascending=False, na_position="last")

    # Drop blank terms: an empty alternative would match every row.
    terms = [term.strip().lower() for term in (search_query or "").split(";")]
    terms = [term for term in terms if term]

    if terms:
        names = df["Model"].str.lower()
        mask = pd.Series(False, index=df.index)
        for term in terms:
            # Literal matching: user text such as "(" or "." must not be
            # interpreted as regex syntax. na=False keeps the mask boolean.
            mask |= names.str.contains(term, regex=False, na=False)
        df = df[mask]

    return df[["Model"] + task_cols]

def get_initial_table(task_name: str):
    """Return the unfiltered table for one task, best score first.

    Looks up the task's columns in ``TASKS``, converts the first of them to
    numbers on a copy of ``leaderboard_df`` (values that cannot be parsed
    become NaN), sorts by it in descending order with NaN last, and returns
    ``"Model"`` plus the task's columns.
    """
    columns = TASKS[task_name]
    primary = columns[0]

    table = leaderboard_df.copy()
    numeric_scores = pd.to_numeric(table[primary], errors="coerce")
    table[primary] = numeric_scores

    ranked = table.sort_values(primary, ascending=False, na_position="last")
    return ranked[["Model", *columns]]

# Build the Gradio UI: one tab per entry in TASKS, each with a search box, a
# Refresh button and a results table.
# NOTE(review): an earlier block in this file already builds and launches a
# similar UI (without the Refresh button) under the same name `demo` —
# presumably only one of the two is meant to remain; confirm.
with gr.Blocks() as demo:
    gr.HTML("<h2>Leaderboard</h2>")
    with gr.Column():
        gr.Markdown("Search and view results for each task.", elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tabs-buttons") as tabs:
        for task_name, task_cols in TASKS.items():
            with gr.TabItem(task_name):
                # Initial data: sorted by score, descending (the first task
                # column is converted to numbers; unparsable values sort last).
                sub_df = leaderboard_df[["Model"] + task_cols].copy()
                sub_df[task_cols[0]] = pd.to_numeric(sub_df[task_cols[0]], errors="coerce")
                sub_df = sub_df.sort_values(by=task_cols[0], ascending=False, na_position="last")

                with gr.Row():
                    search_bar = gr.Textbox(placeholder="Search model name...", show_label=False)

                refresh_btn = gr.Button("Refresh")
                with gr.Group():
                    table = gr.Dataframe(
                        value=sub_df,
                        wrap=True,
                        column_widths=[400] + [110 for _ in task_cols],
                    )

                # Wire up the search logic. `t=task_name` binds the current
                # task as a default argument so each tab's callback keeps its
                # own task rather than the loop's final value.
                search_bar.submit(
                    fn=lambda query, t=task_name: filter_and_search(query, t),
                    inputs=search_bar,
                    outputs=table,
                )
                # Refresh callback: returns an empty string for the search box
                # and the unfiltered table for this tab's task (bound via the
                # default argument, as above).
                def refresh(task=task_name):
                    return "", get_initial_table(task)

                refresh_btn.click(
                    fn=refresh,
                    outputs=[search_bar, table]
                )

        # Footer note, created once after the per-task loop.
        gr.HTML("Threshold corresponding to the values of GUI and Embodied Grounding: <b>20</b>")

# Start the app as a module-level side effect.
demo.launch()