Spaces:

HPAI-BSC
/

TuRTLe-Leaderboard

Running

File size: 19,692 Bytes

711a69b
05222de
 
 
65e4811
0762cf8
05222de
711a69b
 
 
 
05222de
65e4811
aaf0c71
b20457b
711a69b
becd2f0
aaf0c71
 
711a69b
 
aaf0c71
 
711a69b
 
8e9d8db
 
711a69b
 
 
 
 
 
 
becd2f0
711a69b
65e4811
711a69b
 
 
65e4811
711a69b
 
 
 
 
 
 
 
8e9d8db
711a69b
65e4811
 
aaf0c71
711a69b
 
 
 
 
 
 
 
 
aaf0c71
 
711a69b
aaf0c71
 
cc18a4c
aaf0c71
cc18a4c
8e9d8db
cc18a4c
aaf0c71
cc18a4c
 
711a69b
 
 
 
 
 
 
cc18a4c
65e4811
711a69b
65e4811
a25be15
711a69b
 
a25be15
711a69b
 
 
65e4811
711a69b
 
 
 
 
 
 
 
65e4811
711a69b
 
 
aaf0c71
65e4811
711a69b
65e4811
 
711a69b
 
 
 
 
 
 
65e4811
 
 
 
711a69b
 
 
 
 
 
 
 
 
 
 
 
 
3e202b5
65e4811
 
711a69b
 
 
3e202b5
65e4811
 
711a69b
 
 
65e4811
711a69b
3e202b5
711a69b
 
 
 
3e202b5
 
65e4811
 
711a69b
c94926c
 
 
 
 
 
 
 
 
711a69b
 
 
 
65e4811
711a69b
8e9d8db
aaf0c71
8e9d8db
 
711a69b
 
 
 
 
 
 
 
e996c33
711a69b
 
 
 
1df4c13
 
 
711a69b
 
 
 
388e11b
 
1df4c13
05222de
71c4da9
fce1a0c
388e11b
 
 
e70391f
71c4da9
fce1a0c
388e11b
 
 
425eb6f
71c4da9
fce1a0c
388e11b
 
1df4c13
 
 
 
 
 
 
 
 
 
 
 
42bec98
388e11b
711a69b
 
65e4811
 
aaf0c71
8e9d8db
711a69b
 
aaf0c71
711a69b
 
 
 
 
aaf0c71
711a69b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65e4811
711a69b
65e4811
2a6282d
e996c33
711a69b
 
 
 
cf4f8a2
711a69b
59a97a5
0bdaa79
59a97a5
0bdaa79
 
 
 
711a69b
9b52407
711a69b
 
1833d81
aaf0c71
a25be15
711a69b
 
 
 
 
 
a25be15
711a69b
 
 
 
 
218e8a1
711a69b
 
 
 
 
73cf0ca
45b9051
711a69b
 
 
 
 
 
 
 
 
73cf0ca
 
 
 
f73e511
becd2f0
 
f73e511
73cf0ca
e500b12
73cf0ca
 
 
 
 
559ad26
b410573
 
 
73cf0ca
 
559ad26
73cf0ca
 
 
559ad26
73cf0ca
 
3e202b5
8275743
 
9ec240c
 
 
 
2105073
 
 
 
 
9ec240c
 
 
 
 
 
 
73cf0ca
 
 
 
 
1df4c13
73cf0ca
 
 
711a69b
65e4811
711a69b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65e4811
e996c33
 
 
711a69b
 
 
e996c33
a25be15
711a69b
 
 
65e4811
 
 
 
 
 
 
711a69b
65e4811
c94926c
 
 
 
 
 
 
 
 
 
 
711a69b
 
65e4811
 
 
 
c94926c
 
 
 
 
 
 
 
 
 
 
711a69b
 
 
3e202b5
711a69b

import sys

import gradio as gr
import pandas as pd
import plotly.express as px
from gradio.themes.utils import colors

from results.parse import parse_agg, read_data
from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
from style.css_html_js import custom_css
from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases


def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
    """Build the leaderboard table for the current filter selection.

    Args:
        task: Task name; the known values are "Spec-to-RTL",
            "Code Completion" and "Line Completion".
        benchmark: Benchmark name, or "All" to keep every benchmark that
            belongs to ``task``.
        model_type: "All", or a label whose first space-separated token is
            compared with the "Model Type" column (the UI labels carry a
            trailing emoji after the space).
        search_query: Text matched literally and case-insensitively against
            the "Model" column; a falsy value disables the filter.
        max_params: Inclusive upper bound on the "Params" column; converted
            with ``float``.

    Returns:
        The result of ``filter_bench_all``, ``filter_RTLRepo`` or
        ``filter_bench`` applied to the filtered rows. ``None`` when
        ``benchmark`` is "All" and ``task`` is not one of the known tasks.
    """
    task_benchmarks = {
        "Spec-to-RTL": s2r_benchs,
        "Code Completion": cc_benchs,
        "Line Completion": lc_benchs,
    }

    if benchmark == "All":
        # "All" means every benchmark of the selected task, not the whole table.
        subset = df.copy()
        if task in task_benchmarks:
            subset = subset[subset["Benchmark"].isin(task_benchmarks[task])]
    else:
        subset = df[df["Benchmark"] == benchmark]

    if model_type != "All":
        # Compare without the emoji suffix of the UI label.
        subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
    if search_query:
        # regex=False: the query is free text typed by the user, so characters
        # such as "(" or "+" must be matched literally instead of being parsed
        # as a regular expression (which raises on malformed patterns).
        subset = subset[
            subset["Model"].str.contains(
                search_query, case=False, na=False, regex=False
            )
        ]
    subset = subset[subset["Params"] <= float(max_params)]

    if benchmark == "All":
        if task == "Spec-to-RTL":
            return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
        if task == "Code Completion":
            return filter_bench_all(subset, df_agg, agg_column="Agg MC")
        if task == "Line Completion":
            return filter_RTLRepo(subset)
        return None
    if benchmark == "RTL-Repo":
        return filter_RTLRepo(subset)

    # Benchmarks without an entry fall back to agg_column=None.
    agg_columns = {
        "VerilogEval S2R": "Agg VerilogEval S2R",
        "VerilogEval MC": "Agg VerilogEval MC",
        "RTLLM": "Agg RTLLM",
        "VeriGen": "Agg VeriGen",
    }
    return filter_bench(subset, df_agg, agg_columns.get(benchmark))


def update_benchmarks_by_task(task):
    """Refresh the benchmark selector and the leaderboard after a task change.

    Args:
        task: The newly selected task name.

    Returns:
        A 2-tuple: a ``gr.update`` carrying the new benchmark choices and the
        selected value, and the table produced by ``filter_leaderboard``.

    NOTE(review): the model type, search text and max-params filters are read
    from the components' ``.value`` attributes instead of being passed in as
    event inputs. Presumably ``.value`` holds the value given at construction
    time rather than the user's live selection; confirm against Gradio's docs.
    """
    choices_by_task = {
        "Spec-to-RTL": ["All"] + s2r_benchs,
        "Code Completion": ["All"] + cc_benchs,
        # Line Completion has no "All" entry; its first benchmark is selected.
        "Line Completion": lc_benchs,
    }
    new_benchmarks = choices_by_task.get(task)
    if new_benchmarks is None:
        new_benchmarks = ["All"] + benchmarks

    if "All" in new_benchmarks:
        benchmark_value = "All"
    else:
        benchmark_value = new_benchmarks[0]

    table = filter_leaderboard(
        task,
        benchmark_value,
        model_type_dropdown.value,
        search_box.value,
        params_slider.value,
    )
    selector_update = gr.update(value=benchmark_value, choices=new_benchmarks)
    return selector_update, table


def generate_scatter_plot(benchmark, metric):
    """Create the params-vs-score bubble chart for one benchmark and metric.

    Args:
        benchmark: Benchmark name used to select rows of the global ``df``.
        metric: Name of the score column to plot on the y axis.

    Returns:
        A Plotly figure with one bubble per model: x is "Params" on a log
        axis, y is the metric value, and bubble size grows with "Params".
    """
    # The helper may remap the (benchmark, metric) pair before plotting.
    benchmark, metric = handle_special_cases(benchmark, metric)

    subset = df[df["Benchmark"] == benchmark]
    if benchmark == "RTL-Repo":
        # RTL-Repo is plotted as a single column: the mean of the rows whose
        # metric name contains "EM", per model.
        em_rows = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
        detailed_scores = (
            em_rows.groupby("Model", as_index=False)["Score"]
            .mean()
            .rename(columns={"Score": "Exact Matching (EM)"})
        )
    else:
        # One row per model, one column per metric.
        detailed_scores = subset.pivot_table(
            index="Model", columns="Metric", values="Score"
        ).reset_index()

    details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
    scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
        subset=["Params", metric]
    )

    scatter_data["x"] = scatter_data["Params"]
    scatter_data["y"] = scatter_data[metric]
    # Sub-linear scaling keeps the largest models from dwarfing the rest.
    scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40

    # Fixed y ranges per metric; metrics not listed here use [0, 80].
    y_axis_limits = {
        "Functionality (FNC)": [5, 90],
        "Syntax (STX)": [20, 100],
        "Synthesis (SYN)": [5, 90],
        "Power": [0, 50],
        "Performance": [0, 50],
        "Area": [0, 50],
        "Exact Matching (EM)": [0, 50],
    }
    y_range = y_axis_limits.get(metric, [0, 80])

    # Marker colors are assigned by Plotly per "Model Type" value; no explicit
    # color map is passed.
    fig = px.scatter(
        scatter_data,
        x="x",
        y="y",
        log_x=True,
        size="size",
        color="Model Type",
        text="Model",
        hover_data={metric: ":.2f"},
        title=f"Params vs. {metric} for {benchmark}",
        labels={"x": "# Params (Log Scale)", "y": metric},
        template="plotly_white",
        height=600,
        width=1200,
    )

    fig.update_traces(
        textposition="top center",
        textfont_size=10,
        marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
    )
    fig.update_layout(
        xaxis=dict(
            showgrid=True,
            type="log",
            tickmode="array",
            tickvals=[8, 14, 32, 72, 200, 700],
            ticktext=["8", "14", "32", "72", "200", "700"],
        ),
        showlegend=False,
        yaxis=dict(range=y_range),
        margin=dict(l=50, r=50, t=50, b=50),
        plot_bgcolor="white",
    )

    return fig


# JavaScript handed to gr.Blocks(js=...). It defines a function that, when the
# `__theme` query parameter is not already "light", sets it to "light" and
# navigates to the updated URL.
js_func = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""

with gr.Blocks(
    css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)
) as app:
    # A `with` block does not open a new scope, so the names assigned here are
    # module globals. filter_leaderboard, update_benchmarks_by_task and
    # generate_scatter_plot (defined above) read them when they are called.
    df, benchmarks, metrics, default_metric = read_data()
    df_agg = parse_agg("./results/aggregated_scores.csv")
    tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
    # Benchmarks grouped by the task they belong to.
    s2r_benchs = ["VerilogEval S2R", "RTLLM"]
    cc_benchs = ["VerilogEval MC", "VeriGen"]
    lc_benchs = ["RTL-Repo"]
    # Metric choices offered in the plot view for every benchmark except RTL-Repo.
    non_rtl_metrics = [
        "Syntax (STX)",
        "Functionality (FNC)",
        "Synthesis (SYN)",
        "Power",
        "Performance",
        "Area",
    ]
    # Metric choices offered in the plot view for RTL-Repo.
    rtl_metrics = ["Exact Matching (EM)"]
    # UI labels; filter_leaderboard compares only the text before the first
    # space with the "Model Type" column.
    model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]

    # Centered logo, loaded from a `/gradio_api/file=` URL.
    gr.HTML(
        """
    <div align="center">
        <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/>
    </div>
    """
    )
    # Link buttons (GitHub repository, arXiv preprint, submission e-mail) and
    # the contact address.
    gr.HTML(
        """
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
    <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
    <div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
        <a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
            <button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                GitHub Repo 
            </button>
        </a>

        <a href="http://arxiv.org/abs/2504.01986" target="_blank" style="text-decoration: none; margin-right: 10px;">
            <button style="background: #b31b1b; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                arXiv Preprint
            </button>
        </a>

        <a href="mailto:hpai@bsc.es?subject=TuRTLe%20leaderboard%20new%20entry&body=Link%20to%20HuggingFace%20Model:" style="text-decoration: none;">
            <button style="background: #00674F; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                How to submit
            </button>
        </a>
        <p style="margin-top: 15px;">If you have any inquiries or wish to collaborate: 
            <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
        </p>
    </div>
    """
    )
    # Introductory text and the latest update notice.
    gr.HTML(
        """
    <div style=" margin-top:-10px !important;">
        <p style="margin-bottom: 15px;  text-align: start !important;">Welcome to the TuRTLe Model Leaderboard! TuRTLe is a <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b> for hardware design.
        Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b> (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs. 
        Use the filters below to explore different RTL benchmarks and models.</p>
    <p style="margin-top: 15px; text-align: start !important; "><span style="font-variant: small-caps; font-weight: bold;">NEW UPDATE (JUNE 2025)</span>: We make our framework open-source on GitHub, and add 7 new recent models! For a total of 40 base and instruct models and 5 RTL benchmarks.</p>
    </div>
    """
    )
    with gr.Tabs():
        # Leaderboard tab: task/benchmark selectors, model filters and the table.
        with gr.Tab("Leaderboard"):
            with gr.Row(equal_height=True):
                with gr.Column():
                    task_radio = gr.Radio(
                        choices=tasks, label="Select Task", value="Spec-to-RTL"
                    )
                with gr.Column():
                    # Initial choices match the default task (Spec-to-RTL);
                    # update_benchmarks_by_task replaces them on task change.
                    benchmark_radio = gr.Radio(
                        choices=["All"] + s2r_benchs,
                        label="Select Benchmark",
                        value="All",
                    )

            with gr.Row(equal_height=True):
                search_box = gr.Textbox(
                    label="Search Model",
                    placeholder="Type model name...",
                    scale=2,
                )
                # Named "dropdown" but rendered as a radio group.
                model_type_dropdown = gr.Radio(
                    choices=model_types,
                    label="Select Model Type",
                    value="All",
                    scale=3,
                )
                # Lower bound comes from the data; the upper bound and the
                # initial value are hard-coded to 700.
                params_slider = gr.Slider(
                    minimum=df["Params"].min(),
                    maximum=700,
                    value=700,
                    label="Max Params",
                    step=1,
                    scale=2,
                )

            # Initial table uses the same defaults as the controls above.
            # NOTE(review): `datatype` names two column types while
            # `column_widths` lists seven widths — confirm how the remaining
            # columns are meant to be typed.
            leaderboard = gr.DataFrame(
                value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
                headers="first row",
                show_row_numbers=True,
                wrap=True,
                datatype=[
                    "markdown",
                    "html",
                ],
                interactive=False,
                column_widths=[
                    "7%",
                    "24%",
                    "17%",
                    "10%",
                    "13%",
                    "10%",
                    "14%",
                ],
                elem_classes="dataframe-leaderboard",
            )

        # Plot View tab: benchmark/metric selectors and the bubble chart.
        with gr.Tab("Plot View"):
            with gr.Row(equal_height=True):
                default_benchmark = s2r_benchs[0]
                bubble_benchmark = gr.Dropdown(
                    choices=benchmarks,
                    label="Select Benchmark",
                    value=default_benchmark,
                    elem_classes="gr-dropdown",
                )
                # NOTE: this rebinds `default_metric`, replacing the value
                # unpacked from read_data() at the top of this block.
                default_metric = non_rtl_metrics[0]
                bubble_metric = gr.Dropdown(
                    choices=non_rtl_metrics,
                    label="Select Metric",
                    value=default_metric,
                )
            with gr.Row(equal_height=True):
                # elem_id is looked up by the JS hooks attached to the plot
                # controls' change events (document.getElementById).
                scatter_plot = gr.Plot(
                    value=generate_scatter_plot(default_benchmark, default_metric),
                    label="Bubble Chart",
                    elem_id="full-width-plot",
                )

        # Metrics Information tab: renders ./static/metrics.md with LaTeX support.
        with gr.Tab("Metrics Information"):
            # Pin the encoding so the file decodes the same way regardless of
            # the host's locale default, and close it before building the UI.
            with open("./static/metrics.md", "r", encoding="utf-8") as file:
                metrics_markdown = file.read()
            gr.Markdown(
                metrics_markdown,
                latex_delimiters=[
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "$", "right": "$", "display": False},
                ],
                elem_classes="metrics-page",
            )
        # About Us tab: static card with the HPAI and BSC logos, a short
        # description, organization links and the contact e-mail.
        with gr.Tab("About Us"):
            gr.HTML(
                """
                <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
                    <div style="display: flex; justify-content: center; align-items: center; gap: 5%; margin-bottom: 20px;">
                        <img src='/gradio_api/file=hpai_logo_grad.png' alt='HPAI Group Logo' style="width: 45%;"/>
                        <img src='/gradio_api/file=bsc-logo.png' alt='BSC Logo' style="width: 25%;"/>
                    </div>

                    <p style="font-size: 16px; text-align: start;">
                        The <b>High-Performance Artificial Intelligence (HPAI)</b> group is part of the 
                        <a href="https://bsc.es/" target="_blank">Barcelona Supercomputing Center (BSC)</a>. 
                        This leaderboard is maintained by HPAI as part of our commitment to <b>open science</b>.
                    </p>

                    <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
                        <li><a href="https://hpai.bsc.es/" target="_blank">HPAI Website</a></li>
                        <li><a href="https://github.com/HPAI-BSC/" target="_blank">HPAI GitHub Organization Page</a></li>
                        <li><a href="https://huggingface.co/HPAI-BSC/" target="_blank">HPAI Hugging Face Organization Page</a></li>
                    </ul>

                    <p style="font-size: 16px; margin-top: 15px;">
                        Feel free to contact us:
                    </p>

                    <p style="font-size: 16px;">Email: <a href="mailto:hpai@bsc.es"><b>hpai@bsc.es</b></a></p>
                </div>
                """
            )
        # References tab: static list of the evaluation harness and the
        # benchmark papers.
        # NOTE(review): the card ends with a "Feel free to contact us:"
        # paragraph that is not followed by any contact details (the About Us
        # card follows the same sentence with an e-mail line) — possibly a
        # copy-paste leftover; confirm.
        with gr.Tab("References"):
            gr.HTML(
            """
            <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
                    <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
                        <li><a href="https://github.com/bigcode-project/bigcode-evaluation-harness" target="_blank">Code Generation LM Evaluation Harness</a></li>
                        <li>RTL-Repo: Allam and M. Shalan, “Rtl-repo: A benchmark for evaluating llms on large-scale rtl design projects,” in 2024 IEEE LLM Aided Design Workshop (LAD). IEEE, 2024, pp. 1–5.</li>
                        <li>VeriGen: S. Thakur, B. Ahmad, H. Pearce, B. Tan, B. Dolan-Gavitt, R. Karri, and S. Garg, “Verigen: A large language model for verilog code generation,” ACM Transactions on Design Automation of Electronic Systems, vol. 29, no. 3, pp. 1–31, 2024. </li>
                        <li>VerilogEval (I): M. Liu, N. Pinckney, B. Khailany, and H. Ren, “Verilogeval: Evaluating large language models for verilog code generation,” in 2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 2023, pp. 1–8.</li>
                        <li>VerilogEval (II): N. Pinckney, C. Batten, M. Liu, H. Ren, and B. Khailany, “Revisiting VerilogEval: A Year of Improvements in Large-Language Models for Hardware Code Generation,” ACM Trans. Des. Autom. Electron. Syst., feb 2025. https://doi.org/10.1145/3718088</li>
                        <li>RTLLM: Y. Lu, S. Liu, Q. Zhang, and Z. Xie, “Rtllm: An open-source benchmark for design rtl generation with large language model,” in 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC). IEEE, 2024, pp. 722–727.</li>
                    </ul>
                    <p style="font-size: 16px; margin-top: 15px;">
                        Feel free to contact us:
                    </p>
                </div>
                """
            )
        # Collapsed citation box with a copy button. By indentation this row is
        # a child of gr.Tabs() but sits outside every gr.Tab.
        with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
                citation_button = gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    lines=10,
                    elem_id="citation-button",
                    show_copy_button=True,
                )

    # Leaderboard event wiring.
    # Changing the task swaps the benchmark choices and redraws the table.
    task_radio.change(
        fn=update_benchmarks_by_task,
        inputs=[task_radio],
        outputs=[benchmark_radio, leaderboard],
    )

    # Every other filter control redraws the table from the same five inputs,
    # so the registrations are generated in a loop (same order as before).
    leaderboard_filter_inputs = [
        task_radio,
        benchmark_radio,
        model_type_dropdown,
        search_box,
        params_slider,
    ]
    for filter_control in (
        benchmark_radio,
        model_type_dropdown,
        search_box,
        params_slider,
    ):
        filter_control.change(
            fn=filter_leaderboard,
            inputs=leaderboard_filter_inputs,
            outputs=leaderboard,
        )

    def on_benchmark_change(benchmark, _):
        """Reset the metric selector and redraw the plot for a new benchmark.

        The second argument (the current metric) is ignored: the metric is
        always reset to the first choice offered for the benchmark.

        Returns:
            A 2-tuple of a ``gr.update`` for the metric dropdown and the new
            figure.

        NOTE(review): for benchmarks other than RTL-Repo the choices are
        ``non_rtl_metrics[:-1]``, i.e. the last entry of that list is left
        out, whereas the dropdown is created with the full list — confirm
        this is intended.
        """
        if benchmark == "RTL-Repo":
            metric_choices = rtl_metrics
            metric = "Exact Matching (EM)"
        else:
            metric_choices = non_rtl_metrics[:-1]
            metric = non_rtl_metrics[0]

        dropdown_update = gr.update(choices=metric_choices, value=metric)
        return dropdown_update, generate_scatter_plot(benchmark, metric)

    def on_metric_change(benchmark, metric):
        """Redraw the plot for a new metric and sync the benchmark selector.

        ``handle_special_cases`` may remap the pair, so the benchmark it
        returns is written back to the benchmark dropdown.

        Returns:
            A 2-tuple of a ``gr.update`` for the benchmark dropdown and the
            new figure.
        """
        resolved_benchmark, resolved_metric = handle_special_cases(benchmark, metric)
        figure = generate_scatter_plot(resolved_benchmark, resolved_metric)
        return gr.update(value=resolved_benchmark), figure

    # Plot event wiring. Both plot controls share one client-side hook: it
    # records the scroll position, restores it once the plot container's
    # children change, and forwards the two input values unchanged.
    keep_scroll_js = """ // this is to avoid resetting user scroll each time a plot is re-generated
        (benchmark, metric) => {
            let scrollY = window.scrollY;  
            const observer = new MutationObserver(() => {
                window.scrollTo(0, scrollY);
                observer.disconnect();
            });
            observer.observe(document.getElementById('full-width-plot'), { childList: true });
            return [benchmark, metric];  
        }
        """

    # A benchmark change resets the metric dropdown and redraws the plot.
    bubble_benchmark.change(
        fn=on_benchmark_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_metric, scatter_plot],
        js=keep_scroll_js,
    )

    # A metric change may adjust the benchmark dropdown and redraws the plot.
    bubble_metric.change(
        fn=on_metric_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_benchmark, scatter_plot],
        js=keep_scroll_js,
    )


# Start the Gradio server. This runs whenever the module is executed or
# imported (there is no `if __name__ == "__main__"` guard). `allowed_paths`
# lists the image files that the HTML above references through
# `/gradio_api/file=...` URLs.
app.launch(
    allowed_paths=[
        "logo.png",
        "hpai_logo_grad.png",
        "bsc-logo.png",
    ]
)