import json
import os

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import EvalQueueColumn
# from src.leaderboard.read_evals import get_raw_eval_results
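
# Illustrative input for get_rag_leaderboard_df (hypothetical values; the real
# CSV is whatever csv_path points at, with at least these columns):
#
#   Models,rag1,rag2,rag3
#   model-a,1.234,4.567,7.891
#   model-b,2.345,5.678,8.912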

def get_rag_leaderboard_df(csv_path):
    """Load the RAG hallucination-rate CSV and format it for display."""
    df = pd.read_csv(csv_path)

    for col in ["rag1", "rag2", "rag3"]:
        df[col] = pd.to_numeric(df[col], errors="coerce").round(2)

    pretty = {
        "Models": "Models",
        "rag1": "Context in System Prompt (%)",
        "rag2": "Context and Question Single-Turn (%)",
        "rag3": "Context and Question Two-Turns (%)",
    }
    df = df.rename(columns=pretty)

    # sort so the lowest Single-Turn hallucination rate appears first
    df = (
        df.sort_values("Context and Question Single-Turn (%)", ascending=True)
          .reset_index(drop=True)
    )
    
    return df
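
# Illustrative input for get_leaderboard_df (hypothetical values; at minimum
# the CSV needs these columns):
#
#   Models,ha_rag_rate,ha_non_rag_rate
#   model-a,3.21,6.54
#   model-b,4.32,7.65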

def get_leaderboard_df(results_path):
    """Load the main hallucination results CSV, rank models, and format for display."""
    df = pd.read_csv(results_path)
    # numeric formatting
    df["ha_rag_rate"] = df["ha_rag_rate"].round(2)
    df["ha_non_rag_rate"] = df["ha_non_rag_rate"].round(2)

    # --- map to pretty headers just before returning ---
    pretty = {
        "Models": "Models",
        "ha_rag_rate": "RAG Hallucination Rate (%)",
        "ha_non_rag_rate": "Non-RAG Hallucination Rate (%)",
    }
    df = df.rename(columns=pretty)  # this is what the UI will use
    # ----------- Average column & ranking ---------------------------------------------
    df["Average Hallucination Rate (%)"] = df[
        ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]
    ].mean(axis=1).round(2)

    # sort so *lower* average = better (true leaderboard style)
    df = df.sort_values("Average Hallucination Rate (%)", ascending=True).reset_index(drop=True)

    # # Rank & medal
    medal_map  = {1: "🥇", 2: "🥈", 3: "🥉"} 

    def medal_html(rank):
        """Return an HTML span with the medal icon for the top 3 ranks.



        The numeric rank is stored in the data-order attribute equal to the numerical rank so that

        DataTables (used under-the-hood by the gradio_leaderboard component)

        can sort the column by this hidden numeric value while still

        displaying the pretty medal icon. For ranks > 3 we just return the

        integer so the column remains fully numeric.

        """
        medal = medal_map.get(rank)
        if medal:
            # Prepend a hidden numeric span so string sorting still works numerically.
            return (
                f'<span style="display:none">{rank:04}</span>'  # zero-padded for stable string sort
                f'<span style="font-size:2.0rem;">{medal}</span>'
            )
        # For other ranks, also zero-pad to keep width and ensure proper string sort
        return f'<span style="display:none">{rank:04}</span>{rank}'
    
    df["Rank"] = df.index + 1
    df["Rank"] = df["Rank"].apply(medal_html)


    # ----------- column ordering ------------------------------------------------------
    df = df[[
        "Rank",                 # pretty column user sees
        "Models",
        "Average Hallucination Rate (%)",
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)",
    ]]

    return df  




def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create the dataframes for the finished, running, and pending evaluation queue requests."""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
            folder = os.path.join(save_path, entry)
            sub_entries = [e for e in os.listdir(folder) if os.path.isfile(os.path.join(folder, e)) and not e.startswith(".")]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
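

if __name__ == "__main__":
    # Minimal smoke test (illustrative only: the CSV paths below are
    # placeholders and must point at real result files for this to run).
    rag_df = get_rag_leaderboard_df("rag_results.csv")  # hypothetical path
    print(rag_df.head())

    main_df = get_leaderboard_df("results.csv")  # hypothetical path
    print(main_df.head())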