import json
import os
import re
from collections import defaultdict
from statistics import mean

import numpy as np
import pandas as pd
from datasets import load_dataset, concatenate_datasets

from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
# Task IDs (as they appear in the results data) mapped to display names.
# Note: 'xlni' matches the task ID used in the results data, not the usual 'xnli' spelling.
TASKS_LIST = {
    'xlni': 'Cross-Lingual Natural Language Inference',
    'lid': 'Language Identification',
    'news': 'News Classification',
    'sentiment': 'Sentiment Analysis',
    'topic': 'Topic Classification',
    'mt_eng2xx': 'Machine Translation - English to African',
    'mt_fra2xx': 'Machine Translation - French to African',
    'mt_xx2xx': 'Machine Translation - African to African',
    'paraphrase': 'Paraphrase',
    'summary': 'Summarization',
    'title': 'Title Generation',
    'mmlu': 'General Knowledge',
    'mgsm': 'Mathematical Word Problems',
    'belebele': 'Reading Comprehension',
    'squad_qa': 'Context-based Question Answering',
    'ner': 'Named Entity Recognition',
    'phrase': 'Phrase Chunking',
    'pos': 'Part-of-Speech Tagging',
}
CLUSTERS = {
    "Text Classification Tasks": [
        'xlni', 'lid', 'news', 'sentiment', 'topic',
    ],
    "Text Generation Tasks": [
        'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
    ],
    "MCCR Tasks": [
        'mmlu', 'mgsm', 'belebele', 'squad_qa',
    ],
    "Tokens Level Tasks": [
        'ner', 'phrase', 'pos',
    ],
}
ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
# This dictionary maps each task ID to its parent cluster name
TASK_TO_CLUSTER_MAP = {
    task: cluster_name
    for cluster_name, tasks in CLUSTERS.items()
    for task in tasks
}
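# e.g. TASK_TO_CLUSTER_MAP['ner'] -> 'Token-Level Tasks'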
# ===== Authenticate and Load Data From Private HF Repo =====

def load_private_leaderboard_df():
    """Load every per-user results file under data/users/, plus the official
    results file, from the private HF dataset repo into one DataFrame."""
    all_repo_files = API.list_repo_files(repo_id=SAHARA_DATA, repo_type="dataset")
    folder_path = "data/users/"
    jsonl_files_in_folder = [
        f for f in all_repo_files
        if f.startswith(folder_path) and f.endswith(".jsonl")
    ]
    jsonl_files_in_folder.append(SAHARA_RESULTS)
    print("Loading result files:", jsonl_files_in_folder)
    ds = load_dataset(
        path=SAHARA_DATA,
        data_files=jsonl_files_in_folder,
        split="train",
        download_mode="force_redownload",
    )
    print("Loaded dataset:", ds)
    return ds.to_pandas()
metrics_list = {
    'bleu_1k': 'spBleu<sup>1K</sup>',
    'accuracy': 'Accuracy',
    'f1': 'Macro-F1',
    'exact_match': 'Exact Match',
    'rougeL': 'RougeL',
}
LANG_ISO2NAME = {
    'eng': 'English',
    'fra': 'French',
    # 'ara': 'Arabic',
    'amh': 'Amharic',
    'ewe': 'Ewe',
    'hau': 'Hausa',
    'ibo': 'Igbo',
    'kin': 'Kinyarwanda',
    'lin': 'Lingala',
    'lug': 'Ganda',
    'orm': 'Oromo',
    'sna': 'Shona',
    'sot': 'Southern Sotho',
    'swa': 'Swahili',
    'swh': 'Swahili',
    'twi': 'Twi',
    'wol': 'Wolof',
    'xho': 'Xhosa',
    'yor': 'Yoruba',
    'zul': 'Zulu',
    'afr': 'Afrikaans',
    'run': 'Rundi',
    'tir': 'Tigrinya',
    'som': 'Somali',
    'pcm': 'Nigerian Pidgin',
    'teo': 'Teso',
    'nyn': 'Nyankore',  # also spelled Nyankole
    'lgg': 'Lugbara',
    'bem': 'Bemba',  # also known as Chibemba
    'tsn': 'Tswana',
    'bbj': 'Ghomálá',
    'mos': 'Moore',
    'bam': 'Bambara',
    'fon': 'Fon',
    'ach': 'Acholi',
    'nso': 'Sepedi',
    'tso': 'Tsonga',
    'fuv': 'Nigerian Fulfulde',
    'gaz': 'Oromo',  # West Central Oromo
    'kea': 'Kabuverdianu',
    'nya': 'Nyanja',
    'ssw': 'Swati',
    'luo': 'Dholuo',  # also known as Luo
    'ven': 'Venda',
    'kir':"Kirundi",
}

# ===== Build Language Name→ISOs map =====
def build_langname_to_isos(iso2name):
    """Invert the ISO→name map; one display name can cover several ISO codes."""
    name2isos = defaultdict(set)
    for iso, name in iso2name.items():
        name2isos[name].add(iso)
    return name2isos
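# e.g. build_langname_to_isos({'swa': 'Swahili', 'swh': 'Swahili'})['Swahili'] == {'swa', 'swh'}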

def compare_models(model_1_name, model_2_name):
    """
    Prepares a DataFrame comparing the performance of two models task-by-task.
    """
    if model_1_name == model_2_name:
        return pd.DataFrame([{"Info": "Please select two different models to compare."}])

    # Get data for each model from the main leaderboard results
    df1 = all_df[(all_df['model'] == model_1_name) & (all_df['leaderboard'] == 'main')][['task', 'score', 'metric']].rename(columns={'score': model_1_name})
    df2 = all_df[(all_df['model'] == model_2_name) & (all_df['leaderboard'] == 'main')][['task', 'score']].rename(columns={'score': model_2_name})

    if df1.empty or df2.empty:
        return pd.DataFrame([{"Info": "One or both selected models have no 'main' leaderboard data to compare."}])

    # Merge the two dataframes on the task ID
    comp_df = pd.merge(df1, df2, on='task', how='outer')

    # Add descriptive columns
    comp_df['Cluster'] = comp_df['task'].map(TASK_TO_CLUSTER_MAP)
    comp_df['Task Name'] = comp_df['task'].map(TASKS_LIST)
    comp_df['Metric'] = comp_df['metric'].map(metrics_list)
    comp_df.fillna({'Cluster': 'Uncategorized'}, inplace=True)
    
    # Calculate the score difference, ensuring scores are numeric
    score1 = pd.to_numeric(comp_df[model_1_name], errors='coerce')
    score2 = pd.to_numeric(comp_df[model_2_name], errors='coerce')
    comp_df['Difference'] = score1 - score2

    # Format the difference column with colors
    def format_diff(d):
        if pd.isna(d):
            return "---"
        if d > 0.001:  # Model 1 is better
            return f"<span style='color:green !important; font-weight:bold !important;'>+{d:.2f}</span>"
        elif d < -0.001:  # Model 2 is better
            return f"<span style='color:red !important; font-weight:bold !important;'>{d:.2f}</span>"
        else:
            return f"{d:.2f}"

    # Format all score columns
    comp_df[model_1_name] = comp_df[model_1_name].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    comp_df[model_2_name] = comp_df[model_2_name].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    comp_df['Difference'] = comp_df['Difference'].apply(format_diff)

    # Keep the raw task ID alongside the display columns
    final_cols = ['Cluster', 'Task Name', 'task', 'Metric', model_1_name, model_2_name, 'Difference']
    comp_df = comp_df[final_cols]
    comp_df = comp_df.sort_values(by=['Cluster', 'Task Name']).reset_index(drop=True)

    # Rename the raw 'task' column to 'Task ID' for display
    comp_df.rename(columns={'task': 'Task ID'}, inplace=True)

    return comp_df
    
def get_model_table(model_name):
    """
    Generates a performance table for a specific model, showing cluster, task, and score.
    The table is sorted by Cluster and then by Task Name.
    """
    # Filter for the selected model and only 'main' leaderboard entries
    model_df = all_df[(all_df['model'] == model_name) & (all_df['leaderboard'] == 'main')].copy()

    if model_df.empty:
        return pd.DataFrame([{"Info": f"No 'main' leaderboard data available for the model: {model_name}"}])

    # Add the cluster name column using the task→cluster map
    model_df['Cluster'] = model_df['task'].map(TASK_TO_CLUSTER_MAP)
    # Label unmapped tasks before sorting so they group together instead of sorting as NaN
    model_df['Cluster'] = model_df['Cluster'].fillna('Uncategorized')

    # Create other descriptive columns
    model_df['Task Name'] = model_df['task'].map(TASKS_LIST)
    model_df['Metric'] = model_df['metric'].map(metrics_list)
    model_df['Score'] = model_df['score'].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")

    table = model_df[['Cluster', 'Task Name', 'task', 'Metric', 'Score']].rename(columns={'task': 'Task ID'})

    # Sort by cluster first, then by task name
    table = table.sort_values(by=['Cluster', 'Task Name']).reset_index(drop=True)

    return table
    
def get_task_leaderboard(task_key):
    """
    Generates a leaderboard for a specific task, showing model performance across all languages.
    """
    # Filter the main DataFrame for the selected task
    task_df = all_df[all_df['task'] == task_key].copy()

    if task_df.empty:
        return pd.DataFrame([{"Info": f"No data available for the task: {TASKS_LIST.get(task_key, task_key)}"}])


    # Create a user-friendly column name for each language/leaderboard
    def make_lang_col(row):
        lb = row['leaderboard']
        if lb == 'main':
            # Skip the 'main' leaderboard for task-specific views as it's an aggregate
            return None
        if '-' in lb:
            pair_lang = lb.split('-')
            # Handles cases where an ISO code might not be in our map
            src_lang = LANG_ISO2NAME.get(pair_lang[0], pair_lang[0])
            tgt_lang = LANG_ISO2NAME.get(pair_lang[1], pair_lang[1])
            return f"{src_lang} to {tgt_lang}"
        else:
            return LANG_ISO2NAME.get(lb, lb)
    if task_key != 'lid':
        task_df['lang_col'] = task_df.apply(make_lang_col, axis=1)
        task_df.dropna(subset=['lang_col'], inplace=True) # Remove rows where lang_col is None
    
        if task_df.empty:
            return pd.DataFrame([{"Info": f"No language-specific data for the task: {TASKS_LIST.get(task_key, task_key)}"}])

        # Pivot the table to have models as rows and languages as columns
        table = task_df.pivot_table(index='model', columns='lang_col', values='score', aggfunc='mean').reset_index()
    else:
        # 'lid' is shown as a single aggregate column rather than per-language
        table = task_df.pivot_table(index='model', columns='task', values='score', aggfunc='mean').reset_index()
    
    score_cols = [col for col in table.columns if col != 'model']
    for col in score_cols:
        # Show '---' for missing (NaN) scores instead of the string 'nan'
        table[col] = table[col].apply(lambda x: "---" if pd.isna(x) else (f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x))
    main_score_map = all_df[(all_df['task'] == task_key) & (all_df['leaderboard'] == 'main')].set_index('model')['score']
    table.insert(1, 'Task Score', table['model'].map(main_score_map).apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))

    # Add ranking medals based on the "Task Score"
    table = add_medals_to_models(table, score_col="Task Score")
    
    # Rotate the headers of wide tables (many language columns) to save width
    if task_key in ['belebele', 'ner', 'mgsm', 'mmlu']:
        rename_cols = {col: f"<div class='rotate_div'><br>{col}</div>" for col in score_cols}
    else:
        rename_cols = {col: col for col in score_cols}
    table.rename(columns=rename_cols, inplace=True)

    return table
    
def get_task_metric_map(df):
    """Map each task ID to its metric name (last occurrence wins)."""
    return dict(zip(df["task"], df["metric"]))

def cluster_average(row, tasks):
    """Average the numeric scores in `row` over `tasks`, skipping '---' and NaN."""
    vals = []
    for t in tasks:
        try:
            v = float(row[t])
        except (TypeError, ValueError):
            continue
        if not np.isnan(v):
            vals.append(v)
    return np.mean(vals) if vals else np.nan
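# e.g. cluster_average({'ner': '81.20', 'phrase': '78.00', 'pos': '---'}, ['ner', 'phrase', 'pos'])
# averages only the parsable scores and returns 79.6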

def add_medals_to_models(df, score_col="overall score"):
    """Sort models by `score_col` (descending) and prefix the top three unique scores with medals."""
    score_float_col = "__score_float"
    df[score_float_col] = df[score_col].apply(lambda x: float(x) if x != "---" else np.nan)
    df = df.sort_values(by=score_float_col, ascending=False, kind="mergesort").reset_index(drop=True)

    def get_rank_symbols(scores):
        unique_scores = sorted(set([s for s in scores if not pd.isna(s)]), reverse=True)
        symbols = ["🏆", "🥈", "🥉"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]

    df['rank_symbol'] = get_rank_symbols(df[score_float_col].tolist())
    df['model'] = df['rank_symbol'] + ' ' + df['model']
    df = df.drop(columns=['rank_symbol', score_float_col])
    return df
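# Note: tied scores share a medal, since medals are assigned per unique score value.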

def format_cluster_table(df, cluster_tasks, metric_map):
    df = df.copy()  # work on a copy so the shared per-task table is not mutated
    col_order = ["model"] + cluster_tasks
    for t in cluster_tasks:
        if t not in df.columns:
            df[t] = np.nan
    df = df[col_order]
    # Compute the cluster average before formatting; missing scores are skipped
    df["Cluster Score"] = df[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    for t in cluster_tasks:
        df[t] = df[t].apply(lambda x: "---" if pd.isna(x) else (f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x))
    df["Cluster Score"] = df["Cluster Score"].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    df = df[["model", "Cluster Score"] + cluster_tasks]
    rename = {t: f"{TASKS_LIST[t]}<br>Metric: {metrics_list.get(metric_map.get(t, ''), '')}" for t in cluster_tasks}
    df = df.rename(columns=rename)
    df = add_medals_to_models(df, score_col="Cluster Score")
    return df

def format_main_overall_table(df, metric_map):
    main = df.copy()
    for cname, tasks in CLUSTERS.items():
        main[cname] = main[tasks].apply(lambda row: cluster_average(row, tasks), axis=1)
    cluster_cols = list(CLUSTERS.keys())
    # Overall score: mean of the available cluster averages (NaN clusters are skipped)
    main["Overall Score"] = main[cluster_cols].mean(axis=1)
    for c in cluster_cols + ["Overall Score"]:
        main[c] = main[c].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    main = main[["model", "Overall Score"] + cluster_cols]
    main = add_medals_to_models(main, score_col="Overall Score")
    main.rename(columns={'Overall Score': 'Sahara Score'}, inplace=True)
    return main

def load_leaderboards():
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        return cluster_tabs, main_overall_tab, df, metric_map
    main_tasks_df = main_df.pivot_table(index='model', columns='task', values='score').reset_index()
    # Make sure every known task has a column before building the tables
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan
    cluster_tabs = {}
    for cname, tasks in CLUSTERS.items():
        cluster_tabs[cname] = format_cluster_table(main_tasks_df, tasks, metric_map)
    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map

def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    """Render a DataFrame as HTML, dropping any internal column whose name
    contains 'task' (the width parameters are currently unused)."""
    drop_cols = [col for col in df.columns if "task" in col]
    df = df.drop(columns=drop_cols, errors="ignore")
    df.columns.name = None
    html = df.to_html(index=False, escape=False)
    return html



cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()

LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)
# Show only African languages (exclude English and French from the dropdown)
LANG_NAME_LIST = sorted([lang for lang in LANGNAME2ISOS.keys() if lang not in ['English', 'French']])
# Dropdown choices in the format "Task Name (task_id)"
TASK_NAME_LIST = sorted([f"{name} ({key})" for key, name in TASKS_LIST.items()])
TASK_NAME2KEY = {v: k for k, v in TASKS_LIST.items()}

# Get the list of unique model names for the new dropdown
MODEL_NAME_LIST = sorted(all_df['model'].unique()) if not all_df.empty else []

def get_lang_table(lang_name):
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])
    # Find all leaderboards containing any ISO in this language group
    pattern = re.compile(r"(^|-)(" + "|".join(re.escape(iso) for iso in iso_codes) + r")(-|$)")
    matched_langs = [lb for lb in all_df['leaderboard'].unique() if lb not in ['main'] and pattern.search(lb)]
    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])
    def make_task_col(row):
        lb = row['leaderboard']
        task = row['task']
        metric = row['metric']
        if '-' in lb:
            pair_lang = lb.split('-')
            # Fall back to the raw ISO code if it is not in the name map
            src_lang = LANG_ISO2NAME.get(pair_lang[0], pair_lang[0])
            tgt_lang = LANG_ISO2NAME.get(pair_lang[1], pair_lang[1])
            return f"{TASKS_LIST[task]} <br> {src_lang} to {tgt_lang} <br> Metric: {metrics_list[metric]}"
        else:
            return f"{TASKS_LIST[task]} <br> Metric: {metrics_list[metric]}"
    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    table = lang_df.pivot_table(index='model', columns='task_col', values='score').reset_index()
    score_cols = [col for col in table.columns if col != 'model']
    for col in score_cols:
        # Show '---' for missing (NaN) scores instead of the string 'nan'
        table[col] = table[col].apply(lambda x: "---" if pd.isna(x) else (f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x))
    def avg_score(row):
        vals = []
        for col in score_cols:
            try:
                v = float(row[col])
                vals.append(v)
            except Exception:
                continue
        return np.mean(vals) if vals else np.nan
    table.insert(1, 'Language Score', table.apply(avg_score, axis=1).apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))
    # Rank and medal the models by Language Score (same helper as the other tables)
    table = add_medals_to_models(table, score_col="Language Score")
    return table
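
# Minimal manual smoke test, assuming the environment has credentials for the
# private HF repo (the module already builds all tables at import time above).
if __name__ == "__main__":
    print(main_overall_tab.head())
    if MODEL_NAME_LIST:
        print(get_model_table(MODEL_NAME_LIST[0]).head())
    if len(MODEL_NAME_LIST) >= 2:
        print(compare_models(MODEL_NAME_LIST[0], MODEL_NAME_LIST[1]).head())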