import json
import os
import re
from collections import defaultdict
from statistics import mean

import numpy as np
import pandas as pd
from datasets import load_dataset, concatenate_datasets

from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
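
# Maps each task ID used in the results files to its human-readable task name.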
TASKS_LIST={
'xlni':'Cross-Lingual Natural Language Inference',
'lid':'Language Identification',
'news': 'News Classification',
'sentiment':'Sentiment Analysis',
'topic':'Topic Classification',
'mt_eng2xx':'Machine Translation - English to African',
'mt_fra2xx':'Machine Translation - French to African',
'mt_xx2xx':'Machine Translation - African to African',
'paraphrase':'Paraphrase',
'summary':'Summarization',
'title':'Title Generation',
'mmlu':'General Knowledge',
'mgsm':'Mathematical Word Problems',
'belebele':'Reading Comprehension',
'squad_qa':'Context-based Question Answering',
'ner':'Named Entity Recognition',
'phrase':'Phrase Chunking',
'pos':'Part-of-Speech Tagging',
}
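# Groups the task IDs into the four leaderboard clusters used for display and aggregation.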
CLUSTERS = {
"Text Classification Tasks": [
'xlni', 'lid', 'news', 'sentiment', 'topic',
],
"Text Generation Tasks": [
'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
],
"MCCR Tasks": [
'mmlu', 'mgsm', 'belebele', 'squad_qa',
],
"Tokens Level Tasks": [
'ner', 'phrase', 'pos',
],
}
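# Flat list of every task ID across all clusters.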
ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
# This dictionary maps each task ID to its parent cluster name
TASK_TO_CLUSTER_MAP = {
task: cluster_name
for cluster_name, tasks in CLUSTERS.items()
for task in tasks
}
# ===== Authenticate and Load Data From Private HF Repo =====
def load_private_leaderboard_df():
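    """
    Loads all per-user result files (plus the main results file) from the private
    Hugging Face dataset repo and returns them as a single pandas DataFrame.
    """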
all_repo_files = API.list_repo_files(repo_id=SAHARA_DATA, repo_type="dataset")
folder_path = "data/users/"
jsonl_files_in_folder = [
f for f in all_repo_files
if f.startswith(folder_path) and f.endswith(".jsonl")
]
jsonl_files_in_folder.append(SAHARA_RESULTS)
print("++++++",jsonl_files_in_folder)
ds = load_dataset(
path=SAHARA_DATA,
name=None,
data_files=jsonl_files_in_folder,
split="train",
download_mode="force_redownload"
)
print(">>>>>>>", ds)
return ds.to_pandas()
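
# Maps metric IDs stored in the results to their display names.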
metrics_list={
'bleu_1k':'spBleu1K',
'accuracy':'Accuracy',
'f1':'Macro-F1',
'exact_match':'Exact Match',
'rougeL':'RougeL',
}
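# Maps ISO 639-3 language codes to display names (some languages appear under more than one code).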
LANG_ISO2NAME = {
'eng': 'English',
'fra': 'French',
# 'ara': 'Arabic',
'amh': 'Amharic',
'ewe': 'Ewe',
'hau': 'Hausa',
'ibo': 'Igbo',
'kin': 'Kinyarwanda',
'lin': 'Lingala',
'lug': 'Ganda',
'orm': 'Oromo',
'sna': 'Shona',
'sot': 'Southern Sotho',
'swa': 'Swahili', 'swh': 'Swahili',
'twi': 'Twi',
'wol': 'Wolof',
'xho': 'Xhosa',
'yor': 'Yoruba',
'zul': 'Zulu',
'afr': 'Afrikaans',
'run': 'Rundi',
'tir': 'Tigrinya',
'som': 'Somali',
'pcm': 'Nigerian Pidgin',
'teo': 'Teso',
    'nyn': 'Nyankore',  # also known as Nyankole
'lgg': 'Lugbara',
    'bem': 'Bemba',  # also known as Chibemba
'tsn': 'Tswana',
'bbj': 'Ghomálá',
'mos': 'Moore',
'bam': 'Bambara',
'fon': 'Fon',
'ach': 'Acholi',
'nso': 'Sepedi',
'tso': 'Tsonga',
    'fuv': 'Fulfulde Nigeria',
    'gaz': 'Oromo',  # West Central Oromo
'kea': 'Kabuverdianu',
'nya': 'Nyanja',
'ssw': 'Swati',
    'luo': 'Dholuo',  # also known as Luo
'ven': 'Venda',
'kir':"Kirundi",
}
# ===== Build Language Name→ISOs map =====
def build_langname_to_isos(iso2name):
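    """
    Inverts an ISO->name mapping into a name->set-of-ISO-codes mapping, so that
    languages listed under multiple ISO codes (e.g. Swahili) can be grouped together.
    """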
name2isos = defaultdict(set)
for iso, name in iso2name.items():
name2isos[name].add(iso)
return name2isos
def compare_models(model_1_name, model_2_name):
"""
Prepares a DataFrame comparing the performance of two models task-by-task.
"""
if model_1_name == model_2_name:
return pd.DataFrame([{"Info": "Please select two different models to compare."}])
# Get data for each model from the main leaderboard results
df1 = all_df[(all_df['model'] == model_1_name) & (all_df['leaderboard'] == 'main')][['task', 'score', 'metric']].rename(columns={'score': model_1_name})
df2 = all_df[(all_df['model'] == model_2_name) & (all_df['leaderboard'] == 'main')][['task', 'score']].rename(columns={'score': model_2_name})
if df1.empty or df2.empty:
return pd.DataFrame([{"Info": "One or both selected models have no 'main' leaderboard data to compare."}])
# Merge the two dataframes on the task ID
comp_df = pd.merge(df1, df2, on='task', how='outer')
# Add descriptive columns
comp_df['Cluster'] = comp_df['task'].map(TASK_TO_CLUSTER_MAP)
comp_df['Task Name'] = comp_df['task'].map(TASKS_LIST)
comp_df['Metric'] = comp_df['metric'].map(metrics_list)
comp_df.fillna({'Cluster': 'Uncategorized'}, inplace=True)
# Calculate the score difference, ensuring scores are numeric
score1 = pd.to_numeric(comp_df[model_1_name], errors='coerce')
score2 = pd.to_numeric(comp_df[model_2_name], errors='coerce')
comp_df['Difference'] = score1 - score2
    # Format the difference with an explicit sign (a leading '+' means model 1 scored higher)
    def format_diff(d):
        if pd.isna(d):
            return "---"
        if d > 0.001:  # Model 1 is better
            return f"+{d:.2f}"
        # Model 2 is better, or the scores are effectively tied
        return f"{d:.2f}"
# Format all score columns
comp_df[model_1_name] = comp_df[model_1_name].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
comp_df[model_2_name] = comp_df[model_2_name].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
comp_df['Difference'] = comp_df['Difference'].apply(format_diff)
    # Keep the raw task ID alongside the descriptive columns
final_cols = ['Cluster', 'Task Name', 'task', 'Metric', model_1_name, model_2_name, 'Difference']
comp_df = comp_df[final_cols]
comp_df = comp_df.sort_values(by=['Cluster', 'Task Name']).reset_index(drop=True)
    # Rename the raw 'task' column to 'Task ID' for display
comp_df.rename(columns={'task': 'Task ID'}, inplace=True)
return comp_df
def get_model_table(model_name):
"""
Generates a performance table for a specific model, showing cluster, task, and score.
The table is sorted by Cluster and then by Task Name.
"""
# Filter for the selected model and only 'main' leaderboard entries
model_df = all_df[(all_df['model'] == model_name) & (all_df['leaderboard'] == 'main')].copy()
if model_df.empty:
return pd.DataFrame([{"Info": f"No 'main' leaderboard data available for the model: {model_name}"}])
    # Add the cluster name for each task using the task-to-cluster map
model_df['Cluster'] = model_df['task'].map(TASK_TO_CLUSTER_MAP)
# Create other descriptive columns
model_df['Task Name'] = model_df['task'].map(TASKS_LIST)
model_df['Metric'] = model_df['metric'].map(metrics_list)
model_df['Score'] = model_df['score'].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    # Select the final columns, including the cluster and the raw task ID
table = model_df[['Cluster', 'Task Name', 'task', 'Metric', 'Score']].rename(columns={'task': 'Task ID'})
    # Sort by cluster first, then by task name
table = table.sort_values(by=['Cluster', 'Task Name']).reset_index(drop=True)
# Handle cases where a task might not be in a cluster
    table['Cluster'] = table['Cluster'].fillna('Uncategorized')
return table
def get_task_leaderboard(task_key):
"""
Generates a leaderboard for a specific task, showing model performance across all languages.
"""
# Filter the main DataFrame for the selected task
task_df = all_df[all_df['task'] == task_key].copy()
if task_df.empty:
return pd.DataFrame([{"Info": f"No data available for the task: {TASKS_LIST.get(task_key, task_key)}"}])
# Get the metric for this task to display later
metric_name = metrics_list.get(task_df['metric'].iloc[0], '')
# Create a user-friendly column name for each language/leaderboard
def make_lang_col(row):
lb = row['leaderboard']
if lb == 'main':
# Skip the 'main' leaderboard for task-specific views as it's an aggregate
return None
if '-' in lb:
pair_lang = lb.split('-')
# Handles cases where an ISO code might not be in our map
src_lang = LANG_ISO2NAME.get(pair_lang[0], pair_lang[0])
tgt_lang = LANG_ISO2NAME.get(pair_lang[1], pair_lang[1])
return f"{src_lang} to {tgt_lang}"
else:
return LANG_ISO2NAME.get(lb, lb)
if task_key not in ['lid']:
task_df['lang_col'] = task_df.apply(make_lang_col, axis=1)
task_df.dropna(subset=['lang_col'], inplace=True) # Remove rows where lang_col is None
if task_df.empty:
return pd.DataFrame([{"Info": f"No language-specific data for the task: {TASKS_LIST.get(task_key, task_key)}"}])
# Pivot the table to have models as rows and languages as columns
table = task_df.pivot_table(index='model', columns='lang_col', values='score', aggfunc='mean').reset_index()
else:
table = task_df.pivot_table(index='model', columns='task', values='score', aggfunc='mean').reset_index()
score_cols = [col for col in table.columns if col != 'model']
for col in score_cols:
table[col] = table[col].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float)) else x)
main_score_map = all_df[(all_df['task'] == task_key) & (all_df['leaderboard'] == 'main')].set_index('model')['score']
table.insert(1, 'Task Score', table['model'].map(main_score_map).apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))
# Add ranking medals based on the "Task Score"
table = add_medals_to_models(table, score_col="Task Score")
# Rename columns to be more descriptive, including the metric
# rename_cols = {col: f"{col}
Metric: {metric_name}" for col in score_cols}
if task_key in ['belebele', 'ner', 'mgsm', 'mmlu']:
# rename_cols = {col: f"