"""Leaderboard backend for Russian ASR model evaluation.

Stores submitted WER/CER results in a private Hugging Face dataset repo,
renders the leaderboard as an HTML table, and provides the WER/CER
computation utilities used for scoring.
"""

import json
import re
from statistics import mean

import pandas as pd
from datasets import Dataset, load_dataset
from datasets.data_files import EmptyDatasetError
from huggingface_hub import HfApi, create_repo

from constants import (
    REPO_ID,
    HF_TOKEN,
    DATASETS,
    SHORT_DATASET_NAMES,
    DATASET_DESCRIPTIONS,
)

api = HfApi(token=HF_TOKEN)

# Substrings that mark a license string as open/permissive. Matching is
# case-insensitive substring containment (see is_open_license).
OPEN_LICENSE_KEYWORDS = {
    "mit", "apache", "apache-2", "apache-2.0", "bsd", "bsd-2", "bsd-3",
    "bsd-2-clause", "bsd-3-clause", "isc", "mpl", "mpl-2.0", "lgpl",
    "lgpl-2.1", "lgpl-3.0", "gpl", "gpl-2.0", "gpl-3.0", "agpl", "agpl-3.0",
    "epl", "epl-2.0", "cddl", "cddl-1.0", "cddl-1.1", "bsl", "bsl-1.0",
    "boost", "zlib", "unlicense", "artistic-2.0", "cc0", "cc0-1.0", "cc-by",
    "cc-by-3.0", "cc-by-4.0", "cc-by-sa", "cc-by-sa-3.0", "cc-by-sa-4.0",
    "openrail", "openrail-m", "bigscience openrail", "bigscience openrail-m",
    "open-source", "opensource", "open source",
}

# Substrings that mark a license as restrictive. Checked BEFORE the open set,
# so e.g. "cc-by-nc" is restrictive even though it contains "cc-by".
RESTRICTIVE_LICENSE_KEYWORDS = {
    "cc-by-nc", "cc-by-nc-sa", "cc-nc", "nc-sa", "nc-nd", "cc-by-nd",
    "cc-nd", "no-derivatives", "no derivatives", "non-commercial",
    "noncommercial", "research-only", "research only", "llama", "llama-2",
    "community license", "proprietary", "closed", "unknown", "custom",
}


def is_open_license(license_str: str) -> bool:
    """Return True if *license_str* looks like an open/permissive license.

    Restrictive keywords take priority over open ones; an empty or None
    value is treated as closed.
    """
    s = (str(license_str) if license_str is not None else "").strip().lower()
    if not s:
        return False
    if any(pat in s for pat in RESTRICTIVE_LICENSE_KEYWORDS):
        return False
    return any(pat in s for pat in OPEN_LICENSE_KEYWORDS)


def init_repo():
    """Ensure the leaderboard dataset repo exists, creating it if missing."""
    try:
        api.repo_info(REPO_ID, repo_type="dataset")
    # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit pass
    # through. Any API failure (typically "repo not found") triggers creation.
    except Exception:
        create_repo(REPO_ID, repo_type="dataset", private=True, token=HF_TOKEN)


def _leaderboard_columns():
    """Column schema of the leaderboard dataset (shared by load/submit)."""
    return (
        ["model_name", "link", "license", "overall_wer", "overall_cer"]
        + [f"wer_{ds}" for ds in DATASETS]
        + [f"cer_{ds}" for ds in DATASETS]
    )


def load_data():
    """Fetch the leaderboard dataset and render it as an HTML table.

    Returns an HTML string: a populated, styled table when rows exist, or a
    header-only skeleton table otherwise.
    """
    columns = _leaderboard_columns()
    try:
        dataset = load_dataset(REPO_ID, token=HF_TOKEN)
        df = dataset["train"].to_pandas()
    except EmptyDatasetError:
        df = pd.DataFrame(columns=columns)

    if df.empty:
        # Empty state: render just the headers so the page layout is stable.
        # NOTE(review): original markup was lost in extraction; this skeleton
        # preserves the visible header texts — confirm styling/classes.
        header_cells = (
            "<th>Ранг</th><th>Модель</th><th>Тип модели</th>"
            "<th>Средний WER ⬇️</th><th>Средний CER ⬇️</th>"
            + "".join(f"<th>{short}</th>" for short in SHORT_DATASET_NAMES)
        )
        return (
            '<table class="display cell-border compact stripe">'
            f"<thead><tr>{header_cells}</tr></thead><tbody></tbody></table>"
        )

    # Rank by mean WER (lower is better); rank becomes the first column.
    df = df.sort_values("overall_wer").reset_index(drop=True)
    df.insert(0, "rank", df.index + 1)

    # Convert every metric from a 0..1 fraction to a percentage.
    metric_cols = (
        ["overall_wer", "overall_cer"]
        + [f"wer_{ds}" for ds in DATASETS]
        + [f"cer_{ds}" for ds in DATASETS]
    )
    for col in metric_cols:
        df[col] = (df[col] * 100).round(2)

    # Best (lowest) WER per dataset, used to highlight the leading cell.
    best_values = {ds: df[f"wer_{ds}"].min() for ds in DATASETS}

    # Replace raw per-dataset columns with one formatted HTML cell each.
    # NOTE(review): the original cell markup was lost in extraction; the
    # span/class below is a best-effort reconstruction — confirm CSS hooks.
    for short_ds, ds in zip(SHORT_DATASET_NAMES, DATASETS):
        best = best_values[ds]
        df[short_ds] = [
            f'<span class="{"best-wer" if w == best else ""}">{w:.2f}%</span>'
            for w in df[f"wer_{ds}"]
        ]
        df = df.drop(columns=[f"wer_{ds}", f"cer_{ds}"])

    # Model name becomes a link to the model page; the raw link column goes.
    df["model_name"] = [
        f'<a href="{link}" target="_blank">{name}</a>'
        for link, name in zip(df["link"], df["model_name"])
    ]
    df = df.drop(columns=["link"])

    df["license"] = df["license"].apply(
        lambda x: "Открытая" if is_open_license(x) else "Закрытая"
    )

    # Medal emoji for the podium, plain numbers below it.
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    df["rank"] = df["rank"].apply(lambda r: medals.get(r, str(r)))

    df.rename(
        columns={
            "overall_wer": "Средний WER ⬇️",
            "overall_cer": "Средний CER ⬇️",
            "license": "Тип модели",
            "model_name": "Модель",
            "rank": "Ранг",
        },
        inplace=True,
    )

    # escape=False keeps the injected <a>/<span> markup intact.
    table_html = df.to_html(
        escape=False, index=False, classes="display cell-border compact stripe"
    )
    return f'<div class="leaderboard-table">{table_html}</div>'


def process_submit(json_str):
    """Validate a submission JSON, append it to the dataset, refresh the table.

    Expected JSON: {"model_name", "link", "license", "metrics": {ds: {"wer",
    "cer"}}} with one metrics entry per dataset in DATASETS (values 0..1).

    Returns a (updated_html, status_message, textbox_value) triple; on any
    failure the HTML is None and the submitted JSON is echoed back so the
    user can correct it.
    """
    columns = _leaderboard_columns()
    try:
        data = json.loads(json_str)

        required_keys = ["model_name", "link", "license", "metrics"]
        if not all(key in data for key in required_keys):
            raise ValueError(
                "Неверная структура JSON. Требуемые поля: model_name, link, license, metrics"
            )

        metrics = data["metrics"]
        # Exact set match: no missing and no extra datasets allowed.
        if set(metrics.keys()) != set(DATASETS):
            raise ValueError(
                f"Метрики должны быть для всех датасетов: {', '.join(DATASETS)}"
            )

        wers, cers = [], []
        row = {
            "model_name": data["model_name"],
            "link": data["link"],
            "license": data["license"],
        }
        for ds in DATASETS:
            if "wer" not in metrics[ds] or "cer" not in metrics[ds]:
                raise ValueError(f"Для {ds} требуются wer и cer")
            row[f"wer_{ds}"] = metrics[ds]["wer"]
            row[f"cer_{ds}"] = metrics[ds]["cer"]
            wers.append(metrics[ds]["wer"])
            cers.append(metrics[ds]["cer"])

        # Overall score is the unweighted mean across datasets.
        row["overall_wer"] = mean(wers)
        row["overall_cer"] = mean(cers)

        try:
            dataset = load_dataset(REPO_ID, token=HF_TOKEN)
            df = dataset["train"].to_pandas()
        except EmptyDatasetError:
            df = pd.DataFrame(columns=columns)

        new_df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
        Dataset.from_pandas(new_df).push_to_hub(REPO_ID, token=HF_TOKEN)

        updated_html = load_data()
        return updated_html, "Успешно добавлено!", ""
    except Exception as e:
        # User-facing error path: report and hand the JSON back for editing.
        return None, f"Ошибка: {str(e)}", json_str


def get_datasets_description():
    """Render one description card per benchmark dataset.

    NOTE(review): the original card markup was lost in extraction; the tags
    below are a best-effort reconstruction that preserves the visible texts
    (short name, full name, description, row count).
    """
    cards = []
    for short_ds, info in DATASET_DESCRIPTIONS.items():
        cards.append(
            '<div class="dataset-card">'
            f'<div class="dataset-title"><b>{short_ds}</b> {info["full_name"]}</div>'
            f'<div class="dataset-description">{info["description"]}</div>'
            f'<div class="dataset-rows">📊 {info["num_rows"]} записей</div>'
            "</div>"
        )
    return '<div class="datasets-description">' + "".join(cards) + "</div>"


def _strip_punct(text: str) -> str:
    """Remove all punctuation (non-word, non-space characters) from *text*."""
    return re.sub(r"[^\w\s]+", "", text, flags=re.UNICODE)


def normalize_text(s: str) -> str:
    """Lowercase, strip punctuation, and trim surrounding whitespace."""
    return _strip_punct(s.lower()).strip()


def _edit_distance(a, b):
    """Levenshtein distance between two sequences.

    Rolling two-row dynamic program: O(len(a)*len(b)) time but only
    O(min(len(a), len(b))) memory instead of the full matrix.
    """
    # Keep the inner (stored) dimension as the shorter sequence.
    if len(a) < len(b):
        a, b = b, a
    prev = list(range(len(b) + 1))
    for i, ai in enumerate(a, start=1):
        curr = [i]
        for j, bj in enumerate(b, start=1):
            cost = 0 if ai == bj else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]


def compute_wer_cer(ref: str, hyp: str, normalize: bool = True):
    """Compute (WER%, CER%) of hypothesis *hyp* against reference *ref*.

    Both metrics are edit distance divided by the reference length (words
    for WER, characters for CER), expressed as percentages rounded to two
    decimals. An empty reference uses a denominator of 1 to avoid division
    by zero.
    """
    if normalize:
        ref_norm, hyp_norm = normalize_text(ref), normalize_text(hyp)
    else:
        ref_norm, hyp_norm = ref, hyp

    ref_words, hyp_words = ref_norm.split(), hyp_norm.split()
    Nw = max(1, len(ref_words))
    wer = _edit_distance(ref_words, hyp_words) / Nw

    ref_chars, hyp_chars = list(ref_norm), list(hyp_norm)
    Nc = max(1, len(ref_chars))
    cer = _edit_distance(ref_chars, hyp_chars) / Nc

    return round(wer * 100, 2), round(cer * 100, 2)


def get_metrics_html():
    """Static help page describing the WER/CER metrics and the protocol.

    NOTE(review): original markup lost in extraction; all visible texts are
    preserved, tags reconstructed.
    """
    return """
<div class="metrics-description">
  <h3>WER — Word Error Rate</h3>
  <p><code>WER = ( S + D + I ) / N</code></p>
  <ul>
    <li><b>S</b> — замены</li>
    <li><b>D</b> — удаления</li>
    <li><b>I</b> — вставки</li>
    <li><b>N</b> — слов в референсе</li>
  </ul>
  <h3>CER — Character Error Rate</h3>
  <p><code>CER = ( S + D + I ) / N</code></p>
  <ul>
    <li><b>S, D, I</b> — операции редактирования</li>
    <li><b>N</b> — символов в референсе</li>
  </ul>
  <h3>Нормализация</h3>
  <p>Перед расчётом приводим текст к нижнему регистру и удаляем пунктуацию.</p>
  <h3>Сравнение</h3>
  <p>Сортировка по среднему WER по всем датасетам. Метрики отображаются в процентах.</p>
</div>
"""


def get_submit_html():
    """Static help page for the submission form, with a JSON example.

    NOTE(review): original markup lost in extraction; the JSON example and
    visible texts are preserved byte-for-byte, tags reconstructed.
    """
    return """
<div class="submit-description">
  <h3>Общая информация</h3>
  <h3>Метрики</h3>
  <p>Укажите WER и CER для всех датасетов в формате JSON. Значения — от 0 до 1.</p>
  <pre>{
  "Russian_LibriSpeech": { "wer": 0.1234, "cer": 0.0567 },
  "Common_Voice_Corpus_22.0": { "wer": 0.2345, "cer": 0.0789 },
  "Tone_Webinars": { "wer": 0.3456, "cer": 0.0987 },
  "Tone_Books": { "wer": 0.4567, "cer": 0.1098 },
  "Tone_Speak": { "wer": 0.5678, "cer": 0.1209 },
  "Sova_RuDevices": { "wer": 0.6789, "cer": 0.1310 }
}</pre>
</div>
"""