import json
import re
from statistics import mean

import pandas as pd
from huggingface_hub import HfApi, create_repo
from datasets import load_dataset, Dataset
from datasets.data_files import EmptyDatasetError

from constants import (
    REPO_ID,
    HF_TOKEN,
    DATASETS,
    SHORT_DATASET_NAMES,
    DATASET_DESCRIPTIONS,
)

api = HfApi(token=HF_TOKEN)

OPEN_LICENSE_KEYWORDS = {
    "mit", "apache", "apache-2", "apache-2.0",
    "bsd", "bsd-2", "bsd-3", "bsd-2-clause", "bsd-3-clause",
    "isc", "mpl", "mpl-2.0",
    "lgpl", "lgpl-2.1", "lgpl-3.0",
    "gpl", "gpl-2.0", "gpl-3.0",
    "agpl", "agpl-3.0",
    "epl", "epl-2.0",
    "cddl", "cddl-1.0", "cddl-1.1",
    "bsl", "bsl-1.0", "boost",
    "zlib", "unlicense", "artistic-2.0",
    "cc0", "cc0-1.0",
    "cc-by", "cc-by-3.0", "cc-by-4.0",
    "cc-by-sa", "cc-by-sa-3.0", "cc-by-sa-4.0",
    "openrail", "openrail-m", "bigscience openrail", "bigscience openrail-m",
    "open-source", "opensource", "open source",
}

RESTRICTIVE_LICENSE_KEYWORDS = {
    "cc-by-nc", "cc-by-nc-sa", "cc-nc", "nc-sa", "nc-nd",
    "cc-by-nd", "cc-nd", "no-derivatives", "no derivatives",
    "non-commercial", "noncommercial",
    "research-only", "research only",
    "llama", "llama-2", "community license",
    "proprietary", "closed", "unknown", "custom",
}


def is_open_license(license_str: str) -> bool:
    s = (str(license_str) if license_str is not None else "").strip().lower()
    if not s:
        return False
    # Restrictive keywords are checked first because they can contain open
    # ones as substrings (e.g. "cc-by-nc" contains "cc-by").
    if any(pat in s for pat in RESTRICTIVE_LICENSE_KEYWORDS):
        return False
    return any(pat in s for pat in OPEN_LICENSE_KEYWORDS)


def init_repo():
    try:
        api.repo_info(REPO_ID, repo_type="dataset")
    except Exception:
        # The results repo does not exist yet (or is inaccessible): create it.
        create_repo(REPO_ID, repo_type="dataset", private=True, token=HF_TOKEN)


def load_data():
    columns = (
        ["model_name", "link", "license", "overall_wer", "overall_cer"]
        + [f"wer_{ds}" for ds in DATASETS]
        + [f"cer_{ds}" for ds in DATASETS]
    )
    try:
        dataset = load_dataset(REPO_ID, token=HF_TOKEN)
        df = dataset["train"].to_pandas()
    except EmptyDatasetError:
        df = pd.DataFrame(columns=columns)

    if not df.empty:
        df = df.sort_values("overall_wer").reset_index(drop=True)
        df.insert(0, "rank", df.index + 1)

        # Convert all metric columns from fractions to percentages.
        for col in (
            ["overall_wer", "overall_cer"]
            + [f"wer_{ds}" for ds in DATASETS]
            + [f"cer_{ds}" for ds in DATASETS]
        ):
            df[col] = (df[col] * 100).round(2)

        best_values = {ds: df[f"wer_{ds}"].min() for ds in DATASETS}
        for short_ds, ds in zip(SHORT_DATASET_NAMES, DATASETS):
            # The HTML markup in this f-string was stripped during extraction.
            # A plausible reconstruction, consistent with the otherwise-unused
            # best_values dict: bold the best (lowest) WER per dataset.
            df[short_ds] = df.apply(
                lambda row: (
                    f'<b>{row[f"wer_{ds}"]:.2f}%</b>'
                    if row[f"wer_{ds}"] == best_values[ds]
                    else f'{row[f"wer_{ds}"]:.2f}%'
                ),
                axis=1,
            )
            df = df.drop(columns=[f"wer_{ds}", f"cer_{ds}"])

        # The anchor tag here was also stripped; reconstructed as a plain link
        # to row["link"], which is dropped immediately afterwards.
        df["model_name"] = df.apply(
            lambda row: f'<a target="_blank" href="{row["link"]}">{row["model_name"]}</a>',
            axis=1,
        )
        df = df.drop(columns=["link"])

        df["license"] = df["license"].apply(
            lambda x: "Открытая" if is_open_license(x) else "Закрытая"  # "Open" / "Closed"
        )
        df["rank"] = df["rank"].apply(
            lambda r: "🥇" if r == 1 else "🥈" if r == 2 else "🥉" if r == 3 else str(r)
        )
        df.rename(
            columns={
                "overall_wer": "Средний WER ⬇️",  # "Average WER"
                "overall_cer": "Средний CER ⬇️",  # "Average CER"
                "license": "Тип модели",          # "Model type"
                "model_name": "Модель",           # "Model"
                "rank": "Ранг",                   # "Rank"
            },
            inplace=True,
        )

    table_html = df.to_html(
        escape=False, index=False, classes="display cell-border compact stripe"
    )
    # The wrapping markup was stripped during extraction; reconstructed as a
    # bare container div.
    return f"<div>{table_html}</div>"
# This fragment was detached from its surrounding code during extraction and
# its exact home is unclear. A plausible completion: a markdown table header
# listing the leaderboard columns, one trailing column per dataset
# (MARKDOWN_HEADER is a hypothetical name).
MARKDOWN_HEADER = (
    "Ранг | Модель | Тип модели | Средний WER ⬇️ | Средний CER ⬇️ | "
    + "".join(f"{short} | " for short in SHORT_DATASET_NAMES)
    + "\n"
    + "---|" * (5 + len(SHORT_DATASET_NAMES))
)
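# A quick, hypothetical sanity check of is_open_license: restrictive keywords
# are matched before open ones, so "cc-by-nc-4.0" is classified as closed even
# though it contains the open substring "cc-by". Inputs below are examples,
# not strings from this repo.
assert is_open_license("Apache-2.0")
assert is_open_license("MIT License")
assert not is_open_license("cc-by-nc-4.0")
assert not is_open_license("Llama 2 Community License")
assert not is_open_license("")  # empty/missing licenses count as closed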
{info["description"]}
📊 {info["num_rows"]} записей
Before computing the metrics, transcripts are lowercased and punctuation is removed.
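A minimal sketch of that normalization, assuming plain regex-based stripping (plausibly what the otherwise-unused `re` import is for); `normalize_text` is a hypothetical helper, not code from this repo, and the leaderboard's exact rules may differ:

import re

def normalize_text(text: str) -> str:
    """Lowercase and strip punctuation before WER/CER computation."""
    text = text.lower()
    # Keep word characters and whitespace; drop everything else.
    text = re.sub(r"[^\w\s]", "", text)
    # Collapse runs of whitespace left behind by removed punctuation.
    return re.sub(r"\s+", " ", text).strip()

# e.g. normalize_text("Привет, мир!") -> "привет мир"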
Models are ranked by their average WER across all datasets. Metrics are displayed as percentages.
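The averaging itself is not shown in this section; a sketch under the assumption that it is a simple unweighted mean over the `wer_<dataset>`/`cer_<dataset>` fields, which would also explain the `statistics.mean` import (`overall_metrics` is a hypothetical name):

from statistics import mean
from constants import DATASETS

def overall_metrics(entry: dict) -> dict:
    # Average per-dataset scores into the overall_* fields (unweighted).
    entry["overall_wer"] = mean(entry[f"wer_{ds}"] for ds in DATASETS)
    entry["overall_cer"] = mean(entry[f"cer_{ds}"] for ds in DATASETS)
    return entry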
Specify WER and CER for every dataset in JSON format. Values must lie between 0 and 1.
{ "Russian_LibriSpeech": { "wer": 0.1234, "cer": 0.0567 }, "Common_Voice_Corpus_22.0": { "wer": 0.2345, "cer": 0.0789 }, "Tone_Webinars": { "wer": 0.3456, "cer": 0.0987 }, "Tone_Books": { "wer": 0.4567, "cer": 0.1098 }, "Tone_Speak": { "wer": 0.5678, "cer": 0.1209 }, "Sova_RuDevices": { "wer": 0.6789, "cer": 0.1310 } }