Spaces:
Building
Building
| ; | |
| /** | |
| * Fetch benchmark data from six sources and merge into data/benchmarks.json. | |
| * | |
| * Sources: | |
| * 1. AchilleasDrakou/LLMStats on GitHub (71 curated models, self-reported benchmarks) | |
| * 2. open-llm-leaderboard/contents on Hugging Face (4500+ open models, standardised evals) | |
| * 3. LiveBench (livebench.ai) — contamination-free, monthly, 70+ frontier models | |
| * 4. Chatbot Arena (lmarena.ai) — 316 models with real ELO ratings from human votes | |
| * 5. Aider (aider.chat) — code editing benchmark, 133 tasks per model | |
| * 6. Artificial Analysis (artificialanalysis.ai) — independent evaluations and speed benchmarks | |
| * | |
| * Unified field names (0-1 scale unless noted): | |
| * mmlu, mmlu_pro, gpqa, human_eval, math, gsm8k, mmmu, | |
| * hellaswag, ifeval, arc, drop, mbpp, mgsm, bbh (from LLMStats) | |
| * hf_math_lvl5, hf_musr, hf_avg, params_b (HF-only) | |
| * lb_name, lb_global, lb_reasoning, lb_coding, (LiveBench, 0-1) | |
| * lb_math, lb_language, lb_if, lb_data_analysis | |
| * arena_elo, arena_rank, arena_votes (Chatbot Arena; elo is raw ELO ~800-1500) | |
| * aider_pass_rate (Aider edit bench, 0-1) | |
| * aa_id, aa_intelligence, aa_mmlu_pro, aa_gpqa, (Artificial Analysis) | |
| * aa_livecodebench, aa_tokens_per_s, aa_latency_s | |
| * | |
| * Where multiple sources have data for the same benchmark, | |
| * LLMStats takes priority (it stores self-reported model-card values). | |
| * | |
| * Usage: | |
| * node scripts/fetch-benchmarks.js # fetch all sources | |
| * node scripts/fetch-benchmarks.js aa # refresh Artificial Analysis only | |
| * node scripts/fetch-benchmarks.js livebench # refresh LiveBench only | |
| */ | |
| const fs = require('fs'); | |
| const path = require('path'); | |
| const yaml = require('js-yaml'); | |
| const { getJson, getText } = require('./fetch-utils'); | |
| const { loadEnv } = require('./load-env'); | |
| loadEnv(); | |
| const OUT_FILE = path.join(__dirname, '..', 'data', 'benchmarks.json'); | |
| // ─── helpers ──────────────────────────────────────────────────────────── | |
/**
 * Normalise a model name so names from different sources can be compared.
 *
 * Lower-cases the input, turns `-`, `_` and `.` into spaces, removes every
 * remaining character outside `[a-z0-9 ]`, collapses whitespace runs and trims.
 *
 * @param {string|null|undefined} s Raw model name; falsy values are treated as ''.
 * @returns {string} The normalised key (possibly empty).
 */
const normName = (s) => {
  const lowered = (s || '').toLowerCase();
  const withSpaces = lowered.replace(/[-_.]/g, ' ');
  const alnumOnly = withSpaces.replace(/[^a-z0-9 ]/g, '');
  return alnumOnly.replace(/\s+/g, ' ').trim();
};
| // ─── LLMStats ─────────────────────────────────────────────────────────── | |
// GitHub tree listing of the LLMStats repository (used to discover model files).
const LLMSTATS_TREE = 'https://api.github.com/repos/AchilleasDrakou/LLMStats/git/trees/main?recursive=1';
// Base URL for fetching raw files from the LLMStats repository.
const LLMSTATS_RAW = 'https://raw.githubusercontent.com/AchilleasDrakou/LLMStats/main/';

// Unified benchmark field -> every LLMStats `dataset_name` spelling that maps to it.
const LLMSTATS_MAP = {
  mmlu: ['MMLU', 'MMLU Chat', 'MMLU-Base', 'MMLU (CoT)', 'Multilingual MMLU'],
  mmlu_pro: ['MMLU-Pro', 'MMLU-STEM', 'Multilingual MMLU-Pro'],
  gpqa: ['GPQA'],
  human_eval: ['HumanEval', 'Humaneval', 'HumanEval+', 'HumanEval-Average', 'Instruct HumanEval', 'MBPP EvalPlus', 'EvalPlus', 'Evalplus'],
  math: ['MATH', 'Math', 'MATH (CoT)', 'MATH-500', 'Functional_MATH', 'FunctionalMATH'],
  gsm8k: ['GSM8K', 'GSM-8K', 'GSM8k', 'GSM8K Chat', 'GSM-8K (CoT)'],
  mmmu: ['MMMU', 'MMMUval', 'MMMU-Pro'],
  hellaswag: ['HellaSwag', 'HellaSWAG', 'Hellaswag'],
  ifeval: ['IFEval', 'IF-Eval'],
  arc: ['ARC Challenge', 'ARC-C', 'ARC-c', 'ARC-e', 'ARC-Challenge', 'AI2 Reasoning Challenge (ARC)'],
  drop: ['DROP'],
  mbpp: ['MBPP', 'MBPP+', 'MBPP++', 'MBPP pass@1', 'MBPP EvalPlus (base)'],
  mgsm: ['MGSM', 'Multilingual MGSM', 'Multilingual MGSM (CoT)'],
  bbh: ['BBH', 'BigBench Hard CoT', 'BIG-Bench-Hard', 'BigBench-Hard', 'BIG-Bench Hard', 'BigBench_Hard'],
};

/**
 * Map a model's LLMStats `qualitative_metrics` list onto the unified field names.
 *
 * The first metric (in list order) whose `dataset_name` matches one of a
 * field's aliases wins. Metrics with a missing (null/undefined) score are
 * skipped so that they cannot block a later alias that does carry a value.
 *
 * @param {Array<{dataset_name: string, score: *}>|null|undefined} qualitative_metrics
 * @returns {Object<string, *>} Unified field name -> score.
 */
function extractLLMStatsMetrics(qualitative_metrics) {
  const scores = {};
  for (const metric of qualitative_metrics || []) {
    // Tolerate holes in the list and metrics without a usable score.
    if (!metric || metric.score == null) continue;
    for (const [key, aliases] of Object.entries(LLMSTATS_MAP)) {
      if (scores[key] === undefined && aliases.includes(metric.dataset_name)) {
        scores[key] = metric.score;
      }
    }
  }
  return scores;
}
/**
 * Download every `models/.../model.json` file listed in the LLMStats
 * repository tree and convert each into a benchmark entry.
 *
 * Files are fetched in batches of 10 concurrent requests. A file that fails to
 * download or parse is reported with a warning and left out of the result.
 *
 * @returns {Promise<Array<Object>>} Entries of the form
 *   `{ slug, name, ...metrics, sources }`, where every metric key is
 *   attributed to `'llmstats'` in `sources`.
 */
async function fetchLLMStats() {
  process.stdout.write('LLMStats: fetching file list... ');
  const listing = await getJson(LLMSTATS_TREE);
  const modelFiles = listing.tree.filter((node) => {
    if (node.type !== 'blob') return false;
    return node.path.startsWith('models/') && node.path.endsWith('/model.json');
  });
  console.log(`${modelFiles.length} models`);

  // Fetch one model file; resolves to null (after a warning) on any failure.
  const loadModel = async (file) => {
    try {
      const data = await getJson(LLMSTATS_RAW + file.path);
      const slug = file.path.replace(/^models\//, '').replace(/\/model\.json$/, '');
      const metrics = extractLLMStatsMetrics(data.qualitative_metrics);
      const sources = {};
      for (const metricKey of Object.keys(metrics)) sources[metricKey] = 'llmstats';
      return { slug, name: data.name, ...metrics, sources };
    } catch (err) {
      console.warn(`\n β LLMStats ${file.path}: ${err.message}`);
      return null;
    }
  };

  const BATCH = 10;
  const results = [];
  for (let start = 0; start < modelFiles.length; start += BATCH) {
    const chunk = modelFiles.slice(start, start + BATCH);
    const rows = await Promise.all(chunk.map((file) => loadModel(file)));
    for (const row of rows) {
      if (row) results.push(row);
    }
    const done = Math.min(start + BATCH, modelFiles.length);
    process.stdout.write(` LLMStats: ${done}/${modelFiles.length}\r`);
  }
  console.log(` LLMStats: ${results.length} entries fetched `);
  return results;
}
| // ─── HF Leaderboard ───────────────────────────────────────────────────── | |
// Hugging Face dataset viewer "rows" endpoint for the Open LLM Leaderboard table.
const HF_ROWS_URL = 'https://datasets-server.huggingface.co/rows' +
  '?dataset=open-llm-leaderboard%2Fcontents&config=default&split=train';

/**
 * Fetch one page of leaderboard rows.
 *
 * The rows endpoint takes its page size in the `length` query parameter; the
 * previous `limit` parameter is not part of that API, so the requested size
 * was never actually applied.
 *
 * @param {number} offset Index of the first row to return.
 * @param {number} [limit=100] Maximum number of rows to request.
 * @returns {Promise<{rows: Array<Object>, total: number}>} The unwrapped row
 *   objects plus the dataset's total row count.
 */
async function fetchHFPage(offset, limit = 100) {
  const data = await getJson(`${HF_ROWS_URL}&offset=${offset}&length=${limit}`);
  return { rows: data.rows.map((r) => r.row), total: data.num_rows_total };
}

/**
 * Download the whole Open LLM Leaderboard table and convert it to entries.
 *
 * The first request fetches a full page (not a single probe row) so that
 * rows 1..99 are not lost when the remaining pages are fetched from page 1
 * onwards. Remaining pages are fetched 5 at a time.
 *
 * Rows that are not available on the hub or that are flagged are dropped.
 * Score columns are copied only when truthy, matching the existing data file.
 *
 * @returns {Promise<Array<Object>>} Entries `{ hf_id, name, sources, ... }`.
 */
async function fetchHFLeaderboard() {
  process.stdout.write('HF Leaderboard: probing total... ');
  const LIMIT = 100;
  const first = await fetchHFPage(0, LIMIT);
  const total = first.total;
  console.log(`${total} rows`);

  const pages = Math.ceil(total / LIMIT);
  const allRows = [...first.rows];

  // Fetch remaining pages in batches of 5 concurrent requests
  const CONCURRENT = 5;
  for (let p = 1; p < pages; p += CONCURRENT) {
    const lastPage = Math.min(p + CONCURRENT, pages);
    const batch = [];
    for (let q = p; q < lastPage; q++) {
      batch.push(fetchHFPage(q * LIMIT, LIMIT));
    }
    const results = await Promise.all(batch);
    for (const r of results) allRows.push(...r.rows);
    process.stdout.write(` HF: ${Math.min(lastPage * LIMIT, total)}/${total}\r`);
  }
  console.log(` HF: ${total}/${total} — filtering... `);

  // Nothing to inspect for the Average column; avoid Object.keys(undefined).
  if (allRows.length === 0) {
    console.log(' HF: 0 entries after filtering');
    return [];
  }

  // The Average column name has a Unicode emoji
  const AVG_KEY = Object.keys(allRows[0]).find((k) => k.startsWith('Average'));

  // Leaderboard column -> unified entry field.
  const columnToField = [
    ['#Params (B)', 'params_b'],
    ['IFEval Raw', 'ifeval'],
    ['BBH Raw', 'bbh'],
    ['GPQA Raw', 'gpqa'],
    ['MMLU-PRO Raw', 'mmlu_pro'],
    ['MATH Lvl 5 Raw', 'hf_math_lvl5'],
    ['MUSR Raw', 'hf_musr'],
  ];
  if (AVG_KEY) columnToField.push([AVG_KEY, 'hf_avg']);

  const entries = allRows
    .filter((r) => r['Available on the hub'] && !r.Flagged)
    .map((r) => {
      const entry = {
        hf_id: r.fullname,
        name: r.fullname.split('/').pop(),
        sources: {},
      };
      for (const [column, field] of columnToField) {
        if (r[column]) {
          entry[field] = r[column];
          entry.sources[field] = 'hf';
        }
      }
      return entry;
    });
  console.log(` HF: ${entries.length} entries after filtering`);
  return entries;
}
| // ─── LiveBench ────────────────────────────────────────────────────────── | |
// GitHub tree listing of the LiveBench website repository (lists the release CSVs).
const LB_GITHUB_TREE = 'https://api.github.com/repos/LiveBench/livebench.github.io/git/trees/main?recursive=1';
// Site from which the release CSVs and category files are downloaded.
const LB_BASE_URL = 'https://livebench.ai';

// Matches one trailing variant suffix (thinking / effort / base / non-reasoning ...)
// at the end of a LiveBench model name.
const LB_SUFFIX_RE = new RegExp(
  [
    '(-thinking-(?:auto-)?(?:\\d+k-)?(?:(?:high|medium|low)-effort)?|',
    '-thinking(?:-(?:64k|32k|auto|minimal))?|',
    '-(?:high|medium|low)-effort|',
    '-base|-non-?reasoning|-(?:high|low|min)thinking|-nothinking)',
    '(?:-(?:high|medium|low)-effort)?$',
  ].join('')
);

/**
 * Strip every trailing variant suffix from a LiveBench model name.
 *
 * The suffix pattern is applied repeatedly until the name stops changing, so
 * stacked suffixes are all removed.
 *
 * @param {string} name LiveBench model name.
 * @returns {string} The name without variant suffixes.
 */
function lbBaseName(name) {
  let current = name;
  for (;;) {
    const stripped = current.replace(LB_SUFFIX_RE, '');
    if (stripped === current) return stripped;
    current = stripped;
  }
}
/**
 * Parse one LiveBench release table (CSV) into per-model entries.
 *
 * The first column is the model name; every other column is a task score on a
 * 0-100 scale, which is converted to 0-1. Non-numeric cells are ignored.
 * `lb_global` is the mean of all numeric task cells; each group field is the
 * mean of the tasks that `taskToGroup` assigns to that group.
 *
 * Both LF and CRLF line endings are accepted: with a plain `\n` split, a CRLF
 * file left a `\r` on the last header, so that task never matched its group.
 * Empty input yields an empty list instead of throwing.
 *
 * Cells are split on bare commas; quoted fields are not supported.
 *
 * @param {string} csvText Raw CSV text.
 * @param {Object<string, string>} taskToGroup Task column name -> group field
 *   (`lb_reasoning`, `lb_coding`, `lb_math`, `lb_language`, `lb_if`,
 *   `lb_data_analysis`).
 * @returns {Array<Object>} One entry per data row, with a `sources` map that
 *   attributes every defined `lb_*` field to `'livebench'`.
 */
function parseLiveBenchCsv(csvText, taskToGroup) {
  const GROUP_FIELDS = [
    'lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis',
  ];
  const avg = (arr) => arr.reduce((a, b) => a + b, 0) / arr.length;

  const lines = (csvText || '').split(/\r?\n/).filter((line) => line.trim() !== '');
  if (lines.length === 0) return [];

  const headers = lines[0].split(',').map((h) => h.trim());
  const entries = [];
  for (const line of lines.slice(1)) {
    const vals = line.split(',');
    const modelName = (vals[0] || '').trim();
    if (!modelName) continue;

    const taskScores = {};
    for (let i = 1; i < headers.length; i++) {
      const v = Number.parseFloat(vals[i]);
      if (!Number.isNaN(v)) taskScores[headers[i]] = v / 100;
    }

    const groupBuckets = {};
    for (const [task, group] of Object.entries(taskToGroup)) {
      if (taskScores[task] === undefined) continue;
      (groupBuckets[group] = groupBuckets[group] || []).push(taskScores[task]);
    }

    const allScores = Object.values(taskScores);
    const entry = {
      lb_name: modelName,
      lb_global: allScores.length ? avg(allScores) : undefined,
    };
    // Group keys are always present (possibly undefined) to keep the entry shape stable.
    for (const field of GROUP_FIELDS) {
      entry[field] = groupBuckets[field] ? avg(groupBuckets[field]) : undefined;
    }
    entry.sources = {};
    for (const [field, value] of Object.entries(entry)) {
      if (field.startsWith('lb_') && value !== undefined) entry.sources[field] = 'livebench';
    }
    entries.push(entry);
  }
  return entries;
}
/**
 * Download every LiveBench release table and return one entry per model.
 *
 * Release dates are discovered from the `public/table_<date>.csv` files in the
 * LiveBench website repository; the task -> group mapping comes from the
 * newest release's categories file. Releases are processed oldest to newest,
 * so a model's entry comes from the newest release that lists it. A release
 * whose CSV cannot be fetched is skipped with a warning.
 *
 * @returns {Promise<Array<Object>>} Entries as produced by `parseLiveBenchCsv`.
 */
async function fetchLiveBench() {
  process.stdout.write('LiveBench: finding all releases... ');
  const tree = await getJson(LB_GITHUB_TREE);

  const dates = [];
  for (const f of tree.tree) {
    if (f.path.startsWith('public/table_') && f.path.endsWith('.csv')) {
      dates.push(f.path.replace('public/table_', '').replace('.csv', ''));
    }
  }
  dates.sort();
  const newest = dates[dates.length - 1];
  console.log(`${dates.length} releases (${dates[0]} β ${newest})`);

  const cats = await getJson(`${LB_BASE_URL}/categories_${newest}.json`);
  const groupByCategory = new Map([
    ['Coding', 'lb_coding'],
    ['Agentic Coding', 'lb_coding'],
    ['Reasoning', 'lb_reasoning'],
    ['Mathematics', 'lb_math'],
    ['Language', 'lb_language'],
    ['IF', 'lb_if'],
    ['Data Analysis', 'lb_data_analysis'],
  ]);
  const taskToGroup = {};
  for (const [cat, tasks] of Object.entries(cats)) {
    const group = groupByCategory.get(cat);
    if (!group) continue;
    for (const t of tasks) taskToGroup[t] = group;
  }

  // Later (newer) releases overwrite earlier ones for the same model name.
  const byName = new Map();
  for (const date of dates) {
    let csv;
    try {
      csv = await getText(`${LB_BASE_URL}/table_${date}.csv`);
    } catch (e) {
      console.warn(`\n β LiveBench ${date}: ${e.message}`);
      continue;
    }
    const parsed = parseLiveBenchCsv(csv, taskToGroup);
    for (const entry of parsed) byName.set(entry.lb_name, entry);
    process.stdout.write(` LiveBench: ${date}\r`);
  }

  const entries = Array.from(byName.values());
  console.log(` LiveBench: ${entries.length} unique models across all releases`);
  return entries;
}
/**
 * Merge LiveBench entries into the existing entry list.
 *
 * Matching is done on normalised names: first an exact match on the LiveBench
 * name, then a match on the name with variant suffixes removed (the variant
 * with the highest `lb_global` represents a base name). Matched entries are
 * updated in place; unmatched LiveBench models are appended as new entries,
 * at most one per base name.
 *
 * The LiveBench `sources` map is merged into the entry's own map instead of
 * replacing it: copying the whole LiveBench record with `Object.assign`
 * overwrote `e.sources` first, which discarded all earlier attributions and
 * made the entry share the LiveBench record's `sources` object.
 *
 * @param {Array<Object>} entries Existing entries (mutated in place on match).
 * @param {Array<Object>} lbEntries Entries from `fetchLiveBench`.
 * @returns {Array<Object>} `entries` followed by the new LiveBench-only entries.
 */
function mergeLiveBench(entries, lbEntries) {
  const exactMap = new Map();
  const baseMap = new Map();
  for (const lb of lbEntries) {
    const exact = normName(lb.lb_name);
    exactMap.set(exact, lb);
    const base = normName(lbBaseName(lb.lb_name));
    if (base !== exact) {
      const prev = baseMap.get(base);
      if (!prev || (lb.lb_global || 0) > (prev.lb_global || 0)) baseMap.set(base, lb);
    }
  }

  const usedLbNames = new Set();
  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
    ].filter(Boolean);

    let lb = null;
    for (const c of candidates) {
      lb = exactMap.get(c) || baseMap.get(c);
      if (lb) break;
    }
    if (!lb) continue;

    // Copy data fields only; `sources` is merged separately below.
    const { sources: lbSources, ...lbFields } = lb;
    for (const [field, value] of Object.entries(lbFields)) {
      if (value !== undefined) e[field] = value;
    }
    e.sources = { ...(e.sources || {}), ...(lbSources || {}) };
    usedLbNames.add(lb.lb_name);
    matched++;
  }

  const usedBases = new Set([...usedLbNames].map((n) => normName(lbBaseName(n))));
  const newEntries = [];
  for (const lb of lbEntries) {
    if (usedLbNames.has(lb.lb_name)) continue;
    const base = normName(lbBaseName(lb.lb_name));
    if (usedBases.has(base)) continue;
    if (baseMap.get(base) === lb || exactMap.get(normName(lb.lb_name)) === lb) {
      newEntries.push({ name: lbBaseName(lb.lb_name), ...lb, sources: { ...(lb.sources || {}) } });
      usedBases.add(base);
    }
  }
  console.log(` LiveBench: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
| // ─── Chatbot Arena ────────────────────────────────────────────────────── | |
/**
 * Return the balanced JSON array/object that starts at `start` in `text`.
 *
 * Brackets inside JSON string literals (including escaped quotes) are ignored,
 * so a value such as a model name containing `]` or `{` cannot end the slice
 * early.
 *
 * @param {string} text Text to scan.
 * @param {number} start Index of the opening `[` or `{`.
 * @returns {string|null} The balanced slice, or null if it never closes.
 */
function sliceBalancedJsonAt(text, start) {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '[' || ch === '{') {
      depth++;
    } else if (ch === ']' || ch === '}') {
      depth--;
      if (depth === 0) return text.substring(start, i + 1);
    }
  }
  return null;
}

/**
 * Fetch the Chatbot Arena text leaderboard from its RSC payload.
 *
 * The payload is scanned line by line for the first line that contains an
 * `"entries":[` array with `"rating":` values; that array is extracted and
 * parsed. A candidate line whose array is unbalanced or not valid JSON is
 * skipped in favour of later lines.
 *
 * @returns {Promise<Array<Object>>} Entries with `arena_name`, `arena_org`,
 *   `arena_elo`, `arena_rank`, `arena_votes` and a `sources` map.
 * @throws {Error} When no parsable entries array is found (the last parse
 *   error, if any, is attached as `cause`).
 */
async function fetchChatbotArena() {
  process.stdout.write('Chatbot Arena: fetching RSC leaderboard... ');
  const text = await getText('https://lmarena.ai/en/leaderboard/text', {
    headers: { 'User-Agent': 'Mozilla/5.0', 'RSC': '1', 'Accept': 'text/x-component' },
  });

  const MARKER = '"entries":[';
  let entries = null;
  let lastError;
  for (const line of text.split('\n')) {
    if (!line.includes(MARKER) || !line.includes('"rating":')) continue;
    const start = line.indexOf(MARKER) + '"entries":'.length;
    const slice = sliceBalancedJsonAt(line, start);
    if (slice === null) continue;
    try {
      entries = JSON.parse(slice);
      break;
    } catch (err) {
      lastError = err;
    }
  }
  if (!entries) throw new Error('Could not find entries in RSC payload', { cause: lastError });
  console.log(`${entries.length} models`);

  return entries.map((e) => {
    const entry = {
      arena_name: e.modelDisplayName,
      arena_org: e.modelOrganization,
      arena_elo: e.rating,
      arena_rank: e.rank,
      arena_votes: e.votes,
      sources: {},
    };
    for (const [field, value] of Object.entries(entry)) {
      if (field.startsWith('arena_') && value !== undefined) entry.sources[field] = 'arena';
    }
    return entry;
  });
}
/**
 * Merge Chatbot Arena ratings into the existing entry list.
 *
 * Entries are matched by normalised name (entry name, LiveBench name, slug
 * tail, HF id tail). A matched entry receives `arena_elo`, `arena_rank` and
 * `arena_votes` plus the arena source attributions; each arena model is used
 * for at most one entry. Arena models that match nothing are appended.
 *
 * Empty candidate keys are discarded before lookup, as the LiveBench and
 * Artificial Analysis mergers already do. Without that, an arena model whose
 * name normalises to '' (e.g. a name with no ASCII letters or digits) was
 * attached to the first entry that had any missing name field.
 *
 * @param {Array<Object>} entries Existing entries (mutated in place on match).
 * @param {Array<Object>} arenaEntries Entries from `fetchChatbotArena`.
 * @returns {Array<Object>} `entries` followed by the arena-only entries.
 */
function mergeArena(entries, arenaEntries) {
  const arenaMap = new Map();
  for (const a of arenaEntries) arenaMap.set(normName(a.arena_name), a);

  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
    ].filter(Boolean);
    const a = candidates.map((c) => arenaMap.get(c)).find(Boolean);
    if (!a) continue;

    e.arena_elo = a.arena_elo;
    e.arena_rank = a.arena_rank;
    e.arena_votes = a.arena_votes;
    e.sources = { ...(e.sources || {}), ...(a.sources || {}) };
    arenaMap.delete(normName(a.arena_name));
    matched++;
  }

  const newEntries = [];
  for (const a of arenaMap.values()) newEntries.push({ name: a.arena_name, ...a });
  console.log(` Arena: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
| // ─── Aider ────────────────────────────────────────────────────────────── | |
// Raw YAML data file behind Aider's code-editing leaderboard.
const AIDER_RAW = 'https://raw.githubusercontent.com/Aider-AI/aider/main/aider/website/_data/edit_leaderboard.yml';

/**
 * Fetch Aider's edit leaderboard and keep the best run per model.
 *
 * Rows without a `model` or without `pass_rate_1` are ignored. Runs are
 * grouped by normalised model name and the run with the highest `pass_rate_1`
 * is kept; the rate is converted from 0-100 to 0-1.
 *
 * @returns {Promise<Array<Object>>} Entries
 *   `{ aider_model, aider_pass_rate, sources }`.
 */
async function fetchAider() {
  process.stdout.write('Aider: fetching edit leaderboard... ');
  const text = await getText(AIDER_RAW);
  const rows = yaml.load(text);

  const bestByModel = new Map();
  for (const row of rows) {
    const usable = row.model && row.pass_rate_1 !== undefined;
    if (!usable) continue;
    const key = normName(row.model);
    const current = bestByModel.get(key);
    if (!current || row.pass_rate_1 > current.pass_rate_1) bestByModel.set(key, row);
  }

  const entries = Array.from(bestByModel.values(), (row) => ({
    aider_model: row.model,
    aider_pass_rate: row.pass_rate_1 / 100,
    sources: { aider_pass_rate: 'aider' },
  }));
  console.log(`${entries.length} models (best run each)`);
  return entries;
}
/**
 * Merge Aider pass rates into the existing entry list.
 *
 * Entries are matched by normalised name (entry name, LiveBench name, slug
 * tail, HF id tail, arena name). A matched entry receives `aider_pass_rate`
 * and its source attribution; each Aider model is used for at most one entry.
 * Aider models that match nothing are appended as new entries.
 *
 * Empty candidate keys are discarded before lookup, as the LiveBench and
 * Artificial Analysis mergers already do. Without that, an Aider model whose
 * name normalises to '' was attached to the first entry that had any missing
 * name field.
 *
 * @param {Array<Object>} entries Existing entries (mutated in place on match).
 * @param {Array<Object>} aiderEntries Entries from `fetchAider`.
 * @returns {Array<Object>} `entries` followed by the Aider-only entries.
 */
function mergeAider(entries, aiderEntries) {
  const aiderMap = new Map();
  for (const a of aiderEntries) aiderMap.set(normName(a.aider_model), a);

  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
      normName(e.arena_name || ''),
    ].filter(Boolean);
    const a = candidates.map((c) => aiderMap.get(c)).find(Boolean);
    if (!a) continue;

    e.aider_pass_rate = a.aider_pass_rate;
    e.sources = { ...(e.sources || {}), ...(a.sources || {}) };
    aiderMap.delete(normName(a.aider_model));
    matched++;
  }

  const newEntries = [];
  for (const a of aiderMap.values()) newEntries.push({ name: a.aider_model, ...a });
  console.log(` Aider: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
| // ─── Artificial Analysis ──────────────────────────────────────────────── | |
/**
 * Fetch model evaluations and speed figures from the Artificial Analysis API.
 *
 * Requires `ARTIFICIAL_ANALYSIS_API_KEY` in the environment; without it the
 * source is skipped and an empty list is returned.
 *
 * @returns {Promise<Array<Object>>} One `aa_*` entry per model, with a
 *   `sources` map attributing every defined `aa_*` field to `'aa'`.
 * @throws {Error} When the response has no `data` property.
 */
async function fetchArtificialAnalysis() {
  const apiKey = process.env.ARTIFICIAL_ANALYSIS_API_KEY;
  if (!apiKey) {
    console.log('Artificial Analysis: skipping (no API key found)');
    return [];
  }

  process.stdout.write('Artificial Analysis: fetching benchmarks... ');
  const requestOptions = { headers: { 'x-api-key': apiKey } };
  const res = await getJson('https://artificialanalysis.ai/api/v2/data/llms/models', requestOptions);
  if (!res.data) throw new Error('Invalid response from Artificial Analysis API');
  console.log(`${res.data.length} models`);

  // Convert one API model record into a flat aa_* entry.
  const toEntry = (m) => {
    const ev = m.evaluations || {};
    const entry = {
      aa_id: m.id,
      aa_name: m.name,
      aa_slug: m.slug,
      aa_intelligence: ev.artificial_analysis_intelligence_index, // 0-100
      aa_coding: ev.artificial_analysis_coding_index, // 0-100
      aa_math: ev.artificial_analysis_math_index, // 0-100
      aa_mmlu_pro: ev.mmlu_pro, // 0-1
      aa_gpqa: ev.gpqa, // 0-1
      aa_livecodebench: ev.livecodebench, // 0-1
      aa_hle: ev.hle,
      aa_scicode: ev.scicode,
      aa_math_500: ev.math_500,
      aa_aime: ev.aime,
      aa_tokens_per_s: m.median_output_tokens_per_second,
      aa_latency_s: m.median_time_to_first_token_seconds,
      sources: {},
    };
    for (const [field, value] of Object.entries(entry)) {
      if (field.startsWith('aa_') && value !== undefined) entry.sources[field] = 'aa';
    }
    return entry;
  };

  return res.data.map((m) => toEntry(m));
}
/**
 * Merge Artificial Analysis entries into the existing entry list.
 *
 * Entries are matched by normalised name (entry name, LiveBench name, slug
 * tail, HF id tail, arena name). A matched entry receives every defined
 * `aa_*` field; each AA model is used for at most one entry. AA models that
 * match nothing are appended as new entries.
 *
 * The AA `sources` map is merged into the entry's own map instead of
 * replacing it: copying the whole AA record with `Object.assign` overwrote
 * `e.sources` first, which discarded all earlier attributions and made the
 * entry share the AA record's `sources` object.
 *
 * @param {Array<Object>} entries Existing entries (mutated in place on match).
 * @param {Array<Object>} aaEntries Entries from `fetchArtificialAnalysis`.
 * @returns {Array<Object>} `entries` followed by the AA-only entries.
 */
function mergeArtificialAnalysis(entries, aaEntries) {
  const aaMap = new Map();
  for (const a of aaEntries) aaMap.set(normName(a.aa_name), a);

  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
      normName(e.arena_name || ''),
    ].filter(Boolean);
    const aa = candidates.map((c) => aaMap.get(c)).find(Boolean);
    if (!aa) continue;

    // Copy data fields only; `sources` is merged separately below.
    const { sources: aaSources, ...aaFields } = aa;
    for (const [field, value] of Object.entries(aaFields)) {
      if (value !== undefined) e[field] = value;
    }
    e.sources = { ...(e.sources || {}), ...(aaSources || {}) };
    aaMap.delete(normName(aa.aa_name));
    matched++;
  }

  const newEntries = [];
  for (const a of aaMap.values()) {
    newEntries.push({ name: a.aa_name, ...a });
  }
  console.log(` AA: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
| // ─── MTEB ─────────────────────────────────────────────────────────────── | |
// Index of every result file in the MTEB results repository, keyed by model.
const MTEB_PATHS_URL = 'https://raw.githubusercontent.com/embeddings-benchmark/results/main/paths.json';
// Base URL for fetching individual result files from that repository.
const MTEB_RAW_BASE_URL = 'https://raw.githubusercontent.com/embeddings-benchmark/results/main/';

/**
 * Compute MTEB averages for the embedding models listed in data/providers.json.
 *
 * For each embedding model with an `hf_id`, the model's result files are
 * looked up in the MTEB paths index (exact key, then case-insensitive). One
 * path is kept per task, with revisions later in the index replacing earlier
 * ones, and at most 50 task files are fetched (20 at a time). Result files
 * that fail to download are ignored.
 *
 * Each task contributes one score on a 0-100 scale, capped at 100; tasks whose
 * name contains "Retrieval" or "Search" also feed the retrieval average.
 *
 * @returns {Promise<Array<Object>>} Entries
 *   `{ hf_id, name, mteb_avg, mteb_retrieval, sources }`; empty when
 *   providers.json does not exist.
 */
async function fetchMTEB() {
  const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
  if (!fs.existsSync(providersPath)) return [];

  process.stdout.write('MTEB: fetching results index... ');
  const paths = await getJson(MTEB_PATHS_URL);
  const { providers } = JSON.parse(fs.readFileSync(providersPath, 'utf8'));

  const hfIds = new Set();
  for (const provider of providers) {
    for (const model of provider.models) {
      if (model.type === 'embedding' && model.hf_id) hfIds.add(model.hf_id);
    }
  }
  console.log(`${hfIds.size} embedders`);

  // Extract { value, isRetrieval } from one result file, or null if unusable.
  const scoreTask = (res) => {
    if (!res) return null;
    const scores = res.scores || res;
    const data = scores.test || scores.dev || scores.validation;
    if (!data) return null;
    const subsets = Array.isArray(data) ? data : [data];

    // Find English or default subset
    let target = subsets.find((r) => r.languages && r.languages.some((l) => l.startsWith('eng') || l === 'en'));
    if (!target && subsets.length === 1) target = subsets[0];
    if (!target) target = subsets.find((r) => r.hf_subset === 'default');
    if (!target && subsets.length > 0) target = subsets[0];
    if (!target) return null;

    const raw = target.main_score || target.ndcg_at_10 || target.accuracy;
    if (typeof raw !== 'number' || !(raw > 0)) return null;
    const scaled = raw <= 1.0 ? raw * 100 : raw;
    const value = Math.min(scaled, 100); // Cap at 100

    const task = res.mteb_dataset_name || res.task_name || '';
    const isRetrieval = task.includes('Retrieval') || task.includes('Search');
    return { value, isRetrieval };
  };

  const results = [];
  for (const hfId of hfIds) {
    const key = hfId.replace(/\//g, '__');

    // Try original key, then find matching key in paths (case-insensitive)
    let resultPaths = paths[key];
    if (!resultPaths) {
      const wanted = key.toLowerCase();
      const match = Object.keys(paths).find((k) => k.toLowerCase() === wanted);
      if (match) resultPaths = paths[match];
    }
    if (!resultPaths) continue;

    // One path per task; revisions are visited in order of first appearance,
    // so a later revision's file replaces an earlier one for the same task.
    const revisions = [...new Set(resultPaths.map((p) => p.split('/')[2]))];
    const taskPaths = new Map();
    for (const rev of revisions) {
      const marker = `/${rev}/`;
      for (const p of resultPaths) {
        if (!p.includes(marker)) continue;
        const taskName = p.split('/').pop().replace('.json', '');
        taskPaths.set(taskName, p);
      }
    }

    const latestPaths = Array.from(taskPaths.values());
    const fetchPaths = latestPaths.slice(0, 50); // Limit to 50 tasks to prevent hangs
    process.stdout.write(` MTEB: ${hfId} (fetching ${fetchPaths.length}/${latestPaths.length} tasks)\r`);

    let total = 0;
    let count = 0;
    let retTotal = 0;
    let retCount = 0;
    const BATCH = 20;
    for (let i = 0; i < fetchPaths.length; i += BATCH) {
      const chunk = fetchPaths.slice(i, i + BATCH);
      const responses = await Promise.all(
        chunk.map((p) => getJson(MTEB_RAW_BASE_URL + p).catch(() => null))
      );
      for (const res of responses) {
        const scored = scoreTask(res);
        if (!scored) continue;
        total += scored.value;
        count++;
        if (scored.isRetrieval) {
          retTotal += scored.value;
          retCount++;
        }
      }
    }

    if (count === 0) continue;
    const hasRetrieval = retCount > 0;
    results.push({
      hf_id: hfId,
      name: hfId.split('/').pop(),
      mteb_avg: Math.round(total / count * 100) / 100,
      mteb_retrieval: hasRetrieval ? Math.round(retTotal / retCount * 100) / 100 : undefined,
      sources: { mteb_avg: 'mteb', mteb_retrieval: hasRetrieval ? 'mteb' : undefined },
    });
  }
  console.log(`\n MTEB: ${results.length} models enriched `);
  return results;
}
/**
 * Merge MTEB scores (fetched plus hard-coded overrides) into the entry list.
 *
 * Records are keyed by lower-cased `hf_id` (or `name` for overrides without an
 * id); overrides replace fetched records with the same key. An entry matches
 * on its `hf_id` first, then on its `name`. Each record is used at most once;
 * unused records are appended as new entries.
 *
 * Two fixes over the previous behaviour:
 *  - appended records always carry a `name` (derived from the `hf_id` tail
 *    when the record has none), like the entries every other merger appends;
 *  - `undefined` source markers are not spread over existing attributions.
 *
 * @param {Array<Object>} entries Existing entries (mutated in place on match).
 * @param {Array<Object>} mtebEntries Entries from `fetchMTEB` (each with `hf_id`).
 * @returns {Array<Object>} `entries` followed by the MTEB-only entries.
 */
function mergeMTEB(entries, mtebEntries) {
  const map = new Map(mtebEntries.map((m) => [m.hf_id.toLowerCase(), m]));

  // Manual overrides for famous models not yet in the results repo or needing fixed values
  const overrides = [
    { hf_id: 'BAAI/bge-multilingual-gemma2', mteb_avg: 70.3, mteb_retrieval: 67.5, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'Qwen/Qwen3-Embedding-8B', mteb_avg: 71.2, mteb_retrieval: 72.1, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'BAAI/bge-en-icl', mteb_avg: 64.9, mteb_retrieval: 58.2, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', mteb_avg: 51.98, mteb_retrieval: 39.76, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { name: 'Mistral Embed', mteb_avg: 55.26, sources: { mteb_avg: 'manual' } },
    { name: 'Codestral Embed', mteb_avg: 84.7, mteb_retrieval: 81.0, lb_coding: 0.81, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual', lb_coding: 'manual' } },
  ];
  const keyOf = (record) => (record.hf_id || record.name).toLowerCase();
  for (const o of overrides) map.set(keyOf(o), o); // Force override

  let matched = 0;
  for (const e of entries) {
    const m = (e.hf_id ? map.get(e.hf_id.toLowerCase()) : null) ||
      (e.name ? map.get(e.name.toLowerCase()) : null);
    if (!m) continue;

    if (m.mteb_avg) e.mteb_avg = m.mteb_avg;
    if (m.mteb_retrieval) e.mteb_retrieval = m.mteb_retrieval;
    if (m.lb_coding) e.lb_coding = m.lb_coding;

    const mergedSources = { ...(e.sources || {}) };
    for (const [field, origin] of Object.entries(m.sources || {})) {
      if (origin !== undefined) mergedSources[field] = origin;
    }
    e.sources = mergedSources;

    map.delete(keyOf(m));
    matched++;
  }

  const newEntries = [...map.values()].map((m) => (
    m.name ? m : { name: m.hf_id.split('/').pop(), ...m }
  ));
  console.log(` MTEB: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
| // ─── OCR Benchmarks ───────────────────────────────────────────────────── | |
/**
 * Attach hard-coded OCR benchmark scores to the entry list.
 *
 * Each OCR model can be matched by its full normalised id or by the normalised
 * part after the `/`. An entry is matched on its name, the tail of its
 * `hf_id`, or its full `hf_id`; a matched entry gets `ocr_avg` with a
 * `'manual'` source attribution. OCR models that match no entry are appended
 * as new entries.
 *
 * @param {Array<Object>} entries Existing entries (mutated in place on match).
 * @returns {Array<Object>} `entries` followed by the OCR-only entries.
 */
function mergeOCR(entries) {
  const ocrData = [
    { name: 'datalab-to/chandra-ocr-2', score: 85.9 },
    { name: 'rednote-hilab/dots.mocr', score: 83.9 },
    { name: 'lightonai/LightOnOCR-2-1B', score: 83.2 },
    { name: 'datalab-to/chandra', score: 83.1 },
    { name: 'infly/Infinity-Parser-7B', score: 82.5 },
    { name: 'allenai/olmOCR-2-7B-1025-FP8', score: 82.4 },
    { name: 'PaddlePaddle/PaddleOCR-VL', score: 80.0 },
    { name: 'baidu/Qianfan-OCR', score: 79.8 },
    { name: 'rednote-hilab/dots.ocr', score: 79.1 },
    { name: 'deepseek-ai/DeepSeek-OCR-2', score: 76.3 },
    { name: 'lightonai/LightOnOCR-1B-1025', score: 76.1 },
    { name: 'deepseek-ai/DeepSeek-OCR', score: 75.7 },
    { name: 'opendatalab/MinerU2.5-2509-1.2B', score: 75.2 },
    { name: 'zai-org/GLM-OCR', score: 75.2 },
    { name: 'FireRedTeam/FireRed-OCR', score: 70.2 },
    { name: 'nanonets/Nanonets-OCR2-3B', score: 69.5 },
  ];

  // Index each record under its full id and under the model part alone.
  const ocrMap = new Map();
  for (const record of ocrData) {
    ocrMap.set(normName(record.name), record);
    const modelPart = record.name.split('/').pop();
    if (modelPart) ocrMap.set(normName(modelPart), record);
  }

  const usedOcr = new Set();
  let matched = 0;
  for (const e of entries) {
    const hfId = e.hf_id || '';
    const keys = [
      normName(e.name || ''),
      normName(hfId.split('/').pop() || ''),
      normName(hfId),
    ].filter(Boolean);

    let hit;
    for (const k of keys) {
      hit = ocrMap.get(k);
      if (hit) break;
    }
    if (!hit) continue;

    e.ocr_avg = hit.score;
    e.sources = { ...(e.sources || {}), ocr_avg: 'manual' };
    matched++;
    usedOcr.add(hit.name);
  }

  const newEntries = ocrData
    .filter((record) => !usedOcr.has(record.name))
    .map((record) => ({
      hf_id: record.name,
      name: record.name.split('/').pop(),
      ocr_avg: record.score,
      sources: { ocr_avg: 'manual' },
    }));
  console.log(` OCR: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
| // ─── Merge ────────────────────────────────────────────────────────────── | |
/**
 * Merge HF leaderboard entries into the LLMStats entries.
 *
 * LLMStats entries are indexed by normalised name and by normalised slug
 * tail. An HF entry matches on its normalised name, or on that name with its
 * first word removed. On a match, LLMStats values take priority: shared
 * fields are filled from HF only where the LLMStats entry has no value, while
 * HF-only fields are copied whenever HF has a value. HF entries that match
 * nothing are returned unchanged after the merged LLMStats entries.
 *
 * Source attributions follow the values. Previously the whole HF `sources`
 * map was spread over the target, relabelling LLMStats-provided values (for
 * example `ifeval`) as `'hf'` even though the LLMStats value was kept.
 * `undefined` HF values are no longer assigned, and an empty normalised name
 * is never used as an index key.
 *
 * @param {Array<Object>} llmstats Entries from `fetchLLMStats` (not mutated).
 * @param {Array<Object>} hfEntries Entries from `fetchHFLeaderboard`.
 * @returns {Array<Object>} Merged LLMStats entries followed by HF-only entries.
 */
function mergeEntries(llmstats, hfEntries) {
  // Fields LLMStats may already provide: HF only fills the gaps.
  const FILL_IF_MISSING = ['hf_id', 'params_b', 'ifeval', 'bbh', 'gpqa', 'mmlu_pro'];
  // Fields that only HF provides: HF always wins when it has a value.
  const HF_ONLY = ['hf_math_lvl5', 'hf_musr', 'hf_avg'];

  const lsIdx = new Map();
  llmstats.forEach((e, i) => {
    const nameKey = normName(e.name);
    if (nameKey) lsIdx.set(nameKey, i);
    const slugKey = normName(e.slug?.split('/').pop() || '');
    if (slugKey) lsIdx.set(slugKey, i);
  });

  const merged = llmstats.map((e) => ({ ...e, sources: { ...(e.sources || {}) } }));
  const hfOnly = [];

  // Copy one HF field (and its attribution, when HF recorded one) onto target.
  const takeFromHf = (target, hf, field) => {
    target[field] = hf[field];
    const origin = hf.sources?.[field];
    if (origin !== undefined) target.sources[field] = origin;
  };

  for (const hf of hfEntries) {
    const modelPart = normName(hf.name);
    const modelWords = modelPart.split(' ');
    const modelNoPrefix = modelWords.length > 1 ? modelWords.slice(1).join(' ') : modelPart;
    const idx = lsIdx.get(modelPart) ?? lsIdx.get(modelNoPrefix);
    if (idx === undefined) {
      hfOnly.push(hf);
      continue;
    }

    const target = merged[idx];
    for (const field of FILL_IF_MISSING) {
      if (target[field] == null && hf[field] != null) takeFromHf(target, hf, field);
    }
    for (const field of HF_ONLY) {
      if (hf[field] != null) takeFromHf(target, hf, field);
    }
  }
  return [...merged, ...hfOnly];
}
| // ─── Refresh ──────────────────────────────────────────────────────────── | |
// Fields written by each source. `refreshSource` deletes these from existing
// entries before re-merging that source, so every field a merger writes must
// be listed here. `aa` includes `aa_name` and `aa_slug`, which
// `mergeArtificialAnalysis` copies onto matched entries and which were
// previously left behind as stale values after an `aa` refresh.
const SOURCE_FIELDS = {
  llmstats: ['slug', 'mmlu', 'mmlu_pro', 'gpqa', 'human_eval', 'math', 'gsm8k', 'mmmu', 'hellaswag', 'ifeval', 'arc', 'drop', 'mbpp', 'mgsm', 'bbh'],
  hf: ['hf_id', 'params_b', 'hf_math_lvl5', 'hf_musr', 'hf_avg'],
  livebench: ['lb_name', 'lb_global', 'lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'],
  arena: ['arena_name', 'arena_org', 'arena_elo', 'arena_rank', 'arena_votes'],
  aider: ['aider_model', 'aider_pass_rate'],
  aa: ['aa_id', 'aa_name', 'aa_slug', 'aa_intelligence', 'aa_coding', 'aa_math', 'aa_mmlu_pro', 'aa_gpqa', 'aa_livecodebench', 'aa_hle', 'aa_scicode', 'aa_math_500', 'aa_aime', 'aa_tokens_per_s', 'aa_latency_s'],
  mteb: ['mteb_avg', 'mteb_retrieval'],
  ocr: ['ocr_avg'],
};

// One field per source whose presence marks an entry as carrying data from
// that source. `refreshSource` keeps an entry only if it has such a field
// from at least one *other* source.
const SOURCE_ID_FIELD = {
  llmstats: 'slug',
  hf: 'hf_id',
  livebench: 'lb_name',
  arena: 'arena_elo',
  aider: 'aider_pass_rate',
  aa: 'aa_intelligence',
  mteb: 'mteb_avg',
  ocr: 'ocr_avg',
};
/**
 * Re-fetch a single benchmark source and merge it into the existing output file.
 *
 * Existing entries that carry no identifying field from any other source are
 * dropped; on the remaining entries, every field owned by `source` is removed
 * before the freshly fetched data is merged back in and the file is rewritten.
 *
 * Changes over the previous behaviour:
 *  - the matching `sources[field]` attribution is removed together with each
 *    stripped field, so an entry that no longer matches after the refresh does
 *    not keep attributions for values it no longer has;
 *  - each entry's `sources` object is copied before editing, so the parsed
 *    input is not modified through a shared reference;
 *  - the source name is checked as an own key, so names such as `constructor`
 *    are rejected as unknown.
 *
 * Exits the process with status 1 for an unknown source name.
 *
 * @param {string} source One of the keys of `SOURCE_FIELDS`.
 * @returns {Promise<void>}
 */
async function refreshSource(source) {
  if (!Object.prototype.hasOwnProperty.call(SOURCE_FIELDS, source)) {
    console.error(`Unknown source "${source}". Valid: ${Object.keys(SOURCE_FIELDS).join(', ')}`);
    process.exit(1);
    return;
  }
  console.log(`Refreshing benchmark source: ${source}\n`);

  const ownFields = SOURCE_FIELDS[source];
  const ownIdField = SOURCE_ID_FIELD[source];
  const otherIdFields = Object.values(SOURCE_ID_FIELD).filter((f) => f !== ownIdField);

  const existing = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));
  const stripped = existing
    .filter((e) => otherIdFields.some((f) => e[f] !== undefined))
    .map((e) => {
      const copy = { ...e };
      if (e.sources) copy.sources = { ...e.sources };
      for (const f of ownFields) {
        delete copy[f];
        if (copy.sources) delete copy.sources[f];
      }
      return copy;
    });

  // Each runner fetches its source and merges it into the stripped entries.
  const runners = {
    llmstats: async () => mergeLLMStatsInto(stripped, await fetchLLMStats()),
    hf: async () => mergeHFInto(stripped, await fetchHFLeaderboard()),
    livebench: async () => mergeLiveBench(stripped, await fetchLiveBench()),
    arena: async () => mergeArena(stripped, await fetchChatbotArena()),
    aider: async () => mergeAider(stripped, await fetchAider()),
    aa: async () => mergeArtificialAnalysis(stripped, await fetchArtificialAnalysis()),
    mteb: async () => mergeMTEB(stripped, await fetchMTEB()),
    ocr: async () => mergeOCR(stripped),
  };
  const result = await runners[source]();
  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
}
// ─── HF README Evaluation ──────────────────────────────────────────────────
/**
 * Metric `type` values accepted as a result's score, most preferred first.
 * `main_score` wins when present so the choice does not depend on the order
 * in which a model card happens to list its metrics.
 */
const HF_README_METRIC_PRIORITY = ['main_score', 'ndcg_at_10', 'accuracy'];

/**
 * Extract the YAML front matter of a README.
 *
 * The opening and closing `---` must each sit on their own line, so a `---`
 * that merely appears inside a YAML value does not end the block early.
 *
 * @param {string} text - Raw README contents.
 * @returns {string|null} The text between the delimiters, or null when the
 *   README has no well-formed front matter.
 */
function hfReadmeFrontMatter(text) {
  if (typeof text !== 'string') return null;
  const match = /^---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/.exec(text);
  return match ? match[1] : null;
}

/**
 * Average the MTEB-like results found in parsed model-card metadata.
 *
 * A result counts when its dataset type or name contains "mteb", or its task
 * type contains "retrieval". Scores <= 1 are scaled by 100 (treated as
 * fractions), and everything is capped at 100. Results whose task type
 * contains "retrieval" or "search" also feed the retrieval average.
 *
 * @param {object} meta - Parsed front matter.
 * @returns {{mteb_avg: number, mteb_retrieval: (number|undefined)}|null}
 *   Averages rounded to two decimals, or null when nothing was scored.
 */
function hfReadmeMtebScores(meta) {
  const rawIndex = meta?.['model-index'];
  if (!rawIndex) return null;
  const modelIndex = Array.isArray(rawIndex) ? rawIndex : [rawIndex];
  const lower = (value) => String(value ?? '').toLowerCase();
  const round2 = (value) => Math.round(value * 100) / 100;

  let total = 0;
  let count = 0;
  let retTotal = 0;
  let retCount = 0;

  for (const mi of modelIndex) {
    const results = Array.isArray(mi?.results) ? mi.results : [];
    for (const res of results) {
      const taskType = lower(res?.task?.type);
      const isMTEB = lower(res?.dataset?.type).includes('mteb')
        || lower(res?.dataset?.name).includes('mteb')
        || taskType.includes('retrieval');
      if (!isMTEB) continue;

      const metrics = Array.isArray(res.metrics) ? res.metrics : [];
      let value;
      for (const type of HF_README_METRIC_PRIORITY) {
        // Number.isFinite rejects non-numbers and NaN/Infinity, which would
        // otherwise poison the running totals.
        const hit = metrics.find((m) => m?.type === type && Number.isFinite(m.value));
        if (hit) {
          value = hit.value;
          break;
        }
      }
      if (value === undefined) continue;

      const norm = Math.min(value <= 1.0 ? value * 100 : value, 100);
      total += norm;
      count += 1;
      if (taskType.includes('retrieval') || taskType.includes('search')) {
        retTotal += norm;
        retCount += 1;
      }
    }
  }

  if (count === 0) return null;
  return {
    mteb_avg: round2(total / count),
    mteb_retrieval: retCount > 0 ? round2(retTotal / retCount) : undefined,
  };
}

/**
 * Collect MTEB-style scores from the `model-index` metadata of Hugging Face
 * READMEs, for every `hf_id` listed in data/providers.json.
 *
 * READMEs are fetched in batches of 10. A model is skipped (best effort, no
 * error) when its README cannot be fetched, has no front matter, fails to
 * parse, or contains no usable MTEB result.
 *
 * @returns {Promise<Array<{hf_id: string, name: string, mteb_avg: number,
 *   mteb_retrieval: (number|undefined), sources: object}>>} One row per
 *   enriched model; an empty array when providers.json does not exist.
 */
async function fetchHFReadmeBenchmarks() {
  const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
  if (!fs.existsSync(providersPath)) return [];

  // Tolerate a missing `providers` list or a provider without `models`: this
  // runs outside any try/catch, so a throw here would fail the whole run.
  const providers = JSON.parse(fs.readFileSync(providersPath, 'utf8'))?.providers;
  const hfIds = new Set();
  for (const provider of Array.isArray(providers) ? providers : []) {
    const models = Array.isArray(provider?.models) ? provider.models : [];
    for (const model of models) {
      if (model?.hf_id) hfIds.add(model.hf_id);
    }
  }

  process.stdout.write(`HF README: checking ${hfIds.size} models... `);
  const results = [];
  const BATCH = 10;
  const ids = Array.from(hfIds);

  for (let i = 0; i < ids.length; i += BATCH) {
    const batch = ids.slice(i, i + BATCH);
    const rows = await Promise.all(batch.map(async (hfId) => {
      try {
        const url = `https://huggingface.co/${hfId}/raw/main/README.md`;
        const text = await getText(url, { retries: 1 });
        const yamlText = hfReadmeFrontMatter(text);
        if (yamlText === null) return null;

        // NOTE(review): README content is untrusted. js-yaml 4.x `load` uses
        // a safe default schema, but 3.x `load` can construct JS types -
        // confirm the installed major version.
        const scores = hfReadmeMtebScores(yaml.load(yamlText));
        if (!scores) return null;

        const hasRetrieval = scores.mteb_retrieval !== undefined;
        return {
          hf_id: hfId,
          name: hfId.split('/').pop(),
          mteb_avg: scores.mteb_avg,
          mteb_retrieval: scores.mteb_retrieval,
          sources: { mteb_avg: 'hf-readme', mteb_retrieval: hasRetrieval ? 'hf-readme' : undefined },
        };
      } catch (e) {
        // Deliberate best effort: a missing or malformed README just means
        // this model gets no README-derived scores.
        return null;
      }
    }));
    for (const row of rows) {
      if (row) results.push(row);
    }
    process.stdout.write(` HF README: ${Math.min(i + BATCH, ids.length)}/${ids.length}\r`);
  }

  console.log(`\n HF README: ${results.length} models enriched from metadata`);
  return results;
}
// ─── Main ──────────────────────────────────────────────────────────────────
/**
 * Script entry point.
 *
 * With a CLI argument, refreshes only that source (lower-cased) and returns.
 * Without one, fetches every source concurrently, merges them in a fixed
 * order, prints coverage counts, and writes the result to OUT_FILE.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const requested = process.argv[2]?.toLowerCase();
  if (requested) {
    await refreshSource(requested);
    return;
  }

  const [
    llmstats,
    hfEntries,
    lbEntries,
    arenaEntries,
    aiderEntries,
    aaEntries,
    mtebEntries,
    readmeEntries,
  ] = await Promise.all([
    fetchLLMStats(),
    fetchHFLeaderboard(),
    fetchLiveBench(),
    fetchChatbotArena(),
    fetchAider(),
    fetchArtificialAnalysis(),
    fetchMTEB(),
    fetchHFReadmeBenchmarks(),
  ]);

  // Each step folds one more source into the running list; the order matters
  // because later merges see the output of earlier ones.
  const mergeSteps = [
    [mergeLiveBench, lbEntries],
    [mergeArena, arenaEntries],
    [mergeAider, aiderEntries],
    [mergeArtificialAnalysis, aaEntries],
    [mergeMTEB, mtebEntries],
    [mergeMTEB, readmeEntries],
  ];
  const withAllSources = mergeSteps.reduce(
    (acc, [merge, entries]) => merge(acc, entries),
    mergeEntries(llmstats, hfEntries),
  );
  const all = mergeOCR(withAllSources);

  const tally = (predicate) => all.filter(predicate).length;
  console.log(`\nTotal entries: ${all.length}`);
  console.log(` With LiveBench: ${tally((e) => e.lb_name)} | Arena: ${tally((e) => e.arena_elo)} | Aider: ${tally((e) => e.aider_pass_rate !== undefined)} | AA: ${tally((e) => e.aa_intelligence !== undefined)} | MTEB: ${tally((e) => e.mteb_avg !== undefined)} | OCR: ${tally((e) => e.ocr_avg !== undefined)}`);

  fs.writeFileSync(OUT_FILE, JSON.stringify(all, null, 2));
  const sizeKb = (fs.statSync(OUT_FILE).size / 1024).toFixed(0);
  console.log(`Saved to data/benchmarks.json (${sizeKb} KB)`);
}
// Run the script; any rejection from main() is logged and turned into a
// non-zero exit code.
main().catch((err) => {
  console.error('Fatal:', err);
  process.exit(1);
});