// LLMProviders / scripts / fetch-benchmarks.js
// CrispStrobe
// feat: verbose HF validation, OCR benchmarks, and corrected model mappings
// 7cc1131
'use strict';
/**
* Fetch benchmark data from nine sources and merge into data/benchmarks.json.
*
* Sources:
* 1. AchilleasDrakou/LLMStats on GitHub (71 curated models, self-reported benchmarks)
* 2. open-llm-leaderboard/contents on Hugging Face (4500+ open models, standardised evals)
* 3. LiveBench (livebench.ai) — contamination-free, monthly, 70+ frontier models
* 4. Chatbot Arena (lmarena.ai) — 316 models with real ELO ratings from human votes
* 5. Aider (aider.chat) — code editing benchmark, 133 tasks per model
* 6. Artificial Analysis (artificialanalysis.ai) — independent evaluations and speed benchmarks
* 7. MTEB results (embeddings-benchmark/results on GitHub) — embedding benchmark scores
* 8. Hugging Face model-card README metadata (model-index) — self-reported MTEB scores
* 9. OCR benchmark scores — manually maintained table in mergeOCR()
*
* Unified field names (0-1 scale unless noted):
* mmlu, mmlu_pro, gpqa, human_eval, math, gsm8k, mmmu,
* hellaswag, ifeval, arc, drop, mbpp, mgsm, bbh (from LLMStats)
* hf_math_lvl5, hf_musr, hf_avg, params_b (HF-only)
* lb_name, lb_global, lb_reasoning, lb_coding, (LiveBench, 0-1)
* lb_math, lb_language, lb_if, lb_data_analysis
* arena_elo, arena_rank, arena_votes (Chatbot Arena; elo is raw ELO ~800-1500)
* aider_pass_rate (Aider edit bench, 0-1)
* aa_id, aa_intelligence, aa_mmlu_pro, aa_gpqa, (Artificial Analysis)
* aa_livecodebench, aa_tokens_per_s, aa_latency_s
*
* Where multiple sources have data for the same benchmark,
* LLMStats takes priority (it stores self-reported model-card values).
*
* Usage:
* node scripts/fetch-benchmarks.js # fetch all sources
* node scripts/fetch-benchmarks.js aa # refresh Artificial Analysis only
* node scripts/fetch-benchmarks.js livebench # refresh LiveBench only
*/
const fs = require('fs');
const path = require('path');
const yaml = require('js-yaml');
const { getJson, getText } = require('./fetch-utils');
const { loadEnv } = require('./load-env');
loadEnv();
const OUT_FILE = path.join(__dirname, '..', 'data', 'benchmarks.json');
// ─── helpers ────────────────────────────────────────────────────────────────
/**
 * Normalise a model name so that names from different sources can be compared.
 *
 * Lower-cases the input, turns `-`, `_` and `.` into spaces, drops every
 * character that is not a-z, 0-9 or a space, collapses whitespace runs and
 * trims the result. Falsy input yields an empty string.
 *
 * @param {string} [s] - Raw model name.
 * @returns {string} Normalised name (possibly empty).
 */
const normName = (s) => {
  const lowered = (s || '').toLowerCase();
  const separatorsAsSpaces = lowered.replace(/[-_.]/g, ' ');
  const alphanumericOnly = separatorsAsSpaces.replace(/[^a-z0-9 ]/g, '');
  return alphanumericOnly.replace(/\s+/g, ' ').trim();
};
// ─── LLMStats ───────────────────────────────────────────────────────────────
// GitHub git-trees API URL for the LLMStats repo (main branch, recursive listing);
// fetchLLMStats() filters this listing down to the models/**/model.json files.
const LLMSTATS_TREE = 'https://api.github.com/repos/AchilleasDrakou/LLMStats/git/trees/main?recursive=1';
// Base URL for raw file contents on the main branch; a repo-relative path from the tree is appended to it.
const LLMSTATS_RAW = 'https://raw.githubusercontent.com/AchilleasDrakou/LLMStats/main/';
// Unified benchmark field -> list of LLMStats `dataset_name` spellings that
// are accepted for that field. Matching is exact and case-sensitive.
const LLMSTATS_MAP = {
  mmlu: ['MMLU', 'MMLU Chat', 'MMLU-Base', 'MMLU (CoT)', 'Multilingual MMLU'],
  mmlu_pro: ['MMLU-Pro', 'MMLU-STEM', 'Multilingual MMLU-Pro'],
  gpqa: ['GPQA'],
  human_eval: ['HumanEval', 'Humaneval', 'HumanEval+', 'HumanEval-Average', 'Instruct HumanEval', 'MBPP EvalPlus', 'EvalPlus', 'Evalplus'],
  math: ['MATH', 'Math', 'MATH (CoT)', 'MATH-500', 'Functional_MATH', 'FunctionalMATH'],
  gsm8k: ['GSM8K', 'GSM-8K', 'GSM8k', 'GSM8K Chat', 'GSM-8K (CoT)'],
  mmmu: ['MMMU', 'MMMUval', 'MMMU-Pro'],
  hellaswag: ['HellaSwag', 'HellaSWAG', 'Hellaswag'],
  ifeval: ['IFEval', 'IF-Eval'],
  arc: ['ARC Challenge', 'ARC-C', 'ARC-c', 'ARC-e', 'ARC-Challenge', 'AI2 Reasoning Challenge (ARC)'],
  drop: ['DROP'],
  mbpp: ['MBPP', 'MBPP+', 'MBPP++', 'MBPP pass@1', 'MBPP EvalPlus (base)'],
  mgsm: ['MGSM', 'Multilingual MGSM', 'Multilingual MGSM (CoT)'],
  bbh: ['BBH', 'BigBench Hard CoT', 'BIG-Bench-Hard', 'BigBench-Hard', 'BIG-Bench Hard', 'BigBench_Hard'],
};

/**
 * Pick the unified benchmark scores out of an LLMStats `qualitative_metrics` list.
 *
 * For every unified field, the first metric (in list order) whose
 * `dataset_name` is one of the field's aliases provides the score; later
 * matches are ignored once a defined score has been recorded.
 *
 * @param {Array<{dataset_name: string, score: number}>} [qualitative_metrics]
 * @returns {Object<string, number>} Map of unified field name -> score.
 */
function extractLLMStatsMetrics(qualitative_metrics) {
  const collected = {};
  const fieldAliases = Object.entries(LLMSTATS_MAP);
  for (const metric of qualitative_metrics || []) {
    for (const [field, aliases] of fieldAliases) {
      if (collected[field] !== undefined) continue; // first defined score wins
      if (!aliases.includes(metric.dataset_name)) continue;
      collected[field] = metric.score;
    }
  }
  return collected;
}
/**
 * Download every `models/<slug>/model.json` file from the LLMStats repo and
 * convert each one into a benchmark entry.
 *
 * Files are fetched in batches of 10 concurrent requests. A file that fails to
 * download or parse is reported with a warning and skipped (best effort).
 *
 * @returns {Promise<Array<Object>>} Entries of the form
 *   `{ slug, name, ...scores, sources }` where `sources[field] === 'llmstats'`
 *   for every extracted score field.
 * @throws {Error} If the repository listing does not contain a `tree` array.
 */
async function fetchLLMStats() {
  process.stdout.write('LLMStats: fetching file list... ');
  const listing = await getJson(LLMSTATS_TREE);
  // The GitHub API answers with `{ message: ... }` instead of a tree on errors
  // (e.g. rate limiting); fail with a readable message instead of a TypeError.
  if (!listing || !Array.isArray(listing.tree)) {
    const detail = listing && listing.message ? ` (${listing.message})` : '';
    throw new Error(`LLMStats: unexpected repository listing${detail}`);
  }
  if (listing.truncated) {
    console.warn('\n ⚠ LLMStats: repository listing was truncated; some models may be missing');
  }
  const files = listing.tree.filter(
    (f) => f.type === 'blob' && f.path.startsWith('models/') && f.path.endsWith('/model.json')
  );
  console.log(`${files.length} models`);

  // Load one model file; resolves to null (after a warning) when it cannot be read.
  const loadModel = async (f) => {
    try {
      const data = await getJson(LLMSTATS_RAW + f.path);
      const slug = f.path.replace(/^models\//, '').replace(/\/model\.json$/, '');
      const metrics = extractLLMStatsMetrics(data.qualitative_metrics);
      const sources = {};
      for (const field of Object.keys(metrics)) sources[field] = 'llmstats';
      return { slug, name: data.name, ...metrics, sources };
    } catch (e) {
      console.warn(`\n ⚠ LLMStats ${f.path}: ${e.message}`);
      return null;
    }
  };

  const results = [];
  const BATCH = 10;
  for (let i = 0; i < files.length; i += BATCH) {
    const rows = await Promise.all(files.slice(i, i + BATCH).map((f) => loadModel(f)));
    for (const row of rows) {
      if (row) results.push(row);
    }
    process.stdout.write(` LLMStats: ${Math.min(i + BATCH, files.length)}/${files.length}\r`);
  }
  console.log(` LLMStats: ${results.length} entries fetched `);
  return results;
}
// ─── HF Leaderboard ─────────────────────────────────────────────────────────
// Dataset-viewer `/rows` endpoint for the Open LLM Leaderboard contents dataset.
const HF_ROWS_URL = 'https://datasets-server.huggingface.co/rows' +
  '?dataset=open-llm-leaderboard%2Fcontents&config=default&split=train';

/**
 * Fetch one slice of leaderboard rows.
 *
 * The page size is sent as the `length` query parameter, which is the name the
 * dataset viewer `/rows` endpoint documents for the slice size.
 *
 * @param {number} offset - Index of the first row to fetch.
 * @param {number} [limit=100] - Number of rows to request.
 * @returns {Promise<{rows: Object[], total: number}>} Unwrapped rows and the
 *   dataset's total row count as reported by the server.
 */
async function fetchHFPage(offset, limit = 100) {
  const data = await getJson(`${HF_ROWS_URL}&offset=${offset}&length=${limit}`);
  return { rows: data.rows.map((r) => r.row), total: data.num_rows_total };
}

/**
 * Download the whole Open LLM Leaderboard table and convert it to entries.
 *
 * The first full page doubles as the probe for the total row count, so no row
 * between the probe and the second page is skipped. Remaining pages are
 * fetched 5 at a time. Rows that are flagged, not available on the hub, or
 * lack a `fullname` are dropped.
 *
 * Score columns are copied only when truthy, so a missing value and an exact
 * 0 are both left out of the entry.
 *
 * @returns {Promise<Array<Object>>} Entries `{ hf_id, name, sources, ...scores }`
 *   with `sources[field] === 'hf'` for every copied field.
 */
async function fetchHFLeaderboard() {
  process.stdout.write('HF Leaderboard: probing total... ');
  const LIMIT = 100;
  const first = await fetchHFPage(0, LIMIT);
  const total = first.total;
  console.log(`${total} rows`);
  const pages = Math.ceil(total / LIMIT);
  const allRows = [...first.rows];
  // Fetch remaining pages in batches of 5 concurrent requests
  const CONCURRENT = 5;
  for (let p = 1; p < pages; p += CONCURRENT) {
    const batch = [];
    for (let q = p; q < Math.min(p + CONCURRENT, pages); q++) {
      batch.push(fetchHFPage(q * LIMIT, LIMIT));
    }
    const results = await Promise.all(batch);
    results.forEach((r) => allRows.push(...r.rows));
    const done = Math.min((p + CONCURRENT) * LIMIT, total);
    process.stdout.write(` HF: ${done}/${total}\r`);
  }
  console.log(` HF: ${total}/${total} — filtering... `);

  // Leaderboard column -> unified field name.
  const columns = [
    ['#Params (B)', 'params_b'],
    ['IFEval Raw', 'ifeval'],
    ['BBH Raw', 'bbh'],
    ['GPQA Raw', 'gpqa'],
    ['MMLU-PRO Raw', 'mmlu_pro'],
    ['MATH Lvl 5 Raw', 'hf_math_lvl5'],
    ['MUSR Raw', 'hf_musr'],
  ];
  // The Average column name has a Unicode emoji, so it is located by prefix.
  const AVG_KEY = allRows.length > 0
    ? Object.keys(allRows[0]).find((k) => k.startsWith('Average'))
    : undefined;
  if (AVG_KEY) columns.push([AVG_KEY, 'hf_avg']);

  const entries = allRows
    .filter((r) => r['Available on the hub'] && !r.Flagged && typeof r.fullname === 'string')
    .map((r) => {
      const entry = {
        hf_id: r.fullname,
        name: r.fullname.split('/').pop(),
        sources: {},
      };
      for (const [column, field] of columns) {
        if (!r[column]) continue;
        entry[field] = r[column];
        entry.sources[field] = 'hf';
      }
      return entry;
    });
  console.log(` HF: ${entries.length} entries after filtering`);
  return entries;
}
// ─── LiveBench ──────────────────────────────────────────────────────────────
// GitHub git-trees API URL for the LiveBench website repo (main branch, recursive);
// fetchLiveBench() scans it for public/table_<date>.csv files to discover release dates.
const LB_GITHUB_TREE = 'https://api.github.com/repos/LiveBench/livebench.github.io/git/trees/main?recursive=1';
// Site root from which fetchLiveBench() downloads categories_<date>.json and table_<date>.csv.
const LB_BASE_URL = 'https://livebench.ai';
// Matches one trailing LiveBench variant suffix (thinking mode, reasoning
// effort, "-base", "-nonreasoning", ...), optionally followed by an effort tag.
const LB_SUFFIX_RE = new RegExp(
  '(-thinking-(?:auto-)?(?:\\d+k-)?(?:(?:high|medium|low)-effort)?|' +
  '-thinking(?:-(?:64k|32k|auto|minimal))?|' +
  '-(?:high|medium|low)-effort|' +
  '-base|-non-?reasoning|-(?:high|low|min)thinking|-nothinking)' +
  '(?:-(?:high|medium|low)-effort)?$'
);

/**
 * Strip every trailing variant suffix from a LiveBench model name.
 *
 * The suffix pattern is applied repeatedly until the name stops changing, so
 * stacked suffixes are all removed.
 *
 * @param {string} name - LiveBench model name.
 * @returns {string} Name without variant suffixes.
 */
function lbBaseName(name) {
  let current = name;
  for (;;) {
    const shortened = current.replace(LB_SUFFIX_RE, '');
    if (shortened === current) return shortened;
    current = shortened;
  }
}
/**
 * Parse one LiveBench results table (CSV) into per-model entries.
 *
 * The first column is the model name and every other column is a task score
 * on a 0-100 scale, which is rescaled to 0-1. Task scores are averaged per
 * group (via `taskToGroup`) and overall (`lb_global`, across all numeric
 * columns). Both LF and CRLF line endings are accepted, header and model
 * cells are trimmed, and blank lines are ignored. Cells are split on plain
 * commas; quoted fields are not supported.
 *
 * @param {string} csvText - Raw CSV text.
 * @param {Object<string, string>} taskToGroup - Task column name -> group
 *   field name (`lb_reasoning`, `lb_coding`, ...).
 * @returns {Array<Object>} Entries `{ lb_name, lb_global, lb_reasoning, ...,
 *   sources }`; groups without any score are `undefined`, and `sources` only
 *   tags the defined `lb_*` fields with `'livebench'`.
 */
function parseLiveBenchCsv(csvText, taskToGroup) {
  const mean = (arr) => arr.reduce((a, b) => a + b, 0) / arr.length;
  const GROUP_FIELDS = ['lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'];

  // Split on LF or CRLF so a trailing '\r' never ends up in the last header.
  const lines = (csvText || '').split(/\r?\n/).filter((line) => line.trim() !== '');
  if (lines.length === 0) return [];
  const headers = lines[0].split(',').map((h) => h.trim());

  const entries = [];
  for (const line of lines.slice(1)) {
    const vals = line.split(',');
    const modelName = (vals[0] || '').trim();
    if (!modelName) continue;

    const taskScores = {};
    for (let i = 1; i < headers.length; i++) {
      const v = Number.parseFloat(vals[i]);
      if (!Number.isNaN(v)) taskScores[headers[i]] = v / 100;
    }

    const groupBuckets = {};
    for (const [task, group] of Object.entries(taskToGroup)) {
      if (taskScores[task] === undefined) continue;
      groupBuckets[group] = groupBuckets[group] || [];
      groupBuckets[group].push(taskScores[task]);
    }

    const allScores = Object.values(taskScores);
    const entry = {
      lb_name: modelName,
      lb_global: allScores.length ? mean(allScores) : undefined,
    };
    for (const field of GROUP_FIELDS) {
      entry[field] = groupBuckets[field] ? mean(groupBuckets[field]) : undefined;
    }
    entry.sources = {};
    for (const [field, value] of Object.entries(entry)) {
      if (field.startsWith('lb_') && value !== undefined) entry.sources[field] = 'livebench';
    }
    entries.push(entry);
  }
  return entries;
}
/**
 * Download every published LiveBench results table and return one entry per
 * model name.
 *
 * Release dates are discovered from the `public/table_<date>.csv` files in the
 * LiveBench website repo. The task -> category mapping of the latest release
 * is applied to all releases. Releases are processed oldest first, so for a
 * model present in several releases the entry from the newest one wins. A
 * release whose CSV cannot be downloaded is skipped with a warning.
 *
 * @returns {Promise<Array<Object>>} Entries as produced by parseLiveBenchCsv().
 * @throws {Error} If no release table is found in the repo listing.
 */
async function fetchLiveBench() {
  process.stdout.write('LiveBench: finding all releases... ');
  const tree = await getJson(LB_GITHUB_TREE);
  const dates = tree.tree
    .filter((f) => f.path.startsWith('public/table_') && f.path.endsWith('.csv'))
    .map((f) => f.path.replace('public/table_', '').replace('.csv', ''))
    .sort();
  // Without at least one release there is no categories file to request.
  if (dates.length === 0) {
    throw new Error('LiveBench: no public/table_*.csv releases found in repository listing');
  }
  const latest = dates[dates.length - 1];
  console.log(`${dates.length} releases (${dates[0]} → ${latest})`);

  // LiveBench category name -> unified group field.
  const categoryToGroup = new Map([
    ['Coding', 'lb_coding'],
    ['Agentic Coding', 'lb_coding'],
    ['Reasoning', 'lb_reasoning'],
    ['Mathematics', 'lb_math'],
    ['Language', 'lb_language'],
    ['IF', 'lb_if'],
    ['Data Analysis', 'lb_data_analysis'],
  ]);
  const cats = await getJson(`${LB_BASE_URL}/categories_${latest}.json`);
  const taskToGroup = {};
  for (const [cat, tasks] of Object.entries(cats)) {
    const group = categoryToGroup.get(cat);
    if (!group) continue; // categories outside the table above are not grouped
    for (const t of tasks) taskToGroup[t] = group;
  }

  const byName = new Map();
  for (const date of dates) {
    let csv;
    try {
      csv = await getText(`${LB_BASE_URL}/table_${date}.csv`);
    } catch (e) {
      console.warn(`\n ⚠ LiveBench ${date}: ${e.message}`);
      continue;
    }
    // Later (newer) releases overwrite earlier entries for the same model name.
    for (const entry of parseLiveBenchCsv(csv, taskToGroup)) byName.set(entry.lb_name, entry);
    process.stdout.write(` LiveBench: ${date}\r`);
  }
  const entries = [...byName.values()];
  console.log(` LiveBench: ${entries.length} unique models across all releases`);
  return entries;
}
/**
 * Attach LiveBench scores to existing entries and append unmatched LiveBench
 * models as new entries.
 *
 * Matching uses normalised names. Each entry is tried by `name`, then the last
 * segment of `slug`, then the last segment of `hf_id`; every candidate is
 * looked up first among exact LiveBench names, then among suffix-stripped base
 * names (where the variant with the highest `lb_global` represents the base).
 * Matched entries are mutated in place: all `lb_*` fields are copied and the
 * LiveBench provenance tags are merged into the entry's existing `sources`
 * without discarding tags from other sources.
 *
 * @param {Array<Object>} entries - Existing entries (mutated on match).
 * @param {Array<Object>} lbEntries - Entries from fetchLiveBench().
 * @returns {Array<Object>} `entries` followed by the new LiveBench-only
 *   entries (at most one per base name).
 */
function mergeLiveBench(entries, lbEntries) {
  const exactMap = new Map();
  const baseMap = new Map();
  for (const lb of lbEntries) {
    const exact = normName(lb.lb_name);
    exactMap.set(exact, lb);
    const base = normName(lbBaseName(lb.lb_name));
    if (base !== exact) {
      const prev = baseMap.get(base);
      if (!prev || (lb.lb_global || 0) > (prev.lb_global || 0)) baseMap.set(base, lb);
    }
  }

  const usedLbNames = new Set();
  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
    ].filter(Boolean);
    let lb = null;
    for (const c of candidates) {
      lb = exactMap.get(c) || baseMap.get(c);
      if (lb) break;
    }
    if (!lb) continue;
    // Copy the score fields separately from `sources`: assigning the whole
    // object would replace e.sources before the two tag maps can be merged.
    const { sources: lbSources, ...lbFields } = lb;
    Object.assign(e, lbFields);
    e.sources = { ...(e.sources || {}), ...(lbSources || {}) };
    usedLbNames.add(lb.lb_name);
    matched++;
  }

  const usedBases = new Set([...usedLbNames].map((n) => normName(lbBaseName(n))));
  const newEntries = [];
  for (const lb of lbEntries) {
    if (usedLbNames.has(lb.lb_name)) continue;
    const base = normName(lbBaseName(lb.lb_name));
    if (usedBases.has(base)) continue;
    // Only the representative of a base name (or of an exact name) becomes a new entry.
    if (baseMap.get(base) === lb || exactMap.get(normName(lb.lb_name)) === lb) {
      newEntries.push({ name: lbBaseName(lb.lb_name), ...lb });
      usedBases.add(base);
    }
  }
  console.log(` LiveBench: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
// ─── Chatbot Arena ──────────────────────────────────────────────────────────
/**
 * Return the balanced JSON array/object that starts at `start` in `text`.
 *
 * Brackets inside JSON string literals (including escaped quotes) are ignored,
 * so a value such as `"name ] }"` does not end the scan early.
 *
 * @param {string} text - Text containing the JSON value.
 * @param {number} start - Index of the opening `[` or `{`.
 * @returns {string|null} The JSON substring, or null if it never closes.
 */
function sliceBalancedJson(text, start) {
  let depth = 0;
  let inString = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (ch === '\\') i++; // skip the escaped character
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '[' || ch === '{') {
      depth++;
    } else if (ch === ']' || ch === '}') {
      depth--;
      if (depth === 0) return text.substring(start, i + 1);
    }
  }
  return null;
}

/**
 * Fetch the Chatbot Arena text leaderboard from the site's RSC payload.
 *
 * The payload is scanned line by line for the first line that contains both
 * `"entries":[` and `"rating":` and whose entries array can be extracted as a
 * balanced JSON array.
 *
 * @returns {Promise<Array<Object>>} Entries `{ arena_name, arena_org,
 *   arena_elo, arena_rank, arena_votes, sources }` where `sources` tags every
 *   defined `arena_*` field with `'arena'`.
 * @throws {Error} If no entries array is found in the payload.
 */
async function fetchChatbotArena() {
  process.stdout.write('Chatbot Arena: fetching RSC leaderboard... ');
  const text = await getText('https://lmarena.ai/en/leaderboard/text', {
    headers: { 'User-Agent': 'Mozilla/5.0', 'RSC': '1', 'Accept': 'text/x-component' },
  });
  const MARKER = '"entries":[';
  let entries = null;
  for (const line of text.split('\n')) {
    if (!line.includes(MARKER) || !line.includes('"rating":')) continue;
    // Position of the '[' that opens the entries array.
    const start = line.indexOf(MARKER) + MARKER.length - 1;
    const json = sliceBalancedJson(line, start);
    if (json === null) continue; // unterminated array on this line; keep looking
    entries = JSON.parse(json);
    break;
  }
  if (!entries) throw new Error('Could not find entries in RSC payload');
  console.log(`${entries.length} models`);
  return entries.map((e) => {
    const entry = {
      arena_name: e.modelDisplayName,
      arena_org: e.modelOrganization,
      arena_elo: e.rating,
      arena_rank: e.rank,
      arena_votes: e.votes,
      sources: {},
    };
    for (const [field, value] of Object.entries(entry)) {
      if (field.startsWith('arena_') && value !== undefined) entry.sources[field] = 'arena';
    }
    return entry;
  });
}
/**
 * Attach Chatbot Arena ratings to existing entries and append unmatched Arena
 * models as new entries.
 *
 * Each entry is tried by the normalised `name`, `lb_name`, last `slug` segment
 * and last `hf_id` segment. Empty candidates are discarded so that an entry
 * with a missing field can never match an Arena model whose name normalises to
 * an empty string. A matched Arena model is consumed and cannot match again.
 *
 * @param {Array<Object>} entries - Existing entries (mutated on match:
 *   `arena_elo`, `arena_rank`, `arena_votes` and merged `sources`).
 * @param {Array<Object>} arenaEntries - Entries from fetchChatbotArena().
 * @returns {Array<Object>} `entries` followed by `{ name, ...arenaEntry }`
 *   for every Arena model left unmatched.
 */
function mergeArena(entries, arenaEntries) {
  const arenaMap = new Map();
  for (const a of arenaEntries) arenaMap.set(normName(a.arena_name), a);

  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
    ].filter(Boolean);
    const a = candidates.map((c) => arenaMap.get(c)).find(Boolean);
    if (!a) continue;
    e.arena_elo = a.arena_elo;
    e.arena_rank = a.arena_rank;
    e.arena_votes = a.arena_votes;
    e.sources = { ...(e.sources || {}), ...(a.sources || {}) };
    arenaMap.delete(normName(a.arena_name));
    matched++;
  }

  const newEntries = [];
  for (const a of arenaMap.values()) newEntries.push({ name: a.arena_name, ...a });
  console.log(` Arena: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
// ─── Aider ──────────────────────────────────────────────────────────────────
// Raw YAML of Aider's code-editing leaderboard in the Aider-AI/aider repo.
const AIDER_RAW = 'https://raw.githubusercontent.com/Aider-AI/aider/main/aider/website/_data/edit_leaderboard.yml';

/**
 * Fetch the Aider edit leaderboard and keep the best run per model.
 *
 * Rows without a `model` or without `pass_rate_1` are ignored. Rows are
 * grouped by normalised model name and, within a group, the row with the
 * highest `pass_rate_1` is kept (the earliest one on ties). The pass rate is
 * divided by 100.
 *
 * @returns {Promise<Array<Object>>} Entries `{ aider_model, aider_pass_rate,
 *   sources: { aider_pass_rate: 'aider' } }` in first-seen model order.
 */
async function fetchAider() {
  process.stdout.write('Aider: fetching edit leaderboard... ');
  const rows = yaml.load(await getText(AIDER_RAW));

  const bestByModel = new Map();
  for (const row of rows) {
    if (!row.model || row.pass_rate_1 === undefined) continue;
    const key = normName(row.model);
    const current = bestByModel.get(key);
    // Keep the current holder unless this row is strictly better.
    if (current && !(row.pass_rate_1 > current.pass_rate_1)) continue;
    bestByModel.set(key, row);
  }

  const entries = Array.from(bestByModel.values(), (row) => ({
    aider_model: row.model,
    aider_pass_rate: row.pass_rate_1 / 100,
    sources: { aider_pass_rate: 'aider' },
  }));
  console.log(`${entries.length} models (best run each)`);
  return entries;
}
/**
 * Attach Aider pass rates to existing entries and append unmatched Aider
 * models as new entries.
 *
 * Each entry is tried by the normalised `name`, `lb_name`, last `slug`
 * segment, last `hf_id` segment and `arena_name`. Empty candidates are
 * discarded so that an entry with a missing field can never match an Aider
 * model whose name normalises to an empty string. A matched Aider model is
 * consumed and cannot match again.
 *
 * @param {Array<Object>} entries - Existing entries (mutated on match:
 *   `aider_pass_rate` and merged `sources`).
 * @param {Array<Object>} aiderEntries - Entries from fetchAider().
 * @returns {Array<Object>} `entries` followed by `{ name, ...aiderEntry }`
 *   for every Aider model left unmatched.
 */
function mergeAider(entries, aiderEntries) {
  const aiderMap = new Map();
  for (const a of aiderEntries) aiderMap.set(normName(a.aider_model), a);

  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
      normName(e.arena_name || ''),
    ].filter(Boolean);
    const a = candidates.map((c) => aiderMap.get(c)).find(Boolean);
    if (!a) continue;
    e.aider_pass_rate = a.aider_pass_rate;
    e.sources = { ...(e.sources || {}), ...(a.sources || {}) };
    aiderMap.delete(normName(a.aider_model));
    matched++;
  }

  const newEntries = [];
  for (const a of aiderMap.values()) newEntries.push({ name: a.aider_model, ...a });
  console.log(` Aider: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
// ─── Artificial Analysis ────────────────────────────────────────────────────
/**
 * Fetch model evaluations and speed figures from the Artificial Analysis API.
 *
 * Requires the `ARTIFICIAL_ANALYSIS_API_KEY` environment variable; without it
 * the source is skipped and an empty list is returned.
 *
 * @returns {Promise<Array<Object>>} One entry per model with `aa_*` fields
 *   (absent values stay `undefined`) and a `sources` map tagging every defined
 *   `aa_*` field with `'aa'`.
 * @throws {Error} If the response has no `data` property.
 */
async function fetchArtificialAnalysis() {
  const apiKey = process.env.ARTIFICIAL_ANALYSIS_API_KEY;
  if (!apiKey) {
    console.log('Artificial Analysis: skipping (no API key found)');
    return [];
  }

  process.stdout.write('Artificial Analysis: fetching benchmarks... ');
  const requestOptions = { headers: { 'x-api-key': apiKey } };
  const res = await getJson('https://artificialanalysis.ai/api/v2/data/llms/models', requestOptions);
  if (!res.data) throw new Error('Invalid response from Artificial Analysis API');
  console.log(`${res.data.length} models`);

  return res.data.map((model) => {
    const scores = model.evaluations || {};
    const fields = {
      aa_id: model.id,
      aa_name: model.name,
      aa_slug: model.slug,
      aa_intelligence: scores.artificial_analysis_intelligence_index, // 0-100
      aa_coding: scores.artificial_analysis_coding_index, // 0-100
      aa_math: scores.artificial_analysis_math_index, // 0-100
      aa_mmlu_pro: scores.mmlu_pro, // 0-1
      aa_gpqa: scores.gpqa, // 0-1
      aa_livecodebench: scores.livecodebench, // 0-1
      aa_hle: scores.hle,
      aa_scicode: scores.scicode,
      aa_math_500: scores.math_500,
      aa_aime: scores.aime,
      aa_tokens_per_s: model.median_output_tokens_per_second,
      aa_latency_s: model.median_time_to_first_token_seconds,
    };
    const sources = {};
    for (const [field, value] of Object.entries(fields)) {
      if (field.startsWith('aa_') && value !== undefined) sources[field] = 'aa';
    }
    return { ...fields, sources };
  });
}
/**
 * Attach Artificial Analysis data to existing entries and append unmatched
 * Artificial Analysis models as new entries.
 *
 * Each entry is tried by the normalised `name`, `lb_name`, last `slug`
 * segment, last `hf_id` segment and `arena_name` (empty candidates are
 * skipped). Matched entries are mutated in place: all `aa_*` fields are copied
 * and the `'aa'` provenance tags are merged into the entry's existing
 * `sources` without discarding tags from other sources. A matched model is
 * consumed and cannot match again.
 *
 * @param {Array<Object>} entries - Existing entries (mutated on match).
 * @param {Array<Object>} aaEntries - Entries from fetchArtificialAnalysis().
 * @returns {Array<Object>} `entries` followed by `{ name, ...aaEntry }` for
 *   every model left unmatched.
 */
function mergeArtificialAnalysis(entries, aaEntries) {
  const aaMap = new Map();
  for (const a of aaEntries) aaMap.set(normName(a.aa_name), a);

  let matched = 0;
  for (const e of entries) {
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
      normName(e.arena_name || ''),
    ].filter(Boolean);
    const aa = candidates.map((c) => aaMap.get(c)).find(Boolean);
    if (!aa) continue;
    // Copy the data fields separately from `sources`: assigning the whole
    // object would replace e.sources before the two tag maps can be merged.
    const { sources: aaSources, ...aaFields } = aa;
    Object.assign(e, aaFields);
    e.sources = { ...(e.sources || {}), ...(aaSources || {}) };
    aaMap.delete(normName(aa.aa_name));
    matched++;
  }

  const newEntries = [];
  for (const a of aaMap.values()) {
    newEntries.push({ name: a.aa_name, ...a });
  }
  console.log(` AA: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
// ─── MTEB ───────────────────────────────────────────────────────────────────
// Index file of the embeddings-benchmark/results repo; fetchMTEB() looks up result-file paths in it by model key.
const MTEB_PATHS_URL = 'https://raw.githubusercontent.com/embeddings-benchmark/results/main/paths.json';
// Base URL for raw files in the same repo; a path taken from paths.json is appended to it.
const MTEB_RAW_BASE_URL = 'https://raw.githubusercontent.com/embeddings-benchmark/results/main/';
/**
 * Compute MTEB averages for the embedding models listed in data/providers.json.
 *
 * Only models with `type === 'embedding'` and an `hf_id` are considered. For
 * each one, up to 50 task result files are downloaded from the MTEB results
 * repo; one score per task is normalised to a 0-100 scale and averaged into
 * `mteb_avg`. Tasks whose name contains "Retrieval" or "Search" are also
 * averaged into `mteb_retrieval`. Result files that fail to download are
 * silently ignored (best effort).
 *
 * @returns {Promise<Array<Object>>} Entries `{ hf_id, name, mteb_avg,
 *   mteb_retrieval, sources }`; an empty list if providers.json does not exist.
 */
async function fetchMTEB() {
const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
if (!fs.existsSync(providersPath)) return [];
process.stdout.write('MTEB: fetching results index... ');
const paths = await getJson(MTEB_PATHS_URL);
const providers = JSON.parse(fs.readFileSync(providersPath, 'utf8')).providers;
// Unique Hugging Face ids of all embedding models across providers.
const hfIds = new Set();
providers.forEach(p => p.models.forEach(m => { if (m.type === 'embedding' && m.hf_id) hfIds.add(m.hf_id); }));
console.log(`${hfIds.size} embedders`);
const results = [];
for (const hfId of hfIds) {
// paths.json keys use "__" where the hf_id has "/".
const key = hfId.replace(/\//g, '__');
// Try original key, then find matching key in paths (case-insensitive)
let resultPaths = paths[key];
if (!resultPaths) {
const match = Object.keys(paths).find(k => k.toLowerCase() === key.toLowerCase());
if (match) resultPaths = paths[match];
}
if (!resultPaths) continue;
// The third path segment is treated as the revision — presumably paths look like
// results/<model>/<revision>/<task>.json; TODO confirm against paths.json.
const revisions = [...new Set(resultPaths.map(p => p.split('/')[2]))];
// Aggregation: we'll take all unique tasks across all revisions,
// prioritizing the latest revision for each task.
// NOTE(review): "latest" here means the revision that appears last in paths.json
// order; nothing in this function sorts revisions chronologically — confirm.
const taskPaths = new Map();
revisions.forEach(rev => {
const pathsInRev = resultPaths.filter(p => p.includes(`/${rev}/`));
pathsInRev.forEach(p => {
const taskName = p.split('/').pop().replace('.json', '');
taskPaths.set(taskName, p);
});
});
const latestPaths = [...taskPaths.values()];
const fetchPaths = latestPaths.slice(0, 50); // Limit to 50 tasks to prevent hangs
process.stdout.write(` MTEB: ${hfId} (fetching ${fetchPaths.length}/${latestPaths.length} tasks)\r`);
// Running sums/counts for the overall average and the retrieval-only average.
let total = 0, count = 0, retTotal = 0, retCount = 0;
const BATCH = 20;
for (let i = 0; i < fetchPaths.length; i += BATCH) {
// A failed download resolves to null and is skipped below.
const batch = await Promise.all(fetchPaths.slice(i, i + BATCH).map(p => getJson(MTEB_RAW_BASE_URL + p).catch(() => null)));
batch.forEach(res => {
if (!res) return;
// Scores may be nested under `scores` or sit at the top level; prefer test, then dev, then validation.
const scores = res.scores || res;
const data = scores.test || scores.dev || scores.validation;
if (!data) return;
const arr = Array.isArray(data) ? data : [data];
// Find English or default subset
let targetRes = arr.find(r => r.languages && r.languages.some(l => l.startsWith('eng') || l === 'en'));
if (!targetRes && arr.length === 1) targetRes = arr[0];
if (!targetRes) targetRes = arr.find(r => r.hf_subset === 'default');
if (!targetRes && arr.length > 0) targetRes = arr[0];
if (targetRes) {
// First truthy of main_score, ndcg_at_10, accuracy; zero or negative scores are ignored.
const s = targetRes.main_score || targetRes.ndcg_at_10 || targetRes.accuracy;
if (typeof s === 'number' && s > 0) {
// Values up to 1.0 are taken to be fractions and rescaled to 0-100.
let norm = s <= 1.0 ? s * 100 : s;
if (norm > 100) norm = 100; // Cap at 100
total += norm; count++;
const task = res.mteb_dataset_name || res.task_name || '';
if (task.includes('Retrieval') || task.includes('Search')) { retTotal += norm; retCount++; }
}
}
});
}
if (count > 0) {
// Averages are rounded to two decimals.
results.push({
hf_id: hfId,
name: hfId.split('/').pop(),
mteb_avg: Math.round(total / count * 100) / 100,
mteb_retrieval: retCount > 0 ? Math.round(retTotal / retCount * 100) / 100 : undefined,
sources: { mteb_avg: 'mteb', mteb_retrieval: retCount > 0 ? 'mteb' : undefined }
});
}
}
console.log(`\n MTEB: ${results.length} models enriched `);
return results;
}
/**
 * Attach MTEB scores (plus a few manual overrides) to existing entries and
 * append unmatched MTEB records as new entries.
 *
 * Records are keyed by lower-cased `hf_id`, or by lower-cased `name` for
 * overrides that have no `hf_id`. Manual overrides always replace fetched
 * records with the same key. An entry matches by `hf_id` first, then by
 * `name`; a matched record is consumed and cannot match again.
 *
 * Only truthy values (`mteb_avg`, `mteb_retrieval`, `lb_coding`) are copied,
 * and a provenance tag is written only for a field that was actually copied,
 * so an absent score never erases an existing value's tag.
 *
 * @param {Array<Object>} entries - Existing entries (mutated on match).
 * @param {Array<Object>} mtebEntries - Records with `hf_id`, e.g. from
 *   fetchMTEB() or fetchHFReadmeBenchmarks().
 * @returns {Array<Object>} `entries` followed by the unmatched records; a
 *   record without a `name` gets one derived from the last `hf_id` segment.
 */
function mergeMTEB(entries, mtebEntries) {
  const keyOf = (m) => (m.hf_id || m.name).toLowerCase();
  const map = new Map(mtebEntries.map((m) => [m.hf_id.toLowerCase(), m]));

  // Manual overrides for famous models not yet in the results repo or needing fixed values
  const overrides = [
    { hf_id: 'BAAI/bge-multilingual-gemma2', mteb_avg: 70.3, mteb_retrieval: 67.5, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'Qwen/Qwen3-Embedding-8B', mteb_avg: 71.2, mteb_retrieval: 72.1, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'BAAI/bge-en-icl', mteb_avg: 64.9, mteb_retrieval: 58.2, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { hf_id: 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', mteb_avg: 51.98, mteb_retrieval: 39.76, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
    { name: 'Mistral Embed', mteb_avg: 55.26, sources: { mteb_avg: 'manual' } },
    { name: 'Codestral Embed', mteb_avg: 84.7, mteb_retrieval: 81.0, lb_coding: 0.81, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual', lb_coding: 'manual' } },
  ];
  for (const o of overrides) map.set(keyOf(o), o); // Force override

  const COPIED_FIELDS = ['mteb_avg', 'mteb_retrieval', 'lb_coding'];
  let matched = 0;
  for (const e of entries) {
    const m = (e.hf_id ? map.get(e.hf_id.toLowerCase()) : null) ||
      (e.name ? map.get(e.name.toLowerCase()) : null);
    if (!m) continue;
    const sources = { ...(e.sources || {}) };
    for (const field of COPIED_FIELDS) {
      if (!m[field]) continue;
      e[field] = m[field];
      const tag = m.sources ? m.sources[field] : undefined;
      if (tag !== undefined) sources[field] = tag;
    }
    e.sources = sources;
    map.delete(keyOf(m));
    matched++;
  }

  // Every appended entry carries a name, like the new entries of the other merges.
  const newEntries = [...map.values()].map(
    (m) => (m.name ? m : { ...m, name: m.hf_id.split('/').pop() })
  );
  console.log(` MTEB: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
// ─── OCR Benchmarks ─────────────────────────────────────────────────────────
/**
 * Attach manually maintained OCR benchmark scores to existing entries and
 * append the OCR models that matched nothing as new entries.
 *
 * A score row can be found under its normalised full id and under its
 * normalised model part (text after the last `/`). Each entry is tried by
 * `name`, then the last `hf_id` segment, then the full `hf_id`. A score row
 * may match several entries.
 *
 * @param {Array<Object>} entries - Existing entries (mutated on match:
 *   `ocr_avg` and `sources.ocr_avg = 'manual'`).
 * @returns {Array<Object>} `entries` followed by `{ hf_id, name, ocr_avg,
 *   sources }` for every unused score row, in table order.
 */
function mergeOCR(entries) {
  const ocrData = [
    { name: 'datalab-to/chandra-ocr-2', score: 85.9 },
    { name: 'rednote-hilab/dots.mocr', score: 83.9 },
    { name: 'lightonai/LightOnOCR-2-1B', score: 83.2 },
    { name: 'datalab-to/chandra', score: 83.1 },
    { name: 'infly/Infinity-Parser-7B', score: 82.5 },
    { name: 'allenai/olmOCR-2-7B-1025-FP8', score: 82.4 },
    { name: 'PaddlePaddle/PaddleOCR-VL', score: 80.0 },
    { name: 'baidu/Qianfan-OCR', score: 79.8 },
    { name: 'rednote-hilab/dots.ocr', score: 79.1 },
    { name: 'deepseek-ai/DeepSeek-OCR-2', score: 76.3 },
    { name: 'lightonai/LightOnOCR-1B-1025', score: 76.1 },
    { name: 'deepseek-ai/DeepSeek-OCR', score: 75.7 },
    { name: 'opendatalab/MinerU2.5-2509-1.2B', score: 75.2 },
    { name: 'zai-org/GLM-OCR', score: 75.2 },
    { name: 'FireRedTeam/FireRed-OCR', score: 70.2 },
    { name: 'nanonets/Nanonets-OCR2-3B', score: 69.5 },
  ];
  const lastSegment = (id) => id.split('/').pop();

  // Lookup by normalised full id and by normalised model part.
  const ocrMap = new Map();
  for (const row of ocrData) {
    ocrMap.set(normName(row.name), row);
    const modelPart = lastSegment(row.name);
    if (modelPart) ocrMap.set(normName(modelPart), row);
  }

  const usedOcr = new Set();
  let matched = 0;
  for (const e of entries) {
    const keys = [e.name || '', lastSegment(e.hf_id || '') || '', e.hf_id || '']
      .map((raw) => normName(raw))
      .filter(Boolean);
    const hit = keys.map((k) => ocrMap.get(k)).find(Boolean);
    if (!hit) continue;
    e.ocr_avg = hit.score;
    e.sources = { ...(e.sources || {}), ocr_avg: 'manual' };
    matched += 1;
    usedOcr.add(hit.name);
  }

  const newEntries = ocrData
    .filter((row) => !usedOcr.has(row.name))
    .map((row) => ({
      hf_id: row.name,
      name: lastSegment(row.name),
      ocr_avg: row.score,
      sources: { ocr_avg: 'manual' },
    }));
  console.log(` OCR: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
// ─── Merge ──────────────────────────────────────────────────────────────────
/**
 * Merge LLMStats entries with HF Leaderboard entries.
 *
 * An HF row matches an LLMStats entry when its normalised model name equals
 * the entry's normalised `name` or last `slug` segment, either as-is or after
 * dropping the first word of the HF name. LLMStats values take priority: an
 * HF value only fills a field that is still empty on the target, and the
 * provenance tag is changed only for fields actually taken from HF. When
 * several HF rows match the same LLMStats entry, the first row supplies
 * `hf_id` and later rows only fill remaining gaps.
 *
 * @param {Array<Object>} llmstats - Entries from fetchLLMStats() (not mutated).
 * @param {Array<Object>} hfEntries - Entries from fetchHFLeaderboard().
 * @returns {Array<Object>} Merged copies of the LLMStats entries followed by
 *   the HF entries that matched nothing.
 */
function mergeEntries(llmstats, hfEntries) {
  // Normalised LLMStats name / slug model part -> index into `merged`.
  const lsIdx = new Map();
  llmstats.forEach((e, i) => {
    const nameKey = normName(e.name);
    if (nameKey) lsIdx.set(nameKey, i);
    const slugModel = e.slug?.split('/').pop() || '';
    if (slugModel) lsIdx.set(normName(slugModel), i);
  });

  const merged = llmstats.map((e) => ({ ...e, sources: { ...(e.sources || {}) } }));
  const hfOnly = [];
  const FILL_FIELDS = ['params_b', 'ifeval', 'bbh', 'gpqa', 'mmlu_pro', 'hf_math_lvl5', 'hf_musr', 'hf_avg'];

  for (const hf of hfEntries) {
    const modelPart = normName(hf.name);
    const modelWords = modelPart.split(' ');
    const modelNoPrefix = modelWords.length > 1 ? modelWords.slice(1).join(' ') : modelPart;
    const idx = lsIdx.get(modelPart) ?? lsIdx.get(modelNoPrefix);
    if (idx === undefined) {
      hfOnly.push(hf);
      continue;
    }
    const target = merged[idx];
    if (!target.hf_id) target.hf_id = hf.hf_id;
    for (const field of FILL_FIELDS) {
      if (target[field] || hf[field] === undefined) continue; // existing value wins
      target[field] = hf[field];
      const tag = hf.sources?.[field];
      if (tag !== undefined) target.sources[field] = tag;
    }
  }
  return [...merged, ...hfOnly];
}
// ─── Refresh ────────────────────────────────────────────────────────────────
// Per-source list of entry fields. refreshSource() deletes exactly these fields
// from every existing entry before re-merging fresh data for that source; the
// keys also define the valid source names accepted on the command line.
const SOURCE_FIELDS = {
llmstats: ['slug', 'mmlu', 'mmlu_pro', 'gpqa', 'human_eval', 'math', 'gsm8k', 'mmmu', 'hellaswag', 'ifeval', 'arc', 'drop', 'mbpp', 'mgsm', 'bbh'],
hf: ['hf_id', 'params_b', 'hf_math_lvl5', 'hf_musr', 'hf_avg'],
livebench: ['lb_name', 'lb_global', 'lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'],
arena: ['arena_name', 'arena_org', 'arena_elo', 'arena_rank', 'arena_votes'],
aider: ['aider_model', 'aider_pass_rate'],
aa: ['aa_id', 'aa_intelligence', 'aa_coding', 'aa_math', 'aa_mmlu_pro', 'aa_gpqa', 'aa_livecodebench', 'aa_hle', 'aa_scicode', 'aa_math_500', 'aa_aime', 'aa_tokens_per_s', 'aa_latency_s'],
mteb: ['mteb_avg', 'mteb_retrieval'],
ocr: ['ocr_avg'],
};
// Per-source marker field: an entry counts as having data from a source when
// this field is defined. refreshSource() keeps an existing entry only if the
// marker of at least one *other* source is defined on it.
const SOURCE_ID_FIELD = {
llmstats: 'slug', hf: 'hf_id', livebench: 'lb_name', arena: 'arena_elo', aider: 'aider_pass_rate', aa: 'aa_intelligence', mteb: 'mteb_avg', ocr: 'ocr_avg',
};
/**
 * Re-fetch a single source and merge it into the existing data/benchmarks.json.
 *
 * Existing entries that carry data from no other source are dropped; on the
 * remaining entries, every field owned by `source` (see SOURCE_FIELDS) is
 * removed together with its provenance tag in `sources`, so no tag is left
 * pointing at a value that no longer exists. Fresh data is then fetched and
 * merged, and the file is rewritten.
 *
 * @param {string} source - One of the keys of SOURCE_FIELDS. Any other value
 *   (including inherited object keys such as "constructor") prints an error
 *   and exits the process with status 1.
 * @returns {Promise<void>}
 */
async function refreshSource(source) {
  // Own-property check: a plain lookup would accept inherited keys like "constructor".
  if (!Object.prototype.hasOwnProperty.call(SOURCE_FIELDS, source)) {
    console.error(`Unknown source "${source}". Valid: ${Object.keys(SOURCE_FIELDS).join(', ')}`);
    process.exit(1);
  }
  console.log(`Refreshing benchmark source: ${source}\n`);

  const existing = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));
  const ownFields = SOURCE_FIELDS[source];
  const otherIdFields = Object.values(SOURCE_ID_FIELD).filter((f) => f !== SOURCE_ID_FIELD[source]);
  const stripped = existing
    .filter((e) => otherIdFields.some((f) => e[f] !== undefined))
    .map((e) => {
      const s = { ...e };
      if (s.sources) s.sources = { ...s.sources }; // copy before removing tags
      for (const f of ownFields) {
        delete s[f];
        if (s.sources) delete s.sources[f];
      }
      return s;
    });

  let result;
  // NOTE(review): mergeLLMStatsInto and mergeHFInto are not defined in the part
  // of the file visible here — confirm they exist, otherwise the "llmstats" and
  // "hf" refreshes fail with a ReferenceError.
  if (source === 'llmstats') result = mergeLLMStatsInto(stripped, await fetchLLMStats());
  else if (source === 'hf') result = mergeHFInto(stripped, await fetchHFLeaderboard());
  else if (source === 'livebench') result = mergeLiveBench(stripped, await fetchLiveBench());
  else if (source === 'arena') result = mergeArena(stripped, await fetchChatbotArena());
  else if (source === 'aider') result = mergeAider(stripped, await fetchAider());
  else if (source === 'aa') result = mergeArtificialAnalysis(stripped, await fetchArtificialAnalysis());
  else if (source === 'mteb') result = mergeMTEB(stripped, await fetchMTEB());
  else if (source === 'ocr') result = mergeOCR(stripped);
  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
}
// ─── HF README Evaluation ───────────────────────────────────────────────────
/**
 * Derive MTEB averages from the `model-index` metadata in Hugging Face READMEs.
 *
 * Every model in data/providers.json that has an `hf_id` is checked, 10 READMEs
 * at a time. A README contributes only if it starts with a `---` YAML front
 * matter block containing `model-index`. Results that look MTEB-related are
 * normalised to a 0-100 scale and averaged into `mteb_avg`; results whose task
 * type mentions "retrieval" or "search" are also averaged into
 * `mteb_retrieval`. Any error for a model (download or YAML parsing) makes that
 * model resolve to null without a message (best effort).
 *
 * @returns {Promise<Array<Object>>} Entries `{ hf_id, name, mteb_avg,
 *   mteb_retrieval, sources }` tagged `'hf-readme'`; an empty list if
 *   providers.json does not exist.
 */
async function fetchHFReadmeBenchmarks() {
const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
if (!fs.existsSync(providersPath)) return [];
const providers = JSON.parse(fs.readFileSync(providersPath, 'utf8')).providers;
// Unique Hugging Face ids across all providers (any model type).
const hfIds = new Set();
providers.forEach(p => p.models.forEach(m => { if (m.hf_id) hfIds.add(m.hf_id); }));
process.stdout.write(`HF README: checking ${hfIds.size} models... `);
const results = [];
const BATCH = 10;
const ids = Array.from(hfIds);
for (let i = 0; i < ids.length; i += BATCH) {
const batch = ids.slice(i, i + BATCH);
const rows = await Promise.all(batch.map(async (hfId) => {
try {
const url = `https://huggingface.co/${hfId}/raw/main/README.md`;
const text = await getText(url, { retries: 1 });
// Front matter is the text between the leading '---' and the next '---'.
if (!text.startsWith('---')) return null;
const endYaml = text.indexOf('---', 3);
if (endYaml === -1) return null;
const yamlText = text.substring(3, endYaml);
const meta = yaml.load(yamlText);
if (!meta || !meta['model-index']) return null;
// Running sums/counts for the overall average and the retrieval-only average.
let total = 0, count = 0, retTotal = 0, retCount = 0;
const modelIndex = Array.isArray(meta['model-index']) ? meta['model-index'] : [meta['model-index']];
modelIndex.forEach(mi => {
(mi.results || []).forEach(res => {
// A result counts when its dataset type or name mentions "mteb", or its task type mentions "retrieval".
const isMTEB = (res.dataset?.type || '').toLowerCase().includes('mteb') ||
(res.dataset?.name || '').toLowerCase().includes('mteb') ||
(res.task?.type || '').toLowerCase().includes('retrieval');
if (!isMTEB) return;
// Use the first metric of type main_score, ndcg_at_10 or accuracy, in the order listed in the README.
const mainMetric = (res.metrics || []).find(m => m.type === 'main_score' || m.type === 'ndcg_at_10' || m.type === 'accuracy');
if (mainMetric && typeof mainMetric.value === 'number') {
const val = mainMetric.value;
// Values up to 1.0 are taken to be fractions and rescaled to 0-100.
let norm = val <= 1.0 ? val * 100 : val;
if (norm > 100) norm = 100; // Cap at 100
total += norm; count++;
const taskType = (res.task?.type || '').toLowerCase();
if (taskType.includes('retrieval') || taskType.includes('search')) {
retTotal += norm; retCount++;
}
}
});
});
if (count > 0) {
// Averages are rounded to two decimals.
return {
hf_id: hfId,
name: hfId.split('/').pop(),
mteb_avg: Math.round(total / count * 100) / 100,
mteb_retrieval: retCount > 0 ? Math.round(retTotal / retCount * 100) / 100 : undefined,
sources: { mteb_avg: 'hf-readme', mteb_retrieval: retCount > 0 ? 'hf-readme' : undefined }
};
}
} catch (e) {
// Any failure for this model is swallowed; the model is simply skipped.
return null;
}
return null;
}));
rows.forEach(r => { if (r) results.push(r); });
process.stdout.write(` HF README: ${Math.min(i + BATCH, ids.length)}/${ids.length}\r`);
}
console.log(`\n HF README: ${results.length} models enriched from metadata`);
return results;
}
// ─── Main ───────────────────────────────────────────────────────────────────
/**
 * Script entry point.
 *
 * With a source name as the first CLI argument, only that source is refreshed.
 * Otherwise every source is fetched concurrently, the results are merged in a
 * fixed order (LLMStats + HF, LiveBench, Arena, Aider, Artificial Analysis,
 * MTEB results, README-derived MTEB, OCR) and written to data/benchmarks.json
 * together with a short coverage summary on stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const source = process.argv[2]?.toLowerCase();
  if (source) {
    await refreshSource(source);
    return;
  }

  const fetched = await Promise.all([
    fetchLLMStats(),
    fetchHFLeaderboard(),
    fetchLiveBench(),
    fetchChatbotArena(),
    fetchAider(),
    fetchArtificialAnalysis(),
    fetchMTEB(),
    fetchHFReadmeBenchmarks(),
  ]);
  const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries, mtebEntries, readmeEntries] = fetched;

  // Each step receives the output of the previous one.
  let all = mergeEntries(llmstats, hfEntries);
  all = mergeLiveBench(all, lbEntries);
  all = mergeArena(all, arenaEntries);
  all = mergeAider(all, aiderEntries);
  all = mergeArtificialAnalysis(all, aaEntries);
  all = mergeMTEB(all, mtebEntries);
  all = mergeMTEB(all, readmeEntries);
  all = mergeOCR(all);

  const countWhere = (predicate) => all.filter(predicate).length;
  const withLiveBench = countWhere((e) => e.lb_name);
  const withArena = countWhere((e) => e.arena_elo);
  const withAider = countWhere((e) => e.aider_pass_rate !== undefined);
  const withAA = countWhere((e) => e.aa_intelligence !== undefined);
  const withMTEB = countWhere((e) => e.mteb_avg !== undefined);
  const withOCR = countWhere((e) => e.ocr_avg !== undefined);
  console.log(`\nTotal entries: ${all.length}`);
  console.log(` With LiveBench: ${withLiveBench} | Arena: ${withArena} | Aider: ${withAider} | AA: ${withAA} | MTEB: ${withMTEB} | OCR: ${withOCR}`);

  fs.writeFileSync(OUT_FILE, JSON.stringify(all, null, 2));
  const sizeKb = (fs.statSync(OUT_FILE).size / 1024).toFixed(0);
  console.log(`Saved to data/benchmarks.json (${sizeKb} KB)`);
}
// Run the script; any rejection from main() is logged and turns into exit status 1.
main().catch((err) => { console.error('Fatal:', err); process.exit(1); });