Spaces:

cstr
/

LLMProviders

Building

App Files Files Community

LLMProviders / scripts /fetch-benchmarks.js

CrispStrobe

feat: verbose HF validation, OCR benchmarks, and corrected model mappings

7cc1131 7 days ago

history blame contribute delete

37.8 kB

	'use strict';

	/**
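	* Fetch benchmark data from the six primary sources listed below — plus MTEB results,
	* Hugging Face README model-index metadata and a hand-maintained OCR table — and merge
	* everything into data/benchmarks.json.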
	*
	* Sources:
	* 1. AchilleasDrakou/LLMStats on GitHub (71 curated models, self-reported benchmarks)
	* 2. open-llm-leaderboard/contents on Hugging Face (4500+ open models, standardised evals)
	* 3. LiveBench (livebench.ai) — contamination-free, monthly, 70+ frontier models
	* 4. Chatbot Arena (lmarena.ai) — 316 models with real ELO ratings from human votes
	* 5. Aider (aider.chat) — code editing benchmark, 133 tasks per model
	* 6. Artificial Analysis (artificialanalysis.ai) — independent evaluations and speed benchmarks
	*
	* Unified field names (0-1 scale unless noted):
	* mmlu, mmlu_pro, gpqa, human_eval, math, gsm8k, mmmu,
	* hellaswag, ifeval, arc, drop, mbpp, mgsm, bbh (from LLMStats)
	* hf_math_lvl5, hf_musr, hf_avg, params_b (HF-only)
	* lb_name, lb_global, lb_reasoning, lb_coding, (LiveBench, 0-1)
	* lb_math, lb_language, lb_if, lb_data_analysis
	* arena_elo, arena_rank, arena_votes (Chatbot Arena; elo is raw ELO ~800-1500)
	* aider_pass_rate (Aider edit bench, 0-1)
	* aa_id, aa_intelligence, aa_mmlu_pro, aa_gpqa, (Artificial Analysis)
	* aa_livecodebench, aa_tokens_per_s, aa_latency_s
	*
	* Where multiple sources have data for the same benchmark,
	* LLMStats takes priority (it stores self-reported model-card values).
	*
	* Usage:
	* node scripts/fetch-benchmarks.js # fetch all sources
	* node scripts/fetch-benchmarks.js aa # refresh Artificial Analysis only
	* node scripts/fetch-benchmarks.js livebench # refresh LiveBench only
	*/

	const fs = require('fs');
	const path = require('path');
	const yaml = require('js-yaml');
	const { getJson, getText } = require('./fetch-utils');
	const { loadEnv } = require('./load-env');

	loadEnv();

	// Location of the merged output file: ../data/benchmarks.json relative to this script's directory.
	const OUT_FILE = path.join(__dirname, '..', 'data', 'benchmarks.json');

	// ─── helpers ────────────────────────────────────────────────────────────────

	/**
	 * Normalise a model name so that names coming from different sources can be compared.
	 *
	 * Lower-cases the input, turns `-`, `_` and `.` into spaces, drops every remaining
	 * character that is not `a-z`, `0-9` or a space, collapses whitespace runs and trims.
	 * Names made only of other characters (e.g. non-Latin scripts) therefore normalise to ''.
	 *
	 * @param {*} s - Name to normalise; any falsy value is treated as the empty string,
	 *   other non-string values are stringified first.
	 * @returns {string} The normalised name (possibly empty).
	 */
	const normName = (s) =>
	  String(s || '')
	    .toLowerCase()
	    .replace(/[-_.]/g, ' ')
	    .replace(/[^a-z0-9 ]/g, '')
	    .replace(/\s+/g, ' ')
	    .trim();

	// ─── LLMStats ───────────────────────────────────────────────────────────────

	// GitHub API URL returning the recursive file tree of the LLMStats repository (main branch).
	const LLMSTATS_TREE = 'https://api.github.com/repos/AchilleasDrakou/LLMStats/git/trees/main?recursive=1';
	// Raw-content base URL; a path from the tree is appended to download one file.
	const LLMSTATS_RAW = 'https://raw.githubusercontent.com/AchilleasDrakou/LLMStats/main/';

	// Maps each unified benchmark field (key) to the LLMStats `dataset_name` values that are
	// accepted for it. extractLLMStatsMetrics compares names by exact string equality, which is
	// why several spelling/casing variants of the same benchmark are listed.
	// NOTE(review): some groups fold related-but-different benchmarks into one field
	// (e.g. 'MBPP EvalPlus' under human_eval, 'ARC-e' under arc) — presumably intentional; confirm.
	const LLMSTATS_MAP = {
	mmlu: ['MMLU', 'MMLU Chat', 'MMLU-Base', 'MMLU (CoT)', 'Multilingual MMLU'],
	mmlu_pro: ['MMLU-Pro', 'MMLU-STEM', 'Multilingual MMLU-Pro'],
	gpqa: ['GPQA'],
	human_eval: ['HumanEval', 'Humaneval', 'HumanEval+', 'HumanEval-Average', 'Instruct HumanEval', 'MBPP EvalPlus', 'EvalPlus', 'Evalplus'],
	math: ['MATH', 'Math', 'MATH (CoT)', 'MATH-500', 'Functional_MATH', 'FunctionalMATH'],
	gsm8k: ['GSM8K', 'GSM-8K', 'GSM8k', 'GSM8K Chat', 'GSM-8K (CoT)'],
	mmmu: ['MMMU', 'MMMUval', 'MMMU-Pro'],
	hellaswag: ['HellaSwag', 'HellaSWAG', 'Hellaswag'],
	ifeval: ['IFEval', 'IF-Eval'],
	arc: ['ARC Challenge', 'ARC-C', 'ARC-c', 'ARC-e', 'ARC-Challenge', 'AI2 Reasoning Challenge (ARC)'],
	drop: ['DROP'],
	mbpp: ['MBPP', 'MBPP+', 'MBPP++', 'MBPP pass@1', 'MBPP EvalPlus (base)'],
	mgsm: ['MGSM', 'Multilingual MGSM', 'Multilingual MGSM (CoT)'],
	bbh: ['BBH', 'BigBench Hard CoT', 'BIG-Bench-Hard', 'BigBench-Hard', 'BIG-Bench Hard', 'BigBench_Hard'],
	};

	/**
	 * Pick the tracked benchmark scores out of an LLMStats `qualitative_metrics` list.
	 *
	 * Each metric's `dataset_name` is compared (exact match) against the aliases in
	 * LLMSTATS_MAP. For every unified field the first metric that yields a defined
	 * score wins; later metrics for the same field are ignored.
	 *
	 * @param {Array<{dataset_name: string, score: number}>} [qualitative_metrics]
	 *   Metrics from a model.json file; anything that is not an array is treated as empty.
	 * @returns {Object<string, number>} Unified field name → score.
	 */
	function extractLLMStatsMetrics(qualitative_metrics) {
	  const metrics = Array.isArray(qualitative_metrics) ? qualitative_metrics : [];
	  const scores = {};
	  for (const metric of metrics) {
	    if (!metric) continue; // tolerate null holes in the list
	    for (const [key, names] of Object.entries(LLMSTATS_MAP)) {
	      if (scores[key] === undefined && names.includes(metric.dataset_name)) {
	        scores[key] = metric.score;
	      }
	    }
	  }
	  return scores;
	}

	/**
	 * Download every `models/.../model.json` file of the LLMStats repository and turn
	 * each one into a benchmark entry.
	 *
	 * Files are fetched ten at a time. A file that fails to download or parse is
	 * reported with a warning and skipped; it does not abort the run.
	 *
	 * @returns {Promise<Array<Object>>} Entries shaped as
	 *   `{ slug, name, ...scores, sources }`, where every score key is attributed to 'llmstats'.
	 */
	async function fetchLLMStats() {
	  const BATCH = 10;

	  // Turn one tree item into an entry, or null when the file cannot be used.
	  const loadModel = async (file) => {
	    try {
	      const data = await getJson(LLMSTATS_RAW + file.path);
	      const slug = file.path.replace(/^models\//, '').replace(/\/model\.json$/, '');
	      const metrics = extractLLMStatsMetrics(data.qualitative_metrics);
	      const sources = Object.fromEntries(Object.keys(metrics).map((k) => [k, 'llmstats']));
	      return { slug, name: data.name, ...metrics, sources };
	    } catch (e) {
	      console.warn(`\n ⚠ LLMStats ${file.path}: ${e.message}`);
	      return null;
	    }
	  };

	  process.stdout.write('LLMStats: fetching file list... ');
	  const { tree: items } = await getJson(LLMSTATS_TREE);
	  const files = items.filter((item) => {
	    const isBlob = item.type === 'blob';
	    return isBlob && item.path.startsWith('models/') && item.path.endsWith('/model.json');
	  });
	  console.log(`${files.length} models`);

	  const results = [];
	  let offset = 0;
	  while (offset < files.length) {
	    const rows = await Promise.all(files.slice(offset, offset + BATCH).map(loadModel));
	    for (const row of rows) {
	      if (row) results.push(row);
	    }
	    process.stdout.write(` LLMStats: ${Math.min(offset + BATCH, files.length)}/${files.length}\r`);
	    offset += BATCH;
	  }
	  console.log(` LLMStats: ${results.length} entries fetched `);
	  return results;
	}

	// ─── HF Leaderboard ─────────────────────────────────────────────────────────

	// Rows endpoint of the Hugging Face dataset viewer, pre-filled for the
	// open-llm-leaderboard/contents dataset (default config, train split).
	const HF_ROWS_URL = [
	  'https://datasets-server.huggingface.co/rows',
	  'dataset=open-llm-leaderboard%2Fcontents&config=default&split=train',
	].join('?');

	/**
	 * Fetch one slice of the Open LLM Leaderboard dataset.
	 *
	 * NOTE(review): the slice size is sent as `limit`; the dataset viewer documentation
	 * appears to name this parameter `length` — confirm the server honours `limit`.
	 *
	 * @param {number} offset - Index of the first row to request.
	 * @param {number} [limit=100] - Number of rows to request.
	 * @returns {Promise<{rows: Array<Object>, total: number}>} The unwrapped row objects of
	 *   the slice and the `num_rows_total` value reported by the server.
	 */
	async function fetchHFPage(offset, limit = 100) {
	  const url = `${HF_ROWS_URL}&offset=${offset}&limit=${limit}`;
	  const payload = await getJson(url);
	  const rows = payload.rows.map((wrapped) => wrapped.row);
	  return { rows, total: payload.num_rows_total };
	}

	/**
	 * Download the whole Open LLM Leaderboard table and convert it into benchmark entries.
	 *
	 * The first full page also serves as the probe for the total row count. Previously the
	 * probe asked for a single row while paging resumed at offset 100, so rows 1–99 were
	 * never requested. Remaining pages are fetched five at a time.
	 *
	 * Rows that are flagged, not available on the hub, or lack a `fullname` are dropped.
	 * A score column is copied only when its value is truthy, so a raw value of 0 is
	 * treated the same as "missing" (unchanged from the previous behaviour).
	 *
	 * @returns {Promise<Array<Object>>} Entries shaped as `{ hf_id, name, sources, ...scores }`
	 *   with every copied field attributed to 'hf' in `sources`.
	 */
	async function fetchHFLeaderboard() {
	  const LIMIT = 100;
	  const CONCURRENT = 5;
	  // Leaderboard column → unified field name, in output order.
	  const COLUMNS = [
	    ['#Params (B)', 'params_b'],
	    ['IFEval Raw', 'ifeval'],
	    ['BBH Raw', 'bbh'],
	    ['GPQA Raw', 'gpqa'],
	    ['MMLU-PRO Raw', 'mmlu_pro'],
	    ['MATH Lvl 5 Raw', 'hf_math_lvl5'],
	    ['MUSR Raw', 'hf_musr'],
	  ];

	  process.stdout.write('HF Leaderboard: probing total... ');
	  const first = await fetchHFPage(0, LIMIT);
	  const total = first.total;
	  console.log(`${total} rows`);

	  const pages = Math.ceil(total / LIMIT);
	  const allRows = [...first.rows];

	  // Page 0 is already loaded; fetch the rest in batches of CONCURRENT requests.
	  for (let p = 1; p < pages; p += CONCURRENT) {
	    const batch = [];
	    for (let q = p; q < Math.min(p + CONCURRENT, pages); q++) {
	      batch.push(fetchHFPage(q * LIMIT, LIMIT));
	    }
	    const results = await Promise.all(batch);
	    results.forEach((r) => allRows.push(...r.rows));
	    const done = Math.min((p + CONCURRENT) * LIMIT, total);
	    process.stdout.write(` HF: ${done}/${total}\r`);
	  }
	  console.log(` HF: ${total}/${total} — filtering... `);

	  // The Average column name carries a Unicode emoji, so look it up by prefix.
	  const AVG_KEY = allRows.length > 0
	    ? Object.keys(allRows[0]).find((k) => k.startsWith('Average'))
	    : undefined;

	  const entries = allRows
	    .filter((r) => r && typeof r.fullname === 'string' && r['Available on the hub'] && !r.Flagged)
	    .map((r) => {
	      const entry = {
	        hf_id: r.fullname,
	        name: r.fullname.split('/').pop(),
	        sources: {},
	      };
	      for (const [column, field] of COLUMNS) {
	        if (r[column]) {
	          entry[field] = r[column];
	          entry.sources[field] = 'hf';
	        }
	      }
	      if (AVG_KEY && r[AVG_KEY]) {
	        entry.hf_avg = r[AVG_KEY];
	        entry.sources.hf_avg = 'hf';
	      }
	      return entry;
	    });

	  console.log(` HF: ${entries.length} entries after filtering`);
	  return entries;
	}

	// ─── LiveBench ───────────────────────────────────────────────────────────────

	// GitHub API URL returning the recursive file tree of the LiveBench website repository;
	// fetchLiveBench scans it for public/table_<date>.csv files to discover releases.
	const LB_GITHUB_TREE = 'https://api.github.com/repos/LiveBench/livebench.github.io/git/trees/main?recursive=1';
	// Base URL from which the per-release table_<date>.csv and categories_<date>.json files are downloaded.
	const LB_BASE_URL = 'https://livebench.ai';

	// Matches one trailing variant marker of a LiveBench model name (thinking / effort /
	// base / non-reasoning suffixes), optionally followed by a second "-<level>-effort" part.
	// lbBaseName applies it repeatedly to strip stacked suffixes.
	const LB_SUFFIX_RE = new RegExp(
	  [
	    '(',
	    [
	      '-thinking-(?:auto-)?(?:\\d+k-)?(?:(?:high|medium|low)-effort)?',
	      '-thinking(?:-(?:64k|32k|auto|minimal))?',
	      '-(?:high|medium|low)-effort',
	      '-base',
	      '-non-?reasoning',
	      '-(?:high|low|min)thinking',
	      '-nothinking',
	    ].join('|'),
	    ')',
	    '(?:-(?:high|medium|low)-effort)?$',
	  ].join('')
	);

	/**
	 * Strip every trailing variant suffix matched by LB_SUFFIX_RE from a LiveBench model name.
	 *
	 * The pattern is applied again and again until the name stops changing, so stacked
	 * suffixes are all removed.
	 *
	 * @param {string} name - LiveBench model name.
	 * @returns {string} The name without variant suffixes.
	 */
	function lbBaseName(name) {
	  let current = name;
	  for (;;) {
	    const stripped = current.replace(LB_SUFFIX_RE, '');
	    if (stripped === current) return current;
	    current = stripped;
	  }
	}

	/**
	 * Parse one LiveBench results table (CSV) into per-model entries.
	 *
	 * The first column is the model name; every other column is a task score on a 0-100
	 * scale, converted here to 0-1. `lb_global` is the mean of all numeric task columns;
	 * each group field is the mean of the tasks mapped to that group.
	 *
	 * Handles both LF and CRLF line endings and returns [] for an empty table.
	 * Fields are split on plain commas, so quoted values containing commas are not supported.
	 *
	 * @param {string} csvText - Raw CSV text.
	 * @param {Object<string, string>} taskToGroup - Task column name → group field
	 *   (e.g. 'lb_coding').
	 * @returns {Array<Object>} One entry per model row with `lb_name`, `lb_global`, the six
	 *   group fields (undefined when no task of that group has a score) and `sources`.
	 */
	function parseLiveBenchCsv(csvText, taskToGroup) {
	  const GROUP_FIELDS = ['lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'];
	  const avg = (arr) => arr.reduce((a, b) => a + b, 0) / arr.length;

	  const lines = String(csvText || '').split(/\r?\n/).filter((line) => line.trim() !== '');
	  if (lines.length === 0) return [];

	  const headers = lines[0].split(',').map((h) => h.trim());
	  const entries = [];
	  for (const line of lines.slice(1)) {
	    const vals = line.split(',');
	    const modelName = (vals[0] || '').trim();
	    if (!modelName) continue;

	    // Numeric task columns only; blanks and non-numeric cells are skipped.
	    const taskScores = {};
	    for (let i = 1; i < headers.length; i++) {
	      const v = parseFloat(vals[i]);
	      if (!Number.isNaN(v)) taskScores[headers[i]] = v / 100;
	    }

	    const groupBuckets = {};
	    for (const [task, group] of Object.entries(taskToGroup)) {
	      if (taskScores[task] !== undefined) {
	        groupBuckets[group] = groupBuckets[group] || [];
	        groupBuckets[group].push(taskScores[task]);
	      }
	    }

	    const allScores = Object.values(taskScores);
	    const entry = {
	      lb_name: modelName,
	      lb_global: allScores.length ? avg(allScores) : undefined,
	    };
	    for (const field of GROUP_FIELDS) {
	      entry[field] = groupBuckets[field] ? avg(groupBuckets[field]) : undefined;
	    }
	    entry.sources = {};
	    for (const k of Object.keys(entry)) {
	      if (k.startsWith('lb_') && entry[k] !== undefined) entry.sources[k] = 'livebench';
	    }
	    entries.push(entry);
	  }
	  return entries;
	}

	/**
	 * Download every published LiveBench release and keep, per model name, the entry from
	 * the most recent release that contains it.
	 *
	 * Releases are discovered from the `public/table_<date>.csv` files in the LiveBench
	 * website repository. The task → group mapping is taken from the newest release's
	 * categories file and applied to all releases. A release whose CSV cannot be downloaded
	 * is reported with a warning and skipped.
	 *
	 * @returns {Promise<Array<Object>>} Entries as produced by parseLiveBenchCsv.
	 * @throws {Error} When no release table is found in the repository tree.
	 */
	async function fetchLiveBench() {
	  // LiveBench category name → unified group field; other categories are ignored.
	  const CATEGORY_GROUPS = new Map([
	    ['Coding', 'lb_coding'],
	    ['Agentic Coding', 'lb_coding'],
	    ['Reasoning', 'lb_reasoning'],
	    ['Mathematics', 'lb_math'],
	    ['Language', 'lb_language'],
	    ['IF', 'lb_if'],
	    ['Data Analysis', 'lb_data_analysis'],
	  ]);

	  process.stdout.write('LiveBench: finding all releases... ');
	  const tree = await getJson(LB_GITHUB_TREE);
	  const dates = (tree.tree || [])
	    .filter((f) => f.path.startsWith('public/table_') && f.path.endsWith('.csv'))
	    .map((f) => f.path.replace('public/table_', '').replace('.csv', ''))
	    .sort();
	  if (dates.length === 0) {
	    throw new Error('LiveBench: no public/table_*.csv releases found in repository tree');
	  }
	  const latest = dates[dates.length - 1];
	  console.log(`${dates.length} releases (${dates[0]} → ${latest})`);

	  const cats = await getJson(`${LB_BASE_URL}/categories_${latest}.json`);
	  const taskToGroup = {};
	  for (const [cat, tasks] of Object.entries(cats)) {
	    const group = CATEGORY_GROUPS.get(cat);
	    if (group) for (const t of tasks) taskToGroup[t] = group;
	  }

	  // Releases are processed oldest → newest so that newer results overwrite older ones.
	  const byName = new Map();
	  for (const date of dates) {
	    let csv;
	    try {
	      csv = await getText(`${LB_BASE_URL}/table_${date}.csv`);
	    } catch (e) {
	      console.warn(`\n ⚠ LiveBench ${date}: ${e.message}`);
	      continue;
	    }
	    for (const entry of parseLiveBenchCsv(csv, taskToGroup)) byName.set(entry.lb_name, entry);
	    process.stdout.write(` LiveBench: ${date}\r`);
	  }
	  const entries = [...byName.values()];
	  console.log(` LiveBench: ${entries.length} unique models across all releases`);
	  return entries;
	}

	/**
	 * Attach LiveBench results to existing entries and append unmatched LiveBench models.
	 *
	 * Matching uses normalised names: an entry's name, slug tail and hf_id tail are tried
	 * first against exact LiveBench names, then against suffix-stripped base names (where
	 * the variant with the highest `lb_global` represents the base).
	 *
	 * Existing entries are mutated in place. Their `sources` map is merged with the
	 * LiveBench attributions rather than replaced, so attributions from earlier sources
	 * survive (copying `lb.sources` wholesale used to discard them).
	 *
	 * @param {Array<Object>} entries - Entries built so far (mutated).
	 * @param {Array<Object>} lbEntries - Entries from fetchLiveBench / parseLiveBenchCsv.
	 * @returns {Array<Object>} `entries` followed by one new entry per unmatched base model.
	 */
	function mergeLiveBench(entries, lbEntries) {
	  const exactMap = new Map();
	  const baseMap = new Map();
	  for (const lb of lbEntries) {
	    const exact = normName(lb.lb_name);
	    exactMap.set(exact, lb);
	    const base = normName(lbBaseName(lb.lb_name));
	    if (base !== exact) {
	      const prev = baseMap.get(base);
	      if (!prev || (lb.lb_global || 0) > (prev.lb_global || 0)) baseMap.set(base, lb);
	    }
	  }

	  const usedLbNames = new Set();
	  let matched = 0;
	  for (const e of entries) {
	    const candidates = [
	      normName(e.name || ''),
	      normName((e.slug || '').split('/').pop() || ''),
	      normName((e.hf_id || '').split('/').pop() || ''),
	    ].filter(Boolean);

	    let lb = null;
	    for (const c of candidates) {
	      lb = exactMap.get(c) || baseMap.get(c);
	      if (lb) break;
	    }
	    if (!lb) continue;

	    // Copy the data fields only; `sources` is merged separately so it is not clobbered.
	    const { sources: lbSources, ...lbFields } = lb;
	    Object.assign(e, lbFields);
	    e.sources = { ...(e.sources || {}), ...(lbSources || {}) };
	    usedLbNames.add(lb.lb_name);
	    matched++;
	  }

	  // Add each remaining base model once, skipping variants of models already matched.
	  const usedBases = new Set([...usedLbNames].map((n) => normName(lbBaseName(n))));
	  const newEntries = [];
	  for (const lb of lbEntries) {
	    if (usedLbNames.has(lb.lb_name)) continue;
	    const base = normName(lbBaseName(lb.lb_name));
	    if (usedBases.has(base)) continue;
	    if (baseMap.get(base) === lb || exactMap.get(normName(lb.lb_name)) === lb) {
	      newEntries.push({ name: lbBaseName(lb.lb_name), ...lb });
	      usedBases.add(base);
	    }
	  }
	  console.log(` LiveBench: ${matched} matched, ${newEntries.length} new entries`);
	  return [...entries, ...newEntries];
	}

	// ─── Chatbot Arena ───────────────────────────────────────────────────────────

	/**
	 * Return the balanced JSON array/object that starts at `start` in `text`.
	 *
	 * Brackets inside JSON string literals (including escaped quotes) are ignored, so a
	 * model name such as "foo]" cannot end the scan early.
	 *
	 * @param {string} text - Text containing the JSON value.
	 * @param {number} start - Index of the opening `[` or `{`.
	 * @returns {string|null} The JSON substring, or null when the brackets never balance.
	 */
	function sliceBalancedJson(text, start) {
	  let depth = 0;
	  let inString = false;
	  for (let i = start; i < text.length; i++) {
	    const ch = text[i];
	    if (inString) {
	      if (ch === '\\') i++; // skip the escaped character
	      else if (ch === '"') inString = false;
	      continue;
	    }
	    if (ch === '"') {
	      inString = true;
	    } else if (ch === '[' || ch === '{') {
	      depth++;
	    } else if (ch === ']' || ch === '}') {
	      depth--;
	      if (depth === 0) return text.substring(start, i + 1);
	    }
	  }
	  return null;
	}

	/**
	 * Fetch the Chatbot Arena text leaderboard from its React Server Components payload.
	 *
	 * The payload is scanned line by line for the first line that contains both
	 * `"entries":[` and `"rating":`; the array following `"entries":` is extracted and parsed.
	 *
	 * @returns {Promise<Array<Object>>} Entries shaped as
	 *   `{ arena_name, arena_org, arena_elo, arena_rank, arena_votes, sources }`; `sources`
	 *   attributes every defined `arena_*` field to 'arena'.
	 * @throws {Error} When the payload has no usable entries array (or it is not valid JSON).
	 */
	async function fetchChatbotArena() {
	  const MARKER = '"entries":[';

	  process.stdout.write('Chatbot Arena: fetching RSC leaderboard... ');
	  const text = await getText('https://lmarena.ai/en/leaderboard/text', {
	    headers: { 'User-Agent': 'Mozilla/5.0', 'RSC': '1', 'Accept': 'text/x-component' },
	  });

	  let entries = null;
	  for (const line of text.split('\n')) {
	    if (!line.includes(MARKER) || !line.includes('"rating":')) continue;
	    // Start on the `[` that follows `"entries":`.
	    const start = line.indexOf(MARKER) + '"entries":'.length;
	    const json = sliceBalancedJson(line, start);
	    if (json === null) throw new Error('Could not find entries in RSC payload');
	    entries = JSON.parse(json);
	    break;
	  }
	  if (!entries) throw new Error('Could not find entries in RSC payload');
	  console.log(`${entries.length} models`);

	  return entries.map((e) => {
	    const entry = {
	      arena_name: e.modelDisplayName,
	      arena_org: e.modelOrganization,
	      arena_elo: e.rating,
	      arena_rank: e.rank,
	      arena_votes: e.votes,
	      sources: {},
	    };
	    for (const k of Object.keys(entry)) {
	      if (k.startsWith('arena_') && entry[k] !== undefined) entry.sources[k] = 'arena';
	    }
	    return entry;
	  });
	}

	/**
	 * Attach Chatbot Arena ratings to existing entries and append unmatched Arena models.
	 *
	 * An entry matches when its normalised name, LiveBench name, slug tail or hf_id tail
	 * equals a normalised Arena display name. Empty candidate keys are ignored, so an Arena
	 * model whose name normalises to '' (e.g. a non-Latin name) can no longer be attached to
	 * an unrelated entry that simply lacks one of the identifier fields.
	 *
	 * Matched entries receive `arena_elo`, `arena_rank` and `arena_votes` (mutated in place)
	 * and the Arena source attributions; each Arena model is used at most once.
	 *
	 * @param {Array<Object>} entries - Entries built so far (mutated).
	 * @param {Array<Object>} arenaEntries - Entries from fetchChatbotArena.
	 * @returns {Array<Object>} `entries` followed by the unmatched Arena models.
	 */
	function mergeArena(entries, arenaEntries) {
	  const arenaMap = new Map();
	  for (const a of arenaEntries) arenaMap.set(normName(a.arena_name), a);

	  let matched = 0;
	  for (const e of entries) {
	    const candidates = [
	      normName(e.name || ''),
	      normName(e.lb_name || ''),
	      normName((e.slug || '').split('/').pop() || ''),
	      normName((e.hf_id || '').split('/').pop() || ''),
	    ].filter(Boolean);

	    const a = candidates.map((c) => arenaMap.get(c)).find(Boolean);
	    if (!a) continue;

	    e.arena_elo = a.arena_elo;
	    e.arena_rank = a.arena_rank;
	    e.arena_votes = a.arena_votes;
	    e.sources = { ...(e.sources || {}), ...(a.sources || {}) };
	    arenaMap.delete(normName(a.arena_name));
	    matched++;
	  }

	  const newEntries = [];
	  for (const a of arenaMap.values()) newEntries.push({ name: a.arena_name, ...a });
	  console.log(` Arena: ${matched} matched, ${newEntries.length} new entries`);
	  return [...entries, ...newEntries];
	}

	// ─── Aider ───────────────────────────────────────────────────────────────────

	// Raw URL of the YAML file holding Aider's code-editing leaderboard data.
	const AIDER_RAW = 'https://raw.githubusercontent.com/Aider-AI/aider/main/aider/website/_data/edit_leaderboard.yml';

	/**
	 * Fetch Aider's edit leaderboard and keep the best run of every model.
	 *
	 * Rows are grouped by normalised model name; within a group the row with the highest
	 * `pass_rate_1` wins. Rows without a model name or without a pass rate (undefined or
	 * null) are skipped, and a document that is not a YAML list yields no entries.
	 *
	 * NOTE(review): the text comes from the network and is parsed with `yaml.load`; whether
	 * that is safe for untrusted input depends on the installed js-yaml version — confirm.
	 *
	 * @returns {Promise<Array<Object>>} Entries shaped as
	 *   `{ aider_model, aider_pass_rate, sources }` with the pass rate scaled to 0-1.
	 */
	async function fetchAider() {
	  process.stdout.write('Aider: fetching edit leaderboard... ');
	  const text = await getText(AIDER_RAW);
	  const parsed = yaml.load(text);
	  const rows = Array.isArray(parsed) ? parsed : [];

	  const best = new Map();
	  for (const row of rows) {
	    if (!row || !row.model || row.pass_rate_1 == null) continue;
	    const key = normName(row.model);
	    const existing = best.get(key);
	    if (!existing || row.pass_rate_1 > existing.pass_rate_1) best.set(key, row);
	  }

	  const entries = [];
	  for (const row of best.values()) {
	    entries.push({
	      aider_model: row.model,
	      aider_pass_rate: row.pass_rate_1 / 100,
	      sources: { aider_pass_rate: 'aider' },
	    });
	  }
	  console.log(`${entries.length} models (best run each)`);
	  return entries;
	}

	/**
	 * Attach Aider pass rates to existing entries and append unmatched Aider models.
	 *
	 * An entry matches when its normalised name, LiveBench name, slug tail, hf_id tail or
	 * Arena name equals a normalised Aider model name. Empty candidate keys are ignored so
	 * that an Aider model whose name normalises to '' cannot be attached to an unrelated
	 * entry that merely lacks one of the identifier fields.
	 *
	 * Matched entries receive `aider_pass_rate` (mutated in place) and the Aider source
	 * attribution; each Aider model is used at most once.
	 *
	 * @param {Array<Object>} entries - Entries built so far (mutated).
	 * @param {Array<Object>} aiderEntries - Entries from fetchAider.
	 * @returns {Array<Object>} `entries` followed by the unmatched Aider models.
	 */
	function mergeAider(entries, aiderEntries) {
	  const aiderMap = new Map();
	  for (const a of aiderEntries) aiderMap.set(normName(a.aider_model), a);

	  let matched = 0;
	  for (const e of entries) {
	    const candidates = [
	      normName(e.name || ''),
	      normName(e.lb_name || ''),
	      normName((e.slug || '').split('/').pop() || ''),
	      normName((e.hf_id || '').split('/').pop() || ''),
	      normName(e.arena_name || ''),
	    ].filter(Boolean);

	    const a = candidates.map((c) => aiderMap.get(c)).find(Boolean);
	    if (!a) continue;

	    e.aider_pass_rate = a.aider_pass_rate;
	    e.sources = { ...(e.sources || {}), ...(a.sources || {}) };
	    aiderMap.delete(normName(a.aider_model));
	    matched++;
	  }

	  const newEntries = [];
	  for (const a of aiderMap.values()) newEntries.push({ name: a.aider_model, ...a });
	  console.log(` Aider: ${matched} matched, ${newEntries.length} new entries`);
	  return [...entries, ...newEntries];
	}

	// ─── Artificial Analysis ───────────────────────────────────────────────────

	/**
	 * Fetch model evaluations and speed figures from the Artificial Analysis API.
	 *
	 * Requires the ARTIFICIAL_ANALYSIS_API_KEY environment variable; without it the source
	 * is skipped and an empty list is returned.
	 *
	 * @returns {Promise<Array<Object>>} One entry per model with `aa_*` fields and a
	 *   `sources` map attributing every defined `aa_*` field to 'aa'.
	 * @throws {Error} When the response does not carry a `data` array.
	 */
	async function fetchArtificialAnalysis() {
	  const apiKey = process.env.ARTIFICIAL_ANALYSIS_API_KEY;
	  if (!apiKey) {
	    console.log('Artificial Analysis: skipping (no API key found)');
	    return [];
	  }

	  process.stdout.write('Artificial Analysis: fetching benchmarks... ');
	  const res = await getJson('https://artificialanalysis.ai/api/v2/data/llms/models', {
	    headers: { 'x-api-key': apiKey },
	  });

	  if (!res || !Array.isArray(res.data)) {
	    throw new Error('Invalid response from Artificial Analysis API');
	  }
	  console.log(`${res.data.length} models`);

	  return res.data.map((m) => {
	    const ev = m.evaluations || {};
	    const entry = {
	      aa_id: m.id,
	      aa_name: m.name,
	      aa_slug: m.slug,
	      aa_intelligence: ev.artificial_analysis_intelligence_index, // 0-100
	      aa_coding: ev.artificial_analysis_coding_index, // 0-100
	      aa_math: ev.artificial_analysis_math_index, // 0-100
	      aa_mmlu_pro: ev.mmlu_pro, // 0-1
	      aa_gpqa: ev.gpqa, // 0-1
	      aa_livecodebench: ev.livecodebench, // 0-1
	      aa_hle: ev.hle,
	      aa_scicode: ev.scicode,
	      aa_math_500: ev.math_500,
	      aa_aime: ev.aime,
	      aa_tokens_per_s: m.median_output_tokens_per_second,
	      aa_latency_s: m.median_time_to_first_token_seconds,
	      sources: {},
	    };
	    for (const k of Object.keys(entry)) {
	      if (k.startsWith('aa_') && entry[k] !== undefined) entry.sources[k] = 'aa';
	    }
	    return entry;
	  });
	}

	/**
	 * Attach Artificial Analysis data to existing entries and append unmatched AA models.
	 *
	 * An entry matches when its normalised name, LiveBench name, slug tail, hf_id tail or
	 * Arena name equals a normalised AA model name. All `aa_*` fields of the match are
	 * copied onto the entry (mutated in place); each AA model is used at most once.
	 *
	 * The entry's `sources` map is merged with the AA attributions rather than replaced, so
	 * attributions from earlier sources survive (copying `aa.sources` wholesale used to
	 * discard them).
	 *
	 * @param {Array<Object>} entries - Entries built so far (mutated).
	 * @param {Array<Object>} aaEntries - Entries from fetchArtificialAnalysis.
	 * @returns {Array<Object>} `entries` followed by the unmatched AA models.
	 */
	function mergeArtificialAnalysis(entries, aaEntries) {
	  const aaMap = new Map();
	  for (const a of aaEntries) aaMap.set(normName(a.aa_name), a);

	  let matched = 0;
	  for (const e of entries) {
	    const candidates = [
	      normName(e.name || ''),
	      normName(e.lb_name || ''),
	      normName((e.slug || '').split('/').pop() || ''),
	      normName((e.hf_id || '').split('/').pop() || ''),
	      normName(e.arena_name || ''),
	    ].filter(Boolean);

	    const aa = candidates.map((c) => aaMap.get(c)).find(Boolean);
	    if (!aa) continue;

	    // Copy the data fields only; `sources` is merged separately so it is not clobbered.
	    const { sources: aaSources, ...aaFields } = aa;
	    Object.assign(e, aaFields);
	    e.sources = { ...(e.sources || {}), ...(aaSources || {}) };
	    aaMap.delete(normName(aa.aa_name));
	    matched++;
	  }

	  const newEntries = [];
	  for (const a of aaMap.values()) {
	    newEntries.push({ name: a.aa_name, ...a });
	  }

	  console.log(` AA: ${matched} matched, ${newEntries.length} new entries`);
	  return [...entries, ...newEntries];
	}

	// ─── MTEB ──────────────────────────────────────────────────────────────────

	// Index file of the MTEB results repository; fetchMTEB looks models up in it by
	// their hf_id with '/' replaced by '__' and expects a list of result-file paths per model.
	const MTEB_PATHS_URL = 'https://raw.githubusercontent.com/embeddings-benchmark/results/main/paths.json';
	// Raw-content base URL; a path from the index is appended to download one result file.
	const MTEB_RAW_BASE_URL = 'https://raw.githubusercontent.com/embeddings-benchmark/results/main/';

	/**
	 * Extract a single 0-100 score from one MTEB result file.
	 *
	 * Uses the test split, falling back to dev and then validation. Among the subsets of
	 * that split it prefers an English one, then a lone subset, then the 'default' subset,
	 * and finally the first subset. The score is `main_score`, else `ndcg_at_10`, else
	 * `accuracy`; values up to 1 are treated as fractions and scaled to percent, and the
	 * result is capped at 100.
	 *
	 * @param {Object|null} res - Parsed result file (null when the download failed).
	 * @returns {{score: number, isRetrieval: boolean}|null} The normalised score and whether
	 *   the task name contains 'Retrieval' or 'Search'; null when no positive score is found.
	 */
	function mtebTaskScore(res) {
	  if (!res) return null;
	  const scores = res.scores || res;
	  const data = scores.test || scores.dev || scores.validation;
	  if (!data) return null;
	  const arr = Array.isArray(data) ? data : [data];

	  // Find English or default subset
	  let targetRes = arr.find((r) => r.languages && r.languages.some((l) => l.startsWith('eng') || l === 'en'));
	  if (!targetRes && arr.length === 1) targetRes = arr[0];
	  if (!targetRes) targetRes = arr.find((r) => r.hf_subset === 'default');
	  if (!targetRes && arr.length > 0) targetRes = arr[0];
	  if (!targetRes) return null;

	  const s = targetRes.main_score || targetRes.ndcg_at_10 || targetRes.accuracy;
	  if (typeof s !== 'number' || !(s > 0)) return null;

	  const score = Math.min(s <= 1.0 ? s * 100 : s, 100); // Cap at 100
	  const task = res.mteb_dataset_name || res.task_name || '';
	  return { score, isRetrieval: task.includes('Retrieval') || task.includes('Search') };
	}

	/**
	 * Compute MTEB averages for every embedding model listed in data/providers.json.
	 *
	 * For each model with `type === 'embedding'` and an `hf_id`, the MTEB results index is
	 * consulted (exact key first, then case-insensitively). One result file per task is
	 * kept — when several revisions contain the same task, the revision listed later in the
	 * index wins (NOTE(review): whether that is the newest revision depends on the index
	 * ordering; confirm). At most 50 task files are downloaded per model, 20 at a time;
	 * files that fail to download are ignored.
	 *
	 * @returns {Promise<Array<Object>>} Entries shaped as
	 *   `{ hf_id, name, mteb_avg, mteb_retrieval, sources }` for models with at least one
	 *   usable score; [] when data/providers.json does not exist.
	 */
	async function fetchMTEB() {
	  const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
	  if (!fs.existsSync(providersPath)) return [];

	  process.stdout.write('MTEB: fetching results index... ');
	  const paths = await getJson(MTEB_PATHS_URL);
	  const providers = JSON.parse(fs.readFileSync(providersPath, 'utf8')).providers;
	  const hfIds = new Set();
	  providers.forEach((p) => (p.models || []).forEach((m) => {
	    if (m.type === 'embedding' && m.hf_id) hfIds.add(m.hf_id);
	  }));
	  console.log(`${hfIds.size} embedders`);

	  // Lower-cased index key → original key (first occurrence wins), built once so the
	  // case-insensitive fallback does not rescan every key for every model.
	  const keyByLower = new Map();
	  for (const k of Object.keys(paths)) {
	    const lower = k.toLowerCase();
	    if (!keyByLower.has(lower)) keyByLower.set(lower, k);
	  }

	  const round2 = (x) => Math.round(x * 100) / 100;
	  const results = [];
	  for (const hfId of hfIds) {
	    const key = hfId.replace(/\//g, '__');
	    // Try original key, then find matching key in paths (case-insensitive)
	    let resultPaths = paths[key];
	    if (!resultPaths) {
	      const match = keyByLower.get(key.toLowerCase());
	      if (match) resultPaths = paths[match];
	    }
	    if (!resultPaths) continue;

	    // One path per task; revisions are visited in order of first appearance and a
	    // later revision overwrites the path recorded for the same task.
	    const revisions = [...new Set(resultPaths.map((p) => p.split('/')[2]))];
	    const taskPaths = new Map();
	    revisions.forEach((rev) => {
	      resultPaths
	        .filter((p) => p.includes(`/${rev}/`))
	        .forEach((p) => {
	          const taskName = p.split('/').pop().replace('.json', '');
	          taskPaths.set(taskName, p);
	        });
	    });

	    const latestPaths = [...taskPaths.values()];
	    const fetchPaths = latestPaths.slice(0, 50); // Limit to 50 tasks to prevent hangs
	    process.stdout.write(` MTEB: ${hfId} (fetching ${fetchPaths.length}/${latestPaths.length} tasks)\r`);

	    let total = 0;
	    let count = 0;
	    let retTotal = 0;
	    let retCount = 0;
	    const BATCH = 20;
	    for (let i = 0; i < fetchPaths.length; i += BATCH) {
	      const batch = await Promise.all(
	        fetchPaths.slice(i, i + BATCH).map((p) => getJson(MTEB_RAW_BASE_URL + p).catch(() => null))
	      );
	      for (const res of batch) {
	        const scored = mtebTaskScore(res);
	        if (!scored) continue;
	        total += scored.score;
	        count++;
	        if (scored.isRetrieval) {
	          retTotal += scored.score;
	          retCount++;
	        }
	      }
	    }

	    if (count > 0) {
	      const hasRetrieval = retCount > 0;
	      // Only attribute fields that actually carry a value.
	      const sources = { mteb_avg: 'mteb' };
	      if (hasRetrieval) sources.mteb_retrieval = 'mteb';
	      results.push({
	        hf_id: hfId,
	        name: hfId.split('/').pop(),
	        mteb_avg: round2(total / count),
	        mteb_retrieval: hasRetrieval ? round2(retTotal / retCount) : undefined,
	        sources,
	      });
	    }
	  }
	  console.log(`\n MTEB: ${results.length} models enriched `);
	  return results;
	}

	/**
	 * Attach MTEB scores to existing entries and append unmatched MTEB results.
	 *
	 * Results are keyed by lower-cased hf_id. A fixed list of manual overrides is added on
	 * top (keyed by hf_id, or by name when no hf_id is given) and replaces fetched results
	 * for the same key. An entry matches by hf_id first, then by name; truthy
	 * `mteb_avg`, `mteb_retrieval` and `lb_coding` values of the match are copied onto it
	 * (mutated in place) and each result is used at most once.
	 *
	 * Only defined source attributions are merged, so a result without a retrieval score
	 * (`sources.mteb_retrieval === undefined`) no longer erases an attribution the entry
	 * already had.
	 *
	 * @param {Array<Object>} entries - Entries built so far (mutated).
	 * @param {Array<Object>} mtebEntries - Results from fetchMTEB or fetchHFReadmeBenchmarks;
	 *   each must have a string `hf_id`.
	 * @returns {Array<Object>} `entries` followed by the unmatched results and overrides.
	 */
	function mergeMTEB(entries, mtebEntries) {
	  const map = new Map(mtebEntries.map((m) => [m.hf_id.toLowerCase(), m]));

	  // Manual overrides for famous models not yet in the results repo or needing fixed values
	  const overrides = [
	    { hf_id: 'BAAI/bge-multilingual-gemma2', mteb_avg: 70.3, mteb_retrieval: 67.5, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
	    { hf_id: 'Qwen/Qwen3-Embedding-8B', mteb_avg: 71.2, mteb_retrieval: 72.1, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
	    { hf_id: 'BAAI/bge-en-icl', mteb_avg: 64.9, mteb_retrieval: 58.2, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
	    { hf_id: 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', mteb_avg: 51.98, mteb_retrieval: 39.76, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
	    { name: 'Mistral Embed', mteb_avg: 55.26, sources: { mteb_avg: 'manual' } },
	    { name: 'Codestral Embed', mteb_avg: 84.7, mteb_retrieval: 81.0, lb_coding: 0.81, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual', lb_coding: 'manual' } },
	  ];
	  for (const o of overrides) {
	    map.set((o.hf_id || o.name).toLowerCase(), o); // Force override
	  }

	  let matched = 0;
	  for (const e of entries) {
	    const m = (e.hf_id ? map.get(e.hf_id.toLowerCase()) : null) || (e.name ? map.get(e.name.toLowerCase()) : null);
	    if (!m) continue;

	    if (m.mteb_avg) e.mteb_avg = m.mteb_avg;
	    if (m.mteb_retrieval) e.mteb_retrieval = m.mteb_retrieval;
	    if (m.lb_coding) e.lb_coding = m.lb_coding;

	    const definedSources = Object.fromEntries(
	      Object.entries(m.sources || {}).filter(([, origin]) => origin !== undefined)
	    );
	    e.sources = { ...(e.sources || {}), ...definedSources };

	    map.delete((m.hf_id || m.name).toLowerCase());
	    matched++;
	  }

	  const newEntries = [...map.values()];
	  console.log(` MTEB: ${matched} matched, ${newEntries.length} new entries`);
	  return [...entries, ...newEntries];
	}

	// ─── OCR Benchmarks ────────────────────────────────────────────────────────

	/**
	 * Attach hand-maintained OCR benchmark scores to entries and append missing OCR models.
	 *
	 * The score table is embedded below. An entry matches when its normalised name, hf_id
	 * tail or full hf_id equals the normalised full id or model part of a table row; it then
	 * receives `ocr_avg` (mutated in place) with the source attribution 'manual'. Table rows
	 * that matched no entry are appended as new entries.
	 *
	 * @param {Array<Object>} entries - Entries built so far (mutated).
	 * @returns {Array<Object>} `entries` followed by new entries for unmatched OCR models.
	 */
	function mergeOCR(entries) {
	  const ocrData = [
	    { name: 'datalab-to/chandra-ocr-2', score: 85.9 },
	    { name: 'rednote-hilab/dots.mocr', score: 83.9 },
	    { name: 'lightonai/LightOnOCR-2-1B', score: 83.2 },
	    { name: 'datalab-to/chandra', score: 83.1 },
	    { name: 'infly/Infinity-Parser-7B', score: 82.5 },
	    { name: 'allenai/olmOCR-2-7B-1025-FP8', score: 82.4 },
	    { name: 'PaddlePaddle/PaddleOCR-VL', score: 80.0 },
	    { name: 'baidu/Qianfan-OCR', score: 79.8 },
	    { name: 'rednote-hilab/dots.ocr', score: 79.1 },
	    { name: 'deepseek-ai/DeepSeek-OCR-2', score: 76.3 },
	    { name: 'lightonai/LightOnOCR-1B-1025', score: 76.1 },
	    { name: 'deepseek-ai/DeepSeek-OCR', score: 75.7 },
	    { name: 'opendatalab/MinerU2.5-2509-1.2B', score: 75.2 },
	    { name: 'zai-org/GLM-OCR', score: 75.2 },
	    { name: 'FireRedTeam/FireRed-OCR', score: 70.2 },
	    { name: 'nanonets/Nanonets-OCR2-3B', score: 69.5 },
	  ];

	  // Index every row under both its full id and its model part.
	  const ocrMap = new Map();
	  for (const d of ocrData) {
	    ocrMap.set(normName(d.name), d);
	    const modelPart = d.name.split('/').pop();
	    if (modelPart) ocrMap.set(normName(modelPart), d);
	  }

	  let matched = 0;
	  const usedOcr = new Set();
	  for (const e of entries) {
	    const candidates = [
	      normName(e.name || ''),
	      normName((e.hf_id || '').split('/').pop() || ''),
	      normName(e.hf_id || ''),
	    ].filter(Boolean);

	    const ocr = candidates.map((c) => ocrMap.get(c)).find(Boolean);
	    if (!ocr) continue;

	    e.ocr_avg = ocr.score;
	    e.sources = { ...(e.sources || {}), ocr_avg: 'manual' };
	    matched++;
	    usedOcr.add(ocr.name);
	  }

	  const newEntries = ocrData
	    .filter((d) => !usedOcr.has(d.name))
	    .map((d) => ({
	      hf_id: d.name,
	      name: d.name.split('/').pop(),
	      ocr_avg: d.score,
	      sources: { ocr_avg: 'manual' },
	    }));

	  console.log(` OCR: ${matched} matched, ${newEntries.length} new entries`);
	  return [...entries, ...newEntries];
	}

	// ─── Merge ───────────────────────────────────────────────────────────────────

	/**
	 * Merge HF leaderboard entries into the LLMStats entries.
	 *
	 * An HF entry is matched to an LLMStats entry by normalised model name, tried first as
	 * is and then with its first word removed. For matched models, LLMStats values take
	 * priority: `hf_id`, `params_b`, `ifeval`, `bbh`, `gpqa` and `mmlu_pro` are filled from
	 * HF only when the LLMStats entry has no truthy value. The HF-only fields
	 * (`hf_math_lvl5`, `hf_musr`, `hf_avg`) are always taken from HF.
	 *
	 * Source attributions follow the values: a field keeps its existing attribution unless
	 * its value was actually taken from HF (all HF attributions used to be copied, which
	 * relabelled LLMStats values as 'hf'). Empty normalised names are never used as keys.
	 *
	 * The input arrays and their entries are not mutated; LLMStats entries are copied.
	 *
	 * @param {Array<Object>} llmstats - Entries from fetchLLMStats.
	 * @param {Array<Object>} hfEntries - Entries from fetchHFLeaderboard.
	 * @returns {Array<Object>} Merged LLMStats entries followed by the unmatched HF entries.
	 */
	function mergeEntries(llmstats, hfEntries) {
	  const FILL_IF_MISSING = ['hf_id', 'params_b', 'ifeval', 'bbh', 'gpqa', 'mmlu_pro'];
	  const HF_ONLY = ['hf_math_lvl5', 'hf_musr', 'hf_avg'];

	  // Normalised name / slug tail → index into `merged`.
	  const lsIdx = new Map();
	  llmstats.forEach((e, i) => {
	    const nameKey = normName(e.name);
	    if (nameKey) lsIdx.set(nameKey, i);
	    const slugModel = e.slug?.split('/').pop() || '';
	    const slugKey = slugModel ? normName(slugModel) : '';
	    if (slugKey) lsIdx.set(slugKey, i);
	  });

	  const merged = llmstats.map((e) => ({ ...e, sources: { ...(e.sources || {}) } }));
	  const hfOnly = [];
	  for (const hf of hfEntries) {
	    const modelPart = normName(hf.name);
	    const modelWords = modelPart.split(' ');
	    const modelNoPrefix = modelWords.length > 1 ? modelWords.slice(1).join(' ') : modelPart;
	    const idx = modelPart ? (lsIdx.get(modelPart) ?? lsIdx.get(modelNoPrefix)) : undefined;
	    if (idx === undefined) {
	      hfOnly.push(hf);
	      continue;
	    }

	    const target = merged[idx];
	    const hfSources = hf.sources || {};
	    for (const field of FILL_IF_MISSING) {
	      if (target[field]) continue; // LLMStats value wins
	      target[field] = hf[field];
	      if (hfSources[field] !== undefined) target.sources[field] = hfSources[field];
	    }
	    for (const field of HF_ONLY) {
	      target[field] = hf[field];
	      if (hfSources[field] !== undefined) target.sources[field] = hfSources[field];
	      else delete target.sources[field];
	    }
	  }
	  return [...merged, ...hfOnly];
	}

	// ─── Refresh ─────────────────────────────────────────────────────────────────

	// Fields owned by each refreshable source. refreshSource deletes exactly these fields
	// from every stored entry before merging freshly fetched data for that source; the keys
	// are also the valid values of the command-line source argument.
	// NOTE(review): fetchArtificialAnalysis also emits `aa_name` and `aa_slug`, which are
	// not listed under `aa` and are therefore not stripped on refresh — confirm intent.
	const SOURCE_FIELDS = {
	llmstats: ['slug', 'mmlu', 'mmlu_pro', 'gpqa', 'human_eval', 'math', 'gsm8k', 'mmmu', 'hellaswag', 'ifeval', 'arc', 'drop', 'mbpp', 'mgsm', 'bbh'],
	hf: ['hf_id', 'params_b', 'hf_math_lvl5', 'hf_musr', 'hf_avg'],
	livebench: ['lb_name', 'lb_global', 'lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'],
	arena: ['arena_name', 'arena_org', 'arena_elo', 'arena_rank', 'arena_votes'],
	aider: ['aider_model', 'aider_pass_rate'],
	aa: ['aa_id', 'aa_intelligence', 'aa_coding', 'aa_math', 'aa_mmlu_pro', 'aa_gpqa', 'aa_livecodebench', 'aa_hle', 'aa_scicode', 'aa_math_500', 'aa_aime', 'aa_tokens_per_s', 'aa_latency_s'],
	mteb: ['mteb_avg', 'mteb_retrieval'],
	ocr: ['ocr_avg'],
	};

	// One marker field per source: an entry counts as having data from a source when this
	// field is defined. refreshSource keeps an entry only if it carries the marker of at
	// least one source other than the one being refreshed.
	const SOURCE_ID_FIELD = {
	llmstats: 'slug', hf: 'hf_id', livebench: 'lb_name', arena: 'arena_elo', aider: 'aider_pass_rate', aa: 'aa_intelligence', mteb: 'mteb_avg', ocr: 'ocr_avg',
	};

	/**
	 * Re-fetch a single source and merge it into the existing data/benchmarks.json.
	 *
	 * The stored entries are loaded, entries that have no data from any other source are
	 * dropped, the chosen source's fields are removed from the rest, and the freshly
	 * fetched data is merged back in before the file is rewritten.
	 *
	 * NOTE(review): `mergeLLMStatsInto` and `mergeHFInto` are not defined in the portion of
	 * the file reviewed here — confirm they exist, otherwise the 'llmstats' and 'hf'
	 * branches fail with a ReferenceError.
	 *
	 * @param {string} source - One of the keys of SOURCE_FIELDS; any other value prints an
	 *   error and exits the process with status 1.
	 * @returns {Promise<void>}
	 */
	async function refreshSource(source) {
	  if (!SOURCE_FIELDS[source]) {
	    const valid = Object.keys(SOURCE_FIELDS).join(', ');
	    console.error(`Unknown source "${source}". Valid: ${valid}`);
	    process.exit(1);
	  }
	  console.log(`Refreshing benchmark source: ${source}\n`);

	  const existing = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));
	  const ownIdField = SOURCE_ID_FIELD[source];
	  const otherIdFields = Object.values(SOURCE_ID_FIELD).filter((field) => field !== ownIdField);

	  // Keep entries that some other source still contributes to, minus this source's fields.
	  const stripped = existing
	    .filter((entry) => otherIdFields.some((field) => entry[field] !== undefined))
	    .map((entry) => {
	      const copy = { ...entry };
	      for (const field of SOURCE_FIELDS[source]) delete copy[field];
	      return copy;
	    });

	  let result;
	  switch (source) {
	    case 'llmstats':
	      result = mergeLLMStatsInto(stripped, await fetchLLMStats());
	      break;
	    case 'hf':
	      result = mergeHFInto(stripped, await fetchHFLeaderboard());
	      break;
	    case 'livebench':
	      result = mergeLiveBench(stripped, await fetchLiveBench());
	      break;
	    case 'arena':
	      result = mergeArena(stripped, await fetchChatbotArena());
	      break;
	    case 'aider':
	      result = mergeAider(stripped, await fetchAider());
	      break;
	    case 'aa':
	      result = mergeArtificialAnalysis(stripped, await fetchArtificialAnalysis());
	      break;
	    case 'mteb':
	      result = mergeMTEB(stripped, await fetchMTEB());
	      break;
	    case 'ocr':
	      result = mergeOCR(stripped);
	      break;
	    default:
	      break;
	  }
	  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
	}

	// ─── HF README Evaluation ──────────────────────────────────────────────────

	/**
	 * Summarise the MTEB results declared in one model's README front matter.
	 *
	 * Downloads README.md from the model's Hugging Face repository, parses the YAML block
	 * between the leading `---` markers and averages the `model-index` results that look
	 * like MTEB results (dataset type/name containing 'mteb', or a task type containing
	 * 'retrieval'). Per result the first metric of type `main_score`, `ndcg_at_10` or
	 * `accuracy` is used; values up to 1 are scaled to percent and the result is capped at 100.
	 *
	 * Best effort: any download or parse failure yields null.
	 *
	 * NOTE(review): the README is remote content parsed with `yaml.load`; whether that is
	 * safe for untrusted input depends on the installed js-yaml version — confirm.
	 *
	 * @param {string} hfId - Hugging Face model id ("org/model").
	 * @returns {Promise<Object|null>} `{ hf_id, name, mteb_avg, mteb_retrieval, sources }`
	 *   or null when the README has no usable MTEB results.
	 */
	async function readmeMtebSummary(hfId) {
	  try {
	    const url = `https://huggingface.co/${hfId}/raw/main/README.md`;
	    const text = await getText(url, { retries: 1 });
	    if (!text.startsWith('---')) return null;

	    const endYaml = text.indexOf('---', 3);
	    if (endYaml === -1) return null;

	    const meta = yaml.load(text.substring(3, endYaml));
	    if (!meta || !meta['model-index']) return null;

	    let total = 0;
	    let count = 0;
	    let retTotal = 0;
	    let retCount = 0;
	    const modelIndex = Array.isArray(meta['model-index']) ? meta['model-index'] : [meta['model-index']];
	    for (const mi of modelIndex) {
	      for (const res of mi?.results || []) {
	        const taskType = (res.task?.type || '').toLowerCase();
	        const isMTEB = (res.dataset?.type || '').toLowerCase().includes('mteb') ||
	          (res.dataset?.name || '').toLowerCase().includes('mteb') ||
	          taskType.includes('retrieval');
	        if (!isMTEB) continue;

	        const mainMetric = (res.metrics || []).find(
	          (m) => m.type === 'main_score' || m.type === 'ndcg_at_10' || m.type === 'accuracy'
	        );
	        if (!mainMetric || typeof mainMetric.value !== 'number') continue;

	        const val = mainMetric.value;
	        const norm = Math.min(val <= 1.0 ? val * 100 : val, 100); // Cap at 100
	        total += norm;
	        count++;
	        if (taskType.includes('retrieval') || taskType.includes('search')) {
	          retTotal += norm;
	          retCount++;
	        }
	      }
	    }
	    if (count === 0) return null;

	    const round2 = (x) => Math.round(x * 100) / 100;
	    const hasRetrieval = retCount > 0;
	    // Only attribute fields that actually carry a value.
	    const sources = { mteb_avg: 'hf-readme' };
	    if (hasRetrieval) sources.mteb_retrieval = 'hf-readme';
	    return {
	      hf_id: hfId,
	      name: hfId.split('/').pop(),
	      mteb_avg: round2(total / count),
	      mteb_retrieval: hasRetrieval ? round2(retTotal / retCount) : undefined,
	      sources,
	    };
	  } catch (e) {
	    // Missing README, network error or malformed front matter: skip this model.
	    return null;
	  }
	}

	/**
	 * Collect MTEB averages from the README metadata of every model with an `hf_id` in
	 * data/providers.json.
	 *
	 * Models are processed ten at a time; models without usable metadata are skipped.
	 *
	 * @returns {Promise<Array<Object>>} Entries shaped as
	 *   `{ hf_id, name, mteb_avg, mteb_retrieval, sources }`; [] when data/providers.json
	 *   does not exist.
	 */
	async function fetchHFReadmeBenchmarks() {
	  const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
	  if (!fs.existsSync(providersPath)) return [];

	  const providers = JSON.parse(fs.readFileSync(providersPath, 'utf8')).providers;
	  const hfIds = new Set();
	  providers.forEach((p) => (p.models || []).forEach((m) => {
	    if (m.hf_id) hfIds.add(m.hf_id);
	  }));

	  process.stdout.write(`HF README: checking ${hfIds.size} models... `);
	  const results = [];

	  const BATCH = 10;
	  const ids = Array.from(hfIds);
	  for (let i = 0; i < ids.length; i += BATCH) {
	    const rows = await Promise.all(ids.slice(i, i + BATCH).map((hfId) => readmeMtebSummary(hfId)));
	    for (const r of rows) {
	      if (r) results.push(r);
	    }
	    process.stdout.write(` HF README: ${Math.min(i + BATCH, ids.length)}/${ids.length}\r`);
	  }

	  console.log(`\n HF README: ${results.length} models enriched from metadata`);
	  return results;
	}

	// ─── Main ────────────────────────────────────────────────────────────────────

	/**
	 * Script entry point.
	 *
	 * With a command-line argument, refreshes only that source via refreshSource. Without
	 * one, fetches every source concurrently, merges them in a fixed order (LLMStats + HF,
	 * LiveBench, Arena, Aider, Artificial Analysis, MTEB results, README metadata, OCR),
	 * prints coverage statistics and writes data/benchmarks.json.
	 *
	 * @returns {Promise<void>}
	 */
	async function main() {
	  const source = process.argv[2]?.toLowerCase();
	  if (source) {
	    await refreshSource(source);
	    return;
	  }

	  const fetched = await Promise.all([
	    fetchLLMStats(),
	    fetchHFLeaderboard(),
	    fetchLiveBench(),
	    fetchChatbotArena(),
	    fetchAider(),
	    fetchArtificialAnalysis(),
	    fetchMTEB(),
	    fetchHFReadmeBenchmarks(),
	  ]);
	  const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries, mtebEntries, readmeEntries] = fetched;

	  // Each step takes the result of the previous one; the order is significant.
	  let all = mergeEntries(llmstats, hfEntries);
	  all = mergeLiveBench(all, lbEntries);
	  all = mergeArena(all, arenaEntries);
	  all = mergeAider(all, aiderEntries);
	  all = mergeArtificialAnalysis(all, aaEntries);
	  all = mergeMTEB(all, mtebEntries);
	  all = mergeMTEB(all, readmeEntries);
	  all = mergeOCR(all);

	  const withLb = all.filter((e) => e.lb_name).length;
	  const withArena = all.filter((e) => e.arena_elo).length;
	  const withAider = all.filter((e) => e.aider_pass_rate !== undefined).length;
	  const withAa = all.filter((e) => e.aa_intelligence !== undefined).length;
	  const withMteb = all.filter((e) => e.mteb_avg !== undefined).length;
	  const withOcr = all.filter((e) => e.ocr_avg !== undefined).length;

	  console.log(`\nTotal entries: ${all.length}`);
	  console.log(` With LiveBench: ${withLb} | Arena: ${withArena} | Aider: ${withAider} | AA: ${withAa} | MTEB: ${withMteb} | OCR: ${withOcr}`);

	  fs.writeFileSync(OUT_FILE, JSON.stringify(all, null, 2));
	  const sizeKb = (fs.statSync(OUT_FILE).size / 1024).toFixed(0);
	  console.log(`Saved to data/benchmarks.json (${sizeKb} KB)`);
	}

	main().catch((err) => {
	  console.error('Fatal:', err);
	  process.exit(1);
	});