# RAG_Eval/evaluation/stats/significance.py
# Page residue from the hosting site: "Rom89823974978's picture"
# Last commit: "Resolved issues" (4dc151e)
"""Significance testing utilities (Wilcoxon, Holm-Bonferroni, delta‐metric)."""
from __future__ import annotations
from typing import Sequence, Mapping, List, Tuple
import numpy as np
from scipy import stats
def wilcoxon_signed_rank(
    x: Sequence[float],
    y: Sequence[float],
    *,
    alternative: str = "two-sided",
) -> tuple[float, float]:
    """Run SciPy's paired Wilcoxon signed-rank test on two samples.

    Parameters
    ----------
    x, y : Sequence[float]
        Paired observations, forwarded unchanged to ``scipy.stats.wilcoxon``.
    alternative : str, keyword-only
        Forwarded as SciPy's ``alternative`` argument (default
        ``"two-sided"``).

    Returns
    -------
    tuple[float, float]
        ``(statistic, pvalue)`` from the SciPy result, converted to plain
        Python floats.
    """
    outcome = stats.wilcoxon(x, y, alternative=alternative)
    statistic = float(outcome.statistic)
    p_value = float(outcome.pvalue)
    return statistic, p_value
def holm_bonferroni(pvalues: Mapping[str, float]) -> Mapping[str, float]:
    """Holm-Bonferroni step-down correction for multiple hypotheses.

    The raw p-values are sorted in ascending order; the i-th smallest
    (1-based) is multiplied by ``m - i + 1`` where ``m`` is the number of
    hypotheses, capped at 1.0, and then forced to be at least as large as
    every adjusted value before it.

    Parameters
    ----------
    pvalues : dict
        Mapping from *name* → raw p-value.

    Returns
    -------
    dict
        Mapping from *name* → adjusted p-value, with keys inserted in
        ascending order of raw p-value. An empty input yields an empty dict.
    """
    m = len(pvalues)
    sorted_items: List[Tuple[str, float]] = sorted(
        pvalues.items(), key=lambda kv: kv[1]
    )
    adjusted: dict[str, float] = {}
    running_max = 0.0
    for i, (name, p) in enumerate(sorted_items, start=1):
        scaled = min((m - i + 1) * p, 1.0)
        # Step-down monotonicity: the multiplier shrinks as i grows, so a
        # larger raw p-value could otherwise get a *smaller* adjusted value
        # than a hypothesis ranked before it. Carry the running maximum.
        running_max = max(running_max, scaled)
        adjusted[name] = running_max
    return adjusted