RAG_Eval / evaluation /stats /robustness.py
Rom89823974978's picture
Resolved issues
4dc151e
"""Robustness & sensitivity analysis helpers (RQ3 / RQ4)."""
from __future__ import annotations
from typing import Sequence, Tuple, Mapping, Any
import numpy as np
from scipy import stats
def delta_metric(
    orig: Sequence[float], perturbed: Sequence[float]
) -> Tuple[float, float]:
    """Compute the mean shift and Cohen's *d* between two paired metric runs.

    Parameters
    ----------
    orig
        Metric values measured on the unperturbed inputs.
    perturbed
        Metric values measured on the perturbed inputs, paired element-wise
        with *orig*.

    Returns
    -------
    tuple of (mean_delta, cohen_d)
        ``mean_delta`` is ``mean(perturbed - orig)``.  ``cohen_d`` divides
        that by the root-mean-square of the two sample standard deviations
        (``ddof=1``); it is NaN when that pooled deviation is zero.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    baseline = np.asarray(orig, dtype=float)
    shifted = np.asarray(perturbed, dtype=float)
    if baseline.shape != shifted.shape:
        raise ValueError("orig and perturbed must have the same length")

    mean_delta = np.mean(shifted - baseline)

    # Pooled SD = sqrt of the average of the two sample variances.
    baseline_var = baseline.std(ddof=1) ** 2
    shifted_var = shifted.std(ddof=1) ** 2
    pooled_sd = np.sqrt((baseline_var + shifted_var) / 2)

    # A zero pooled SD would divide by zero; report NaN instead.
    # (A NaN pooled SD is truthy and simply propagates through the division.)
    if not pooled_sd:
        return float(mean_delta), float("nan")
    return float(mean_delta), float(mean_delta / pooled_sd)
def conditional_failure_rate(
    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
) -> Mapping[str, float]:
    """Hallucination rate split by whether retrieval failed or succeeded.

    Parameters
    ----------
    retrieval_errors
        Per-example flags, truthy where retrieval failed.
    hallucinations
        Per-example flags, truthy where the output hallucinated; paired
        element-wise with *retrieval_errors*.

    Returns
    -------
    dict with keys:
        p_hallucination_given_error
        p_hallucination_given_success
    A rate is NaN when its conditioning group contains no examples.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    failed = np.asarray(retrieval_errors, dtype=bool)
    hallucinated = np.asarray(hallucinations, dtype=bool)
    if failed.shape != hallucinated.shape:
        raise ValueError("Input lengths differ")

    groups = (
        ("p_hallucination_given_error", failed),
        ("p_hallucination_given_success", ~failed),
    )
    rates = {}
    for key, selector in groups:
        subset = hallucinated[selector]
        # An empty group has no defined rate; avoid the mean-of-empty warning.
        rates[key] = float(subset.mean()) if subset.size else float("nan")
    return rates
def chi2_error_propagation(
    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
) -> Mapping[str, Any]:
    """Chi-square test of independence between retrieval error and hallucination.

    Builds the 2x2 contingency table (rows: retrieval success / error,
    columns: no hallucination / hallucination) and runs
    ``scipy.stats.chi2_contingency`` on it with SciPy's default settings.

    Parameters
    ----------
    retrieval_errors
        Per-example flags, truthy where retrieval failed.
    hallucinations
        Per-example flags, truthy where the output hallucinated; paired
        element-wise with *retrieval_errors*.

    Returns
    -------
    dict with keys:
        chi2      test statistic (float)
        p         p-value (float)
        dof       degrees of freedom (int)
        expected  expected frequencies under independence (nested list)
        table     observed counts (nested list of Python ints)
    When SciPy cannot run the test (it raises ``ValueError``, e.g. for a
    table with an all-zero row or column), a neutral result is returned:
    ``chi2=0.0, p=1.0, dof=0`` and an all-zero ``expected`` table.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    retrieval_errors = np.asarray(retrieval_errors, dtype=bool)
    hallucinations = np.asarray(hallucinations, dtype=bool)
    # Without this check NumPy would silently broadcast a length-1 input
    # against the other one and produce a meaningless table.
    if retrieval_errors.shape != hallucinations.shape:
        raise ValueError("Input lengths differ")

    def _count(row_mask, col_mask):
        # Native int keeps the returned table JSON-serialisable, matching
        # the other fields which are already converted to Python types.
        return int(np.count_nonzero(row_mask & col_mask))

    retrieved_ok = ~retrieval_errors
    faithful = ~hallucinations
    table = [
        [_count(retrieved_ok, faithful), _count(retrieved_ok, hallucinations)],
        [
            _count(retrieval_errors, faithful),
            _count(retrieval_errors, hallucinations),
        ],
    ]

    try:
        chi2, p, dof, expected = stats.chi2_contingency(table)
    except ValueError:
        # Degenerate table: report "no evidence of dependence" rather than
        # failing the whole analysis.
        default_expected = [[0, 0], [0, 0]]
        return dict(chi2=0.0, p=1.0, dof=0, expected=default_expected, table=table)

    return dict(
        chi2=float(chi2),
        p=float(p),
        dof=int(dof),
        expected=expected.tolist(),
        table=table,
    )