# Spaces: Sleeping  (page-status residue captured with this file; not part of the module)
"""Robustness & sensitivity analysis helpers (RQ3 / RQ4).""" | |
from __future__ import annotations | |
from typing import Sequence, Tuple, Mapping, Any | |
import numpy as np | |
from scipy import stats | |
def delta_metric(
    orig: Sequence[float], perturbed: Sequence[float]
) -> Tuple[float, float]:
    """Return the mean delta and a Cohen's *d* effect size.

    *orig* and *perturbed* must be paired metric values (same shape).

    The delta is ``mean(perturbed - orig)``. The effect size divides that
    delta by the square root of the average of the two sample variances
    (``ddof=1``) of *orig* and *perturbed*.

    Returns
    -------
    (delta, cohen_d) as builtin floats.
        * Empty input yields ``(nan, nan)``.
        * With fewer than two values the sample variance is undefined, so
          ``cohen_d`` is ``nan`` while ``delta`` is still reported.
        * When both samples have zero spread, ``cohen_d`` is ``nan``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    orig = np.asarray(orig, dtype=float)
    perturbed = np.asarray(perturbed, dtype=float)
    if orig.shape != perturbed.shape:
        raise ValueError("orig and perturbed must have the same length")

    nan = float("nan")

    # Nothing to average: answer directly instead of letting NumPy warn
    # about the mean of an empty slice.
    if orig.size == 0:
        return nan, nan

    delta = float(np.mean(perturbed - orig))

    # A ddof=1 variance needs at least two observations; with fewer, NumPy
    # would emit a "degrees of freedom <= 0" RuntimeWarning and return nan.
    if orig.size < 2:
        return delta, nan

    pooled_sd = float(
        np.sqrt((orig.var(ddof=1) + perturbed.var(ddof=1)) / 2)
    )
    # `pooled_sd > 0` is False for both 0.0 and nan, so either case gives nan.
    cohen_d = delta / pooled_sd if pooled_sd > 0 else nan
    return delta, cohen_d
def conditional_failure_rate(
    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
) -> Mapping[str, float]:
    """Hallucination rate split by whether retrieval failed or succeeded.

    Both inputs are converted to boolean arrays and must have the same shape.

    Returns
    -------
    dict with keys:
        p_hallucination_given_error
            Mean of *hallucinations* over items where *retrieval_errors*
            is true.
        p_hallucination_given_success
            Mean of *hallucinations* over the remaining items.
        A rate is ``nan`` when its group contains no items.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    failed = np.asarray(retrieval_errors, dtype=bool)
    hallucinated = np.asarray(hallucinations, dtype=bool)
    if failed.shape != hallucinated.shape:
        raise ValueError("Input lengths differ")

    groups = (
        ("p_hallucination_given_error", failed),
        ("p_hallucination_given_success", ~failed),
    )

    rates = {}
    for key, members in groups:
        if members.sum() == 0:
            # Empty group: the conditional rate is undefined.
            rates[key] = float("nan")
        else:
            rates[key] = float(hallucinated[members].mean())
    return rates
def chi2_error_propagation(
    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
) -> Mapping[str, Any]:
    """Chi-square test of independence between retrieval error and hallucination.

    Builds a 2x2 contingency table whose rows are retrieval outcome
    (success, error) and whose columns are hallucination (no, yes), then
    runs ``scipy.stats.chi2_contingency`` on it with its default settings.
    NOTE(review): SciPy's default applies Yates' continuity correction to a
    2x2 table -- confirm that is the intended statistic.

    Returns
    -------
    dict with keys ``chi2``, ``p``, ``dof``, ``expected`` and ``table``.
        All values are builtin Python types (floats, ints, nested lists).
        If SciPy rejects the table with ``ValueError``, the neutral result
        ``chi2=0.0, p=1.0, dof=0`` with an all-zero ``expected`` table is
        returned instead.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    from scipy.stats import chi2_contingency

    retrieval_errors = np.asarray(retrieval_errors, dtype=bool)
    hallucinations = np.asarray(hallucinations, dtype=bool)
    # Without this check a length-1 input would be silently broadcast against
    # a longer one and produce a meaningless table.
    if retrieval_errors.shape != hallucinations.shape:
        raise ValueError("Input lengths differ")

    retrieval_ok = ~retrieval_errors
    no_hallucination = ~hallucinations

    # Cast every cell to a builtin int so the result stays JSON-serializable,
    # matching the float()/int()/tolist() conversions of the other fields.
    table = [
        [
            int(np.count_nonzero(retrieval_ok & no_hallucination)),
            int(np.count_nonzero(retrieval_ok & hallucinations)),
        ],
        [
            int(np.count_nonzero(retrieval_errors & no_hallucination)),
            int(np.count_nonzero(retrieval_errors & hallucinations)),
        ],
    ]

    try:
        chi2, p, dof, expected = chi2_contingency(table)
    except ValueError:
        # Degenerate table: report "no evidence of dependence".
        default_expected = [[0, 0], [0, 0]]
        return dict(chi2=0.0, p=1.0, dof=0, expected=default_expected, table=table)

    return dict(
        chi2=float(chi2),
        p=float(p),
        dof=int(dof),
        expected=expected.tolist(),
        table=table,
    )