Spaces:

Rom89823974978
/

RAG_Eval

Sleeping

App Files Files Community

RAG_Eval / evaluation /stats /robustness.py

Rom89823974978

Resolved issues

4dc151e 4 months ago

raw

history blame contribute delete

2.73 kB

	"""Robustness & sensitivity analysis helpers (RQ3 / RQ4)."""

	from __future__ import annotations
	from typing import Sequence, Tuple, Mapping, Any
	import numpy as np
	from scipy import stats


	def delta_metric(
	    orig: Sequence[float], perturbed: Sequence[float]
	) -> Tuple[float, float]:
	    """Return the mean paired delta and a Cohen's d effect size.

	    The delta is ``mean(perturbed - orig)``.  Cohen's d divides that delta by
	    the root of the averaged sample variances of the two series,
	    ``sqrt((var(orig) + var(perturbed)) / 2)`` with ``ddof=1``; it is *not*
	    standardised by the SD of the paired differences.

	    Parameters
	    ----------
	    orig
	        Metric values measured on the unperturbed inputs.
	    perturbed
	        Metric values measured on the perturbed inputs, paired element-wise
	        with ``orig``.

	    Returns
	    -------
	    tuple of (mean_delta, cohen_d)
	        ``mean_delta`` is NaN for empty input.  ``cohen_d`` is NaN when fewer
	        than two pairs are given (sample variance undefined) or when the
	        pooled standard deviation is zero.

	    Raises
	    ------
	    ValueError
	        If ``orig`` and ``perturbed`` do not have the same shape.
	    """
	    orig = np.asarray(orig, dtype=float)
	    perturbed = np.asarray(perturbed, dtype=float)
	    if orig.shape != perturbed.shape:
	        raise ValueError("orig and perturbed must have the same length")

	    nan = float("nan")
	    if orig.size == 0:
	        # Nothing to average; avoid NumPy's "mean of empty slice" warning.
	        return nan, nan

	    delta = float(np.mean(perturbed - orig))
	    if orig.size < 2:
	        # Sample variance with ddof=1 is undefined for a single observation.
	        return delta, nan

	    pooled_sd = float(np.sqrt((orig.var(ddof=1) + perturbed.var(ddof=1)) / 2))
	    # A zero pooled SD (both series constant) would divide by zero.
	    cohen_d = delta / pooled_sd if pooled_sd else nan
	    return delta, cohen_d


	def conditional_failure_rate(
	    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
	) -> Mapping[str, float]:
	    """Hallucination rate split by whether retrieval failed or succeeded.

	    Both inputs are coerced to boolean arrays and must have the same shape.

	    Returns
	    -------
	    dict with keys:
	        p_hallucination_given_error
	            Mean of ``hallucinations`` over items where ``retrieval_errors``
	            is true.
	        p_hallucination_given_success
	            Mean of ``hallucinations`` over the remaining items.
	        A rate is NaN when its group contains no items.

	    Raises
	    ------
	    ValueError
	        If the two inputs differ in shape.
	    """
	    failed = np.asarray(retrieval_errors, dtype=bool)
	    hallucinated = np.asarray(hallucinations, dtype=bool)
	    if failed.shape != hallucinated.shape:
	        raise ValueError("Input lengths differ")

	    groups = (
	        ("p_hallucination_given_error", failed),
	        ("p_hallucination_given_success", ~failed),
	    )
	    rates = {}
	    for key, members in groups:
	        selected = hallucinated[members]
	        # An empty group has no defined rate.
	        rates[key] = float(selected.mean()) if selected.size else float("nan")
	    return rates


	def chi2_error_propagation(
	    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
	) -> Mapping[str, Any]:
	    """Chi-square test of independence between retrieval error and hallucination.

	    Builds a 2x2 contingency table (rows: retrieval success / retrieval error,
	    columns: no hallucination / hallucination) and passes it to
	    ``scipy.stats.chi2_contingency`` with its default settings.

	    Parameters
	    ----------
	    retrieval_errors
	        Per-item flags, true where retrieval failed.
	    hallucinations
	        Per-item flags, true where the answer hallucinated; paired
	        element-wise with ``retrieval_errors``.

	    Returns
	    -------
	    dict with keys:
	        chi2, p, dof
	            Test statistic, p-value and degrees of freedom.
	        expected
	            Expected frequencies as a nested list.
	        table
	            The observed 2x2 table as a nested list of Python ints.
	        When SciPy rejects the table with ``ValueError`` (for example an
	        empty row or column), a neutral result is returned instead:
	        ``chi2=0.0, p=1.0, dof=0`` and an all-zero ``expected``.

	    Raises
	    ------
	    ValueError
	        If the two inputs differ in shape.
	    """
	    from scipy.stats import chi2_contingency

	    retrieval_errors = np.asarray(retrieval_errors, dtype=bool)
	    hallucinations = np.asarray(hallucinations, dtype=bool)
	    # Same guard as conditional_failure_rate: without it, mismatched but
	    # broadcastable inputs would be silently combined.
	    if retrieval_errors.shape != hallucinations.shape:
	        raise ValueError("Input lengths differ")

	    retrieval_ok = ~retrieval_errors
	    grounded = ~hallucinations
	    # Cast counts to plain ints so the returned table is JSON-serialisable,
	    # matching the already list-converted ``expected``.
	    table = [
	        [
	            int((retrieval_ok & grounded).sum()),
	            int((retrieval_ok & hallucinations).sum()),
	        ],
	        [
	            int((retrieval_errors & grounded).sum()),
	            int((retrieval_errors & hallucinations).sum()),
	        ],
	    ]

	    try:
	        chi2, p, dof, expected = chi2_contingency(table)
	    except ValueError:
	        # Degenerate table: the test is undefined, so report "no evidence
	        # of dependence" rather than failing the whole analysis.
	        default_expected = [[0, 0], [0, 0]]
	        return dict(chi2=0.0, p=1.0, dof=0, expected=default_expected, table=table)

	    return dict(
	        chi2=float(chi2),
	        p=float(p),
	        dof=int(dof),
	        expected=expected.tolist(),
	        table=table,
	    )