"""Evaluation metrics: calibration error, ROC AUC, and mean-based summaries."""

import numpy as np

try:
    from sklearn.metrics import roc_auc_score
except ImportError:  # scikit-learn is only required by auc_nrp
    roc_auc_score = None


def expected_calibration_error(confs, corrects, n_bins: int = 10):
    """Return the expected calibration error (ECE) over equal-width bins.

    The interval [0, 1] is split into ``n_bins`` equal-width bins. For every
    non-empty bin the absolute gap between mean correctness and mean
    confidence is weighted by the fraction of all samples in that bin, and
    the weighted gaps are summed.

    Args:
        confs: Sequence of confidence values; converted to a float array.
        corrects: Sequence of correctness indicators, same length as
            ``confs``; converted to an int array.
        n_bins: Number of equal-width bins on [0, 1].

    Returns:
        The ECE as a ``float``, or ``None`` when ``confs`` is empty.

    Note:
        Confidences outside [0, 1] fall in no bin, yet they still count in
        the denominator (``len(confs)``) used for the bin weights.
    """
    confs = np.asarray(confs, dtype=float)
    corrects = np.asarray(corrects, dtype=int)
    total = len(confs)
    if total == 0:
        return None

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    last_bin = n_bins - 1
    ece = 0.0
    for i in range(n_bins):
        lower, upper = edges[i], edges[i + 1]
        # Bins are half-open [lower, upper) except the last one, which is
        # closed so that a confidence of exactly 1.0 is counted.
        if i < last_bin:
            in_bin = (confs >= lower) & (confs < upper)
        else:
            in_bin = (confs >= lower) & (confs <= upper)
        if not in_bin.any():
            continue
        accuracy = corrects[in_bin].mean()
        confidence = confs[in_bin].mean()
        ece += (in_bin.sum() / total) * abs(accuracy - confidence)
    return float(ece)


def auc_nrp(hidden_scores, future_corrections):
    """Return the ROC AUC of ``hidden_scores`` against ``future_corrections``.

    Args:
        hidden_scores: Sequence of scores passed to ``roc_auc_score`` as the
            predicted scores.
        future_corrections: Sequence of labels; cast to ``int`` before use.

    Returns:
        The AUC as a ``float``, or ``None`` when ``hidden_scores`` is empty
        or the integer-cast labels contain fewer than two distinct values.

    Raises:
        ImportError: If scikit-learn is not installed and an AUC actually
            has to be computed.
    """
    if len(hidden_scores) == 0:
        return None

    # Count classes *after* the int cast: labels such as [0.2, 0.7] look
    # distinct before the cast but collapse to a single class afterwards,
    # which would make roc_auc_score raise instead of returning None here.
    labels = np.asarray(future_corrections).astype(int)
    if np.unique(labels).size < 2:
        return None

    if roc_auc_score is None:
        raise ImportError(
            "auc_nrp requires scikit-learn (sklearn.metrics.roc_auc_score)"
        )
    return float(roc_auc_score(labels, np.asarray(hidden_scores)))


def _mean_or_zero(values):
    """Return the mean of ``values`` as a float, or 0.0 if None or empty."""
    if values is None:
        return 0.0
    # Check the array size rather than truthiness: ``not values`` raises
    # ValueError for numpy arrays holding more than one element.
    arr = np.asarray(values)
    if arr.size == 0:
        return 0.0
    return float(arr.mean())


def stability_duration(dwell_steps):
    """Return the mean of ``dwell_steps``, or 0.0 when it is None or empty.

    Accepts lists, tuples and numpy arrays.
    """
    return _mean_or_zero(dwell_steps)


def counterfactual_consistency(scores):
    """Return the mean of ``scores``, or 0.0 when it is None or empty.

    Accepts lists, tuples and numpy arrays.
    """
    return _mean_or_zero(scores)