llm_qualia_2 / bp_phi /metrics.py
neuralworm's picture
initial commit
2f0addb
import numpy as np
from sklearn.metrics import roc_auc_score
def expected_calibration_error(confs, corrects, n_bins: int = 10):
    """Return the expected calibration error (ECE) over equal-width bins.

    The interval [0, 1] is split into ``n_bins`` equal-width bins. For every
    non-empty bin the absolute gap between the mean of ``corrects`` and the
    mean of ``confs`` in that bin is weighted by the fraction of all samples
    that fall into the bin, and the weighted gaps are summed.

    Args:
        confs: Sequence of confidence values; converted to a float array.
        corrects: Sequence of correctness flags, aligned with ``confs``;
            converted to an int array (so non-integer values are truncated).
        n_bins: Number of equal-width bins; must be at least 1.

    Returns:
        The ECE as a float, or ``None`` when ``confs`` is empty.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.

    Note:
        Confidences outside [0, 1] (or NaN) match no bin: they add nothing to
        the sum but are still counted in the total used for the bin weights.
    """
    # Without this guard n_bins of 0 or -1 would visit no bin at all and
    # report a misleading "perfect" score of 0.0.
    if n_bins < 1:
        raise ValueError(f"n_bins must be >= 1, got {n_bins!r}")

    confs = np.asarray(confs, dtype=float)
    corrects = np.asarray(corrects, dtype=int)
    total = len(confs)
    if total == 0:
        return None

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        lower, upper = edges[i], edges[i + 1]
        if i < n_bins - 1:
            # Half-open bin [lower, upper).
            in_bin = (confs >= lower) & (confs < upper)
        else:
            # The last bin is closed so a confidence of exactly 1.0 is kept.
            in_bin = (confs >= lower) & (confs <= upper)
        if not in_bin.any():
            continue
        accuracy = corrects[in_bin].mean()
        mean_conf = confs[in_bin].mean()
        ece += (in_bin.sum() / total) * abs(accuracy - mean_conf)
    return float(ece)
def auc_nrp(hidden_scores, future_corrections):
    """Return the ROC AUC of ``hidden_scores`` against ``future_corrections``.

    ``future_corrections`` is cast to int and used as the true labels;
    ``hidden_scores`` is used as the ranking score passed to
    ``roc_auc_score``.

    Args:
        hidden_scores: Sequence of scores, one per sample.
        future_corrections: Sequence of labels aligned with
            ``hidden_scores``; values are truncated to int before scoring.

    Returns:
        The AUC as a float, or ``None`` when there are no scores or the
        labels do not contain at least two distinct classes.
    """
    if len(hidden_scores) == 0 or len(set(future_corrections)) < 2:
        return None

    labels = np.asarray(future_corrections).astype(int)
    # The int cast can merge distinct raw values (e.g. 0.2 and 0.7 both
    # become 0), so re-check the class count on what is actually scored;
    # roc_auc_score raises when only one class is present.
    if np.unique(labels).size < 2:
        return None

    return float(roc_auc_score(labels, np.asarray(hidden_scores)))
def stability_duration(dwell_steps):
    """Return the arithmetic mean of ``dwell_steps`` as a float.

    Args:
        dwell_steps: Sequence (list, tuple or numpy array) of numeric values,
            or ``None``.

    Returns:
        The mean of the values, or ``0.0`` when ``dwell_steps`` is ``None``
        or empty.
    """
    if dwell_steps is None:
        return 0.0
    # Check the element count explicitly: the truth value of a multi-element
    # numpy array is ambiguous and raises ValueError.
    values = np.asarray(dwell_steps)
    if values.size == 0:
        return 0.0
    return float(np.mean(values))
def counterfactual_consistency(scores):
    """Return the arithmetic mean of ``scores`` as a float.

    Args:
        scores: Sequence (list, tuple or numpy array) of numeric values, or
            ``None``.

    Returns:
        The mean of the values, or ``0.0`` when ``scores`` is ``None`` or
        empty.
    """
    if scores is None:
        return 0.0
    # Check the element count explicitly: the truth value of a multi-element
    # numpy array is ambiguous and raises ValueError.
    values = np.asarray(scores)
    if values.size == 0:
        return 0.0
    return float(np.mean(values))