import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score


def compute_roc_auc_from_csv(preds_csv: str, labels_csv: str, valid_mask):
    """Compute the per-column ROC AUC and its NaN-aware mean from two CSVs.

    Both files are read with ``pandas.read_csv``. Columns whose name contains
    "smiles" (case-insensitive, detected on the predictions file) are dropped,
    and only columns present in both files are scored, in the order they
    appear in the predictions file. Non-numeric cells are coerced to NaN.

    The predictions file is expected to hold one row per ``True`` entry of
    ``valid_mask``; those rows are scattered back into a full-size array and
    every other row gets a neutral score of 0.5. Label cells that are NaN are
    excluded from the AUC of their column.

    Args:
        preds_csv: Path to the CSV holding predicted scores.
        labels_csv: Path to the CSV holding ground-truth labels.
        valid_mask: Boolean-like sequence/array marking which rows have a
            prediction. Only its last ``n`` entries are used, where ``n`` is
            the number of rows in the labels file.

    Returns:
        ``(auc_array, mean_auc)``: a float32 array with one AUC per shared
        column (NaN when the column has no labels or ``roc_auc_score``
        rejects it, e.g. a single class), and the mean over non-NaN entries
        (NaN when there are none).

    Raises:
        ValueError: If the number of ``True`` entries in the (trimmed) mask
            differs from the number of prediction rows.
    """
    preds = pd.read_csv(preds_csv)
    labels = pd.read_csv(labels_csv)

    smiles_cols = [c for c in preds.columns if "smiles" in c.lower()]
    if smiles_cols:
        print(f"🧪 Dropping SMILES columns: {smiles_cols}")
        preds = preds.drop(columns=smiles_cols, errors="ignore")
        labels = labels.drop(columns=smiles_cols, errors="ignore")

    shared_cols = [c for c in preds.columns if c in labels.columns]
    preds = preds[shared_cols].apply(pd.to_numeric, errors="coerce")
    labels = labels[shared_cols].apply(pd.to_numeric, errors="coerce")

    y_pred_clean = preds.to_numpy(dtype=float)
    y_true = labels.to_numpy(dtype=float)

    # Coerce to a real boolean array so a 0/1 integer mask is used as a mask
    # instead of being interpreted by NumPy as integer (fancy) row indices.
    # Only the trailing entries matching the label rows are kept.
    valid_mask = np.asarray(valid_mask, dtype=bool)[-y_true.shape[0]:]

    # Without this check a single prediction row would be silently broadcast
    # to every valid row by the masked assignment below.
    n_valid = int(valid_mask.sum())
    if n_valid != y_pred_clean.shape[0]:
        raise ValueError(
            f"valid_mask selects {n_valid} rows but the predictions file "
            f"has {y_pred_clean.shape[0]} rows"
        )

    # Re-expand to original size; rows without a prediction score 0.5.
    y_pred = np.full((len(valid_mask), y_pred_clean.shape[1]), 0.5, dtype=float)
    y_pred[valid_mask] = y_pred_clean

    y_mask = ~np.isnan(y_true)

    auc_list = []
    for col, labelled in enumerate(y_mask.T):
        auc = np.nan
        if labelled.any():
            try:
                auc = roc_auc_score(y_true[labelled, col], y_pred[labelled, col])
            except ValueError:
                # roc_auc_score could not score this column; keep NaN so it
                # is skipped by the mean instead of aborting the whole run.
                auc = np.nan
        auc_list.append(auc)

    auc_array = np.array(auc_list, dtype=np.float32)
    if np.isnan(auc_array).all():
        # No scorable column (or no columns at all): return NaN directly
        # rather than letting nanmean emit a "Mean of empty slice" warning.
        mean_auc = np.float32(np.nan)
    else:
        mean_auc = np.nanmean(auc_array)
    return auc_array, mean_auc