Sonja Topf
committed on
Commit
·
e448508
1
Parent(s):
0d7dfdb
changed evaluation
Browse files
- .gitignore +2 -1
- src/evaluation.py +6 -2
- src/preprocess.py +16 -0
.gitignore
CHANGED
|
@@ -9,4 +9,5 @@ logs/*
|
|
| 9 |
!logs/.gitkeep
|
| 10 |
data/*
|
| 11 |
!data/.gitkeep
|
| 12 |
-
.env
|
|
|
|
|
|
| 9 |
!logs/.gitkeep
|
| 10 |
data/*
|
| 11 |
!data/.gitkeep
|
| 12 |
+
.env
|
| 13 |
+
eval_runs.py
|
src/evaluation.py
CHANGED
|
@@ -2,7 +2,7 @@ import pandas as pd
|
|
| 2 |
import numpy as np
|
| 3 |
from sklearn.metrics import roc_auc_score
|
| 4 |
|
| 5 |
-
def compute_roc_auc_from_csv(preds_csv: str, labels_csv: str):
|
| 6 |
"""
|
| 7 |
Compute ROC AUC per class and overall mean, similar to the PyTorch-style function.
|
| 8 |
Handles missing labels (NaN) like y_mask.
|
|
@@ -20,8 +20,12 @@ def compute_roc_auc_from_csv(preds_csv: str, labels_csv: str):
|
|
| 20 |
preds = preds[shared_cols].apply(pd.to_numeric, errors="coerce")
|
| 21 |
labels = labels[shared_cols].apply(pd.to_numeric, errors="coerce")
|
| 22 |
|
| 23 |
-
|
| 24 |
y_true = labels.to_numpy(dtype=float)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
y_mask = ~np.isnan(y_true)
|
| 27 |
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
from sklearn.metrics import roc_auc_score
|
| 4 |
|
| 5 |
+
def compute_roc_auc_from_csv(preds_csv: str, labels_csv: str, valid_mask):
|
| 6 |
"""
|
| 7 |
Compute ROC AUC per class and overall mean, similar to the PyTorch-style function.
|
| 8 |
Handles missing labels (NaN) like y_mask.
|
|
|
|
| 20 |
preds = preds[shared_cols].apply(pd.to_numeric, errors="coerce")
|
| 21 |
labels = labels[shared_cols].apply(pd.to_numeric, errors="coerce")
|
| 22 |
|
| 23 |
+
y_pred_clean = preds.to_numpy(dtype=float)
|
| 24 |
y_true = labels.to_numpy(dtype=float)
|
| 25 |
+
valid_mask = valid_mask[-y_true.shape[0]:]
|
| 26 |
+
#Re-expand to original size
|
| 27 |
+
y_pred = np.full((len(valid_mask), y_pred_clean.shape[1]), 0.5, dtype=float)
|
| 28 |
+
y_pred[valid_mask] = y_pred_clean
|
| 29 |
|
| 30 |
y_mask = ~np.isnan(y_true)
|
| 31 |
|
src/preprocess.py
CHANGED
|
@@ -67,6 +67,7 @@ def clean_smiles_in_csv(input_csv: str, output_csv: str, smiles_col: str = "smil
|
|
| 67 |
# Save cleaned dataset
|
| 68 |
df_clean.to_csv(output_csv, index=False)
|
| 69 |
print(f"✅ Cleaned dataset saved to '{output_csv}' ({len(df_clean)} valid molecules).")
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
def get_tox21_split(token, cvfold=None):
|
|
@@ -108,5 +109,20 @@ def get_combined_dataset_csv(token, save_path):
|
|
| 108 |
# Combine all into one DataFrame
|
| 109 |
combined_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
# Save to a new CSV
|
| 112 |
combined_df.to_csv(save_path, index=False)
|
|
|
|
| 67 |
# Save cleaned dataset
|
| 68 |
df_clean.to_csv(output_csv, index=False)
|
| 69 |
print(f"✅ Cleaned dataset saved to '{output_csv}' ({len(df_clean)} valid molecules).")
|
| 70 |
+
return valid_mask
|
| 71 |
|
| 72 |
|
| 73 |
def get_tox21_split(token, cvfold=None):
|
|
|
|
| 109 |
# Combine all into one DataFrame
|
| 110 |
combined_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
|
| 111 |
|
| 112 |
+
# Save to a new CSV
|
| 113 |
+
combined_df.to_csv(save_path, index=False)
|
| 114 |
+
|
| 115 |
+
def get_combined_dataset_with_testset_csv(token, save_path, testset_path):
|
| 116 |
+
datasets = get_tox21_split(token, cvfold=4)
|
| 117 |
+
train_df, val_df = datasets["train"], datasets["validation"]
|
| 118 |
+
test_df = pd.read_csv(testset_path)
|
| 119 |
+
# Add split column
|
| 120 |
+
train_df["split"] = "train"
|
| 121 |
+
val_df["split"] = "val"
|
| 122 |
+
test_df["split"] = "test"
|
| 123 |
+
|
| 124 |
+
# Combine all into one DataFrame
|
| 125 |
+
combined_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
|
| 126 |
+
|
| 127 |
# Save to a new CSV
|
| 128 |
combined_df.to_csv(save_path, index=False)
|