Sonja Topf committed on
Commit
e448508
·
1 Parent(s): 0d7dfdb

changed evaluation

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. src/evaluation.py +6 -2
  3. src/preprocess.py +16 -0
.gitignore CHANGED
@@ -9,4 +9,5 @@ logs/*
9
  !logs/.gitkeep
10
  data/*
11
  !data/.gitkeep
12
- .env
 
 
9
  !logs/.gitkeep
10
  data/*
11
  !data/.gitkeep
12
+ .env
13
+ eval_runs.py
src/evaluation.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
2
  import numpy as np
3
  from sklearn.metrics import roc_auc_score
4
 
5
- def compute_roc_auc_from_csv(preds_csv: str, labels_csv: str):
6
  """
7
  Compute ROC AUC per class and overall mean, similar to the PyTorch-style function.
8
  Handles missing labels (NaN) like y_mask.
@@ -20,8 +20,12 @@ def compute_roc_auc_from_csv(preds_csv: str, labels_csv: str):
20
  preds = preds[shared_cols].apply(pd.to_numeric, errors="coerce")
21
  labels = labels[shared_cols].apply(pd.to_numeric, errors="coerce")
22
 
23
- y_pred = preds.to_numpy(dtype=float)
24
  y_true = labels.to_numpy(dtype=float)
 
 
 
 
25
 
26
  y_mask = ~np.isnan(y_true)
27
 
 
2
  import numpy as np
3
  from sklearn.metrics import roc_auc_score
4
 
5
+ def compute_roc_auc_from_csv(preds_csv: str, labels_csv: str, valid_mask):
6
  """
7
  Compute ROC AUC per class and overall mean, similar to the PyTorch-style function.
8
  Handles missing labels (NaN) like y_mask.
 
20
  preds = preds[shared_cols].apply(pd.to_numeric, errors="coerce")
21
  labels = labels[shared_cols].apply(pd.to_numeric, errors="coerce")
22
 
23
+ y_pred_clean = preds.to_numpy(dtype=float)
24
  y_true = labels.to_numpy(dtype=float)
25
+ valid_mask = valid_mask[-y_true.shape[0]:]
26
+ #Re-expand to original size
27
+ y_pred = np.full((len(valid_mask), y_pred_clean.shape[1]), 0.5, dtype=float)
28
+ y_pred[valid_mask] = y_pred_clean
29
 
30
  y_mask = ~np.isnan(y_true)
31
 
src/preprocess.py CHANGED
@@ -67,6 +67,7 @@ def clean_smiles_in_csv(input_csv: str, output_csv: str, smiles_col: str = "smil
67
  # Save cleaned dataset
68
  df_clean.to_csv(output_csv, index=False)
69
  print(f"✅ Cleaned dataset saved to '{output_csv}' ({len(df_clean)} valid molecules).")
 
70
 
71
 
72
  def get_tox21_split(token, cvfold=None):
@@ -108,5 +109,20 @@ def get_combined_dataset_csv(token, save_path):
108
  # Combine all into one DataFrame
109
  combined_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # Save to a new CSV
112
  combined_df.to_csv(save_path, index=False)
 
67
  # Save cleaned dataset
68
  df_clean.to_csv(output_csv, index=False)
69
  print(f"✅ Cleaned dataset saved to '{output_csv}' ({len(df_clean)} valid molecules).")
70
+ return valid_mask
71
 
72
 
73
  def get_tox21_split(token, cvfold=None):
 
109
  # Combine all into one DataFrame
110
  combined_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
111
 
112
+ # Save to a new CSV
113
+ combined_df.to_csv(save_path, index=False)
114
+
115
+ def get_combined_dataset_with_testset_csv(token, save_path, testset_path):
116
+ datasets = get_tox21_split(token, cvfold=4)
117
+ train_df, val_df = datasets["train"], datasets["validation"]
118
+ test_df = pd.read_csv(testset_path)
119
+ # Add split column
120
+ train_df["split"] = "train"
121
+ val_df["split"] = "val"
122
+ test_df["split"] = "test"
123
+
124
+ # Combine all into one DataFrame
125
+ combined_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
126
+
127
  # Save to a new CSV
128
  combined_df.to_csv(save_path, index=False)