Sonja Topf committed on
Commit
b0119a6
·
1 Parent(s): 1ce331f

dataset changes

Browse files
Files changed (1) hide show
  1. src/preprocess.py +8 -1
src/preprocess.py CHANGED
@@ -80,6 +80,14 @@ class Tox21Dataset(InMemoryDataset):
80
 
81
  # Clean molecules & filter dataframe
82
  mols, clean_mask = create_clean_mol_objects(dataframe["smiles"].tolist())
 
 
 
 
 
 
 
 
83
  dataframe = dataframe[clean_mask].reset_index(drop=True)
84
 
85
  # Now mols and dataframe are aligned, so we can zip
@@ -88,7 +96,6 @@ class Tox21Dataset(InMemoryDataset):
88
  data = from_rdmol(mol)
89
 
90
  # Extract labels as a pandas Series
91
- drop_cols = ["ID","smiles","inchikey","sdftitle","order","set","CVfold"]
92
  labels = row.drop(drop_cols)
93
 
94
  # Mask for valid labels
 
80
 
81
  # Clean molecules & filter dataframe
82
  mols, clean_mask = create_clean_mol_objects(dataframe["smiles"].tolist())
83
+ self.clean_mask = torch.tensor(clean_mask, dtype=torch.bool)
84
+
85
+ drop_cols = ["ID","smiles","inchikey","sdftitle","order","set","CVfold"]
86
+ labels_df = dataframe.drop(columns=drop_cols)
87
+ numeric_labels = labels_df.apply(pd.to_numeric, errors="coerce").fillna(0.0)
88
+ self.all_labels = torch.tensor(numeric_labels.values, dtype=torch.float)
89
+ self.all_label_masks = torch.tensor(~labels_df.isna().values, dtype=torch.bool)
90
+
91
  dataframe = dataframe[clean_mask].reset_index(drop=True)
92
 
93
  # Now mols and dataframe are aligned, so we can zip
 
96
  data = from_rdmol(mol)
97
 
98
  # Extract labels as a pandas Series
 
99
  labels = row.drop(drop_cols)
100
 
101
  # Mask for valid labels