Spaces:
Running
Running
Sonja Topf
committed on
Commit
·
b0119a6
1
Parent(s):
1ce331f
dataset changes
Browse files
- src/preprocess.py +8 -1
src/preprocess.py
CHANGED
|
@@ -80,6 +80,14 @@ class Tox21Dataset(InMemoryDataset):
|
|
| 80 |
|
| 81 |
# Clean molecules & filter dataframe
|
| 82 |
mols, clean_mask = create_clean_mol_objects(dataframe["smiles"].tolist())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
dataframe = dataframe[clean_mask].reset_index(drop=True)
|
| 84 |
|
| 85 |
# Now mols and dataframe are aligned, so we can zip
|
|
@@ -88,7 +96,6 @@ class Tox21Dataset(InMemoryDataset):
|
|
| 88 |
data = from_rdmol(mol)
|
| 89 |
|
| 90 |
# Extract labels as a pandas Series
|
| 91 |
-
drop_cols = ["ID","smiles","inchikey","sdftitle","order","set","CVfold"]
|
| 92 |
labels = row.drop(drop_cols)
|
| 93 |
|
| 94 |
# Mask for valid labels
|
|
|
|
| 80 |
|
| 81 |
# Clean molecules & filter dataframe
|
| 82 |
mols, clean_mask = create_clean_mol_objects(dataframe["smiles"].tolist())
|
| 83 |
+
self.clean_mask = torch.tensor(clean_mask, dtype=torch.bool)
|
| 84 |
+
|
| 85 |
+
drop_cols = ["ID","smiles","inchikey","sdftitle","order","set","CVfold"]
|
| 86 |
+
labels_df = dataframe.drop(columns=drop_cols)
|
| 87 |
+
numeric_labels = labels_df.apply(pd.to_numeric, errors="coerce").fillna(0.0)
|
| 88 |
+
self.all_labels = torch.tensor(numeric_labels.values, dtype=torch.float)
|
| 89 |
+
self.all_label_masks = torch.tensor(~labels_df.isna().values, dtype=torch.bool)
|
| 90 |
+
|
| 91 |
dataframe = dataframe[clean_mask].reset_index(drop=True)
|
| 92 |
|
| 93 |
# Now mols and dataframe are aligned, so we can zip
|
|
|
|
| 96 |
data = from_rdmol(mol)
|
| 97 |
|
| 98 |
# Extract labels as a pandas Series
|
|
|
|
| 99 |
labels = row.drop(drop_cols)
|
| 100 |
|
| 101 |
# Mask for valid labels
|