# tox21_snn_classifier / src / preprocess.py
# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
"""
This files includes a the data processing for Tox21.
As an input it takes a list of SMILES and it outputs a nested dictionary with
SMILES and target names as keys.
"""
import json
from typing import Iterable
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.distributions.empirical_distribution import ECDF
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
from rdkit.Chem.rdchem import Mol
from .utils import (
KNOWN_DESCR,
USED_200_DESCR,
Standardizer,
write_pickle,
)
def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
    """This function creates cleaned RDKit mol objects from a list of SMILES.

    Args:
        smiles (list[str]): list of SMILES

    Returns:
        list[Mol]: list of cleaned molecules
        np.ndarray[bool]: mask that contains False at index `i`, if molecule in `smiles` at
            index `i` could not be cleaned and was removed.
    """
    sm = Standardizer(canon_taut=True)
    clean_mol_mask = []
    mols = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        # Guard against unparsable SMILES: MolFromSmiles returns None, which
        # would otherwise be handed to the standardizer and crash. Such
        # entries are treated as "could not be cleaned".
        standardized_mol = None
        if mol is not None:
            standardized_mol, _ = sm.standardize_mol(mol)
        is_cleaned = standardized_mol is not None
        clean_mol_mask.append(is_cleaned)
        if not is_cleaned:
            continue
        # Round-trip through canonical SMILES so the kept mol object is canonical.
        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
        mols.append(can_mol)
    return mols, np.array(clean_mol_mask)
def create_ecfp_fps(mols: list[Mol]) -> np.ndarray:
    """This function creates ECFP fingerprints for a list of molecules.

    Args:
        mols (list[Mol]): list of molecules

    Returns:
        np.ndarray: ECFP (Morgan count) fingerprints of molecules
    """
    # Compute all count fingerprints in one call instead of one call per
    # molecule — same results, avoids repeated generator setup overhead.
    sparse_fps = rdFingerprintGenerator.GetCountFPs(
        list(mols), fpType=rdFingerprintGenerator.MorganFP
    )
    ecfps = []
    for fp_sparse_vec in sparse_fps:
        # NOTE(review): int8 can overflow for environments counted >127 times;
        # kept as-is for backward compatibility with the original pipeline.
        fp = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
        ecfps.append(fp)
    return np.array(ecfps)
def create_maccs_keys(mols: list[Mol]) -> np.ndarray:
maccs = [MACCSkeys.GenMACCSKeys(x) for x in mols]
return np.array(maccs)
def get_tox_patterns(filepath: str) -> list:
    """Load and pre-parse SMARTS toxicophore patterns from a JSON file.

    The JSON file is expected to hold a list of ``[name, smarts]`` pairs.
    A SMARTS entry may be a composite of subpatterns joined by " AND " or
    " OR ", and each subpattern may be prefixed with "NOT " to negate it.

    Args:
        filepath (str): path to the JSON pattern file (e.g. tox_smarts.json)

    Returns:
        list: one ``(patterns, negations, merge_any)`` tuple per entry, where
            `patterns` are pre-parsed RDKit query mols, `negations` flags which
            match results to invert, and `merge_any` selects how sub-results
            are combined (True -> any()/OR, False -> all()/AND).

    Raises:
        ValueError: if an entry mixes "AND" and "OR" (not supported).
    """
    # Load the SMARTS strings (second element of each [name, smarts] pair).
    with open(filepath) as f:
        smarts_list = [s[1] for s in json.load(f)]
    # Mixed AND/OR expressions are not supported by the simple split-based
    # parser below; fail loudly instead of silently misparsing.
    # (Was an `assert`, which disappears under `python -O`.)
    if any(("AND" in s) and ("OR" in s) for s in smarts_list):
        raise ValueError("SMARTS entries mixing 'AND' and 'OR' are not supported")
    # Chem.MolFromSmarts takes a long time so it pays off to parse all the
    # smarts once up front and reuse them for every molecule later.
    all_patterns = []
    for smarts in smarts_list:
        if " AND " in smarts:
            subsmarts = smarts.split(" AND ")
            merge_any = False  # all 'subsmarts' have to match
        else:
            # Covers both ' OR ' composites and plain single-pattern entries.
            subsmarts = smarts.split(" OR ")
            merge_any = True  # any matching 'subsmarts' suffices
        patterns = []  # parsed query mols
        negations = []  # whether to invert each pattern's match result
        for s in subsmarts:
            neg = s.startswith("NOT ")
            if neg:
                s = s[4:]
            patterns.append(Chem.MolFromSmarts(s))
            negations.append(neg)
        all_patterns.append((patterns, negations, merge_any))
    return all_patterns
def create_tox_features(mols: list[Mol], patterns: list) -> np.ndarray:
"""Matches the tox patterns against a molecule. Returns a boolean array"""
tox_data = []
for mol in mols:
mol_features = []
for patts, negations, merge_any in patterns:
matches = [mol.HasSubstructMatch(p) for p in patts]
matches = [m != n for m, n in zip(matches, negations)]
if merge_any:
pres = any(matches)
else:
pres = all(matches)
mol_features.append(pres)
tox_data.append(np.array(mol_features))
return np.array(tox_data)
def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
"""This function creates RDKit descriptors for a list of molecules.
Args:
mols (list[Mol]): list of molecules
Returns:
np.ndarray: RDKit descriptors of molecules
"""
rdkit_descriptors = list()
for mol in mols:
descrs = []
for _, descr_calc_fn in Descriptors._descList:
descrs.append(descr_calc_fn(mol))
descrs = np.array(descrs)
descrs = descrs[USED_200_DESCR]
rdkit_descriptors.append(descrs)
return np.array(rdkit_descriptors)
def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
    """Create quantile values for given features using the columns

    Args:
        raw_features (np.ndarray): values to put into quantiles
        ecdfs (list): ECDFs to use

    Returns:
        np.ndarray: computed quantiles
    """
    # Output keeps the input's shape and dtype; each column is mapped
    # through its own ECDF.
    quantiles = np.zeros_like(raw_features)
    for col in range(raw_features.shape[1]):
        column_values = raw_features[:, col].reshape(-1)
        quantiles[:, col] = ecdfs[col](column_values)
    return quantiles
def fill(features, mask, value=np.nan):
    """Expand `features` back to one row per original molecule.

    Args:
        features (np.ndarray): feature rows for the molecules where `mask`
            is False (i.e. the molecules that were kept)
        mask (np.ndarray): boolean "removed" mask over the original input;
            True positions receive `value` instead of a feature row
        value (float): filler for masked-out rows (default: NaN)

    Returns:
        np.ndarray: float array of shape (len(mask), features.shape[1])
    """
    expanded = np.full((len(mask), features.shape[1]), value, dtype=float)
    # Kept molecules are exactly the False positions of the removed-mask.
    expanded[~mask] = features
    return expanded
def normalize_features(
    raw_features,
    scaler=None,
    save_scaler_path: str = "",
    verbose=True,
):
    """Standardize feature vectors, fitting a new scaler if none is given.

    Args:
        raw_features: 2D array of raw feature values
        scaler: pre-fitted scaler; if None, a StandardScaler is fitted on
            `raw_features`
        save_scaler_path (str): if non-empty, pickle the newly fitted scaler here
        verbose: whether to print progress messages

    Returns:
        tuple: (normalized features, the scaler that was used)
    """
    if scaler is None:
        # No scaler supplied: fit one on the incoming features.
        scaler = StandardScaler()
        scaler.fit(raw_features)
        if verbose:
            print("Fitted the StandardScaler")
        if save_scaler_path:
            # Persist the fitted scaler for reuse at inference time.
            write_pickle(save_scaler_path, scaler)
            if verbose:
                print(f"Saved the StandardScaler under {save_scaler_path}")
    # Normalize feature vectors
    normalized = scaler.transform(raw_features)
    if verbose:
        print("Normalized molecule features")
    return normalized, scaler
def create_descriptors(
    smiles,
    ecdfs=None,
    scaler=None,
    descriptors: Iterable = KNOWN_DESCR,
):
    """Create a normalized feature matrix for a list of SMILES.

    Args:
        smiles: list of SMILES strings
        ecdfs: per-descriptor ECDFs for the quantile features; fitted on the
            current data when None
        scaler: pre-fitted feature scaler; fitted on the current data when None
        descriptors (Iterable): which feature families to compute; any of
            "ecfps", "rdkit_descr_quantiles", "maccs", "tox"

    Returns:
        tuple: normalized feature matrix with one row per input SMILES (rows
            for molecules that could not be cleaned are NaN-filled), and the
            boolean keep-mask produced by cleaning
    """
    # Create cleaned rdkit mol objects
    mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
    print("Cleaned molecules")
    features = []
    if "ecfps" in descriptors:
        # Create fingerprints and re-expand to one row per input SMILES.
        ecfps = create_ecfp_fps(mols)
        features.append(fill(ecfps, ~clean_mol_mask))
        print("Created ECFP fingerprints")
    if "rdkit_descr_quantiles" in descriptors:
        rdkit_descrs = create_rdkit_descriptors(mols)
        print("Created RDKit descriptors")
        # Fit ECDFs on the current data when none were supplied.
        if ecdfs is None:
            print("Create ECDFs")
            ecdfs = []
            for column in range(rdkit_descrs.shape[1]):
                raw_values = rdkit_descrs[:, column].reshape(-1)
                ecdfs.append(ECDF(raw_values))
        # Map raw descriptor values to quantiles and expand using the mask.
        rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
        features.append(fill(rdkit_descr_quantiles, ~clean_mol_mask))
        print("Created quantiles of RDKit descriptors")
    if "maccs" in descriptors:
        maccs = create_maccs_keys(mols)
        # BUGFIX: previously appended rdkit_descr_quantiles instead of maccs
        # (NameError when "maccs" was requested without the quantiles).
        features.append(fill(maccs, ~clean_mol_mask))
        print("Created MACCS keys")
    if "tox" in descriptors:
        tox_patterns = get_tox_patterns("assets/tox_smarts.json")
        tox = create_tox_features(mols, tox_patterns)
        # BUGFIX: previously appended rdkit_descr_quantiles instead of tox.
        features.append(fill(tox, ~clean_mol_mask))
        print("Created Tox features")
    # concatenate features
    raw_features = np.concatenate(features, axis=1)
    # normalize with scaler if scaler is passed, else create scaler.
    # BUGFIX: unpack the (features, scaler) tuple so the feature matrix —
    # not the tuple — is returned.
    normalized, scaler = normalize_features(
        raw_features,
        scaler=scaler,
        verbose=True,
    )
    return normalized, clean_mol_mask