# tox21_snn_classifier / src / preprocess.py
# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
"""
This files includes a the data processing for Tox21.
As an input it takes a list of SMILES and it outputs a nested dictionary with
SMILES and target names as keys.
"""
import json
from typing import Iterable
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.distributions.empirical_distribution import ECDF
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
from rdkit.Chem.rdchem import Mol
from .utils import (
KNOWN_DESCR,
USED_200_DESCR,
Standardizer,
write_pickle,
)
def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
    """This function creates cleaned RDKit mol objects from a list of SMILES.

    Args:
        smiles (list[str]): list of SMILES

    Returns:
        list[Mol]: list of cleaned molecules
        np.ndarray[bool]: mask that contains False at index `i`, if molecule in `smiles` at
            index `i` could not be cleaned and was removed.
    """
    sm = Standardizer(canon_taut=True)
    clean_mol_mask = []
    mols = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        # Guard against unparsable SMILES: MolFromSmiles returns None, which
        # would otherwise be handed to the standardizer and crash. Such
        # entries are treated as "could not be cleaned".
        standardized_mol = None
        if mol is not None:
            standardized_mol, _ = sm.standardize_mol(mol)
        is_cleaned = standardized_mol is not None
        clean_mol_mask.append(is_cleaned)
        if not is_cleaned:
            continue
        # Round-trip through canonical SMILES so the kept mol object is canonical.
        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
        mols.append(can_mol)
    return mols, np.array(clean_mol_mask)
def create_ecfp_fps(mols: list[Mol]) -> np.ndarray:
    """This function creates ECFP fingerprints for a list of molecules.

    Args:
        mols (list[Mol]): list of molecules

    Returns:
        np.ndarray: ECFP (Morgan count) fingerprints of molecules
    """
    # Compute all count fingerprints in one call instead of one call per
    # molecule — same results, avoids repeated generator setup overhead.
    sparse_fps = rdFingerprintGenerator.GetCountFPs(
        list(mols), fpType=rdFingerprintGenerator.MorganFP
    )
    ecfps = []
    for fp_sparse_vec in sparse_fps:
        # NOTE(review): int8 can overflow for environments counted >127 times;
        # kept as-is for backward compatibility with the original pipeline.
        fp = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
        ecfps.append(fp)
    return np.array(ecfps)
def create_maccs_keys(mols: list[Mol]) -> np.ndarray:
maccs = [MACCSkeys.GenMACCSKeys(x) for x in mols]
return np.array(maccs)
def get_tox_patterns(filepath: str) -> list:
    """Load and pre-parse SMARTS toxicophore patterns from a JSON file.

    The JSON file is expected to hold a list of ``[name, smarts]`` pairs.
    A SMARTS entry may be a composite of subpatterns joined by " AND " or
    " OR ", and each subpattern may be prefixed with "NOT " to negate it.

    Args:
        filepath (str): path to the JSON pattern file (e.g. tox_smarts.json)

    Returns:
        list: one ``(patterns, negations, merge_any)`` tuple per entry, where
            `patterns` are pre-parsed RDKit query mols, `negations` flags which
            match results to invert, and `merge_any` selects how sub-results
            are combined (True -> any()/OR, False -> all()/AND).

    Raises:
        ValueError: if an entry mixes "AND" and "OR" (not supported).
    """
    # Load the SMARTS strings (second element of each [name, smarts] pair).
    with open(filepath) as f:
        smarts_list = [s[1] for s in json.load(f)]
    # Mixed AND/OR expressions are not supported by the simple split-based
    # parser below; fail loudly instead of silently misparsing.
    # (Was an `assert`, which disappears under `python -O`.)
    if any(("AND" in s) and ("OR" in s) for s in smarts_list):
        raise ValueError("SMARTS entries mixing 'AND' and 'OR' are not supported")
    # Chem.MolFromSmarts takes a long time so it pays off to parse all the
    # smarts once up front and reuse them for every molecule later.
    all_patterns = []
    for smarts in smarts_list:
        if " AND " in smarts:
            subsmarts = smarts.split(" AND ")
            merge_any = False  # all 'subsmarts' have to match
        else:
            # Covers both ' OR ' composites and plain single-pattern entries.
            subsmarts = smarts.split(" OR ")
            merge_any = True  # any matching 'subsmarts' suffices
        patterns = []  # parsed query mols
        negations = []  # whether to invert each pattern's match result
        for s in subsmarts:
            neg = s.startswith("NOT ")
            if neg:
                s = s[4:]
            patterns.append(Chem.MolFromSmarts(s))
            negations.append(neg)
        all_patterns.append((patterns, negations, merge_any))
    return all_patterns
def create_tox_features(mols: list[Mol], patterns: list) -> np.ndarray:
"""Matches the tox patterns against a molecule. Returns a boolean array"""
tox_data = []
for mol in mols:
mol_features = []
for patts, negations, merge_any in patterns:
matches = [mol.HasSubstructMatch(p) for p in patts]
matches = [m != n for m, n in zip(matches, negations)]
if merge_any:
pres = any(matches)
else:
pres = all(matches)
mol_features.append(pres)
tox_data.append(np.array(mol_features))
return np.array(tox_data)
def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
"""This function creates RDKit descriptors for a list of molecules.
Args:
mols (list[Mol]): list of molecules
Returns:
np.ndarray: RDKit descriptors of molecules
"""
rdkit_descriptors = list()
for mol in mols:
descrs = []
for _, descr_calc_fn in Descriptors._descList:
descrs.append(descr_calc_fn(mol))
descrs = np.array(descrs)
descrs = descrs[USED_200_DESCR]
rdkit_descriptors.append(descrs)
return np.array(rdkit_descriptors)
def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
    """Create quantile values for given features using the columns

    Args:
        raw_features (np.ndarray): values to put into quantiles
        ecdfs (list): ECDFs to use

    Returns:
        np.ndarray: computed quantiles
    """
    # Output keeps the input's shape and dtype; each column is mapped
    # through its own ECDF.
    quantiles = np.zeros_like(raw_features)
    for col in range(raw_features.shape[1]):
        column_values = raw_features[:, col].reshape(-1)
        quantiles[:, col] = ecdfs[col](column_values)
    return quantiles
def fill(features, mask, value=np.nan):
    """Expand `features` back to one row per original molecule.

    Args:
        features (np.ndarray): feature rows for the molecules where `mask`
            is False (i.e. the molecules that were kept)
        mask (np.ndarray): boolean "removed" mask over the original input;
            True positions receive `value` instead of a feature row
        value (float): filler for masked-out rows (default: NaN)

    Returns:
        np.ndarray: float array of shape (len(mask), features.shape[1])
    """
    expanded = np.full((len(mask), features.shape[1]), value, dtype=float)
    # Kept molecules are exactly the False positions of the removed-mask.
    expanded[~mask] = features
    return expanded
def normalize_features(
    raw_features,
    scaler=None,
    save_scaler_path: str = "",
    verbose=True,
):
    """Standardize feature vectors, fitting a new scaler if none is given.

    Args:
        raw_features: 2D array of raw feature values
        scaler: pre-fitted scaler; if None, a StandardScaler is fitted on
            `raw_features`
        save_scaler_path (str): if non-empty, pickle the newly fitted scaler here
        verbose: whether to print progress messages

    Returns:
        tuple: (normalized features, the scaler that was used)
    """
    if scaler is None:
        # No scaler supplied: fit one on the incoming features.
        scaler = StandardScaler()
        scaler.fit(raw_features)
        if verbose:
            print("Fitted the StandardScaler")
        if save_scaler_path:
            # Persist the fitted scaler for reuse at inference time.
            write_pickle(save_scaler_path, scaler)
            if verbose:
                print(f"Saved the StandardScaler under {save_scaler_path}")
    # Normalize feature vectors
    normalized = scaler.transform(raw_features)
    if verbose:
        print("Normalized molecule features")
    return normalized, scaler
def create_descriptors(
    smiles,
    ecdfs=None,
    scaler=None,
    descriptors: Iterable = KNOWN_DESCR,
):
    """Create a normalized feature matrix for a list of SMILES.

    Args:
        smiles: list of SMILES strings
        ecdfs: per-descriptor ECDFs for the quantile features; fitted on the
            current data when None
        scaler: pre-fitted feature scaler; fitted on the current data when None
        descriptors (Iterable): which feature families to compute; any of
            "ecfps", "rdkit_descr_quantiles", "maccs", "tox"

    Returns:
        tuple: normalized feature matrix with one row per input SMILES (rows
            for molecules that could not be cleaned are NaN-filled), and the
            boolean keep-mask produced by cleaning
    """
    # Create cleaned rdkit mol objects
    mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
    print("Cleaned molecules")
    features = []
    if "ecfps" in descriptors:
        # Create fingerprints and re-expand to one row per input SMILES.
        ecfps = create_ecfp_fps(mols)
        features.append(fill(ecfps, ~clean_mol_mask))
        print("Created ECFP fingerprints")
    if "rdkit_descr_quantiles" in descriptors:
        rdkit_descrs = create_rdkit_descriptors(mols)
        print("Created RDKit descriptors")
        # Fit ECDFs on the current data when none were supplied.
        if ecdfs is None:
            print("Create ECDFs")
            ecdfs = []
            for column in range(rdkit_descrs.shape[1]):
                raw_values = rdkit_descrs[:, column].reshape(-1)
                ecdfs.append(ECDF(raw_values))
        # Map raw descriptor values to quantiles and expand using the mask.
        rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
        features.append(fill(rdkit_descr_quantiles, ~clean_mol_mask))
        print("Created quantiles of RDKit descriptors")
    if "maccs" in descriptors:
        maccs = create_maccs_keys(mols)
        # BUGFIX: previously appended rdkit_descr_quantiles instead of maccs
        # (NameError when "maccs" was requested without the quantiles).
        features.append(fill(maccs, ~clean_mol_mask))
        print("Created MACCS keys")
    if "tox" in descriptors:
        tox_patterns = get_tox_patterns("assets/tox_smarts.json")
        tox = create_tox_features(mols, tox_patterns)
        # BUGFIX: previously appended rdkit_descr_quantiles instead of tox.
        features.append(fill(tox, ~clean_mol_mask))
        print("Created Tox features")
    # concatenate features
    raw_features = np.concatenate(features, axis=1)
    # normalize with scaler if scaler is passed, else create scaler.
    # BUGFIX: unpack the (features, scaler) tuple so the feature matrix —
    # not the tuple — is returned.
    normalized, scaler = normalize_features(
        raw_features,
        scaler=scaler,
        verbose=True,
    )
    return normalized, clean_mol_mask