antoniaebner's picture
add code
9af3c0c
raw
history blame
9.95 kB
## These MolStandardizer classes are due to Paolo Tosco.
## They were taken from the FS-Mol GitHub repository
## (https://github.com/microsoft/FS-Mol/blob/main/fs_mol/preprocessing/utils/
## standardizer.py).
## They ensure that a sequence of standardization operations is applied.
## See also: https://gist.github.com/ptosco/7e6b9ab9cc3e44ba0919060beaed198e
import os
import pickle
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
# Access token read from the "HF_TOKEN" environment variable; None when the
# variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Sentinel integer; presumably marks padded / missing entries -- confirm at
# the use sites, which are not visible in this file.
PAD_VALUE = -100

# Task identifiers, in a fixed order (seven "NR-*" entries followed by five
# "SR-*" entries). NOTE(review): these look like the Tox21 assay endpoints --
# confirm.
TASKS = [
    "NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase",
    "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma",
    "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53",
]

# Recognised descriptor-type names.
KNOWN_DESCR = [
    "ecfps",
    "rdkit_descr_quantiles",
    "maccs",
    "tox",
]
# The 200 selected indices: 0..16 followed by 25..207 (indices 17..24 are
# deliberately absent). Presumably positions in an RDKit descriptor vector --
# confirm against the code that consumes this list.
USED_200_DESCR = [*range(0, 17), *range(25, 208)]
class Standardizer:
    """
    Simple wrapper class around the RDKit ``rdMolStandardize`` tools.

    The individual helper objects (normalizer, reionizer, uncharger, ...)
    are built lazily the first time the matching property is read and are
    cached on the instance afterwards.
    """

    # Fallbacks used when the constructor arguments are left as None.
    DEFAULT_CANON_TAUT = False
    DEFAULT_METAL_DISCONNECT = False
    # Values copied into the CleanupParameters object by ``params``.
    MAX_TAUTOMERS = 100
    MAX_TRANSFORMS = 100
    MAX_RESTARTS = 200
    PREFER_ORGANIC = True

    def __init__(
        self,
        metal_disconnect=None,
        canon_taut=None,
    ):
        """
        Constructor.
        All parameters are optional.
        :param metal_disconnect: if True, metallorganic complexes are
        disconnected (defaults to DEFAULT_METAL_DISCONNECT when None)
        :param canon_taut: if True, molecules are converted to their
        canonical tautomer (defaults to DEFAULT_CANON_TAUT when None)
        """
        super().__init__()
        if metal_disconnect is None:
            metal_disconnect = self.DEFAULT_METAL_DISCONNECT
        if canon_taut is None:
            canon_taut = self.DEFAULT_CANON_TAUT
        self._canon_taut = canon_taut
        self._metal_disconnect = metal_disconnect
        # Lazily created helpers; see the properties of the same name.
        self._taut_enumerator = None
        self._uncharger = None
        self._lfrag_chooser = None
        self._metal_disconnector = None
        self._normalizer = None
        self._reionizer = None
        self._params = None

    @property
    def params(self):
        """Return the MolStandardize CleanupParameters (built on first use)."""
        if self._params is None:
            self._params = rdMolStandardize.CleanupParameters()
            self._params.maxTautomers = self.MAX_TAUTOMERS
            self._params.maxTransforms = self.MAX_TRANSFORMS
            self._params.maxRestarts = self.MAX_RESTARTS
            self._params.preferOrganic = self.PREFER_ORGANIC
            self._params.tautomerRemoveSp3Stereo = False
        return self._params

    @property
    def canon_taut(self):
        """Return whether tautomer canonicalization will be done."""
        return self._canon_taut

    @property
    def metal_disconnect(self):
        """Return whether metallorganic complexes will be disconnected."""
        return self._metal_disconnect

    @property
    def taut_enumerator(self):
        """Return the TautomerEnumerator object (built from ``params``)."""
        if self._taut_enumerator is None:
            self._taut_enumerator = rdMolStandardize.TautomerEnumerator(self.params)
        return self._taut_enumerator

    @property
    def uncharger(self):
        """Return the Uncharger object."""
        if self._uncharger is None:
            self._uncharger = rdMolStandardize.Uncharger()
        return self._uncharger

    @property
    def lfrag_chooser(self):
        """Return the LargestFragmentChooser object."""
        if self._lfrag_chooser is None:
            self._lfrag_chooser = rdMolStandardize.LargestFragmentChooser(
                self.params.preferOrganic
            )
        return self._lfrag_chooser

    @property
    def metal_disconnector(self):
        """Return the MetalDisconnector object."""
        if self._metal_disconnector is None:
            self._metal_disconnector = rdMolStandardize.MetalDisconnector()
        return self._metal_disconnector

    @property
    def normalizer(self):
        """Return the Normalizer object."""
        if self._normalizer is None:
            self._normalizer = rdMolStandardize.Normalizer(
                self.params.normalizationsFile, self.params.maxRestarts
            )
        return self._normalizer

    @property
    def reionizer(self):
        """Return the Reionizer object."""
        if self._reionizer is None:
            self._reionizer = rdMolStandardize.Reionizer(self.params.acidbaseFile)
        return self._reionizer

    def charge_parent(self, mol_in):
        """Sequentially apply a series of MolStandardize operations:
        * MetalDisconnector (only if metal_disconnect is enabled)
        * Normalizer
        * Reionizer
        * LargestFragmentChooser
        * Uncharger
        The net result is that a desalted, normalized, neutral
        molecule with implicit Hs is returned.

        :param mol_in: a Chem.Mol
        :return: the standardized Chem.Mol
        Exceptions raised by the RDKit calls are not caught here.
        """
        params = Chem.RemoveHsParameters()
        params.removeAndTrackIsotopes = True
        mol_in = Chem.RemoveHs(mol_in, params, sanitize=False)
        if self._metal_disconnect:
            mol_in = self.metal_disconnector.Disconnect(mol_in)
        normalized = self.normalizer.normalize(mol_in)
        Chem.SanitizeMol(normalized)
        normalized = self.reionizer.reionize(normalized)
        Chem.AssignStereochemistry(normalized)
        normalized = self.lfrag_chooser.choose(normalized)
        normalized = self.uncharger.uncharge(normalized)
        # need this to reassess aromaticity on things like
        # cyclopentadienyl, tropylium, azolium, etc.
        Chem.SanitizeMol(normalized)
        return Chem.RemoveHs(Chem.AddHs(normalized))

    def standardize_mol(self, mol_in):
        """
        Standardize a single molecule.
        :param mol_in: a Chem.Mol
        :return: * (standardized Chem.Mol, n_taut) tuple
                   if success. n_taut will be negative if
                   tautomer enumeration was aborted due
                   to reaching a limit
                 * (None, error_msg) if failure
        This calls self.charge_parent() and, if self._canon_taut
        is True, runs tautomer canonicalization.
        The "_Name" property of the input ("NONAME" if missing or empty)
        is copied onto the returned molecule.
        """
        if not isinstance(mol_in, Chem.Mol):
            # NOTE: only Chem.Mol instances are accepted here (SMILES strings
            # are rejected too); the message text is kept verbatim so that
            # callers matching on it keep working.
            error = f"Expected SMILES or Chem.Mol as input, got {str(type(mol_in))}"
            return None, error

        name = None
        try:
            name = mol_in.GetProp("_Name")
        except KeyError:
            pass
        if not name:
            name = "NONAME"

        try:
            mol_out = self.charge_parent(mol_in)
        except Exception as e:
            error = f"charge_parent FAILED: {str(e).strip()}"
            return None, error

        n_tautomers = 0
        if self._canon_taut:
            # The outer try also covers the legacy fallback, so that a
            # failure there is reported as (None, error) like every other
            # failure instead of escaping as an exception.
            try:
                try:
                    res = self.taut_enumerator.Enumerate(mol_out, False)
                except TypeError:
                    # we are still on the pre-2021 RDKit API
                    res = self.taut_enumerator.Enumerate(mol_out)
            except Exception as e:
                # something else went wrong
                error = f"canon_taut FAILED: {str(e).strip()}"
                return None, error

            n_tautomers = len(res)
            if hasattr(res, "status"):
                completed = (
                    res.status == rdMolStandardize.TautomerEnumeratorStatus.Completed
                )
            else:
                # we are still on the pre-2021 RDKit API
                completed = len(res) < 1000
            if not completed:
                # a negative count signals an aborted enumeration
                n_tautomers = -n_tautomers

            try:
                try:
                    mol_out = self.taut_enumerator.PickCanonical(res)
                except AttributeError:
                    # we are still on the pre-2021 RDKit API.
                    # Select by score via key=: comparing (score, mol)
                    # tuples would fall back to ordering the Mol objects
                    # whenever two tautomers share the same score, which
                    # raises TypeError.
                    mol_out = max(res, key=self.taut_enumerator.ScoreTautomer)
            except Exception as e:
                # something else went wrong
                error = f"canon_taut FAILED: {str(e).strip()}"
                return None, error

        mol_out.SetProp("_Name", name)
        return mol_out, n_tautomers
def load_pickle(path: str):
    """Read the file at *path* in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so this should only be pointed at
    files from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
def write_pickle(path: str, obj: object):
    """Pickle *obj* into the file at *path*, which is opened with mode "wb".

    Returns None.
    """
    with open(path, "wb") as sink:
        pickle.dump(obj, sink)