Spaces:
Sleeping
Sleeping
| ## These MolStandardizer classes are due to Paolo Tosco. | |
| ## They were taken from the FS-Mol GitHub repository | |
| ## (https://github.com/microsoft/FS-Mol/blob/main/fs_mol/preprocessing/utils/ | |
| ## standardizer.py). | |
| ## They ensure that a sequence of standardization operations is applied. | |
| ## https://gist.github.com/ptosco/7e6b9ab9cc3e44ba0919060beaed198e | |
| import os | |
| import pickle | |
| from rdkit import Chem | |
| from rdkit.Chem.MolStandardize import rdMolStandardize | |
# Access token read from the HF_TOKEN environment variable (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Padding sentinel.
# NOTE(review): presumably marks missing/ignored label entries -- confirm
# against the code that consumes it.
PAD_VALUE = -100

# Task names: the seven "NR-*" entries first, then the five "SR-*" entries.
TASKS = [
    "NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase",
    "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma",
    "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53",
]

# Recognised descriptor/feature-set identifiers.
KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]

# The 200 selected descriptor indices: 0..16 and 25..207 inclusive
# (indices 17..24 are deliberately left out).
# NOTE(review): presumably positions in an RDKit descriptor vector -- confirm.
USED_200_DESCR = [*range(17), *range(25, 208)]
class Standardizer:
    """
    Simple wrapper class around rdkit Standardizer.

    The ``rdMolStandardize`` helper objects (normalizer, reionizer,
    uncharger, ...) are exposed as read-only properties. Each one is
    created on first access and then cached on the instance, so a single
    ``Standardizer`` can be reused for many molecules.
    """

    DEFAULT_CANON_TAUT = False
    DEFAULT_METAL_DISCONNECT = False
    MAX_TAUTOMERS = 100
    MAX_TRANSFORMS = 100
    MAX_RESTARTS = 200
    PREFER_ORGANIC = True

    def __init__(
        self,
        metal_disconnect=None,
        canon_taut=None,
    ):
        """
        Constructor.
        All parameters are optional.

        :param metal_disconnect: if True, metallorganic complexes are
            disconnected; ``None`` selects ``DEFAULT_METAL_DISCONNECT``
        :param canon_taut: if True, molecules are converted to their
            canonical tautomer; ``None`` selects ``DEFAULT_CANON_TAUT``
        """
        super().__init__()
        if metal_disconnect is None:
            metal_disconnect = self.DEFAULT_METAL_DISCONNECT
        if canon_taut is None:
            canon_taut = self.DEFAULT_CANON_TAUT
        self._canon_taut = canon_taut
        self._metal_disconnect = metal_disconnect
        # Lazily created helpers; see the properties of the same name.
        self._taut_enumerator = None
        self._uncharger = None
        self._lfrag_chooser = None
        self._metal_disconnector = None
        self._normalizer = None
        self._reionizer = None
        self._params = None

    # NOTE: the accessors below must be properties. The rest of the class
    # reads them attribute-style (``self.params.preferOrganic``,
    # ``self.normalizer.normalize(...)``); as plain methods those
    # expressions would hit the bound method object and raise
    # AttributeError.

    @property
    def params(self):
        """Return the MolStandardize CleanupParameters."""
        if self._params is None:
            self._params = rdMolStandardize.CleanupParameters()
            self._params.maxTautomers = self.MAX_TAUTOMERS
            self._params.maxTransforms = self.MAX_TRANSFORMS
            self._params.maxRestarts = self.MAX_RESTARTS
            self._params.preferOrganic = self.PREFER_ORGANIC
            self._params.tautomerRemoveSp3Stereo = False
        return self._params

    @property
    def canon_taut(self):
        """Return whether tautomer canonicalization will be done."""
        return self._canon_taut

    @property
    def metal_disconnect(self):
        """Return whether metallorganic complexes will be disconnected."""
        return self._metal_disconnect

    @property
    def taut_enumerator(self):
        """Return the TautomerEnumerator object."""
        if self._taut_enumerator is None:
            self._taut_enumerator = rdMolStandardize.TautomerEnumerator(self.params)
        return self._taut_enumerator

    @property
    def uncharger(self):
        """Return the Uncharger object."""
        if self._uncharger is None:
            self._uncharger = rdMolStandardize.Uncharger()
        return self._uncharger

    @property
    def lfrag_chooser(self):
        """Return the LargestFragmentChooser object."""
        if self._lfrag_chooser is None:
            self._lfrag_chooser = rdMolStandardize.LargestFragmentChooser(
                self.params.preferOrganic
            )
        return self._lfrag_chooser

    @property
    def metal_disconnector(self):
        """Return the MetalDisconnector object."""
        if self._metal_disconnector is None:
            self._metal_disconnector = rdMolStandardize.MetalDisconnector()
        return self._metal_disconnector

    @property
    def normalizer(self):
        """Return the Normalizer object."""
        if self._normalizer is None:
            self._normalizer = rdMolStandardize.Normalizer(
                self.params.normalizationsFile, self.params.maxRestarts
            )
        return self._normalizer

    @property
    def reionizer(self):
        """Return the Reionizer object."""
        if self._reionizer is None:
            self._reionizer = rdMolStandardize.Reionizer(self.params.acidbaseFile)
        return self._reionizer

    def charge_parent(self, mol_in):
        """Sequentially apply a series of MolStandardize operations:

        * MetalDisconnector (only if metal disconnection is enabled)
        * Normalizer
        * Reionizer
        * LargestFragmentChooser
        * Uncharger

        The net result is that a desalted, normalized, neutral
        molecule with implicit Hs is returned.

        :param mol_in: a Chem.Mol
        :return: the standardized Chem.Mol
        Exceptions raised by the underlying RDKit calls are not caught here.
        """
        params = Chem.RemoveHsParameters()
        params.removeAndTrackIsotopes = True
        mol_in = Chem.RemoveHs(mol_in, params, sanitize=False)
        if self._metal_disconnect:
            mol_in = self.metal_disconnector.Disconnect(mol_in)
        normalized = self.normalizer.normalize(mol_in)
        Chem.SanitizeMol(normalized)
        normalized = self.reionizer.reionize(normalized)
        Chem.AssignStereochemistry(normalized)
        normalized = self.lfrag_chooser.choose(normalized)
        normalized = self.uncharger.uncharge(normalized)
        # need this to reassess aromaticity on things like
        # cyclopentadienyl, tropylium, azolium, etc.
        Chem.SanitizeMol(normalized)
        return Chem.RemoveHs(Chem.AddHs(normalized))

    def standardize_mol(self, mol_in):
        """
        Standardize a single molecule.

        :param mol_in: a Chem.Mol
        :return: * (standardized Chem.Mol, n_taut) tuple
                   if success. n_taut is 0 when tautomer
                   canonicalization is disabled and will be negative
                   if tautomer enumeration was aborted due
                   to reaching a limit
                 * (None, error_msg) if failure

        This calls self.charge_parent() and, if self._canon_taut
        is True, runs tautomer canonicalization.
        """
        n_tautomers = 0
        if isinstance(mol_in, Chem.Mol):
            # Remember the molecule name so it can be restored on the output.
            name = None
            try:
                name = mol_in.GetProp("_Name")
            except KeyError:
                # No "_Name" property set; fall back to the placeholder below.
                pass
            if not name:
                name = "NONAME"
        else:
            error = f"Expected SMILES or Chem.Mol as input, got {str(type(mol_in))}"
            return None, error
        try:
            mol_out = self.charge_parent(mol_in)
        except Exception as e:
            error = f"charge_parent FAILED: {str(e).strip()}"
            return None, error
        if self._canon_taut:
            try:
                res = self.taut_enumerator.Enumerate(mol_out, False)
            except TypeError:
                # we are still on the pre-2021 RDKit API
                res = self.taut_enumerator.Enumerate(mol_out)
            except Exception as e:
                # something else went wrong
                error = f"canon_taut FAILED: {str(e).strip()}"
                return None, error
            n_tautomers = len(res)
            if hasattr(res, "status"):
                completed = (
                    res.status == rdMolStandardize.TautomerEnumeratorStatus.Completed
                )
            else:
                # we are still on the pre-2021 RDKit API
                completed = len(res) < 1000
            if not completed:
                # A negative count signals that enumeration hit a limit.
                n_tautomers = -n_tautomers
            try:
                mol_out = self.taut_enumerator.PickCanonical(res)
            except AttributeError:
                # we are still on the pre-2021 RDKit API
                # Compare by score only: comparing (score, mol) tuples
                # would fall back to ordering the mol objects on a tie.
                mol_out = max(res, key=self.taut_enumerator.ScoreTautomer)
            except Exception as e:
                # something else went wrong
                error = f"canon_taut FAILED: {str(e).strip()}"
                return None, error
        mol_out.SetProp("_Name", name)
        return mol_out, n_tautomers
def load_pickle(path: str):
    """Read the file at ``path`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code; only use this on files
    from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
def write_pickle(path: str, obj: object):
    """Pickle ``obj`` into the file at ``path``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)