Spaces:
Sleeping
Sleeping
File size: 1,223 Bytes
9f78de0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import numpy as np
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import Chem
import numpy as np
def create_clean_smiles(smiles_list: list[str]) -> tuple[list[str], np.ndarray]:
    """Standardize SMILES strings and report which inputs succeeded.

    Each input is parsed with RDKit, passed through
    ``rdMolStandardize.Cleanup`` (using a default-constructed
    ``CleanupParameters``), reduced to the canonical tautomer chosen by
    ``TautomerEnumerator.Canonicalize``, and written back out as a
    canonical SMILES string.

    Args:
        smiles_list: SMILES strings to process, in order.

    Returns:
        A ``(cleaned, mask)`` pair. ``cleaned`` holds the canonical SMILES
        of the inputs that were processed successfully, in input order;
        failed inputs are omitted, so it may be shorter than
        ``smiles_list``. ``mask`` is a boolean array with one entry per
        input, ``True`` where that input contributed an entry to
        ``cleaned``.

    Inputs that RDKit parses to ``None`` are marked ``False`` without any
    message from this function. Inputs that raise during processing are
    marked ``False`` and a "Failed to clean ..." line is printed.
    """
    # Both helpers are loop-invariant, so build them once up front.
    cleanup_params = rdMolStandardize.CleanupParameters()
    canonicalizer = rdMolStandardize.TautomerEnumerator()

    cleaned: list[str] = []
    succeeded_flags: list[bool] = []

    for smi in smiles_list:
        succeeded = False
        try:
            parsed = Chem.MolFromSmiles(smi)
            if parsed is not None:
                # Standardize first, then pick the canonical tautomer.
                standardized = rdMolStandardize.Cleanup(parsed, cleanup_params)
                canonical_mol = canonicalizer.Canonicalize(standardized)
                cleaned.append(Chem.MolToSmiles(canonical_mol, canonical=True))
                succeeded = True
        except Exception as e:
            # Best effort: one bad molecule must not abort the whole batch.
            print(f"Failed to clean {smi}: {e}")
        # Exactly one mask entry per input, whatever path was taken above.
        succeeded_flags.append(succeeded)

    return cleaned, np.array(succeeded_flags, dtype=bool)