File size: 1,223 Bytes
9f78de0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import numpy as np

from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import Chem
import numpy as np

def create_clean_smiles(smiles_list: list[str]) -> tuple[list[str], np.ndarray]:
    """
    Standardize a batch of SMILES strings and report which inputs succeeded.

    Each input is parsed with RDKit, passed through ``rdMolStandardize.Cleanup``
    and the tautomer enumerator's ``Canonicalize``, then written back out as a
    canonical SMILES string.

    Args:
        smiles_list: SMILES strings to process, in order.

    Returns:
        A ``(cleaned, mask)`` pair. ``cleaned`` holds the canonical SMILES of
        the inputs that were processed successfully, in input order; failed
        inputs are omitted, so it may be shorter than ``smiles_list``.
        ``mask`` is a boolean array with one entry per input, ``True`` where
        the input succeeded.

    Inputs that fail to parse (``MolFromSmiles`` returns ``None``) are marked
    invalid silently; inputs that raise during processing are marked invalid
    and a message is printed.
    """
    # Build the standardization helpers once; they are reused for every input.
    cleanup_params = rdMolStandardize.CleanupParameters()
    canonicalizer = rdMolStandardize.TautomerEnumerator()

    standardized: list[str] = []
    succeeded: list[bool] = []

    for smi in smiles_list:
        ok = False
        try:
            parsed = Chem.MolFromSmiles(smi)
            if parsed is not None:
                # Cleanup first, then pick the canonical tautomer.
                tidied = rdMolStandardize.Cleanup(parsed, cleanup_params)
                canonical_mol = canonicalizer.Canonicalize(tidied)

                standardized.append(
                    Chem.MolToSmiles(canonical_mol, canonical=True)
                )
                ok = True
        except Exception as e:
            # Best-effort: report the failure and keep going with the batch.
            print(f"Failed to clean {smi}: {e}")

        # Exactly one mask entry per input, whatever happened above.
        succeeded.append(ok)

    return standardized, np.array(succeeded, dtype=bool)