Spaces:
Sleeping
Sleeping
| import numpy as np | |
| from rdkit import Chem | |
| from rdkit.Chem.MolStandardize import rdMolStandardize | |
| from rdkit import Chem | |
| import numpy as np | |
def create_clean_smiles(smiles_list: list[str]) -> tuple[list[str], np.ndarray]:
    """Standardize SMILES strings and report which inputs survived.

    Each input is parsed with RDKit, run through ``rdMolStandardize.Cleanup``
    and the tautomer canonicalizer, then written back out as canonical SMILES.
    The molecules never leave SMILES space from the caller's point of view.

    Args:
        smiles_list: Raw SMILES strings to standardize.

    Returns:
        A pair ``(cleaned, mask)``. ``mask`` is a boolean array with one entry
        per input, ``True`` where standardization succeeded. ``cleaned`` holds
        only the successful results, in input order, so it has as many entries
        as there are ``True`` values in ``mask`` (not ``len(smiles_list)``).
    """
    # Built once and reused for every molecule in the loop.
    cleanup_params = rdMolStandardize.CleanupParameters()
    canonicalizer = rdMolStandardize.TautomerEnumerator()

    standardized: list[str] = []
    ok_flags: list[bool] = []

    for smi in smiles_list:
        succeeded = False
        try:
            parsed = Chem.MolFromSmiles(smi)
            # MolFromSmiles signals an unparsable string by returning None.
            if parsed is not None:
                tidy = rdMolStandardize.Cleanup(parsed, cleanup_params)
                canonical_mol = canonicalizer.Canonicalize(tidy)
                standardized.append(
                    Chem.MolToSmiles(canonical_mol, canonical=True)
                )
                succeeded = True
        except Exception as e:
            # Best-effort: one bad molecule must not abort the whole batch.
            print(f"Failed to clean {smi}: {e}")
        # Exactly one mask entry per input, whatever path was taken above.
        ok_flags.append(succeeded)

    return standardized, np.array(ok_flags, dtype=bool)