import numpy as np
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize


def create_clean_smiles(smiles_list: list[str]) -> tuple[list[str], np.ndarray]:
    """Clean and canonicalize SMILES strings while staying in SMILES space.

    Each input is parsed with ``Chem.MolFromSmiles``, passed through
    ``rdMolStandardize.Cleanup`` (default-constructed ``CleanupParameters``)
    and ``TautomerEnumerator.Canonicalize``, then written back with
    ``Chem.MolToSmiles(..., canonical=True)``.

    Args:
        smiles_list: SMILES strings to clean.

    Returns:
        A ``(clean_smis, valid_mask)`` pair.

        ``valid_mask`` is a boolean array with one entry per input, ``True``
        where cleaning succeeded.

        ``clean_smis`` contains only the successfully cleaned SMILES, in
        input order. Its length therefore equals ``valid_mask.sum()``, not
        ``len(smiles_list)``; use the mask to align results with the inputs.

    Failures never propagate to the caller. An input for which
    ``Chem.MolFromSmiles`` returns ``None`` is marked invalid without a
    message from this function; an input that raises during any step is
    marked invalid and reported on stdout.
    """
    clean_smis: list[str] = []
    valid_mask: list[bool] = []

    # Loop-invariant helpers: built once and reused for every molecule.
    cleanup_params = rdMolStandardize.CleanupParameters()
    tautomer_enumerator = rdMolStandardize.TautomerEnumerator()

    for smi in smiles_list:
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # Unparseable input: record the failure and move on.
                # (`continue` leaves the try block, so `else` is skipped.)
                valid_mask.append(False)
                continue

            # Cleanup and tautomer canonicalization.
            mol = rdMolStandardize.Cleanup(mol, cleanup_params)
            mol = tautomer_enumerator.Canonicalize(mol)

            # Canonical SMILES output.
            clean_smi = Chem.MolToSmiles(mol, canonical=True)
        except Exception as e:
            # Deliberately broad: this is a best-effort batch cleaner, so one
            # bad entry must not abort the rest of the list.
            print(f"Failed to clean {smi}: {e}")
            valid_mask.append(False)
        else:
            clean_smis.append(clean_smi)
            valid_mask.append(True)

    return clean_smis, np.array(valid_mask, dtype=bool)