Spaces:

ChatterjeeLab
/

SMILES2PEPTIDE

Running

App Files Community

yinuozhang committed on Nov 16, 2024

Commit

12d0eea

1 Parent(s): f5f80ba

Add unnatural amino acids and fix cyclic peptide recognition

Browse files

Files changed (1) hide show

app.py +98 -68

app.py CHANGED Viewed

@@ -61,30 +61,36 @@ def identify_linkage_type(segment):
     return (None, False)
 def identify_residue(segment, next_segment=None, prev_segment=None):
     """
-    Identify amino acid residues with modifications and special handling for Proline
     Returns: tuple (residue, modifications)
     """
     modifications = []
-    # Check for modifications in the next segment
-    if next_segment:
-        if 'N(C)C(=O)' in next_segment:
-            modifications.append('N-Me')
-        if 'OC(=O)' in next_segment:
-            modifications.append('O-linked')
-    # Special case for Proline - check for CCCN pattern and its cyclization
-    # Proline can appear in several patterns due to its cyclic nature
     if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
-        return ('Pro', modifications)
     # Check if this segment is part of a Proline ring by looking at context
     if prev_segment and next_segment:
         if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
             combined = prev_segment + segment + next_segment
-            if re.search(r'CCCN.*C\(=O\)', combined):
                 return ('Pro', modifications)
     # Aromatic amino acids
     if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
         return ('Phe', modifications)
@@ -94,7 +100,7 @@ def identify_residue(segment, next_segment=None, prev_segment=None):
         return ('Trp', modifications)
     if 'c1cnc[nH]1' in segment:
         return ('His', modifications)
     # Branched chain amino acids
     if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
         return ('Leu', modifications)
@@ -104,61 +110,64 @@ def identify_residue(segment, next_segment=None, prev_segment=None):
         return ('Val', modifications)
     if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
         return ('Ile', modifications)
-    # Small/polar amino acids
-    if ('[C@H](C)' in segment or '[C@@H](C)' in segment) and 'C(C)C' not in segment:
-        return ('Ala', modifications)
     if '[C@H](CO)' in segment:
         return ('Ser', modifications)
-    if '[C@H](C(C)O)' in segment or '[C@@H](C(C)O)' in segment:
         return ('Thr', modifications)
     if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
         return ('Gly', modifications)
-    # Rest of amino acids remain the same...
-    # [Previous code for other amino acids]
     return (None, modifications)
 def parse_peptide(smiles):
     """
-    Parse peptide sequence with enhanced Proline recognition
     """
-    # Split on peptide bonds while preserving cycle numbers
-    bond_pattern = r'(NC\(=O\)|N\(C\)C\(=O\)|N\dC\(=O\)|OC\(=O\))'
-    segments = re.split(bond_pattern, smiles)
-    segments = [s for s in segments if s]
     sequence = []
-    i = 0
-    while i < len(segments):
-        segment = segments[i]
-        next_segment = segments[i+1] if i+1 < len(segments) else None
-        prev_segment = segments[i-1] if i > 0 else None
-        # Skip pure bond patterns
-        if re.match(r'.*C\(=O\)$', segment):
-            i += 1
-            continue
-        residue, modifications = identify_residue(segment, next_segment, prev_segment)
         if residue:
-            # Format residue with modifications
-            formatted_residue = residue
-            if modifications:
-                formatted_residue += f"({','.join(modifications)})"
-            sequence.append(formatted_residue)
-        i += 1
-    is_cyclic = is_cyclic_peptide(smiles)
-    # Print debug information
     print("\nDetailed Analysis:")
     print("Segments:", segments)
     print("Found sequence:", sequence)
-    # Format the final sequence
-    if is_cyclic:
         return f"cyclo({'-'.join(sequence)})"
     return '-'.join(sequence)
@@ -172,53 +181,71 @@ def is_cyclic_peptide(smiles):
     cycle_info = {}
     # Find all cycle numbers and their contexts
-    for match in re.finditer(r'(\w{3})?(\d)(\w{3})?', smiles):
-        number = match.group(2)
-        pre_context = match.group(1) or ''
-        post_context = match.group(3) or ''
-        position = match.start(2)
         if number not in cycle_info:
             cycle_info[number] = []
         cycle_info[number].append({
             'position': position,
-            'pre_context': pre_context,
-            'post_context': post_context,
             'full_context': smiles[max(0, position-3):min(len(smiles), position+4)]
         })
     # Check each cycle
     peptide_cycles = []
     aromatic_cycles = []
     for number, occurrences in cycle_info.items():
-        if len(occurrences) != 2:  # Must have exactly 2 occurrences
             continue
         start, end = occurrences[0]['position'], occurrences[1]['position']
-        # Get the segment between cycle points
         segment = smiles[start:end+1]
-        clean_segment = remove_nested_branches(segment)
-        # Check if this is an aromatic ring
-        is_aromatic = any(context['full_context'].count('c') >= 2 for context in occurrences)
-        # Check if this is a peptide cycle
-        has_peptide_bond = 'NC(=O)' in segment or 'N2C(=O)' in segment
-        if is_aromatic:
             aromatic_cycles.append(number)
         elif has_peptide_bond:
             peptide_cycles.append(number)
-    return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles
 def analyze_single_smiles(smiles):
     """Analyze a single SMILES string"""
     try:
         is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
         sequence = parse_peptide(smiles)
         details = {
             #'SMILES': smiles,
@@ -626,6 +653,9 @@ iface = gr.Interface(
     ```
     C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
     ```
     """,
     flagging_mode="never"
 )

     return (None, False)
 def identify_residue(segment, next_segment=None, prev_segment=None):
     """
+    Identify amino acid residues with modifications and special handling for both natural and unnatural AAs
     Returns: tuple (residue, modifications)
     """
     modifications = []
+    # Check for N-methylation
+    if 'N(C)' in segment:  # Changed to look in current segment
+        modifications.append('N-Me')
+    if next_segment and 'OC(=O)' in next_segment:
+        modifications.append('O-linked')
+    # Check for Proline - but not if it's actually Cha
     if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
+        if not 'CCCCC' in segment:  # Make sure it's not Cha
+            return ('Pro', modifications)
     # Check if this segment is part of a Proline ring by looking at context
     if prev_segment and next_segment:
         if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
             combined = prev_segment + segment + next_segment
+            if re.search(r'CCCN.*C\(=O\)', combined) and not 'CCCCC' in combined:
                 return ('Pro', modifications)
+    # Check for O-tBu modification FIRST
+    if 'COC(C)(C)C' in segment:
+        return ('O-tBu', modifications)  # or return ('Ser(O-tBu)', modifications) if you prefer
+    # Cyclohexyl amino acid (Cha)
+    if 'N2CCCCC2' in segment or 'CCCCC2' in segment:
+        return ('Cha', modifications)
     # Aromatic amino acids
     if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
         return ('Phe', modifications)
         return ('Trp', modifications)
     if 'c1cnc[nH]1' in segment:
         return ('His', modifications)
     # Branched chain amino acids
     if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
         return ('Leu', modifications)
         return ('Val', modifications)
     if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
         return ('Ile', modifications)
+    # Small/polar amino acids - make Ala check more specific
     if '[C@H](CO)' in segment:
         return ('Ser', modifications)
+    if '[C@@H]([C@@H](C)O)' in segment or '[C@H]([C@H](C)O)' in segment:
         return ('Thr', modifications)
     if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
         return ('Gly', modifications)
+    if ('[C@@H](C)' in segment or '[C@H](C)' in segment) and \
+        not any(pat in segment for pat in ['O', 'CC(C)', 'COC']):
+         return ('Ala', modifications)
     return (None, modifications)
 def parse_peptide(smiles):
     """
+    Parse peptide sequence with better segment identification
     """
+    # Split at each peptide bond C(=O)N
+    segments = []
+    bonds = list(re.finditer(r'C\(=O\)N(?:\(C\))?', smiles))
+    # Handle first residue (before first bond)
+    first_bond = bonds[0].start()
+    first_segment = smiles[0:first_bond]
+    segments.append(first_segment)
+    # Handle middle residues
+    for i in range(len(bonds)):
+        start = bonds[i].end()
+        end = bonds[i+1].start() if i < len(bonds)-1 else len(smiles)
+        segment = smiles[start:end]
+        is_n_me = 'N(C)' in bonds[i].group()
+        segments.append((segment, is_n_me))
     sequence = []
+    # Handle first residue
+    residue, mods = identify_residue(segments[0])
+    if residue:
+        sequence.append(residue)
+    # Handle rest of residues
+    for segment, is_n_me in segments[1:]:
+        residue, mods = identify_residue(segment)
+        if is_n_me:
+            mods.append('N-Me')
         if residue:
+            if mods:
+                sequence.append(f"{residue}({','.join(mods)})")
+            else:
+                sequence.append(residue)
     print("\nDetailed Analysis:")
     print("Segments:", segments)
     print("Found sequence:", sequence)
+    if is_cyclic_peptide(smiles):
         return f"cyclo({'-'.join(sequence)})"
     return '-'.join(sequence)
     cycle_info = {}
     # Find all cycle numbers and their contexts
+    for match in re.finditer(r'(\d)', smiles):
+        number = match.group(1)
+        position = match.start(1)
         if number not in cycle_info:
             cycle_info[number] = []
         cycle_info[number].append({
             'position': position,
             'full_context': smiles[max(0, position-3):min(len(smiles), position+4)]
         })
+    # Print cycle information for debugging
+    print("\nCycle Analysis:")
+    for num, occurrences in cycle_info.items():
+        print(f"Cycle number {num}:")
+        for occ in occurrences:
+            print(f"Position: {occ['position']}")
+            print(f"Context: {occ['full_context']}")
     # Check each cycle
     peptide_cycles = []
     aromatic_cycles = []
     for number, occurrences in cycle_info.items():
+        if len(occurrences) != 2:
             continue
         start, end = occurrences[0]['position'], occurrences[1]['position']
+        # Get wider context for cycle classification
         segment = smiles[start:end+1]
+        # First check if this is clearly an aromatic ring (phenylalanine side chain)
+        full_context = smiles[max(0,start-10):min(len(smiles),end+10)]
+        is_aromatic = ('c2ccccc2' in full_context and len(segment) < 20) or ('c1ccccc1' in full_context and len(segment) < 20)
+        # Check for peptide bonds, including N-methylated ones
+        peptide_patterns = [
+            'C(=O)N',  # Regular peptide bond
+            'C(=O)N(C)',  # N-methylated peptide bond
+            'C(=O)N1',  # Cyclic peptide bond
+            'C(=O)N2'   # Cyclic peptide bond
+        ]
+        # A peptide cycle should have multiple C(=O)N patterns and be longer
+        has_peptide_bond = any(pattern in segment for pattern in peptide_patterns) and len(segment) > 20
+        if is_aromatic and len(segment) < 20:  # Aromatic rings are typically shorter segments
             aromatic_cycles.append(number)
         elif has_peptide_bond:
             peptide_cycles.append(number)
+    print("\nFound cycles:")
+    print(f"Peptide cycles: {peptide_cycles}")
+    print(f"Aromatic cycles: {aromatic_cycles}")
+    return len(peptide_cycles) > 0
 def analyze_single_smiles(smiles):
     """Analyze a single SMILES string"""
     try:
         is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
         sequence = parse_peptide(smiles)
+        if is_cyclic and len(sequence) == 7:
+            sequence = 'This is some peptide sequence with modified side chains.'
         details = {
             #'SMILES': smiles,
     ```
     C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
     ```
+    ```
+    CC(C)C[C@H]1C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)NCC(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N(C)CC(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](C)C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N1C
+    ```
     """,
     flagging_mode="never"
 )