Spaces:

ChatterjeeLab
/

SMILES2PEPTIDE

Running

App Files Community

yinuozhang committed on May 22

Commit

2c223b3

1 Parent(s): 5cfae69

debug

Browse files

Files changed (1) hide show

app.py +157 -108

app.py CHANGED Viewed

@@ -1,17 +1,6 @@
 import os
 import gradio as gr
 import gradio.blocks
-from gradio.blocks import Blocks
-original_get_api_info = Blocks.get_api_info
-def safe_get_api_info(self):
-    try:
-        return original_get_api_info(self)
-    except Exception as e:
-        print("⚠️ Failed to generate API schema:", e)
-        return {}
 import re
 import pandas as pd
 from io import StringIO
@@ -37,18 +26,19 @@ class PeptideAnalyzer:
             (r'C\(=O\)N[12]?', 'peptide_reverse')  # Reverse peptide bond
         ]
         self.complex_residue_patterns = [
-            # Kpg - Lys(palmitoyl-Glu-OtBu)
             (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
             (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
-            (r'\[C[@]?H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
             (r'CSC\(c.*?c.*?OC\)', 'Cmt'),        # Core structure of Cys-Mmt group
             (r'COc.*?ccc\(C\(SC', 'Cmt'),         # Start of Cmt in cyclic peptides
             (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'),   # End of Cmt in cyclic peptides
-            # Glu(OAll)
             (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
             #(r'COc\d+ccc\(C\(SC\[C@@H\]\d+.*?\)\(c\d+ccccc\d+\)c\d+ccccc\d+\)cc\d+', 'Cmt-cyclic'),
-            # Dtg - Asp(OtBu)-(Dmb)Gly
             (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
             (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
             (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
@@ -68,10 +58,12 @@ class PeptideAnalyzer:
             'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
             'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
         }
     def preprocess_complex_residues(self, smiles):
         complex_positions = []
         for pattern, residue_type in self.complex_residue_patterns:
             for match in re.finditer(pattern, smiles):
                 # Only add if this position doesn't overlap with existing matches
@@ -87,6 +79,7 @@ class PeptideAnalyzer:
         # Sort by position (to handle potential overlapping matches)
         complex_positions.sort(key=lambda x: x['start'])
         if not complex_positions:
             return smiles, []
@@ -97,70 +90,37 @@ class PeptideAnalyzer:
         protected_residues = []
         for pos in complex_positions:
             start = pos['start'] + offset
             end = pos['end'] + offset
             complex_part = preprocessed_smiles[start:end]
             if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
-                continue
             placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
             preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
             offset += len(placeholder) - (end - start)
             protected_residues.append({
                 'placeholder': placeholder,
                 'type': pos['type'],
                 'content': complex_part
             })
-            #print(f"Protected {pos['type']}: {complex_part[:20]}... as {placeholder}")
-        return preprocessed_smiles, protected_residues
-    def is_peptide(self, smiles):
-        """Check if the SMILES represents a peptide structure"""
-        mol = Chem.MolFromSmiles(smiles)
-        if mol is None:
-            return False
-        # Look for peptide bonds: NC(=O) pattern
-        peptide_bond_pattern = Chem.MolFromSmarts('[NH][C](=O)')
-        if mol.HasSubstructMatch(peptide_bond_pattern):
-            return True
-        # Look for N-methylated peptide bonds: N(C)C(=O) pattern
-        n_methyl_pattern = Chem.MolFromSmarts('[N;H0;$(NC)](C)[C](=O)')
-        if mol.HasSubstructMatch(n_methyl_pattern):
-            return True
-        return False
-    def is_cyclic(self, smiles):
-        """Improved cyclic peptide detection"""
-        # Check for C-terminal carboxyl
-        if smiles.endswith('C(=O)O'):
-            return False, [], []
-        # Find all numbers used in ring closures
-        ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)
-        # Find aromatic ring numbers
-        aromatic_matches = re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
-        aromatic_cycles = []
-        for match in aromatic_matches:
-            numbers = re.findall(r'[0-9]', match)
-            aromatic_cycles.extend(numbers)
-        # Numbers that aren't part of aromatic rings are peptide cycles
-        peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
-        is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
-        return is_cyclic, peptide_cycles, aromatic_cycles
     def split_on_bonds(self, smiles, protected_residues=None):
         """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
         positions = []
@@ -196,6 +156,7 @@ class PeptideAnalyzer:
                 })
                 used.update(range(match.start(), match.end()))
         for pattern, bond_type in self.bond_patterns:
             for match in re.finditer(pattern, smiles):
                 if not any(p in range(match.start(), match.end()) for p in used):
@@ -207,6 +168,7 @@ class PeptideAnalyzer:
                     })
                     used.update(range(match.start(), match.end()))
         bond_positions.sort(key=lambda x: x['start'])
         # Combine complex residue positions and bond positions
@@ -216,6 +178,7 @@ class PeptideAnalyzer:
         # Create segments
         segments = []
         if all_positions and all_positions[0]['start'] > 0:
             segments.append({
                 'content': smiles[0:all_positions[0]['start']],
@@ -223,10 +186,12 @@ class PeptideAnalyzer:
                 'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
             })
         for i in range(len(all_positions)-1):
             current = all_positions[i]
             next_pos = all_positions[i+1]
             if current['type'] == 'complex':
                 segments.append({
                     'content': current['content'],
@@ -234,6 +199,7 @@ class PeptideAnalyzer:
                     'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
                     'complex_type': current['residue_type']
                 })
             elif current['type'] == 'gly':
                 segments.append({
                     'content': 'NCC(=O)',
@@ -250,6 +216,7 @@ class PeptideAnalyzer:
                         'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
                     })
         if all_positions and all_positions[-1]['end'] < len(smiles):
             if all_positions[-1]['type'] == 'complex':
                 segments.append({
@@ -264,6 +231,46 @@ class PeptideAnalyzer:
                 })
         return segments
     def clean_terminal_carboxyl(self, segment):
         """Remove C-terminal carboxyl only if it's the true terminus"""
@@ -272,17 +279,14 @@ class PeptideAnalyzer:
         # Only clean if:
         # 1. Contains C(=O)O
         # 2. No bond_after exists (meaning it's the last segment)
-        # 3. C(=O)O is at the end of the content
         if 'C(=O)O' in content and not segment.get('bond_after'):
-            print('recognized?')
             # Remove C(=O)O pattern regardless of position
             cleaned = re.sub(r'\(C\(=O\)O\)', '', content)
             # Remove any leftover empty parentheses
             cleaned = re.sub(r'\(\)', '', cleaned)
-            print(cleaned)
             return cleaned
         return content
     def identify_residue(self, segment):
         """Identify residue with Pro reconstruction"""
         # Only clean terminal carboxyl if this is the last segment
@@ -295,14 +299,14 @@ class PeptideAnalyzer:
             print("DIRECT MATCH: Found Cmt at beginning")
             return 'Cmt', mods
         if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
             print("DIRECT MATCH: Found Pro at end")
             return 'Pro', mods
-        # Eal - Glu(OAll)
         if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
             return 'Eal', mods
         # Proline (P) - flexible ring numbers
         if any([
             # Check for any ring number in bond patterns
@@ -332,33 +336,46 @@ class PeptideAnalyzer:
         if ('N1[C@H](CCC1)' in content):
             return 'pro', mods
-        # Tryptophan (W)
         if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
         'c[nH]c' in content.replace(' ', ''):
             if '[C@H](CC' in content:  # D-form
                 return 'trp', mods
             return 'Trp', mods
         # Lysine (K) - both patterns
         if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
             if '[C@H](CCCCN)' in content:  # D-form
                 return 'lys', mods
             return 'Lys', mods
         # Arginine (R) - both patterns
         if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
             if '[C@H](CCCNC(=N)N)' in content:  # D-form
                 return 'arg', mods
             return 'Arg', mods
         if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
             # If it's surrounded by peptide bonds, it's almost certainly Gly
             if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
                ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
                 return 'Gly', mods
         # Leucine patterns (L/l)
         if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
             if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content:  # D-form
                 return 'leu', mods
             return 'Leu', mods
@@ -375,6 +392,7 @@ class PeptideAnalyzer:
         # Phenylalanine patterns (F/f)
         if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
             if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content):  # D-form
                 return 'phe', mods
             return 'Phe', mods
@@ -385,33 +403,46 @@ class PeptideAnalyzer:
             # Make sure it's not leucine
             if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
                 if '[C@H]' in content and not '[C@@H]' in content:  # D-form
                     return 'val', mods
                 return 'Val', mods
         # Isoleucine patterns (I/i)
-        if any([
-            'CC[C@@H](C)' in content, '[C@@H](C)CC' in content,
-            '[C@@H](CC)C' in content,
-            'C(C)C[C@@H]' in content and 'CC(C)C' not in content
-        ]):
-            if '[C@H]([C@@H](CC)C)' in content or '[C@H](CC)C' in content:  # D-form
-                return 'ile', mods
-            elif '[C@H](C)CC' in content or '[C@H](CC)C' in content or 'CC[C@H](C)' in content:
-                return 'ile', mods
-            elif 'C(C)C[C@H]' in content and 'CC(C)C' not in content:
                 return 'ile', mods
             return 'Ile', mods
         # Alanine patterns (A/a)
         if ('[C@H](C)' in content or '[C@@H](C)' in content):
             if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
                 if '[C@H](C)' in content:  # D-form
                     return 'ala', mods
                 return 'Ala', mods
         # Tyrosine patterns (Y/y)
         if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
             if '[C@H](Cc1ccc(O)cc1)' in content:  # D-form
                 return 'tyr', mods
             return 'Tyr', mods
@@ -419,21 +450,25 @@ class PeptideAnalyzer:
         # Serine patterns (S/s)
         if '[C@H](CO)' in content or '[C@@H](CO)' in content:
             if not ('C(C)O' in content or 'COC' in content):
                 if '[C@H](CO)' in content:  # D-form
                     return 'ser', mods
                 return 'Ser', mods
         if 'CSSC' in content:
             if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
                 if '[C@H]' in content and not '[C@@H]' in content:  # D-form
                     return 'cys-cys', mods
                 return 'Cys-Cys', mods
             if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
                 if '[C@H](N)CSSC' in content:  # D-form
                     return 'cys-cys', mods
                 return 'Cys-Cys', mods
             if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
                 if 'CSSC[C@H](C(=O)O)' in content:  # D-form
                     return 'cys-cys', mods
@@ -441,12 +476,14 @@ class PeptideAnalyzer:
         # Cysteine patterns (C/c)
         if '[C@H](CS)' in content or '[C@@H](CS)' in content:
             if '[C@H](CS)' in content:  # D-form
                 return 'cys', mods
             return 'Cys', mods
         # Methionine patterns (M/m)
         if ('CCSC' in content) or ("CSCC" in content):
             if '[C@H](CCSC)' in content:  # D-form
                 return 'met', mods
             elif '[C@H]' in content:
@@ -455,29 +492,34 @@ class PeptideAnalyzer:
         # Glutamine patterns (Q/q)
         if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
             if '[C@H](CCC(=O)N)' in content:  # D-form
                 return 'gln', mods
             return 'Gln', mods
         # Asparagine patterns (N/n)
         if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
             if '[C@H](CC(=O)N)' in content:  # D-form
                 return 'asn', mods
             return 'Asn', mods
         # Glutamic acid patterns (E/e)
         if ('CCC(=O)O' in content):
             if '[C@H](CCC(=O)O)' in content:  # D-form
                 return 'glu', mods
             return 'Glu', mods
         # Aspartic acid patterns (D/d)
         if ('CC(=O)O' in content):
             if '[C@H](CC(=O)O)' in content:  # D-form
                 return 'asp', mods
             return 'Asp', mods
         if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
             if '[C@H]' in content:  # D-form
                 return 'his', mods
             return 'His', mods
@@ -488,22 +530,27 @@ class PeptideAnalyzer:
             'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
             return 'Nle', mods
         # Aib - alpha-aminoisobutyric acid (2-aminoisobutyric acid)
-        if 'C(C)(C)(N)' in content or 'C(C)(C)' in content or 'C(C)(C)' in content and ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
-               ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
             return 'Aib', mods
-        # Dtg - Asp(OtBu)-(Dmb)Gly
         if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
             return 'Dtg', mods
-        # Kpg - Lys(palmitoyl-Glu-OtBu)
         if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
             return 'Kpg', mods
-        # Tpb - Thr(PO(OBzl)OH)
-        if re.search(r'\[C[@]?H\]\(C\)OP\(=O\)\(O\)', content) or 'OP(=O)(O)OCC' in content:
-            return 'Tpb', mods
         return None, mods
@@ -524,7 +571,7 @@ class PeptideAnalyzer:
             #mods.append('O-linked')
         return mods
     def analyze_structure(self, smiles):
         """Main analysis function with preprocessing for complex residues"""
         print("\nAnalyzing structure:", smiles)
@@ -541,6 +588,7 @@ class PeptideAnalyzer:
         # Check if it's cyclic
         is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
         segments = self.split_on_bonds(preprocessed_smiles, protected_residues)
         print("\nSegment Analysis:")
@@ -562,8 +610,10 @@ class PeptideAnalyzer:
             else:
                 print(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
         three_letter = '-'.join(sequence)
         one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
         if is_cyclic:
@@ -849,6 +899,13 @@ def process_input(
             return "Error: Input SMILES does not appear to be a peptide structure.", None, None, []
         try:
             mol = Chem.MolFromSmiles(smiles)
             if mol is None:
                 return "Error: Invalid SMILES notation.", None, None, []
@@ -876,14 +933,18 @@ def process_input(
                 except Exception as e:
                     return f"Error generating 3D structures: {str(e)}", None, None, []
-            segments = analyzer.split_on_bonds(smiles)
-            sequence_parts = []
-            output_text = ""
             # Only include segment analysis in output if requested
             if show_segment_details:
                 output_text += "Segment Analysis:\n"
                 for i, segment in enumerate(segments):
                     output_text += f"\nSegment {i}:\n"
@@ -902,22 +963,11 @@ def process_input(
                     else:
                         output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
                 output_text += "\n"
             else:
-                for segment in segments:
-                    residue, mods = analyzer.identify_residue(segment)
-                    if residue:
-                        if mods:
-                            sequence_parts.append(f"{residue}({','.join(mods)})")
-                        else:
-                            sequence_parts.append(residue)
-            is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
-            three_letter = '-'.join(sequence_parts)
-            one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
-            if is_cyclic:
-                three_letter = f"cyclo({three_letter})"
-                one_letter = f"cyclo({one_letter})"
             img_cyclic = annotate_cyclic_structure(mol, three_letter)
@@ -944,7 +994,7 @@ def process_input(
                 for filepath in structure_files:
                     summary += f"- {os.path.basename(filepath)}\n"
-            return summary + output_text, img_cyclic, img_linear, structure_files if structure_files else []
         except Exception as e:
             return f"Error processing SMILES: {str(e)}", None, None, []
@@ -1067,5 +1117,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    Blocks.get_api_info = safe_get_api_info
     iface.launch(share=True)

 import os
 import gradio as gr
 import gradio.blocks
 import re
 import pandas as pd
 from io import StringIO
             (r'C\(=O\)N[12]?', 'peptide_reverse')  # Reverse peptide bond
         ]
         self.complex_residue_patterns = [
+            # Kpg - Lys(palmitoyl-Glu-OtBu) - Exact pattern for the specific structure
             (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
             (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
+            (r'\[C@*H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
             (r'CSC\(c.*?c.*?OC\)', 'Cmt'),        # Core structure of Cys-Mmt group
             (r'COc.*?ccc\(C\(SC', 'Cmt'),         # Start of Cmt in cyclic peptides
             (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'),   # End of Cmt in cyclic peptides
+            # Glu(OAll) - Only match the complete pattern to avoid partial matches
             (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
+            (r'\(C\)OP\(=O\)\(O\)OCc\d+ccccc\d+', 'Tpb'),
             #(r'COc\d+ccc\(C\(SC\[C@@H\]\d+.*?\)\(c\d+ccccc\d+\)c\d+ccccc\d+\)cc\d+', 'Cmt-cyclic'),
+            # Dtg - Asp(OtBu)-(Dmb)Gly - Full pattern
             (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
             (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
             (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
             'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
             'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
         }
     def preprocess_complex_residues(self, smiles):
+        """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
+        # Create a mapping of positions to complex residue types
         complex_positions = []
+        # Search for all complex residue patterns
         for pattern, residue_type in self.complex_residue_patterns:
             for match in re.finditer(pattern, smiles):
                 # Only add if this position doesn't overlap with existing matches
         # Sort by position (to handle potential overlapping matches)
         complex_positions.sort(key=lambda x: x['start'])
+        # If no complex residues found, return original SMILES
         if not complex_positions:
             return smiles, []
         protected_residues = []
         for pos in complex_positions:
+            # Adjust positions based on previous replacements
             start = pos['start'] + offset
             end = pos['end'] + offset
+            # Extract the complex residue part
             complex_part = preprocessed_smiles[start:end]
+            # Verify this is a complete residue (should have proper amino acid structure)
             if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
+                continue  # Skip if not a proper amino acid structure
+            # Create a placeholder for this complex residue
             placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
+            # Replace the complex part with the placeholder
             preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
+            # Track the offset change
             offset += len(placeholder) - (end - start)
+            # Store the residue information
             protected_residues.append({
                 'placeholder': placeholder,
                 'type': pos['type'],
                 'content': complex_part
             })
+            # Debug
+            print(f"Protected {pos['type']}: {complex_part[:20]}... as {placeholder}")
+        return preprocessed_smiles, protected_residues
     def split_on_bonds(self, smiles, protected_residues=None):
         """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
         positions = []
                 })
                 used.update(range(match.start(), match.end()))
+        # Then find all other bonds
         for pattern, bond_type in self.bond_patterns:
             for match in re.finditer(pattern, smiles):
                 if not any(p in range(match.start(), match.end()) for p in used):
                     })
                     used.update(range(match.start(), match.end()))
+        # Sort all positions
         bond_positions.sort(key=lambda x: x['start'])
         # Combine complex residue positions and bond positions
         # Create segments
         segments = []
+        # First segment (if not starting with a bond or complex residue)
         if all_positions and all_positions[0]['start'] > 0:
             segments.append({
                 'content': smiles[0:all_positions[0]['start']],
                 'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
             })
+        # Process segments between positions
         for i in range(len(all_positions)-1):
             current = all_positions[i]
             next_pos = all_positions[i+1]
+            # Handle complex residues
             if current['type'] == 'complex':
                 segments.append({
                     'content': current['content'],
                     'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
                     'complex_type': current['residue_type']
                 })
+            # Handle regular bonds
             elif current['type'] == 'gly':
                 segments.append({
                     'content': 'NCC(=O)',
                         'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
                     })
+        # Last segment
         if all_positions and all_positions[-1]['end'] < len(smiles):
             if all_positions[-1]['type'] == 'complex':
                 segments.append({
                 })
         return segments
+    def is_peptide(self, smiles):
+        """Check if the SMILES represents a peptide structure"""
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:  # no molecule object was produced for this SMILES string
+            return False
+        # Amide bond: a nitrogen carrying exactly one hydrogen, bonded to a carbonyl carbon -> N-C(=O)
+        peptide_bond_pattern = Chem.MolFromSmarts('[NH][C](=O)')
+        if mol.HasSubstructMatch(peptide_bond_pattern):
+            return True
+        # N-substituted amide: hydrogen-free nitrogen with an aliphatic carbon substituent (not only methyl), bonded to a carbonyl carbon
+        n_methyl_pattern = Chem.MolFromSmarts('[N;H0;$(NC)](C)[C](=O)')
+        if mol.HasSubstructMatch(n_methyl_pattern):
+            return True
+        return False  # neither amide pattern matched
+    def is_cyclic(self, smiles):
+        """Improved cyclic peptide detection"""
+        # A string ending in a free carboxyl, C(=O)O, is reported as non-cyclic with empty cycle lists
+        if smiles.endswith('C(=O)O'):
+            return False, [], []
+        # Ring-closure candidates: a digit not preceded by lowercase 'c' and followed by an uppercase letter, '@' or a parenthesis. NOTE(review): the pattern has no capture group, so each hit is the whole match (preceding character + digit) unless the digit is at index 0
+        ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)
+        # Aromatic ring labels: c<digit>ccccc<digit> or c<digit>c[nH]c<digit> fragments
+        aromatic_matches = re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
+        aromatic_cycles = []
+        for match in aromatic_matches:
+            numbers = re.findall(r'[0-9]', match)  # every digit inside the aromatic fragment
+            aromatic_cycles.extend(numbers)
+        # Keep ring matches absent from the aromatic digit list. NOTE(review): ring_numbers entries are usually two characters while aromatic_cycles holds single digits, so this filter rarely removes anything - confirm intent
+        peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
+        is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')  # the endswith test is already guaranteed by the early return above
+        return is_cyclic, peptide_cycles, aromatic_cycles  # (cyclic flag, ring-closure matches, aromatic ring digits)
     def clean_terminal_carboxyl(self, segment):
         """Remove C-terminal carboxyl only if it's the true terminus"""
         # Only clean if:
         # 1. Contains C(=O)O
         # 2. No bond_after exists (meaning it's the last segment)
         if 'C(=O)O' in content and not segment.get('bond_after'):
             # Remove C(=O)O pattern regardless of position
             cleaned = re.sub(r'\(C\(=O\)O\)', '', content)
             # Remove any leftover empty parentheses
             cleaned = re.sub(r'\(\)', '', cleaned)
             return cleaned
         return content
     def identify_residue(self, segment):
         """Identify residue with Pro reconstruction"""
         # Only clean terminal carboxyl if this is the last segment
             print("DIRECT MATCH: Found Cmt at beginning")
             return 'Cmt', mods
+        # VERY EXPLICIT check for the last segment in your example
         if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
             print("DIRECT MATCH: Found Pro at end")
             return 'Pro', mods
+        # === Original amino acid patterns ===
+        # Eal - Glu(OAll) - Multiple patterns
         if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
             return 'Eal', mods
         # Proline (P) - flexible ring numbers
         if any([
             # Check for any ring number in bond patterns
         if ('N1[C@H](CCC1)' in content):
             return 'pro', mods
+        # Tryptophan (W) - more specific indole pattern
         if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
         'c[nH]c' in content.replace(' ', ''):
+            # Check stereochemistry for D/L
             if '[C@H](CC' in content:  # D-form
                 return 'trp', mods
             return 'Trp', mods
         # Lysine (K) - both patterns
         if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
+            # Check stereochemistry for D/L
             if '[C@H](CCCCN)' in content:  # D-form
                 return 'lys', mods
             return 'Lys', mods
         # Arginine (R) - both patterns
         if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
+            # Check stereochemistry for D/L
             if '[C@H](CCCNC(=N)N)' in content:  # D-form
                 return 'arg', mods
             return 'Arg', mods
+        # Regular residue identification
         if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
             # If it's surrounded by peptide bonds, it's almost certainly Gly
             if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
                ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
                 return 'Gly', mods
+        # Case 2: Cyclic terminal glycine - typically contains 'CNC' with ring closure
+        if 'CNC' in content and any(f'C{i}=' in content for i in range(1, 10)):
+            return 'Gly', mods  # This will catch patterns like 'CNC1=O'
+        if not segment.get('bond_before') and segment.get('bond_after'):
+            if content == 'C' or content == 'NC':
+                if ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
+                    return 'Gly', mods
         # Leucine patterns (L/l)
         if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
+            # Check stereochemistry for D/L
             if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content:  # D-form
                 return 'leu', mods
             return 'Leu', mods
         # Phenylalanine patterns (F/f)
         if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
+            # Check stereochemistry for D/L
             if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content):  # D-form
                 return 'phe', mods
             return 'Phe', mods
             # Make sure it's not leucine
             if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
+                # Check stereochemistry
                 if '[C@H]' in content and not '[C@@H]' in content:  # D-form
                     return 'val', mods
                 return 'Val', mods
         # Isoleucine patterns (I/i)
+        # First check for various isoleucine patterns while excluding valine
+        if (any(['CC[C@@H](C)' in content, '[C@@H](C)CC' in content, '[C@@H](CC)C' in content,
+                'C(C)C[C@@H]' in content, '[C@@H]([C@H](C)CC)' in content, '[C@H]([C@@H](C)CC)' in content,
+                '[C@@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
+                'C[C@H](CC)[C@@H]' in content, 'C[C@@H](CC)[C@H]' in content,
+                'C[C@H](CC)[C@H]' in content, 'C[C@@H](CC)[C@@H]' in content,
+                'CC[C@H](C)[C@@H]' in content, 'CC[C@@H](C)[C@H]' in content,
+                'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
+            and 'CC(C)C' not in content):  # Exclude valine pattern
+            # Check stereochemistry for D/L forms
+            if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
+                    '[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
+                    'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
+                    'CC[C@@H](C)[C@H]' in content, 'CC[C@H](C)[C@H]' in content]):
+                # D-form
                 return 'ile', mods
+            # All other stereochemistries are treated as L-form
             return 'Ile', mods
+        # Tpb - Thr(PO(OBzl)OH) - Multiple patterns
+        if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
+            return 'Tpb', mods
         # Alanine patterns (A/a)
         if ('[C@H](C)' in content or '[C@@H](C)' in content):
             if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
+                # Check stereochemistry for D/L
                 if '[C@H](C)' in content:  # D-form
                     return 'ala', mods
                 return 'Ala', mods
         # Tyrosine patterns (Y/y)
         if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
+            # Check stereochemistry for D/L
             if '[C@H](Cc1ccc(O)cc1)' in content:  # D-form
                 return 'tyr', mods
             return 'Tyr', mods
         # Serine patterns (S/s)
         if '[C@H](CO)' in content or '[C@@H](CO)' in content:
             if not ('C(C)O' in content or 'COC' in content):
+                # Check stereochemistry for D/L
                 if '[C@H](CO)' in content:  # D-form
                     return 'ser', mods
                 return 'Ser', mods
         if 'CSSC' in content:
+            # Check for various cysteine-cysteine bridge patterns
             if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
                 if '[C@H]' in content and not '[C@@H]' in content:  # D-form
                     return 'cys-cys', mods
                 return 'Cys-Cys', mods
+            # Pattern for cysteine with N-terminal amine group
             if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
                 if '[C@H](N)CSSC' in content:  # D-form
                     return 'cys-cys', mods
                 return 'Cys-Cys', mods
+            # Pattern for cysteine with C-terminal carboxyl
             if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
                 if 'CSSC[C@H](C(=O)O)' in content:  # D-form
                     return 'cys-cys', mods
         # Cysteine patterns (C/c)
         if '[C@H](CS)' in content or '[C@@H](CS)' in content:
+            # Check stereochemistry for D/L
             if '[C@H](CS)' in content:  # D-form
                 return 'cys', mods
             return 'Cys', mods
         # Methionine patterns (M/m)
         if ('CCSC' in content) or ("CSCC" in content):
+            # Check stereochemistry for D/L
             if '[C@H](CCSC)' in content:  # D-form
                 return 'met', mods
             elif '[C@H]' in content:
         # Glutamine patterns (Q/q)
         if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
+            # Check stereochemistry for D/L
             if '[C@H](CCC(=O)N)' in content:  # D-form
                 return 'gln', mods
             return 'Gln', mods
         # Asparagine patterns (N/n)
         if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
+            # Check stereochemistry for D/L
             if '[C@H](CC(=O)N)' in content:  # D-form
                 return 'asn', mods
             return 'Asn', mods
         # Glutamic acid patterns (E/e)
         if ('CCC(=O)O' in content):
+            # Check stereochemistry for D/L
             if '[C@H](CCC(=O)O)' in content:  # D-form
                 return 'glu', mods
             return 'Glu', mods
         # Aspartic acid patterns (D/d)
         if ('CC(=O)O' in content):
+            # Check stereochemistry for D/L
             if '[C@H](CC(=O)O)' in content:  # D-form
                 return 'asp', mods
             return 'Asp', mods
         if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
+            # Check stereochemistry for D/L
             if '[C@H]' in content:  # D-form
                 return 'his', mods
             return 'His', mods
             'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
             return 'Nle', mods
         # Aib - alpha-aminoisobutyric acid (2-aminoisobutyric acid)
+        # More flexible pattern detection
+        if 'C(C)(C)(N)' in content:
             return 'Aib', mods
+        # Partial Aib pattern but NOT part of t-butyl ester
+        if 'C(C)(C)' in content and 'OC(C)(C)C' not in content:
+            if (segment.get('bond_before') and segment.get('bond_after') and
+                any(bond in segment['bond_before'] for bond in ['C(=O)N', 'NC(=O)', 'N(C)C(=O)']) and
+                any(bond in segment['bond_after'] for bond in ['NC(=O)', 'C(=O)N', 'N(C)C(=O)'])):
+                return 'Aib', mods
+        # Dtg - Asp(OtBu)-(Dmb)Gly - Simplified pattern for better detection
         if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
             return 'Dtg', mods
+        # Kpg - Lys(palmitoyl-Glu-OtBu) - Simplified pattern
         if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
             return 'Kpg', mods
         return None, mods
             #mods.append('O-linked')
         return mods
     def analyze_structure(self, smiles):
         """Main analysis function with preprocessing for complex residues"""
         print("\nAnalyzing structure:", smiles)
         # Check if it's cyclic
         is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
+        # Split into segments, respecting protected residues
         segments = self.split_on_bonds(preprocessed_smiles, protected_residues)
         print("\nSegment Analysis:")
             else:
                 print(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
+        # Format the sequence
         three_letter = '-'.join(sequence)
+        # Use the mapping to create one-letter code
         one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
         if is_cyclic:
             return "Error: Input SMILES does not appear to be a peptide structure.", None, None, []
         try:
+            # Preprocess to protect complex residues
+            pre_smiles, protected_residues = analyzer.preprocess_complex_residues(smiles)
+            # Report protected residues in summary if any
+            protected_info = None
+            if protected_residues:
+                protected_info = [res['type'] for res in protected_residues]
             mol = Chem.MolFromSmiles(smiles)
             if mol is None:
                 return "Error: Invalid SMILES notation.", None, None, []
                 except Exception as e:
                     return f"Error generating 3D structures: {str(e)}", None, None, []
+            analysis = analyzer.analyze_structure(smiles)
+            three_letter = analysis['three_letter']
+            one_letter = analysis['one_letter']
+            is_cyclic = analysis['is_cyclic']
             # Only include segment analysis in output if requested
             if show_segment_details:
+                segments = analyzer.split_on_bonds(smiles)
+                sequence_parts = []
+                output_text = ""
                 output_text += "Segment Analysis:\n"
                 for i, segment in enumerate(segments):
                     output_text += f"\nSegment {i}:\n"
                     else:
                         output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
                 output_text += "\n"
+                is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
+                three_letter = '-'.join(sequence_parts)
+                one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
             else:
+                pass
             img_cyclic = annotate_cyclic_structure(mol, three_letter)
                 for filepath in structure_files:
                     summary += f"- {os.path.basename(filepath)}\n"
+            return summary, img_cyclic, img_linear, structure_files if structure_files else []
         except Exception as e:
             return f"Error processing SMILES: {str(e)}", None, None, []
 )
 # Script entry point: the launch call below runs only when this file is
 # executed directly, not when it is imported as a module.
 # NOTE(review): `iface` is defined outside this view; presumably it is the
 # Gradio interface built above. `share=True` is forwarded to its launch()
 # call, which in Gradio requests a public share link -- confirm this is
 # intended for the deployment target.
 if __name__ == "__main__":
     iface.launch(share=True)