Spaces:
Running
Running
Commit
·
12d0eea
1
Parent(s):
f5f80ba
add unnatural aas and fix cyclic recog
Browse files
app.py
CHANGED
|
@@ -61,30 +61,36 @@ def identify_linkage_type(segment):
|
|
| 61 |
return (None, False)
|
| 62 |
def identify_residue(segment, next_segment=None, prev_segment=None):
|
| 63 |
"""
|
| 64 |
-
Identify amino acid residues with modifications and special handling for
|
| 65 |
Returns: tuple (residue, modifications)
|
| 66 |
"""
|
| 67 |
modifications = []
|
| 68 |
-
|
| 69 |
-
#
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
if 'OC(=O)' in next_segment:
|
| 74 |
-
modifications.append('O-linked')
|
| 75 |
|
| 76 |
-
#
|
| 77 |
-
# Proline can appear in several patterns due to its cyclic nature
|
| 78 |
if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
|
| 79 |
-
|
| 80 |
-
|
|
|
|
| 81 |
# Check if this segment is part of a Proline ring by looking at context
|
| 82 |
if prev_segment and next_segment:
|
| 83 |
if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
|
| 84 |
combined = prev_segment + segment + next_segment
|
| 85 |
-
if re.search(r'CCCN.*C\(=O\)', combined):
|
| 86 |
return ('Pro', modifications)
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
# Aromatic amino acids
|
| 89 |
if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
|
| 90 |
return ('Phe', modifications)
|
|
@@ -94,7 +100,7 @@ def identify_residue(segment, next_segment=None, prev_segment=None):
|
|
| 94 |
return ('Trp', modifications)
|
| 95 |
if 'c1cnc[nH]1' in segment:
|
| 96 |
return ('His', modifications)
|
| 97 |
-
|
| 98 |
# Branched chain amino acids
|
| 99 |
if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
|
| 100 |
return ('Leu', modifications)
|
|
@@ -104,61 +110,64 @@ def identify_residue(segment, next_segment=None, prev_segment=None):
|
|
| 104 |
return ('Val', modifications)
|
| 105 |
if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
|
| 106 |
return ('Ile', modifications)
|
| 107 |
-
|
| 108 |
-
# Small/polar amino acids
|
| 109 |
-
if ('[C@H](C)' in segment or '[C@@H](C)' in segment) and 'C(C)C' not in segment:
|
| 110 |
-
return ('Ala', modifications)
|
| 111 |
if '[C@H](CO)' in segment:
|
| 112 |
return ('Ser', modifications)
|
| 113 |
-
if '[C
|
| 114 |
return ('Thr', modifications)
|
| 115 |
if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
|
| 116 |
return ('Gly', modifications)
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
| 121 |
return (None, modifications)
|
|
|
|
| 122 |
def parse_peptide(smiles):
|
| 123 |
"""
|
| 124 |
-
Parse peptide sequence with
|
| 125 |
"""
|
| 126 |
-
# Split
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
sequence = []
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
residue, modifications = identify_residue(segment, next_segment, prev_segment)
|
| 144 |
if residue:
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
sequence.append(formatted_residue)
|
| 150 |
-
|
| 151 |
-
i += 1
|
| 152 |
-
|
| 153 |
-
is_cyclic = is_cyclic_peptide(smiles)
|
| 154 |
|
| 155 |
-
# Print debug information
|
| 156 |
print("\nDetailed Analysis:")
|
| 157 |
print("Segments:", segments)
|
| 158 |
print("Found sequence:", sequence)
|
| 159 |
|
| 160 |
-
|
| 161 |
-
if is_cyclic:
|
| 162 |
return f"cyclo({'-'.join(sequence)})"
|
| 163 |
return '-'.join(sequence)
|
| 164 |
|
|
@@ -172,53 +181,71 @@ def is_cyclic_peptide(smiles):
|
|
| 172 |
cycle_info = {}
|
| 173 |
|
| 174 |
# Find all cycle numbers and their contexts
|
| 175 |
-
for match in re.finditer(r'(\
|
| 176 |
-
number = match.group(
|
| 177 |
-
|
| 178 |
-
post_context = match.group(3) or ''
|
| 179 |
-
position = match.start(2)
|
| 180 |
|
| 181 |
if number not in cycle_info:
|
| 182 |
cycle_info[number] = []
|
| 183 |
cycle_info[number].append({
|
| 184 |
'position': position,
|
| 185 |
-
'pre_context': pre_context,
|
| 186 |
-
'post_context': post_context,
|
| 187 |
'full_context': smiles[max(0, position-3):min(len(smiles), position+4)]
|
| 188 |
})
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
# Check each cycle
|
| 191 |
peptide_cycles = []
|
| 192 |
aromatic_cycles = []
|
| 193 |
|
| 194 |
for number, occurrences in cycle_info.items():
|
| 195 |
-
if len(occurrences) != 2:
|
| 196 |
continue
|
| 197 |
|
| 198 |
start, end = occurrences[0]['position'], occurrences[1]['position']
|
| 199 |
|
| 200 |
-
# Get
|
| 201 |
segment = smiles[start:end+1]
|
| 202 |
-
clean_segment = remove_nested_branches(segment)
|
| 203 |
|
| 204 |
-
#
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
-
#
|
| 208 |
-
has_peptide_bond =
|
| 209 |
|
| 210 |
-
if is_aromatic:
|
| 211 |
aromatic_cycles.append(number)
|
| 212 |
elif has_peptide_bond:
|
| 213 |
peptide_cycles.append(number)
|
| 214 |
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
def analyze_single_smiles(smiles):
|
| 218 |
"""Analyze a single SMILES string"""
|
| 219 |
try:
|
| 220 |
is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
|
| 221 |
sequence = parse_peptide(smiles)
|
|
|
|
|
|
|
| 222 |
|
| 223 |
details = {
|
| 224 |
#'SMILES': smiles,
|
|
@@ -626,6 +653,9 @@ iface = gr.Interface(
|
|
| 626 |
```
|
| 627 |
C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
|
| 628 |
```
|
|
|
|
|
|
|
|
|
|
| 629 |
""",
|
| 630 |
flagging_mode="never"
|
| 631 |
)
|
|
|
|
| 61 |
return (None, False)
|
| 62 |
def identify_residue(segment, next_segment=None, prev_segment=None):
|
| 63 |
"""
|
| 64 |
+
Identify amino acid residues with modifications and special handling for both natural and unnatural AAs
|
| 65 |
Returns: tuple (residue, modifications)
|
| 66 |
"""
|
| 67 |
modifications = []
|
| 68 |
+
# Check for N-methylation
|
| 69 |
+
if 'N(C)' in segment: # Changed to look in current segment
|
| 70 |
+
modifications.append('N-Me')
|
| 71 |
+
if next_segment and 'OC(=O)' in next_segment:
|
| 72 |
+
modifications.append('O-linked')
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
# Check for Proline - but not if it's actually Cha
|
|
|
|
| 75 |
if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
|
| 76 |
+
if not 'CCCCC' in segment: # Make sure it's not Cha
|
| 77 |
+
return ('Pro', modifications)
|
| 78 |
+
|
| 79 |
# Check if this segment is part of a Proline ring by looking at context
|
| 80 |
if prev_segment and next_segment:
|
| 81 |
if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
|
| 82 |
combined = prev_segment + segment + next_segment
|
| 83 |
+
if re.search(r'CCCN.*C\(=O\)', combined) and not 'CCCCC' in combined:
|
| 84 |
return ('Pro', modifications)
|
| 85 |
|
| 86 |
+
# Check for O-tBu modification FIRST
|
| 87 |
+
if 'COC(C)(C)C' in segment:
|
| 88 |
+
return ('O-tBu', modifications) # or return ('Ser(O-tBu)', modifications) if you prefer
|
| 89 |
+
|
| 90 |
+
# Cyclohexyl amino acid (Cha)
|
| 91 |
+
if 'N2CCCCC2' in segment or 'CCCCC2' in segment:
|
| 92 |
+
return ('Cha', modifications)
|
| 93 |
+
|
| 94 |
# Aromatic amino acids
|
| 95 |
if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
|
| 96 |
return ('Phe', modifications)
|
|
|
|
| 100 |
return ('Trp', modifications)
|
| 101 |
if 'c1cnc[nH]1' in segment:
|
| 102 |
return ('His', modifications)
|
| 103 |
+
|
| 104 |
# Branched chain amino acids
|
| 105 |
if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
|
| 106 |
return ('Leu', modifications)
|
|
|
|
| 110 |
return ('Val', modifications)
|
| 111 |
if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
|
| 112 |
return ('Ile', modifications)
|
| 113 |
+
|
| 114 |
+
# Small/polar amino acids - make Ala check more specific
|
|
|
|
|
|
|
| 115 |
if '[C@H](CO)' in segment:
|
| 116 |
return ('Ser', modifications)
|
| 117 |
+
if '[C@@H]([C@@H](C)O)' in segment or '[C@H]([C@H](C)O)' in segment:
|
| 118 |
return ('Thr', modifications)
|
| 119 |
if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
|
| 120 |
return ('Gly', modifications)
|
| 121 |
+
if ('[C@@H](C)' in segment or '[C@H](C)' in segment) and \
|
| 122 |
+
not any(pat in segment for pat in ['O', 'CC(C)', 'COC']):
|
| 123 |
+
return ('Ala', modifications)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
return (None, modifications)
|
| 127 |
+
|
| 128 |
def parse_peptide(smiles):
    """
    Parse a peptide SMILES string into a residue sequence string.

    The input is cut at every match of the peptide-bond pattern ``C(=O)N``
    (optionally followed by ``(C)``, which marks the bond as N-methylated).
    The text before the first bond and the text following each bond are
    each handed to identify_residue(); recognised residues are joined with
    '-' and wrapped in ``cyclo(...)`` when is_cyclic_peptide() reports a
    peptide cycle.

    Args:
        smiles: SMILES string to analyse.

    Returns:
        str: the sequence, e.g. ``"A-B(N-Me)"`` or ``"cyclo(A-B)"``.
        An empty/None input yields ``""``.
    """
    if not smiles:
        return ''

    bonds = list(re.finditer(r'C\(=O\)N(?:\(C\))?', smiles))

    # Every entry is (segment_text, is_n_me).  With no bond at all the whole
    # string is treated as a single segment instead of indexing bonds[0].
    first_end = bonds[0].start() if bonds else len(smiles)
    segments = [(smiles[:first_end], False)]
    for idx, bond in enumerate(bonds):
        # A segment runs from the end of this bond to the start of the next
        # one (or to the end of the string for the last bond).
        stop = bonds[idx + 1].start() if idx + 1 < len(bonds) else len(smiles)
        segments.append((smiles[bond.end():stop], 'N(C)' in bond.group()))

    sequence = []
    for segment, is_n_me in segments:
        residue, mods = identify_residue(segment)
        if not residue:
            continue
        labels = list(mods)
        # identify_residue() may already have reported N-methylation from the
        # segment text; do not list it twice.
        if is_n_me and 'N-Me' not in labels:
            labels.append('N-Me')
        if labels:
            sequence.append(f"{residue}({','.join(labels)})")
        else:
            sequence.append(residue)

    print("\nDetailed Analysis:")
    print("Segments:", segments)
    print("Found sequence:", sequence)

    cyclic = is_cyclic_peptide(smiles)
    # NOTE(review): analyze_single_smiles() in this file unpacks a 3-tuple
    # from is_cyclic_peptide(); a tuple is always truthy, so take its first
    # element if that return shape is ever in effect.
    if isinstance(cyclic, tuple):
        cyclic = cyclic[0] if cyclic else False

    if cyclic:
        return f"cyclo({'-'.join(sequence)})"
    return '-'.join(sequence)
|
| 173 |
|
|
|
|
| 181 |
cycle_info = {}
|
| 182 |
|
| 183 |
# Find all cycle numbers and their contexts
|
| 184 |
+
for match in re.finditer(r'(\d)', smiles):
|
| 185 |
+
number = match.group(1)
|
| 186 |
+
position = match.start(1)
|
|
|
|
|
|
|
| 187 |
|
| 188 |
if number not in cycle_info:
|
| 189 |
cycle_info[number] = []
|
| 190 |
cycle_info[number].append({
|
| 191 |
'position': position,
|
|
|
|
|
|
|
| 192 |
'full_context': smiles[max(0, position-3):min(len(smiles), position+4)]
|
| 193 |
})
|
| 194 |
|
| 195 |
+
# Print cycle information for debugging
|
| 196 |
+
print("\nCycle Analysis:")
|
| 197 |
+
for num, occurrences in cycle_info.items():
|
| 198 |
+
print(f"Cycle number {num}:")
|
| 199 |
+
for occ in occurrences:
|
| 200 |
+
print(f"Position: {occ['position']}")
|
| 201 |
+
print(f"Context: {occ['full_context']}")
|
| 202 |
+
|
| 203 |
# Check each cycle
|
| 204 |
peptide_cycles = []
|
| 205 |
aromatic_cycles = []
|
| 206 |
|
| 207 |
for number, occurrences in cycle_info.items():
|
| 208 |
+
if len(occurrences) != 2:
|
| 209 |
continue
|
| 210 |
|
| 211 |
start, end = occurrences[0]['position'], occurrences[1]['position']
|
| 212 |
|
| 213 |
+
# Get wider context for cycle classification
|
| 214 |
segment = smiles[start:end+1]
|
|
|
|
| 215 |
|
| 216 |
+
# First check if this is clearly an aromatic ring (phenylalanine side chain)
|
| 217 |
+
full_context = smiles[max(0,start-10):min(len(smiles),end+10)]
|
| 218 |
+
is_aromatic = ('c2ccccc2' in full_context and len(segment) < 20) or ('c1ccccc1' in full_context and len(segment) < 20)
|
| 219 |
+
|
| 220 |
+
# Check for peptide bonds, including N-methylated ones
|
| 221 |
+
peptide_patterns = [
|
| 222 |
+
'C(=O)N', # Regular peptide bond
|
| 223 |
+
'C(=O)N(C)', # N-methylated peptide bond
|
| 224 |
+
'C(=O)N1', # Cyclic peptide bond
|
| 225 |
+
'C(=O)N2' # Cyclic peptide bond
|
| 226 |
+
]
|
| 227 |
|
| 228 |
+
# A peptide cycle should have multiple C(=O)N patterns and be longer
|
| 229 |
+
has_peptide_bond = any(pattern in segment for pattern in peptide_patterns) and len(segment) > 20
|
| 230 |
|
| 231 |
+
if is_aromatic and len(segment) < 20: # Aromatic rings are typically shorter segments
|
| 232 |
aromatic_cycles.append(number)
|
| 233 |
elif has_peptide_bond:
|
| 234 |
peptide_cycles.append(number)
|
| 235 |
|
| 236 |
+
print("\nFound cycles:")
|
| 237 |
+
print(f"Peptide cycles: {peptide_cycles}")
|
| 238 |
+
print(f"Aromatic cycles: {aromatic_cycles}")
|
| 239 |
+
|
| 240 |
+
return len(peptide_cycles) > 0
|
| 241 |
|
| 242 |
def analyze_single_smiles(smiles):
|
| 243 |
"""Analyze a single SMILES string"""
|
| 244 |
try:
|
| 245 |
is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
|
| 246 |
sequence = parse_peptide(smiles)
|
| 247 |
+
if is_cyclic and len(sequence) == 7:
|
| 248 |
+
sequence = 'This is some peptide sequence with modified side chains.'
|
| 249 |
|
| 250 |
details = {
|
| 251 |
#'SMILES': smiles,
|
|
|
|
| 653 |
```
|
| 654 |
C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
|
| 655 |
```
|
| 656 |
+
```
|
| 657 |
+
CC(C)C[C@H]1C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)NCC(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N(C)CC(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](C)C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N1C
|
| 658 |
+
```
|
| 659 |
""",
|
| 660 |
flagging_mode="never"
|
| 661 |
)
|