yinuozhang committed on
Commit
12d0eea
1 Parent(s): f5f80ba

add unnatural aas and fix cyclic recog

Browse files
Files changed (1) hide show
  1. app.py +98 -68
app.py CHANGED
@@ -61,30 +61,36 @@ def identify_linkage_type(segment):
61
  return (None, False)
62
  def identify_residue(segment, next_segment=None, prev_segment=None):
63
  """
64
- Identify amino acid residues with modifications and special handling for Proline
65
  Returns: tuple (residue, modifications)
66
  """
67
  modifications = []
68
-
69
- # Check for modifications in the next segment
70
- if next_segment:
71
- if 'N(C)C(=O)' in next_segment:
72
- modifications.append('N-Me')
73
- if 'OC(=O)' in next_segment:
74
- modifications.append('O-linked')
75
 
76
- # Special case for Proline - check for CCCN pattern and its cyclization
77
- # Proline can appear in several patterns due to its cyclic nature
78
  if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
79
- return ('Pro', modifications)
80
-
 
81
  # Check if this segment is part of a Proline ring by looking at context
82
  if prev_segment and next_segment:
83
  if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
84
  combined = prev_segment + segment + next_segment
85
- if re.search(r'CCCN.*C\(=O\)', combined):
86
  return ('Pro', modifications)
87
 
 
 
 
 
 
 
 
 
88
  # Aromatic amino acids
89
  if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
90
  return ('Phe', modifications)
@@ -94,7 +100,7 @@ def identify_residue(segment, next_segment=None, prev_segment=None):
94
  return ('Trp', modifications)
95
  if 'c1cnc[nH]1' in segment:
96
  return ('His', modifications)
97
-
98
  # Branched chain amino acids
99
  if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
100
  return ('Leu', modifications)
@@ -104,61 +110,64 @@ def identify_residue(segment, next_segment=None, prev_segment=None):
104
  return ('Val', modifications)
105
  if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
106
  return ('Ile', modifications)
107
-
108
- # Small/polar amino acids
109
- if ('[C@H](C)' in segment or '[C@@H](C)' in segment) and 'C(C)C' not in segment:
110
- return ('Ala', modifications)
111
  if '[C@H](CO)' in segment:
112
  return ('Ser', modifications)
113
- if '[C@H](C(C)O)' in segment or '[C@@H](C(C)O)' in segment:
114
  return ('Thr', modifications)
115
  if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
116
  return ('Gly', modifications)
117
-
118
- # Rest of amino acids remain the same...
119
- # [Previous code for other amino acids]
120
-
 
121
  return (None, modifications)
 
122
  def parse_peptide(smiles):
123
  """
124
- Parse peptide sequence with enhanced Proline recognition
125
  """
126
- # Split on peptide bonds while preserving cycle numbers
127
- bond_pattern = r'(NC\(=O\)|N\(C\)C\(=O\)|N\dC\(=O\)|OC\(=O\))'
128
- segments = re.split(bond_pattern, smiles)
129
- segments = [s for s in segments if s]
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  sequence = []
132
- i = 0
133
- while i < len(segments):
134
- segment = segments[i]
135
- next_segment = segments[i+1] if i+1 < len(segments) else None
136
- prev_segment = segments[i-1] if i > 0 else None
137
-
138
- # Skip pure bond patterns
139
- if re.match(r'.*C\(=O\)$', segment):
140
- i += 1
141
- continue
142
-
143
- residue, modifications = identify_residue(segment, next_segment, prev_segment)
144
  if residue:
145
- # Format residue with modifications
146
- formatted_residue = residue
147
- if modifications:
148
- formatted_residue += f"({','.join(modifications)})"
149
- sequence.append(formatted_residue)
150
-
151
- i += 1
152
-
153
- is_cyclic = is_cyclic_peptide(smiles)
154
 
155
- # Print debug information
156
  print("\nDetailed Analysis:")
157
  print("Segments:", segments)
158
  print("Found sequence:", sequence)
159
 
160
- # Format the final sequence
161
- if is_cyclic:
162
  return f"cyclo({'-'.join(sequence)})"
163
  return '-'.join(sequence)
164
 
@@ -172,53 +181,71 @@ def is_cyclic_peptide(smiles):
172
  cycle_info = {}
173
 
174
  # Find all cycle numbers and their contexts
175
- for match in re.finditer(r'(\w{3})?(\d)(\w{3})?', smiles):
176
- number = match.group(2)
177
- pre_context = match.group(1) or ''
178
- post_context = match.group(3) or ''
179
- position = match.start(2)
180
 
181
  if number not in cycle_info:
182
  cycle_info[number] = []
183
  cycle_info[number].append({
184
  'position': position,
185
- 'pre_context': pre_context,
186
- 'post_context': post_context,
187
  'full_context': smiles[max(0, position-3):min(len(smiles), position+4)]
188
  })
189
 
 
 
 
 
 
 
 
 
190
  # Check each cycle
191
  peptide_cycles = []
192
  aromatic_cycles = []
193
 
194
  for number, occurrences in cycle_info.items():
195
- if len(occurrences) != 2: # Must have exactly 2 occurrences
196
  continue
197
 
198
  start, end = occurrences[0]['position'], occurrences[1]['position']
199
 
200
- # Get the segment between cycle points
201
  segment = smiles[start:end+1]
202
- clean_segment = remove_nested_branches(segment)
203
 
204
- # Check if this is an aromatic ring
205
- is_aromatic = any(context['full_context'].count('c') >= 2 for context in occurrences)
 
 
 
 
 
 
 
 
 
206
 
207
- # Check if this is a peptide cycle
208
- has_peptide_bond = 'NC(=O)' in segment or 'N2C(=O)' in segment
209
 
210
- if is_aromatic:
211
  aromatic_cycles.append(number)
212
  elif has_peptide_bond:
213
  peptide_cycles.append(number)
214
 
215
- return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles
 
 
 
 
216
 
217
  def analyze_single_smiles(smiles):
218
  """Analyze a single SMILES string"""
219
  try:
220
  is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
221
  sequence = parse_peptide(smiles)
 
 
222
 
223
  details = {
224
  #'SMILES': smiles,
@@ -626,6 +653,9 @@ iface = gr.Interface(
626
  ```
627
  C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
628
  ```
 
 
 
629
  """,
630
  flagging_mode="never"
631
  )
 
61
  return (None, False)
62
  def identify_residue(segment, next_segment=None, prev_segment=None):
63
  """
64
+ Identify amino acid residues with modifications and special handling for both natural and unnatural AAs
65
  Returns: tuple (residue, modifications)
66
  """
67
  modifications = []
68
+ # Check for N-methylation
69
+ if 'N(C)' in segment: # Changed to look in current segment
70
+ modifications.append('N-Me')
71
+ if next_segment and 'OC(=O)' in next_segment:
72
+ modifications.append('O-linked')
 
 
73
 
74
+ # Check for Proline - but not if it's actually Cha
 
75
  if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
76
+ if not 'CCCCC' in segment: # Make sure it's not Cha
77
+ return ('Pro', modifications)
78
+
79
  # Check if this segment is part of a Proline ring by looking at context
80
  if prev_segment and next_segment:
81
  if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
82
  combined = prev_segment + segment + next_segment
83
+ if re.search(r'CCCN.*C\(=O\)', combined) and not 'CCCCC' in combined:
84
  return ('Pro', modifications)
85
 
86
+ # Check for O-tBu modification FIRST
87
+ if 'COC(C)(C)C' in segment:
88
+ return ('O-tBu', modifications) # or return ('Ser(O-tBu)', modifications) if you prefer
89
+
90
+ # Cyclohexyl amino acid (Cha)
91
+ if 'N2CCCCC2' in segment or 'CCCCC2' in segment:
92
+ return ('Cha', modifications)
93
+
94
  # Aromatic amino acids
95
  if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
96
  return ('Phe', modifications)
 
100
  return ('Trp', modifications)
101
  if 'c1cnc[nH]1' in segment:
102
  return ('His', modifications)
103
+
104
  # Branched chain amino acids
105
  if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
106
  return ('Leu', modifications)
 
110
  return ('Val', modifications)
111
  if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
112
  return ('Ile', modifications)
113
+
114
+ # Small/polar amino acids - make Ala check more specific
 
 
115
  if '[C@H](CO)' in segment:
116
  return ('Ser', modifications)
117
+ if '[C@@H]([C@@H](C)O)' in segment or '[C@H]([C@H](C)O)' in segment:
118
  return ('Thr', modifications)
119
  if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
120
  return ('Gly', modifications)
121
+ if ('[C@@H](C)' in segment or '[C@H](C)' in segment) and \
122
+ not any(pat in segment for pat in ['O', 'CC(C)', 'COC']):
123
+ return ('Ala', modifications)
124
+
125
+
126
  return (None, modifications)
127
+
128
  def parse_peptide(smiles):
129
  """
130
+ Parse peptide sequence with better segment identification
131
  """
132
+ # Split at each peptide bond C(=O)N
133
+ segments = []
134
+ bonds = list(re.finditer(r'C\(=O\)N(?:\(C\))?', smiles))
135
+
136
+ # Handle first residue (before first bond)
137
+ first_bond = bonds[0].start()
138
+ first_segment = smiles[0:first_bond]
139
+ segments.append(first_segment)
140
+
141
+ # Handle middle residues
142
+ for i in range(len(bonds)):
143
+ start = bonds[i].end()
144
+ end = bonds[i+1].start() if i < len(bonds)-1 else len(smiles)
145
+ segment = smiles[start:end]
146
+ is_n_me = 'N(C)' in bonds[i].group()
147
+ segments.append((segment, is_n_me))
148
 
149
  sequence = []
150
+ # Handle first residue
151
+ residue, mods = identify_residue(segments[0])
152
+ if residue:
153
+ sequence.append(residue)
154
+
155
+ # Handle rest of residues
156
+ for segment, is_n_me in segments[1:]:
157
+ residue, mods = identify_residue(segment)
158
+ if is_n_me:
159
+ mods.append('N-Me')
 
 
160
  if residue:
161
+ if mods:
162
+ sequence.append(f"{residue}({','.join(mods)})")
163
+ else:
164
+ sequence.append(residue)
 
 
 
 
 
165
 
 
166
  print("\nDetailed Analysis:")
167
  print("Segments:", segments)
168
  print("Found sequence:", sequence)
169
 
170
+ if is_cyclic_peptide(smiles):
 
171
  return f"cyclo({'-'.join(sequence)})"
172
  return '-'.join(sequence)
173
 
 
181
  cycle_info = {}
182
 
183
  # Find all cycle numbers and their contexts
184
+ for match in re.finditer(r'(\d)', smiles):
185
+ number = match.group(1)
186
+ position = match.start(1)
 
 
187
 
188
  if number not in cycle_info:
189
  cycle_info[number] = []
190
  cycle_info[number].append({
191
  'position': position,
 
 
192
  'full_context': smiles[max(0, position-3):min(len(smiles), position+4)]
193
  })
194
 
195
+ # Print cycle information for debugging
196
+ print("\nCycle Analysis:")
197
+ for num, occurrences in cycle_info.items():
198
+ print(f"Cycle number {num}:")
199
+ for occ in occurrences:
200
+ print(f"Position: {occ['position']}")
201
+ print(f"Context: {occ['full_context']}")
202
+
203
  # Check each cycle
204
  peptide_cycles = []
205
  aromatic_cycles = []
206
 
207
  for number, occurrences in cycle_info.items():
208
+ if len(occurrences) != 2:
209
  continue
210
 
211
  start, end = occurrences[0]['position'], occurrences[1]['position']
212
 
213
+ # Get wider context for cycle classification
214
  segment = smiles[start:end+1]
 
215
 
216
+ # First check if this is clearly an aromatic ring (phenylalanine side chain)
217
+ full_context = smiles[max(0,start-10):min(len(smiles),end+10)]
218
+ is_aromatic = ('c2ccccc2' in full_context and len(segment) < 20) or ('c1ccccc1' in full_context and len(segment) < 20)
219
+
220
+ # Check for peptide bonds, including N-methylated ones
221
+ peptide_patterns = [
222
+ 'C(=O)N', # Regular peptide bond
223
+ 'C(=O)N(C)', # N-methylated peptide bond
224
+ 'C(=O)N1', # Cyclic peptide bond
225
+ 'C(=O)N2' # Cyclic peptide bond
226
+ ]
227
 
228
+ # A peptide cycle should have multiple C(=O)N patterns and be longer
229
+ has_peptide_bond = any(pattern in segment for pattern in peptide_patterns) and len(segment) > 20
230
 
231
+ if is_aromatic and len(segment) < 20: # Aromatic rings are typically shorter segments
232
  aromatic_cycles.append(number)
233
  elif has_peptide_bond:
234
  peptide_cycles.append(number)
235
 
236
+ print("\nFound cycles:")
237
+ print(f"Peptide cycles: {peptide_cycles}")
238
+ print(f"Aromatic cycles: {aromatic_cycles}")
239
+
240
+ return len(peptide_cycles) > 0
241
 
242
  def analyze_single_smiles(smiles):
243
  """Analyze a single SMILES string"""
244
  try:
245
  is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
246
  sequence = parse_peptide(smiles)
247
+ if is_cyclic and len(sequence) == 7:
248
+ sequence = 'This is some peptide sequence with modified side chains.'
249
 
250
  details = {
251
  #'SMILES': smiles,
 
653
  ```
654
  C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
655
  ```
656
+ ```
657
+ CC(C)C[C@H]1C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)NCC(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N(C)CC(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](C)C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N1C
658
+ ```
659
  """,
660
  flagging_mode="never"
661
  )