yinuozhang committed on
Commit
a953180
1 Parent(s): 418afab

class format

Browse files
Files changed (1) hide show
  1. app.py +334 -253
app.py CHANGED
@@ -11,257 +11,294 @@ import matplotlib.pyplot as plt
11
  import matplotlib.patches as patches
12
  from io import BytesIO
13
 
14
def is_peptide(smiles):
    """Return True when the SMILES contains a peptide-like backbone linkage.

    The SMILES is parsed with RDKit; an unparsable string yields False.
    The molecule is accepted as soon as one of the linkage SMARTS below
    matches.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False

    linkage_smarts = (
        '[NH][C](=O)',              # peptide bond: NC(=O)
        '[N;H0;$(NC)](C)[C](=O)',   # N-methylated peptide bond: N(C)C(=O)
        'O[C](=O)',                 # ester bond (cyclic depsipeptides): OC(=O)
    )
    for smarts in linkage_smarts:
        if mol.HasSubstructMatch(Chem.MolFromSmarts(smarts)):
            return True
    return False
36
-
37
def remove_nested_branches(smiles):
    """Return the SMILES with every parenthesised branch stripped out.

    Only characters seen while the parenthesis depth is exactly zero are
    kept; the parentheses themselves are always dropped.
    """
    kept = []
    level = 0
    for symbol in smiles:
        if symbol == '(':
            level += 1
            continue
        if symbol == ')':
            level -= 1
            continue
        if level == 0:
            kept.append(symbol)
    return ''.join(kept)
49
 
50
def identify_linkage_type(segment):
    """Classify the linkage found in a SMILES fragment.

    Returns a tuple ``(type, is_n_methylated)`` where ``type`` is
    ``'ester'``, ``'peptide'`` or ``None``. Rules are tried in order and
    the first substring hit wins.
    """
    ordered_rules = (
        ('OC(=O)', ('ester', False)),
        ('N(C)C(=O)', ('peptide', True)),    # N-methylated peptide bond
        ('NC(=O)', ('peptide', False)),      # regular peptide bond
    )
    for marker, verdict in ordered_rules:
        if marker in segment:
            return verdict
    return (None, False)
62
def identify_residue(segment, next_segment=None, prev_segment=None):
    """Guess the amino-acid residue encoded by a SMILES fragment.

    Matching is purely textual (substring / regex on the fragment) and the
    checks run in a fixed order; the first rule that fires decides.

    Returns a tuple ``(residue, modifications)``; ``residue`` is ``None``
    when no rule matches. ``modifications`` may contain ``'N-Me'`` (the
    fragment contains ``N(C)``) and ``'O-linked'`` (the following fragment
    contains ``OC(=O)``).
    """
    def contains_any(text, needles):
        return any(needle in text for needle in needles)

    modifications = []
    if 'N(C)' in segment:
        modifications.append('N-Me')
    if next_segment and 'OC(=O)' in next_segment:
        modifications.append('O-linked')

    # Proline ring closure patterns -- unless a five-carbon run is present.
    pro_markers = ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']
    if contains_any(segment, pro_markers) and 'CCCCC' not in segment:
        return ('Pro', modifications)

    # The proline ring may be split across neighbouring fragments.
    if prev_segment and next_segment:
        ring_ahead = 'CCC' in segment and 'N' in next_segment
        ring_behind = 'N' in segment and 'CCC' in prev_segment
        if ring_ahead or ring_behind:
            combined = prev_segment + segment + next_segment
            if re.search(r'CCCN.*C\(=O\)', combined) and 'CCCCC' not in combined:
                return ('Pro', modifications)

    # Rules that are a plain "any of these substrings" test, in priority order.
    substring_rules = (
        ('O-tBu', ('COC(C)(C)C',)),
        ('Cha', ('N2CCCCC2', 'CCCCC2')),
        ('Phe', ('Cc2ccccc2', 'c1ccccc1')),
        ('Tyr', ('c2ccc(O)cc2',)),
        ('Trp', ('c1c[nH]c2ccccc12',)),
        ('His', ('c1cnc[nH]1',)),
        ('Leu', ('CC(C)C[C@H]', 'CC(C)C[C@@H]',
                 '[C@H](CC(C)C)', '[C@@H](CC(C)C)')),
    )
    for name, needles in substring_rules:
        if contains_any(segment, needles):
            return (name, modifications)

    # Remaining branched-chain residues.
    if 'C(C)C' in segment and not contains_any(
            segment, ['CC(C)C', 'C(C)C[C@H]', 'C(C)C[C@@H]']):
        return ('Val', modifications)
    if contains_any(segment, ['C(C)C[C@H]', 'C(C)C[C@@H]']):
        return ('Ile', modifications)

    # Small / polar residues.
    if '[C@H](CO)' in segment:
        return ('Ser', modifications)
    if contains_any(segment, ['[C@@H]([C@@H](C)O)', '[C@H]([C@H](C)O)']):
        return ('Thr', modifications)
    if '[C@H]' in segment and not contains_any(
            segment, ['C(C)', 'CC', 'O', 'N', 'S']):
        return ('Gly', modifications)
    if contains_any(segment, ['[C@@H](C)', '[C@H](C)']) and not contains_any(
            segment, ['O', 'CC(C)', 'COC']):
        return ('Ala', modifications)

    return (None, modifications)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
def _format_residue(residue, mods):
    """Render a residue name with its modifications, e.g. ``Leu(N-Me)``."""
    return f"{residue}({','.join(mods)})" if mods else residue


def parse_peptide(smiles):
    """Parse a peptide SMILES into a dash-separated residue sequence.

    The string is split at every ``C(=O)N`` / ``C(=O)N(C)`` occurrence and
    each fragment is passed to ``identify_residue``. Fragments that cannot
    be identified are skipped. When ``is_cyclic_peptide`` reports a cycle
    the result is wrapped as ``cyclo(...)``.

    A SMILES without any ``C(=O)N`` is treated as a single fragment
    instead of raising ``IndexError``.
    """
    bonds = list(re.finditer(r'C\(=O\)N(?:\(C\))?', smiles))

    # segments[0] is the fragment before the first bond (or the whole string
    # when there is no bond); the rest are (fragment, is_n_methylated) pairs.
    if bonds:
        segments = [smiles[:bonds[0].start()]]
        for i, bond in enumerate(bonds):
            end = bonds[i + 1].start() if i + 1 < len(bonds) else len(smiles)
            segments.append((smiles[bond.end():end], 'N(C)' in bond.group()))
    else:
        segments = [smiles]

    sequence = []

    # First residue: keep its modifications like every other residue.
    residue, mods = identify_residue(segments[0])
    if residue:
        sequence.append(_format_residue(residue, mods))

    for segment, is_n_me in segments[1:]:
        residue, mods = identify_residue(segment)
        # identify_residue may already have reported N-Me; do not repeat it.
        if is_n_me and 'N-Me' not in mods:
            mods.append('N-Me')
        if residue:
            sequence.append(_format_residue(residue, mods))

    print("\nDetailed Analysis:")
    print("Segments:", segments)
    print("Found sequence:", sequence)

    if is_cyclic_peptide(smiles):
        return f"cyclo({'-'.join(sequence)})"
    return '-'.join(sequence)
173
 
174
def is_cyclic_peptide(smiles):
    """Report whether the SMILES text looks like a cyclic peptide.

    Every digit in the string is treated as a ring-closure label. A label
    that occurs exactly twice delimits a ring span, which is classified as

    * aromatic -- span shorter than 20 characters with a benzene pattern
      (``c1ccccc1`` / ``c2ccccc2``) in the surrounding text, or
    * peptide  -- span longer than 20 characters containing ``C(=O)N``.

    Debug information is printed along the way. Returns True when at least
    one peptide ring was found.
    """
    hits_by_label = {}
    for hit in re.finditer(r'(\d)', smiles):
        at = hit.start(1)
        hits_by_label.setdefault(hit.group(1), []).append({
            'position': at,
            'full_context': smiles[max(0, at - 3):min(len(smiles), at + 4)],
        })

    print("\nCycle Analysis:")
    for label, hits in hits_by_label.items():
        print(f"Cycle number {label}:")
        for entry in hits:
            print(f"Position: {entry['position']}")
            print(f"Context: {entry['full_context']}")

    bond_markers = (
        'C(=O)N',      # regular peptide bond
        'C(=O)N(C)',   # N-methylated peptide bond
        'C(=O)N1',     # cyclic peptide bond
        'C(=O)N2',     # cyclic peptide bond
    )
    peptide_cycles = []
    aromatic_cycles = []

    for label, hits in hits_by_label.items():
        if len(hits) != 2:
            continue

        opening, closing = hits[0]['position'], hits[1]['position']
        ring_text = smiles[opening:closing + 1]
        nearby = smiles[max(0, opening - 10):min(len(smiles), closing + 10)]

        is_short = len(ring_text) < 20
        looks_aromatic = is_short and (
            'c2ccccc2' in nearby or 'c1ccccc1' in nearby)
        spans_backbone = len(ring_text) > 20 and any(
            marker in ring_text for marker in bond_markers)

        if looks_aromatic:
            aromatic_cycles.append(label)
        elif spans_backbone:
            peptide_cycles.append(label)

    print("\nFound cycles:")
    print(f"Peptide cycles: {peptide_cycles}")
    print(f"Aromatic cycles: {aromatic_cycles}")

    return len(peptide_cycles) > 0
241
 
242
def analyze_single_smiles(smiles):
    """Analyze one SMILES string and summarise it as a small dict.

    Returns ``{'Sequence': ..., 'Is Cyclic': 'Yes' | 'No'}``. Any failure
    while analysing is reported in the same shape, with an error message
    in ``'Sequence'`` and ``'Is Cyclic': 'Error'``, so that batch callers
    can keep going.
    """
    try:
        # is_cyclic_peptide may return a plain bool or a
        # (is_cyclic, peptide_cycles, aromatic_cycles) tuple; unpacking a
        # bool raised TypeError and sent every call to the error branch.
        cyclic_result = is_cyclic_peptide(smiles)
        if isinstance(cyclic_result, tuple):
            is_cyclic = bool(cyclic_result[0])
        else:
            is_cyclic = bool(cyclic_result)

        sequence = parse_peptide(smiles)

        return {
            'Sequence': sequence,
            'Is Cyclic': 'Yes' if is_cyclic else 'No',
        }

    except Exception as e:  # deliberate catch-all: report per-item, do not abort the batch
        return {
            'Sequence': f'Error: {str(e)}',
            'Is Cyclic': 'Error',
        }
265
  """
266
  def annotate_cyclic_structure(mol, sequence):
267
  '''Create annotated 2D structure with clear, non-overlapping residue labels'''
@@ -529,16 +566,15 @@ def create_enhanced_linear_viz(sequence, smiles):
529
  return fig
530
 
531
  def process_input(smiles_input=None, file_obj=None, show_linear=False):
532
- """Process input and create visualizations"""
533
- results = []
534
- images = []
535
 
536
  # Handle direct SMILES input
537
  if smiles_input:
538
  smiles = smiles_input.strip()
539
 
540
- # First check if it's a peptide
541
- if not is_peptide(smiles):
542
  return "Error: Input SMILES does not appear to be a peptide structure.", None, None
543
 
544
  try:
@@ -547,9 +583,32 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
547
  if mol is None:
548
  return "Error: Invalid SMILES notation.", None, None
549
 
550
- # Get sequence and cyclic information
551
- sequence = parse_peptide(smiles)
552
- is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
  # Create cyclic structure visualization
555
  img_cyclic = annotate_cyclic_structure(mol, sequence)
@@ -558,19 +617,21 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
558
  img_linear = None
559
  if show_linear:
560
  fig_linear = create_enhanced_linear_viz(sequence, smiles)
561
-
562
- # Convert matplotlib figure to image
563
  buf = BytesIO()
564
  fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
565
  buf.seek(0)
566
  img_linear = Image.open(buf)
567
  plt.close(fig_linear)
568
 
569
- # Format text output
570
- output_text = f"Sequence: {sequence}\n"
571
- output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
 
 
 
 
572
 
573
- return output_text, img_cyclic, img_linear
574
 
575
  except Exception as e:
576
  return f"Error processing SMILES: {str(e)}", None, None
@@ -578,31 +639,51 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
578
  # Handle file input
579
  if file_obj is not None:
580
  try:
581
- # Handle file content based on file object type
582
- if hasattr(file_obj, 'name'): # If it's a file path
583
  with open(file_obj.name, 'r') as f:
584
  content = f.read()
585
- else: # If it's file content
586
  content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj)
587
 
588
  output_text = ""
589
  for line in content.splitlines():
590
  smiles = line.strip()
591
  if smiles:
592
- if not is_peptide(smiles):
 
593
  output_text += f"Skipping non-peptide SMILES: {smiles}\n"
594
  continue
595
- result = analyze_single_smiles(smiles)
596
- output_text += f"Sequence: {result['Sequence']}\n"
597
- output_text += f"Is Cyclic: {result['Is Cyclic']}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
  output_text += "-" * 50 + "\n"
 
599
  return output_text, None, None
600
 
601
  except Exception as e:
602
  return f"Error processing file: {str(e)}", None, None
603
 
604
  return "No input provided.", None, None
605
-
606
  # Create Gradio interface with simplified examples
607
  iface = gr.Interface(
608
  fn=process_input,
 
11
  import matplotlib.patches as patches
12
  from io import BytesIO
13
 
14
+ import re
15
+ from rdkit import Chem
16
+
17
class PeptideAnalyzer:
    """Heuristic, text-based analyzer for peptide SMILES strings.

    Apart from ``is_peptide`` (which parses the molecule with RDKit), every
    method works on the SMILES *text*: backbone bonds are located with
    regular expressions and residues are recognised by substring patterns.
    """

    def __init__(self):
        # Backbone linkage regexes, kept as a public attribute.
        # split_on_bonds uses its own table that also carries a bond type.
        self.bond_patterns = [
            r'OC\(=O\)',        # ester bond
            r'N\(C\)C\(=O\)',   # N-methylated peptide bond
            r'N[12]?C\(=O\)',   # peptide bond (including Pro N1/N2)
            r'C\(=O\)N\(C\)',   # N-methylated peptide bond reverse
            r'C\(=O\)N',        # peptide bond reverse
        ]

    def is_peptide(self, smiles):
        """Check if the SMILES represents a peptide structure.

        Returns False for an unparsable SMILES, otherwise True as soon as
        one of the linkage SMARTS below matches.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False

        linkage_smarts = (
            '[NH][C](=O)',              # peptide bond: NC(=O)
            '[N;H0;$(NC)](C)[C](=O)',   # N-methylated peptide bond: N(C)C(=O)
            'O[C](=O)',                 # ester bond in cyclic depsipeptides
        )
        return any(
            mol.HasSubstructMatch(Chem.MolFromSmarts(smarts))
            for smarts in linkage_smarts
        )

    def is_cyclic(self, smiles):
        """Determine if the SMILES text represents a cyclic peptide.

        Ring-closure digits are paired open/close in reading order, so a
        digit that is reused for several rings yields one span per ring.
        Digits inside bracket atoms (e.g. ``[13C]``) are not ring closures
        and are ignored. Two-digit ``%nn`` closures are not handled.

        A span shorter than 20 characters with a benzene pattern nearby is
        recorded as aromatic; a span longer than 20 characters containing
        ``C(=O)N`` is recorded as a peptide cycle.

        Returns: (is_cyclic, peptide_cycles, aromatic_cycles) where the two
        lists hold the ring-closure digit of each ring found (a reused
        digit therefore appears once per ring).
        """
        ring_spans = []      # (open_index, close_index, digit)
        open_rings = {}      # digit -> index where that ring was opened
        in_bracket = False
        for index, char in enumerate(smiles):
            if char == '[':
                in_bracket = True
            elif char == ']':
                in_bracket = False
            elif not in_bracket and char in '0123456789':
                if char in open_rings:
                    ring_spans.append((open_rings.pop(char), index, char))
                else:
                    open_rings[char] = index

        # Report rings in the order they were opened.
        ring_spans.sort()

        peptide_cycles = []
        aromatic_cycles = []
        for start, end, number in ring_spans:
            segment = smiles[start:end + 1]
            full_context = smiles[max(0, start - 10):min(len(smiles), end + 10)]

            is_aromatic = len(segment) < 20 and (
                'c2ccccc2' in full_context or 'c1ccccc1' in full_context)
            # 'C(=O)N' is a prefix of the N-methylated and N1/N2 variants,
            # so one substring test covers all of them.
            has_peptide_bond = len(segment) > 20 and 'C(=O)N' in segment

            if is_aromatic:
                aromatic_cycles.append(number)
            elif has_peptide_bond:
                peptide_cycles.append(number)

        return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles

    def split_on_bonds(self, smiles):
        """Split a SMILES string into residue segments at backbone bonds.

        Glycine (``NCC(=O)``) is matched first and emitted as its own
        segment; the remaining bond patterns are then matched in priority
        order, skipping any match that overlaps text already claimed.

        Returns a list of dicts with ``'content'`` and, where applicable,
        ``'bond_before'`` / ``'bond_after'`` (the matched bond text, or
        ``None`` for a glycine segment with no neighbouring bond). An empty
        list is returned when no bond is found.
        """
        positions = []
        used = set()

        def claim(match, bond_type):
            # Accept the match only if none of its characters is taken.
            span = range(match.start(), match.end())
            if used.isdisjoint(span):
                positions.append({
                    'start': match.start(),
                    'end': match.end(),
                    'type': bond_type,
                    'pattern': match.group(),
                })
                used.update(span)

        # Find Gly pattern first
        for match in re.finditer(r'NCC\(=O\)', smiles):
            claim(match, 'gly')

        # Then find all bonds, including N2C(=O)
        bond_patterns = [
            (r'OC\(=O\)', 'ester'),
            (r'N\(C\)C\(=O\)', 'n_methyl'),
            (r'N[12]C\(=O\)', 'peptide'),   # Pro peptide bonds
            (r'NC\(=O\)', 'peptide'),       # Regular peptide bonds
            (r'C\(=O\)N\(C\)', 'n_methyl'),
            (r'C\(=O\)N[12]?', 'peptide'),
        ]
        for pattern, bond_type in bond_patterns:
            for match in re.finditer(pattern, smiles):
                claim(match, bond_type)

        positions.sort(key=lambda pos: pos['start'])

        segments = []
        if not positions:
            return segments

        # Text before the first bond
        first = positions[0]
        if first['start'] > 0:
            segments.append({
                'content': smiles[:first['start']],
                'bond_after': first['pattern'],
            })

        for i, current in enumerate(positions):
            following = positions[i + 1] if i + 1 < len(positions) else None

            # A glycine match *is* a residue: emit it for every position,
            # including the last one.
            if current['type'] == 'gly':
                segments.append({
                    'content': 'NCC(=O)',
                    'bond_before': positions[i - 1]['pattern'] if i > 0 else None,
                    'bond_after': following['pattern'] if following else None,
                })

            # Text between this position and the next one (also after Gly,
            # otherwise the residue following a glycine is lost).
            if following is not None:
                content = smiles[current['end']:following['start']]
                if content:
                    segments.append({
                        'content': content,
                        'bond_before': current['pattern'],
                        'bond_after': following['pattern'],
                    })

        # Text after the last bond
        last = positions[-1]
        if last['end'] < len(smiles):
            segments.append({
                'content': smiles[last['end']:],
                'bond_before': last['pattern'],
            })

        return segments

    def identify_residue(self, segment):
        """Identify the residue encoded by one segment from split_on_bonds.

        Checks are substring tests on ``segment['content']`` and run in a
        fixed order; the first one that fires wins.

        Returns: (residue, mods) -- ``residue`` is None when nothing
        matches, ``mods`` comes from ``get_modifications``.
        """
        content = segment['content']
        mods = self.get_modifications(segment)

        def has(*needles):
            return any(needle in content for needle in needles)

        # Pro: the ring is split by the bond, so look at both halves.
        pro_context = (
            (segment.get('bond_after') == 'N2C(=O)' and 'CCC' in content)
            or ('CCCN2' in content and content.endswith('=O'))  # end case
        )
        if pro_context and has('[C@@H]2', '[C@H]2'):
            return 'Pro', mods

        # Norleucine (Nle)
        if has('C[C@H](CCCC)', 'C[C@@H](CCCC)') and 'CC(C)' not in content:
            return 'Nle', mods

        # Ornithine (Orn) - 3-carbon chain with NH2
        if has('C[C@H](CCCN)', 'C[C@@H](CCCN)') and 'CC(C)' not in content:
            return 'Orn', mods

        # 2-Naphthylalanine (2Nal) - distinct from Phe pattern
        if 'Cc3cc2ccccc2c3' in content and has('C[C@H]', 'C[C@@H]'):
            return '2Nal', mods

        # Cyclohexylalanine (Cha)
        if has('N2CCCCC2', 'CCCCC2'):
            return 'Cha', mods

        # Aminobutyric acid (Abu) - 2-carbon chain
        if has('C[C@H](CC)', 'C[C@@H](CC)') and not has('CC(C)', 'CCCC', 'CCC(C)'):
            return 'Abu', mods

        # Pipecolic acid (Pip) - 6-membered ring like Pro
        if has('N3CCCCC3', 'CCCCC3') and has('C[C@H]', 'C[C@@H]'):
            return 'Pip', mods

        # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
        if has('C[C@H](C1CCCCC1)', 'C[C@@H](C1CCCCC1)'):
            return 'Chg', mods

        # 4-Fluorophenylalanine (4F-Phe)
        if 'Cc2ccc(F)cc2' in content and has('C[C@H]', 'C[C@@H]'):
            return '4F-Phe', mods

        # Regular residue identification
        if 'NCC(=O)' in content:
            return 'Gly', mods

        if has('CC(C)C[C@H]', 'CC(C)C[C@@H]', '[C@@H](CC(C)C)', '[C@H](CC(C)C)'):
            return 'Leu', mods

        if has('C(C)C[C@H]', 'C(C)C[C@@H]') and 'CC(C)C' not in content:
            return 'Ile', mods

        if has('[C@@H]([C@@H](C)O)', '[C@H]([C@H](C)O)'):
            return 'Thr', mods

        if has('[C@H](Cc2ccccc2)', '[C@@H](Cc2ccccc2)'):
            return 'Phe', mods

        if has('[C@H](C(C)C)', '[C@@H](C(C)C)') and not has('CC(C)C[C@H]', 'CC(C)C[C@@H]'):
            return 'Val', mods

        if has('[C@H](COC(C)(C)C)', '[C@@H](COC(C)(C)C)'):
            return 'O-tBu', mods

        if has('[C@H](C)', '[C@@H](C)') and not has('C(C)C', 'COC', 'CN(', 'C(C)O'):
            return 'Ala', mods

        return None, mods

    def get_modifications(self, segment):
        """Get modifications implied by the bond that follows the segment.

        Returns a list that may contain 'N-Me' (bond text contains
        ``N(C)``) and 'O-linked' (bond text contains ``OC(=O)``).
        """
        mods = []
        bond_after = segment.get('bond_after')
        if bond_after:
            if 'N(C)' in bond_after:
                mods.append('N-Me')
            if 'OC(=O)' in bond_after:
                mods.append('O-linked')
        return mods

    def analyze_structure(self, smiles):
        """Main analysis function.

        Splits the SMILES, identifies each segment (printing a trace) and
        returns the dash-joined sequence, wrapped as ``cyclo(...)`` when
        ``is_cyclic`` reports a peptide cycle.
        """
        print("\nAnalyzing structure:", smiles)

        segments = self.split_on_bonds(smiles)

        print("\nSegment Analysis:")
        sequence = []
        for i, segment in enumerate(segments):
            print(f"\nSegment {i}:")
            print(f"Content: {segment['content']}")
            print(f"Bond before: {segment.get('bond_before', 'None')}")
            print(f"Bond after: {segment.get('bond_after', 'None')}")

            residue, mods = self.identify_residue(segment)
            if residue:
                if mods:
                    sequence.append(f"{residue}({','.join(mods)})")
                else:
                    sequence.append(residue)
                print(f"Identified as: {residue}")
                print(f"Modifications: {mods}")
            else:
                print(f"Warning: Could not identify residue in segment: {segment['content']}")

        # Use the same ring analysis as is_cyclic; a bare 'N1'/'N2' test
        # also fires on the proline ring of a linear peptide.
        is_cyclic = self.is_cyclic(smiles)[0]
        final_sequence = f"cyclo({'-'.join(sequence)})" if is_cyclic else '-'.join(sequence)

        print(f"\nFinal sequence: {final_sequence}")
        return final_sequence
301
+
 
 
 
 
 
302
  """
303
  def annotate_cyclic_structure(mol, sequence):
304
  '''Create annotated 2D structure with clear, non-overlapping residue labels'''
 
566
  return fig
567
 
568
  def process_input(smiles_input=None, file_obj=None, show_linear=False):
569
+ """Process input and create visualizations using PeptideAnalyzer"""
570
+ analyzer = PeptideAnalyzer()
 
571
 
572
  # Handle direct SMILES input
573
  if smiles_input:
574
  smiles = smiles_input.strip()
575
 
576
+ # First check if it's a peptide using analyzer's method
577
+ if not analyzer.is_peptide(smiles):
578
  return "Error: Input SMILES does not appear to be a peptide structure.", None, None
579
 
580
  try:
 
583
  if mol is None:
584
  return "Error: Invalid SMILES notation.", None, None
585
 
586
+ # Use analyzer to get sequence
587
+ segments = analyzer.split_on_bonds(smiles)
588
+
589
+ # Process segments and build sequence
590
+ sequence_parts = []
591
+ output_text = "Segment Analysis:\n"
592
+ for i, segment in enumerate(segments):
593
+ output_text += f"\nSegment {i}:\n"
594
+ output_text += f"Content: {segment['content']}\n"
595
+ output_text += f"Bond before: {segment.get('bond_before', 'None')}\n"
596
+ output_text += f"Bond after: {segment.get('bond_after', 'None')}\n"
597
+
598
+ residue, mods = analyzer.identify_residue(segment)
599
+ if residue:
600
+ if mods:
601
+ sequence_parts.append(f"{residue}({','.join(mods)})")
602
+ else:
603
+ sequence_parts.append(residue)
604
+ output_text += f"Identified as: {residue}\n"
605
+ output_text += f"Modifications: {mods}\n"
606
+ else:
607
+ output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
608
+
609
+ # Check if cyclic using analyzer's method
610
+ is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
611
+ sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
612
 
613
  # Create cyclic structure visualization
614
  img_cyclic = annotate_cyclic_structure(mol, sequence)
 
617
  img_linear = None
618
  if show_linear:
619
  fig_linear = create_enhanced_linear_viz(sequence, smiles)
 
 
620
  buf = BytesIO()
621
  fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
622
  buf.seek(0)
623
  img_linear = Image.open(buf)
624
  plt.close(fig_linear)
625
 
626
+ # Add summary to output
627
+ summary = f"\nSummary:\n"
628
+ summary += f"Sequence: {sequence}\n"
629
+ summary += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
630
+ if is_cyclic:
631
+ summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
632
+ summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
633
 
634
+ return summary + "\n" + output_text, img_cyclic, img_linear
635
 
636
  except Exception as e:
637
  return f"Error processing SMILES: {str(e)}", None, None
 
639
  # Handle file input
640
  if file_obj is not None:
641
  try:
642
+ # Handle file content
643
+ if hasattr(file_obj, 'name'):
644
  with open(file_obj.name, 'r') as f:
645
  content = f.read()
646
+ else:
647
  content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj)
648
 
649
  output_text = ""
650
  for line in content.splitlines():
651
  smiles = line.strip()
652
  if smiles:
653
+ # Check if it's a peptide
654
+ if not analyzer.is_peptide(smiles):
655
  output_text += f"Skipping non-peptide SMILES: {smiles}\n"
656
  continue
657
+
658
+ # Process this SMILES
659
+ segments = analyzer.split_on_bonds(smiles)
660
+ sequence_parts = []
661
+ for segment in segments:
662
+ residue, mods = analyzer.identify_residue(segment)
663
+ if residue:
664
+ if mods:
665
+ sequence_parts.append(f"{residue}({','.join(mods)})")
666
+ else:
667
+ sequence_parts.append(residue)
668
+
669
+ # Get cyclicity and create sequence
670
+ is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
671
+ sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
672
+
673
+ output_text += f"SMILES: {smiles}\n"
674
+ output_text += f"Sequence: {sequence}\n"
675
+ output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
676
+ if is_cyclic:
677
+ output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
678
+ output_text += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
679
  output_text += "-" * 50 + "\n"
680
+
681
  return output_text, None, None
682
 
683
  except Exception as e:
684
  return f"Error processing file: {str(e)}", None, None
685
 
686
  return "No input provided.", None, None
 
687
  # Create Gradio interface with simplified examples
688
  iface = gr.Interface(
689
  fn=process_input,