Spaces:
Running
Running
yinuozhang
committed on
Commit
•
a953180
1
Parent(s):
418afab
class format
Browse files
app.py
CHANGED
@@ -11,257 +11,294 @@ import matplotlib.pyplot as plt
|
|
11 |
import matplotlib.patches as patches
|
12 |
from io import BytesIO
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
return True
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
def remove_nested_branches(smiles):
    """Return *smiles* with every parenthesised branch removed.

    Only characters at parenthesis depth zero are kept; the parentheses
    themselves and everything inside them (at any nesting level) are dropped.

    Args:
        smiles: SMILES text to strip.

    Returns:
        The top-level (depth-zero) characters of *smiles*, in order.
    """
    kept = []
    depth = 0
    for char in smiles:
        if char == '(':
            depth += 1
        elif char == ')':
            # Clamp at zero: a stray ')' must not push the depth negative,
            # otherwise every later top-level character would be discarded.
            depth = max(depth - 1, 0)
        elif depth == 0:
            kept.append(char)
    # Join once instead of repeated string concatenation.
    return ''.join(kept)
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
modifications = []
|
68 |
-
# Check for N-methylation
|
69 |
-
if 'N(C)' in segment: # Changed to look in current segment
|
70 |
-
modifications.append('N-Me')
|
71 |
-
if next_segment and 'OC(=O)' in next_segment:
|
72 |
-
modifications.append('O-linked')
|
73 |
-
|
74 |
-
# Check for Proline - but not if it's actually Cha
|
75 |
-
if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
|
76 |
-
if not 'CCCCC' in segment: # Make sure it's not Cha
|
77 |
-
return ('Pro', modifications)
|
78 |
-
|
79 |
-
# Check if this segment is part of a Proline ring by looking at context
|
80 |
-
if prev_segment and next_segment:
|
81 |
-
if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
|
82 |
-
combined = prev_segment + segment + next_segment
|
83 |
-
if re.search(r'CCCN.*C\(=O\)', combined) and not 'CCCCC' in combined:
|
84 |
-
return ('Pro', modifications)
|
85 |
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
return ('Trp', modifications)
|
101 |
-
if 'c1cnc[nH]1' in segment:
|
102 |
-
return ('His', modifications)
|
103 |
-
|
104 |
-
# Branched chain amino acids
|
105 |
-
if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
|
106 |
-
return ('Leu', modifications)
|
107 |
-
if '[C@H](CC(C)C)' in segment or '[C@@H](CC(C)C)' in segment:
|
108 |
-
return ('Leu', modifications)
|
109 |
-
if 'C(C)C' in segment and not any(pat in segment for pat in ['CC(C)C', 'C(C)C[C@H]', 'C(C)C[C@@H]']):
|
110 |
-
return ('Val', modifications)
|
111 |
-
if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
|
112 |
-
return ('Ile', modifications)
|
113 |
-
|
114 |
-
# Small/polar amino acids - make Ala check more specific
|
115 |
-
if '[C@H](CO)' in segment:
|
116 |
-
return ('Ser', modifications)
|
117 |
-
if '[C@@H]([C@@H](C)O)' in segment or '[C@H]([C@H](C)O)' in segment:
|
118 |
-
return ('Thr', modifications)
|
119 |
-
if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
|
120 |
-
return ('Gly', modifications)
|
121 |
-
if ('[C@@H](C)' in segment or '[C@H](C)' in segment) and \
|
122 |
-
not any(pat in segment for pat in ['O', 'CC(C)', 'COC']):
|
123 |
-
return ('Ala', modifications)
|
124 |
|
125 |
-
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
-
def
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
if
|
161 |
-
|
162 |
-
sequence.append(f"{residue}({','.join(mods)})")
|
163 |
-
else:
|
164 |
-
sequence.append(residue)
|
165 |
-
|
166 |
-
print("\nDetailed Analysis:")
|
167 |
-
print("Segments:", segments)
|
168 |
-
print("Found sequence:", sequence)
|
169 |
-
|
170 |
-
if is_cyclic_peptide(smiles):
|
171 |
-
return f"cyclo({'-'.join(sequence)})"
|
172 |
-
return '-'.join(sequence)
|
173 |
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
cycle_info = {}
|
182 |
-
|
183 |
-
# Find all cycle numbers and their contexts
|
184 |
-
for match in re.finditer(r'(\d)', smiles):
|
185 |
-
number = match.group(1)
|
186 |
-
position = match.start(1)
|
187 |
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
print("\nCycle Analysis:")
|
197 |
-
for num, occurrences in cycle_info.items():
|
198 |
-
print(f"Cycle number {num}:")
|
199 |
-
for occ in occurrences:
|
200 |
-
print(f"Position: {occ['position']}")
|
201 |
-
print(f"Context: {occ['full_context']}")
|
202 |
-
|
203 |
-
# Check each cycle
|
204 |
-
peptide_cycles = []
|
205 |
-
aromatic_cycles = []
|
206 |
-
|
207 |
-
for number, occurrences in cycle_info.items():
|
208 |
-
if len(occurrences) != 2:
|
209 |
-
continue
|
210 |
-
|
211 |
-
start, end = occurrences[0]['position'], occurrences[1]['position']
|
212 |
|
213 |
-
|
214 |
-
|
215 |
|
216 |
-
|
217 |
-
|
218 |
-
is_aromatic = ('c2ccccc2' in full_context and len(segment) < 20) or ('c1ccccc1' in full_context and len(segment) < 20)
|
219 |
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
'C
|
225 |
-
|
226 |
-
]
|
227 |
|
228 |
-
|
229 |
-
|
230 |
|
231 |
-
if
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
print("\nFound cycles:")
|
237 |
-
print(f"Peptide cycles: {peptide_cycles}")
|
238 |
-
print(f"Aromatic cycles: {aromatic_cycles}")
|
239 |
-
|
240 |
-
return len(peptide_cycles) > 0
|
241 |
|
242 |
-
def
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
|
|
|
|
|
|
|
|
247 |
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
|
257 |
-
|
258 |
-
return
|
259 |
-
|
260 |
-
'Sequence': f'Error: {str(e)}',
|
261 |
-
'Is Cyclic': 'Error',
|
262 |
-
#'Peptide Cycles': 'Error',
|
263 |
-
#'Aromatic Cycles': 'Error'
|
264 |
-
}
|
265 |
"""
|
266 |
def annotate_cyclic_structure(mol, sequence):
|
267 |
'''Create annotated 2D structure with clear, non-overlapping residue labels'''
|
@@ -529,16 +566,15 @@ def create_enhanced_linear_viz(sequence, smiles):
|
|
529 |
return fig
|
530 |
|
531 |
def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
532 |
-
"""Process input and create visualizations"""
|
533 |
-
|
534 |
-
images = []
|
535 |
|
536 |
# Handle direct SMILES input
|
537 |
if smiles_input:
|
538 |
smiles = smiles_input.strip()
|
539 |
|
540 |
-
# First check if it's a peptide
|
541 |
-
if not is_peptide(smiles):
|
542 |
return "Error: Input SMILES does not appear to be a peptide structure.", None, None
|
543 |
|
544 |
try:
|
@@ -547,9 +583,32 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
|
547 |
if mol is None:
|
548 |
return "Error: Invalid SMILES notation.", None, None
|
549 |
|
550 |
-
#
|
551 |
-
|
552 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
553 |
|
554 |
# Create cyclic structure visualization
|
555 |
img_cyclic = annotate_cyclic_structure(mol, sequence)
|
@@ -558,19 +617,21 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
|
558 |
img_linear = None
|
559 |
if show_linear:
|
560 |
fig_linear = create_enhanced_linear_viz(sequence, smiles)
|
561 |
-
|
562 |
-
# Convert matplotlib figure to image
|
563 |
buf = BytesIO()
|
564 |
fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
|
565 |
buf.seek(0)
|
566 |
img_linear = Image.open(buf)
|
567 |
plt.close(fig_linear)
|
568 |
|
569 |
-
#
|
570 |
-
|
571 |
-
|
|
|
|
|
|
|
|
|
572 |
|
573 |
-
return output_text, img_cyclic, img_linear
|
574 |
|
575 |
except Exception as e:
|
576 |
return f"Error processing SMILES: {str(e)}", None, None
|
@@ -578,31 +639,51 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
|
578 |
# Handle file input
|
579 |
if file_obj is not None:
|
580 |
try:
|
581 |
-
# Handle file content
|
582 |
-
if hasattr(file_obj, 'name'):
|
583 |
with open(file_obj.name, 'r') as f:
|
584 |
content = f.read()
|
585 |
-
else:
|
586 |
content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj)
|
587 |
|
588 |
output_text = ""
|
589 |
for line in content.splitlines():
|
590 |
smiles = line.strip()
|
591 |
if smiles:
|
592 |
-
if
|
|
|
593 |
output_text += f"Skipping non-peptide SMILES: {smiles}\n"
|
594 |
continue
|
595 |
-
|
596 |
-
|
597 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
598 |
output_text += "-" * 50 + "\n"
|
|
|
599 |
return output_text, None, None
|
600 |
|
601 |
except Exception as e:
|
602 |
return f"Error processing file: {str(e)}", None, None
|
603 |
|
604 |
return "No input provided.", None, None
|
605 |
-
|
606 |
# Create Gradio interface with simplified examples
|
607 |
iface = gr.Interface(
|
608 |
fn=process_input,
|
|
|
11 |
import matplotlib.patches as patches
|
12 |
from io import BytesIO
|
13 |
|
14 |
+
import re
|
15 |
+
from rdkit import Chem
|
16 |
+
|
17 |
+
class PeptideAnalyzer:
    """Heuristic peptide analysis working mostly on the raw SMILES text.

    Apart from ``is_peptide`` (which parses the molecule with RDKit), every
    method uses substring / regex matching on the SMILES string, so results
    depend on how the SMILES happens to be written.
    """

    def __init__(self):
        # Regexes for backbone bonds. NOTE(review): ``split_on_bonds`` uses its
        # own, slightly different local list; this attribute is not read by any
        # method of this class.
        self.bond_patterns = [
            r'OC\(=O\)',       # ester bond
            r'N\(C\)C\(=O\)',  # N-methylated peptide bond
            r'N[12]?C\(=O\)',  # peptide bond (including Pro N1/N2)
            r'C\(=O\)N\(C\)',  # N-methylated peptide bond reverse
            r'C\(=O\)N'        # peptide bond reverse
        ]

    def is_peptide(self, smiles):
        """Check if the SMILES represents a peptide structure.

        Returns False when the SMILES cannot be parsed; otherwise True as soon
        as one of the backbone SMARTS patterns below matches the molecule.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False

        backbone_smarts = (
            '[NH][C](=O)',              # peptide bond: NC(=O)
            '[N;H0;$(NC)](C)[C](=O)',   # N-methylated peptide bond: N(C)C(=O)
            'O[C](=O)',                 # ester bond (cyclic depsipeptides)
        )
        return any(
            mol.HasSubstructMatch(Chem.MolFromSmarts(smarts))
            for smarts in backbone_smarts
        )

    @staticmethod
    def _ring_closure_positions(smiles):
        """Yield ``(digit, index)`` for every digit outside a bracket atom."""
        in_bracket = False
        for index, char in enumerate(smiles):
            if char == '[':
                in_bracket = True
            elif char == ']':
                in_bracket = False
            elif not in_bracket and char.isdecimal():
                # Digits inside [...] are isotopes, H counts or charges
                # (e.g. [NH2], [NH3+]), not ring-closure labels.
                yield char, index

    def is_cyclic(self, smiles):
        """
        Determine if SMILES represents a cyclic peptide

        Ring-closure digits are read from the text (two-digit ``%nn`` labels
        are not treated specially). Occurrences of the same digit are paired
        in order (1st with 2nd, 3rd with 4th, ...), because a closure digit
        may be reused once its ring has been closed. An unpaired trailing
        occurrence is ignored.

        Returns: (is_cyclic, peptide_cycles, aromatic_cycles)
            ``peptide_cycles`` / ``aromatic_cycles`` are lists of digit
            strings, each digit listed at most once per list.
        """
        positions_by_number = {}
        for number, position in self._ring_closure_positions(smiles):
            positions_by_number.setdefault(number, []).append(position)

        peptide_cycles = []
        aromatic_cycles = []

        for number, positions in positions_by_number.items():
            for start, end in zip(positions[0::2], positions[1::2]):
                segment = smiles[start:end + 1]
                full_context = smiles[max(0, start - 10):min(len(smiles), end + 10)]

                # A short ring span sitting in a benzene-like context.
                is_aromatic = len(segment) < 20 and (
                    'c2ccccc2' in full_context or 'c1ccccc1' in full_context
                )

                # 'C(=O)N' also covers the N-methylated 'C(=O)N(C)' and the
                # ring-labelled 'C(=O)N1' / 'C(=O)N2' forms, which all start
                # with it.
                has_peptide_bond = len(segment) > 20 and 'C(=O)N' in segment

                if is_aromatic:
                    bucket = aromatic_cycles
                elif has_peptide_bond:
                    bucket = peptide_cycles
                else:
                    continue
                if number not in bucket:
                    bucket.append(number)

        return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles

    def split_on_bonds(self, smiles):
        """Split SMILES into segments with simplified Pro handling.

        Backbone bonds are located by regex; the text between consecutive
        bonds becomes one segment. A ``NCC(=O)`` match is treated as a whole
        glycine residue and emitted as its own segment.

        Returns:
            A list of dicts with key ``content`` plus, where applicable,
            ``bond_before`` and ``bond_after`` (the matched bond text). The
            list is empty when no bond pattern matches.
        """
        positions = []
        used = set()

        def claim(match, bond_type):
            # Record a match unless it overlaps an already claimed span.
            span = range(match.start(), match.end())
            if not used.isdisjoint(span):
                return
            positions.append({
                'start': match.start(),
                'end': match.end(),
                'type': bond_type,
                'pattern': match.group()
            })
            used.update(span)

        # Find Gly pattern first so it wins over the generic bond patterns.
        for match in re.finditer(r'NCC\(=O\)', smiles):
            claim(match, 'gly')

        # Then find all bonds, including N2C(=O); earlier entries win overlaps.
        bond_patterns = [
            (r'OC\(=O\)', 'ester'),
            (r'N\(C\)C\(=O\)', 'n_methyl'),
            (r'N[12]C\(=O\)', 'peptide'),   # Pro peptide bonds
            (r'NC\(=O\)', 'peptide'),       # Regular peptide bonds
            (r'C\(=O\)N\(C\)', 'n_methyl'),
            (r'C\(=O\)N[12]?', 'peptide')
        ]
        for pattern, bond_type in bond_patterns:
            for match in re.finditer(pattern, smiles):
                claim(match, bond_type)

        positions.sort(key=lambda pos: pos['start'])

        segments = []
        if not positions:
            return segments

        # First segment: text before the first bond.
        first = positions[0]
        if first['start'] > 0:
            segments.append({
                'content': smiles[0:first['start']],
                'bond_after': first['pattern']
            })

        last_index = len(positions) - 1
        for i, current in enumerate(positions):
            is_last = i == last_index
            next_pos = None if is_last else positions[i + 1]
            stop = len(smiles) if is_last else next_pos['start']

            if current['type'] == 'gly':
                # The Gly match is itself a residue; emit it even when it is
                # the final match so a trailing Gly is not lost.
                gly_segment = {
                    'content': 'NCC(=O)',
                    'bond_before': positions[i - 1]['pattern'] if i > 0 else None
                }
                if not is_last:
                    gly_segment['bond_after'] = next_pos['pattern']
                segments.append(gly_segment)

            # Text following this bond (or Gly) up to the next bond / the end.
            # This is emitted after a Gly too, otherwise the residue written
            # right behind a Gly would silently disappear.
            content = smiles[current['end']:stop]
            if content:
                segment = {
                    'content': content,
                    'bond_before': current['pattern']
                }
                if not is_last:
                    segment['bond_after'] = next_pos['pattern']
                segments.append(segment)

        return segments

    def identify_residue(self, segment):
        """Identify residue with Pro reconstruction.

        Args:
            segment: a dict as produced by ``split_on_bonds``; ``content`` is
                required, ``bond_after`` is optional.

        Returns:
            ``(name, mods)`` where ``name`` is the residue label or ``None``
            when no pattern matches, and ``mods`` is the list returned by
            ``get_modifications``. Checks run in order; the first hit wins.
        """
        content = segment['content']
        mods = self.get_modifications(segment)

        def has_any(*patterns):
            return any(pattern in content for pattern in patterns)

        # Special handling for Pro: the ring is split across the N2C(=O) bond,
        # so look at the bond that follows as well as the content.
        pro_context = (
            (segment.get('bond_after') == 'N2C(=O)' and 'CCC' in content)
            or ('CCCN2' in content and content.endswith('=O'))  # End case
        )
        if pro_context and has_any('[C@@H]2', '[C@H]2'):
            return 'Pro', mods

        # Norleucine (Nle)
        if has_any('C[C@H](CCCC)', 'C[C@@H](CCCC)') and 'CC(C)' not in content:
            return 'Nle', mods

        # Ornithine (Orn) - 3-carbon chain with NH2
        if has_any('C[C@H](CCCN)', 'C[C@@H](CCCN)') and 'CC(C)' not in content:
            return 'Orn', mods

        # 2-Naphthylalanine (2Nal) - distinct from Phe pattern
        if 'Cc3cc2ccccc2c3' in content and has_any('C[C@H]', 'C[C@@H]'):
            return '2Nal', mods

        # Labelled Cha (cyclohexylalanine) by the original author.
        # NOTE(review): must stay ahead of the Abu/Pip/Ala checks below.
        if has_any('N2CCCCC2', 'CCCCC2'):
            return 'Cha', mods

        # Aminobutyric acid (Abu) - 2-carbon chain
        if has_any('C[C@H](CC)', 'C[C@@H](CC)') and not has_any('CC(C)', 'CCCC', 'CCC(C)'):
            return 'Abu', mods

        # Pipecolic acid (Pip) - 6-membered ring like Pro
        if has_any('N3CCCCC3', 'CCCCC3') and has_any('C[C@H]', 'C[C@@H]'):
            return 'Pip', mods

        # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
        if has_any('C[C@H](C1CCCCC1)', 'C[C@@H](C1CCCCC1)'):
            return 'Chg', mods

        # 4-Fluorophenylalanine (4F-Phe)
        if 'Cc2ccc(F)cc2' in content and has_any('C[C@H]', 'C[C@@H]'):
            return '4F-Phe', mods

        # Regular residue identification
        if 'NCC(=O)' in content:
            return 'Gly', mods

        if has_any('CC(C)C[C@H]', 'CC(C)C[C@@H]'):
            return 'Leu', mods
        if has_any('[C@@H](CC(C)C)', '[C@H](CC(C)C)'):
            return 'Leu', mods

        if has_any('C(C)C[C@H]', 'C(C)C[C@@H]') and 'CC(C)C' not in content:
            return 'Ile', mods

        if has_any('[C@@H]([C@@H](C)O)', '[C@H]([C@H](C)O)'):
            return 'Thr', mods

        if has_any('[C@H](Cc2ccccc2)', '[C@@H](Cc2ccccc2)'):
            return 'Phe', mods

        if has_any('[C@H](C(C)C)', '[C@@H](C(C)C)'):
            if not has_any('CC(C)C[C@H]', 'CC(C)C[C@@H]'):
                return 'Val', mods

        if has_any('[C@H](COC(C)(C)C)', '[C@@H](COC(C)(C)C)'):
            return 'O-tBu', mods

        # Ala is the least specific pattern, so it is tested last and only
        # accepted when no longer side-chain fragment is present.
        if has_any('[C@H](C)', '[C@@H](C)'):
            if not has_any('C(C)C', 'COC', 'CN(', 'C(C)O'):
                return 'Ala', mods

        return None, mods

    def get_modifications(self, segment):
        """Get modifications based on bond types.

        Only the bond *after* the segment is inspected.

        Returns:
            A list containing ``'N-Me'`` when that bond text contains
            ``N(C)`` and ``'O-linked'`` when it contains ``OC(=O)``.
        """
        mods = []
        bond_after = segment.get('bond_after')
        if bond_after:
            # 'N(C)' also covers the reverse form 'C(=O)N(C)'.
            if 'N(C)' in bond_after:
                mods.append('N-Me')
            if 'OC(=O)' in bond_after:
                mods.append('O-linked')
        return mods

    def analyze_structure(self, smiles):
        """Main analysis function.

        Splits the SMILES, identifies each segment, prints a trace to stdout
        and returns the residue names joined with ``-``; the result is
        wrapped in ``cyclo(...)`` when ``is_cyclic`` reports a peptide cycle.
        """
        print("\nAnalyzing structure:", smiles)

        # Split into segments
        segments = self.split_on_bonds(smiles)

        print("\nSegment Analysis:")
        sequence = []
        for i, segment in enumerate(segments):
            print(f"\nSegment {i}:")
            print(f"Content: {segment['content']}")
            print(f"Bond before: {segment.get('bond_before', 'None')}")
            print(f"Bond after: {segment.get('bond_after', 'None')}")

            residue, mods = self.identify_residue(segment)
            if residue:
                if mods:
                    sequence.append(f"{residue}({','.join(mods)})")
                else:
                    sequence.append(residue)
                print(f"Identified as: {residue}")
                print(f"Modifications: {mods}")
            else:
                print(f"Warning: Could not identify residue in segment: {segment['content']}")

        # Use the same cyclicity test as the rest of the class. A bare
        # "'N1' in smiles" check would also fire on a proline ring in a
        # linear peptide.
        is_cyclic, _, _ = self.is_cyclic(smiles)
        final_sequence = f"cyclo({'-'.join(sequence)})" if is_cyclic else '-'.join(sequence)

        print(f"\nFinal sequence: {final_sequence}")
        return final_sequence
|
301 |
+
|
|
|
|
|
|
|
|
|
|
|
302 |
"""
|
303 |
def annotate_cyclic_structure(mol, sequence):
|
304 |
'''Create annotated 2D structure with clear, non-overlapping residue labels'''
|
|
|
566 |
return fig
|
567 |
|
568 |
def process_input(smiles_input=None, file_obj=None, show_linear=False):
|
569 |
+
"""Process input and create visualizations using PeptideAnalyzer"""
|
570 |
+
analyzer = PeptideAnalyzer()
|
|
|
571 |
|
572 |
# Handle direct SMILES input
|
573 |
if smiles_input:
|
574 |
smiles = smiles_input.strip()
|
575 |
|
576 |
+
# First check if it's a peptide using analyzer's method
|
577 |
+
if not analyzer.is_peptide(smiles):
|
578 |
return "Error: Input SMILES does not appear to be a peptide structure.", None, None
|
579 |
|
580 |
try:
|
|
|
583 |
if mol is None:
|
584 |
return "Error: Invalid SMILES notation.", None, None
|
585 |
|
586 |
+
# Use analyzer to get sequence
|
587 |
+
segments = analyzer.split_on_bonds(smiles)
|
588 |
+
|
589 |
+
# Process segments and build sequence
|
590 |
+
sequence_parts = []
|
591 |
+
output_text = "Segment Analysis:\n"
|
592 |
+
for i, segment in enumerate(segments):
|
593 |
+
output_text += f"\nSegment {i}:\n"
|
594 |
+
output_text += f"Content: {segment['content']}\n"
|
595 |
+
output_text += f"Bond before: {segment.get('bond_before', 'None')}\n"
|
596 |
+
output_text += f"Bond after: {segment.get('bond_after', 'None')}\n"
|
597 |
+
|
598 |
+
residue, mods = analyzer.identify_residue(segment)
|
599 |
+
if residue:
|
600 |
+
if mods:
|
601 |
+
sequence_parts.append(f"{residue}({','.join(mods)})")
|
602 |
+
else:
|
603 |
+
sequence_parts.append(residue)
|
604 |
+
output_text += f"Identified as: {residue}\n"
|
605 |
+
output_text += f"Modifications: {mods}\n"
|
606 |
+
else:
|
607 |
+
output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
|
608 |
+
|
609 |
+
# Check if cyclic using analyzer's method
|
610 |
+
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
|
611 |
+
sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
|
612 |
|
613 |
# Create cyclic structure visualization
|
614 |
img_cyclic = annotate_cyclic_structure(mol, sequence)
|
|
|
617 |
img_linear = None
|
618 |
if show_linear:
|
619 |
fig_linear = create_enhanced_linear_viz(sequence, smiles)
|
|
|
|
|
620 |
buf = BytesIO()
|
621 |
fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
|
622 |
buf.seek(0)
|
623 |
img_linear = Image.open(buf)
|
624 |
plt.close(fig_linear)
|
625 |
|
626 |
+
# Add summary to output
|
627 |
+
summary = f"\nSummary:\n"
|
628 |
+
summary += f"Sequence: {sequence}\n"
|
629 |
+
summary += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
|
630 |
+
if is_cyclic:
|
631 |
+
summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
|
632 |
+
summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
|
633 |
|
634 |
+
return summary + "\n" + output_text, img_cyclic, img_linear
|
635 |
|
636 |
except Exception as e:
|
637 |
return f"Error processing SMILES: {str(e)}", None, None
|
|
|
639 |
# Handle file input
|
640 |
if file_obj is not None:
|
641 |
try:
|
642 |
+
# Handle file content
|
643 |
+
if hasattr(file_obj, 'name'):
|
644 |
with open(file_obj.name, 'r') as f:
|
645 |
content = f.read()
|
646 |
+
else:
|
647 |
content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj)
|
648 |
|
649 |
output_text = ""
|
650 |
for line in content.splitlines():
|
651 |
smiles = line.strip()
|
652 |
if smiles:
|
653 |
+
# Check if it's a peptide
|
654 |
+
if not analyzer.is_peptide(smiles):
|
655 |
output_text += f"Skipping non-peptide SMILES: {smiles}\n"
|
656 |
continue
|
657 |
+
|
658 |
+
# Process this SMILES
|
659 |
+
segments = analyzer.split_on_bonds(smiles)
|
660 |
+
sequence_parts = []
|
661 |
+
for segment in segments:
|
662 |
+
residue, mods = analyzer.identify_residue(segment)
|
663 |
+
if residue:
|
664 |
+
if mods:
|
665 |
+
sequence_parts.append(f"{residue}({','.join(mods)})")
|
666 |
+
else:
|
667 |
+
sequence_parts.append(residue)
|
668 |
+
|
669 |
+
# Get cyclicity and create sequence
|
670 |
+
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
|
671 |
+
sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
|
672 |
+
|
673 |
+
output_text += f"SMILES: {smiles}\n"
|
674 |
+
output_text += f"Sequence: {sequence}\n"
|
675 |
+
output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
|
676 |
+
if is_cyclic:
|
677 |
+
output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
|
678 |
+
output_text += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
|
679 |
output_text += "-" * 50 + "\n"
|
680 |
+
|
681 |
return output_text, None, None
|
682 |
|
683 |
except Exception as e:
|
684 |
return f"Error processing file: {str(e)}", None, None
|
685 |
|
686 |
return "No input provided.", None, None
|
|
|
687 |
# Create Gradio interface with simplified examples
|
688 |
iface = gr.Interface(
|
689 |
fn=process_input,
|