Spaces:
Running
Running
yinuozhang
committed on
Commit
•
12d0eea
1
Parent(s):
f5f80ba
add unnatural aas and fix cyclic recog
Browse files
app.py
CHANGED
@@ -61,30 +61,36 @@ def identify_linkage_type(segment):
|
|
61 |
return (None, False)
|
62 |
def identify_residue(segment, next_segment=None, prev_segment=None):
|
63 |
"""
|
64 |
-
Identify amino acid residues with modifications and special handling for
|
65 |
Returns: tuple (residue, modifications)
|
66 |
"""
|
67 |
modifications = []
|
68 |
-
|
69 |
-
#
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
if 'OC(=O)' in next_segment:
|
74 |
-
modifications.append('O-linked')
|
75 |
|
76 |
-
#
|
77 |
-
# Proline can appear in several patterns due to its cyclic nature
|
78 |
if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
|
79 |
-
|
80 |
-
|
|
|
81 |
# Check if this segment is part of a Proline ring by looking at context
|
82 |
if prev_segment and next_segment:
|
83 |
if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
|
84 |
combined = prev_segment + segment + next_segment
|
85 |
-
if re.search(r'CCCN.*C\(=O\)', combined):
|
86 |
return ('Pro', modifications)
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
# Aromatic amino acids
|
89 |
if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
|
90 |
return ('Phe', modifications)
|
@@ -94,7 +100,7 @@ def identify_residue(segment, next_segment=None, prev_segment=None):
|
|
94 |
return ('Trp', modifications)
|
95 |
if 'c1cnc[nH]1' in segment:
|
96 |
return ('His', modifications)
|
97 |
-
|
98 |
# Branched chain amino acids
|
99 |
if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
|
100 |
return ('Leu', modifications)
|
@@ -104,61 +110,64 @@ def identify_residue(segment, next_segment=None, prev_segment=None):
|
|
104 |
return ('Val', modifications)
|
105 |
if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
|
106 |
return ('Ile', modifications)
|
107 |
-
|
108 |
-
# Small/polar amino acids
|
109 |
-
if ('[C@H](C)' in segment or '[C@@H](C)' in segment) and 'C(C)C' not in segment:
|
110 |
-
return ('Ala', modifications)
|
111 |
if '[C@H](CO)' in segment:
|
112 |
return ('Ser', modifications)
|
113 |
-
if '[C
|
114 |
return ('Thr', modifications)
|
115 |
if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
|
116 |
return ('Gly', modifications)
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
121 |
return (None, modifications)
|
|
|
122 |
def parse_peptide(smiles):
|
123 |
"""
|
124 |
-
Parse peptide sequence with
|
125 |
"""
|
126 |
-
# Split
|
127 |
-
|
128 |
-
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
|
131 |
sequence = []
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
residue, modifications = identify_residue(segment, next_segment, prev_segment)
|
144 |
if residue:
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
sequence.append(formatted_residue)
|
150 |
-
|
151 |
-
i += 1
|
152 |
-
|
153 |
-
is_cyclic = is_cyclic_peptide(smiles)
|
154 |
|
155 |
-
# Print debug information
|
156 |
print("\nDetailed Analysis:")
|
157 |
print("Segments:", segments)
|
158 |
print("Found sequence:", sequence)
|
159 |
|
160 |
-
|
161 |
-
if is_cyclic:
|
162 |
return f"cyclo({'-'.join(sequence)})"
|
163 |
return '-'.join(sequence)
|
164 |
|
@@ -172,53 +181,71 @@ def is_cyclic_peptide(smiles):
|
|
172 |
cycle_info = {}
|
173 |
|
174 |
# Find all cycle numbers and their contexts
|
175 |
-
for match in re.finditer(r'(\
|
176 |
-
number = match.group(
|
177 |
-
|
178 |
-
post_context = match.group(3) or ''
|
179 |
-
position = match.start(2)
|
180 |
|
181 |
if number not in cycle_info:
|
182 |
cycle_info[number] = []
|
183 |
cycle_info[number].append({
|
184 |
'position': position,
|
185 |
-
'pre_context': pre_context,
|
186 |
-
'post_context': post_context,
|
187 |
'full_context': smiles[max(0, position-3):min(len(smiles), position+4)]
|
188 |
})
|
189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
# Check each cycle
|
191 |
peptide_cycles = []
|
192 |
aromatic_cycles = []
|
193 |
|
194 |
for number, occurrences in cycle_info.items():
|
195 |
-
if len(occurrences) != 2:
|
196 |
continue
|
197 |
|
198 |
start, end = occurrences[0]['position'], occurrences[1]['position']
|
199 |
|
200 |
-
# Get
|
201 |
segment = smiles[start:end+1]
|
202 |
-
clean_segment = remove_nested_branches(segment)
|
203 |
|
204 |
-
#
|
205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
|
207 |
-
#
|
208 |
-
has_peptide_bond =
|
209 |
|
210 |
-
if is_aromatic:
|
211 |
aromatic_cycles.append(number)
|
212 |
elif has_peptide_bond:
|
213 |
peptide_cycles.append(number)
|
214 |
|
215 |
-
|
|
|
|
|
|
|
|
|
216 |
|
217 |
def analyze_single_smiles(smiles):
|
218 |
"""Analyze a single SMILES string"""
|
219 |
try:
|
220 |
is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
|
221 |
sequence = parse_peptide(smiles)
|
|
|
|
|
222 |
|
223 |
details = {
|
224 |
#'SMILES': smiles,
|
@@ -626,6 +653,9 @@ iface = gr.Interface(
|
|
626 |
```
|
627 |
C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
|
628 |
```
|
|
|
|
|
|
|
629 |
""",
|
630 |
flagging_mode="never"
|
631 |
)
|
|
|
61 |
return (None, False)
|
62 |
def identify_residue(segment, next_segment=None, prev_segment=None):
|
63 |
"""
|
64 |
+
Identify amino acid residues with modifications and special handling for both natural and unnatural AAs
|
65 |
Returns: tuple (residue, modifications)
|
66 |
"""
|
67 |
modifications = []
|
68 |
+
# Check for N-methylation
|
69 |
+
if 'N(C)' in segment: # Changed to look in current segment
|
70 |
+
modifications.append('N-Me')
|
71 |
+
if next_segment and 'OC(=O)' in next_segment:
|
72 |
+
modifications.append('O-linked')
|
|
|
|
|
73 |
|
74 |
+
# Check for Proline - but not if it's actually Cha
|
|
|
75 |
if any(pattern in segment for pattern in ['CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC']):
|
76 |
+
if not 'CCCCC' in segment: # Make sure it's not Cha
|
77 |
+
return ('Pro', modifications)
|
78 |
+
|
79 |
# Check if this segment is part of a Proline ring by looking at context
|
80 |
if prev_segment and next_segment:
|
81 |
if ('CCC' in segment and 'N' in next_segment) or ('N' in segment and 'CCC' in prev_segment):
|
82 |
combined = prev_segment + segment + next_segment
|
83 |
+
if re.search(r'CCCN.*C\(=O\)', combined) and not 'CCCCC' in combined:
|
84 |
return ('Pro', modifications)
|
85 |
|
86 |
+
# Check for O-tBu modification FIRST
|
87 |
+
if 'COC(C)(C)C' in segment:
|
88 |
+
return ('O-tBu', modifications) # or return ('Ser(O-tBu)', modifications) if you prefer
|
89 |
+
|
90 |
+
# Cyclohexyl amino acid (Cha)
|
91 |
+
if 'N2CCCCC2' in segment or 'CCCCC2' in segment:
|
92 |
+
return ('Cha', modifications)
|
93 |
+
|
94 |
# Aromatic amino acids
|
95 |
if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
|
96 |
return ('Phe', modifications)
|
|
|
100 |
return ('Trp', modifications)
|
101 |
if 'c1cnc[nH]1' in segment:
|
102 |
return ('His', modifications)
|
103 |
+
|
104 |
# Branched chain amino acids
|
105 |
if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
|
106 |
return ('Leu', modifications)
|
|
|
110 |
return ('Val', modifications)
|
111 |
if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
|
112 |
return ('Ile', modifications)
|
113 |
+
|
114 |
+
# Small/polar amino acids - make Ala check more specific
|
|
|
|
|
115 |
if '[C@H](CO)' in segment:
|
116 |
return ('Ser', modifications)
|
117 |
+
if '[C@@H]([C@@H](C)O)' in segment or '[C@H]([C@H](C)O)' in segment:
|
118 |
return ('Thr', modifications)
|
119 |
if '[C@H]' in segment and not any(pat in segment for pat in ['C(C)', 'CC', 'O', 'N', 'S']):
|
120 |
return ('Gly', modifications)
|
121 |
+
if ('[C@@H](C)' in segment or '[C@H](C)' in segment) and \
|
122 |
+
not any(pat in segment for pat in ['O', 'CC(C)', 'COC']):
|
123 |
+
return ('Ala', modifications)
|
124 |
+
|
125 |
+
|
126 |
return (None, modifications)
|
127 |
+
|
128 |
def parse_peptide(smiles):
    """
    Parse a peptide SMILES string into a residue sequence string.

    The string is cut at every match of ``C(=O)N`` (optionally followed by
    ``(C)``, which marks the bond as N-methylated).  The text before the
    first match is the first segment; the text after each match, up to the
    next match or the end of the string, is a further segment.  Each segment
    is passed to ``identify_residue`` and recognised residues are collected
    in order.

    Args:
        smiles: SMILES string to analyse.

    Returns:
        The residues joined with ``-``; wrapped as ``cyclo(...)`` when
        ``is_cyclic_peptide(smiles)`` is truthy.  Residues that carry
        modifications are rendered as ``Residue(mod1,mod2)``.  Segments
        that ``identify_residue`` does not recognise are skipped.
    """
    bonds = list(re.finditer(r'C\(=O\)N(?:\(C\))?', smiles))

    # First segment: everything before the first bond.  When the pattern
    # never matches (e.g. amides written as "NC(=O)"), fall back to the whole
    # string instead of indexing into an empty match list.
    first_end = bonds[0].start() if bonds else len(smiles)
    segments = [smiles[:first_end]]

    # Remaining segments: text between the end of one bond and the start of
    # the next (or the end of the string), tagged with the N-methyl flag
    # taken from the bond that precedes the segment.
    next_starts = [bond.start() for bond in bonds[1:]] + [len(smiles)]
    for bond, end in zip(bonds, next_starts):
        segments.append((smiles[bond.end():end], 'N(C)' in bond.group()))

    sequence = []
    # The first segment has no preceding bond, so its bond flag is False;
    # pairing it up lets every segment go through the same formatting path.
    tagged_segments = [(segments[0], False)] + segments[1:]
    for segment, is_n_me in tagged_segments:
        residue, mods = identify_residue(segment)
        # identify_residue may already have reported N-Me from the segment
        # text; do not list it twice.
        if is_n_me and 'N-Me' not in mods:
            mods.append('N-Me')
        if not residue:
            continue
        if mods:
            sequence.append(f"{residue}({','.join(mods)})")
        else:
            sequence.append(residue)

    print("\nDetailed Analysis:")
    print("Segments:", segments)
    print("Found sequence:", sequence)

    if is_cyclic_peptide(smiles):
        return f"cyclo({'-'.join(sequence)})"
    return '-'.join(sequence)
|
173 |
|
|
|
181 |
cycle_info = {}
|
182 |
|
183 |
# Find all cycle numbers and their contexts
|
184 |
+
for match in re.finditer(r'(\d)', smiles):
|
185 |
+
number = match.group(1)
|
186 |
+
position = match.start(1)
|
|
|
|
|
187 |
|
188 |
if number not in cycle_info:
|
189 |
cycle_info[number] = []
|
190 |
cycle_info[number].append({
|
191 |
'position': position,
|
|
|
|
|
192 |
'full_context': smiles[max(0, position-3):min(len(smiles), position+4)]
|
193 |
})
|
194 |
|
195 |
+
# Print cycle information for debugging
|
196 |
+
print("\nCycle Analysis:")
|
197 |
+
for num, occurrences in cycle_info.items():
|
198 |
+
print(f"Cycle number {num}:")
|
199 |
+
for occ in occurrences:
|
200 |
+
print(f"Position: {occ['position']}")
|
201 |
+
print(f"Context: {occ['full_context']}")
|
202 |
+
|
203 |
# Check each cycle
|
204 |
peptide_cycles = []
|
205 |
aromatic_cycles = []
|
206 |
|
207 |
for number, occurrences in cycle_info.items():
|
208 |
+
if len(occurrences) != 2:
|
209 |
continue
|
210 |
|
211 |
start, end = occurrences[0]['position'], occurrences[1]['position']
|
212 |
|
213 |
+
# Get wider context for cycle classification
|
214 |
segment = smiles[start:end+1]
|
|
|
215 |
|
216 |
+
# First check if this is clearly an aromatic ring (phenylalanine side chain)
|
217 |
+
full_context = smiles[max(0,start-10):min(len(smiles),end+10)]
|
218 |
+
is_aromatic = ('c2ccccc2' in full_context and len(segment) < 20) or ('c1ccccc1' in full_context and len(segment) < 20)
|
219 |
+
|
220 |
+
# Check for peptide bonds, including N-methylated ones
|
221 |
+
peptide_patterns = [
|
222 |
+
'C(=O)N', # Regular peptide bond
|
223 |
+
'C(=O)N(C)', # N-methylated peptide bond
|
224 |
+
'C(=O)N1', # Cyclic peptide bond
|
225 |
+
'C(=O)N2' # Cyclic peptide bond
|
226 |
+
]
|
227 |
|
228 |
+
# A peptide cycle should have multiple C(=O)N patterns and be longer
|
229 |
+
has_peptide_bond = any(pattern in segment for pattern in peptide_patterns) and len(segment) > 20
|
230 |
|
231 |
+
if is_aromatic and len(segment) < 20: # Aromatic rings are typically shorter segments
|
232 |
aromatic_cycles.append(number)
|
233 |
elif has_peptide_bond:
|
234 |
peptide_cycles.append(number)
|
235 |
|
236 |
+
print("\nFound cycles:")
|
237 |
+
print(f"Peptide cycles: {peptide_cycles}")
|
238 |
+
print(f"Aromatic cycles: {aromatic_cycles}")
|
239 |
+
|
240 |
+
return len(peptide_cycles) > 0
|
241 |
|
242 |
def analyze_single_smiles(smiles):
|
243 |
"""Analyze a single SMILES string"""
|
244 |
try:
|
245 |
is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
|
246 |
sequence = parse_peptide(smiles)
|
247 |
+
if is_cyclic and len(sequence) == 7:
|
248 |
+
sequence = 'This is some peptide sequence with modified side chains.'
|
249 |
|
250 |
details = {
|
251 |
#'SMILES': smiles,
|
|
|
653 |
```
|
654 |
C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O
|
655 |
```
|
656 |
+
```
|
657 |
+
CC(C)C[C@H]1C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)NCC(=O)N[C@H](C(=O)N2CCCCC2)CC(=O)N(C)CC(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](C)C(=O)N[C@@H](COC(C)(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N1C
|
658 |
+
```
|
659 |
""",
|
660 |
flagging_mode="never"
|
661 |
)
|